Skip to content

Commit 9068514

Browse files
committed
Add EOS support and some minor fixes
1 parent 8c1fa68 commit 9068514

File tree

5 files changed

+52
-10
lines changed

5 files changed

+52
-10
lines changed

scripts/bench/cmake_build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,6 @@ mkdir -p "$REALM_DIR/$BUILD_DIR"
7070
pushd "$REALM_DIR/$BUILD_DIR"
7171
echo "CMake Options: ${cmake_options}"
7272
# make clean
73-
cmake ../ ${cmake_options}
73+
cmake ${REALM_DIR} ${cmake_options}
7474
make -j VERBOSE=1
7575
popd

scripts/bench/common.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ function detect_platform {
2323
export PLATFORM=computelab
2424
export QUEUE=v100-sxm2-16gb@cr+mp/dgx-1v@cr+mp/8gpu-80cpu-512gb
2525
export GPU_ARCH=70
26+
elif [[ "$(uname -n)" == *"login-eos"* ]]; then
27+
export PLATFORM=eos
28+
export QUEUE=batch
29+
export GPU_ARCH=90
30+
export JOB="nvr_legate-realm.bench"
2631
else
2732
export PLATFORM=other
2833
fi

scripts/bench/computelab.sh

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@
1616
# limitations under the License.
1717
#
1818

19-
echo $LEGION_DIR
19+
echo $REALM_DIR
2020
# only for ucx, but it does not break other network modules if it is set
21-
export REALM_UCP_BOOTSTRAP_PLUGIN=$LEGION_DIR/$BUILD_DIR/lib/realm_ucp_bootstrap_mpi.so
21+
export REALM_UCP_BOOTSTRAP_PLUGIN=${REALM_UCP_BOOTSTRAP_PLUGIN:-$REALM_DIR/$BUILD_DIR/lib/realm_ucp_bootstrap_mpi.so}
2222

2323
# CUDA Toolkit Path
24-
CUDA_PATH="/home/scratch.svc_compute_arch/release/cuda_toolkit/public/12.6.1/x86_64/u22.04/"
25-
MPI_PATH="/home/scratch.svc_compute_arch/release/mpi/openmpi/v4.1.4-ucx-1.13.1-cuda11.5"
24+
export CUDA_PATH="${CUDA_PATH:-/home/scratch.svc_compute_arch/release/cuda_toolkit/public/12.6.1/x86_64/u22.04/}"
25+
export MPI_PATH="${MPI_PATH:-/home/scratch.svc_compute_arch/release/mpi/openmpi/v4.1.4-ucx-1.13.1-cuda11.5}"
2626

2727
# Update PATH
2828
export PATH="${MPI_PATH}/bin:${CUDA_PATH}/bin:${PATH:-}"
@@ -31,7 +31,6 @@ export PATH="${MPI_PATH}/bin:${CUDA_PATH}/bin:${PATH:-}"
3131
export LD_LIBRARY_PATH="${MPI_PATH}/lib:${CUDA_PATH}/lib64:${LD_LIBRARY_PATH:-}"
3232

3333
# Other environment variables
34-
export CUDA_PATH
3534
export CUDA_ARCH="${CUDA_ARCH:-70}"
3635
export CONDUIT="${CONDUIT:-ibv}"
3736
export GASNET_HOST_DETECT="hostname"

scripts/bench/job.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ echo "Job cmd: $@"
3030
echo "ITERATIONS: $ITERATIONS"
3131
echo "PATH: $PATH"
3232
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
33-
echo "REALM_UCP_BOOTSTRAP_PLUGIN: $REALM_UCP_BOOTSTRAP_PLUGIN"
33+
echo "REALM_UCP_BOOTSTRAP_PLUGIN: ${REALM_UCP_BOOTSTRAP_PLUGIN:-}"
3434
echo "script=$SCRIPT_NAME"
3535
echo "slurm_nodes=$SLURM_NNODES"
3636
echo "slurm_tasks_per_node=$SLURM_NTASKS_PER_NODE"

scripts/bench/run.sh

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,15 @@
2020
set -e
2121

2222
export SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
23-
export REALM_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
23+
export REALM_DIR=${REALM_DIR:-"$(dirname "$(dirname "$SCRIPT_DIR")")"}
2424
source "$SCRIPT_DIR/common.sh"
2525

2626
# Prepare output directory
2727
function mk_output() {
28+
export REALM_HOST_DIR=${REALM_HOST_DIR:-$REALM_DIR}
2829
DATE="$(date +%Y/%m/%d)"
29-
mkdir -p "$REALM_DIR/$DATE"
30-
export HOST_OUT_DIR="$REALM_DIR/$DATE"
30+
mkdir -p "$REALM_HOST_DIR/$DATE"
31+
export HOST_OUT_DIR="$REALM_HOST_DIR/$DATE"
3132
echo "Redirecting stdout, stderr and logs to $HOST_OUT_DIR"
3233
export CMD_OUT_DIR="$HOST_OUT_DIR"
3334
}
@@ -47,6 +48,7 @@ if [[ $# -lt 2 || ! "$1" =~ ^(1/)?[0-9]+(:[0-9]+)?$ ]]; then
4748
echo " TIMELIMIT : how much time to request for the job, in minutes (default: 60)"
4849
echo " NUM_GPUS : number of gpus per node"
4950
echo " ACCOUNT : account name"
51+
echo " IMAGE: path to container image in container-based clusters"
5052
exit
5153
fi
5254

@@ -132,6 +134,42 @@ elif [[ "$PLATFORM" == "computelab" || "$PLATFORM" == "oberon" ]]; then
132134

133135
sbatch_cmd+=("${slurm_cmd[@]}")
134136
submit "${sbatch_cmd[@]}"
137+
elif [[ "$PLATFORM" == "eos" ]]; then
138+
mk_output
139+
NODE_RATIO="$RATIO_OF_NODE_USED / $RANKS_PER_NODE"
140+
NUMAS_PER_NODE=2
141+
RAM_PER_NUMA=950000
142+
# Calculate available resources per OpenMP group
143+
NUM_OMPS=$((NUMAS_PER_NODE * $NODE_RATIO))
144+
if [[ $NUM_OMPS -lt 1 ]]; then
145+
NUM_OMPS=1
146+
RAM_PER_OMP=$((RAM_PER_NUMA * NUMAS_PER_NODE * $NODE_RATIO))
147+
else
148+
RAM_PER_OMP="$RAM_PER_NUMA"
149+
fi
150+
WORK_RAM=$((NUM_OMPS * RAM_PER_OMP))
151+
152+
TIME="$(date +%H%M%S)"
153+
NUM_GPUS_PER_RANK=$(($NUM_GPUS / $RANKS_PER_NODE))
154+
155+
export GASNET_AM_CREDITS_PP=16
156+
export GASNET_IBV_PORTS=mlx5_0+mlx5_3+mlx5_4+mlx5_5+mlx5_6+mlx5_9+mlx5_10+mlx5_11
157+
158+
# this is set in the docker image
159+
unset REALM_UCP_BOOTSTRAP_PLUGIN
160+
161+
slurm_cmd=("$SCRIPT_DIR/job.slurm" srun -N "$NUM_RANKS" --mpi=pmix -n "$NUM_RANKS" --ntasks-per-node "$RANKS_PER_NODE" --mem "$((WORK_RAM + 4000))"M --container-image "$IMAGE" "$@")
162+
163+
if [[ -z "${NODE_LIST}" ]]; then
164+
sbatch_cmd=(sbatch -p "$QUEUE" -t "$TIMELIMIT" --exclusive -N "$NUM_NODES"
165+
-o "$HOST_OUT_DIR/$JOB_NAME-$TIME.txt" -A "$ACCOUNT" -J "$JOB")
166+
else
167+
sbatch_cmd=(sbatch -p "$QUEUE" -t "$TIMELIMIT" --exclusive -w "$NODE_LIST" -N "$NUM_NODES"
168+
-o "$HOST_OUT_DIR/$JOB_NAME-$TIME.txt" -A "$ACCOUNT" -J "$JOB")
169+
fi
170+
171+
sbatch_cmd+=("${slurm_cmd[@]}")
172+
submit "${sbatch_cmd[@]}"
135173
fi
136174

137175
# Wait for batch job to start

0 commit comments

Comments
 (0)