Skip to content

Commit

Permalink
use env vars for ports
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackmin801 committed Sep 29, 2024
1 parent cc7ac31 commit 02b22d4
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions scripts/simulate_multi_node_diloco.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,18 @@ trap cleanup SIGINT
# Make sure the directory that receives the per-job log files exists.
mkdir -p logs

# Exported so that every process started below inherits them.
# NOTE(review): presumably read by the training code as its global
# rendezvous address/port/world size — the consumer is not visible here.
export GLOBAL_ADDR=localhost
# NOTE(review): this text is a diff view whose +/- markers were lost; the
# next two lines look like the before/after versions of one assignment
# (hard-coded 1234, then a form that keeps a GLOBAL_PORT already set in the
# environment and only falls back to 1234 when it is unset or empty).
export GLOBAL_PORT=1234
export GLOBAL_PORT=${GLOBAL_PORT:-1234}
export GLOBAL_WORLD_SIZE=$N
# Base of the per-job torchrun rendezvous ports used in the loop below;
# taken from the environment when set and non-empty, otherwise 10001.
export BASE_PORT=${BASE_PORT:-10001}

# Start one background `uv run torchrun` job for each index i in 0 .. N-1.
for i in $(seq 0 $(($N - 1 )))
do
# Create or truncate logs/log$i.
# NOTE(review): the job's output below is redirected to logs/log$i.log, which
# is a different file from the one truncated here — confirm this is intended.
> logs/log$i
# Each job runs as its own single-node torchrun (--nnodes=1, --node-rank 0)
# with GLOBAL_UNIQUE_ID and GLOBAL_RANK set to i, a rendezvous port offset by
# i, and stdout+stderr captured in logs/log$i.log. CUDA_VISIBLE_DEVICES comes
# from get_cuda_devices, a helper defined outside this view.
# The script's own arguments are forwarded through an unquoted $@, so
# arguments containing whitespace are re-split by the shell.
# NOTE(review): as with GLOBAL_PORT, the next two lines look like the
# before/after pair of one launch command — fixed base port 10001, then
# BASE_PORT — rather than two launches per iteration.
GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((10001 + $i)) --nnodes=1 $@ > logs/log$i.log 2>&1 &
GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((BASE_PORT + $i)) --nnodes=1 $@ > logs/log$i.log 2>&1 &
# Remember the PID of the job that was just put in the background.
child_pids+=($!)
done

# Follow the log of job 0 on this terminal, in the background, and record the
# PID of tail alongside the job PIDs.
# NOTE(review): presumably child_pids is what the cleanup handler (installed
# on SIGINT earlier in the file) iterates over — confirm against its definition.
tail -f logs/log0.log &
child_pids+=($!)

# Block until every background child has exited. `tail -f` does not finish on
# its own, so in practice this is expected to end via the SIGINT trap.
wait

0 comments on commit 02b22d4

Please sign in to comment.