From cc7ac313fe8cce984928da4c8fa138252a34410b Mon Sep 17 00:00:00 2001
From: Jackmin801
Date: Sat, 28 Sep 2024 22:53:47 +0000
Subject: [PATCH] fix global port in script

---
Review notes (this section is ignored by `git am`):

* What the patch does: moves the global store port from 10000 to 1234 and
  renames the per-worker log files from logs/log<i> to logs/log<i>.log
  (including the file followed by `tail -f`).
* The truncation line `> logs/log$i` was plain context in the original
  patch; it is changed here to `> logs/log$i.log` so that it truncates the
  file the worker actually writes to instead of leaving an empty
  logs/log<i> behind on every run. The diffstat below reflects that extra
  changed line. The post-image blob id on the `index` line predates this
  adjustment and is therefore stale.
* Line breaks and the 4-space loop indentation were reconstructed from a
  whitespace-collapsed copy of this patch. If the context does not match
  the target file (e.g. it uses tabs), apply with
  `git apply --ignore-whitespace`.

 scripts/simulate_multi_node_diloco.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/simulate_multi_node_diloco.sh b/scripts/simulate_multi_node_diloco.sh
index cbbd8737..a8917302 100755
--- a/scripts/simulate_multi_node_diloco.sh
+++ b/scripts/simulate_multi_node_diloco.sh
@@ -53,17 +53,17 @@ trap cleanup SIGINT
 mkdir -p logs
 
 export GLOBAL_ADDR=localhost
-export GLOBAL_PORT=10000
+export GLOBAL_PORT=1234
 export GLOBAL_WORLD_SIZE=$N
 
 for i in $(seq 0 $(($N - 1 )))
 do
-    > logs/log$i
-    GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((10001 + $i)) --nnodes=1 $@ > logs/log$i 2>&1 &
+    > logs/log$i.log
+    GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((10001 + $i)) --nnodes=1 $@ > logs/log$i.log 2>&1 &
     child_pids+=($!)
 done
 
-tail -f logs/log0 &
+tail -f logs/log0.log &
 child_pids+=($!)
 
 wait