-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_multi_gpus.sh
81 lines (68 loc) · 2.22 KB
/
train_multi_gpus.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# Slurm batch script that launches train_multi_gpus.py across several GPU nodes.
# The "#SBATCH" lines below are directives read by sbatch at submission time;
# to the shell they are ordinary comments.
#
# Allocation size: 5 nodes and 5 tasks (the launch loop further down starts one
# job step per task, each confined to a single node), 24 CPUs per task.
#SBATCH --nodes=5
#SBATCH --ntasks=5
#SBATCH --cpus-per-task=24
# Job name; substituted for %x in the log file names below.
#SBATCH --job-name=n-5nodes
# Per-node resources: 200 GB of memory and 4 GPUs, on the "a100" partition.
#SBATCH --mem=200GB
#SBATCH --gres=gpu:4
#SBATCH --partition=a100
# Batch-script stdout/stderr files (%x = job name, %j = job ID).
# NOTE(review): the logs/ directory presumably has to exist before submission --
# confirm on the target cluster.
#SBATCH --output=logs/mgpus_%x-%j.out
#SBATCH --error=logs/mgpus_%x-%j.err
# Wall-clock limit in days-hours:minutes:seconds form, i.e. 20 days.
#SBATCH --time=20-00:00:00
# Nodes that must not be part of the allocation.
#SBATCH --exclude=gpu101,gpu113
# Trace every command (-x) and abort on the first unhandled failure (-e).
set -x -e

# Log the sbatch environment so each run's allocation can be reconstructed from
# the job's output file. Every expansion is quoted: SLURM_JOB_NODELIST normally
# looks like "gpu[001-005]", and an unquoted word containing [...] is subject to
# pathname expansion and word splitting.
echo "start time: $(date)"
echo "SLURM_JOBID=${SLURM_JOBID}"
echo "SLURM_JOB_NODELIST=${SLURM_JOB_NODELIST}"
echo "SLURM_JOB_PARTITION=${SLURM_JOB_PARTITION}"
echo "SLURM_NNODES=${SLURM_NNODES}"
echo "SLURM_GPUS_ON_NODE=${SLURM_GPUS_ON_NODE}"
echo "SLURM_SUBMIT_DIR=${SLURM_SUBMIT_DIR}"
# Training setup: derive the distributed-training rendezvous parameters
# (master address and port, node count, world size) from the Slurm environment.

# Fail early with a readable message instead of a cryptic arithmetic syntax
# error further down when the script is run outside of a Slurm GPU allocation.
: "${SLURM_GPUS_ON_NODE:?SLURM_GPUS_ON_NODE is not set (submit this script with sbatch and a GPU request)}"
: "${SLURM_NNODES:?SLURM_NNODES is not set (submit this script with sbatch)}"

GPUS_PER_NODE=${SLURM_GPUS_ON_NODE}

## Master node setup
# The short hostname of the node executing this batch script is used as the
# rendezvous address for all ranks.
MAIN_HOST=$(hostname -s)
export MASTER_ADDR=${MAIN_HOST}

# Get a free port using python: binding to port 0 lets the OS pick an unused
# port, which is read back and printed. The assignment is kept separate from
# `export` so that a failing python aborts the script under `set -e` instead of
# silently exporting an empty MASTER_PORT.
MASTER_PORT=$(python - <<'EOF'
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind(('', 0))  # OS will allocate a free port
free_port = sock.getsockname()[1]
sock.close()
print(free_port)
EOF
)
export MASTER_PORT

export NNODES=${SLURM_NNODES}
# NODE_RANK is not derived from SLURM_PROCID here; the launch loop below passes
# the node rank explicitly through --node_rank.
WORLD_SIZE=$(( GPUS_PER_NODE * NNODES )) # M nodes x N GPUs
echo "nnodes: ${NNODES}"

## Vector's cluster does not support InfiniBand. NCCL_IB_DISABLE=1 stops NCCL
## from using its InfiniBand transport (the variable is an NCCL setting; it has
## no effect on the gloo backend).
export NCCL_IB_DISABLE=1
# One OpenMP thread per process.
export OMP_NUM_THREADS=1
# Verbose NCCL logging, useful when diagnosing multi-node communication.
export NCCL_DEBUG=INFO

# Dump the allocation details once more, right before the launch, together with
# the task count that drives the launch loop.
echo "SLURM_JOBID=${SLURM_JOBID}"
echo "SLURM_JOB_NODELIST=${SLURM_JOB_NODELIST}"
echo "SLURM_JOB_PARTITION=${SLURM_JOB_PARTITION}"
echo "SLURM_NNODES=${SLURM_NNODES}"
echo "SLURM_GPUS_ON_NODE=${SLURM_GPUS_ON_NODE}"
echo "SLURM_SUBMIT_DIR=${SLURM_SUBMIT_DIR}"
echo "SLURM_NTASKS=${SLURM_NTASKS}"
# Launch one training process per task, each as its own single-node job step.
#
# The whole python command line is passed to `bash -c` as ONE double-quoted
# string. The opening quote was missing before, which left the script with an
# unterminated string. ${WORLD_SIZE}, ${i}, ${MASTER_ADDR} and ${MASTER_PORT}
# are expanded here, in the batch shell, before srun ships the command to the
# node.
#
# srun options:
#   --label               prefix each output line with the task number
#   --nodes=1 --ntasks=1  one task on exactly one node per step
#   --relative=<i>        run the step on node <i> of the allocation
#   --mem / --gres        per-step resources, mirroring the batch request
#   --cpus-per-task       give the step every CPU Slurm reports for the node
: "${SLURM_NTASKS:?SLURM_NTASKS is not set (submit this script with sbatch)}"

pids=()
for (( i = 0; i < SLURM_NTASKS; ++i )); do
  /opt/slurm/bin/srun --label --nodes=1 --ntasks=1 --relative="${i}" \
    --mem=200G --gres=gpu:4 --cpus-per-task="${SLURM_CPUS_ON_NODE}" \
    bash -c "python train_multi_gpus.py \
      -task_name MedSAM-ViT-B-2GPUs_6_7 \
      -work_dir ./work_dir \
      -batch_size 8 \
      -num_workers 8 \
      --world_size ${WORLD_SIZE} \
      --bucket_cap_mb 25 \
      --grad_acc_steps 1 \
      --node_rank ${i} \
      --init_method tcp://${MASTER_ADDR}:${MASTER_PORT}" \
    >> "./logs/log_for_${SLURM_JOB_ID}_node_${i}.log" 2>&1 &
  pids+=("$!")
done

## Wait for the tasks on nodes to finish.
# Each step is waited for individually so that its exit status is collected; a
# bare `wait` always succeeds, and background failures do not trigger `set -e`.
rc=0
for rank in "${!pids[@]}"; do
  step_rc=0
  wait "${pids[rank]}" || step_rc=$?
  if (( step_rc != 0 )); then
    echo "training step for node rank ${rank} exited with status ${step_rc}" >&2
    rc=${step_rc}
  fi
done

echo "END TIME: $(date)"
# Propagate a failure so the job is not recorded as successful when a rank died.
exit "${rc}"