-
Notifications
You must be signed in to change notification settings - Fork 0
/
config_DGX1_multi.sh
executable file
·42 lines (36 loc) · 1.48 KB
/
config_DGX1_multi.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
## DL params
OPTIMIZER="sgdwfastlars"
BATCHSIZE="52"
KVSTORE="horovod"
LR="38"
WARMUP_EPOCHS="25"
EVAL_OFFSET="1" # Targeting epoch no 85
EVAL_PERIOD="4"
WD="0.0002"
LARSETA="0.001"
LABELSMOOTHING="0.1"
LRSCHED="pow2"
NUMEPOCHS="90"
NETWORK="resnet-v1b-normconv-fl"
export MXNET_CUDNN_SUPPLY_NORMCONV_CONSTANTS=1
DALI_PREFETCH_QUEUE="3"
DALI_NVJPEG_MEMPADDING="256"
DALI_CACHE_SIZE=6144
## Environment variables for multi node runs
export HOROVOD_CYCLE_TIME=0.1
export HOROVOD_FUSION_THRESHOLD=67108864
export HOROVOD_NUM_STREAMS=2
export MXNET_HOROVOD_NUM_GROUPS=16
export NHWC_BATCHNORM_LAUNCH_MARGIN=64
export MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD=999
export MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD=25
## System run parms
DGXNNODES=130
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=08:00:00
## System config params
DGXNGPU=8
DGXSOCKETCORES=20
DGXHT=2 # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'