From 78a1c56eade076c534404436a76cca4fda9ffd11 Mon Sep 17 00:00:00 2001
From: Steve Farrell
Date: Fri, 16 Jul 2021 16:17:16 -0700
Subject: [PATCH 1/8] update distributed init to use slurm env, not sync file

---
 train.py             |  1 -
 utils/distributed.py | 14 +++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/train.py b/train.py
index aa8e698..a305d7a 100644
--- a/train.py
+++ b/train.py
@@ -10,7 +10,6 @@
 # Externals
 import yaml
 import numpy as np
-import torch.distributed as dist
 
 # Locals
 from datasets import get_data_loaders
diff --git a/utils/distributed.py b/utils/distributed.py
index 9a1d876..b30c16a 100644
--- a/utils/distributed.py
+++ b/utils/distributed.py
@@ -27,11 +27,19 @@ def init_workers_nccl_file():
     rank = int(os.environ['SLURM_PROCID'])
     n_ranks = int(os.environ['SLURM_NTASKS'])
     sync_file = _get_sync_file()
-    print('Setting up with sync file', sync_file)
     dist.init_process_group(backend='nccl', world_size=n_ranks,
                             rank=rank, init_method=sync_file)
     return rank, n_ranks
 
+def init_workers_slurm(backend='nccl', port='29507'):
+    """Initialize workers with the requested backend using SLURM environment variables"""
+    rank = int(os.environ['SLURM_PROCID'])
+    n_ranks = int(os.environ['SLURM_NTASKS'])
+    os.environ['MASTER_ADDR'] = os.environ['SLURM_LAUNCH_NODE_IPADDR']
+    os.environ['MASTER_PORT'] = port
+    dist.init_process_group(backend=backend, world_size=n_ranks, rank=rank)
+    return rank, n_ranks
+
 def init_workers_mpi():
     """Initialize workers with MPI backend"""
     dist.init_process_group(backend='mpi')
@@ -54,7 +62,7 @@ def init_workers(backend=None):
     elif backend == 'mpi':
         rank, n_ranks = init_workers_mpi()
     elif backend == 'nccl':
-        rank, n_ranks = init_workers_nccl_file()
+        rank, n_ranks = init_workers_slurm(backend=backend)
     elif backend == 'gloo':
-        rank, n_ranks = init_workers_gloo_file()
+        rank, n_ranks = init_workers_slurm(backend=backend)
     return rank, n_ranks

From ce6e735638220d90214081264a087931a429c69d Mon Sep 17 00:00:00 2001
From: Steve Farrell
Date: Fri, 16 Jul 2021 16:17:37 -0700
Subject: [PATCH 2/8] python logging fix

---
 utils/logging.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/logging.py b/utils/logging.py
index 5df8f02..8143b7a 100644
--- a/utils/logging.py
+++ b/utils/logging.py
@@ -16,4 +16,5 @@ def config_logging(verbose, log_file=None):
         file_handler = logging.FileHandler(log_file, mode='w')
         file_handler.setLevel(log_level)
         handlers.append(file_handler)
-    logging.basicConfig(level=log_level, format=log_format, handlers=handlers)
+    logging.basicConfig(level=log_level, format=log_format, handlers=handlers,
+                        force=True)

From c7f0e38943ac91d7fe6de8dc7ed4a8f18a3c4277 Mon Sep 17 00:00:00 2001
From: Steve Farrell
Date: Fri, 16 Jul 2021 16:17:52 -0700
Subject: [PATCH 3/8] adding submit commands for perlmutter

---
 scripts/submit_all.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/submit_all.sh b/scripts/submit_all.sh
index bf46afb..8652438 100755
--- a/scripts/submit_all.sh
+++ b/scripts/submit_all.sh
@@ -2,6 +2,18 @@
 
 # Launch all benchmark runs for this version
 
+# Scaling on Perlmutter
+sbatch -n 1 scripts/run.sh
+sbatch -n 2 scripts/run.sh
+sbatch -n 4 scripts/run.sh
+sbatch -n 8 scripts/run.sh
+sbatch -n 16 scripts/run.sh
+sbatch -n 32 scripts/run.sh
+sbatch -n 64 scripts/run.sh
+sbatch -n 128 scripts/run.sh
+sbatch -n 256 scripts/run.sh
+sbatch -n 512 scripts/run.sh
+
 # Scaling on Haswell
 sbatch -N 1 scripts/run_hsw.sh
 sbatch -N 2 scripts/run_hsw.sh
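A note on patch 2: the fix relies on the `force` argument that logging.basicConfig gained in Python 3.8. Without it, any call after the first is silently ignored once the root logger already has handlers, so reconfiguring logging within one process never takes effect. A minimal standalone sketch of the behavior (not code from this repo):

    import logging

    logging.basicConfig(level=logging.INFO, format='first: %(message)s')
    # Without force=True this second call would be a no-op, since the root
    # logger already has a handler; force=True removes and closes the old
    # handlers so the new format, level, and handlers take effect.
    logging.basicConfig(level=logging.INFO, format='second: %(message)s',
                        force=True)
    logging.info('hello')  # emitted as "second: hello"
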
From 438e2afa258b7d5144c1f5bdbd46ee3dc8404c61 Mon Sep 17 00:00:00 2001
From: Steve Farrell
Date: Fri, 16 Jul 2021 16:20:32 -0700
Subject: [PATCH 4/8] add run script for perlmutter

---
 scripts/run.sh | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100755 scripts/run.sh

diff --git a/scripts/run.sh b/scripts/run.sh
new file mode 100755
index 0000000..82c7359
--- /dev/null
+++ b/scripts/run.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#SBATCH -C gpu
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-task=1
+#SBATCH --exclusive
+#SBATCH -d singleton
+#SBATCH -c 32
+#SBATCH -t 30
+#SBATCH -J pytorch-bm-gpu
+#SBATCH -o logs/%x-%j.out
+
+set -e
+
+# Options
+version=1.8.0
+backend=nccl
+models="alexnet resnet50 lstm cnn3d transformer"
+clean=false
+usage="$0 --version VERSION --backend BACKEND --models \"MODELS ...\" --clean CLEAN"
+
+# Parse command line options
+while (( "$#" )); do
+    case "$1" in
+        --version)
+            version=$2
+            shift 2
+            ;;
+        --backend)
+            backend=$2
+            shift 2
+            ;;
+        --models)
+            models=$2
+            shift 2
+            ;;
+        --clean)
+            clean=$2
+            shift 2
+            ;;
+        *)
+            echo "Usage: $usage"
+            exit 1
+            ;;
+    esac
+done
+
+# Configuration
+export BENCHMARK_RESULTS_PATH=$SCRATCH/pytorch-benchmarks/results/gpu-$version-$backend-n$SLURM_NTASKS
+if $clean; then
+    [ -d $BENCHMARK_RESULTS_PATH ] && rm -rf $BENCHMARK_RESULTS_PATH
+fi
+
+# Print settings
+echo "Running PyTorch benchmarks with"
+echo "version $version"
+echo "backend $backend"
+echo "models $models"
+echo "clean $clean"
+echo "writing outputs to $BENCHMARK_RESULTS_PATH"
+
+# Load software
+module load pytorch/$version
+module list
+#export NCCL_DEBUG=INFO
+
+# Run each model
+for model in $models; do
+    echo "running $model"
+    srun -l -u python train.py configs/${model}.yaml -d $backend --rank-gpu \
+        --output-dir $BENCHMARK_RESULTS_PATH/$model \
+        --ranks-per-node $SLURM_NTASKS_PER_NODE
+done
+
+echo "Collecting benchmark results..."
+python parse.py $BENCHMARK_RESULTS_PATH -o $BENCHMARK_RESULTS_PATH/results.txt

From 5d846b639de86965873a3d9db6cb2b3b8570d8f5 Mon Sep 17 00:00:00 2001
From: Steve Farrell
Date: Mon, 25 Oct 2021 13:19:42 -0700
Subject: [PATCH 5/8] update PM run script

---
 scripts/run.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/run.sh b/scripts/run.sh
index 82c7359..864166b 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -2,7 +2,7 @@
 #SBATCH -C gpu
 #SBATCH --ntasks-per-node=4
 #SBATCH --gpus-per-task=1
-#SBATCH --exclusive
+#SBATCH -A nstaff_g
 #SBATCH -d singleton
 #SBATCH -c 32
 #SBATCH -t 30
@@ -12,7 +12,7 @@
 set -e
 
 # Options
-version=1.8.0
+version=1.9.0
 backend=nccl
 models="alexnet resnet50 lstm cnn3d transformer"
 clean=false
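A note on patches 4 and 5: run.sh launches one process per GPU (four per node) via srun, and each process initializes torch.distributed through the init_workers path from patch 1, with the backend taken from the -d flag. A hedged sketch of the per-rank setup this implies inside train.py; the --rank-gpu device pinning shown here is illustrative, not taken verbatim from the repo:

    import os
    import torch
    from utils.distributed import init_workers

    # Each of the SLURM_NTASKS processes runs this; the backend comes
    # from the -d flag passed by run.sh.
    rank, n_ranks = init_workers(backend='nccl')
    # SLURM_LOCALID is the node-local task index (0-3 here, given
    # --ntasks-per-node=4), a natural choice for selecting a GPU.
    torch.cuda.set_device(int(os.environ['SLURM_LOCALID']))
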
""" +import importlib + from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler def get_datasets(name, **data_args): - if name == 'dummy': - from .dummy import get_datasets - return get_datasets(**data_args) - elif name == 'mnist': - from .mnist import get_datasets - return get_datasets(**data_args) - elif name == 'cifar10': - from .cifar10 import get_datasets - return get_datasets(**data_args) - elif name == 'hep_images': - from .hep_images import get_datasets - return get_datasets(**data_args) - elif name == 'rpv_images': - from .rpv_images import get_datasets - return get_datasets(**data_args) - else: - raise Exception('Dataset %s unknown' % name) + """Factory function for importing datasets from local modules""" + module = importlib.import_module('.' + name, 'datasets') + return module.get_datasets(**data_args) def get_data_loaders(name, batch_size, distributed=False, use_dist_sampler_train=True, From dc33c07ef8f2bbeb0691fd7518058c229f66a675 Mon Sep 17 00:00:00 2001 From: Steve Farrell Date: Wed, 27 Oct 2021 22:00:34 -0700 Subject: [PATCH 7/8] Add new random number dataset This one allows to specify a number of pregenerated random data samples from which the training set of arbitrary size will be sampled. --- datasets/random.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 datasets/random.py diff --git a/datasets/random.py b/datasets/random.py new file mode 100644 index 0000000..4680f29 --- /dev/null +++ b/datasets/random.py @@ -0,0 +1,46 @@ +""" +This module contains a PyTorch random synthetic dataset implementation. +""" + +import torch + +def _make_tensor(shape, data_type, n_classes=None): + if data_type == 'label': + return torch.randint(n_classes, shape, dtype=torch.long) + elif data_type == 'randn': + return torch.randn(shape) + else: + raise ValueError(f'Unsupported data_type {data_type}') + +class PregeneratedRandomDataset(torch.utils.data.Dataset): + """Random number synthetic dataset. + + Pre-generates a specified number of samples to draw from. + """ + + def __init__(self, n, input_shape, target_shape=[], input_type='randn', + target_type='label', n_classes=None, n_gen=1024): + self.n = n + x = _make_tensor(shape=[n_gen] + input_shape, + data_type=input_type, n_classes=n_classes) + if target_shape is None: + self.data = torch.utils.data.TensorDataset(x) + else: + y = _make_tensor(shape=[n_gen] + target_shape, + data_type=target_type, n_classes=n_classes) + self.data = torch.utils.data.TensorDataset(x, y) + + def __len__(self): + return self.n + + def __getitem__(self, index): + return self.data[index % len(self.data)] + +def get_datasets(n_train, n_valid, **kwargs): + """Construct and return random number datasets""" + #initial_seed = torch.initial_seed() + #torch.manual_seed(0) + train_dataset = PregeneratedRandomDataset(n=n_train, **kwargs) + valid_dataset = PregeneratedRandomDataset(n=n_valid, **kwargs) + #torch.manual_seed(initial_seed & ((1<<63)-1)) # suppressing overflow error + return train_dataset, valid_dataset From 24477f0a2734992a5dd65848ddace307f912ad10 Mon Sep 17 00:00:00 2001 From: Steve Farrell Date: Wed, 27 Oct 2021 22:02:20 -0700 Subject: [PATCH 8/8] Update configs for new larger random datasets Move to new 'random' dataset, and increase training and validation set sizes for all benchmarks. 
From 24477f0a2734992a5dd65848ddace307f912ad10 Mon Sep 17 00:00:00 2001
From: Steve Farrell
Date: Wed, 27 Oct 2021 22:02:20 -0700
Subject: [PATCH 8/8] Update configs for new larger random datasets

Move to new 'random' dataset, and increase training and validation set
sizes for all benchmarks.
---
 configs/alexnet.yaml     | 6 +++---
 configs/cnn3d.yaml       | 6 +++---
 configs/dcgan.yaml       | 2 +-
 configs/inceptionV3.yaml | 6 +++---
 configs/lstm.yaml        | 6 +++---
 configs/resnet50.yaml    | 6 +++---
 configs/transformer.yaml | 6 +++---
 configs/vgg11.yaml       | 6 +++---
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/configs/alexnet.yaml b/configs/alexnet.yaml
index 236727c..25dd3b2 100644
--- a/configs/alexnet.yaml
+++ b/configs/alexnet.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/alexnet
 
 data_config:
-    name: dummy
-    n_train: 16384
-    n_valid: 16384
+    name: random
+    n_train: 32768
+    n_valid: 32768
     input_shape: [3, 224, 224]
     n_classes: 1000
     batch_size: 128
diff --git a/configs/cnn3d.yaml b/configs/cnn3d.yaml
index b8325de..09e6f41 100644
--- a/configs/cnn3d.yaml
+++ b/configs/cnn3d.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/cnn3d
 
 data_config:
-    name: dummy
-    n_train: 8192
-    n_valid: 8192
+    name: random
+    n_train: 32768
+    n_valid: 32768
     input_shape: [1, 64, 64, 64]
     n_classes: 4
     batch_size: 128
diff --git a/configs/dcgan.yaml b/configs/dcgan.yaml
index ba91c2d..ea3e8a0 100644
--- a/configs/dcgan.yaml
+++ b/configs/dcgan.yaml
@@ -2,7 +2,7 @@ trainer: gan
 output_dir: results/dcgan
 
 data_config:
-    name: dummy
+    name: random
     n_train: 65536
     n_valid: 65536
     input_shape: [3, 64, 64]
diff --git a/configs/inceptionV3.yaml b/configs/inceptionV3.yaml
index 2dc5ae7..eebe17c 100644
--- a/configs/inceptionV3.yaml
+++ b/configs/inceptionV3.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/inceptionV3
 
 data_config:
-    name: dummy
-    n_train: 4096
-    n_valid: 4096
+    name: random
+    n_train: 32768
+    n_valid: 32768
     input_shape: [3, 299, 299]
     n_classes: 1000
     batch_size: 128
diff --git a/configs/lstm.yaml b/configs/lstm.yaml
index 4b5ea6d..0d4c871 100644
--- a/configs/lstm.yaml
+++ b/configs/lstm.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/lstm
 
 data_config:
-    name: dummy
-    n_train: 32768
-    n_valid: 32768
+    name: random
+    n_train: 131072
+    n_valid: 131072
     input_shape: [64, 512] # (seq_len, input_size)
     n_classes: 4
     batch_size: 128
diff --git a/configs/resnet50.yaml b/configs/resnet50.yaml
index cbd8f68..eb0107c 100644
--- a/configs/resnet50.yaml
+++ b/configs/resnet50.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/resnet50
 
 data_config:
-    name: dummy
-    n_train: 4096
-    n_valid: 4096
+    name: random
+    n_train: 32768
+    n_valid: 32768
     input_shape: [3, 224, 224]
     n_classes: 1000
     batch_size: 128
diff --git a/configs/transformer.yaml b/configs/transformer.yaml
index ab941ee..008b0cf 100644
--- a/configs/transformer.yaml
+++ b/configs/transformer.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/transformer
 
 data_config:
-    name: dummy
-    n_train: 8192
-    n_valid: 8192
+    name: random
+    n_train: 32768
+    n_valid: 32768
     input_shape: [512]
     target_shape: [512]
     input_type: 'label'
diff --git a/configs/vgg11.yaml b/configs/vgg11.yaml
index 394312b..2d209bc 100644
--- a/configs/vgg11.yaml
+++ b/configs/vgg11.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/vgg11
 
 data_config:
-    name: dummy
-    n_train: 4096
-    n_valid: 4096
+    name: random
+    n_train: 32768
+    n_valid: 32768
     input_shape: [3, 224, 224]
     n_classes: 1000
     batch_size: 128
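
A note on patch 8: apart from name, batch_size, and the loader options consumed by get_data_loaders, each data_config block is forwarded as keyword arguments to datasets.random.get_datasets. For the transformer benchmark both inputs and targets are integer sequences; a sketch of the resulting tensors (the n_classes value here is illustrative, since it sits outside the visible hunk):

    import torch
    from datasets.random import get_datasets

    train, valid = get_datasets(n_train=32768, n_valid=32768,
                                input_shape=[512], target_shape=[512],
                                input_type='label', target_type='label',
                                n_classes=1000)
    x, y = train[0]
    assert x.shape == (512,) and x.dtype == torch.long  # token ids
    assert y.shape == (512,) and y.dtype == torch.long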