Updates for running on Perlmutter #3

Open · wants to merge 8 commits into master
6 changes: 3 additions & 3 deletions configs/alexnet.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/alexnet
 
 data_config:
-  name: dummy
-  n_train: 16384
-  n_valid: 16384
+  name: random
+  n_train: 32768
+  n_valid: 32768
   input_shape: [3, 224, 224]
   n_classes: 1000
   batch_size: 128
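For context on these config edits: the data_config block is handed to the dataset factory in datasets/__init__.py, so name: random selects the new datasets/random.py module and n_train/n_valid set the logical dataset sizes. A minimal sketch of that flow, assuming get_data_loaders forwards the remaining keyword arguments to the dataset module the same way get_datasets does (the exact wiring in train.py may differ):

import yaml
from datasets import get_data_loaders

# Hypothetical illustration: load a benchmark config and build loaders
# from its data_config section.
with open('configs/alexnet.yaml') as f:
    config = yaml.safe_load(f)

train_loader, valid_loader = get_data_loaders(distributed=False,
                                              **config['data_config'])
print(len(train_loader.dataset))  # n_train logical samples, e.g. 32768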
6 changes: 3 additions & 3 deletions configs/cnn3d.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/cnn3d
 
 data_config:
-  name: dummy
-  n_train: 8192
-  n_valid: 8192
+  name: random
+  n_train: 32768
+  n_valid: 32768
   input_shape: [1, 64, 64, 64]
   n_classes: 4
   batch_size: 128
2 changes: 1 addition & 1 deletion configs/dcgan.yaml
@@ -2,7 +2,7 @@ trainer: gan
 output_dir: results/dcgan
 
 data_config:
-  name: dummy
+  name: random
   n_train: 65536
   n_valid: 65536
   input_shape: [3, 64, 64]
6 changes: 3 additions & 3 deletions configs/inceptionV3.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/inceptionV3
 
 data_config:
-  name: dummy
-  n_train: 4096
-  n_valid: 4096
+  name: random
+  n_train: 32768
+  n_valid: 32768
   input_shape: [3, 299, 299]
   n_classes: 1000
   batch_size: 128
6 changes: 3 additions & 3 deletions configs/lstm.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/lstm
 
 data_config:
-  name: dummy
-  n_train: 32768
-  n_valid: 32768
+  name: random
+  n_train: 131072
+  n_valid: 131072
   input_shape: [64, 512] # (seq_len, input_size)
   n_classes: 4
   batch_size: 128
6 changes: 3 additions & 3 deletions configs/resnet50.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/resnet50
 
 data_config:
-  name: dummy
-  n_train: 4096
-  n_valid: 4096
+  name: random
+  n_train: 32768
+  n_valid: 32768
   input_shape: [3, 224, 224]
   n_classes: 1000
   batch_size: 128
6 changes: 3 additions & 3 deletions configs/transformer.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/transformer
 
 data_config:
-  name: dummy
-  n_train: 8192
-  n_valid: 8192
+  name: random
+  n_train: 32768
+  n_valid: 32768
   input_shape: [512]
   target_shape: [512]
   input_type: 'label'
6 changes: 3 additions & 3 deletions configs/vgg11.yaml
@@ -2,9 +2,9 @@ trainer: generic
 output_dir: results/vgg11
 
 data_config:
-  name: dummy
-  n_train: 4096
-  n_valid: 4096
+  name: random
+  n_train: 32768
+  n_valid: 32768
   input_shape: [3, 224, 224]
   n_classes: 1000
   batch_size: 128
22 changes: 5 additions & 17 deletions datasets/__init__.py
@@ -2,27 +2,15 @@
 PyTorch dataset specifications.
 """
 
+import importlib
+
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 
 def get_datasets(name, **data_args):
-    if name == 'dummy':
-        from .dummy import get_datasets
-        return get_datasets(**data_args)
-    elif name == 'mnist':
-        from .mnist import get_datasets
-        return get_datasets(**data_args)
-    elif name == 'cifar10':
-        from .cifar10 import get_datasets
-        return get_datasets(**data_args)
-    elif name == 'hep_images':
-        from .hep_images import get_datasets
-        return get_datasets(**data_args)
-    elif name == 'rpv_images':
-        from .rpv_images import get_datasets
-        return get_datasets(**data_args)
-    else:
-        raise Exception('Dataset %s unknown' % name)
+    """Factory function for importing datasets from local modules"""
+    module = importlib.import_module('.' + name, 'datasets')
+    return module.get_datasets(**data_args)
 
 def get_data_loaders(name, batch_size, distributed=False,
                      use_dist_sampler_train=True,
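The rewritten factory resolves a dataset by module name instead of a hard-coded if/elif chain, so adding a dataset only requires dropping a module with a get_datasets function into the datasets package. A minimal sketch of the equivalent lookup, with illustrative sizes:

import importlib

# 'random' in a config's data_config.name maps to datasets/random.py;
# this mirrors what get_datasets('random', ...) does internally.
module = importlib.import_module('.random', 'datasets')
train_ds, valid_ds = module.get_datasets(n_train=1024, n_valid=256,
                                         input_shape=[3, 224, 224],
                                         n_classes=1000)
print(len(train_ds), len(valid_ds))  # 1024 256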
46 changes: 46 additions & 0 deletions datasets/random.py
@@ -0,0 +1,46 @@
+"""
+This module contains a PyTorch random synthetic dataset implementation.
+"""
+
+import torch
+
+def _make_tensor(shape, data_type, n_classes=None):
+    if data_type == 'label':
+        return torch.randint(n_classes, shape, dtype=torch.long)
+    elif data_type == 'randn':
+        return torch.randn(shape)
+    else:
+        raise ValueError(f'Unsupported data_type {data_type}')
+
+class PregeneratedRandomDataset(torch.utils.data.Dataset):
+    """Random number synthetic dataset.
+
+    Pre-generates a specified number of samples to draw from.
+    """
+
+    def __init__(self, n, input_shape, target_shape=[], input_type='randn',
+                 target_type='label', n_classes=None, n_gen=1024):
+        self.n = n
+        x = _make_tensor(shape=[n_gen] + input_shape,
+                         data_type=input_type, n_classes=n_classes)
+        if target_shape is None:
+            self.data = torch.utils.data.TensorDataset(x)
+        else:
+            y = _make_tensor(shape=[n_gen] + target_shape,
+                             data_type=target_type, n_classes=n_classes)
+            self.data = torch.utils.data.TensorDataset(x, y)
+
+    def __len__(self):
+        return self.n
+
+    def __getitem__(self, index):
+        return self.data[index % len(self.data)]
+
+def get_datasets(n_train, n_valid, **kwargs):
+    """Construct and return random number datasets"""
+    #initial_seed = torch.initial_seed()
+    #torch.manual_seed(0)
+    train_dataset = PregeneratedRandomDataset(n=n_train, **kwargs)
+    valid_dataset = PregeneratedRandomDataset(n=n_valid, **kwargs)
+    #torch.manual_seed(initial_seed & ((1<<63)-1)) # suppressing overflow error
+    return train_dataset, valid_dataset
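A quick usage sketch of the new dataset with a plain DataLoader; sizes and shapes here are illustrative only. Only n_gen distinct samples are materialized and __getitem__ wraps around them, which keeps memory bounded while still reporting n samples per epoch:

from torch.utils.data import DataLoader
from datasets.random import PregeneratedRandomDataset

# 8192 logical samples drawn (with wrap-around) from 1024 pregenerated ones.
dataset = PregeneratedRandomDataset(n=8192, input_shape=[3, 224, 224],
                                    n_classes=1000, n_gen=1024)
loader = DataLoader(dataset, batch_size=128, shuffle=True)

x, y = next(iter(loader))
print(x.shape, y.shape)  # torch.Size([128, 3, 224, 224]) torch.Size([128])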
75 changes: 75 additions & 0 deletions scripts/run.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#SBATCH -C gpu
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-task=1
+#SBATCH -A nstaff_g
+#SBATCH -d singleton
+#SBATCH -c 32
+#SBATCH -t 30
+#SBATCH -J pytorch-bm-gpu
+#SBATCH -o logs/%x-%j.out
+
+set -e
+
+# Options
+version=1.9.0
+backend=nccl
+models="alexnet resnet50 lstm cnn3d transformer"
+clean=false
+usage="$0 --version VERSION --backend BACKEND --models \"MODELS ...\" --clean CLEAN"
+
+# Parse command line options
+while (( "$#" )); do
+    case "$1" in
+        --version)
+            version=$2
+            shift 2
+            ;;
+        --backend)
+            backend=$2
+            shift 2
+            ;;
+        --models)
+            models=$2
+            shift 2
+            ;;
+        --clean)
+            clean=$2
+            shift 2
+            ;;
+        *)
+            echo "Usage: $usage"
+            exit 1
+            ;;
+    esac
+done
+
+# Configuration
+export BENCHMARK_RESULTS_PATH=$SCRATCH/pytorch-benchmarks/results/gpu-$version-$backend-n$SLURM_NTASKS
+if $clean; then
+    [ -d $BENCHMARK_RESULTS_PATH ] && rm -rf $BENCHMARK_RESULTS_PATH
+fi
+
+# Print settings
+echo "Running PyTorch benchmarks with"
+echo "version $version"
+echo "backend $backend"
+echo "models $models"
+echo "clean $clean"
+echo "writing outputs to $BENCHMARK_RESULTS_PATH"
+
+# Load software
+module load pytorch/$version
+module list
+#export NCCL_DEBUG=INFO
+
+# Run each model
+for model in $models; do
+    echo "running $model"
+    srun -l -u python train.py configs/${model}.yaml -d $backend --rank-gpu \
+        --output-dir $BENCHMARK_RESULTS_PATH/$model \
+        --ranks-per-node $SLURM_NTASKS_PER_NODE
+done
+
+echo "Collecting benchmark results..."
+python parse.py $BENCHMARK_RESULTS_PATH -o $BENCHMARK_RESULTS_PATH/results.txt
12 changes: 12 additions & 0 deletions scripts/submit_all.sh
@@ -2,6 +2,18 @@
 
 # Launch all benchmark runs for this version
 
+# Scaling on Perlmutter
+sbatch -n 1 scripts/run.sh
+sbatch -n 2 scripts/run.sh
+sbatch -n 4 scripts/run.sh
+sbatch -n 8 scripts/run.sh
+sbatch -n 16 scripts/run.sh
+sbatch -n 32 scripts/run.sh
+sbatch -n 64 scripts/run.sh
+sbatch -n 128 scripts/run.sh
+sbatch -n 256 scripts/run.sh
+sbatch -n 512 scripts/run.sh
+
 # Scaling on Haswell
 sbatch -N 1 scripts/run_hsw.sh
 sbatch -N 2 scripts/run_hsw.sh
1 change: 0 additions & 1 deletion train.py
@@ -10,7 +10,6 @@
 # Externals
 import yaml
 import numpy as np
-import torch.distributed as dist
 
 # Locals
 from datasets import get_data_loaders
14 changes: 11 additions & 3 deletions utils/distributed.py
@@ -27,11 +27,19 @@ def init_workers_nccl_file():
     rank = int(os.environ['SLURM_PROCID'])
     n_ranks = int(os.environ['SLURM_NTASKS'])
     sync_file = _get_sync_file()
-    print('Setting up with sync file', sync_file)
     dist.init_process_group(backend='nccl', world_size=n_ranks, rank=rank,
                             init_method=sync_file)
     return rank, n_ranks
 
+def init_workers_slurm(backend='nccl', port='29507'):
+    """Initialize workers with NCCL backend and SLURM"""
+    rank = int(os.environ['SLURM_PROCID'])
+    n_ranks = int(os.environ['SLURM_NTASKS'])
+    os.environ['MASTER_ADDR'] = os.environ['SLURM_LAUNCH_NODE_IPADDR']
+    os.environ['MASTER_PORT'] = port
+    dist.init_process_group(backend=backend, world_size=n_ranks, rank=rank)
+    return rank, n_ranks
+
 def init_workers_mpi():
     """Initialize workers with MPI backend"""
     dist.init_process_group(backend='mpi')
@@ -54,7 +62,7 @@ def init_workers(backend=None):
     elif backend == 'mpi':
         rank, n_ranks = init_workers_mpi()
     elif backend == 'nccl':
-        rank, n_ranks = init_workers_nccl_file()
+        rank, n_ranks = init_workers_slurm(backend=backend)
     elif backend == 'gloo':
-        rank, n_ranks = init_workers_gloo_file()
+        rank, n_ranks = init_workers_slurm(backend=backend)
     return rank, n_ranks
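init_workers_slurm derives the rank and world size from SLURM's environment and points MASTER_ADDR at the launch node, so no shared sync file is needed. A single-process smoke test sketch, faking the SLURM variables locally (assumes the utils package is on the import path; a real run gets these variables from srun):

import os
import torch.distributed as dist
from utils.distributed import init_workers

# Fake a one-task SLURM allocation so the SLURM-based init can run locally.
os.environ['SLURM_PROCID'] = '0'
os.environ['SLURM_NTASKS'] = '1'
os.environ['SLURM_LAUNCH_NODE_IPADDR'] = '127.0.0.1'

rank, n_ranks = init_workers(backend='gloo')  # dispatches to init_workers_slurm
print(rank, n_ranks)  # 0 1
dist.destroy_process_group()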
3 changes: 2 additions & 1 deletion utils/logging.py
@@ -16,4 +16,5 @@ def config_logging(verbose, log_file=None):
         file_handler = logging.FileHandler(log_file, mode='w')
         file_handler.setLevel(log_level)
         handlers.append(file_handler)
-    logging.basicConfig(level=log_level, format=log_format, handlers=handlers)
+    logging.basicConfig(level=log_level, format=log_format, handlers=handlers,
+                        force=True)
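The force=True flag (available since Python 3.8) makes basicConfig remove any handlers already attached to the root logger before installing the new ones, so a second call to config_logging actually reconfigures logging instead of being silently ignored. A small sketch, assuming the module is importable as utils.logging:

import logging
from utils.logging import config_logging

# Without force=True the second call would be a no-op because the root
# logger already has handlers from the first call.
config_logging(verbose=False)
config_logging(verbose=True, log_file='out.log')
logging.debug('visible on stdout and in out.log after reconfiguration')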