diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 31941bc..c1b81e4 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -29,4 +29,4 @@ sphinx:
 # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
 python:
   install:
-    - requirements: ci_requirements.txt
+    - requirements: doc/requirements.txt
diff --git a/README.md b/README.md
index 87cfc21..4f992df 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ pip install git+https://github.com/tangentlabs/django-oscar-paypal.git@issue/34/
 ```
 
 ## Minimal Example
-[examples/barebone_mnist.py](example/barebone_mnist.py) features a minimal and barebone example on how to distributely train MNIST.
+See [examples/barebone_mnist.py](https://github.com/sehoffmann/dmlcloud/blob/develop/examples/barebone_mnist.py) for a minimal, barebones example of how to train MNIST in a distributed fashion.
 To run it on a single node with 4 GPUs, use
 ```
 dmlrun -n 4 examples/barebone_mnist.py
diff --git a/dmlcloud/run.py b/dmlcloud/run.py
index 0f56aec..ac119b2 100644
--- a/dmlcloud/run.py
+++ b/dmlcloud/run.py
@@ -23,15 +23,20 @@
 import argparse
 import os
 
+
 def main():
-    description = ('dmlrun is a thin wrapper around torch.distributed.launch that provides a more user-friendly interface.\n\n'
-                   'While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. dmlrun aims to make it easier to launch distributed training jobs on a single node.'
-                   'For serious mulit-node training, we recommend using srun or torchrun directly.')
-    epilog = ('Example:\n'
-              '  dmlrun --gpus 3,7 train.py\n'
-              '  dmlrun --num-gpus 2 train.py --batch-size 64')
-    parser = argparse.ArgumentParser(prog='dmlrun', description=description, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('--gpus', '-g', help='Comma-seperated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.')
+    description = (
+        'dmlrun is a thin wrapper around torch.distributed.launch that provides a more user-friendly interface.\n\n'
+        'While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. dmlrun aims to make it easier to launch distributed training jobs on a single node. '
+        'For serious multi-node training, we recommend using srun or torchrun directly.'
+    )
+    epilog = 'Example:\n' '  dmlrun --gpus 3,7 train.py\n' '  dmlrun --num-gpus 2 train.py --batch-size 64'
+    parser = argparse.ArgumentParser(
+        prog='dmlrun', description=description, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        '--gpus', '-g', help='Comma-separated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.'
+    )
     parser.add_argument('--nprocs', '-n', type=int, help='Number of GPUs to use for training.')
     parser.add_argument('script', type=str, help='Path to the script to run.')
     parser.add_argument('args', nargs=argparse.REMAINDER, help='Arguments to pass to the script.')
@@ -41,7 +46,6 @@ def main():
 
     if args.gpus and args.num_gpus:
         raise ValueError('Only one of --gpus or --num-gpus can be specified.')
-
     if args.gpus:
         ids = args.gpus.split(',')
         if not all(id.isdigit() for id in ids):
@@ -67,4 +71,4 @@
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/doc/requirements.txt b/doc/requirements.txt
new file mode 100644
index 0000000..e7e6280
--- /dev/null
+++ b/doc/requirements.txt
@@ -0,0 +1,3 @@
+-r ../ci_requirements.txt
+-e ./
+-r ../requirements.txt