diff --git a/doc/scheduler-job-examples.tex b/doc/scheduler-job-examples.tex
index 7b8ebc6..2a50b3a 100644
--- a/doc/scheduler-job-examples.tex
+++ b/doc/scheduler-job-examples.tex
@@ -264,6 +264,25 @@ \subsection{Scheduling On The GPU Nodes}
 
 %And that there are no more GPUs available on that node (\texttt{hc:gpu=0}).
 %Note that no more than two GPUs can be requested for any one job.
 
+% ------------------------------------------------------------------------------
+\subsubsection{P6 on Multi-GPU, Multi-Node}
+
+As described above, P6 cards are not compatible with the \texttt{Distributed} and \texttt{DataParallel}
+functions (\texttt{PyTorch}, \texttt{TensorFlow}) when running on multiple GPUs.
+One workaround is to run the job across multiple nodes, with a single GPU per node; for example:
+
+\begin{verbatim}
+#SBATCH --nodes=2
+#SBATCH --gpus-per-node=1
+\end{verbatim}
+
+This applies to the P6 nodes: \texttt{speed-05, speed-17, speed-01}.
+
+The example
+ \href{https://github.com/NAG-DevOps/speed-hpc/blob/master/src/pytorch-multinode-multigpu.sh}
+ {pytorch-multinode-multigpu.sh}
+illustrates a job for training on multiple nodes and multiple GPUs.
+
 % ------------------------------------------------------------------------------
 \subsubsection{CUDA}
diff --git a/doc/scheduler-scripting.tex b/doc/scheduler-scripting.tex
index aa19c6c..d695d89 100644
--- a/doc/scheduler-scripting.tex
+++ b/doc/scheduler-scripting.tex
@@ -601,7 +601,7 @@ \subsubsection{Jupyter Notebooks}
 Create an \tool{ssh} tunnel between your computer and the node (\texttt{speed-XX}) where Jupyter is running
 (Using \texttt{speed-submit} as a ``jump server'') (Preferably: PuTTY, see \xf{fig:putty1} and \xf{fig:putty2})
 \begin{verbatim}
-ssh -L 8888:localhost:8888 speed-XX
+ssh -L 8888:speed-XX:8888 YOUR_USER@speed-submit.encs.concordia.ca
 \end{verbatim}
 Don't close the tunnel.
 
diff --git a/doc/speed-manual.pdf b/doc/speed-manual.pdf
index 474738f..eb4731c 100644
Binary files a/doc/speed-manual.pdf and b/doc/speed-manual.pdf differ
diff --git a/src/README.md b/src/README.md
index ce5d11b..0072959 100644
--- a/src/README.md
+++ b/src/README.md
@@ -21,6 +21,7 @@ These are examples either trivial or some are more elaborate. Some are described
 - `efficientdet.sh` -- `efficientdet` with Conda environment described below
 - `gurobi-with-python.sh` -- using Gurobi with Python and Python virtual environment
 - `pytorch-multicpu.txt` -- using Pytorch with Python virtual environment to run on CPUs; with instructions and code ready to paste.
+- `pytorch-multinode-multigpu.sh` -- using PyTorch with a Python virtual environment to run on multiple nodes with multiple GPUs
 - `lambdal-singularity.sh` -- an example use of the Singularity container to run LambdaLabs software stack on the GPU node. The container was built from the docker image as a [source](https://github.com/NAG-DevOps/lambda-stack-dockerfiles).
 - `openiss-reid-speed.sh` -- OpenISS computer vision example for re-identification, see [more](https://github.com/NAG-DevOps/speed-hpc/tree/master/src#openiss-reid-tfk) in its section
 - `openiss-yolo-cpu.sh`, `openiss-yolo-gpu.sh`, and `openiss-yolo-interactive.sh` -- OpenISS examples with YOLO, related to `reid`, see [more](https://github.com/NAG-DevOps/speed-hpc/tree/master/src#openiss-yolov3) in the corresponding section
diff --git a/src/pytorch-multinode-multigpu.sh b/src/pytorch-multinode-multigpu.sh
new file mode 100644
index 0000000..aa9989c
--- /dev/null
+++ b/src/pytorch-multinode-multigpu.sh
@@ -0,0 +1,29 @@
+#!/encs/bin/tcsh
+#SBATCH --job-name=pytorch_multinode_multigpu_train
+
+#SBATCH --nodes=2
+#SBATCH --gpus-per-node=1        ## on P6 cards this value MUST be 1
+#SBATCH --cpus-per-task=8
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=128G               ## memory assigned per node
+
+if ( $?SLURM_CPUS_PER_TASK ) then
+    setenv omp_threads $SLURM_CPUS_PER_TASK
+else
+    setenv omp_threads 1
+endif
+setenv OMP_NUM_THREADS $omp_threads        # one OpenMP thread per allocated CPU
+
+setenv RDZV_HOST `hostname -s`
+setenv RDZV_PORT 29400                     # rendezvous port for torchrun's c10d backend
+setenv endpoint ${RDZV_HOST}:${RDZV_PORT}
+setenv CUDA_LAUNCH_BLOCKING 1              # synchronous CUDA launches, easier debugging
+setenv NCCL_BLOCKING_WAIT 1                # let NCCL collectives fail with an error instead of hanging
+#setenv NCCL_DEBUG INFO
+setenv NCCL_P2P_DISABLE 1                  # disable GPU peer-to-peer transport
+setenv NCCL_IB_DISABLE 1                   # disable the InfiniBand transport
+source /speed-scratch/$USER/tmp/Venv-Name/bin/activate.csh   # path where you created your Python venv
+unsetenv CUDA_VISIBLE_DEVICES              # let Slurm and torchrun manage GPU visibility
+# --nproc_per_node=1 is required on P6 cards
+srun torchrun --nnodes=$SLURM_JOB_NUM_NODES --nproc_per_node=1 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$endpoint main_multinode.py
+deactivate
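
Note: the launcher above runs `main_multinode.py`, whose contents are not part of this diff. Below is a minimal sketch of what such a torchrun-compatible entry point could look like, assuming the `env://` rendezvous that `torchrun` sets up (RANK, LOCAL_RANK, WORLD_SIZE), the NCCL backend, and a placeholder linear model with random data standing in for a real dataset and training loop.

# Hypothetical sketch of a main_multinode.py entry point (not part of this diff).
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    # torchrun exports RANK, LOCAL_RANK and WORLD_SIZE for every worker.
    local_rank = int(os.environ["LOCAL_RANK"])
    dist.init_process_group(backend="nccl")   # env:// rendezvous provided by torchrun
    torch.cuda.set_device(local_rank)

    # Placeholder model; replace with the real network.
    model = torch.nn.Linear(128, 10).cuda(local_rank)
    model = DDP(model, device_ids=[local_rank])

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()

    for step in range(100):                   # replace with a real DataLoader loop
        x = torch.randn(32, 128, device=local_rank)
        y = torch.randint(0, 10, (32,), device=local_rank)
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()                        # gradients are all-reduced across workers
        optimizer.step()
        if dist.get_rank() == 0 and step % 10 == 0:
            print(f"step {step}: loss {loss.item():.4f}")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()

With the job above, each of the two `srun` tasks starts one such worker (one GPU per P6 node), and DistributedDataParallel averages the gradients across nodes during `backward()`.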