Merge pull request #176 from rosswhitfield/gpus2
Add support for GPUs on perlmutter
rosswhitfield authored Nov 3, 2022
2 parents a51c97a + 421f9f8 commit e40dccf
Showing 21 changed files with 655 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/conda_env/environment_macos.yml
@@ -5,6 +5,6 @@ dependencies:
- pytest-cov<4
- pytest-timeout
- psutil
- dask=2022.08.1
- dask=2022.10.0
- coverage!=6.3
- flask=2.2.2
1 change: 1 addition & 0 deletions .github/workflows/conda_env/environment_minimal.yml
@@ -6,3 +6,4 @@ dependencies:
- pytest-timeout
- psutil
- coverage!=6.3
- dask<=2022.10.0
2 changes: 1 addition & 1 deletion components/drivers/hello/hello_worker_task_pool.py
@@ -52,7 +52,7 @@ def step(self, timeStamp=0.0):
cwd, myFun, str(duration[i]),
task_env=task_env)

ret_val = self.services.submit_tasks('pool', use_dask=True, dask_nodes=1, dask_ppn=10)
ret_val = self.services.submit_tasks('pool', use_dask=True, dask_nodes=1, dask_ppw=10)
print('ret_val = ', ret_val)
exit_status = self.services.get_finished_tasks('pool')
print(exit_status)
34 changes: 33 additions & 1 deletion doc/development.rst
@@ -169,6 +169,38 @@ An example batch script for running the unit tests is:
Then check the output in ``pytest.out`` to see that all the tests
passed.

.. _perlmutter-tests:

Perlmutter only tests
~~~~~~~~~~~~~~~~~~~~~

There are some tests that only run on Perlmutter at NERSC; these are
not run as part of the :ref:`CI <continuous integration>` and must be
run manually. To run these tests you need to add the
``--runperlmutter`` option to the ``pytest`` command. There are also
tests for the ``srun`` commands built with different ``task_ppn``,
``task_cpp`` and ``task_gpp`` options in
:meth:`~ipsframework.services.ServicesProxy.launch_task`.
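
If you already have an interactive allocation on a GPU node you can
run them directly, for example (the ``salloc`` options here are only
illustrative; adjust the QOS, account and time limit to your
allocation):

.. code-block:: bash

    salloc --nodes=1 --constraint=gpu --qos=interactive --time=00:20:00
    module load python
    python -m pytest --runperlmutter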


An example batch script for running the unit tests is:

.. code-block:: bash

    #!/bin/bash
    #SBATCH -p debug
    #SBATCH --nodes=1
    #SBATCH -t 00:20:00
    #SBATCH -C gpu
    #SBATCH -J pytest
    #SBATCH -e pytest.err
    #SBATCH -o pytest.out
    module load python
    python -m pytest --runperlmutter

Then check the output in ``pytest.out`` to see that all the tests
passed.

Writing Tests
~~~~~~~~~~~~~

@@ -258,4 +290,4 @@ release before the full release to allow feedback from users. Patch
versions will not normally have a release candidate.

Before a release is finalized the :ref:`Cori only tests<cori-tests>`
should be run.
and :ref:`Perlmutter only tests<perlmutter-tests>` should be run.
100 changes: 100 additions & 0 deletions doc/user_guides/advanced_guide.rst
@@ -426,6 +426,106 @@ the resulting core affinity of the OpenMP threads are:
Hello from rank 7, thread 0, on nid00026. (core affinity = 18)
Hello from rank 7, thread 1, on nid00026. (core affinity = 19)
Slurm with GPUs examples
^^^^^^^^^^^^^^^^^^^^^^^^

.. note::

    New in 0.8.0

The :py:meth:`~ipsframework.services.ServicesProxy.launch_task` method
has an option ``task_gpp`` which allows you to set the number of GPUs
per process, passed as ``--gpus-per-task`` to the ``srun``
command.

IPS will validate that the requested number of GPUs per node does not
exceed the number specified by the ``GPUS_PER_NODE`` parameter in the
:ref:`plat-conf-sec`. You need to make sure that the number of GPUs
per process times the number of processes per node does not exceed the
``GPUS_PER_NODE`` setting.
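
For example, with ``GPUS_PER_NODE = 4``, requesting ``task_ppn=4`` and
``task_gpp=1`` fits (4 × 1 = 4 GPUs per node), while ``task_ppn=4`` and
``task_gpp=2`` would be rejected (4 × 2 = 8 > 4). A minimal sketch of
the rule (for illustration only, mirroring the ``check_gpus`` method
added in ``resourceManager.py`` below):

.. code-block:: python

    def fits_on_node(task_ppn, task_gpp, gpus_per_node=4):
        # total GPUs requested on a node must not exceed GPUS_PER_NODE
        return task_ppn * task_gpp <= gpus_per_node

    assert fits_on_node(4, 1)        # 4 GPUs per node, allowed
    assert not fits_on_node(4, 2)    # 8 GPUs per node, rejected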

Using the `gpus_for_tasks
<https://docs.nersc.gov/jobs/affinity/#gpus>`_ program provided for
Perlmutter (which has 4 GPUs per node) to test the behavior, you will
see the following:


To launch a task with 1 process and 1 GPU per process (``task_gpp``) run:

.. code-block:: python

    self.services.launch_task(1, cwd, "gpu-per-task", task_gpp=1)

This will create the command ``srun -N 1 -n 1 -c
64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1
gpus_for_tasks`` and the output will be:

.. code-block:: text

    Rank 0 out of 1 processes: I see 1 GPU(s).
    0 for rank 0: 0000:03:00.0

To launch 8 processes on 2 nodes (so 4 processes per node) with 1 GPU per process, run:

.. code-block:: python

    self.services.launch_task(8, cwd, "gpu-per-task", task_ppn=4, task_gpp=1)

This will create the command ``srun -N 2 -n 8 -c
16 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1
gpus_for_tasks`` and the output will be:

.. code-block:: text

    Rank 0 out of 8 processes: I see 1 GPU(s).
    0 for rank 0: 0000:03:00.0
    Rank 1 out of 8 processes: I see 1 GPU(s).
    0 for rank 1: 0000:41:00.0
    Rank 2 out of 8 processes: I see 1 GPU(s).
    0 for rank 2: 0000:82:00.0
    Rank 3 out of 8 processes: I see 1 GPU(s).
    0 for rank 3: 0000:C1:00.0
    Rank 4 out of 8 processes: I see 1 GPU(s).
    0 for rank 4: 0000:03:00.0
    Rank 5 out of 8 processes: I see 1 GPU(s).
    0 for rank 5: 0000:41:00.0
    Rank 6 out of 8 processes: I see 1 GPU(s).
    0 for rank 6: 0000:82:00.0
    Rank 7 out of 8 processes: I see 1 GPU(s).
    0 for rank 7: 0000:C1:00.0

To launch 2 processes on 2 nodes (so 1 process per node) with 4 GPUs per process, run:

.. code-block:: python

    self.services.launch_task(2, cwd, "gpu-per-task", task_ppn=1, task_gpp=4)

This will create the command ``srun -N 2 -n 2 -c
64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=4
gpus_for_tasks`` and the output will be:

.. code-block:: text

    Rank 0 out of 2 processes: I see 4 GPU(s).
    0 for rank 0: 0000:03:00.0
    1 for rank 0: 0000:41:00.0
    2 for rank 0: 0000:82:00.0
    3 for rank 0: 0000:C1:00.0
    Rank 1 out of 2 processes: I see 4 GPU(s).
    0 for rank 1: 0000:03:00.0
    1 for rank 1: 0000:41:00.0
    2 for rank 1: 0000:82:00.0
    3 for rank 1: 0000:C1:00.0

If you try to launch a task with too many GPUs per node, *e.g.*:

.. code-block:: python

    self.services.launch_task(8, cwd, "gpu-per-task", task_gpp=1)

then it will raise a :class:`~ipsframework.ipsExceptions.GPUResourceRequestMismatchException`.
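
A component can catch this and handle the failure; a minimal sketch,
assuming the exception is imported from ``ipsframework.ipsExceptions``
(the error-logging call is only illustrative):

.. code-block:: python

    from ipsframework.ipsExceptions import GPUResourceRequestMismatchException

    try:
        self.services.launch_task(8, cwd, "gpu-per-task", task_gpp=1)
    except GPUResourceRequestMismatchException as e:
        # 8 processes each asking for 1 GPU cannot fit on a 4-GPU node
        self.services.error("GPU request does not fit on a node: " + str(e))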

.. automethod:: ipsframework.services.ServicesProxy.launch_task
:noindex:

3 changes: 3 additions & 0 deletions doc/user_guides/platform.rst
@@ -423,6 +423,9 @@ The platform configuration file contains platform specific information that the
one task can share a node [#nochange]_. Simulations,
components and tasks can set their node usage allocation
policies in the configuration file and on task launch.
**GPUS_PER_NODE**
number of GPUs per node, used when validating the launch task
commands with ``task_gpp`` set, see :meth:`~ipsframework.services.ServicesProxy.launch_task`.
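
For illustration only, the node geometry section of a Perlmutter-style
platform configuration file might then look like (the values are an
example, not prescriptive):

.. code-block:: text

    CORES_PER_NODE = 64
    SOCKETS_PER_NODE = 1
    PROCS_PER_NODE = 64
    GPUS_PER_NODE = 4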


.. [#nochange] This value should not change unless the machine is
1 change: 1 addition & 0 deletions ipsframework/configurationManager.py
@@ -225,6 +225,7 @@ def initialize(self, data_mgr, resource_mgr, task_mgr):
self.platform_conf['PROCS_PER_NODE'] = int(self.platform_conf.get('PROCS_PER_NODE', 0))
self.platform_conf['CORES_PER_NODE'] = int(self.platform_conf.get('CORES_PER_NODE', 0))
self.platform_conf['SOCKETS_PER_NODE'] = int(self.platform_conf.get('SOCKETS_PER_NODE', 0))
self.platform_conf['GPUS_PER_NODE'] = int(self.platform_conf.get('GPUS_PER_NODE', 0))
self.platform_conf['USE_ACCURATE_NODES'] = use_accurate_nodes
self.platform_conf['MPIRUN_VERSION'] = mpirun_version

20 changes: 20 additions & 0 deletions ipsframework/ipsExceptions.py
@@ -74,6 +74,26 @@ def __str__(self):
return s


class GPUResourceRequestMismatchException(Exception):
""" Exception raised by the resource manager when it is possible to launch
the requested number of GPUs per task
"""

def __init__(self, caller_id, tid, ppn, gpp, max_gpp):
super().__init__()
self.caller_id = caller_id
self.task_id = tid
self.ppn = ppn
self.gpp = gpp
self.max_gpp = max_gpp
self.args = (caller_id, tid, ppn, gpp, max_gpp)

def __str__(self):
s = "component %s requested %d processes per node with %d GPUs per process, which is greater than the available %d GPUS_PER_NODE" % (
self.caller_id, self.ppn, self.gpp, self.max_gpp)
return s


class ResourceRequestUnequalPartitioningException(Exception):
"""Exception raised by the resource manager when it is possible to
launch the requested number of processes, but the requested number
17 changes: 16 additions & 1 deletion ipsframework/resourceManager.py
@@ -9,6 +9,7 @@
from .ipsExceptions import (InsufficientResourcesException,
BadResourceRequestException,
ResourceRequestMismatchException,
GPUResourceRequestMismatchException,
ResourceRequestUnequalPartitioningException)
from .ips_es_spec import eventManager
from .resourceHelper import getResourceList
@@ -64,6 +65,7 @@ def __init__(self, fwk):
# other stuff
self.max_ppn = 1 # the ppn for the whole submission (max ppn allowed by *software*)
self.ppn = 1 # platform config ppn for the whole IPS
self.gpn = 0 # platform config GPUS_PER_NODE for the whole IPS
self.myTopic = None
self.service_methods = ['get_allocation', 'release_allocation']

@@ -176,6 +178,11 @@ def initialize(self, dataMngr, taskMngr, configMngr,
self.sockets_per_node = 1
self.cores_per_socket = self.cores_per_node

# -------------------------------
# set gpus per node
# -------------------------------
self.gpn = int(self.CM.get_platform_parameter('GPUS_PER_NODE'))

# -------------------------------
# populate nodes
# -------------------------------
@@ -260,7 +267,7 @@ def add_nodes(self, listOfNodes):
# RM getAllocation
# pylint: disable=inconsistent-return-statements
def get_allocation(self, comp_id, nproc, task_id,
whole_nodes, whole_socks, task_ppn=0, task_cpp=0):
whole_nodes, whole_socks, task_ppn=0, task_cpp=0, task_gpp=0):
"""
Traverse available nodes to return:
@@ -358,6 +365,11 @@ def get_allocation(self, comp_id, nproc, task_id,
self.total_cores,
self.max_ppn)
else:
if not self.check_gpus(ppn, task_gpp):
raise GPUResourceRequestMismatchException(comp_id, task_id,
ppn, task_gpp,
self.gpn)

try:
self.processes += nproc
cores_allocated = 0
@@ -594,6 +606,9 @@ def check_core_cap(self, nproc, ppn):
else:
return False, "mismatch"

def check_gpus(self, ppn, task_gpp):
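# the GPUs requested on a node (processes per node times GPUs per
# process) must not exceed the platform GPUS_PER_NODE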
return ppn * task_gpp <= self.gpn

# RM releaseAllocation
def release_allocation(self, task_id, status):
"""