Add perlmutter test
rosswhitfield committed Nov 3, 2022
1 parent f864926 commit 421f9f8
Showing 4 changed files with 163 additions and 8 deletions.
34 changes: 33 additions & 1 deletion doc/development.rst
@@ -169,6 +169,38 @@ An example batch script for running the unit tests is:
Then check the output in ``pytest.out`` to see that all the tests
passed.

.. _perlmutter-tests:

Perlmutter only tests
~~~~~~~~~~~~~~~~~~~~~

There are some tests that only run on Perlmutter at NERSC; they are not
run as part of the :ref:`CI <continuous integration>` and must be run
manually. To run these tests you need to pass the ``--runperlmutter``
option to ``pytest``. There are also tests for the srun commands built
with different ``task_ppn``, ``task_cpp`` and ``task_gpp`` options in
:meth:`~ipsframework.services.ServicesProxy.launch_task`.
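
As a rough sketch of what these tests exercise, a component can request
per-task resources through the keyword arguments of
:meth:`~ipsframework.services.ServicesProxy.launch_task`. The component
name below is hypothetical, but the call pattern follows the test
component added in this commit:

.. code-block:: python

   from ipsframework import Component


   class example_gpu_worker(Component):  # hypothetical component name
       def step(self, timestamp=0.0, **keywords):
           cwd = self.services.get_working_dir()
           # self.EXE comes from the EXE entry in the simulation config.
           # Ask for 2 processes with 2 GPUs each; the framework builds
           # the corresponding srun command with --gpus-per-task=2.
           task_id = self.services.launch_task(2, cwd, self.EXE, task_gpp=2)
           self.services.wait_task(task_id)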


An example batch script for running these tests is:

.. code-block:: bash

   #!/bin/bash
   #SBATCH -p debug
   #SBATCH --nodes=1
   #SBATCH -t 00:20:00
   #SBATCH -C gpu
   #SBATCH -J pytest
   #SBATCH -e pytest.err
   #SBATCH -o pytest.out

   module load python
   python -m pytest --runperlmutter

Then check the output in ``pytest.out`` to see that all the tests
passed.

Writing Tests
~~~~~~~~~~~~~

@@ -258,4 +290,4 @@ release before the full release to allow feedback from users. Patch
versions will not normally have a release candidate.

Before a release is finalized the :ref:`Cori only tests<cori-tests>`
and :ref:`Perlmutter only tests<perlmutter-tests>` should be run.
13 changes: 13 additions & 0 deletions tests/components/workers/perlmutter_srun_gpu.py
@@ -0,0 +1,13 @@
from ipsframework import Component


class gpu_task(Component):
    # pylint: disable=no-member
    def step(self, timestamp=0.0, **keywords):
        cwd = self.services.get_working_dir()

        # Launch the executable with varying process counts and GPUs per
        # process; the first argument names an output directory "nproc_ngpu".
        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_1", task_gpp=1))
        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_2", task_gpp=2))
        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_4", task_gpp=4))
        self.services.wait_task(self.services.launch_task(2, cwd, self.EXE, "2_2", task_gpp=2))
        self.services.wait_task(self.services.launch_task(4, cwd, self.EXE, "4_1", task_gpp=1))
21 changes: 14 additions & 7 deletions tests/conftest.py
@@ -44,17 +44,24 @@ def run_around_tests():

 def pytest_addoption(parser):
     parser.addoption("--runcori", action="store_true", default=False, help="run Cori tests")
+    parser.addoption("--runperlmutter", action="store_true", default=False, help="run Perlmutter tests")


 def pytest_configure(config):
     config.addinivalue_line("markers", "cori: mark test to only work on Cori")
+    config.addinivalue_line("markers", "perlmutter: mark test to only work on Perlmutter")


 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--runcori"):
-        # --runcori given in cli: do not skip Cori tests
-        return
-    skip_cori = pytest.mark.skip(reason="need --runcori option to run")
-    for item in items:
-        if "cori" in item.keywords:
-            item.add_marker(skip_cori)
+    if not config.getoption("--runcori"):
+        # --runcori not given in cli: skip Cori-only tests
+        skip_cori = pytest.mark.skip(reason="need --runcori option to run")
+        for item in items:
+            if "cori" in item.keywords:
+                item.add_marker(skip_cori)
+    if not config.getoption("--runperlmutter"):
+        # --runperlmutter not given in cli: skip Perlmutter-only tests
+        skip_perlmutter = pytest.mark.skip(reason="need --runperlmutter option to run")
+        for item in items:
+            if "perlmutter" in item.keywords:
+                item.add_marker(skip_perlmutter)
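
With these hooks in place, a test opts in simply by carrying the marker:
when --runperlmutter is not given on the command line, collection attaches
a skip marker to every test marked perlmutter. A minimal usage sketch (the
test name is hypothetical):

import pytest


@pytest.mark.perlmutter
def test_requires_perlmutter():
    ...  # collected everywhere, but skipped unless --runperlmutter is given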
103 changes: 103 additions & 0 deletions tests/new/test_perlmutter_srun.py
@@ -0,0 +1,103 @@
import glob
import json
import pytest
from ipsframework import Framework


def write_basic_config_and_platform_files(tmpdir):
    platform_file = tmpdir.join('perlmutter.platform.conf')

    platform = """MPIRUN = srun
HOST = perlmutter
NODE_DETECTION = slurm_env
CORES_PER_NODE = 64
PROCS_PER_NODE = 64
GPUS_PER_NODE = 4
SOCKETS_PER_NODE = 1
NODE_ALLOCATION_MODE = EXCLUSIVE
USE_ACCURATE_NODES = ON
"""

    with open(platform_file, 'w') as f:
        f.write(platform)

    config_file = tmpdir.join('ips.config')

    config = f"""RUN_COMMENT = testing
SIM_NAME = test
LOG_FILE = {str(tmpdir)}/sim.log
LOG_LEVEL = INFO
SIM_ROOT = {str(tmpdir)}
SIMULATION_MODE = NORMAL
[PORTS]
NAMES = DRIVER
[[DRIVER]]
IMPLEMENTATION = DRIVER
[DRIVER]
CLASS = OPENMP
SUB_CLASS =
NAME = gpu_task
BIN_PATH =
EXE = {str(tmpdir)}/gpu_test.sh
NPROC = 1
INPUT_FILES =
OUTPUT_FILES =
SCRIPT =
MODULE = components.workers.perlmutter_srun_gpu
"""

    with open(config_file, 'w') as f:
        f.write(config)

    return platform_file, config_file


@pytest.mark.perlmutter
def test_srun_gpu_on_perlmutter(tmpdir):
    platform_file, config_file = write_basic_config_and_platform_files(tmpdir)

    # The test binary writes one log file per MPI process listing the GPUs
    # visible to that process.
    exe = tmpdir.join("gpu_test.sh")
    exe.write("#!/bin/bash\nmkdir -p $1\nnvidia-smi -L > $1/proc_${SLURM_PROCID}_GPUS.log\n")
    exe.chmod(448)  # 448 decimal == 0o700 (rwx for owner)

    framework = Framework(config_file_list=[str(config_file)],
                          log_file_name=str(tmpdir.join('ips.log')),
                          platform_file_name=str(platform_file),
                          debug=None,
                          verbose_debug=None,
                          cmd_nodes=0,
                          cmd_ppn=0)

    framework.run()

    # check simulation_log
    json_files = glob.glob(str(tmpdir.join("simulation_log").join("*.json")))
    assert len(json_files) == 1
    with open(json_files[0], 'r') as json_file:
        comments = [json.loads(line)['comment'].split(', ', maxsplit=4)[3:] for line in json_file.readlines()]

    assert comments[5][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1")
    assert comments[5][0].endswith("gpu_test.sh 1_1")

    assert comments[7][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=2")
    assert comments[7][0].endswith("gpu_test.sh 1_2")

    assert comments[9][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=4")
    assert comments[9][0].endswith("gpu_test.sh 1_4")

    assert comments[11][0].startswith("Target = srun -N 1 -n 2 -c 32 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=2")
    assert comments[11][0].endswith("gpu_test.sh 2_2")

    assert comments[13][0].startswith("Target = srun -N 1 -n 4 -c 16 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1")
    assert comments[13][0].endswith("gpu_test.sh 4_1")

    # check that the process output log files are created
    work_dir = tmpdir.join("work").join("OPENMP__gpu_task_1")

    for nprocs, ngpus in ((1, 1), (1, 2), (1, 4), (2, 2), (4, 1)):
        output_files = glob.glob(str(work_dir.join(f'{nprocs}_{ngpus}').join("*.log")))
        assert len(output_files) == nprocs
        for n in range(nprocs):
            with open(output_files[n], 'r') as f:
                lines = f.readlines()
            assert len(lines) == ngpus
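
The -c values asserted above are consistent with the 64 cores per node
being divided evenly among the tasks placed on the node. A small sketch of
that arithmetic, with a hypothetical helper that is not part of
ipsframework:

# Hypothetical helper mirroring the cores-per-task arithmetic that the
# assertions above imply; it is not part of ipsframework.
def cores_per_task(cores_per_node, nproc):
    return cores_per_node // nproc


assert cores_per_task(64, 1) == 64  # srun -n 1 -c 64
assert cores_per_task(64, 2) == 32  # srun -n 2 -c 32
assert cores_per_task(64, 4) == 16  # srun -n 4 -c 16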
