Add perlmutter test
rosswhitfield committed Nov 3, 2022
1 parent f864926 commit 421f9f8
Showing 4 changed files with 163 additions and 8 deletions.
34 changes: 33 additions & 1 deletion doc/development.rst
@@ -169,6 +169,38 @@ An example batch script for running the unit tests is:
Then check the output in ``pytest.out`` to see that all the tests
passed.

.. _perlmutter-tests:

Perlmutter only tests
~~~~~~~~~~~~~~~~~~~~~

There are some tests that only run on Perlmutter at NERSC; they are not
run as part of the :ref:`CI <continuous integration>` and must be run
manually. To run these tests you need to pass the ``--runperlmutter``
option to ``pytest``. There are also tests for the srun commands built
with different ``task_ppn``, ``task_cpp`` and ``task_gpp`` options in
:meth:`~ipsframework.services.ServicesProxy.launch_task`.
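
As a rough sketch of what these tests exercise, a component can request
per-task resources through the keyword arguments of
:meth:`~ipsframework.services.ServicesProxy.launch_task`. The component
name below is hypothetical, but the call pattern follows the test
component added in this commit:

.. code-block:: python

   from ipsframework import Component


   class example_gpu_worker(Component):  # hypothetical component name
       def step(self, timestamp=0.0, **keywords):
           cwd = self.services.get_working_dir()
           # self.EXE comes from the EXE entry in the simulation config.
           # Ask for 2 processes with 2 GPUs each; the framework builds
           # the corresponding srun command with --gpus-per-task=2.
           task_id = self.services.launch_task(2, cwd, self.EXE, task_gpp=2)
           self.services.wait_task(task_id)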


An example batch script for running these tests is:

.. code-block:: bash

   #!/bin/bash
   #SBATCH -p debug
   #SBATCH --nodes=1
   #SBATCH -t 00:20:00
   #SBATCH -C gpu
   #SBATCH -J pytest
   #SBATCH -e pytest.err
   #SBATCH -o pytest.out

   module load python
   python -m pytest --runperlmutter

Then check the output in ``pytest.out`` to see that all the tests
passed.

Writing Tests
~~~~~~~~~~~~~

@@ -258,4 +290,4 @@ release before the full release to allow feedback from users. Patch
versions will not normally have a release candidate.

Before a release is finalized the :ref:`Cori only tests<cori-tests>`
and :ref:`Perlmutter only tests<perlmutter-tests>` should be run.
13 changes: 13 additions & 0 deletions tests/components/workers/perlmutter_srun_gpu.py
@@ -0,0 +1,13 @@
from ipsframework import Component


class gpu_task(Component):
    # pylint: disable=no-member
    def step(self, timestamp=0.0, **keywords):
        cwd = self.services.get_working_dir()

        # Launch the executable with varying process counts and GPUs per
        # process; the first argument names an output directory "nproc_ngpu".
        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_1", task_gpp=1))
        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_2", task_gpp=2))
        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_4", task_gpp=4))
        self.services.wait_task(self.services.launch_task(2, cwd, self.EXE, "2_2", task_gpp=2))
        self.services.wait_task(self.services.launch_task(4, cwd, self.EXE, "4_1", task_gpp=1))
21 changes: 14 additions & 7 deletions tests/conftest.py
@@ -44,17 +44,24 @@ def run_around_tests():

 def pytest_addoption(parser):
     parser.addoption("--runcori", action="store_true", default=False, help="run Cori tests")
+    parser.addoption("--runperlmutter", action="store_true", default=False, help="run Perlmutter tests")


 def pytest_configure(config):
     config.addinivalue_line("markers", "cori: mark test to only work on Cori")
+    config.addinivalue_line("markers", "perlmutter: mark test to only work on Perlmutter")


 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--runcori"):
-        # --runcori given in cli: do not skip Cori tests
-        return
-    skip_cori = pytest.mark.skip(reason="need --runcori option to run")
-    for item in items:
-        if "cori" in item.keywords:
-            item.add_marker(skip_cori)
+    if not config.getoption("--runcori"):
+        # --runcori not given in cli: skip Cori-only tests
+        skip_cori = pytest.mark.skip(reason="need --runcori option to run")
+        for item in items:
+            if "cori" in item.keywords:
+                item.add_marker(skip_cori)
+    if not config.getoption("--runperlmutter"):
+        # --runperlmutter not given in cli: skip Perlmutter-only tests
+        skip_perlmutter = pytest.mark.skip(reason="need --runperlmutter option to run")
+        for item in items:
+            if "perlmutter" in item.keywords:
+                item.add_marker(skip_perlmutter)
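
With these hooks in place, a test opts in simply by carrying the marker:
when --runperlmutter is not given on the command line, collection attaches
a skip marker to every test marked perlmutter. A minimal usage sketch (the
test name is hypothetical):

import pytest


@pytest.mark.perlmutter
def test_requires_perlmutter():
    ...  # collected everywhere, but skipped unless --runperlmutter is given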
103 changes: 103 additions & 0 deletions tests/new/test_perlmutter_srun.py
@@ -0,0 +1,103 @@
import glob
import json
import pytest
from ipsframework import Framework


def write_basic_config_and_platform_files(tmpdir):
    platform_file = tmpdir.join('perlmutter.platform.conf')

    platform = """MPIRUN = srun
HOST = perlmutter
NODE_DETECTION = slurm_env
CORES_PER_NODE = 64
PROCS_PER_NODE = 64
GPUS_PER_NODE = 4
SOCKETS_PER_NODE = 1
NODE_ALLOCATION_MODE = EXCLUSIVE
USE_ACCURATE_NODES = ON
"""

    with open(platform_file, 'w') as f:
        f.write(platform)

    config_file = tmpdir.join('ips.config')

    config = f"""RUN_COMMENT = testing
SIM_NAME = test
LOG_FILE = {str(tmpdir)}/sim.log
LOG_LEVEL = INFO
SIM_ROOT = {str(tmpdir)}
SIMULATION_MODE = NORMAL
[PORTS]
NAMES = DRIVER
[[DRIVER]]
IMPLEMENTATION = DRIVER
[DRIVER]
CLASS = OPENMP
SUB_CLASS =
NAME = gpu_task
BIN_PATH =
EXE = {str(tmpdir)}/gpu_test.sh
NPROC = 1
INPUT_FILES =
OUTPUT_FILES =
SCRIPT =
MODULE = components.workers.perlmutter_srun_gpu
"""

    with open(config_file, 'w') as f:
        f.write(config)

    return platform_file, config_file


@pytest.mark.perlmutter
def test_srun_gpu_on_perlmutter(tmpdir):
    platform_file, config_file = write_basic_config_and_platform_files(tmpdir)

    # The test binary writes one log file per MPI process listing the GPUs
    # visible to that process.
    exe = tmpdir.join("gpu_test.sh")
    exe.write("#!/bin/bash\nmkdir -p $1\nnvidia-smi -L > $1/proc_${SLURM_PROCID}_GPUS.log\n")
    exe.chmod(448)  # 448 decimal == 0o700 (rwx for owner)

    framework = Framework(config_file_list=[str(config_file)],
                          log_file_name=str(tmpdir.join('ips.log')),
                          platform_file_name=str(platform_file),
                          debug=None,
                          verbose_debug=None,
                          cmd_nodes=0,
                          cmd_ppn=0)

    framework.run()

    # check simulation_log
    json_files = glob.glob(str(tmpdir.join("simulation_log").join("*.json")))
    assert len(json_files) == 1
    with open(json_files[0], 'r') as json_file:
        comments = [json.loads(line)['comment'].split(', ', maxsplit=4)[3:] for line in json_file.readlines()]

    assert comments[5][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1")
    assert comments[5][0].endswith("gpu_test.sh 1_1")

    assert comments[7][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=2")
    assert comments[7][0].endswith("gpu_test.sh 1_2")

    assert comments[9][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=4")
    assert comments[9][0].endswith("gpu_test.sh 1_4")

    assert comments[11][0].startswith("Target = srun -N 1 -n 2 -c 32 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=2")
    assert comments[11][0].endswith("gpu_test.sh 2_2")

    assert comments[13][0].startswith("Target = srun -N 1 -n 4 -c 16 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1")
    assert comments[13][0].endswith("gpu_test.sh 4_1")

    # check that the process output log files are created
    work_dir = tmpdir.join("work").join("OPENMP__gpu_task_1")

    for nprocs, ngpus in ((1, 1), (1, 2), (1, 4), (2, 2), (4, 1)):
        output_files = glob.glob(str(work_dir.join(f'{nprocs}_{ngpus}').join("*.log")))
        assert len(output_files) == nprocs
        for n in range(nprocs):
            with open(output_files[n], 'r') as f:
                lines = f.readlines()
            assert len(lines) == ngpus
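
The -c values asserted above are consistent with the 64 cores per node
being divided evenly among the tasks placed on the node. A small sketch of
that arithmetic, with a hypothetical helper that is not part of
ipsframework:

# Hypothetical helper mirroring the cores-per-task arithmetic that the
# assertions above imply; it is not part of ipsframework.
def cores_per_task(cores_per_node, nproc):
    return cores_per_node // nproc


assert cores_per_task(64, 1) == 64  # srun -n 1 -c 64
assert cores_per_task(64, 2) == 32  # srun -n 2 -c 32
assert cores_per_task(64, 4) == 16  # srun -n 4 -c 16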
