diff --git a/doc/development.rst b/doc/development.rst
index 7b02658f..9f528d56 100644
--- a/doc/development.rst
+++ b/doc/development.rst
@@ -169,6 +169,38 @@ An example batch script for running the unit tests is:
 
 Then check the output in ``pytest.out`` to see that all the tests
 passed.
 
+.. _perlmutter-tests:
+
+Perlmutter only tests
+~~~~~~~~~~~~~~~~~~~~~
+
+There are some tests that only run on Perlmutter at NERSC; these are
+not run as part of the :ref:`CI <ci>` and must be
+run manually. To run those tests you need to add the option
+``--runperlmutter`` to the ``pytest`` command. There are also tests
+for the srun commands built with different ``task_ppn``, ``task_cpp``
+and ``task_gpp`` options in
+:meth:`~ipsframework.services.ServicesProxy.launch_task`.
+
+
+An example batch script for running the unit tests is:
+
+.. code-block:: bash
+
+    #!/bin/bash
+    #SBATCH -p debug
+    #SBATCH --nodes=1
+    #SBATCH -t 00:20:00
+    #SBATCH -C gpu
+    #SBATCH -J pytest
+    #SBATCH -e pytest.err
+    #SBATCH -o pytest.out
+    module load python
+    python -m pytest --runperlmutter
+
+Then check the output in ``pytest.out`` to see that all the tests
+passed.
+
 Writing Tests
 ~~~~~~~~~~~~~
@@ -258,4 +290,4 @@ release before the full release to allow feedback from users. Patch
 versions will not normally have a release candidate.
 
 Before a release is finalized the :ref:`Cori only tests <cori-tests>`
-should be run.
+and :ref:`Perlmutter only tests <perlmutter-tests>` should be run.
diff --git a/tests/components/workers/perlmutter_srun_gpu.py b/tests/components/workers/perlmutter_srun_gpu.py
new file mode 100644
index 00000000..48e6c54b
--- /dev/null
+++ b/tests/components/workers/perlmutter_srun_gpu.py
@@ -0,0 +1,13 @@
+from ipsframework import Component
+
+
+class gpu_task(Component):
+    # pylint: disable=no-member
+    def step(self, timestamp=0.0, **keywords):
+        cwd = self.services.get_working_dir()
+
+        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_1", task_gpp=1))
+        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_2", task_gpp=2))
+        self.services.wait_task(self.services.launch_task(1, cwd, self.EXE, "1_4", task_gpp=4))
+        self.services.wait_task(self.services.launch_task(2, cwd, self.EXE, "2_2", task_gpp=2))
+        self.services.wait_task(self.services.launch_task(4, cwd, self.EXE, "4_1", task_gpp=1))
diff --git a/tests/conftest.py b/tests/conftest.py
index 52367f6e..39d7e44c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -44,17 +44,25 @@ def run_around_tests():
 
 def pytest_addoption(parser):
     parser.addoption("--runcori", action="store_true", default=False, help="run Cori tests")
+    parser.addoption("--runperlmutter", action="store_true", default=False, help="run Perlmutter tests")
 
 
 def pytest_configure(config):
    config.addinivalue_line("markers", "cori: mark test to only work on Cori")
+    config.addinivalue_line("markers", "perlmutter: mark test to only work on Perlmutter")
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--runcori"):
-        # --runslow given in cli: do not skip slow tests
-        return
-    skip_cori = pytest.mark.skip(reason="need --runcori option to run")
-    for item in items:
-        if "cori" in item.keywords:
-            item.add_marker(skip_cori)
+    if not config.getoption("--runcori"):
+        # skip cori tests unless --runcori is given on the command line
+        skip_cori = pytest.mark.skip(reason="need --runcori option to run")
+        for item in items:
+            if "cori" in item.keywords:
+                item.add_marker(skip_cori)
+    if not config.getoption("--runperlmutter"):
+        # skip perlmutter tests unless --runperlmutter is given on the command line
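+        # (tests opt in with the @pytest.mark.perlmutter marker registered above)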
+        skip_perlmutter = pytest.mark.skip(reason="need --runperlmutter option to run")
+        for item in items:
+            if "perlmutter" in item.keywords:
+                item.add_marker(skip_perlmutter)
diff --git a/tests/new/test_perlmutter_srun.py b/tests/new/test_perlmutter_srun.py
new file mode 100644
index 00000000..b9122db5
--- /dev/null
+++ b/tests/new/test_perlmutter_srun.py
@@ -0,0 +1,103 @@
+import glob
+import json
+import pytest
+from ipsframework import Framework
+
+
+def write_basic_config_and_platform_files(tmpdir):
+    platform_file = tmpdir.join('perlmutter.platform.conf')
+
+    platform = """MPIRUN = srun
+HOST = perlmutter
+NODE_DETECTION = slurm_env
+CORES_PER_NODE = 64
+PROCS_PER_NODE = 64
+GPUS_PER_NODE = 4
+SOCKETS_PER_NODE = 1
+NODE_ALLOCATION_MODE = EXCLUSIVE
+USE_ACCURATE_NODES = ON
+"""
+
+    with open(platform_file, 'w') as f:
+        f.write(platform)
+
+    config_file = tmpdir.join('ips.config')
+
+    config = f"""RUN_COMMENT = testing
+SIM_NAME = test
+LOG_FILE = {str(tmpdir)}/sim.log
+LOG_LEVEL = INFO
+SIM_ROOT = {str(tmpdir)}
+SIMULATION_MODE = NORMAL
+[PORTS]
+    NAMES = DRIVER
+    [[DRIVER]]
+        IMPLEMENTATION = DRIVER
+[DRIVER]
+    CLASS = OPENMP
+    SUB_CLASS =
+    NAME = gpu_task
+    BIN_PATH =
+    EXE = {str(tmpdir)}/gpu_test.sh
+    NPROC = 1
+    INPUT_FILES =
+    OUTPUT_FILES =
+    SCRIPT =
+    MODULE = components.workers.perlmutter_srun_gpu
+"""
+
+    with open(config_file, 'w') as f:
+        f.write(config)
+
+    return platform_file, config_file
+
+
+@pytest.mark.perlmutter
+def test_srun_gpu_on_perlmutter(tmpdir):
+
+    platform_file, config_file = write_basic_config_and_platform_files(tmpdir)
+
+    exe = tmpdir.join("gpu_test.sh")
+    exe.write("#!/bin/bash\nmkdir -p $1\nnvidia-smi -L > $1/proc_${SLURM_PROCID}_GPUS.log\n")
+    exe.chmod(448)  # 0o700
+
+    framework = Framework(config_file_list=[str(config_file)],
+                          log_file_name=str(tmpdir.join('ips.log')),
+                          platform_file_name=str(platform_file),
+                          debug=None,
+                          verbose_debug=None,
+                          cmd_nodes=0,
+                          cmd_ppn=0)
+
+    framework.run()
+
+    # check simulation_log
+    json_files = glob.glob(str(tmpdir.join("simulation_log").join("*.json")))
+    assert len(json_files) == 1
+    with open(json_files[0], 'r') as json_file:
+        comments = [json.loads(line)['comment'].split(', ', maxsplit=4)[3:] for line in json_file.readlines()]
+
+    assert comments[5][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1")
+    assert comments[5][0].endswith("gpu_test.sh 1_1")
+
+    assert comments[7][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=2")
+    assert comments[7][0].endswith("gpu_test.sh 1_2")
+
+    assert comments[9][0].startswith("Target = srun -N 1 -n 1 -c 64 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=4")
+    assert comments[9][0].endswith("gpu_test.sh 1_4")
+
+    assert comments[11][0].startswith("Target = srun -N 1 -n 2 -c 32 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=2")
+    assert comments[11][0].endswith("gpu_test.sh 2_2")
+
+    assert comments[13][0].startswith("Target = srun -N 1 -n 4 -c 16 --threads-per-core=1 --cpu-bind=cores --gpus-per-task=1")
+    assert comments[13][0].endswith("gpu_test.sh 4_1")
+
+    # check that the process output log files are created
+    work_dir = tmpdir.join("work").join("OPENMP__gpu_task_1")
+
+    for nprocs, ngpus in ((1, 1), (1, 2), (1, 4), (2, 2), (4, 1)):
+        output_files = glob.glob(str(work_dir.join(f'{nprocs}_{ngpus}').join("*.log")))
+        assert len(output_files) == nprocs
+        for n in range(nprocs):
+            lines = open(output_files[n], 'r').readlines()
+            assert len(lines) == ngpus