Skip to content

Commit

Permalink
Merge pull request #237 from wpbonelli/test-data-generation
Browse files Browse the repository at this point in the history
refactor test data generation
  • Loading branch information
jmccreight authored Sep 14, 2023
2 parents a17937a + 579093b commit 2f57332
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 200 deletions.
1 change: 1 addition & 0 deletions environment_w_jupyter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ dependencies:
- black
- click != 8.1.0
- isort
- filelock
- flake8
- git+https://github.com/modflowpy/flopy.git
- jupyter_black
Expand Down
20 changes: 11 additions & 9 deletions test_data/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# test_data/
# Test data

This directory contains
The `test_data` directory contains
* domain directories for domain tests
* scripts/ which document how domain tests are established.

* `scripts/`, which contains scripts to run domain simulations and generate data.

## Domain directories

Expand All @@ -15,15 +14,18 @@ output to a disk faster than the one where the repository lives. However, we do
version control certain files in the directory. See the `conus_2yr/README.md` for details
on how it is set up.


Other domain directories can be run with pytest from the repository. The pytest
flag `--all_domains` will detect these domain directories and run them.

Original source data can be found on Denali for most (all?) domains in `/home/jmccreight/pywatershed_data`.

# Generating data

# scripts/
The `test_data/scripts` subdirectory contains code for reproducing test data in the domains. Importantly, `test_run_domains.py` should be run occasionally to update the test data. After running the domains, NetCDF files can be created from simulation outputs by running the tests in `test_nc_domains.py`. E.g.,

This contains code for reproducing test data in the domains. Importantly, `run_domains.py` should be run
occasionally to update the test data.
```shell
pytest -v -n auto test_run_domains.py
pytest -v -n auto test_nc_domains.py
```

Original source data can be found on Denali for most (all?) domains in `/home/jmccreight/pywatershed_data`.
NetCDF dependencies are encoded implicitly into the `pytest` fixture system: `test_nc_domains.py` uses a custom test parametrization with `pytest_generate_tests` to map each CSV file created by the domain simulation to one or more NetCDF files, which are then aggregated into further files on session teardown by [yield fixtures](https://docs.pytest.org/en/7.2.x/how-to/fixtures.html#teardown-cleanup-aka-fixture-finalization). A [filelock](https://pytest-xdist.readthedocs.io/en/latest/how-to.html#making-session-scoped-fixtures-execute-only-once) is used to ensure aggregate files are only created once, even with multiple `pytest-xdist` workers.
146 changes: 50 additions & 96 deletions test_data/scripts/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from fnmatch import fnmatch
from platform import processor
from typing import List

import pytest

Expand Down Expand Up @@ -55,32 +56,6 @@ def exe():
# This would change to handle other/additional schedulers
domain_globs_schedule = ["*conus*"]

# For generating timeseries of previous states
previous_vars = [
"dprst_stor_hru",
"freeh2o",
"hru_impervstor",
"pk_ice",
"pref_flow_stor",
"slow_stor",
"soil_lower",
"soil_moist",
"soil_rechr",
"ssres_stor",
]

misc_nc_file_vars = [
"infil",
"sroff",
"ssres_flow",
"gwres_flow",
]


final_nc_file_vars = [
"through_rain",
]


def scheduler_active():
slurm = os.getenv("SLURM_JOB_ID") is not None
Expand All @@ -102,26 +77,38 @@ def enforce_scheduler(test_dir):
return None


def collect_simulations(domain_list: list, force: bool):
def collect_simulations(
domain_list: list, force: bool = True, verbose: bool = False
):
simulations = {}
for test_dir in test_dirs:
for pth in test_dir.iterdir():
# checking for prcp.cbh ensure this is a self-contained run (all
# files in repo)
if (
(test_dir / "prcp.cbh").exists()
and pth.is_file()
and pth.name == "control.test"
):
if len(domain_list) and (test_dir.name not in domain_list):
continue

if not force:
enforce_scheduler(test_dir)

# add simulation
simulations[str(test_dir)] = pth.name
# ensure this is a self-contained run (all files in repo)
if not (test_dir / "prcp.cbh").exists():
continue

# filter selected domains
if len(domain_list) and (test_dir.name not in domain_list):
continue

# optionally enforce scheduler
if not force:
enforce_scheduler(test_dir)

# if control file is found, add simulation
ctrl_file = next(
iter(
[
p
for p in test_dir.iterdir()
if p.is_file() and p.name == "control.test"
]
),
None,
)
if ctrl_file:
simulations[str(test_dir)] = ctrl_file.name

# make sure all requested domains were found
if len(domain_list) and (len(simulations) < len(domain_list)):
requested = set(domain_list)
found = [pl.Path(dd).name for dd in simulations.keys()]
Expand All @@ -132,13 +119,14 @@ def collect_simulations(domain_list: list, force: bool):
)
pytest.exit(msg)

print("\nrun_domains.py found the following domains to run:\n")
print(f"{list(simulations.keys())}")
if verbose:
print("\nrun_domains.py found the following domains to run:\n")
print(f"{list(simulations.keys())}")

return simulations


def collect_csv_files(domain_list: list, force: bool):
simulations = collect_simulations(domain_list, force)
def collect_csv_files(simulations: list) -> List[pl.Path]:
csv_files = []
for key, value in simulations.items():
output_pth = pl.Path(key) / "output"
Expand All @@ -147,63 +135,29 @@ def collect_csv_files(domain_list: list, force: bool):
return csv_files


def collect_misc_nc_files(domain_list: list, var_list: list, force: bool):
simulations = collect_simulations(domain_list, force)
sim_dirs = list(simulations.keys())
misc_nc_files = []
for var in var_list:
for sim in sim_dirs:
the_file = pl.Path(sim) / f"output/{var}.nc"
# assert the_file.exists()
misc_nc_files += [the_file.with_suffix("")]

return misc_nc_files


def pytest_generate_tests(metafunc):
domain_list = metafunc.config.getoption("domain")
force = metafunc.config.getoption("force")
simulations = collect_simulations(domain_list, force)
csv_files = collect_csv_files(simulations)

if "simulations" in metafunc.fixturenames:
simulations = collect_simulations(domain_list, force)
sim_list = [
{"ws": key, "control_file": val}
for key, val in simulations.items()
]
ids = [pl.Path(ss).name for ss in simulations.keys()]
metafunc.parametrize("simulations", sim_list, ids=ids)

if "csv_files" in metafunc.fixturenames:
csv_files = collect_csv_files(domain_list, force)
ids = [ff.parent.parent.name + ":" + ff.name for ff in csv_files]
metafunc.parametrize("csv_files", csv_files, ids=ids)

if "csv_files_prev" in metafunc.fixturenames:
csv_files = collect_csv_files(domain_list, force)
csv_files = [
ff for ff in csv_files if ff.with_suffix("").name in previous_vars
]
if "csv_file" in metafunc.fixturenames:
ids = [ff.parent.parent.name + ":" + ff.name for ff in csv_files]
metafunc.parametrize("csv_files_prev", csv_files, ids=ids)

if "misc_nc_files_input" in metafunc.fixturenames:
misc_nc_files = collect_misc_nc_files(
domain_list, misc_nc_file_vars, force
)
ids = [ff.parent.parent.name + ":" + ff.name for ff in misc_nc_files]
metafunc.parametrize("misc_nc_files_input", misc_nc_files, ids=ids)

if "misc_nc_final_input" in metafunc.fixturenames:
misc_nc_files = collect_misc_nc_files(
domain_list, final_nc_file_vars, force
)
ids = [ff.parent.parent.name + ":" + ff.name for ff in misc_nc_files]
metafunc.parametrize("misc_nc_final_input", misc_nc_files, ids=ids)
metafunc.parametrize("csv_file", csv_files, ids=ids)

if "soltab_file" in metafunc.fixturenames:
simulations = collect_simulations(domain_list, force)
soltab_files = [
pl.Path(kk) / "soltab_debug" for kk in simulations.keys()
]
ids = [ff.parent.name + ":" + ff.name for ff in soltab_files]
metafunc.parametrize("soltab_file", soltab_files, ids=ids)
metafunc.parametrize(
"soltab_file", soltab_files, ids=ids, scope="session"
)

if "simulation" in metafunc.fixturenames:
sims = [
{"ws": key, "control_file": val}
for key, val in simulations.items()
]
ids = [pl.Path(kk).name for kk in simulations.keys()]
metafunc.parametrize("simulation", sims, ids=ids, scope="session")
8 changes: 7 additions & 1 deletion test_data/scripts/pytest.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
[pytest]
addopts = --order-dependencies
addopts = --order-dependencies
python_files =
test_*.py
python_functions =
create_*
make_*
test_*
Loading

0 comments on commit 2f57332

Please sign in to comment.