diff --git a/docs/provenance.md b/docs/provenance.md index d2eed1fe92..d07da38b11 100644 --- a/docs/provenance.md +++ b/docs/provenance.md @@ -80,13 +80,19 @@ To make things more concrete, let's consider an example: "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", "machine": "x86_64" } + }, + "resources": { + "elapsed_time": 12.34, + "user_time": 10.56, + "sys_time": 1.78, + "max_memory": 1048576 } } ``` This information records the provenance for a very simple msprime simulation. The record is a JSON -object with three mandatory fields ("software", "parameters" and "environment") -which we discuss separately in the following sections. +object with three mandatory fields ("software", "parameters" and "environment") and one optional +field ("resources"), which we discuss separately in the following sections. (sec_provenance_software)= @@ -221,6 +227,19 @@ The `libraries` section captures information about important libraries that the primary software links against. There is no required structure. +## Resources + +The resources section captures details about the computational resources used during the execution of the software. This section is optional and has the following fields, each of which is optional and may be omitted depending on operating system support: + + +- `elapsed_time`: The total elapsed time in seconds. +- `user_time`: The total user CPU time in seconds. +- `sys_time`: The total system CPU time in seconds. +- `max_memory`: The maximum memory usage in bytes. + +Including this information makes it easy for users of tree-sequence producing software to +account for resource usage across pipelines of tools. 
+ (sec_provenance_schema)= ## Full schema diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 9815792123..36c7abc235 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -64,6 +64,9 @@ ``pack_untracked_polytomies`` allows large polytomies involving untracked samples to be summarised as a dotted line (:user:`hyanwong`, :issue:`3011` :pr:`3010`, :pr:`3012`) +- Add ``resources`` section to provenance schema. (:user:`benjeffery`, :pr:`3016`) + + -------------------- [0.5.8] - 2024-06-27 -------------------- diff --git a/python/tests/test_provenance.py b/python/tests/test_provenance.py index 0f7c662523..1c23162d36 100644 --- a/python/tests/test_provenance.py +++ b/python/tests/test_provenance.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2020 Tskit Developers +# Copyright (c) 2018-2024 Tskit Developers # Copyright (C) 2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -26,6 +26,14 @@ import json import os import platform +import sys +import time + +try: + import resource +except ImportError: + resource = None # resource.getrusage absent on windows + import msprime import pytest @@ -35,6 +43,9 @@ import tskit.provenance as provenance +_start_time = time.time() + + def get_provenance( software_name="x", software_version="y", @@ -121,6 +132,37 @@ def test_extra_stuff(self): } tskit.validate_provenance(extra) + def test_resources(self): + resources = { + "schema_version": "1", + "software": {"name": "x", "version": "y"}, + "environment": {}, + "parameters": {}, + "resources": { + "elapsed_time": 1, + "user_time": 2, + "sys_time": 3, + "max_memory": 4, + }, + } + tskit.validate_provenance(resources) + + def test_resources_error(self): + resources = { + "schema_version": "1", + "software": {"name": "x", "version": "y"}, + "environment": {}, + "parameters": {}, + "resources": { + "elapsed_time": "1", + "user_time": 2, + "sys_time": 3, + "max_memory": 4, + }, + } + with 
class TestGetResources:
    """
    Checks on the dictionary returned by provenance.get_resources.
    """

    def test_used_resources_keys(self):
        used = provenance.get_resources(_start_time)
        for key in ("elapsed_time", "user_time", "sys_time"):
            assert key in used
        # max_memory is only expected when the resource module could be imported.
        assert resource is None or "max_memory" in used

    def test_used_resources_values(self):
        used = provenance.get_resources(_start_time)
        time_keys = ("elapsed_time", "user_time", "sys_time")
        # Type checks first, then magnitude checks, one key at a time.
        for key in time_keys:
            assert isinstance(used[key], float)
        for key in time_keys:
            assert used[key] > 0.0001
        if resource is None:
            return
        peak = used["max_memory"]
        assert isinstance(peak, int)
        assert peak > 1024

    def test_used_resources_platform(self):
        used = provenance.get_resources(_start_time)
        if resource is None or sys.platform == "darwin":
            return
        # Off macOS the reported value must be a whole number of KiB.
        assert used["max_memory"] % 1024 == 0
def get_resources(start_time):
    """
    Returns a dict describing the resources used by the current process.

    :param start_time: Timestamp on the ``time.time()`` scale from which
        the elapsed time is measured.
    :return: A dict with ``elapsed_time``, ``user_time`` and ``sys_time``
        entries, plus ``max_memory`` when the ``resource`` module is available.
    :rtype: dict
    """
    cpu = os.times()
    usage = {}
    usage["elapsed_time"] = time.time() - start_time
    # Each CPU figure is this process's value plus the children_* value
    # reported alongside it by os.times().
    usage["user_time"] = cpu.user + cpu.children_user
    usage["sys_time"] = cpu.system + cpu.children_system

    if resource is None:
        # No resource module (Windows), so max memory is not reported.
        # psutil could supply a peak figure there if demand exists.
        return usage

    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # Linux, FreeBSD et al. report ru_maxrss in KB; on macOS the value is
    # used unscaled.
    scale = 1 if sys.platform == "darwin" else 1024
    usage["max_memory"] = peak * scale
    return usage