Skip to content

Commit

Permalink
Add resources to provenance
Browse files Browse the repository at this point in the history
  • Loading branch information
benjeffery committed Oct 9, 2024
1 parent 7320290 commit 19436e3
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 4 deletions.
23 changes: 21 additions & 2 deletions docs/provenance.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,19 @@ To make things more concrete, let's consider an example:
"version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018",
"machine": "x86_64"
}
},
"resources": {
"elapsed_time": 12.34,
"user_time": 10.56,
"sys_time": 1.78,
"max_memory": 1048576
}
}
```

This information records the provenance for a very simple msprime simulation. The record is a JSON
object with three mandatory fields ("software", "parameters" and "environment")
which we discuss separately in the following sections.
object with three mandatory fields ("software", "parameters" and "environment") and one optional
field ("resources"), which we discuss separately in the following sections.

(sec_provenance_software)=

Expand Down Expand Up @@ -221,6 +227,19 @@ The `libraries` section captures information about important libraries that the
primary software links against. There is no required structure.


## Resources

The resources section captures details about the computational resources used during the execution of the software. This section is optional and has the following fields, each of which is optional and may be omitted depending on OS support:


- `elapsed_time`: The total elapsed time in seconds.
- `user_time`: The total user CPU time in seconds.
- `sys_time`: The total system CPU time in seconds.
- `max_memory`: The maximum memory usage in bytes.

Including this information makes it easy for users of tree-sequence producing software to
account for resource usage across pipelines of tools.

(sec_provenance_schema)=

## Full schema
Expand Down
3 changes: 3 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@
``pack_untracked_polytomies`` allows large polytomies involving untracked samples to
be summarised as a dotted line (:user:`hyanwong`, :issue:`3011` :pr:`3010`, :pr:`3012`)

- Add ``resources`` section to provenance schema. (:user:`benjeffery`, :pr:`3016`)


--------------------
[0.5.8] - 2024-06-27
--------------------
Expand Down
71 changes: 70 additions & 1 deletion python/tests/test_provenance.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2018-2020 Tskit Developers
# Copyright (c) 2018-2024 Tskit Developers
# Copyright (C) 2018 University of Oxford
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
Expand All @@ -26,6 +26,14 @@
import json
import os
import platform
import sys
import time

try:
import resource
except ImportError:
resource = None # resource.getrusage absent on windows


import msprime
import pytest
Expand All @@ -35,6 +43,9 @@
import tskit.provenance as provenance


# Wall-clock timestamp taken when this test module is imported; the
# TestGetResources tests pass it to provenance.get_resources as the start time.
_start_time = time.time()


def get_provenance(
software_name="x",
software_version="y",
Expand Down Expand Up @@ -121,6 +132,37 @@ def test_extra_stuff(self):
}
tskit.validate_provenance(extra)

def test_resources(self):
    # A document carrying a "resources" section whose four fields are all
    # numeric should be accepted by the validator.
    usage = {
        "elapsed_time": 1,
        "user_time": 2,
        "sys_time": 3,
        "max_memory": 4,
    }
    document = dict(
        schema_version="1",
        software=dict(name="x", version="y"),
        environment={},
        parameters={},
        resources=usage,
    )
    tskit.validate_provenance(document)

def test_resources_error(self):
    # elapsed_time is supplied as a string instead of a number, so the
    # validator is expected to reject the document.
    usage = {
        "elapsed_time": "1",
        "user_time": 2,
        "sys_time": 3,
        "max_memory": 4,
    }
    document = dict(
        schema_version="1",
        software=dict(name="x", version="y"),
        environment={},
        parameters={},
        resources=usage,
    )
    with pytest.raises(tskit.ProvenanceValidationError):
        tskit.validate_provenance(document)


class TestOutputProvenance:
"""
Expand Down Expand Up @@ -178,6 +220,33 @@ def test_libraries(self):
assert libs == env["libraries"]


class TestGetResources:
    """
    Checks on the dictionary returned by provenance.get_resources.
    """

    def test_used_resources_keys(self):
        usage = provenance.get_resources(_start_time)
        for key in ("elapsed_time", "user_time", "sys_time"):
            assert key in usage
        # max_memory is only expected when the resource module could be imported.
        if resource is not None:
            assert "max_memory" in usage

    def test_used_resources_values(self):
        usage = provenance.get_resources(_start_time)
        timing_keys = ("elapsed_time", "user_time", "sys_time")
        for key in timing_keys:
            assert isinstance(usage[key], float)
        for key in timing_keys:
            assert usage[key] > 0.0001
        if resource is not None:
            peak = usage["max_memory"]
            assert isinstance(peak, int)
            assert peak > 1024

    def test_used_resources_platform(self):
        usage = provenance.get_resources(_start_time)
        if sys.platform == "darwin" or resource is None:
            return
        # Off macOS the value is a count scaled by 1024, so it must be a
        # whole multiple of 1024.
        assert usage["max_memory"] % 1024 == 0


class TestGetSchema:
"""
Ensure we return the correct JSON schema.
Expand Down
27 changes: 26 additions & 1 deletion python/tskit/provenance.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2018-2023 Tskit Developers
# Copyright (c) 2018-2024 Tskit Developers
# Copyright (c) 2016-2017 University of Oxford
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
Expand All @@ -27,6 +27,13 @@
import json
import os.path
import platform
import sys
import time

try:
import resource
except ImportError:
resource = None # resource.getrusage absent on windows

Check warning on line 36 in python/tskit/provenance.py

View check run for this annotation

Codecov / codecov/patch

python/tskit/provenance.py#L35-L36

Added lines #L35 - L36 were not covered by tests

import jsonschema

Expand Down Expand Up @@ -72,6 +79,24 @@ def get_environment(extra_libs=None, include_tskit=True):
return env


def get_resources(start_time):
    """
    Returns a dict describing the resources used by the current process.

    :param start_time: Reference timestamp, on the same clock as ``time.time()``,
        from which the elapsed time is measured.
    :return: A dict with ``elapsed_time``, ``user_time`` and ``sys_time`` in
        seconds (the CPU times include those of child processes) and, when the
        ``resource`` module is available, ``max_memory`` in bytes.
    """
    cpu = os.times()
    usage = {
        "elapsed_time": time.time() - start_time,
        "user_time": cpu.user + cpu.children_user,
        "sys_time": cpu.system + cpu.children_system,
    }
    if resource is None:
        # No resource module (e.g. Windows), so peak memory is not reported.
        # psutil could supply it if there is demand.
        return usage

    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # macOS reports ru_maxrss in bytes; Linux, FreeBSD et al. report KB.
    usage["max_memory"] = peak if sys.platform == "darwin" else peak * 1024
    return usage


def get_provenance_dict(parameters=None):
"""
Returns a dictionary encoding an execution of tskit conforming to the
Expand Down
22 changes: 22 additions & 0 deletions python/tskit/provenance.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,28 @@
"type": "object"
}
}
},
"resources": {
"description": "Resources used by this operation.",
"type": "object",
"properties": {
"elapsed_time": {
"description": "Wall clock time used in seconds.",
"type": "number"
},
"user_time": {
"description": "User time used in seconds.",
"type": "number"
},
"sys_time": {
"description": "System time used in seconds.",
"type": "number"
},
"max_memory": {
"description": "Maximum memory used in bytes.",
"type": "number"
}
}
}
}
}

0 comments on commit 19436e3

Please sign in to comment.