Merge pull request #1953 from jeromekelleher/time-units-and-refseq

Time units and refseq
tskit-dev · Dec 14, 2021 · 57ef4ee · 57ef4ee
2 parents 8f86bb9 + 74b7018
commit 57ef4ee
Show file tree

Hide file tree

Showing 9 changed files with 175 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,14 @@
 # Changelog
 
-**New features**:
+## [1.1.0] - 2021-12-14
+
+**New features**
 
+- Add support for tree sequence ``time_units`` field. The ``time_units`` will
+  be set to "generations" for the output of ``sim_ancestry`` (and ``simulate``),
+  unless the ``initial_state`` argument is used. In this case, the
+  ``time_units`` value will be inherited from the input.
+  ({pr}`1953`, {issue}`1951`, {issue}`1877`, {issue}`1948`, {user}`jeromekelleher`).
 
 **Bug fixes**:
 
@@ -18,6 +25,10 @@
   ``Demography.from_old_style`` ({issue}`1950`, {pr}`1954`,
   {user}`jeromekelleher`)
 
+**Maintenance **:
+
+- Update tskit to Python 0.4.0 and C 0.99.15.
+
 ## [1.0.4] - 2021-12-01
 
 **New features**:

diff --git a/msprime/__init__.py b/msprime/__init__.py
@@ -45,6 +45,7 @@
     StandardCoalescent,
     SweepGenicSelection,
     FixedPedigree,
+    TimeUnitsMismatchWarning,
 )
 
 from msprime.core import __version__

diff --git a/msprime/ancestry.py b/msprime/ancestry.py
@@ -30,6 +30,7 @@
 import math
 import sys
 import tempfile
+import warnings
 from typing import Any
 from typing import ClassVar
 
@@ -45,6 +46,15 @@
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+TIME_UNITS_GENERATIONS = "generations"
+
+
+class TimeUnitsMismatchWarning(UserWarning):
+    """
+    Warning raise when the time units specified in different parts of a
+    simulation do not match.
+    """
+
 
 def _model_factory(model: None | str | AncestryModel) -> AncestryModel:
     """
@@ -195,6 +205,7 @@ def _demography_factory(
 def _build_initial_tables(*, sequence_length, samples, ploidy, demography):
     # NOTE: this is only used in the simulate() codepath.
     tables = tskit.TableCollection(sequence_length)
+    tables.time_units = TIME_UNITS_GENERATIONS
 
     for index, (population, time) in enumerate(samples):
         tables.nodes.add_row(
@@ -804,6 +815,20 @@ def _parse_sim_ancestry(
             raise TypeError(
                 "initial_state must either be a TreeSequence or TableCollection instance"
             )
+        if initial_state.time_units == tskit.TIME_UNITS_UNCALIBRATED:
+            raise ValueError(
+                "Cannot use a tree sequence with uncalibrated time_units as "
+                "the initial state"
+            )
+        if initial_state.time_units != TIME_UNITS_GENERATIONS:
+            message = (
+                f"The initial_state has time_units={initial_state.time_units} but "
+                "time is measured in generations in msprime. This may lead to "
+                "significant discrepancies between the timescales. "
+                "If you wish to suppress this warning, you can use, e.g., "
+                "warnings.simplefilter('ignore', msprime.TimeUnitsMismatchWarning)"
+            )
+            warnings.warn(message, TimeUnitsMismatchWarning)
 
     if sequence_length is None:
         # These are all the cases in which we derive the sequence_length
@@ -940,6 +965,7 @@ def _parse_sim_ancestry(
                 "Either the samples or initial_state arguments must be provided"
             )
         initial_state = tskit.TableCollection(sequence_length)
+        initial_state.time_units = TIME_UNITS_GENERATIONS
         demography.insert_populations(initial_state)
         if not init_for_debugger:
             sample_sets = _parse_samples(samples, demography)

diff --git a/msprime/mutations.py b/msprime/mutations.py
@@ -1356,6 +1356,11 @@ def sim_mutations(
     if not isinstance(rate_map, intervals.RateMap):
         raise TypeError("rate must be a float or a RateMap")
 
+    if tables.time_units == tskit.TIME_UNITS_UNCALIBRATED:
+        raise ValueError(
+            "Simulating mutations doesn't make sense when time is uncalibrated"
+        )
+
     start_time = -sys.float_info.max if start_time is None else float(start_time)
     end_time = sys.float_info.max if end_time is None else float(end_time)
     if start_time > end_time:

diff --git a/msprime/pedigrees.py b/msprime/pedigrees.py
@@ -31,6 +31,7 @@ def __init__(self, demography=None):
             demography = demog_mod.Demography.isolated_model([1])
         self.demography = demography
         self.tables = tskit.TableCollection(0)
+        self.tables.time_units = "generations"
         demography.insert_populations(self.tables)
 
         assert len(self.tables.individuals) == 0

diff --git a/tests/test_ancestry.py b/tests/test_ancestry.py
@@ -519,7 +519,9 @@ def test_sequence_length_errors(self):
             )
 
         # An initial state with a sequence_length that disagrees.
-        initial_state = tskit.TableCollection(1234).tree_sequence()
+        initial_state = tskit.TableCollection(1234)
+        initial_state.time_units = "generations"
+        initial_state = initial_state.tree_sequence()
         with pytest.raises(ValueError):
             ancestry._parse_sim_ancestry(initial_state=initial_state, sequence_length=1)
         with pytest.raises(ValueError):
@@ -753,12 +755,14 @@ def test_dtwf_population_size(self):
 
     def test_initial_state_errors(self):
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.add_row()
         # sequence_length doesn't match.
         with pytest.raises(ValueError):
             ancestry._parse_sim_ancestry(initial_state=tables, sequence_length=100)
         # Must have at least one population
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         with pytest.raises(ValueError):
             ancestry._parse_sim_ancestry(initial_state=tables)
         for bad_type in [[], "sdf", {}]:
@@ -801,6 +805,7 @@ def test_samples_and_initial_state(self):
 
         # Specifying both is also an error.
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.add_row()
         with pytest.raises(ValueError):
             ancestry._parse_sim_ancestry(2, initial_state=tables)
@@ -822,18 +827,22 @@ def test_pedigree_requires_no_population_size(self):
 
     @pytest.mark.parametrize("ploidy", [1, 3])
     def test_pedigree_requires_ploidy2(self, ploidy):
+        tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         with pytest.raises(ValueError, match="must have ploidy=2"):
             ancestry._parse_sim_ancestry(
                 model="fixed_pedigree",
-                initial_state=tskit.TableCollection(1),
+                initial_state=tables,
                 ploidy=ploidy,
             )
 
     def test_pedigree_requires_no_samples(self):
+        tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         with pytest.raises(ValueError, match="Cannot specify both samples and"):
             ancestry._parse_sim_ancestry(
                 model="fixed_pedigree",
-                initial_state=tskit.TableCollection(1),
+                initial_state=tables,
                 samples=10,
             )
 
@@ -2249,3 +2258,42 @@ def test_sim_ancestry_unknown_mid_plus_flanks(self):
         )
         with pytest.raises(ValueError, match="Missing regions of the genome"):
             msprime.sim_ancestry(2, recombination_rate=rate_map, random_seed=1)
+
+
+class TestTimeUnits:
+    def test_simulate_gives_generations(self):
+        ts = msprime.simulate(2, random_seed=1)
+        assert ts.time_units == "generations"
+
+    def test_sim_ancestry_gives_generations(self):
+        ts = msprime.sim_ancestry(2, random_seed=1)
+        assert ts.time_units == "generations"
+
+    @pytest.mark.parametrize("time_units", ["", "unknown", "mya", "ticks"])
+    def test_initial_state_warns_not_generations(self, time_units):
+        tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables()
+        tables.time_units = time_units
+        with pytest.warns(msprime.TimeUnitsMismatchWarning, match="time_units"):
+            ts = msprime.sim_ancestry(
+                initial_state=tables, population_size=10, random_seed=1
+            )
+        assert ts.time_units == time_units
+
+    def test_initial_state_suppress_message(self):
+        tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables()
+        tables.time_units = "ticks"
+        with warnings.catch_warnings(record=True) as w:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", msprime.TimeUnitsMismatchWarning)
+                msprime.sim_ancestry(
+                    initial_state=tables, population_size=10, random_seed=1
+                )
+        assert len(w) == 0
+
+    def test_initial_state_errors_uncalibrated(self):
+        tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables()
+        tables.time_units = tskit.TIME_UNITS_UNCALIBRATED
+        with pytest.raises(ValueError, match="time_units"):
+            msprime.sim_ancestry(
+                initial_state=tables, population_size=10, random_seed=1
+            )
diff --git a/tests/test_mutations.py b/tests/test_mutations.py
@@ -524,6 +524,18 @@ def test_unicode_alleles(self):
                 for mutation in site.mutations:
                     assert mutation.derived_state == alleles[1]
 
+    @pytest.mark.parametrize("rate", [0, 1])
+    def test_uncalibrated_time_units(self, rate):
+        ts = msprime.sim_ancestry(8, random_seed=2)
+        tables = ts.dump_tables()
+        tables.time_units = "uncalibrated"
+        ts = tables.tree_sequence()
+        with pytest.raises(ValueError, match="uncalibrated"):
+            msprime.sim_mutations(ts, rate=rate, random_seed=1)
+        # Make sure also works on legacy interface
+        with pytest.raises(ValueError, match="uncalibrated"):
+            msprime.mutate(ts, rate=rate, random_seed=1)
+
     def test_zero_mutation_rate(self):
         ts = msprime.sim_ancestry(10, random_seed=1)
         mutated = msprime.sim_mutations(ts, 0)
@@ -2168,3 +2180,37 @@ def test_sim_ancestry_mutate(self):
         assert ts.num_sites == 0
         mts = msprime.mutate(ts, rate=1, random_seed=3)
         assert mts.num_sites > 0
+
+
+class TestInputUnmodified:
+    """
+    Check that things that shouldn't be touched by sim_mutations, aren't.
+    """
+
+    def test_refseq_just_data(self):
+        ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1)
+        tables = ts1.dump_tables()
+        tables.reference_sequence.data = "A" * 10
+        ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2)
+        assert ts2.reference_sequence.data == "A" * 10
+        tables.reference_sequence.assert_equals(ts2.reference_sequence)
+
+    def test_refseq_all_fields(self):
+        ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1)
+        tables = ts1.dump_tables()
+        tables.reference_sequence.data = "A"
+        tables.reference_sequence.metadata_schema = (
+            tskit.MetadataSchema.permissive_json()
+        )
+        tables.reference_sequence.metadata = {"a": 1, "b": 2}
+        tables.reference_sequence.url = "http://stuff.stuff"
+        ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2)
+        tables.reference_sequence.assert_equals(ts2.reference_sequence)
+
+    @pytest.mark.parametrize("time_units", ["", "generations", "mya"])
+    def test_time_units(self, time_units):
+        ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1)
+        tables = ts1.dump_tables()
+        tables.time_units = time_units
+        ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2)
+        assert ts2.time_units == time_units
diff --git a/tests/test_pedigree.py b/tests/test_pedigree.py
@@ -266,6 +266,7 @@ def test_valid_pedigree(self):
             num_generations=10,
         )
         tc = tskit.TableCollection(1)
+        tc.time_units = "generations"
         tc.individuals.metadata_schema = tb.metadata_schema
         for row in tb:
             tc.individuals.append(row)

diff --git a/tests/test_simulate_from.py b/tests/test_simulate_from.py
@@ -598,6 +598,7 @@ def verify_simple_model(
             model=self.model,
         )
         tables = tskit.TableCollection(ts1.sequence_length)
+        tables.time_units = "generations"
         tables.populations.add_row()
         for _ in range(n):
             tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, population=0)
@@ -658,6 +659,7 @@ def test_two_populations_migration(self):
             random_seed=seed,
         )
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.add_row()
         tables.populations.add_row()
         for _ in range(n):
@@ -952,6 +954,7 @@ def test_population_split(self):
 
     def test_extra_pops_no_metadata(self):
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.add_row()
         tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0)
         tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0)
@@ -970,6 +973,7 @@ def test_extra_pops_no_metadata(self):
 
     def test_extra_pops_struct_metadata(self):
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.metadata_schema = tskit.MetadataSchema(
             {
                 "codec": "struct",
@@ -999,6 +1003,7 @@ def test_extra_pops_struct_metadata(self):
 
     def test_extra_pops_missing_name_metadata(self):
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.metadata_schema = tskit.MetadataSchema(
             {
                 "codec": "json",
@@ -1028,6 +1033,7 @@ def test_extra_pops_missing_name_metadata(self):
 
     def test_extra_pops_missing_description_metadata(self):
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.metadata_schema = tskit.MetadataSchema(
             {
                 "codec": "json",
@@ -1057,6 +1063,7 @@ def test_extra_pops_missing_description_metadata(self):
 
     def test_extra_pops_minimal_schema(self):
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json()
         tables.populations.add_row(metadata={"name": "X"})
         tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0)
@@ -1109,6 +1116,7 @@ def test_extra_pops_missing_extra_metadata(self):
 
     def test_extra_pops_set_extra_metadata(self):
         tables = tskit.TableCollection(1)
+        tables.time_units = "generations"
         tables.populations.metadata_schema = tskit.MetadataSchema(
             {
                 "codec": "json",
@@ -1162,6 +1170,30 @@ def test_population_size_set(self):
         ts3 = msprime.sim_ancestry(2, population_size=100, random_seed=2)
         assert ts2.equals(ts3, ignore_provenance=True)
 
+    def test_refseq_just_data_maintained(self):
+        ts1 = msprime.sim_ancestry(2, end_time=0, sequence_length=10, random_seed=1)
+        tables = ts1.dump_tables()
+        tables.reference_sequence.data = "A" * 10
+        ts2 = msprime.sim_ancestry(
+            initial_state=tables, population_size=100, random_seed=2
+        )
+        assert ts2.reference_sequence.data == "A" * 10
+        tables.reference_sequence.assert_equals(ts2.reference_sequence)
+
+    def test_refseq_all_fields_maintained(self):
+        ts1 = msprime.sim_ancestry(2, end_time=0, sequence_length=10, random_seed=1)
+        tables = ts1.dump_tables()
+        tables.reference_sequence.data = "A"
+        tables.reference_sequence.metadata_schema = (
+            tskit.MetadataSchema.permissive_json()
+        )
+        tables.reference_sequence.metadata = {"a": 1, "b": 2}
+        tables.reference_sequence.url = "http://stuff.stuff"
+        ts2 = msprime.sim_ancestry(
+            initial_state=tables, population_size=100, random_seed=2
+        )
+        tables.reference_sequence.assert_equals(ts2.reference_sequence)
+
 
 class TestSlimOutput:
     """