Skip to content

Commit

Permalink
Merge pull request #1953 from jeromekelleher/time-units-and-refseq
Browse files Browse the repository at this point in the history
Time units and refseq
  • Loading branch information
jeromekelleher authored Dec 14, 2021
2 parents 8f86bb9 + 74b7018 commit 57ef4ee
Show file tree
Hide file tree
Showing 9 changed files with 175 additions and 4 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
# Changelog

**New features**:
## [1.1.0] - 2021-12-14

**New features**

- Add support for tree sequence ``time_units`` field. The ``time_units`` will
be set to "generations" for the output of ``sim_ancestry`` (and ``simulate``),
unless the ``initial_state`` argument is used. In this case, the
``time_units`` value will be inherited from the input.
({pr}`1953`, {issue}`1951`, {issue}`1877`, {issue}`1948`, {user}`jeromekelleher`).

**Bug fixes**:

Expand All @@ -18,6 +25,10 @@
``Demography.from_old_style`` ({issue}`1950`, {pr}`1954`,
{user}`jeromekelleher`)

**Maintenance **:

- Update tskit to Python 0.4.0 and C 0.99.15.

## [1.0.4] - 2021-12-01

**New features**:
Expand Down
1 change: 1 addition & 0 deletions msprime/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
StandardCoalescent,
SweepGenicSelection,
FixedPedigree,
TimeUnitsMismatchWarning,
)

from msprime.core import __version__
Expand Down
26 changes: 26 additions & 0 deletions msprime/ancestry.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import math
import sys
import tempfile
import warnings
from typing import Any
from typing import ClassVar

Expand All @@ -45,6 +46,15 @@

logger: logging.Logger = logging.getLogger(__name__)

TIME_UNITS_GENERATIONS = "generations"


class TimeUnitsMismatchWarning(UserWarning):
"""
Warning raise when the time units specified in different parts of a
simulation do not match.
"""


def _model_factory(model: None | str | AncestryModel) -> AncestryModel:
"""
Expand Down Expand Up @@ -195,6 +205,7 @@ def _demography_factory(
def _build_initial_tables(*, sequence_length, samples, ploidy, demography):
# NOTE: this is only used in the simulate() codepath.
tables = tskit.TableCollection(sequence_length)
tables.time_units = TIME_UNITS_GENERATIONS

for index, (population, time) in enumerate(samples):
tables.nodes.add_row(
Expand Down Expand Up @@ -804,6 +815,20 @@ def _parse_sim_ancestry(
raise TypeError(
"initial_state must either be a TreeSequence or TableCollection instance"
)
if initial_state.time_units == tskit.TIME_UNITS_UNCALIBRATED:
raise ValueError(
"Cannot use a tree sequence with uncalibrated time_units as "
"the initial state"
)
if initial_state.time_units != TIME_UNITS_GENERATIONS:
message = (
f"The initial_state has time_units={initial_state.time_units} but "
"time is measured in generations in msprime. This may lead to "
"significant discrepancies between the timescales. "
"If you wish to suppress this warning, you can use, e.g., "
"warnings.simplefilter('ignore', msprime.TimeUnitsMismatchWarning)"
)
warnings.warn(message, TimeUnitsMismatchWarning)

if sequence_length is None:
# These are all the cases in which we derive the sequence_length
Expand Down Expand Up @@ -940,6 +965,7 @@ def _parse_sim_ancestry(
"Either the samples or initial_state arguments must be provided"
)
initial_state = tskit.TableCollection(sequence_length)
initial_state.time_units = TIME_UNITS_GENERATIONS
demography.insert_populations(initial_state)
if not init_for_debugger:
sample_sets = _parse_samples(samples, demography)
Expand Down
5 changes: 5 additions & 0 deletions msprime/mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1356,6 +1356,11 @@ def sim_mutations(
if not isinstance(rate_map, intervals.RateMap):
raise TypeError("rate must be a float or a RateMap")

if tables.time_units == tskit.TIME_UNITS_UNCALIBRATED:
raise ValueError(
"Simulating mutations doesn't make sense when time is uncalibrated"
)

start_time = -sys.float_info.max if start_time is None else float(start_time)
end_time = sys.float_info.max if end_time is None else float(end_time)
if start_time > end_time:
Expand Down
1 change: 1 addition & 0 deletions msprime/pedigrees.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, demography=None):
demography = demog_mod.Demography.isolated_model([1])
self.demography = demography
self.tables = tskit.TableCollection(0)
self.tables.time_units = "generations"
demography.insert_populations(self.tables)

assert len(self.tables.individuals) == 0
Expand Down
54 changes: 51 additions & 3 deletions tests/test_ancestry.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,9 @@ def test_sequence_length_errors(self):
)

# An initial state with a sequence_length that disagrees.
initial_state = tskit.TableCollection(1234).tree_sequence()
initial_state = tskit.TableCollection(1234)
initial_state.time_units = "generations"
initial_state = initial_state.tree_sequence()
with pytest.raises(ValueError):
ancestry._parse_sim_ancestry(initial_state=initial_state, sequence_length=1)
with pytest.raises(ValueError):
Expand Down Expand Up @@ -753,12 +755,14 @@ def test_dtwf_population_size(self):

def test_initial_state_errors(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.add_row()
# sequence_length doesn't match.
with pytest.raises(ValueError):
ancestry._parse_sim_ancestry(initial_state=tables, sequence_length=100)
# Must have at least one population
tables = tskit.TableCollection(1)
tables.time_units = "generations"
with pytest.raises(ValueError):
ancestry._parse_sim_ancestry(initial_state=tables)
for bad_type in [[], "sdf", {}]:
Expand Down Expand Up @@ -801,6 +805,7 @@ def test_samples_and_initial_state(self):

# Specifying both is also an error.
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.add_row()
with pytest.raises(ValueError):
ancestry._parse_sim_ancestry(2, initial_state=tables)
Expand All @@ -822,18 +827,22 @@ def test_pedigree_requires_no_population_size(self):

@pytest.mark.parametrize("ploidy", [1, 3])
def test_pedigree_requires_ploidy2(self, ploidy):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
with pytest.raises(ValueError, match="must have ploidy=2"):
ancestry._parse_sim_ancestry(
model="fixed_pedigree",
initial_state=tskit.TableCollection(1),
initial_state=tables,
ploidy=ploidy,
)

def test_pedigree_requires_no_samples(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
with pytest.raises(ValueError, match="Cannot specify both samples and"):
ancestry._parse_sim_ancestry(
model="fixed_pedigree",
initial_state=tskit.TableCollection(1),
initial_state=tables,
samples=10,
)

Expand Down Expand Up @@ -2249,3 +2258,42 @@ def test_sim_ancestry_unknown_mid_plus_flanks(self):
)
with pytest.raises(ValueError, match="Missing regions of the genome"):
msprime.sim_ancestry(2, recombination_rate=rate_map, random_seed=1)


class TestTimeUnits:
def test_simulate_gives_generations(self):
ts = msprime.simulate(2, random_seed=1)
assert ts.time_units == "generations"

def test_sim_ancestry_gives_generations(self):
ts = msprime.sim_ancestry(2, random_seed=1)
assert ts.time_units == "generations"

@pytest.mark.parametrize("time_units", ["", "unknown", "mya", "ticks"])
def test_initial_state_warns_not_generations(self, time_units):
tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables()
tables.time_units = time_units
with pytest.warns(msprime.TimeUnitsMismatchWarning, match="time_units"):
ts = msprime.sim_ancestry(
initial_state=tables, population_size=10, random_seed=1
)
assert ts.time_units == time_units

def test_initial_state_suppress_message(self):
tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables()
tables.time_units = "ticks"
with warnings.catch_warnings(record=True) as w:
with warnings.catch_warnings():
warnings.simplefilter("ignore", msprime.TimeUnitsMismatchWarning)
msprime.sim_ancestry(
initial_state=tables, population_size=10, random_seed=1
)
assert len(w) == 0

def test_initial_state_errors_uncalibrated(self):
tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables()
tables.time_units = tskit.TIME_UNITS_UNCALIBRATED
with pytest.raises(ValueError, match="time_units"):
msprime.sim_ancestry(
initial_state=tables, population_size=10, random_seed=1
)
46 changes: 46 additions & 0 deletions tests/test_mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,18 @@ def test_unicode_alleles(self):
for mutation in site.mutations:
assert mutation.derived_state == alleles[1]

@pytest.mark.parametrize("rate", [0, 1])
def test_uncalibrated_time_units(self, rate):
ts = msprime.sim_ancestry(8, random_seed=2)
tables = ts.dump_tables()
tables.time_units = "uncalibrated"
ts = tables.tree_sequence()
with pytest.raises(ValueError, match="uncalibrated"):
msprime.sim_mutations(ts, rate=rate, random_seed=1)
# Make sure also works on legacy interface
with pytest.raises(ValueError, match="uncalibrated"):
msprime.mutate(ts, rate=rate, random_seed=1)

def test_zero_mutation_rate(self):
ts = msprime.sim_ancestry(10, random_seed=1)
mutated = msprime.sim_mutations(ts, 0)
Expand Down Expand Up @@ -2168,3 +2180,37 @@ def test_sim_ancestry_mutate(self):
assert ts.num_sites == 0
mts = msprime.mutate(ts, rate=1, random_seed=3)
assert mts.num_sites > 0


class TestInputUnmodified:
"""
Check that things that shouldn't be touched by sim_mutations, aren't.
"""

def test_refseq_just_data(self):
ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1)
tables = ts1.dump_tables()
tables.reference_sequence.data = "A" * 10
ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2)
assert ts2.reference_sequence.data == "A" * 10
tables.reference_sequence.assert_equals(ts2.reference_sequence)

def test_refseq_all_fields(self):
ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1)
tables = ts1.dump_tables()
tables.reference_sequence.data = "A"
tables.reference_sequence.metadata_schema = (
tskit.MetadataSchema.permissive_json()
)
tables.reference_sequence.metadata = {"a": 1, "b": 2}
tables.reference_sequence.url = "http://stuff.stuff"
ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2)
tables.reference_sequence.assert_equals(ts2.reference_sequence)

@pytest.mark.parametrize("time_units", ["", "generations", "mya"])
def test_time_units(self, time_units):
ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1)
tables = ts1.dump_tables()
tables.time_units = time_units
ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2)
assert ts2.time_units == time_units
1 change: 1 addition & 0 deletions tests/test_pedigree.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ def test_valid_pedigree(self):
num_generations=10,
)
tc = tskit.TableCollection(1)
tc.time_units = "generations"
tc.individuals.metadata_schema = tb.metadata_schema
for row in tb:
tc.individuals.append(row)
Expand Down
32 changes: 32 additions & 0 deletions tests/test_simulate_from.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,7 @@ def verify_simple_model(
model=self.model,
)
tables = tskit.TableCollection(ts1.sequence_length)
tables.time_units = "generations"
tables.populations.add_row()
for _ in range(n):
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, population=0)
Expand Down Expand Up @@ -658,6 +659,7 @@ def test_two_populations_migration(self):
random_seed=seed,
)
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.add_row()
tables.populations.add_row()
for _ in range(n):
Expand Down Expand Up @@ -952,6 +954,7 @@ def test_population_split(self):

def test_extra_pops_no_metadata(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.add_row()
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0)
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0)
Expand All @@ -970,6 +973,7 @@ def test_extra_pops_no_metadata(self):

def test_extra_pops_struct_metadata(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.metadata_schema = tskit.MetadataSchema(
{
"codec": "struct",
Expand Down Expand Up @@ -999,6 +1003,7 @@ def test_extra_pops_struct_metadata(self):

def test_extra_pops_missing_name_metadata(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.metadata_schema = tskit.MetadataSchema(
{
"codec": "json",
Expand Down Expand Up @@ -1028,6 +1033,7 @@ def test_extra_pops_missing_name_metadata(self):

def test_extra_pops_missing_description_metadata(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.metadata_schema = tskit.MetadataSchema(
{
"codec": "json",
Expand Down Expand Up @@ -1057,6 +1063,7 @@ def test_extra_pops_missing_description_metadata(self):

def test_extra_pops_minimal_schema(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json()
tables.populations.add_row(metadata={"name": "X"})
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0)
Expand Down Expand Up @@ -1109,6 +1116,7 @@ def test_extra_pops_missing_extra_metadata(self):

def test_extra_pops_set_extra_metadata(self):
tables = tskit.TableCollection(1)
tables.time_units = "generations"
tables.populations.metadata_schema = tskit.MetadataSchema(
{
"codec": "json",
Expand Down Expand Up @@ -1162,6 +1170,30 @@ def test_population_size_set(self):
ts3 = msprime.sim_ancestry(2, population_size=100, random_seed=2)
assert ts2.equals(ts3, ignore_provenance=True)

def test_refseq_just_data_maintained(self):
ts1 = msprime.sim_ancestry(2, end_time=0, sequence_length=10, random_seed=1)
tables = ts1.dump_tables()
tables.reference_sequence.data = "A" * 10
ts2 = msprime.sim_ancestry(
initial_state=tables, population_size=100, random_seed=2
)
assert ts2.reference_sequence.data == "A" * 10
tables.reference_sequence.assert_equals(ts2.reference_sequence)

def test_refseq_all_fields_maintained(self):
ts1 = msprime.sim_ancestry(2, end_time=0, sequence_length=10, random_seed=1)
tables = ts1.dump_tables()
tables.reference_sequence.data = "A"
tables.reference_sequence.metadata_schema = (
tskit.MetadataSchema.permissive_json()
)
tables.reference_sequence.metadata = {"a": 1, "b": 2}
tables.reference_sequence.url = "http://stuff.stuff"
ts2 = msprime.sim_ancestry(
initial_state=tables, population_size=100, random_seed=2
)
tables.reference_sequence.assert_equals(ts2.reference_sequence)


class TestSlimOutput:
"""
Expand Down

0 comments on commit 57ef4ee

Please sign in to comment.