Skip to content

Commit

Permalink
remove coordinate shifting; closes popsim-consortium#1404
Browse files Browse the repository at this point in the history
  • Loading branch information
petrelharp committed Jul 6, 2024
1 parent 09f97ea commit f39557d
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 78 deletions.
20 changes: 20 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,28 @@
[0.2.1] - 2023-XX-XX
--------------------

**Bug fixes**:

- Updates to SLiM support: updated the `active` flag in the SLiM code to be integer.`

**Breaking changes**:

- The `time_units` attribute of tree sequence metadata is now set to "generations"
for SLiM output, avoiding the "time units mismatch warning".
(:user:`nspope`, :pr:`1567`)

- Previously, Contigs specified with `left` and `right` arguments produced
simulations having coordinates shifted so they were relative to `left`. This
is no longer the case: coordinates in resulting simulations retain the
original chromosome's coordinates. As a result, regions outside `[left,
right)` will now have missing data. To produce simulation output with the
previous behavior, use the `.trim()` method of the resulting TreeSequence.
So, the Contig's `.original_coordinates` attribute is now called
`.coordinates`, and several methods (e.g., `Contig.basic_contig()`) now take
`left` and `right` arguments as well as `length`. Finally, arguments such as
`relative_coordinates=True/False` to `Contig.dfe_breakpoints()` are no longer
necessary and are deprecated. (:user:`petrelharp`, :pr:`1570`)

- To add the possibility for a relationship between dominance and selection
coefficient, now each stdpopsim MutationType might have more than one
underlying SLiM mutation type; so, where this is recorded in top-level
Expand Down
4 changes: 2 additions & 2 deletions stdpopsim/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,9 +813,9 @@ def run_simulation(args):
intervals_summary_str = f"[{left}, {right})"
if intervals_summary_str is None:
# case where no intervals specified but we have a DFE
_, left, right = contig.original_coordinates
_, left, right = contig.coordinates
intervals = np.array([[left, right]])
intervals_summary_str = f"[{intervals[0][0]}, {intervals[0][1]})"
intervals_summary_str = f"[{left}, {right})"

dfe = species.get_dfe(args.dfe)
contig.add_dfe(
Expand Down
93 changes: 63 additions & 30 deletions stdpopsim/genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@
logger = logging.getLogger(__name__)


def _uniform_rate_map(left, right, length, rate):
"""
Makes a RateMap that is
"""
pos = [
0.0,
]
x = []
if left > 0:
pos.append(left)
x.append(np.nan)
pos.append(right)
x.append(rate)
if right < length:
pos.append(length)
x.append(np.nan)
return msprime.RateMap(position=pos, rate=x)


@attr.s
class Genome:
"""
Expand Down Expand Up @@ -225,9 +244,6 @@ class Contig:
and ``interval_list``. These must be of the same length, and the k-th DFE
applies to the k-th interval; see :meth:`.add_dfe` for more information.
The contig may be a segment of a named chromosome. If so, the original
coordinate system is used for :meth:`.add_dfe` and :meth:`add_single_site`.
:ivar mutation_rate: The rate of mutation per base per generation.
:vartype mutation_rate: float
:ivar ploidy: The ploidy of the contig. Defaults to 2 (diploid).
Expand Down Expand Up @@ -267,11 +283,11 @@ class Contig:
:ivar interval_list: A list of :class:`np.array` objects containing integers.
By default, the inital interval list spans the whole chromosome with the
neutral DFE.
:ivar original_coordinates: The location of the contig on a named chromosome,
:ivar coordinates: The location of the contig on a named chromosome,
as a tuple of the form `(chromosome, left, right)`. If `None`, the contig
is assumed to be generic (i.e. it does not inherit a coordinate system
from a larger chromosome).
:vartype original_coordinates: tuple
:vartype coordinates: tuple
.. note::
To run stdpopsim simulations with alternative, user-specified mutation,
Expand Down Expand Up @@ -300,14 +316,14 @@ class Contig:
exclusion_mask = attr.ib(default=None)
dfe_list = attr.ib(factory=list)
interval_list = attr.ib(factory=list)
original_coordinates = attr.ib(default=None, type=tuple)
coordinates = attr.ib(default=None, type=tuple)

def __attrs_post_init__(self):
if self.original_coordinates is None:
self.original_coordinates = (None, 0, int(self.length))
_, left, right = self.original_coordinates
if self.coordinates is None:
self.coordinates = (None, 0, int(self.length))
_, left, right = self.coordinates
self.add_dfe(
[[left, right]],
np.array([[left, right]]),
stdpopsim.dfe.neutral_dfe(),
)

Expand Down Expand Up @@ -491,20 +507,24 @@ def species_contig(
logger.debug(
f"Making flat chromosome {length_multiplier} * {chrom.id}"
)
recomb_map = msprime.RateMap.uniform(
round(length_multiplier * chrom.length),
chrom.recombination_rate,
)
else:
logger.debug(
f"Making flat contig of length {right - left} from {chrom.id}"
f"Making flat contig from {left} to {right} for {chrom.id}"
)
recomb_map = _uniform_rate_map(
left, right, chrom.length, chrom.recombination_rate
)
recomb_map = msprime.RateMap.uniform(
right - left, chrom.recombination_rate
)
else:
if length_multiplier != 1:
raise ValueError("Cannot use length multiplier with empirical maps")
logger.debug(f"Getting map for {chrom.id} from {genetic_map}")
gm = species.get_genetic_map(genetic_map)
recomb_map = gm.get_chromosome_map(chrom.id)
recomb_map = recomb_map.slice(left=left, right=right, trim=True)
recomb_map = recomb_map.slice(left=left, right=right)

inclusion_intervals = None
exclusion_intervals = None
Expand All @@ -517,7 +537,7 @@ def species_contig(
)
else:
inclusion_intervals = inclusion_mask
inclusion_intervals = stdpopsim.utils.clip_and_shift_intervals(
inclusion_intervals = stdpopsim.utils.clip_intervals(
inclusion_intervals, left, right
)
if exclusion_mask is not None:
Expand All @@ -529,7 +549,7 @@ def species_contig(
)
else:
exclusion_intervals = exclusion_mask
exclusion_intervals = stdpopsim.utils.clip_and_shift_intervals(
exclusion_intervals = stdpopsim.utils.clip_intervals(
exclusion_intervals, left, right
)

Expand Down Expand Up @@ -562,7 +582,7 @@ def species_contig(
gene_conversion_length=gene_conversion_length,
inclusion_mask=inclusion_intervals,
exclusion_mask=exclusion_intervals,
original_coordinates=(chromosome, left, right),
coordinates=(chromosome, left, right),
ploidy=ploidy,
)

Expand All @@ -572,19 +592,30 @@ def species_contig(
def length(self):
return self.recombination_map.sequence_length

@property
def original_coordinates(self):
warnings.warn(
stdpopsim.DeprecatedFeatureWarning(
"The original_coordinates property is deprecated, "
"since contigs are no longer shifted to be relative to their "
"left endpoint: use Contig.coordinates instead."
)
)
return self.coordinates

@property
def origin(self):
"""
The location of the contig on a named chromosome as a string with format,
"chromosome:left-right"; or None if a generic contig.
"""
chromosome, left, right = self.original_coordinates
chromosome, left, right = self.coordinates
if chromosome is None:
return None
else:
return f"{chromosome}:{left}-{right}"

def dfe_breakpoints(self, relative_coordinates=True):
def dfe_breakpoints(self, *, relative_coordinates=None):
"""
Returns two things: the sorted vector of endpoints of all intervals across
all DFEs in the contig, and a vector of integer labels for these intervals,
Expand All @@ -607,21 +638,23 @@ def dfe_breakpoints(self, relative_coordinates=True):
``breaks[k]``. Some intervals may not be covered by a DFE, in which
case they will have the label ``-1`` (beware of python indexing!).
:param bool relative_coordinates: If True, the returned breakpoints
will be relative to the start of the contig, rather than to the
start of the chromosome to which to contig belongs.
:return: A tuple (breaks, dfe_labels).
"""
if relative_coordinates is not None:
warnings.warn(
stdpopsim.DeprecatedFeatureWarning(
"The relative_coordinates argument is deprecated "
"(and no longer needed),"
"because contigs are no longer shifted to be relative to their "
"left endpoint."
)
)
breaks = np.unique(
np.vstack(self.interval_list + [[[0, int(self.length)]]]) # also sorted
)
dfe_labels = np.full(len(breaks) - 1, -1, dtype="int")
for j, intervals in enumerate(self.interval_list):
dfe_labels[np.isin(breaks[:-1], intervals[:, 0], assume_unique=True)] = j
if not relative_coordinates:
_, left, _ = self.original_coordinates
breaks += left
return breaks, dfe_labels

def clear_dfes(self):
Expand Down Expand Up @@ -659,9 +692,9 @@ def add_dfe(self, intervals, DFE):
:param array intervals: A valid set of intervals.
:param DFE dfe: A DFE object.
"""
_, left, right = self.original_coordinates
intervals = stdpopsim.utils.clip_and_shift_intervals(intervals, left, right)
stdpopsim.utils._check_intervals_validity(intervals, start=0, end=self.length)
_, left, right = self.coordinates
intervals = stdpopsim.utils.clip_intervals(intervals, left, right)
stdpopsim.utils._check_intervals_validity(intervals, start=left, end=right)
for j, ints in enumerate(self.interval_list):
self.interval_list[j] = stdpopsim.utils.mask_intervals(ints, intervals)
self.dfe_list.append(DFE)
Expand Down
17 changes: 14 additions & 3 deletions stdpopsim/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,17 @@ def get_contig(
is to be simulated based on empirical information for a given species
and chromosome.
*Coordinates:* If a particular chunk of a given chromosome is obtained
(by specifying a `chromosome` ID and `left` and `right` coordinates),
the resulting genomes will have coordinates matching the original
(full) genome, and portions of the genome outside of `[left, right)`
will be masked (and so contain missing data). So, coordinates of
`genetic_map` and masks should be in coordinates of the original genome
(so, not shifted to be relative to `left`). If it is desired to have
output coordinates relative to `left`, use the
:meth:`tskit.TreeSequence.trim` method on the result (for instance,
`ts_shifted = ts.trim()`).
:param str chromosome: The ID of the chromosome to simulate.
A complete list of chromosome IDs for each species can be found in the
"Genome" subsection for the species in the :ref:`sec_catalog`.
Expand Down Expand Up @@ -218,11 +229,11 @@ def get_contig(
and recombination rates are equal to the genome-wide average across all
autosomal chromosomes.
:param float left: The left coordinate (inclusive) of the region to
keep on the chromosome. Defaults to 0. Genetic maps and masks are
clipped to this boundary.
keep on the chromosome. Defaults to 0. Remaining regions will have
missing data when simulated.
:param float right: The right coordinate (exclusive) of the region to
keep on the chromosome. Defaults to the length of the chromosome.
Genetic maps and masks are clipped to this boundary.
Remaining regions will have missing data when simulated.
:rtype: :class:`.Contig`
:return: A :class:`.Contig` describing the section of the genome.
"""
Expand Down
6 changes: 2 additions & 4 deletions stdpopsim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,11 @@ def mask_intervals(intervals, mask):
return out


def clip_and_shift_intervals(intervals, left, right):
def clip_intervals(intervals, left, right):
"""
Intersect each interval in ``intervals`` with ``[left, right]``.
``intervals`` should be a numpy array with two columns that are the
beginning and end coordinates, respectively. The coordinates are then
shifted such that ``left`` becomes 0.
beginning and end coordinates, respectively.
"""
assert 0 <= left < right
intervals = np.array(intervals).astype(int)
Expand All @@ -268,7 +267,6 @@ def clip_and_shift_intervals(intervals, left, right):
intervals = np.clip(intervals[~out_of_bounds], left, right)
if intervals.shape[0] == 0:
warnings.warn(f"No intervals remain after clipping to [{left}, {right}]")
intervals -= left
return intervals


Expand Down
Loading

0 comments on commit f39557d

Please sign in to comment.