Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't use scipy and pandas for OutOfAfricaExtendedNeandertalAdmixture… #1416

Merged
merged 4 commits into from
Nov 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ channels:
- bioconda
dependencies:
- numpy
- pandas
- msprime
- scikit-allel
- matplotlib
Expand Down
4 changes: 2 additions & 2 deletions maintenance/annotation_maint.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import stdpopsim

logger = logging.getLogger(__name__)
# make root directory for zarr annotations
# make root directory for annotations
annot_path = "annotations"
os.makedirs(annot_path, exist_ok=True)

Expand Down Expand Up @@ -108,7 +108,7 @@ def download_process_annotations():
)
]
logger.info(f"merging overlapping regions {an.id}")
# create zarr store and zarr root
# create numpy recarray for each chromosome
spc_name_path = os.path.join(annot_path, spc.id)
os.makedirs(spc_name_path, exist_ok=True)
for chrom_id in CHROM_IDS:
Expand Down
4 changes: 2 additions & 2 deletions requirements/CI/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ attrs==21.4.0
appdirs==1.4.4
humanize==4.4.0
pyslim==1.0.1
pandas==1.3.4
numpy==1.21.5; python_version=='3.7'
numpy==1.22.3; python_version>='3.8'
scipy; python_version=='3.7'
scipy==1.7.3; python_version>='3.8'
scikit-allel==1.3.5
zarr==2.11.3
biopython==1.79
demes==0.2.2
demesdraw==0.3.1
Expand Down
3 changes: 1 addition & 2 deletions requirements/development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@ appdirs
humanize
pre-commit
pyslim>=1.0.1
pandas
numpy
scikit-allel
zarr>=2.4
biopython
demes==0.2.0
demesdraw==0.2.0
boto3
kastore
scipy
2 changes: 0 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ install_requires =
appdirs
humanize
pyslim >= 1.0.1
pandas
zarr
numpy
setup_requires =
setuptools
Expand Down
51 changes: 19 additions & 32 deletions stdpopsim/catalog/HomSap/demographic_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import math

import msprime
from scipy import stats
import pandas as pd

import stdpopsim

_species = stdpopsim.get_species("HomSap")
Expand Down Expand Up @@ -71,7 +71,7 @@ def extended_pulse(
"""
tm = (migration_stop - migration_start) / 2 + migration_start
k = 1 / ((migration_stop - migration_start) / (4 * tm)) ** 2
m = stats.gamma.pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)
m = stdpopsim.utils.gamma_pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)

"""
Scaling the distribution by the total migration rate.
Expand All @@ -89,30 +89,15 @@ def extended_pulse(
over the set migration cutoff. They will be included in the m(t)
distribution.
"""
D_extended_pulse["time"].append(x)
D_extended_pulse["rate"].append(m_scaled[x])
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
event_in_range.append(x)
rate = m_scaled[x]
if rate > 0:
D_extended_pulse["time"].append(x)
D_extended_pulse["rate"].append(rate)
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
event_in_range.append(x)

"""
Setting migration rate to 0 at the end/ the start of
gene flow (end of gene flow backwards in time).
"""

D_extended_pulse["time"].append((event_in_range[-1] + 1))
D_extended_pulse["rate"].append(0)
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
"""
Storing all migration event in a df, sorted by time
"""
extended_pulse = pd.DataFrame.from_dict(D_extended_pulse)
extended_pulse = extended_pulse[extended_pulse.rate != 0]
extended_pulse.sort_values(by=["time"], ignore_index=True)
extended_pulse.reset_index(inplace=True)

return extended_pulse
return D_extended_pulse

generation_time = 29
mutation_rate = 2e-8
Expand Down Expand Up @@ -155,10 +140,10 @@ def extended_pulse(

"""Absolute start end end of admixture"""
Neandertal_Gene_Flow_absolute_start = msprime.MigrationRateChange(
time=int(extended_GF.time.head(1) - 1), rate=0
time=int(extended_GF["time"][0] - 1), rate=0
)
Neandertal_Gene_Flow_absolute_end = msprime.MigrationRateChange(
time=int(extended_GF.time.tail(1) + 1), rate=0
time=int(extended_GF["time"][-1] + 1), rate=0
)

demographic_events_without_admixture = [
Expand All @@ -170,11 +155,13 @@ def extended_pulse(

demographic_events = demographic_events_without_admixture + [
msprime.MigrationRateChange(
time=extended_GF.time[i],
rate=extended_GF.rate[i],
matrix_index=(extended_GF.source[i], extended_GF.dest[i]),
time=time,
rate=rate,
matrix_index=(source, dest),
)
for time, rate, source, dest in zip(
*(extended_GF[k] for k in ("time", "rate", "source", "dest"))
)
for i in range(extended_GF.shape[0])
]

demographic_events.sort(key=lambda x: x.time)
Expand Down
60 changes: 16 additions & 44 deletions stdpopsim/qc/HomSap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import math
import pandas as pd
from scipy import stats

import msprime

Expand Down Expand Up @@ -1465,57 +1463,37 @@ def extended_pulse(
migration rate. The function returns a dataframe of the migration rates.
"""

event_in_range = list()
D_extended_pulse = {"time": [], "rate": [], "source": [], "dest": []}

"""
The shape and scale parameters are calculated from the input.
"""
tm = (migration_stop - migration_start) / 2 + migration_start
k = 1 / ((migration_stop - migration_start) / (4 * tm)) ** 2
m = [stats.gamma.pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)]
m = stdpopsim.utils.gamma_pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)

"""
Scaling the distribution by the total migration rate.
"""
m = m[0]
m[abs(m) < migration_cutoff] = 0
m_scaled = m * total_migration_rate / sum(m)

"""
Filling the table of migration rate for each generation.
"""
D_extended_pulse = []
for x in range(split_time + 1):

"""
Writing gene flow events which are inside the set time boarders and
over the set migration cutoff. They will be included in the m(t)
distribution.
"""
D_extended_pulse["time"].append(x)
D_extended_pulse["rate"].append(m_scaled[x])
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
event_in_range.append(x)
rate = m_scaled[x]
if rate > 0:
D_extended_pulse.append(
dict(time=x, rate=rate, source=source, dest=dest)
)

"""
Setting migration rate to 0 at the end/ the start of
gene flow (end of gene flow backwards in time).
"""

D_extended_pulse["time"].append((event_in_range[-1] + 1))
D_extended_pulse["rate"].append(0)
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
"""
Storing all migration event in a df, sorted by time
"""
extended_pulse = pd.DataFrame.from_dict(D_extended_pulse)
extended_pulse = extended_pulse[extended_pulse.rate != 0]
extended_pulse.sort_values(by=["time"], ignore_index=True)
extended_pulse.reset_index(inplace=True)

return extended_pulse
return D_extended_pulse

# Get a dataframe specifying the rates in the migration pulse
pulse_df = extended_pulse(
Expand All @@ -1529,23 +1507,17 @@ def extended_pulse(
)

# Need to insert zero migration rates before and after the pulse
start = pd.DataFrame(
{"time": pulse_df.iloc[0]["time"] - 1, "rate": 0, "source": 1, "dest": 2},
index=[0],
)
end = pd.DataFrame(
{"time": pulse_df.iloc[-1]["time"] + 1, "rate": 0, "source": 1, "dest": 2},
index=[0],
)
pulse_df = pd.concat([start, pulse_df, end])
start = {"time": pulse_df[0]["time"] - 1, "rate": 0, "source": 1, "dest": 2}
end = {"time": pulse_df[-1]["time"] + 1, "rate": 0, "source": 1, "dest": 2}
pulse_df = [start] + pulse_df + [end]

# Don't love iterating over rows here, but the DataFrame is small so works fine
for idx, record in pulse_df.iterrows():
for record in pulse_df:
de.add_migration_rate_change(
time=record.time,
rate=record.rate,
source=int(record.source),
dest=int(record.dest),
time=record["time"],
rate=record["rate"],
source=record["source"],
dest=record["dest"],
)

# Merge CEU into YRI and NEA into YRI
Expand Down
36 changes: 31 additions & 5 deletions stdpopsim/utils.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
"""
Miscellaneous utilities.
"""
import re
import os
import contextlib
import hashlib
import urllib.request
import math
import os
import re
import shutil
import tarfile
import contextlib
import numpy as np
import urllib.request
import warnings

import numpy as np


def is_valid_demographic_model_id(model_id):
"""
Expand Down Expand Up @@ -327,3 +329,27 @@ def haploidize_individuals(ts):
node_indiv[node] = new_idx
tables.nodes.individual = node_indiv
return tables.tree_sequence()


def gamma_pdf(x, a, loc=0, scale=1):
    """
    Gamma PDF with same parameterisation as scipy.stats.gamma.pdf().

    Reimplemented here to avoid a runtime dependency on scipy.
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gamma.html

    :param x: Quantile(s) at which to evaluate the density (scalar or
        array-like; arrays broadcast against the other parameters).
    :param a: Shape parameter(s). Values <= 0 are invalid and yield NaN,
        matching scipy's behaviour for invalid arguments.
    :param loc: Location parameter (default 0).
    :param scale: Scale parameter (default 1).
    :return: The density value(s): a plain scalar when all inputs are
        scalar, otherwise a numpy array.
    """
    x = np.array(x)
    a = np.array(a)
    loc = np.array(loc)
    scale = np.array(scale)

    # math.gamma(b) is only defined for b > 0; map invalid shape
    # parameters to NaN instead of raising.
    gamma_a = np.vectorize(lambda b: math.gamma(b) if b > 0 else np.nan)(a)

    with np.errstate(divide="ignore", invalid="ignore"):
        y = (x - loc) / scale
        # The support of the distribution is x >= loc: the density is
        # zero below it (scipy returns 0 there, not NaN). Computing the
        # power on negative y would otherwise produce NaN for
        # non-integer a.
        result = np.where(
            y < 0, 0.0, np.power(y, a - 1) * np.exp(-y) / (scale * gamma_a)
        )
    # An invalid shape parameter dominates: NaN everywhere, as in scipy.
    result = np.where(np.isnan(gamma_a), np.nan, result)

    if result.ndim == 0:
        # Scalar inputs produce a 0-d array; unwrap to a plain scalar.
        return result[()]
    else:
        return result
33 changes: 33 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tempfile

import numpy as np
import scipy.stats
import pytest

from stdpopsim import utils
Expand Down Expand Up @@ -516,3 +517,35 @@ def test_mask_intervals_errors(self):
utils.mask_intervals(intervals=np.array([[50, 10]]), mask=np.array([[]]))
with pytest.raises(ValueError):
utils.mask_intervals(intervals=np.array([[]]), mask=np.array([[10, 5]]))


class TestGammaPdf:
    def test_gamma_pdf_basic(self):
        # Scalar and 1-d inputs with default loc/scale should agree
        # with the scipy reference implementation.
        cases = [
            (1, 1),
            (range(5), 1),
            (1, range(1, 5)),
        ]
        for x, a in cases:
            np.testing.assert_allclose(
                utils.gamma_pdf(x=x, a=a), scipy.stats.gamma.pdf(x=x, a=a)
            )

    @pytest.mark.parametrize(
        "x,a,loc,scale",
        [
            (1, 1, 0, 1),
            # Note that some of these values are (deliberately) invalid,
            # which should produce NaN in the corresponding output.
            (np.arange(5), np.linspace(0, 1, 5), 0, 1),
            (np.arange(5), np.linspace(0, 1, 5), 1, 2),
            (np.arange(5), np.linspace(0, 1, 5), np.arange(5), np.arange(5)),
            (np.eye(5), 1, 0, 1),
        ],
    )
    def test_gamma_pdf(self, x, a, loc, scale):
        expected = scipy.stats.gamma.pdf(x=x, a=a, loc=loc, scale=scale)
        observed = utils.gamma_pdf(x=x, a=a, loc=loc, scale=scale)
        np.testing.assert_allclose(observed, expected)