Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't use scipy and pandas for OutOfAfricaExtendedNeandertalAdmixture… #1416

Merged
merged 4 commits into from
Nov 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ channels:
- bioconda
dependencies:
- numpy
- pandas
- msprime
- scikit-allel
- matplotlib
Expand Down
4 changes: 2 additions & 2 deletions maintenance/annotation_maint.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import stdpopsim

logger = logging.getLogger(__name__)
# make root directory for zarr annotations
# make root directory for annotations
annot_path = "annotations"
os.makedirs(annot_path, exist_ok=True)

Expand Down Expand Up @@ -108,7 +108,7 @@ def download_process_annotations():
)
]
logger.info(f"merging overlapping regions {an.id}")
# create zarr store and zarr root
# create numpy recarray for each chromosome
spc_name_path = os.path.join(annot_path, spc.id)
os.makedirs(spc_name_path, exist_ok=True)
for chrom_id in CHROM_IDS:
Expand Down
4 changes: 2 additions & 2 deletions requirements/CI/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ attrs==21.4.0
appdirs==1.4.4
humanize==4.4.0
pyslim==1.0.1
pandas==1.3.4
numpy==1.21.5; python_version=='3.7'
numpy==1.22.3; python_version>='3.8'
scipy; python_version=='3.7'
scipy==1.7.3; python_version>='3.8'
scikit-allel==1.3.5
zarr==2.11.3
biopython==1.79
demes==0.2.2
demesdraw==0.3.1
Expand Down
3 changes: 1 addition & 2 deletions requirements/development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@ appdirs
humanize
pre-commit
pyslim>=1.0.1
pandas
numpy
scikit-allel
zarr>=2.4
biopython
demes==0.2.0
demesdraw==0.2.0
boto3
kastore
scipy
2 changes: 0 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ install_requires =
appdirs
humanize
pyslim >= 1.0.1
pandas
zarr
numpy
setup_requires =
setuptools
Expand Down
51 changes: 19 additions & 32 deletions stdpopsim/catalog/HomSap/demographic_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import math

import msprime
from scipy import stats
import pandas as pd

import stdpopsim

_species = stdpopsim.get_species("HomSap")
Expand Down Expand Up @@ -71,7 +71,7 @@ def extended_pulse(
"""
tm = (migration_stop - migration_start) / 2 + migration_start
k = 1 / ((migration_stop - migration_start) / (4 * tm)) ** 2
m = stats.gamma.pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)
m = stdpopsim.utils.gamma_pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)

"""
Scaling the distribution by the total migration rate.
Expand All @@ -89,30 +89,15 @@ def extended_pulse(
over the set migration cutoff. They will be included in the m(t)
distribution.
"""
D_extended_pulse["time"].append(x)
D_extended_pulse["rate"].append(m_scaled[x])
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
event_in_range.append(x)
rate = m_scaled[x]
if rate > 0:
D_extended_pulse["time"].append(x)
D_extended_pulse["rate"].append(rate)
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
event_in_range.append(x)

"""
Setting migration rate to 0 at the end/ the start of
gene flow (end of gene flow backwards in time).
"""

D_extended_pulse["time"].append((event_in_range[-1] + 1))
D_extended_pulse["rate"].append(0)
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
"""
Storing all migration event in a df, sorted by time
"""
extended_pulse = pd.DataFrame.from_dict(D_extended_pulse)
extended_pulse = extended_pulse[extended_pulse.rate != 0]
extended_pulse.sort_values(by=["time"], ignore_index=True)
extended_pulse.reset_index(inplace=True)

return extended_pulse
return D_extended_pulse

generation_time = 29
mutation_rate = 2e-8
Expand Down Expand Up @@ -155,10 +140,10 @@ def extended_pulse(

"""Absolute start end end of admixture"""
Neandertal_Gene_Flow_absolute_start = msprime.MigrationRateChange(
time=int(extended_GF.time.head(1) - 1), rate=0
time=int(extended_GF["time"][0] - 1), rate=0
)
Neandertal_Gene_Flow_absolute_end = msprime.MigrationRateChange(
time=int(extended_GF.time.tail(1) + 1), rate=0
time=int(extended_GF["time"][-1] + 1), rate=0
)

demographic_events_without_admixture = [
Expand All @@ -170,11 +155,13 @@ def extended_pulse(

demographic_events = demographic_events_without_admixture + [
msprime.MigrationRateChange(
time=extended_GF.time[i],
rate=extended_GF.rate[i],
matrix_index=(extended_GF.source[i], extended_GF.dest[i]),
time=time,
rate=rate,
matrix_index=(source, dest),
)
for time, rate, source, dest in zip(
*(extended_GF[k] for k in ("time", "rate", "source", "dest"))
)
for i in range(extended_GF.shape[0])
]

demographic_events.sort(key=lambda x: x.time)
Expand Down
60 changes: 16 additions & 44 deletions stdpopsim/qc/HomSap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import math
import pandas as pd
from scipy import stats

import msprime

Expand Down Expand Up @@ -1465,57 +1463,37 @@ def extended_pulse(
migration rate. The function returns a dataframe of the migration rates.
"""

event_in_range = list()
D_extended_pulse = {"time": [], "rate": [], "source": [], "dest": []}

"""
The shape and scale parameters are calculated from the input.
"""
tm = (migration_stop - migration_start) / 2 + migration_start
k = 1 / ((migration_stop - migration_start) / (4 * tm)) ** 2
m = [stats.gamma.pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)]
m = stdpopsim.utils.gamma_pdf(x=range(split_time + 1), a=k, loc=0, scale=tm / k)

"""
Scaling the distribution by the total migration rate.
"""
m = m[0]
m[abs(m) < migration_cutoff] = 0
m_scaled = m * total_migration_rate / sum(m)

"""
Filling the table of migration rate for each generation.
"""
D_extended_pulse = []
for x in range(split_time + 1):

"""
Writing gene flow events which are inside the set time boarders and
over the set migration cutoff. They will be included in the m(t)
distribution.
"""
D_extended_pulse["time"].append(x)
D_extended_pulse["rate"].append(m_scaled[x])
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
event_in_range.append(x)
rate = m_scaled[x]
if rate > 0:
D_extended_pulse.append(
dict(time=x, rate=rate, source=source, dest=dest)
)

"""
Setting migration rate to 0 at the end/ the start of
gene flow (end of gene flow backwards in time).
"""

D_extended_pulse["time"].append((event_in_range[-1] + 1))
D_extended_pulse["rate"].append(0)
D_extended_pulse["source"].append(source)
D_extended_pulse["dest"].append(dest)
"""
Storing all migration event in a df, sorted by time
"""
extended_pulse = pd.DataFrame.from_dict(D_extended_pulse)
extended_pulse = extended_pulse[extended_pulse.rate != 0]
extended_pulse.sort_values(by=["time"], ignore_index=True)
extended_pulse.reset_index(inplace=True)

return extended_pulse
return D_extended_pulse

# Get a dataframe specifying the rates in the migration pulse
pulse_df = extended_pulse(
Expand All @@ -1529,23 +1507,17 @@ def extended_pulse(
)

# Need to insert zero migration rates before and after the pulse
start = pd.DataFrame(
{"time": pulse_df.iloc[0]["time"] - 1, "rate": 0, "source": 1, "dest": 2},
index=[0],
)
end = pd.DataFrame(
{"time": pulse_df.iloc[-1]["time"] + 1, "rate": 0, "source": 1, "dest": 2},
index=[0],
)
pulse_df = pd.concat([start, pulse_df, end])
start = {"time": pulse_df[0]["time"] - 1, "rate": 0, "source": 1, "dest": 2}
end = {"time": pulse_df[-1]["time"] + 1, "rate": 0, "source": 1, "dest": 2}
pulse_df = [start] + pulse_df + [end]

# Don't love iterating over rows here, but the DataFrame is small so works fine
for idx, record in pulse_df.iterrows():
for record in pulse_df:
de.add_migration_rate_change(
time=record.time,
rate=record.rate,
source=int(record.source),
dest=int(record.dest),
time=record["time"],
rate=record["rate"],
source=record["source"],
dest=record["dest"],
)

# Merge CEU into YRI and NEA into YRI
Expand Down
36 changes: 31 additions & 5 deletions stdpopsim/utils.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
"""
Miscellaneous utilities.
"""
import re
import os
import contextlib
import hashlib
import urllib.request
import math
import os
import re
import shutil
import tarfile
import contextlib
import numpy as np
import urllib.request
import warnings

import numpy as np


def is_valid_demographic_model_id(model_id):
"""
Expand Down Expand Up @@ -327,3 +329,27 @@ def haploidize_individuals(ts):
node_indiv[node] = new_idx
tables.nodes.individual = node_indiv
return tables.tree_sequence()


def gamma_pdf(x, a, loc=0, scale=1):
    """
    Gamma PDF with same parameterisation as scipy.stats.gamma.pdf().

    Reimplemented here to avoid a runtime dependency on scipy.
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gamma.html

    :param x: Quantile(s) at which to evaluate the density (scalar or
        array-like; arrays broadcast against the other parameters).
    :param a: Shape parameter(s). Values <= 0 are invalid and yield NaN,
        matching scipy's behaviour for invalid arguments.
    :param loc: Location parameter (default 0).
    :param scale: Scale parameter (default 1).
    :return: The density value(s): a plain scalar when all inputs are
        scalar, otherwise a numpy array.
    """
    x = np.array(x)
    a = np.array(a)
    loc = np.array(loc)
    scale = np.array(scale)

    # math.gamma(b) is only defined for b > 0; map invalid shape
    # parameters to NaN instead of raising.
    gamma_a = np.vectorize(lambda b: math.gamma(b) if b > 0 else np.nan)(a)

    with np.errstate(divide="ignore", invalid="ignore"):
        y = (x - loc) / scale
        # The support of the distribution is x >= loc: the density is
        # zero below it (scipy returns 0 there, not NaN). Computing the
        # power on negative y would otherwise produce NaN for
        # non-integer a.
        result = np.where(
            y < 0, 0.0, np.power(y, a - 1) * np.exp(-y) / (scale * gamma_a)
        )
    # An invalid shape parameter dominates: NaN everywhere, as in scipy.
    result = np.where(np.isnan(gamma_a), np.nan, result)

    if result.ndim == 0:
        # Scalar inputs produce a 0-d array; unwrap to a plain scalar.
        return result[()]
    else:
        return result
33 changes: 33 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tempfile

import numpy as np
import scipy.stats
import pytest

from stdpopsim import utils
Expand Down Expand Up @@ -516,3 +517,35 @@ def test_mask_intervals_errors(self):
utils.mask_intervals(intervals=np.array([[50, 10]]), mask=np.array([[]]))
with pytest.raises(ValueError):
utils.mask_intervals(intervals=np.array([[]]), mask=np.array([[10, 5]]))


class TestGammaPdf:
    def test_gamma_pdf_basic(self):
        # Scalar and 1-d inputs with default loc/scale should agree
        # with the scipy reference implementation.
        cases = [
            (1, 1),
            (range(5), 1),
            (1, range(1, 5)),
        ]
        for x, a in cases:
            np.testing.assert_allclose(
                utils.gamma_pdf(x=x, a=a), scipy.stats.gamma.pdf(x=x, a=a)
            )

    @pytest.mark.parametrize(
        "x,a,loc,scale",
        [
            (1, 1, 0, 1),
            # Note that some of these values are (deliberately) invalid,
            # which should produce NaN in the corresponding output.
            (np.arange(5), np.linspace(0, 1, 5), 0, 1),
            (np.arange(5), np.linspace(0, 1, 5), 1, 2),
            (np.arange(5), np.linspace(0, 1, 5), np.arange(5), np.arange(5)),
            (np.eye(5), 1, 0, 1),
        ],
    )
    def test_gamma_pdf(self, x, a, loc, scale):
        expected = scipy.stats.gamma.pdf(x=x, a=a, loc=loc, scale=scale)
        observed = utils.gamma_pdf(x=x, a=a, loc=loc, scale=scale)
        np.testing.assert_allclose(observed, expected)