From c54f031053f25fbe764297c8621c76b5fd8dfcde Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Wed, 10 May 2023 09:46:58 +0100
Subject: [PATCH 01/26] Adding XPEHH

---
 malariagen_data/ag3.py       |   2 +
 malariagen_data/anopheles.py | 497 +++++++++++++++++++++++++++++++++++
 2 files changed, 499 insertions(+)

diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index b451be7fd..7ef0e0f74 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -43,6 +43,7 @@
 G123_GWSS_CACHE_NAME = "ag3_g123_gwss_v1"
 H1X_GWSS_CACHE_NAME = "ag3_h1x_gwss_v1"
 IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
+XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
 DEFAULT_SITE_MASK = "gamb_colu_arab"
 
 
@@ -113,6 +114,7 @@ class Ag3(AnophelesDataResource):
     _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME
     _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME
     _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+    _xpehh_gwss_cache_name = IHS_GWSS_CACHE_NAME
     site_mask_ids = ("gamb_colu_arab", "gamb_colu", "arab")
     _default_site_mask = DEFAULT_SITE_MASK
     phasing_analysis_ids = ("gamb_colu_arab", "gamb_colu", "arab")
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index ec939d135..56c42d3cf 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -491,6 +491,93 @@ class ihs_params:
     palette_default: palette = "Blues"
 
 
+class xpehh_params:
+    window_size: TypeAlias = Annotated[
+        int,
+        """
+        The size of window in number of SNPs used to summarise XP-EHH over.
+        If None, per-variant XP-EHH values are returned.
+        """,
+    ]
+    window_size_default: window_size = 200
+    min_cohort_size_default: base_params.min_cohort_size = 15
+    max_cohort_size_default: base_params.max_cohort_size = 50
+    percentiles: TypeAlias = Annotated[
+        Union[int, Tuple[int, ...]],
+        """
+        If window size is specified, this returns the XP-EHH percentiles
+        for each window.
+        """,
+    ]
+    percentiles_default: percentiles = (50, 75, 100)
+    standardize: TypeAlias = Annotated[
+        bool, "If True, standardize XP-EHH values by alternate allele counts."
+    ]
+    standardization_bins: TypeAlias = Annotated[
+        Tuple[float, ...],
+        "If provided, use these allele count bins to standardize XP-EHH values.",
+    ]
+    standardization_n_bins: TypeAlias = Annotated[
+        int,
+        """
+        Number of allele count bins to use for standardization.
+        Overrides standardization_bins.
+        """,
+    ]
+    standardization_n_bins_default: standardization_n_bins = 20
+    standardization_diagnostics: TypeAlias = Annotated[
+        bool, "If True, plot some diagnostics about the standardization."
+    ]
+    filter_min_maf: TypeAlias = Annotated[
+        float,
+        """
+        Minimum minor allele frequency to use for filtering prior to passing
+        haplotypes to allel.xpehh function
+        """,
+    ]
+    filter_min_maf_default: filter_min_maf = 0.05
+    map_pos: TypeAlias = Annotated[
+        np.ndarray,
+        """
+        Variant positions (genetic map distance).
+        """,
+    ]
+    min_ehh: TypeAlias = Annotated[
+        float,
+        """
+        Minimum EHH beyond which to truncate integrated haplotype homozygosity
+        calculation.
+        """,
+    ]
+    min_ehh_default: min_ehh = 0.05
+    max_gap: TypeAlias = Annotated[
+        int,
+        """
+        Do not report scores if EHH spans a gap larger than this number of
+        base pairs.
+        """,
+    ]
+    max_gap_default: max_gap = 200_000
+    gap_scale: TypeAlias = Annotated[
+        int, "Rescale distance between variants if gap is larger than this value."
+    ]
+    gap_scale_default: gap_scale = 20_000
+    include_edges: TypeAlias = Annotated[
+        bool,
+        """
+        If True, report scores even if EHH does not decay below min_ehh at the
+        end of the chromosome.
+        """,
+    ]
+    use_threads: TypeAlias = Annotated[
+        bool, "If True, use multiple threads to compute XP-EHH."
+    ]
+    palette: TypeAlias = Annotated[
+        str, "Name of bokeh palette to use for plotting multiple percentiles."
+    ]
+    palette_default: palette = "Blues"
+
+
 class hapclust_params:
     linkage_method: TypeAlias = Annotated[
         Literal[
@@ -723,6 +810,11 @@ def _h1x_gwss_cache_name(self):
     def _ihs_gwss_cache_name(self):
         raise NotImplementedError("Must override _ihs_gwss_cache_name")
 
+    @property
+    @abstractmethod
+    def _xpehh_gwss_cache_name(self):
+        raise NotImplementedError("Must override _xpehh_gwss_cache_name")
+
     @property
     @abstractmethod
     def _site_annotations_zarr_path(self):
@@ -6057,7 +6149,9 @@ def ihs_gwss(
     ) -> Tuple[np.ndarray, np.ndarray]:
         # change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data
+        print("Begin ihs")
         name = self._ihs_gwss_cache_name
+        print("End ihs")
 
         params = dict(
             contig=contig,
@@ -6306,6 +6400,98 @@ def plot_ihs_gwss_track(
 
         return fig
 
+    @doc(
+        summary="Run and plot iHS GWSS data.",
+    )
+    def plot_xpehh_gwss(
+        self,
+        contig: base_params.contig,
+        analysis: hap_params.analysis = DEFAULT,
+        sample_sets: Optional[base_params.sample_sets] = None,
+        sample_query1: Optional[base_params.sample_query] = None,
+        sample_query2: Optional[base_params.sample_query] = None,
+        window_size: xpehh_params.window_size = xpehh_params.window_size_default,
+        percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
+        standardize: xpehh_params.standardize = True,
+        standardization_bins: Optional[xpehh_params.standardization_bins] = None,
+        standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default,
+        standardization_diagnostics: xpehh_params.standardization_diagnostics = False,
+        filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default,
+        map_pos: Optional[xpehh_params.map_pos] = None,
+        min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default,
+        max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default,
+        gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default,
+        include_edges: xpehh_params.include_edges = True,
+        use_threads: xpehh_params.use_threads = True,
+        min_cohort_size: Optional[
+            base_params.min_cohort_size
+        ] = xpehh_params.min_cohort_size_default,
+        max_cohort_size: Optional[
+            base_params.max_cohort_size
+        ] = xpehh_params.max_cohort_size_default,
+        random_seed: base_params.random_seed = 42,
+        palette: xpehh_params.palette = xpehh_params.palette_default,
+        title: Optional[gplt_params.title] = None,
+        sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
+        width: gplt_params.width = gplt_params.width_default,
+        track_height: gplt_params.track_height = 170,
+        genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
+    ) -> None:
+        # gwss track
+        fig1 = self.plot_xpehh_gwss_track(
+            contig=contig,
+            analysis=analysis,
+            sample_sets=sample_sets,
+            sample_query1=sample_query1,
+            sample_query2=sample_query2,
+            window_size=window_size,
+            percentiles=percentiles,
+            palette=palette,
+            standardize=standardize,
+            standardization_bins=standardization_bins,
+            standardization_n_bins=standardization_n_bins,
+            standardization_diagnostics=standardization_diagnostics,
+            filter_min_maf=filter_min_maf,
+            map_pos=map_pos,
+            min_ehh=min_ehh,
+            max_gap=max_gap,
+            gap_scale=gap_scale,
+            include_edges=include_edges,
+            use_threads=use_threads,
+            min_cohort_size=min_cohort_size,
+            max_cohort_size=max_cohort_size,
+            random_seed=random_seed,
+            title=title,
+            sizing_mode=sizing_mode,
+            width=width,
+            height=track_height,
+            show=False,
+            x_range=None,
+        )
+
+        fig1.xaxis.visible = False
+
+        # plot genes
+        fig2 = self.plot_genes(
+            region=contig,
+            sizing_mode=sizing_mode,
+            width=width,
+            height=genes_height,
+            x_range=fig1.x_range,
+            show=False,
+        )
+
+        # combine plots into a single figure
+        fig = bokeh.layouts.gridplot(
+            [fig1, fig2],
+            ncols=1,
+            toolbar_location="above",
+            merge_tools=True,
+            sizing_mode=sizing_mode,
+        )
+
+        bokeh.plotting.show(fig)
+
     @doc(
         summary="Run and plot iHS GWSS data.",
     )
@@ -6876,6 +7062,317 @@ def plot_g123_calibration(
         fig.title = title
         bokeh.plotting.show(fig)
 
+    @doc(
+        summary="Run XP-EHH GWSS.",
+        returns=dict(
+            x="An array containing the window centre point genomic positions.",
+            xpehh="An array with XP-EHH statistic values for each window.",
+        ),
+    )
+    def xpehh_gwss(
+        self,
+        contig: base_params.contig,
+        analysis: hap_params.analysis = DEFAULT,
+        sample_sets: Optional[base_params.sample_sets] = None,
+        sample_query1: Optional[base_params.sample_query] = None,
+        sample_query2: Optional[base_params.sample_query] = None,
+        window_size: xpehh_params.window_size = xpehh_params.window_size_default,
+        percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
+        standardize: xpehh_params.standardize = True,
+        standardization_bins: Optional[xpehh_params.standardization_bins] = None,
+        standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default,
+        standardization_diagnostics: xpehh_params.standardization_diagnostics = False,
+        filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default,
+        map_pos: Optional[xpehh_params.map_pos] = None,
+        min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default,
+        max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default,
+        gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default,
+        include_edges: xpehh_params.include_edges = True,
+        use_threads: xpehh_params.use_threads = True,
+        min_cohort_size: Optional[
+            base_params.min_cohort_size
+        ] = xpehh_params.min_cohort_size_default,
+        max_cohort_size: Optional[
+            base_params.max_cohort_size
+        ] = xpehh_params.max_cohort_size_default,
+        random_seed: base_params.random_seed = 42,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        # change this name if you ever change the behaviour of this function, to
+        # invalidate any previously cached data
+        name = self._xpehh_gwss_cache_name
+
+        params = dict(
+            contig=contig,
+            analysis=self._prep_phasing_analysis_param(analysis=analysis),
+            window_size=window_size,
+            percentiles=percentiles,
+            standardize=standardize,
+            standardization_bins=standardization_bins,
+            standardization_n_bins=standardization_n_bins,
+            standardization_diagnostics=standardization_diagnostics,
+            filter_min_maf=filter_min_maf,
+            map_pos=map_pos,
+            min_ehh=min_ehh,
+            include_edges=include_edges,
+            max_gap=max_gap,
+            gap_scale=gap_scale,
+            use_threads=use_threads,
+            sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets),
+            # N.B., do not be tempted to convert this sample query into integer
+            # indices using _prep_sample_selection_params, because the indices
+            # are different in the haplotype data.
+            sample_query1=sample_query1,
+            sample_query2=sample_query2,
+            min_cohort_size=min_cohort_size,
+            max_cohort_size=max_cohort_size,
+            random_seed=random_seed,
+        )
+
+        try:
+            results = self.results_cache_get(name=name, params=params)
+
+        except CacheMiss:
+            results = self._xpehh_gwss(**params)  # self.
+            self.results_cache_set(name=name, params=params, results=results)
+
+        x = results["x"]
+        xpehh = results["xpehh"]
+
+        return x, xpehh
+
+    def _xpehh_gwss(
+        self,
+        *,
+        contig,
+        analysis,
+        sample_sets,
+        sample_query1,
+        sample_query2,
+        window_size,
+        percentiles,
+        standardize,
+        standardization_bins,
+        standardization_n_bins,
+        standardization_diagnostics,
+        filter_min_maf,
+        map_pos,
+        min_ehh,
+        max_gap,
+        gap_scale,
+        include_edges,
+        use_threads,
+        min_cohort_size,
+        max_cohort_size,
+        random_seed,
+    ):
+        ds_haps1 = self.haplotypes(
+            region=contig,
+            analysis=analysis,
+            sample_query=sample_query1,
+            sample_sets=sample_sets,
+            min_cohort_size=min_cohort_size,
+            max_cohort_size=max_cohort_size,
+            random_seed=random_seed,
+        )
+
+        ds_haps2 = self.haplotypes(
+            region=contig,
+            analysis=analysis,
+            sample_query=sample_query2,
+            sample_sets=sample_sets,
+            min_cohort_size=min_cohort_size,
+            max_cohort_size=max_cohort_size,
+            random_seed=random_seed,
+        )
+
+        gt1 = allel.GenotypeDaskArray(ds_haps1["call_genotype"].data)
+        gt2 = allel.GenotypeDaskArray(ds_haps2["call_genotype"].data)
+        with self._dask_progress(desc="Load haplotypes"):
+            ht1 = gt1.to_haplotypes().compute()
+            ht2 = gt2.to_haplotypes().compute()
+
+        ac1 = ht1.count_alleles(max_allele=1)
+        ac2 = ht2.count_alleles(max_allele=1)
+        pos = ds_haps1["variant_position"].values
+
+        # To be modified to take both into account
+        if filter_min_maf > 0:
+            af = ac1.to_frequencies()
+            maf = np.min(af, axis=1)
+            maf_filter = maf > filter_min_maf
+
+            ht1 = ht1.compress(maf_filter, axis=0)
+            ht2 = ht2.compress(maf_filter, axis=0)
+            pos = pos[maf_filter]
+            ac1 = ac1[maf_filter]
+            ac2 = ac2[maf_filter]
+
+        # compute XP-EHH
+        xp = allel.xpehh(
+            h1=ht1,
+            h2=ht2,
+            pos=pos,
+            map_pos=map_pos,
+            # min_maf=compute_min_maf,
+            min_ehh=min_ehh,
+            include_edges=include_edges,
+            max_gap=max_gap,
+            gap_scale=gap_scale,
+            use_threads=use_threads,
+        )
+
+        # remove any NaNs
+        na_mask = ~np.isnan(xp)
+        xp = xp[na_mask]
+        pos = pos[na_mask]
+        ac1 = ac1[na_mask]
+        ac2 = ac2[na_mask]
+
+        # take absolute value
+        xp = np.fabs(xp)
+
+        # Update to take into account both sets
+        if standardize:
+            xp, _ = allel.standardize_by_allele_count(
+                score=xp,
+                aac=ac1[:, 1],
+                bins=standardization_bins,
+                n_bins=standardization_n_bins,
+                diagnostics=standardization_diagnostics,
+            )
+
+        if window_size:
+            xp = allel.moving_statistic(
+                xp, statistic=np.percentile, size=window_size, q=percentiles
+            )
+            pos = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
+
+        results = dict(x=pos, xpehh=xp)
+
+        return results
+
+    @doc(
+        summary="Run and plot XP-EHH GWSS data.",
+    )
+    def plot_xpehh_gwss_track(
+        self,
+        contig: base_params.contig,
+        analysis: hap_params.analysis = DEFAULT,
+        sample_sets: Optional[base_params.sample_sets] = None,
+        sample_query1: Optional[base_params.sample_query] = None,
+        sample_query2: Optional[base_params.sample_query] = None,
+        window_size: xpehh_params.window_size = xpehh_params.window_size_default,
+        percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
+        standardize: xpehh_params.standardize = True,
+        standardization_bins: Optional[xpehh_params.standardization_bins] = None,
+        standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default,
+        standardization_diagnostics: xpehh_params.standardization_diagnostics = False,
+        filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default,
+        map_pos: Optional[xpehh_params.map_pos] = None,
+        min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default,
+        max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default,
+        gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default,
+        include_edges: xpehh_params.include_edges = True,
+        use_threads: xpehh_params.use_threads = True,
+        min_cohort_size: Optional[
+            base_params.min_cohort_size
+        ] = xpehh_params.min_cohort_size_default,
+        max_cohort_size: Optional[
+            base_params.max_cohort_size
+        ] = xpehh_params.max_cohort_size_default,
+        random_seed: base_params.random_seed = 42,
+        palette: xpehh_params.palette = xpehh_params.palette_default,
+        title: Optional[gplt_params.title] = None,
+        sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
+        width: gplt_params.width = gplt_params.width_default,
+        height: gplt_params.height = 200,
+        show: gplt_params.show = True,
+        x_range: Optional[gplt_params.x_range] = None,
+    ):
+        # compute ihs
+        x, xpehh = self.xpehh_gwss(
+            contig=contig,
+            analysis=analysis,
+            window_size=window_size,
+            percentiles=percentiles,
+            standardize=standardize,
+            standardization_bins=standardization_bins,
+            standardization_n_bins=standardization_n_bins,
+            standardization_diagnostics=standardization_diagnostics,
+            filter_min_maf=filter_min_maf,
+            map_pos=map_pos,
+            min_ehh=min_ehh,
+            max_gap=max_gap,
+            gap_scale=gap_scale,
+            include_edges=include_edges,
+            use_threads=use_threads,
+            min_cohort_size=min_cohort_size,
+            max_cohort_size=max_cohort_size,
+            sample_query1=sample_query1,
+            sample_query2=sample_query2,
+            sample_sets=sample_sets,
+            random_seed=random_seed,
+        )
+
+        # determine X axis range
+        x_min = x[0]
+        x_max = x[-1]
+        if x_range is None:
+            x_range = bokeh.models.Range1d(x_min, x_max, bounds="auto")
+
+        # create a figure
+        xwheel_zoom = bokeh.models.WheelZoomTool(
+            dimensions="width", maintain_focus=False
+        )
+        if title is None:
+            title = "'" + sample_query1 + "' and '" + sample_query2 + "'"
+        fig = bokeh.plotting.figure(
+            title=title,
+            tools=["xpan", "xzoom_in", "xzoom_out", xwheel_zoom, "reset"],
+            active_scroll=xwheel_zoom,
+            active_drag="xpan",
+            sizing_mode=sizing_mode,
+            width=width,
+            height=height,
+            toolbar_location="above",
+            x_range=x_range,
+        )
+
+        if window_size:
+            if isinstance(percentiles, int):
+                percentiles = (percentiles,)
+
+        # add an empty dimension to xpehh array if 1D
+        xpehh = np.reshape(xpehh, (xpehh.shape[0], -1))
+        bokeh_palette = bokeh.palettes.all_palettes[palette]
+        for i in range(xpehh.shape[1]):
+            xpehh_perc = xpehh[:, i]
+            if xpehh.shape[1] >= 3:
+                color = bokeh_palette[xpehh.shape[1]][i]
+            elif xpehh.shape[1] == 2:
+                color = bokeh_palette[3][i]
+            else:
+                color = None
+
+            # plot ihs
+            fig.circle(
+                x=x,
+                y=xpehh_perc,
+                size=3,
+                line_width=0.15,
+                line_color="black",
+                fill_color=color,
+            )
+
+        # tidy up the plot
+        fig.yaxis.axis_label = "xpehh"
+        self._bokeh_style_genome_xaxis(fig, contig)
+
+        if show:
+            bokeh.plotting.show(fig)
+
+        return fig
+
     @doc(
         summary="""
             Hierarchically cluster haplotypes in region and produce an interactive plot.

From 01c6563d863807c87515385c2f69097c9cd8e42d Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Wed, 10 May 2023 13:57:32 +0100
Subject: [PATCH 02/26] Trying to outsmart linting

---
 malariagen_data/anopheles.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 56c42d3cf..d1cfc5fb7 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -7325,7 +7325,10 @@ def plot_xpehh_gwss_track(
             dimensions="width", maintain_focus=False
         )
         if title is None:
-            title = "'" + sample_query1 + "' and '" + sample_query2 + "'"
+            if sample_query1 is None or sample_query2 is None:
+                title = "XP-EHH"
+            else:
+                title = "'" + sample_query1 + "' and '" + sample_query2 + "'"
         fig = bokeh.plotting.figure(
             title=title,
             tools=["xpan", "xzoom_in", "xzoom_out", xwheel_zoom, "reset"],

From e85699ea8df42e980622261e715d58b8dd842192 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Mon, 15 May 2023 09:30:49 +0100
Subject: [PATCH 03/26] Added the tests

---
 tests/test_ag3.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/test_ag3.py b/tests/test_ag3.py
index 442920516..9197d0673 100644
--- a/tests/test_ag3.py
+++ b/tests/test_ag3.py
@@ -3012,6 +3012,37 @@ def test_ihs_gwss():
     assert_allclose(ihs[:, 2][100], 2.3467595962486327)
 
 
+def test_xpehh_gwss():
+    ag3 = setup_ag3()
+    sample_query1 = "country == 'Ghana'"
+    sample_query2 = "country == 'Angola'"
+    contig = "3L"
+    analysis = "gamb_colu"
+    sample_sets = "3.0"
+    window_size = 1000
+
+    x, xpehh = ag3.xpehh_gwss(
+        contig=contig,
+        analysis=analysis,
+        sample_query1=sample_query1,
+        sample_query2=sample_query2,
+        sample_sets=sample_sets,
+        window_size=window_size,
+        max_cohort_size=20,
+    )
+
+    assert isinstance(x, np.ndarray)
+    assert isinstance(xpehh, np.ndarray)
+
+    # check dimensions
+    assert len(x) == 395
+    assert len(x) == len(xpehh)
+
+    # check some values
+    assert_allclose(x[0], 510232.847)
+    assert_allclose(xpehh[:, 2][100], 1.7302443850220177)
+
+
 def test_g123_gwss():
     ag3 = setup_ag3()
     sample_query = "country == 'Ghana'"

From aa95dd91da4fc2a9e5caffb8cf1180300129b6d4 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 13 Jun 2023 13:54:28 +0200
Subject: [PATCH 04/26] Update xpehh variable malariagen_data/ag3.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/ag3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index 2997c6805..d49d82792 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -114,7 +114,7 @@ class Ag3(AnophelesDataResource):
     _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME
     _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME
     _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
-    _xpehh_gwss_cache_name = IHS_GWSS_CACHE_NAME
+    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
     site_mask_ids = ("gamb_colu_arab", "gamb_colu", "arab")
     _default_site_mask = DEFAULT_SITE_MASK
     phasing_analysis_ids = ("gamb_colu_arab", "gamb_colu", "arab")

From 1e2f1f209426de7933115e081afb3b12054f9a03 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 13 Jun 2023 13:54:56 +0200
Subject: [PATCH 05/26] Remove initial comment malariagen_data/anopheles.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/anopheles.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 16e82f811..68834c628 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -6149,7 +6149,6 @@ def ihs_gwss(
     ) -> Tuple[np.ndarray, np.ndarray]:
         # change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data
-        print("Begin ihs")
         name = self._ihs_gwss_cache_name
         print("End ihs")
 

From 1aa208aa2f4e254f7758ac6d2d974b07c4379492 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 13 Jun 2023 13:55:23 +0200
Subject: [PATCH 06/26] Remove final comment malariagen_data/anopheles.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/anopheles.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 68834c628..277d265bc 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -6150,7 +6150,6 @@ def ihs_gwss(
         # change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data
         name = self._ihs_gwss_cache_name
-        print("End ihs")
 
         params = dict(
             contig=contig,

From 578139ffce02b3899e1e3db41149219f0c63086c Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 13 Jun 2023 13:56:39 +0200
Subject: [PATCH 07/26] Update forgotten instruction
 malariagen_data/anopheles.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/anopheles.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 277d265bc..088614555 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -7195,7 +7195,8 @@ def _xpehh_gwss(
 
         # To be modified to take both into account
         if filter_min_maf > 0:
-            af = ac1.to_frequencies()
+            ac = ac1 + ac2
+            af = ac.to_frequencies()
             maf = np.min(af, axis=1)
             maf_filter = maf > filter_min_maf
 

From a73f57769ac80fb3b581e8ae5dd5797da2bceff2 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 13 Jun 2023 13:57:23 +0200
Subject: [PATCH 08/26] Correct annotation malariagen_data/anopheles.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/anopheles.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 088614555..7db99df23 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -7356,7 +7356,7 @@ def plot_xpehh_gwss_track(
             else:
                 color = None
 
-            # plot ihs
+            # plot xpehh
             fig.circle(
                 x=x,
                 y=xpehh_perc,

From 7d001005738ec0a750ecea0620bb450f4c873925 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 13 Jun 2023 13:58:40 +0200
Subject: [PATCH 09/26] Improve display for webgl malariagen_data/anopheles.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/anopheles.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 7db99df23..5ea9ca729 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -7360,9 +7360,9 @@ def plot_xpehh_gwss_track(
             fig.circle(
                 x=x,
                 y=xpehh_perc,
-                size=3,
-                line_width=0.15,
-                line_color="black",
+                size=4,
+                line_width=0,
+                line_color=color,
                 fill_color=color,
             )
 

From e754799efd5470144b79651c7d7f9fdfc7516130 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 13 Jun 2023 15:01:52 +0200
Subject: [PATCH 10/26] I am not sure what the problem is

---
 malariagen_data/anoph/xpehh_params.py |  93 ++++
 malariagen_data/anopheles.py          | 609 +-------------------------
 2 files changed, 94 insertions(+), 608 deletions(-)
 create mode 100644 malariagen_data/anoph/xpehh_params.py

diff --git a/malariagen_data/anoph/xpehh_params.py b/malariagen_data/anoph/xpehh_params.py
new file mode 100644
index 000000000..ca2a940b3
--- /dev/null
+++ b/malariagen_data/anoph/xpehh_params.py
@@ -0,0 +1,93 @@
+"""Parameter definitions for XPEHH analysis functions."""
+
+from typing import Tuple, Union
+
+import numpy as np
+from typing_extensions import Annotated, TypeAlias
+
+from . import base_params
+
+window_size: TypeAlias = Annotated[
+    int,
+    """
+        The size of window in number of SNPs used to summarise XP-EHH over.
+        If None, per-variant XP-EHH values are returned.
+        """,
+]
+window_size_default: window_size = 200
+min_cohort_size_default: base_params.min_cohort_size = 15
+max_cohort_size_default: base_params.max_cohort_size = 50
+percentiles: TypeAlias = Annotated[
+    Union[int, Tuple[int, ...]],
+    """
+    If window size is specified, this returns the XP-EHH percentiles
+    for each window.
+    """,
+]
+percentiles_default: percentiles = (50, 75, 100)
+standardize: TypeAlias = Annotated[
+    bool, "If True, standardize XP-EHH values by alternate allele counts."
+]
+standardization_bins: TypeAlias = Annotated[
+    Tuple[float, ...],
+    "If provided, use these allele count bins to standardize XP-EHH values.",
+]
+standardization_n_bins: TypeAlias = Annotated[
+    int,
+    """
+    Number of allele count bins to use for standardization.
+    Overrides standardization_bins.
+    """,
+]
+standardization_n_bins_default: standardization_n_bins = 20
+standardization_diagnostics: TypeAlias = Annotated[
+    bool, "If True, plot some diagnostics about the standardization."
+]
+filter_min_maf: TypeAlias = Annotated[
+    float,
+    """
+    Minimum minor allele frequency to use for filtering prior to passing
+    haplotypes to allel.xpehh function
+    """,
+]
+filter_min_maf_default: filter_min_maf = 0.05
+map_pos: TypeAlias = Annotated[
+    np.ndarray,
+    """
+    Variant positions (genetic map distance).
+    """,
+]
+min_ehh: TypeAlias = Annotated[
+    float,
+    """
+    Minimum EHH beyond which to truncate integrated haplotype homozygosity
+    calculation.
+    """,
+]
+min_ehh_default: min_ehh = 0.05
+max_gap: TypeAlias = Annotated[
+    int,
+    """
+    Do not report scores if EHH spans a gap larger than this number of
+    base pairs.
+    """,
+]
+max_gap_default: max_gap = 200_000
+gap_scale: TypeAlias = Annotated[
+    int, "Rescale distance between variants if gap is larger than this value."
+]
+gap_scale_default: gap_scale = 20_000
+include_edges: TypeAlias = Annotated[
+    bool,
+    """
+    If True, report scores even if EHH does not decay below min_ehh at the
+    end of the chromosome.
+    """,
+]
+use_threads: TypeAlias = Annotated[
+    bool, "If True, use multiple threads to compute XP-EHH."
+]
+palette: TypeAlias = Annotated[
+    str, "Name of bokeh palette to use for plotting multiple percentiles."
+]
+palette_default: palette = "Blues"
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index c755dde75..2a1f1df39 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -36,6 +36,7 @@
     map_params,
     pca_params,
     plotly_params,
+    xpehh_params,
 )
 from .anoph.aim_data import AnophelesAimData
 from .anoph.base import AnophelesBase
@@ -61,614 +62,6 @@
     "effect in ['NON_SYNONYMOUS_CODING', 'START_LOST', 'STOP_LOST', 'STOP_GAINED']"
 )
 
-class hap_params:
-    """Parameter definitions for haplotype functions."""
-
-    analysis: TypeAlias = Annotated[
-        str,
-        """
-        Which haplotype phasing analysis to use. See the
-        `phasing_analysis_ids` property for available values.
-        """,
-    ]
-
-
-class h12_params:
-    """Parameter definitions for H12 analysis functions."""
-
-    window_sizes: TypeAlias = Annotated[
-        Tuple[int, ...],
-        """
-        The sizes of windows (number of SNPs) used to calculate statistics within.
-        """,
-    ]
-    window_sizes_default: window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)
-    window_size: TypeAlias = Annotated[
-        int,
-        """
-        The size of windows (number of SNPs) used to calculate statistics within.
-        """,
-    ]
-    cohort_size_default: Optional[base_params.cohort_size] = None
-    min_cohort_size_default: base_params.min_cohort_size = 15
-    max_cohort_size_default: base_params.max_cohort_size = 50
-
-
-class g123_params:
-    """Parameter definitions for G123 analysis functions."""
-
-    sites: TypeAlias = Annotated[
-        str,
-        """
-        Which sites to use: 'all' includes all sites that pass
-        site filters; 'segregating' includes only segregating sites for
-        the given cohort; or a phasing analysis identifier can be
-        provided to use sites from the haplotype data, which is an
-        approximation to finding segregating sites in the entire Ag3.0
-        (gambiae complex) or Af1.0 (funestus) cohort.
-        """,
-    ]
-    window_sizes: TypeAlias = Annotated[
-        Tuple[int, ...],
-        """
-        The sizes of windows (number of sites) used to calculate statistics within.
-        """,
-    ]
-    window_sizes_default: window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)
-    window_size: TypeAlias = Annotated[
-        int,
-        """
-        The size of windows (number of sites) used to calculate statistics within.
-        """,
-    ]
-    min_cohort_size_default: base_params.min_cohort_size = 20
-    max_cohort_size_default: base_params.max_cohort_size = 50
-
-
-class fst_params:
-    """Parameter definitions for Fst functions."""
-
-    # N.B., window size can mean different things for different functions
-    window_size: TypeAlias = Annotated[
-        int,
-        "The size of windows (number of sites) used to calculate statistics within.",
-    ]
-    cohort_size_default: Optional[base_params.cohort_size] = None
-    min_cohort_size_default: base_params.min_cohort_size = 15
-    max_cohort_size_default: base_params.max_cohort_size = 50
-
-
-class frq_params:
-    """Parameter definitions for functions computing and plotting allele frequencies."""
-
-    drop_invariant: TypeAlias = Annotated[
-        bool,
-        """
-        If True, drop variants not observed in the selected samples.
-        """,
-    ]
-    effects: TypeAlias = Annotated[bool, "If True, add SNP effect annotations."]
-    area_by: TypeAlias = Annotated[
-        str,
-        """
-        Column name in the sample metadata to use to group samples spatially. E.g.,
-        use "admin1_iso" or "admin1_name" to group by level 1 administrative
-        divisions, or use "admin2_name" to group by level 2 administrative
-        divisions.
-        """,
-    ]
-    period_by: TypeAlias = Annotated[
-        Literal["year", "quarter", "month"],
-        "Length of time to group samples temporally.",
-    ]
-    variant_query: TypeAlias = Annotated[
-        str,
-        "A pandas query to be evaluated against variants.",
-    ]
-    nobs_mode: TypeAlias = Annotated[
-        Literal["called", "fixed"],
-        """
-        Method for calculating the denominator when computing frequencies. If
-        "called" then use the number of called alleles, i.e., number of samples
-        with non-missing genotype calls multiplied by 2. If "fixed" then use the
-        number of samples multiplied by 2.
-        """,
-    ]
-    nobs_mode_default: nobs_mode = "called"
-    ci_method: TypeAlias = Annotated[
-        Literal["normal", "agresti_coull", "beta", "wilson", "binom_test"],
-        """
-        Method to use for computing confidence intervals, passed through to
-        `statsmodels.stats.proportion.proportion_confint`.
-        """,
-    ]
-    ci_method_default: ci_method = "wilson"
-    ds_frequencies_advanced: TypeAlias = Annotated[
-        xr.Dataset,
-        """
-        A dataset of variant frequencies, such as returned by
-        `snp_allele_frequencies_advanced()`,
-        `aa_allele_frequencies_advanced()` or
-        `gene_cnv_frequencies_advanced()`.
-        """,
-    ]
-
-
-class het_params:
-    """Parameters for functions related to heterozygosity and runs of homozygosity."""
-
-    sample: TypeAlias = Annotated[
-        Union[str, int],
-        "Sample identifier or index within sample set.",
-    ]
-    window_size: TypeAlias = Annotated[
-        int,
-        "Number of sites per window.",
-    ]
-    window_size_default: window_size = 20_000
-    phet_roh: TypeAlias = Annotated[
-        float,
-        "Probability of observing a heterozygote in a ROH.",
-    ]
-    phet_roh_default: phet_roh = 0.001
-    phet_nonroh: TypeAlias = Annotated[
-        Tuple[float, ...],
-        "One or more probabilities of observing a heterozygote outside a ROH.",
-    ]
-    phet_nonroh_default: phet_nonroh = (0.003, 0.01)
-    transition: TypeAlias = Annotated[
-        float,
-        """
-        Probability of moving between states. A larger window size may call
-        for a larger transitional probability.
-        """,
-    ]
-    transition_default: transition = 0.001
-    y_max: TypeAlias = Annotated[
-        float,
-        "Y axis limit.",
-    ]
-    y_max_default: y_max = 0.03
-    circle_kwargs: TypeAlias = Annotated[
-        Mapping,
-        "Passed through to bokeh circle() function.",
-    ]
-    df_roh: TypeAlias = Annotated[
-        pd.DataFrame,
-        """
-        A DataFrame where each row provides data about a single run of
-        homozygosity.
-        """,
-    ]
-    heterozygosity_height: TypeAlias = Annotated[
-        int,
-        "Height in pixels (px) of heterozygosity track.",
-    ]
-    roh_height: TypeAlias = Annotated[
-        int,
-        "Height in pixels (px) of runs of homozygosity track.",
-    ]
-
-
-class pca_params:
-    """Parameters for PCA functions."""
-
-    n_snps: TypeAlias = Annotated[
-        int,
-        """
-        The desired number of SNPs to use when running the analysis.
-        SNPs will be evenly thinned to approximately this number.
-        """,
-    ]
-    thin_offset: TypeAlias = Annotated[
-        int,
-        """
-        Starting index for SNP thinning. Change this to repeat the analysis
-        using a different set of SNPs.
-        """,
-    ]
-    thin_offset_default: thin_offset = 0
-    min_minor_ac: TypeAlias = Annotated[
-        int,
-        """
-        The minimum minor allele count. SNPs with a minor allele count
-        below this value will be excluded prior to thinning.
-        """,
-    ]
-    min_minor_ac_default: min_minor_ac = 2
-    max_missing_an: TypeAlias = Annotated[
-        int,
-        """
-        The maximum number of missing allele calls to accept. SNPs with
-        more than this value will be excluded prior to thinning. Set to 0
-        (default) to require no missing calls.
-        """,
-    ]
-    max_missing_an_default = 0
-    n_components: TypeAlias = Annotated[
-        int,
-        "Number of components to return.",
-    ]
-    n_components_default: n_components = 20
-    df_pca: TypeAlias = Annotated[
-        pd.DataFrame,
-        """
-        A dataframe of sample metadata, with columns "PC1", "PC2", "PC3",
-        etc., added.
-        """,
-    ]
-    evr: TypeAlias = Annotated[
-        np.ndarray,
-        "An array of explained variance ratios, one per component.",
-    ]
-
-
-class plotly_params:
-    """Parameters for any plotting functions using plotly."""
-
-    # N.B., most of these parameters are always able to take None
-    # and so we set as Optional here, rather than having to repeat
-    # that for each function doc.
-
-    x_label: TypeAlias = Annotated[
-        Optional[str],
-        "X axis label.",
-    ]
-    y_label: TypeAlias = Annotated[
-        Optional[str],
-        "Y axis label.",
-    ]
-    width: TypeAlias = Annotated[
-        Optional[int],
-        "Plot width in pixels (px).",
-    ]
-    height: TypeAlias = Annotated[
-        Optional[int],
-        "Plot height in pixels (px).",
-    ]
-    aspect: TypeAlias = Annotated[
-        Optional[Literal["equal", "auto"]],
-        "Aspect ratio, see also https://plotly.com/python-api-reference/generated/plotly.express.imshow",
-    ]
-    title: TypeAlias = Annotated[
-        Optional[Union[str, bool]],
-        """
-        If True, attempt to use metadata from input dataset as a plot title.
-        Otherwise, use supplied value as a title.
-        """,
-    ]
-    text_auto: TypeAlias = Annotated[
-        Union[bool, str],
-        """
-        If True or a string, single-channel img values will be displayed as text. A
-        string like '.2f' will be interpreted as a texttemplate numeric formatting
-        directive.
-        """,
-    ]
-    color_continuous_scale: TypeAlias = Annotated[
-        Optional[Union[str, List[str]]],
-        """
-        Colormap used to map scalar data to colors (for a 2D image). This
-        parameter is not used for RGB or RGBA images. If a string is provided,
-        it should be the name of a known color scale, and if a list is provided,
-        it should be a list of CSS-compatible colors.
-        """,
-    ]
-    colorbar: TypeAlias = Annotated[
-        bool,
-        "If False, do not display a color bar.",
-    ]
-    x: TypeAlias = Annotated[
-        str,
-        "Name of variable to plot on the X axis.",
-    ]
-    y: TypeAlias = Annotated[
-        str,
-        "Name of variable to plot on the Y axis.",
-    ]
-    z: TypeAlias = Annotated[
-        str,
-        "Name of variable to plot on the Z axis.",
-    ]
-    color: TypeAlias = Annotated[
-        Optional[str],
-        "Name of variable to use to color the markers.",
-    ]
-    symbol: TypeAlias = Annotated[
-        Optional[str],
-        "Name of the variable to use to choose marker symbols.",
-    ]
-    jitter_frac: TypeAlias = Annotated[
-        Optional[float],
-        "Randomly jitter points by this fraction of their range.",
-    ]
-    marker_size: TypeAlias = Annotated[
-        int,
-        "Marker size.",
-    ]
-    template: TypeAlias = Annotated[
-        Optional[
-            Literal[
-                "ggplot2",
-                "seaborn",
-                "simple_white",
-                "plotly",
-                "plotly_white",
-                "plotly_dark",
-                "presentation",
-                "xgridoff",
-                "ygridoff",
-                "gridon",
-                "none",
-            ]
-        ],
-        "The figure template name (must be a key in plotly.io.templates).",
-    ]
-    figure: TypeAlias = Annotated[go.Figure, "A plotly figure."]
-
-
-class ihs_params:
-    window_size: TypeAlias = Annotated[
-        int,
-        """
-        The size of window in number of SNPs used to summarise iHS over.
-        If None, per-variant iHS values are returned.
-        """,
-    ]
-    window_size_default: window_size = 200
-    min_cohort_size_default: base_params.min_cohort_size = 15
-    max_cohort_size_default: base_params.max_cohort_size = 50
-    percentiles: TypeAlias = Annotated[
-        Union[int, Tuple[int, ...]],
-        """
-        If window size is specified, this returns the iHS percentiles
-        for each window.
-        """,
-    ]
-    percentiles_default: percentiles = (50, 75, 100)
-    standardize: TypeAlias = Annotated[
-        bool, "If True, standardize iHS values by alternate allele counts."
-    ]
-    standardization_bins: TypeAlias = Annotated[
-        Tuple[float, ...],
-        "If provided, use these allele count bins to standardize iHS values.",
-    ]
-    standardization_n_bins: TypeAlias = Annotated[
-        int,
-        """
-        Number of allele count bins to use for standardization.
-        Overrides standardization_bins.
-        """,
-    ]
-    standardization_n_bins_default: standardization_n_bins = 20
-    standardization_diagnostics: TypeAlias = Annotated[
-        bool, "If True, plot some diagnostics about the standardization."
-    ]
-    filter_min_maf: TypeAlias = Annotated[
-        float,
-        """
-        Minimum minor allele frequency to use for filtering prior to passing
-        haplotypes to allel.ihs function
-        """,
-    ]
-    filter_min_maf_default: filter_min_maf = 0.05
-    compute_min_maf: TypeAlias = Annotated[
-        float,
-        """
-        Do not compute integrated haplotype homozygosity for variants with
-        minor allele frequency below this threshold.
-        """,
-    ]
-    compute_min_maf_default: compute_min_maf = 0.05
-    min_ehh: TypeAlias = Annotated[
-        float,
-        """
-        Minimum EHH beyond which to truncate integrated haplotype homozygosity
-        calculation.
-        """,
-    ]
-    min_ehh_default: min_ehh = 0.05
-    max_gap: TypeAlias = Annotated[
-        int,
-        """
-        Do not report scores if EHH spans a gap larger than this number of
-        base pairs.
-        """,
-    ]
-    max_gap_default: max_gap = 200_000
-    gap_scale: TypeAlias = Annotated[
-        int, "Rescale distance between variants if gap is larger than this value."
-    ]
-    gap_scale_default: gap_scale = 20_000
-    include_edges: TypeAlias = Annotated[
-        bool,
-        """
-        If True, report scores even if EHH does not decay below min_ehh at the
-        end of the chromosome.
-        """,
-    ]
-    use_threads: TypeAlias = Annotated[
-        bool, "If True, use multiple threads to compute iHS."
-    ]
-    palette: TypeAlias = Annotated[
-        str, "Name of bokeh palette to use for plotting multiple percentiles."
-    ]
-    palette_default: palette = "Blues"
-
-
-class xpehh_params:
-    window_size: TypeAlias = Annotated[
-        int,
-        """
-        The size of window in number of SNPs used to summarise XP-EHH over.
-        If None, per-variant XP-EHH values are returned.
-        """,
-    ]
-    window_size_default: window_size = 200
-    min_cohort_size_default: base_params.min_cohort_size = 15
-    max_cohort_size_default: base_params.max_cohort_size = 50
-    percentiles: TypeAlias = Annotated[
-        Union[int, Tuple[int, ...]],
-        """
-        If window size is specified, this returns the XP-EHH percentiles
-        for each window.
-        """,
-    ]
-    percentiles_default: percentiles = (50, 75, 100)
-    standardize: TypeAlias = Annotated[
-        bool, "If True, standardize XP-EHH values by alternate allele counts."
-    ]
-    standardization_bins: TypeAlias = Annotated[
-        Tuple[float, ...],
-        "If provided, use these allele count bins to standardize XP-EHH values.",
-    ]
-    standardization_n_bins: TypeAlias = Annotated[
-        int,
-        """
-        Number of allele count bins to use for standardization.
-        Overrides standardization_bins.
-        """,
-    ]
-    standardization_n_bins_default: standardization_n_bins = 20
-    standardization_diagnostics: TypeAlias = Annotated[
-        bool, "If True, plot some diagnostics about the standardization."
-    ]
-    filter_min_maf: TypeAlias = Annotated[
-        float,
-        """
-        Minimum minor allele frequency to use for filtering prior to passing
-        haplotypes to allel.xpehh function
-        """,
-    ]
-    filter_min_maf_default: filter_min_maf = 0.05
-    map_pos: TypeAlias = Annotated[
-        np.ndarray,
-        """
-        Variant positions (genetic map distance).
-        """,
-    ]
-    min_ehh: TypeAlias = Annotated[
-        float,
-        """
-        Minimum EHH beyond which to truncate integrated haplotype homozygosity
-        calculation.
-        """,
-    ]
-    min_ehh_default: min_ehh = 0.05
-    max_gap: TypeAlias = Annotated[
-        int,
-        """
-        Do not report scores if EHH spans a gap larger than this number of
-        base pairs.
-        """,
-    ]
-    max_gap_default: max_gap = 200_000
-    gap_scale: TypeAlias = Annotated[
-        int, "Rescale distance between variants if gap is larger than this value."
-    ]
-    gap_scale_default: gap_scale = 20_000
-    include_edges: TypeAlias = Annotated[
-        bool,
-        """
-        If True, report scores even if EHH does not decay below min_ehh at the
-        end of the chromosome.
-        """,
-    ]
-    use_threads: TypeAlias = Annotated[
-        bool, "If True, use multiple threads to compute XP-EHH."
-    ]
-    palette: TypeAlias = Annotated[
-        str, "Name of bokeh palette to use for plotting multiple percentiles."
-    ]
-    palette_default: palette = "Blues"
-
-
-class hapclust_params:
-    linkage_method: TypeAlias = Annotated[
-        Literal[
-            "single", "complete", "average", "weighted", "centroid", "median", "ward"
-        ],
-        """
-        The linkage algorithm to use. See the Linkage Methods section of the
-        scipy.cluster.hierarchy.linkage docs for full descriptions.
-        """,
-    ]
-    linkage_method_default: linkage_method = "single"
-    count_sort: TypeAlias = Annotated[
-        bool,
-        """
-        For each node n, the order (visually, from left-to-right) n's two descendant
-        links are plotted is determined by this parameter. If True, the child with
-        the minimum number of original objects in its cluster is plotted first. Note
-        distance_sort and count_sort cannot both be True.
-        """,
-    ]
-    distance_sort: TypeAlias = Annotated[
-        bool,
-        """
-        For each node n, the order (visually, from left-to-right) n's two descendant
-        links are plotted is determined by this parameter. If True, The child with the
-        minimum distance between its direct descendants is plotted first.
-        """,
-    ]
-
-
-class hapnet_params:
-    max_dist: TypeAlias = Annotated[
-        int,
-        "Join network components up to a maximum distance of 2 SNP differences.",
-    ]
-    max_dist_default: max_dist = 2
-    color: TypeAlias = Annotated[
-        str,
-        """
-        Identifies a column in the sample metadata which determines the colour
-        of pie chart segments within nodes.
-        """,
-    ]
-    color_discrete_sequence: TypeAlias = Annotated[
-        List, "Provide a list of colours to use."
-    ]
-    color_discrete_map: TypeAlias = Annotated[
-        Mapping, "Provide an explicit mapping from values to colours."
-    ]
-    category_order: TypeAlias = Annotated[
-        List,
-        "Control the order in which values appear in the legend.",
-    ]
-    node_size_factor: TypeAlias = Annotated[
-        int,
-        "Control the sizing of nodes.",
-    ]
-    node_size_factor_default: node_size_factor = 50
-    layout: TypeAlias = Annotated[
-        str,
-        "Name of the network layout to use to position nodes.",
-    ]
-    layout_default: layout = "cose"
-    layout_params: TypeAlias = Annotated[
-        Mapping,
-        "Additional parameters to the layout algorithm.",
-    ]
-
-
-class dash_params:
-    height: TypeAlias = Annotated[int, "Height of the Dash app in pixels (px)."]
-    width: TypeAlias = Annotated[Union[int, str], "Width of the Dash app."]
-    server_mode: TypeAlias = Annotated[
-        Literal["inline", "external", "jupyterlab"],
-        """
-        Controls how the Jupyter Dash app will be launched. See
-        https://medium.com/plotly/introducing-jupyterdash-811f1f57c02e for
-        more information.
-        """,
-    ]
-    server_mode_default: server_mode = "inline"
-    server_port: TypeAlias = Annotated[
-        int,
-        "Manually override the port on which the Dash app will run.",
-    ]
-      
 # N.B., we are in the process of breaking up the AnophelesDataResource
 # class into multiple parent classes like AnophelesGenomeSequenceData
 # and AnophelesBase. This is work in progress, and further PRs are

From 72af9be74525d7bbbae7e87b173993d2d3dc49ec Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Wed, 14 Jun 2023 16:04:36 +0200
Subject: [PATCH 11/26] Removed (some of) the things that had been moved
 somewhere else

---
 malariagen_data/anopheles.py | 76 ++++++++++++++++++------------------
 tests/test_ag3.py            |  6 +--
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 2a1f1df39..f92c9165d 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -190,44 +190,44 @@ def _h1x_gwss_cache_name(self):
     def _ihs_gwss_cache_name(self):
         raise NotImplementedError("Must override _ihs_gwss_cache_name")
 
-    @property
-    @abstractmethod
-    def _xpehh_gwss_cache_name(self):
-        raise NotImplementedError("Must override _xpehh_gwss_cache_name")
-
-    @property
-    @abstractmethod
-    def _site_annotations_zarr_path(self):
-        raise NotImplementedError("Must override _site_annotations_zarr_path")
-
-    # Start of @abstractmethod
-
-    @property
-    @abstractmethod
-    def site_mask_ids(self):
-        """Identifiers for the different site masks that are available.
-        These are values than can be used for the `site_mask` parameter in any
-        method making using of SNP data.
-
-        """
-        raise NotImplementedError("Must override _site_mask_ids")
-
-    @property
-    @abstractmethod
-    def _default_site_mask(self):
-        raise NotImplementedError("Must override _default_site_mask")
-
-    def _prep_site_mask_param(self, *, site_mask):
-        if site_mask is None:
-            # allowed
-            pass
-        elif site_mask == DEFAULT:
-            site_mask = self._default_site_mask
-        elif site_mask not in self.site_mask_ids:
-            raise ValueError(
-                f"Invalid site mask, must be one of f{self.site_mask_ids}."
-            )
-        return site_mask
+    # @property
+    # @abstractmethod
+    # def _xpehh_gwss_cache_name(self):
+    #     raise NotImplementedError("Must override _xpehh_gwss_cache_name")
+
+    # @property
+    # @abstractmethod
+    # def _site_annotations_zarr_path(self):
+    #     raise NotImplementedError("Must override _site_annotations_zarr_path")
+
+    # # Start of @abstractmethod
+
+    # @property
+    # @abstractmethod
+    # def site_mask_ids(self):
+    #     """Identifiers for the different site masks that are available.
+    #     These are values than can be used for the `site_mask` parameter in any
+    #     method making using of SNP data.
+
+    #     """
+    #     raise NotImplementedError("Must override _site_mask_ids")
+
+    # @property
+    # @abstractmethod
+    # def _default_site_mask(self):
+    #     raise NotImplementedError("Must override _default_site_mask")
+
+    # def _prep_site_mask_param(self, *, site_mask):
+    #     if site_mask is None:
+    #         # allowed
+    #         pass
+    #     elif site_mask == DEFAULT:
+    #         site_mask = self._default_site_mask
+    #     elif site_mask not in self.site_mask_ids:
+    #         raise ValueError(
+    #             f"Invalid site mask, must be one of f{self.site_mask_ids}."
+    #         )
+    #     return site_mask
 
     @abstractmethod
     def _transcript_to_gene_name(self, transcript):
diff --git a/tests/test_ag3.py b/tests/test_ag3.py
index 05743370f..d8fee57ae 100644
--- a/tests/test_ag3.py
+++ b/tests/test_ag3.py
@@ -2288,12 +2288,12 @@ def test_xpehh_gwss():
     assert isinstance(xpehh, np.ndarray)
 
     # check dimensions
-    assert len(x) == 395
+    assert len(x) == 399
     assert len(x) == len(xpehh)
 
     # check some values
-    assert_allclose(x[0], 510232.847)
-    assert_allclose(xpehh[:, 2][100], 1.7302443850220177)
+    assert_allclose(x[0], 467448.348)
+    assert_allclose(xpehh[:, 2][100], 2.741348351867086)
 
 
 def test_g123_gwss():

From 25a83725f337eff4afbcf446e64fa0648b1b5328 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Wed, 14 Jun 2023 16:40:22 +0200
Subject: [PATCH 12/26] Hopefully solved the cache_name issue

---
 malariagen_data/af1.py       |  2 ++
 malariagen_data/ag3.py       |  2 ++
 malariagen_data/anopheles.py | 44 ++++--------------------------------
 3 files changed, 9 insertions(+), 39 deletions(-)

diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
index 0fe6c4113..683f9eaa2 100644
--- a/malariagen_data/af1.py
+++ b/malariagen_data/af1.py
@@ -13,6 +13,7 @@
 H12_CALIBRATION_CACHE_NAME = "af1_h12_calibration_v1"
 H12_GWSS_CACHE_NAME = "af1_h12_gwss_v1"
 G123_GWSS_CACHE_NAME = "af1_g123_gwss_v1"
+XPEHH_GWSS_CACHE_NAME = "af1_g123_gwss_v1"
 G123_CALIBRATION_CACHE_NAME = "af1_g123_calibration_v1"
 H1X_GWSS_CACHE_NAME = "af1_h1x_gwss_v1"
 IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"
@@ -75,6 +76,7 @@ class Af1(AnophelesDataResource):
     _h12_calibration_cache_name = H12_CALIBRATION_CACHE_NAME
     _h12_gwss_cache_name = H12_GWSS_CACHE_NAME
     _g123_gwss_cache_name = G123_GWSS_CACHE_NAME
+    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
     _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME
     _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME
     _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index 1f97c4ae6..405796a47 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -42,6 +42,7 @@
 H12_GWSS_CACHE_NAME = "ag3_h12_gwss_v1"
 G123_CALIBRATION_CACHE_NAME = "ag3_g123_calibration_v1"
 G123_GWSS_CACHE_NAME = "ag3_g123_gwss_v1"
+XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
 H1X_GWSS_CACHE_NAME = "ag3_h1x_gwss_v1"
 IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
 XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
@@ -140,6 +141,7 @@ class Ag3(AnophelesDataResource):
     _h12_calibration_cache_name = H12_CALIBRATION_CACHE_NAME
     _h12_gwss_cache_name = H12_GWSS_CACHE_NAME
     _g123_gwss_cache_name = G123_GWSS_CACHE_NAME
+    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
     _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME
     _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME
     _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index f92c9165d..7b214a81d 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -180,6 +180,11 @@ def _h12_gwss_cache_name(self):
     def _g123_gwss_cache_name(self):
         raise NotImplementedError("Must override _g123_gwss_cache_name")
 
+    @property
+    @abstractmethod
+    def _xpehh_gwss_cache_name(self):
+        raise NotImplementedError("Must override _xpehh_gwss_cache_name")
+
     @property
     @abstractmethod
     def _h1x_gwss_cache_name(self):
@@ -190,45 +195,6 @@ def _h1x_gwss_cache_name(self):
     def _ihs_gwss_cache_name(self):
         raise NotImplementedError("Must override _ihs_gwss_cache_name")
 
-    # @property
-    # @abstractmethod
-    # def _xpehh_gwss_cache_name(self):
-    #     raise NotImplementedError("Must override _xpehh_gwss_cache_name")
-
-    # @property
-    # @abstractmethod
-    # def _site_annotations_zarr_path(self):
-    #     raise NotImplementedError("Must override _site_annotations_zarr_path")
-
-    # # Start of @abstractmethod
-
-    # @property
-    # @abstractmethod
-    # def site_mask_ids(self):
-    #     """Identifiers for the different site masks that are available.
-    #     These are values than can be used for the `site_mask` parameter in any
-    #     method making using of SNP data.
-
-    #     """
-    #     raise NotImplementedError("Must override _site_mask_ids")
-
-    # @property
-    # @abstractmethod
-    # def _default_site_mask(self):
-    #     raise NotImplementedError("Must override _default_site_mask")
-
-    # def _prep_site_mask_param(self, *, site_mask):
-    #     if site_mask is None:
-    #         # allowed
-    #         pass
-    #     elif site_mask == DEFAULT:
-    #         site_mask = self._default_site_mask
-    #     elif site_mask not in self.site_mask_ids:
-    #         raise ValueError(
-    #             f"Invalid site mask, must be one of f{self.site_mask_ids}."
-    #         )
-    #     return site_mask
-
     @abstractmethod
     def _transcript_to_gene_name(self, transcript):
         # children may have different manual overrides.

From d381c8d4fd78f3e42c36f806adde5f66fb8331c0 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Thu, 15 Jun 2023 08:08:12 +0200
Subject: [PATCH 13/26] Changed parameter names to be more uniform

---
 malariagen_data/anopheles.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 7b214a81d..968cb0d3c 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -5194,8 +5194,8 @@ def _xpehh_gwss(
         contig,
         analysis,
         sample_sets,
-        sample_query1,
-        sample_query2,
+        cohort1_query,
+        cohort2_query,
         window_size,
         percentiles,
         standardize,
@@ -5216,7 +5216,7 @@ def _xpehh_gwss(
         ds_haps1 = self.haplotypes(
             region=contig,
             analysis=analysis,
-            sample_query=sample_query1,
+            sample_query=cohort1_query,
             sample_sets=sample_sets,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
@@ -5226,7 +5226,7 @@ def _xpehh_gwss(
         ds_haps2 = self.haplotypes(
             region=contig,
             analysis=analysis,
-            sample_query=sample_query2,
+            sample_query=cohort2_query,
             sample_sets=sample_sets,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
@@ -5308,8 +5308,8 @@ def plot_xpehh_gwss_track(
         contig: base_params.contig,
         analysis: hap_params.analysis = DEFAULT,
         sample_sets: Optional[base_params.sample_sets] = None,
-        sample_query1: Optional[base_params.sample_query] = None,
-        sample_query2: Optional[base_params.sample_query] = None,
+        cohort1_query: Optional[base_params.sample_query] = None,
+        cohort2_query: Optional[base_params.sample_query] = None,
         window_size: xpehh_params.window_size = xpehh_params.window_size_default,
         percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
         standardize: xpehh_params.standardize = True,
@@ -5357,8 +5357,8 @@ def plot_xpehh_gwss_track(
             use_threads=use_threads,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
-            sample_query1=sample_query1,
-            sample_query2=sample_query2,
+            cohort1_query=cohort1_query,
+            cohort2_query=cohort2_query,
             sample_sets=sample_sets,
             random_seed=random_seed,
         )
@@ -5374,10 +5374,10 @@ def plot_xpehh_gwss_track(
             dimensions="width", maintain_focus=False
         )
         if title is None:
-            if sample_query1 is None or sample_query2 is None:
+            if cohort1_query is None or cohort2_query is None:
                 title = "XP-EHH"
             else:
-                title = "'" + sample_query1 + "' and '" + sample_query2 + "'"
+                title = f"Cohort 1: {cohort1_query}\nCohort 2: {cohort2_query}"
         fig = bokeh.plotting.figure(
             title=title,
             tools=["xpan", "xzoom_in", "xzoom_out", xwheel_zoom, "reset"],

From 23a81a4a4cb9eb58137d1616428855a12930e234 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Thu, 15 Jun 2023 08:19:06 +0200
Subject: [PATCH 14/26] Adds show paramater to plot_xpehh_gwss()

---
 malariagen_data/anopheles.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 968cb0d3c..2710d0c5c 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -4420,8 +4420,8 @@ def plot_xpehh_gwss(
         contig: base_params.contig,
         analysis: hap_params.analysis = DEFAULT,
         sample_sets: Optional[base_params.sample_sets] = None,
-        sample_query1: Optional[base_params.sample_query] = None,
-        sample_query2: Optional[base_params.sample_query] = None,
+        cohort1_query: Optional[base_params.sample_query] = None,
+        cohort2_query: Optional[base_params.sample_query] = None,
         window_size: xpehh_params.window_size = xpehh_params.window_size_default,
         percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
         standardize: xpehh_params.standardize = True,
@@ -4448,14 +4448,16 @@ def plot_xpehh_gwss(
         width: gplt_params.width = gplt_params.width_default,
         track_height: gplt_params.track_height = 170,
         genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
-    ) -> None:
+        show: gplt_params.show = True,
+        output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
+    ) -> gplt_params.figure:
         # gwss track
         fig1 = self.plot_xpehh_gwss_track(
             contig=contig,
             analysis=analysis,
             sample_sets=sample_sets,
-            sample_query1=sample_query1,
-            sample_query2=sample_query2,
+            cohort1_query=cohort1_query,
+            cohort2_query=cohort2_query,
             window_size=window_size,
             percentiles=percentiles,
             palette=palette,
@@ -4502,7 +4504,11 @@ def plot_xpehh_gwss(
             sizing_mode=sizing_mode,
         )
 
-        bokeh.plotting.show(fig)
+        if show:
+            bokeh.plotting.show(fig)
+            return None
+        else:
+            return fig
 
     @doc(
         summary="Run and plot iHS GWSS data.",
@@ -5337,7 +5343,8 @@ def plot_xpehh_gwss_track(
         height: gplt_params.height = 200,
         show: gplt_params.show = True,
         x_range: Optional[gplt_params.x_range] = None,
-    ):
+        output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
+    ) -> gplt_params.figure:
         # compute ihs
         x, xpehh = self.xpehh_gwss(
             contig=contig,

From 2f8a59c61957930cc700df0944c77ffcbb0ec8ac Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Thu, 15 Jun 2023 08:22:41 +0200
Subject: [PATCH 15/26] Added support for output_backend

---
 malariagen_data/anopheles.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 2710d0c5c..a7765a993 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -4493,6 +4493,7 @@ def plot_xpehh_gwss(
             height=genes_height,
             x_range=fig1.x_range,
             show=False,
+            output_backend=output_backend,
         )
 
         # combine plots into a single figure
@@ -5395,6 +5396,7 @@ def plot_xpehh_gwss_track(
             height=height,
             toolbar_location="above",
             x_range=x_range,
+            output_backend=output_backend,
         )
 
         if window_size:
@@ -5429,8 +5431,9 @@ def plot_xpehh_gwss_track(
 
         if show:
             bokeh.plotting.show(fig)
-
-        return fig
+            return None
+        else:
+            return fig
 
     @doc(
         summary="""

From 63659483f65476d0e868a2d770e85074690eecb2 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Thu, 15 Jun 2023 08:27:30 +0200
Subject: [PATCH 16/26] Updated colors for webgl

---
 malariagen_data/anopheles.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index a7765a993..2a8ad5384 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -5402,18 +5402,24 @@ def plot_xpehh_gwss_track(
         if window_size:
             if isinstance(percentiles, int):
                 percentiles = (percentiles,)
+            # Ensure percentiles are sorted so that colors make sense.
+            percentiles = tuple(sorted(percentiles))
 
         # add an empty dimension to xpehh array if 1D
         xpehh = np.reshape(xpehh, (xpehh.shape[0], -1))
-        bokeh_palette = bokeh.palettes.all_palettes[palette]
+
+        # select the base color palette to work from
+        base_palette = bokeh.palettes.all_palettes[palette][8]
+
+        # keep only enough colours to plot the IHS tracks
+        bokeh_palette = base_palette[: xpehh.shape[1]]
+
+        # reverse the colors so darkest is last
+        bokeh_palette = bokeh_palette[::-1]
+
         for i in range(xpehh.shape[1]):
             xpehh_perc = xpehh[:, i]
-            if xpehh.shape[1] >= 3:
-                color = bokeh_palette[xpehh.shape[1]][i]
-            elif xpehh.shape[1] == 2:
-                color = bokeh_palette[3][i]
-            else:
-                color = None
+            color = bokeh_palette[i]
 
             # plot xpehh
             fig.circle(

From f1f212ea8634c211a6a9312f1d4915e0aba68b74 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Thu, 15 Jun 2023 09:37:32 +0200
Subject: [PATCH 17/26] Updated the tests with new parameter names

---
 malariagen_data/anopheles.py | 8 ++++----
 tests/test_ag3.py            | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 2a8ad5384..cc5e23d7b 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -5129,8 +5129,8 @@ def xpehh_gwss(
         contig: base_params.contig,
         analysis: hap_params.analysis = DEFAULT,
         sample_sets: Optional[base_params.sample_sets] = None,
-        sample_query1: Optional[base_params.sample_query] = None,
-        sample_query2: Optional[base_params.sample_query] = None,
+        cohort1_query: Optional[base_params.sample_query] = None,
+        cohort2_query: Optional[base_params.sample_query] = None,
         window_size: xpehh_params.window_size = xpehh_params.window_size_default,
         percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
         standardize: xpehh_params.standardize = True,
@@ -5176,8 +5176,8 @@ def xpehh_gwss(
             # N.B., do not be tempted to convert this sample query into integer
             # indices using _prep_sample_selection_params, because the indices
             # are different in the haplotype data.
-            sample_query1=sample_query1,
-            sample_query2=sample_query2,
+            cohort1_query=cohort1_query,
+            cohort2_query=cohort2_query,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
             random_seed=random_seed,
diff --git a/tests/test_ag3.py b/tests/test_ag3.py
index d8fee57ae..102ebd266 100644
--- a/tests/test_ag3.py
+++ b/tests/test_ag3.py
@@ -2267,8 +2267,8 @@ def test_ihs_gwss():
 
 def test_xpehh_gwss():
     ag3 = setup_ag3()
-    sample_query1 = "country == 'Ghana'"
-    sample_query2 = "country == 'Angola'"
+    cohort1_query = "country == 'Ghana'"
+    cohort2_query = "country == 'Angola'"
     contig = "3L"
     analysis = "gamb_colu"
     sample_sets = "3.0"
@@ -2277,8 +2277,8 @@ def test_xpehh_gwss():
     x, xpehh = ag3.xpehh_gwss(
         contig=contig,
         analysis=analysis,
-        sample_query1=sample_query1,
-        sample_query2=sample_query2,
+        cohort1_query=cohort1_query,
+        cohort2_query=cohort2_query,
         sample_sets=sample_sets,
         window_size=window_size,
         max_cohort_size=20,

From 256ecfa39c351e175c1b03934069161aed87a706 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Fri, 16 Jun 2023 10:43:50 +0200
Subject: [PATCH 18/26] Correct bad copy/paste

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/af1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
index 683f9eaa2..47b8b8c98 100644
--- a/malariagen_data/af1.py
+++ b/malariagen_data/af1.py
@@ -13,7 +13,7 @@
 H12_CALIBRATION_CACHE_NAME = "af1_h12_calibration_v1"
 H12_GWSS_CACHE_NAME = "af1_h12_gwss_v1"
 G123_GWSS_CACHE_NAME = "af1_g123_gwss_v1"
-XPEHH_GWSS_CACHE_NAME = "af1_g123_gwss_v1"
+XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1"
 G123_CALIBRATION_CACHE_NAME = "af1_g123_calibration_v1"
 H1X_GWSS_CACHE_NAME = "af1_h1x_gwss_v1"
 IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"

From 282013c53fac3335fa7c655caf4f879eae8c4fe9 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Fri, 16 Jun 2023 10:45:09 +0200
Subject: [PATCH 19/26] Corrected missing parameter

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
---
 malariagen_data/anopheles.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index cc5e23d7b..30c8119fa 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -4481,6 +4481,7 @@ def plot_xpehh_gwss(
             height=track_height,
             show=False,
             x_range=None,
+            output_backend=output_backend,
         )
 
         fig1.xaxis.visible = False

From 7ed2ea67f4773805a8be311493ca54c55b5de4db Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Fri, 16 Jun 2023 11:00:14 +0200
Subject: [PATCH 20/26] A few lines had escaped deletion

---
 malariagen_data/ag3.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index 405796a47..59b899415 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -145,11 +145,6 @@ class Ag3(AnophelesDataResource):
     _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME
     _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME
     _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
-    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
-    site_mask_ids = ("gamb_colu_arab", "gamb_colu", "arab")
-    _default_site_mask = DEFAULT_SITE_MASK
-    phasing_analysis_ids = ("gamb_colu_arab", "gamb_colu", "arab")
-    _default_phasing_analysis = "gamb_colu_arab"
 
     def __init__(
         self,

From ac91865a64e6158a77eab1deff64dd3b464c6290 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 20 Jun 2023 15:59:07 +0200
Subject: [PATCH 21/26] Unremoved an important line

---
 malariagen_data/ag3.py                |  2 --
 malariagen_data/anoph/xpehh_params.py | 18 ------------
 malariagen_data/anopheles.py          | 40 ++-------------------------
 3 files changed, 3 insertions(+), 57 deletions(-)

diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index 59b899415..590e76ba6 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -45,8 +45,6 @@
 XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
 H1X_GWSS_CACHE_NAME = "ag3_h1x_gwss_v1"
 IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
-XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
-DEFAULT_SITE_MASK = "gamb_colu_arab"
 
 
 def _setup_aim_palettes():
diff --git a/malariagen_data/anoph/xpehh_params.py b/malariagen_data/anoph/xpehh_params.py
index ca2a940b3..d8dab0016 100644
--- a/malariagen_data/anoph/xpehh_params.py
+++ b/malariagen_data/anoph/xpehh_params.py
@@ -25,24 +25,6 @@
     """,
 ]
 percentiles_default: percentiles = (50, 75, 100)
-standardize: TypeAlias = Annotated[
-    bool, "If True, standardize XP-EHH values by alternate allele counts."
-]
-standardization_bins: TypeAlias = Annotated[
-    Tuple[float, ...],
-    "If provided, use these allele count bins to standardize XP-EHH values.",
-]
-standardization_n_bins: TypeAlias = Annotated[
-    int,
-    """
-    Number of allele count bins to use for standardization.
-    Overrides standardization_bins.
-    """,
-]
-standardization_n_bins_default: standardization_n_bins = 20
-standardization_diagnostics: TypeAlias = Annotated[
-    bool, "If True, plot some diagnostics about the standardization."
-]
 filter_min_maf: TypeAlias = Annotated[
     float,
     """
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 30c8119fa..7f80543a1 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -4413,7 +4413,7 @@ def plot_ihs_gwss_track(
 
     @check_types
     @doc(
-        summary="Run and plot iHS GWSS data.",
+        summary="Run and plot XP-EHH GWSS data.",
     )
     def plot_xpehh_gwss(
         self,
@@ -5134,10 +5134,6 @@ def xpehh_gwss(
         cohort2_query: Optional[base_params.sample_query] = None,
         window_size: xpehh_params.window_size = xpehh_params.window_size_default,
         percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
-        standardize: xpehh_params.standardize = True,
-        standardization_bins: Optional[xpehh_params.standardization_bins] = None,
-        standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default,
-        standardization_diagnostics: xpehh_params.standardization_diagnostics = False,
         filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default,
         map_pos: Optional[xpehh_params.map_pos] = None,
         min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default,
@@ -5162,10 +5158,6 @@ def xpehh_gwss(
             analysis=self._prep_phasing_analysis_param(analysis=analysis),
             window_size=window_size,
             percentiles=percentiles,
-            standardize=standardize,
-            standardization_bins=standardization_bins,
-            standardization_n_bins=standardization_n_bins,
-            standardization_diagnostics=standardization_diagnostics,
             filter_min_maf=filter_min_maf,
             map_pos=map_pos,
             min_ehh=min_ehh,
@@ -5206,10 +5198,6 @@ def _xpehh_gwss(
         cohort2_query,
         window_size,
         percentiles,
-        standardize,
-        standardization_bins,
-        standardization_n_bins,
-        standardization_diagnostics,
         filter_min_maf,
         map_pos,
         min_ehh,
@@ -5270,7 +5258,6 @@ def _xpehh_gwss(
             h2=ht2,
             pos=pos,
             map_pos=map_pos,
-            # min_maf=compute_min_maf,
             min_ehh=min_ehh,
             include_edges=include_edges,
             max_gap=max_gap,
@@ -5285,19 +5272,6 @@ def _xpehh_gwss(
         ac1 = ac1[na_mask]
         ac2 = ac2[na_mask]
 
-        # take absolute value
-        xp = np.fabs(xp)
-
-        # Update to take into account both sets
-        if standardize:
-            xp, _ = allel.standardize_by_allele_count(
-                score=xp,
-                aac=ac1[:, 1],
-                bins=standardization_bins,
-                n_bins=standardization_n_bins,
-                diagnostics=standardization_diagnostics,
-            )
-
         if window_size:
             xp = allel.moving_statistic(
                 xp, statistic=np.percentile, size=window_size, q=percentiles
@@ -5320,10 +5294,6 @@ def plot_xpehh_gwss_track(
         cohort2_query: Optional[base_params.sample_query] = None,
         window_size: xpehh_params.window_size = xpehh_params.window_size_default,
         percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
-        standardize: xpehh_params.standardize = True,
-        standardization_bins: Optional[xpehh_params.standardization_bins] = None,
-        standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default,
-        standardization_diagnostics: xpehh_params.standardization_diagnostics = False,
         filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default,
         map_pos: Optional[xpehh_params.map_pos] = None,
         min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default,
@@ -5347,16 +5317,12 @@ def plot_xpehh_gwss_track(
         x_range: Optional[gplt_params.x_range] = None,
         output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
     ) -> gplt_params.figure:
-        # compute ihs
+        # compute xpehh
         x, xpehh = self.xpehh_gwss(
             contig=contig,
             analysis=analysis,
             window_size=window_size,
             percentiles=percentiles,
-            standardize=standardize,
-            standardization_bins=standardization_bins,
-            standardization_n_bins=standardization_n_bins,
-            standardization_diagnostics=standardization_diagnostics,
             filter_min_maf=filter_min_maf,
             map_pos=map_pos,
             min_ehh=min_ehh,
@@ -5412,7 +5378,7 @@ def plot_xpehh_gwss_track(
         # select the base color palette to work from
         base_palette = bokeh.palettes.all_palettes[palette][8]
 
-        # keep only enough colours to plot the IHS tracks
+        # keep only enough colours to plot the XPEHH tracks
         bokeh_palette = base_palette[: xpehh.shape[1]]
 
         # reverse the colors so darkest is last

From 966df02c4003fc4343ab49fea0901603b74d75dd Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 20 Jun 2023 16:17:17 +0200
Subject: [PATCH 22/26] Forgot the main file

---
 malariagen_data/anopheles.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index 7f80543a1..fc0bc49e9 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -4424,10 +4424,6 @@ def plot_xpehh_gwss(
         cohort2_query: Optional[base_params.sample_query] = None,
         window_size: xpehh_params.window_size = xpehh_params.window_size_default,
         percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default,
-        standardize: xpehh_params.standardize = True,
-        standardization_bins: Optional[xpehh_params.standardization_bins] = None,
-        standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default,
-        standardization_diagnostics: xpehh_params.standardization_diagnostics = False,
         filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default,
         map_pos: Optional[xpehh_params.map_pos] = None,
         min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default,
@@ -4461,10 +4457,6 @@ def plot_xpehh_gwss(
             window_size=window_size,
             percentiles=percentiles,
             palette=palette,
-            standardize=standardize,
-            standardization_bins=standardization_bins,
-            standardization_n_bins=standardization_n_bins,
-            standardization_diagnostics=standardization_diagnostics,
             filter_min_maf=filter_min_maf,
             map_pos=map_pos,
             min_ehh=min_ehh,
@@ -5372,13 +5364,13 @@ def plot_xpehh_gwss_track(
             # Ensure percentiles are sorted so that colors make sense.
             percentiles = tuple(sorted(percentiles))
 
-        # add an empty dimension to xpehh array if 1D
+        # add an empty dimension to XP-EHH array if 1D
         xpehh = np.reshape(xpehh, (xpehh.shape[0], -1))
 
         # select the base color palette to work from
         base_palette = bokeh.palettes.all_palettes[palette][8]
 
-        # keep only enough colours to plot the XPEHH tracks
+        # keep only enough colours to plot the XP-EHH tracks
         bokeh_palette = base_palette[: xpehh.shape[1]]
 
         # reverse the colors so darkest is last
@@ -5388,7 +5380,7 @@ def plot_xpehh_gwss_track(
             xpehh_perc = xpehh[:, i]
             color = bokeh_palette[i]
 
-            # plot xpehh
+            # plot XP-EHH
             fig.circle(
                 x=x,
                 y=xpehh_perc,
@@ -5399,7 +5391,7 @@ def plot_xpehh_gwss_track(
             )
 
         # tidy up the plot
-        fig.yaxis.axis_label = "xpehh"
+        fig.yaxis.axis_label = "XP-EHH"
         self._bokeh_style_genome_xaxis(fig, contig)
 
         if show:

From 8aad3b541830bc7f761c3baf0e9b61b9116a3c81 Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 20 Jun 2023 17:20:43 +0200
Subject: [PATCH 23/26] update the test

---
 tests/test_ag3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_ag3.py b/tests/test_ag3.py
index 102ebd266..d916e23a3 100644
--- a/tests/test_ag3.py
+++ b/tests/test_ag3.py
@@ -2293,7 +2293,7 @@ def test_xpehh_gwss():
 
     # check some values
     assert_allclose(x[0], 467448.348)
-    assert_allclose(xpehh[:, 2][100], 2.741348351867086)
+    assert_allclose(xpehh[:, 2][100], 0.4817561326426265)
 
 
 def test_g123_gwss():

From fdde74f04f60cc609680e7c47b586becefd10bab Mon Sep 17 00:00:00 2001
From: jonbrenas <51911846+jonbrenas@users.noreply.github.com>
Date: Tue, 20 Jun 2023 20:50:15 +0200
Subject: [PATCH 24/26] Added a plot notebook as an example

---
 notebooks/plot_xpehh_gwss.ipynb | 178 ++++++++++++++++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100644 notebooks/plot_xpehh_gwss.ipynb

diff --git a/notebooks/plot_xpehh_gwss.ipynb b/notebooks/plot_xpehh_gwss.ipynb
new file mode 100644
index 000000000..d6d6dfe46
--- /dev/null
+++ b/notebooks/plot_xpehh_gwss.ipynb
@@ -0,0 +1,178 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32067e1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import malariagen_data\n",
+    "\n",
+    "ag3 = malariagen_data.Ag3(\n",
+    "    \"simplecache::gs://vo_agam_release\",\n",
+    "    simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
+    ")\n",
+    "ag3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "966049ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "af1 = malariagen_data.Af1(\n",
+    "    \"simplecache::gs://vo_afun_release\",\n",
+    "    simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
+    ")\n",
+    "af1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "838eddc4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_xpehh_gwss_track(\n",
+    "    contig=\"X\",\n",
+    "    window_size=1_000,\n",
+    "    analysis=\"gamb_colu\",\n",
+    "    percentiles=(50, 60, 90, 100),\n",
+    "    cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n",
+    "    cohort2_query=\"country == 'Mayotte'\",\n",
+    "    max_cohort_size=10,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e780aa23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_xpehh_gwss_track(\n",
+    "    contig=\"X\",\n",
+    "    window_size=1_000,\n",
+    "    analysis=\"gamb_colu\",\n",
+    "    cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n",
+    "    cohort2_query=\"country == 'Mayotte'\",\n",
+    "    max_cohort_size=10,\n",
+    "    filter_min_maf=0.1,\n",
+    "    width=700,\n",
+    "    sizing_mode=\"fixed\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b26a9654",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_xpehh_gwss_track(\n",
+    "    contig=\"X\",\n",
+    "    window_size=100,\n",
+    "    analysis=\"gamb_colu\",\n",
+    "    cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n",
+    "    cohort2_query=\"country == 'Mayotte'\",\n",
+    "    max_cohort_size=10,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89cf50ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_xpehh_gwss_track(\n",
+    "    contig=\"X\",\n",
+    "    window_size=1_000,\n",
+    "    analysis=\"gamb_colu\",\n",
+    "    cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n",
+    "    cohort2_query=\"country == 'Mayotte'\",\n",
+    "    min_ehh=0.1,\n",
+    "    max_cohort_size=None,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "78b452f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "af1.sample_metadata()[\"cohort_admin1_year\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "079a3308",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "af1.plot_xpehh_gwss_track(\n",
+    "    contig=\"X\",\n",
+    "    window_size=2_000,\n",
+    "    cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n",
+    "    cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n",
+    "    max_cohort_size=20,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6290896",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "af1.plot_xpehh_gwss_track(\n",
+    "    contig=\"X\",\n",
+    "    window_size=200,\n",
+    "    cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n",
+    "    cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n",
+    "    max_cohort_size=20,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a5aed96d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From afad321f30817e1e027c91495f6b2ff95ce8af79 Mon Sep 17 00:00:00 2001
From: Alistair Miles <alimanfoo@googlemail.com>
Date: Thu, 22 Jun 2023 12:35:12 +0100
Subject: [PATCH 25/26] remove cruft

---
 malariagen_data/anopheles.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
index fc0bc49e9..858faf6fd 100644
--- a/malariagen_data/anopheles.py
+++ b/malariagen_data/anopheles.py
@@ -5231,7 +5231,6 @@ def _xpehh_gwss(
         ac2 = ht2.count_alleles(max_allele=1)
         pos = ds_haps1["variant_position"].values
 
-        # To be modified to take both into account
         if filter_min_maf > 0:
             ac = ac1 + ac2
             af = ac.to_frequencies()

From 3cf92c16bdd7b8d96f84df4a5be20d06b1e132ea Mon Sep 17 00:00:00 2001
From: Alistair Miles <alimanfoo@googlemail.com>
Date: Thu, 22 Jun 2023 12:45:50 +0100
Subject: [PATCH 26/26] tweak examples in notebook

---
 notebooks/plot_xpehh_gwss.ipynb | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/notebooks/plot_xpehh_gwss.ipynb b/notebooks/plot_xpehh_gwss.ipynb
index d6d6dfe46..7d0b3005c 100644
--- a/notebooks/plot_xpehh_gwss.ipynb
+++ b/notebooks/plot_xpehh_gwss.ipynb
@@ -37,7 +37,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ag3.plot_xpehh_gwss_track(\n",
+    "ag3.plot_xpehh_gwss(\n",
     "    contig=\"X\",\n",
     "    window_size=1_000,\n",
     "    analysis=\"gamb_colu\",\n",
@@ -75,7 +75,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ag3.plot_xpehh_gwss_track(\n",
+    "ag3.plot_xpehh_gwss(\n",
     "    contig=\"X\",\n",
     "    window_size=100,\n",
     "    analysis=\"gamb_colu\",\n",
@@ -85,6 +85,23 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "181227f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_xpehh_gwss(\n",
+    "    contig=\"2RL\",\n",
+    "    window_size=100,\n",
+    "    analysis=\"gamb_colu\",\n",
+    "    cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n",
+    "    cohort2_query=\"country == 'Mayotte'\",\n",
+    "    max_cohort_size=10,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -92,7 +109,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ag3.plot_xpehh_gwss_track(\n",
+    "ag3.plot_xpehh_gwss(\n",
     "    contig=\"X\",\n",
     "    window_size=1_000,\n",
     "    analysis=\"gamb_colu\",\n",
@@ -120,7 +137,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "af1.plot_xpehh_gwss_track(\n",
+    "af1.plot_xpehh_gwss(\n",
     "    contig=\"X\",\n",
     "    window_size=2_000,\n",
     "    cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n",
@@ -136,8 +153,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "af1.plot_xpehh_gwss_track(\n",
-    "    contig=\"X\",\n",
+    "af1.plot_xpehh_gwss(\n",
+    "    contig=\"2RL\",\n",
     "    window_size=200,\n",
     "    cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n",
     "    cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n",
@@ -170,7 +187,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.10.6"
   }
  },
  "nbformat": 4,