From c54f031053f25fbe764297c8621c76b5fd8dfcde Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Wed, 10 May 2023 09:46:58 +0100 Subject: [PATCH 01/26] Adding XPEHH --- malariagen_data/ag3.py | 2 + malariagen_data/anopheles.py | 497 +++++++++++++++++++++++++++++++++++ 2 files changed, 499 insertions(+) diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index b451be7fd..7ef0e0f74 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -43,6 +43,7 @@ G123_GWSS_CACHE_NAME = "ag3_g123_gwss_v1" H1X_GWSS_CACHE_NAME = "ag3_h1x_gwss_v1" IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" DEFAULT_SITE_MASK = "gamb_colu_arab" @@ -113,6 +114,7 @@ class Ag3(AnophelesDataResource): _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME + _xpehh_gwss_cache_name = IHS_GWSS_CACHE_NAME site_mask_ids = ("gamb_colu_arab", "gamb_colu", "arab") _default_site_mask = DEFAULT_SITE_MASK phasing_analysis_ids = ("gamb_colu_arab", "gamb_colu", "arab") diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index ec939d135..56c42d3cf 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -491,6 +491,93 @@ class ihs_params: palette_default: palette = "Blues" +class xpehh_params: + window_size: TypeAlias = Annotated[ + int, + """ + The size of window in number of SNPs used to summarise XP-EHH over. + If None, per-variant XP-EHH values are returned. + """, + ] + window_size_default: window_size = 200 + min_cohort_size_default: base_params.min_cohort_size = 15 + max_cohort_size_default: base_params.max_cohort_size = 50 + percentiles: TypeAlias = Annotated[ + Union[int, Tuple[int, ...]], + """ + If window size is specified, this returns the XP-EHH percentiles + for each window. + """, + ] + percentiles_default: percentiles = (50, 75, 100) + standardize: TypeAlias = Annotated[ + bool, "If True, standardize XP-EHH values by alternate allele counts." + ] + standardization_bins: TypeAlias = Annotated[ + Tuple[float, ...], + "If provided, use these allele count bins to standardize XP-EHH values.", + ] + standardization_n_bins: TypeAlias = Annotated[ + int, + """ + Number of allele count bins to use for standardization. + Overrides standardization_bins. + """, + ] + standardization_n_bins_default: standardization_n_bins = 20 + standardization_diagnostics: TypeAlias = Annotated[ + bool, "If True, plot some diagnostics about the standardization." + ] + filter_min_maf: TypeAlias = Annotated[ + float, + """ + Minimum minor allele frequency to use for filtering prior to passing + haplotypes to allel.xpehh function + """, + ] + filter_min_maf_default: filter_min_maf = 0.05 + map_pos: TypeAlias = Annotated[ + np.ndarray, + """ + Variant positions (genetic map distance). + """, + ] + min_ehh: TypeAlias = Annotated[ + float, + """ + Minimum EHH beyond which to truncate integrated haplotype homozygosity + calculation. + """, + ] + min_ehh_default: min_ehh = 0.05 + max_gap: TypeAlias = Annotated[ + int, + """ + Do not report scores if EHH spans a gap larger than this number of + base pairs. + """, + ] + max_gap_default: max_gap = 200_000 + gap_scale: TypeAlias = Annotated[ + int, "Rescale distance between variants if gap is larger than this value." + ] + gap_scale_default: gap_scale = 20_000 + include_edges: TypeAlias = Annotated[ + bool, + """ + If True, report scores even if EHH does not decay below min_ehh at the + end of the chromosome. + """, + ] + use_threads: TypeAlias = Annotated[ + bool, "If True, use multiple threads to compute XP-EHH." + ] + palette: TypeAlias = Annotated[ + str, "Name of bokeh palette to use for plotting multiple percentiles." + ] + palette_default: palette = "Blues" + + class hapclust_params: linkage_method: TypeAlias = Annotated[ Literal[ @@ -723,6 +810,11 @@ def _h1x_gwss_cache_name(self): def _ihs_gwss_cache_name(self): raise NotImplementedError("Must override _ihs_gwss_cache_name") + @property + @abstractmethod + def _xpehh_gwss_cache_name(self): + raise NotImplementedError("Must override _xpehh_gwss_cache_name") + @property @abstractmethod def _site_annotations_zarr_path(self): @@ -6057,7 +6149,9 @@ def ihs_gwss( ) -> Tuple[np.ndarray, np.ndarray]: # change this name if you ever change the behaviour of this function, to # invalidate any previously cached data + print("Begin ihs") name = self._ihs_gwss_cache_name + print("End ihs") params = dict( contig=contig, @@ -6306,6 +6400,98 @@ def plot_ihs_gwss_track( return fig + @doc( + summary="Run and plot iHS GWSS data.", + ) + def plot_xpehh_gwss( + self, + contig: base_params.contig, + analysis: hap_params.analysis = DEFAULT, + sample_sets: Optional[base_params.sample_sets] = None, + sample_query1: Optional[base_params.sample_query] = None, + sample_query2: Optional[base_params.sample_query] = None, + window_size: xpehh_params.window_size = xpehh_params.window_size_default, + percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, + standardize: xpehh_params.standardize = True, + standardization_bins: Optional[xpehh_params.standardization_bins] = None, + standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default, + standardization_diagnostics: xpehh_params.standardization_diagnostics = False, + filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, + map_pos: Optional[xpehh_params.map_pos] = None, + min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, + max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default, + gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default, + include_edges: xpehh_params.include_edges = True, + use_threads: xpehh_params.use_threads = True, + min_cohort_size: Optional[ + base_params.min_cohort_size + ] = xpehh_params.min_cohort_size_default, + max_cohort_size: Optional[ + base_params.max_cohort_size + ] = xpehh_params.max_cohort_size_default, + random_seed: base_params.random_seed = 42, + palette: xpehh_params.palette = xpehh_params.palette_default, + title: Optional[gplt_params.title] = None, + sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default, + width: gplt_params.width = gplt_params.width_default, + track_height: gplt_params.track_height = 170, + genes_height: gplt_params.genes_height = gplt_params.genes_height_default, + ) -> None: + # gwss track + fig1 = self.plot_xpehh_gwss_track( + contig=contig, + analysis=analysis, + sample_sets=sample_sets, + sample_query1=sample_query1, + sample_query2=sample_query2, + window_size=window_size, + percentiles=percentiles, + palette=palette, + standardize=standardize, + standardization_bins=standardization_bins, + standardization_n_bins=standardization_n_bins, + standardization_diagnostics=standardization_diagnostics, + filter_min_maf=filter_min_maf, + map_pos=map_pos, + min_ehh=min_ehh, + max_gap=max_gap, + gap_scale=gap_scale, + include_edges=include_edges, + use_threads=use_threads, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + title=title, + sizing_mode=sizing_mode, + width=width, + height=track_height, + show=False, + x_range=None, + ) + + fig1.xaxis.visible = False + + # plot genes + fig2 = self.plot_genes( + region=contig, + sizing_mode=sizing_mode, + width=width, + height=genes_height, + x_range=fig1.x_range, + show=False, + ) + + # combine plots into a single figure + fig = bokeh.layouts.gridplot( + [fig1, fig2], + ncols=1, + toolbar_location="above", + merge_tools=True, + sizing_mode=sizing_mode, + ) + + bokeh.plotting.show(fig) + @doc( summary="Run and plot iHS GWSS data.", ) @@ -6876,6 +7062,317 @@ def plot_g123_calibration( fig.title = title bokeh.plotting.show(fig) + @doc( + summary="Run XP-EHH GWSS.", + returns=dict( + x="An array containing the window centre point genomic positions.", + xpehh="An array with XP-EHH statistic values for each window.", + ), + ) + def xpehh_gwss( + self, + contig: base_params.contig, + analysis: hap_params.analysis = DEFAULT, + sample_sets: Optional[base_params.sample_sets] = None, + sample_query1: Optional[base_params.sample_query] = None, + sample_query2: Optional[base_params.sample_query] = None, + window_size: xpehh_params.window_size = xpehh_params.window_size_default, + percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, + standardize: xpehh_params.standardize = True, + standardization_bins: Optional[xpehh_params.standardization_bins] = None, + standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default, + standardization_diagnostics: xpehh_params.standardization_diagnostics = False, + filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, + map_pos: Optional[xpehh_params.map_pos] = None, + min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, + max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default, + gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default, + include_edges: xpehh_params.include_edges = True, + use_threads: xpehh_params.use_threads = True, + min_cohort_size: Optional[ + base_params.min_cohort_size + ] = xpehh_params.min_cohort_size_default, + max_cohort_size: Optional[ + base_params.max_cohort_size + ] = xpehh_params.max_cohort_size_default, + random_seed: base_params.random_seed = 42, + ) -> Tuple[np.ndarray, np.ndarray]: + # change this name if you ever change the behaviour of this function, to + # invalidate any previously cached data + name = self._xpehh_gwss_cache_name + + params = dict( + contig=contig, + analysis=self._prep_phasing_analysis_param(analysis=analysis), + window_size=window_size, + percentiles=percentiles, + standardize=standardize, + standardization_bins=standardization_bins, + standardization_n_bins=standardization_n_bins, + standardization_diagnostics=standardization_diagnostics, + filter_min_maf=filter_min_maf, + map_pos=map_pos, + min_ehh=min_ehh, + include_edges=include_edges, + max_gap=max_gap, + gap_scale=gap_scale, + use_threads=use_threads, + sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets), + # N.B., do not be tempted to convert this sample query into integer + # indices using _prep_sample_selection_params, because the indices + # are different in the haplotype data. + sample_query1=sample_query1, + sample_query2=sample_query2, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + ) + + try: + results = self.results_cache_get(name=name, params=params) + + except CacheMiss: + results = self._xpehh_gwss(**params) # self. + self.results_cache_set(name=name, params=params, results=results) + + x = results["x"] + xpehh = results["xpehh"] + + return x, xpehh + + def _xpehh_gwss( + self, + *, + contig, + analysis, + sample_sets, + sample_query1, + sample_query2, + window_size, + percentiles, + standardize, + standardization_bins, + standardization_n_bins, + standardization_diagnostics, + filter_min_maf, + map_pos, + min_ehh, + max_gap, + gap_scale, + include_edges, + use_threads, + min_cohort_size, + max_cohort_size, + random_seed, + ): + ds_haps1 = self.haplotypes( + region=contig, + analysis=analysis, + sample_query=sample_query1, + sample_sets=sample_sets, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + ) + + ds_haps2 = self.haplotypes( + region=contig, + analysis=analysis, + sample_query=sample_query2, + sample_sets=sample_sets, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + ) + + gt1 = allel.GenotypeDaskArray(ds_haps1["call_genotype"].data) + gt2 = allel.GenotypeDaskArray(ds_haps2["call_genotype"].data) + with self._dask_progress(desc="Load haplotypes"): + ht1 = gt1.to_haplotypes().compute() + ht2 = gt2.to_haplotypes().compute() + + ac1 = ht1.count_alleles(max_allele=1) + ac2 = ht2.count_alleles(max_allele=1) + pos = ds_haps1["variant_position"].values + + # To be modified to take both into account + if filter_min_maf > 0: + af = ac1.to_frequencies() + maf = np.min(af, axis=1) + maf_filter = maf > filter_min_maf + + ht1 = ht1.compress(maf_filter, axis=0) + ht2 = ht2.compress(maf_filter, axis=0) + pos = pos[maf_filter] + ac1 = ac1[maf_filter] + ac2 = ac2[maf_filter] + + # compute XP-EHH + xp = allel.xpehh( + h1=ht1, + h2=ht2, + pos=pos, + map_pos=map_pos, + # min_maf=compute_min_maf, + min_ehh=min_ehh, + include_edges=include_edges, + max_gap=max_gap, + gap_scale=gap_scale, + use_threads=use_threads, + ) + + # remove any NaNs + na_mask = ~np.isnan(xp) + xp = xp[na_mask] + pos = pos[na_mask] + ac1 = ac1[na_mask] + ac2 = ac2[na_mask] + + # take absolute value + xp = np.fabs(xp) + + # Update to take into account both sets + if standardize: + xp, _ = allel.standardize_by_allele_count( + score=xp, + aac=ac1[:, 1], + bins=standardization_bins, + n_bins=standardization_n_bins, + diagnostics=standardization_diagnostics, + ) + + if window_size: + xp = allel.moving_statistic( + xp, statistic=np.percentile, size=window_size, q=percentiles + ) + pos = allel.moving_statistic(pos, statistic=np.mean, size=window_size) + + results = dict(x=pos, xpehh=xp) + + return results + + @doc( + summary="Run and plot XP-EHH GWSS data.", + ) + def plot_xpehh_gwss_track( + self, + contig: base_params.contig, + analysis: hap_params.analysis = DEFAULT, + sample_sets: Optional[base_params.sample_sets] = None, + sample_query1: Optional[base_params.sample_query] = None, + sample_query2: Optional[base_params.sample_query] = None, + window_size: xpehh_params.window_size = xpehh_params.window_size_default, + percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, + standardize: xpehh_params.standardize = True, + standardization_bins: Optional[xpehh_params.standardization_bins] = None, + standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default, + standardization_diagnostics: xpehh_params.standardization_diagnostics = False, + filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, + map_pos: Optional[xpehh_params.map_pos] = None, + min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, + max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default, + gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default, + include_edges: xpehh_params.include_edges = True, + use_threads: xpehh_params.use_threads = True, + min_cohort_size: Optional[ + base_params.min_cohort_size + ] = xpehh_params.min_cohort_size_default, + max_cohort_size: Optional[ + base_params.max_cohort_size + ] = xpehh_params.max_cohort_size_default, + random_seed: base_params.random_seed = 42, + palette: xpehh_params.palette = xpehh_params.palette_default, + title: Optional[gplt_params.title] = None, + sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default, + width: gplt_params.width = gplt_params.width_default, + height: gplt_params.height = 200, + show: gplt_params.show = True, + x_range: Optional[gplt_params.x_range] = None, + ): + # compute ihs + x, xpehh = self.xpehh_gwss( + contig=contig, + analysis=analysis, + window_size=window_size, + percentiles=percentiles, + standardize=standardize, + standardization_bins=standardization_bins, + standardization_n_bins=standardization_n_bins, + standardization_diagnostics=standardization_diagnostics, + filter_min_maf=filter_min_maf, + map_pos=map_pos, + min_ehh=min_ehh, + max_gap=max_gap, + gap_scale=gap_scale, + include_edges=include_edges, + use_threads=use_threads, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + sample_query1=sample_query1, + sample_query2=sample_query2, + sample_sets=sample_sets, + random_seed=random_seed, + ) + + # determine X axis range + x_min = x[0] + x_max = x[-1] + if x_range is None: + x_range = bokeh.models.Range1d(x_min, x_max, bounds="auto") + + # create a figure + xwheel_zoom = bokeh.models.WheelZoomTool( + dimensions="width", maintain_focus=False + ) + if title is None: + title = "'" + sample_query1 + "' and '" + sample_query2 + "'" + fig = bokeh.plotting.figure( + title=title, + tools=["xpan", "xzoom_in", "xzoom_out", xwheel_zoom, "reset"], + active_scroll=xwheel_zoom, + active_drag="xpan", + sizing_mode=sizing_mode, + width=width, + height=height, + toolbar_location="above", + x_range=x_range, + ) + + if window_size: + if isinstance(percentiles, int): + percentiles = (percentiles,) + + # add an empty dimension to xpehh array if 1D + xpehh = np.reshape(xpehh, (xpehh.shape[0], -1)) + bokeh_palette = bokeh.palettes.all_palettes[palette] + for i in range(xpehh.shape[1]): + xpehh_perc = xpehh[:, i] + if xpehh.shape[1] >= 3: + color = bokeh_palette[xpehh.shape[1]][i] + elif xpehh.shape[1] == 2: + color = bokeh_palette[3][i] + else: + color = None + + # plot ihs + fig.circle( + x=x, + y=xpehh_perc, + size=3, + line_width=0.15, + line_color="black", + fill_color=color, + ) + + # tidy up the plot + fig.yaxis.axis_label = "xpehh" + self._bokeh_style_genome_xaxis(fig, contig) + + if show: + bokeh.plotting.show(fig) + + return fig + @doc( summary=""" Hierarchically cluster haplotypes in region and produce an interactive plot. From 01c6563d863807c87515385c2f69097c9cd8e42d Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Wed, 10 May 2023 13:57:32 +0100 Subject: [PATCH 02/26] Trying to outsmart linting --- malariagen_data/anopheles.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 56c42d3cf..d1cfc5fb7 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -7325,7 +7325,10 @@ def plot_xpehh_gwss_track( dimensions="width", maintain_focus=False ) if title is None: - title = "'" + sample_query1 + "' and '" + sample_query2 + "'" + if sample_query1 is None or sample_query2 is None: + title = "XP-EHH" + else: + title = "'" + sample_query1 + "' and '" + sample_query2 + "'" fig = bokeh.plotting.figure( title=title, tools=["xpan", "xzoom_in", "xzoom_out", xwheel_zoom, "reset"], From e85699ea8df42e980622261e715d58b8dd842192 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Mon, 15 May 2023 09:30:49 +0100 Subject: [PATCH 03/26] Added the tests --- tests/test_ag3.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_ag3.py b/tests/test_ag3.py index 442920516..9197d0673 100644 --- a/tests/test_ag3.py +++ b/tests/test_ag3.py @@ -3012,6 +3012,37 @@ def test_ihs_gwss(): assert_allclose(ihs[:, 2][100], 2.3467595962486327) +def test_xpehh_gwss(): + ag3 = setup_ag3() + sample_query1 = "country == 'Ghana'" + sample_query2 = "country == 'Angola'" + contig = "3L" + analysis = "gamb_colu" + sample_sets = "3.0" + window_size = 1000 + + x, xpehh = ag3.xpehh_gwss( + contig=contig, + analysis=analysis, + sample_query1=sample_query1, + sample_query2=sample_query2, + sample_sets=sample_sets, + window_size=window_size, + max_cohort_size=20, + ) + + assert isinstance(x, np.ndarray) + assert isinstance(xpehh, np.ndarray) + + # check dimensions + assert len(x) == 395 + assert len(x) == len(xpehh) + + # check some values + assert_allclose(x[0], 510232.847) + assert_allclose(xpehh[:, 2][100], 1.7302443850220177) + + def test_g123_gwss(): ag3 = setup_ag3() sample_query = "country == 'Ghana'" From aa95dd91da4fc2a9e5caffb8cf1180300129b6d4 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:54:28 +0200 Subject: [PATCH 04/26] Update xpehh variable malariagen_data/ag3.py Co-authored-by: Alistair Miles --- malariagen_data/ag3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 2997c6805..d49d82792 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -114,7 +114,7 @@ class Ag3(AnophelesDataResource): _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME - _xpehh_gwss_cache_name = IHS_GWSS_CACHE_NAME + _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME site_mask_ids = ("gamb_colu_arab", "gamb_colu", "arab") _default_site_mask = DEFAULT_SITE_MASK phasing_analysis_ids = ("gamb_colu_arab", "gamb_colu", "arab") From 1e2f1f209426de7933115e081afb3b12054f9a03 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:54:56 +0200 Subject: [PATCH 05/26] Remove initial comment malariagen_data/anopheles.py Co-authored-by: Alistair Miles --- malariagen_data/anopheles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 16e82f811..68834c628 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -6149,7 +6149,6 @@ def ihs_gwss( ) -> Tuple[np.ndarray, np.ndarray]: # change this name if you ever change the behaviour of this function, to # invalidate any previously cached data - print("Begin ihs") name = self._ihs_gwss_cache_name print("End ihs") From 1aa208aa2f4e254f7758ac6d2d974b07c4379492 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:55:23 +0200 Subject: [PATCH 06/26] Remove final comment malariagen_data/anopheles.py Co-authored-by: Alistair Miles --- malariagen_data/anopheles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 68834c628..277d265bc 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -6150,7 +6150,6 @@ def ihs_gwss( # change this name if you ever change the behaviour of this function, to # invalidate any previously cached data name = self._ihs_gwss_cache_name - print("End ihs") params = dict( contig=contig, From 578139ffce02b3899e1e3db41149219f0c63086c Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:56:39 +0200 Subject: [PATCH 07/26] Update forgotten instruction malariagen_data/anopheles.py Co-authored-by: Alistair Miles --- malariagen_data/anopheles.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 277d265bc..088614555 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -7195,7 +7195,8 @@ def _xpehh_gwss( # To be modified to take both into account if filter_min_maf > 0: - af = ac1.to_frequencies() + ac = ac1 + ac2 + af = ac.to_frequencies() maf = np.min(af, axis=1) maf_filter = maf > filter_min_maf From a73f57769ac80fb3b581e8ae5dd5797da2bceff2 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:57:23 +0200 Subject: [PATCH 08/26] Correct annotation malariagen_data/anopheles.py Co-authored-by: Alistair Miles --- malariagen_data/anopheles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 088614555..7db99df23 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -7356,7 +7356,7 @@ def plot_xpehh_gwss_track( else: color = None - # plot ihs + # plot xpehh fig.circle( x=x, y=xpehh_perc, From 7d001005738ec0a750ecea0620bb450f4c873925 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:58:40 +0200 Subject: [PATCH 09/26] Improve display for webgl malariagen_data/anopheles.py Co-authored-by: Alistair Miles --- malariagen_data/anopheles.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 7db99df23..5ea9ca729 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -7360,9 +7360,9 @@ def plot_xpehh_gwss_track( fig.circle( x=x, y=xpehh_perc, - size=3, - line_width=0.15, - line_color="black", + size=4, + line_width=0, + line_color=color, fill_color=color, ) From e754799efd5470144b79651c7d7f9fdfc7516130 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:01:52 +0200 Subject: [PATCH 10/26] I am not sure what the problem is --- malariagen_data/anoph/xpehh_params.py | 93 ++++ malariagen_data/anopheles.py | 609 +------------------------- 2 files changed, 94 insertions(+), 608 deletions(-) create mode 100644 malariagen_data/anoph/xpehh_params.py diff --git a/malariagen_data/anoph/xpehh_params.py b/malariagen_data/anoph/xpehh_params.py new file mode 100644 index 000000000..ca2a940b3 --- /dev/null +++ b/malariagen_data/anoph/xpehh_params.py @@ -0,0 +1,93 @@ +"""Parameter definitions for XPEHH analysis functions.""" + +from typing import Tuple, Union + +import numpy as np +from typing_extensions import Annotated, TypeAlias + +from . import base_params + +window_size: TypeAlias = Annotated[ + int, + """ + The size of window in number of SNPs used to summarise XP-EHH over. + If None, per-variant XP-EHH values are returned. + """, +] +window_size_default: window_size = 200 +min_cohort_size_default: base_params.min_cohort_size = 15 +max_cohort_size_default: base_params.max_cohort_size = 50 +percentiles: TypeAlias = Annotated[ + Union[int, Tuple[int, ...]], + """ + If window size is specified, this returns the XP-EHH percentiles + for each window. + """, +] +percentiles_default: percentiles = (50, 75, 100) +standardize: TypeAlias = Annotated[ + bool, "If True, standardize XP-EHH values by alternate allele counts." +] +standardization_bins: TypeAlias = Annotated[ + Tuple[float, ...], + "If provided, use these allele count bins to standardize XP-EHH values.", +] +standardization_n_bins: TypeAlias = Annotated[ + int, + """ + Number of allele count bins to use for standardization. + Overrides standardization_bins. + """, +] +standardization_n_bins_default: standardization_n_bins = 20 +standardization_diagnostics: TypeAlias = Annotated[ + bool, "If True, plot some diagnostics about the standardization." +] +filter_min_maf: TypeAlias = Annotated[ + float, + """ + Minimum minor allele frequency to use for filtering prior to passing + haplotypes to allel.xpehh function + """, +] +filter_min_maf_default: filter_min_maf = 0.05 +map_pos: TypeAlias = Annotated[ + np.ndarray, + """ + Variant positions (genetic map distance). + """, +] +min_ehh: TypeAlias = Annotated[ + float, + """ + Minimum EHH beyond which to truncate integrated haplotype homozygosity + calculation. + """, +] +min_ehh_default: min_ehh = 0.05 +max_gap: TypeAlias = Annotated[ + int, + """ + Do not report scores if EHH spans a gap larger than this number of + base pairs. + """, +] +max_gap_default: max_gap = 200_000 +gap_scale: TypeAlias = Annotated[ + int, "Rescale distance between variants if gap is larger than this value." +] +gap_scale_default: gap_scale = 20_000 +include_edges: TypeAlias = Annotated[ + bool, + """ + If True, report scores even if EHH does not decay below min_ehh at the + end of the chromosome. + """, +] +use_threads: TypeAlias = Annotated[ + bool, "If True, use multiple threads to compute XP-EHH." +] +palette: TypeAlias = Annotated[ + str, "Name of bokeh palette to use for plotting multiple percentiles." +] +palette_default: palette = "Blues" diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index c755dde75..2a1f1df39 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -36,6 +36,7 @@ map_params, pca_params, plotly_params, + xpehh_params, ) from .anoph.aim_data import AnophelesAimData from .anoph.base import AnophelesBase @@ -61,614 +62,6 @@ "effect in ['NON_SYNONYMOUS_CODING', 'START_LOST', 'STOP_LOST', 'STOP_GAINED']" ) -class hap_params: - """Parameter definitions for haplotype functions.""" - - analysis: TypeAlias = Annotated[ - str, - """ - Which haplotype phasing analysis to use. See the - `phasing_analysis_ids` property for available values. - """, - ] - - -class h12_params: - """Parameter definitions for H12 analysis functions.""" - - window_sizes: TypeAlias = Annotated[ - Tuple[int, ...], - """ - The sizes of windows (number of SNPs) used to calculate statistics within. - """, - ] - window_sizes_default: window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000) - window_size: TypeAlias = Annotated[ - int, - """ - The size of windows (number of SNPs) used to calculate statistics within. - """, - ] - cohort_size_default: Optional[base_params.cohort_size] = None - min_cohort_size_default: base_params.min_cohort_size = 15 - max_cohort_size_default: base_params.max_cohort_size = 50 - - -class g123_params: - """Parameter definitions for G123 analysis functions.""" - - sites: TypeAlias = Annotated[ - str, - """ - Which sites to use: 'all' includes all sites that pass - site filters; 'segregating' includes only segregating sites for - the given cohort; or a phasing analysis identifier can be - provided to use sites from the haplotype data, which is an - approximation to finding segregating sites in the entire Ag3.0 - (gambiae complex) or Af1.0 (funestus) cohort. - """, - ] - window_sizes: TypeAlias = Annotated[ - Tuple[int, ...], - """ - The sizes of windows (number of sites) used to calculate statistics within. - """, - ] - window_sizes_default: window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000) - window_size: TypeAlias = Annotated[ - int, - """ - The size of windows (number of sites) used to calculate statistics within. - """, - ] - min_cohort_size_default: base_params.min_cohort_size = 20 - max_cohort_size_default: base_params.max_cohort_size = 50 - - -class fst_params: - """Parameter definitions for Fst functions.""" - - # N.B., window size can mean different things for different functions - window_size: TypeAlias = Annotated[ - int, - "The size of windows (number of sites) used to calculate statistics within.", - ] - cohort_size_default: Optional[base_params.cohort_size] = None - min_cohort_size_default: base_params.min_cohort_size = 15 - max_cohort_size_default: base_params.max_cohort_size = 50 - - -class frq_params: - """Parameter definitions for functions computing and plotting allele frequencies.""" - - drop_invariant: TypeAlias = Annotated[ - bool, - """ - If True, drop variants not observed in the selected samples. - """, - ] - effects: TypeAlias = Annotated[bool, "If True, add SNP effect annotations."] - area_by: TypeAlias = Annotated[ - str, - """ - Column name in the sample metadata to use to group samples spatially. E.g., - use "admin1_iso" or "admin1_name" to group by level 1 administrative - divisions, or use "admin2_name" to group by level 2 administrative - divisions. - """, - ] - period_by: TypeAlias = Annotated[ - Literal["year", "quarter", "month"], - "Length of time to group samples temporally.", - ] - variant_query: TypeAlias = Annotated[ - str, - "A pandas query to be evaluated against variants.", - ] - nobs_mode: TypeAlias = Annotated[ - Literal["called", "fixed"], - """ - Method for calculating the denominator when computing frequencies. If - "called" then use the number of called alleles, i.e., number of samples - with non-missing genotype calls multiplied by 2. If "fixed" then use the - number of samples multiplied by 2. - """, - ] - nobs_mode_default: nobs_mode = "called" - ci_method: TypeAlias = Annotated[ - Literal["normal", "agresti_coull", "beta", "wilson", "binom_test"], - """ - Method to use for computing confidence intervals, passed through to - `statsmodels.stats.proportion.proportion_confint`. - """, - ] - ci_method_default: ci_method = "wilson" - ds_frequencies_advanced: TypeAlias = Annotated[ - xr.Dataset, - """ - A dataset of variant frequencies, such as returned by - `snp_allele_frequencies_advanced()`, - `aa_allele_frequencies_advanced()` or - `gene_cnv_frequencies_advanced()`. - """, - ] - - -class het_params: - """Parameters for functions related to heterozygosity and runs of homozygosity.""" - - sample: TypeAlias = Annotated[ - Union[str, int], - "Sample identifier or index within sample set.", - ] - window_size: TypeAlias = Annotated[ - int, - "Number of sites per window.", - ] - window_size_default: window_size = 20_000 - phet_roh: TypeAlias = Annotated[ - float, - "Probability of observing a heterozygote in a ROH.", - ] - phet_roh_default: phet_roh = 0.001 - phet_nonroh: TypeAlias = Annotated[ - Tuple[float, ...], - "One or more probabilities of observing a heterozygote outside a ROH.", - ] - phet_nonroh_default: phet_nonroh = (0.003, 0.01) - transition: TypeAlias = Annotated[ - float, - """ - Probability of moving between states. A larger window size may call - for a larger transitional probability. - """, - ] - transition_default: transition = 0.001 - y_max: TypeAlias = Annotated[ - float, - "Y axis limit.", - ] - y_max_default: y_max = 0.03 - circle_kwargs: TypeAlias = Annotated[ - Mapping, - "Passed through to bokeh circle() function.", - ] - df_roh: TypeAlias = Annotated[ - pd.DataFrame, - """ - A DataFrame where each row provides data about a single run of - homozygosity. - """, - ] - heterozygosity_height: TypeAlias = Annotated[ - int, - "Height in pixels (px) of heterozygosity track.", - ] - roh_height: TypeAlias = Annotated[ - int, - "Height in pixels (px) of runs of homozygosity track.", - ] - - -class pca_params: - """Parameters for PCA functions.""" - - n_snps: TypeAlias = Annotated[ - int, - """ - The desired number of SNPs to use when running the analysis. - SNPs will be evenly thinned to approximately this number. - """, - ] - thin_offset: TypeAlias = Annotated[ - int, - """ - Starting index for SNP thinning. Change this to repeat the analysis - using a different set of SNPs. - """, - ] - thin_offset_default: thin_offset = 0 - min_minor_ac: TypeAlias = Annotated[ - int, - """ - The minimum minor allele count. SNPs with a minor allele count - below this value will be excluded prior to thinning. - """, - ] - min_minor_ac_default: min_minor_ac = 2 - max_missing_an: TypeAlias = Annotated[ - int, - """ - The maximum number of missing allele calls to accept. SNPs with - more than this value will be excluded prior to thinning. Set to 0 - (default) to require no missing calls. - """, - ] - max_missing_an_default = 0 - n_components: TypeAlias = Annotated[ - int, - "Number of components to return.", - ] - n_components_default: n_components = 20 - df_pca: TypeAlias = Annotated[ - pd.DataFrame, - """ - A dataframe of sample metadata, with columns "PC1", "PC2", "PC3", - etc., added. - """, - ] - evr: TypeAlias = Annotated[ - np.ndarray, - "An array of explained variance ratios, one per component.", - ] - - -class plotly_params: - """Parameters for any plotting functions using plotly.""" - - # N.B., most of these parameters are always able to take None - # and so we set as Optional here, rather than having to repeat - # that for each function doc. - - x_label: TypeAlias = Annotated[ - Optional[str], - "X axis label.", - ] - y_label: TypeAlias = Annotated[ - Optional[str], - "Y axis label.", - ] - width: TypeAlias = Annotated[ - Optional[int], - "Plot width in pixels (px).", - ] - height: TypeAlias = Annotated[ - Optional[int], - "Plot height in pixels (px).", - ] - aspect: TypeAlias = Annotated[ - Optional[Literal["equal", "auto"]], - "Aspect ratio, see also https://plotly.com/python-api-reference/generated/plotly.express.imshow", - ] - title: TypeAlias = Annotated[ - Optional[Union[str, bool]], - """ - If True, attempt to use metadata from input dataset as a plot title. - Otherwise, use supplied value as a title. - """, - ] - text_auto: TypeAlias = Annotated[ - Union[bool, str], - """ - If True or a string, single-channel img values will be displayed as text. A - string like '.2f' will be interpreted as a texttemplate numeric formatting - directive. - """, - ] - color_continuous_scale: TypeAlias = Annotated[ - Optional[Union[str, List[str]]], - """ - Colormap used to map scalar data to colors (for a 2D image). This - parameter is not used for RGB or RGBA images. If a string is provided, - it should be the name of a known color scale, and if a list is provided, - it should be a list of CSS-compatible colors. - """, - ] - colorbar: TypeAlias = Annotated[ - bool, - "If False, do not display a color bar.", - ] - x: TypeAlias = Annotated[ - str, - "Name of variable to plot on the X axis.", - ] - y: TypeAlias = Annotated[ - str, - "Name of variable to plot on the Y axis.", - ] - z: TypeAlias = Annotated[ - str, - "Name of variable to plot on the Z axis.", - ] - color: TypeAlias = Annotated[ - Optional[str], - "Name of variable to use to color the markers.", - ] - symbol: TypeAlias = Annotated[ - Optional[str], - "Name of the variable to use to choose marker symbols.", - ] - jitter_frac: TypeAlias = Annotated[ - Optional[float], - "Randomly jitter points by this fraction of their range.", - ] - marker_size: TypeAlias = Annotated[ - int, - "Marker size.", - ] - template: TypeAlias = Annotated[ - Optional[ - Literal[ - "ggplot2", - "seaborn", - "simple_white", - "plotly", - "plotly_white", - "plotly_dark", - "presentation", - "xgridoff", - "ygridoff", - "gridon", - "none", - ] - ], - "The figure template name (must be a key in plotly.io.templates).", - ] - figure: TypeAlias = Annotated[go.Figure, "A plotly figure."] - - -class ihs_params: - window_size: TypeAlias = Annotated[ - int, - """ - The size of window in number of SNPs used to summarise iHS over. - If None, per-variant iHS values are returned. - """, - ] - window_size_default: window_size = 200 - min_cohort_size_default: base_params.min_cohort_size = 15 - max_cohort_size_default: base_params.max_cohort_size = 50 - percentiles: TypeAlias = Annotated[ - Union[int, Tuple[int, ...]], - """ - If window size is specified, this returns the iHS percentiles - for each window. - """, - ] - percentiles_default: percentiles = (50, 75, 100) - standardize: TypeAlias = Annotated[ - bool, "If True, standardize iHS values by alternate allele counts." - ] - standardization_bins: TypeAlias = Annotated[ - Tuple[float, ...], - "If provided, use these allele count bins to standardize iHS values.", - ] - standardization_n_bins: TypeAlias = Annotated[ - int, - """ - Number of allele count bins to use for standardization. - Overrides standardization_bins. - """, - ] - standardization_n_bins_default: standardization_n_bins = 20 - standardization_diagnostics: TypeAlias = Annotated[ - bool, "If True, plot some diagnostics about the standardization." - ] - filter_min_maf: TypeAlias = Annotated[ - float, - """ - Minimum minor allele frequency to use for filtering prior to passing - haplotypes to allel.ihs function - """, - ] - filter_min_maf_default: filter_min_maf = 0.05 - compute_min_maf: TypeAlias = Annotated[ - float, - """ - Do not compute integrated haplotype homozygosity for variants with - minor allele frequency below this threshold. - """, - ] - compute_min_maf_default: compute_min_maf = 0.05 - min_ehh: TypeAlias = Annotated[ - float, - """ - Minimum EHH beyond which to truncate integrated haplotype homozygosity - calculation. - """, - ] - min_ehh_default: min_ehh = 0.05 - max_gap: TypeAlias = Annotated[ - int, - """ - Do not report scores if EHH spans a gap larger than this number of - base pairs. - """, - ] - max_gap_default: max_gap = 200_000 - gap_scale: TypeAlias = Annotated[ - int, "Rescale distance between variants if gap is larger than this value." - ] - gap_scale_default: gap_scale = 20_000 - include_edges: TypeAlias = Annotated[ - bool, - """ - If True, report scores even if EHH does not decay below min_ehh at the - end of the chromosome. - """, - ] - use_threads: TypeAlias = Annotated[ - bool, "If True, use multiple threads to compute iHS." - ] - palette: TypeAlias = Annotated[ - str, "Name of bokeh palette to use for plotting multiple percentiles." - ] - palette_default: palette = "Blues" - - -class xpehh_params: - window_size: TypeAlias = Annotated[ - int, - """ - The size of window in number of SNPs used to summarise XP-EHH over. - If None, per-variant XP-EHH values are returned. - """, - ] - window_size_default: window_size = 200 - min_cohort_size_default: base_params.min_cohort_size = 15 - max_cohort_size_default: base_params.max_cohort_size = 50 - percentiles: TypeAlias = Annotated[ - Union[int, Tuple[int, ...]], - """ - If window size is specified, this returns the XP-EHH percentiles - for each window. - """, - ] - percentiles_default: percentiles = (50, 75, 100) - standardize: TypeAlias = Annotated[ - bool, "If True, standardize XP-EHH values by alternate allele counts." - ] - standardization_bins: TypeAlias = Annotated[ - Tuple[float, ...], - "If provided, use these allele count bins to standardize XP-EHH values.", - ] - standardization_n_bins: TypeAlias = Annotated[ - int, - """ - Number of allele count bins to use for standardization. - Overrides standardization_bins. - """, - ] - standardization_n_bins_default: standardization_n_bins = 20 - standardization_diagnostics: TypeAlias = Annotated[ - bool, "If True, plot some diagnostics about the standardization." - ] - filter_min_maf: TypeAlias = Annotated[ - float, - """ - Minimum minor allele frequency to use for filtering prior to passing - haplotypes to allel.xpehh function - """, - ] - filter_min_maf_default: filter_min_maf = 0.05 - map_pos: TypeAlias = Annotated[ - np.ndarray, - """ - Variant positions (genetic map distance). - """, - ] - min_ehh: TypeAlias = Annotated[ - float, - """ - Minimum EHH beyond which to truncate integrated haplotype homozygosity - calculation. - """, - ] - min_ehh_default: min_ehh = 0.05 - max_gap: TypeAlias = Annotated[ - int, - """ - Do not report scores if EHH spans a gap larger than this number of - base pairs. - """, - ] - max_gap_default: max_gap = 200_000 - gap_scale: TypeAlias = Annotated[ - int, "Rescale distance between variants if gap is larger than this value." - ] - gap_scale_default: gap_scale = 20_000 - include_edges: TypeAlias = Annotated[ - bool, - """ - If True, report scores even if EHH does not decay below min_ehh at the - end of the chromosome. - """, - ] - use_threads: TypeAlias = Annotated[ - bool, "If True, use multiple threads to compute XP-EHH." - ] - palette: TypeAlias = Annotated[ - str, "Name of bokeh palette to use for plotting multiple percentiles." - ] - palette_default: palette = "Blues" - - -class hapclust_params: - linkage_method: TypeAlias = Annotated[ - Literal[ - "single", "complete", "average", "weighted", "centroid", "median", "ward" - ], - """ - The linkage algorithm to use. See the Linkage Methods section of the - scipy.cluster.hierarchy.linkage docs for full descriptions. - """, - ] - linkage_method_default: linkage_method = "single" - count_sort: TypeAlias = Annotated[ - bool, - """ - For each node n, the order (visually, from left-to-right) n's two descendant - links are plotted is determined by this parameter. If True, the child with - the minimum number of original objects in its cluster is plotted first. Note - distance_sort and count_sort cannot both be True. - """, - ] - distance_sort: TypeAlias = Annotated[ - bool, - """ - For each node n, the order (visually, from left-to-right) n's two descendant - links are plotted is determined by this parameter. If True, The child with the - minimum distance between its direct descendants is plotted first. - """, - ] - - -class hapnet_params: - max_dist: TypeAlias = Annotated[ - int, - "Join network components up to a maximum distance of 2 SNP differences.", - ] - max_dist_default: max_dist = 2 - color: TypeAlias = Annotated[ - str, - """ - Identifies a column in the sample metadata which determines the colour - of pie chart segments within nodes. - """, - ] - color_discrete_sequence: TypeAlias = Annotated[ - List, "Provide a list of colours to use." - ] - color_discrete_map: TypeAlias = Annotated[ - Mapping, "Provide an explicit mapping from values to colours." - ] - category_order: TypeAlias = Annotated[ - List, - "Control the order in which values appear in the legend.", - ] - node_size_factor: TypeAlias = Annotated[ - int, - "Control the sizing of nodes.", - ] - node_size_factor_default: node_size_factor = 50 - layout: TypeAlias = Annotated[ - str, - "Name of the network layout to use to position nodes.", - ] - layout_default: layout = "cose" - layout_params: TypeAlias = Annotated[ - Mapping, - "Additional parameters to the layout algorithm.", - ] - - -class dash_params: - height: TypeAlias = Annotated[int, "Height of the Dash app in pixels (px)."] - width: TypeAlias = Annotated[Union[int, str], "Width of the Dash app."] - server_mode: TypeAlias = Annotated[ - Literal["inline", "external", "jupyterlab"], - """ - Controls how the Jupyter Dash app will be launched. See - https://medium.com/plotly/introducing-jupyterdash-811f1f57c02e for - more information. - """, - ] - server_mode_default: server_mode = "inline" - server_port: TypeAlias = Annotated[ - int, - "Manually override the port on which the Dash app will run.", - ] - # N.B., we are in the process of breaking up the AnophelesDataResource # class into multiple parent classes like AnophelesGenomeSequenceData # and AnophelesBase. This is work in progress, and further PRs are From 72af9be74525d7bbbae7e87b173993d2d3dc49ec Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:04:36 +0200 Subject: [PATCH 11/26] Removed (some of) the things that had been moved somewhere else --- malariagen_data/anopheles.py | 76 ++++++++++++++++++------------------ tests/test_ag3.py | 6 +-- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 2a1f1df39..f92c9165d 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -190,44 +190,44 @@ def _h1x_gwss_cache_name(self): def _ihs_gwss_cache_name(self): raise NotImplementedError("Must override _ihs_gwss_cache_name") - @property - @abstractmethod - def _xpehh_gwss_cache_name(self): - raise NotImplementedError("Must override _xpehh_gwss_cache_name") - - @property - @abstractmethod - def _site_annotations_zarr_path(self): - raise NotImplementedError("Must override _site_annotations_zarr_path") - - # Start of @abstractmethod - - @property - @abstractmethod - def site_mask_ids(self): - """Identifiers for the different site masks that are available. - These are values than can be used for the `site_mask` parameter in any - method making using of SNP data. - - """ - raise NotImplementedError("Must override _site_mask_ids") - - @property - @abstractmethod - def _default_site_mask(self): - raise NotImplementedError("Must override _default_site_mask") - - def _prep_site_mask_param(self, *, site_mask): - if site_mask is None: - # allowed - pass - elif site_mask == DEFAULT: - site_mask = self._default_site_mask - elif site_mask not in self.site_mask_ids: - raise ValueError( - f"Invalid site mask, must be one of f{self.site_mask_ids}." - ) - return site_mask + # @property + # @abstractmethod + # def _xpehh_gwss_cache_name(self): + # raise NotImplementedError("Must override _xpehh_gwss_cache_name") + + # @property + # @abstractmethod + # def _site_annotations_zarr_path(self): + # raise NotImplementedError("Must override _site_annotations_zarr_path") + + # # Start of @abstractmethod + + # @property + # @abstractmethod + # def site_mask_ids(self): + # """Identifiers for the different site masks that are available. + # These are values than can be used for the `site_mask` parameter in any + # method making using of SNP data. + + # """ + # raise NotImplementedError("Must override _site_mask_ids") + + # @property + # @abstractmethod + # def _default_site_mask(self): + # raise NotImplementedError("Must override _default_site_mask") + + # def _prep_site_mask_param(self, *, site_mask): + # if site_mask is None: + # # allowed + # pass + # elif site_mask == DEFAULT: + # site_mask = self._default_site_mask + # elif site_mask not in self.site_mask_ids: + # raise ValueError( + # f"Invalid site mask, must be one of f{self.site_mask_ids}." + # ) + # return site_mask @abstractmethod def _transcript_to_gene_name(self, transcript): diff --git a/tests/test_ag3.py b/tests/test_ag3.py index 05743370f..d8fee57ae 100644 --- a/tests/test_ag3.py +++ b/tests/test_ag3.py @@ -2288,12 +2288,12 @@ def test_xpehh_gwss(): assert isinstance(xpehh, np.ndarray) # check dimensions - assert len(x) == 395 + assert len(x) == 399 assert len(x) == len(xpehh) # check some values - assert_allclose(x[0], 510232.847) - assert_allclose(xpehh[:, 2][100], 1.7302443850220177) + assert_allclose(x[0], 467448.348) + assert_allclose(xpehh[:, 2][100], 2.741348351867086) def test_g123_gwss(): From 25a83725f337eff4afbcf446e64fa0648b1b5328 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:40:22 +0200 Subject: [PATCH 12/26] Hopefully solved the cache_name issue --- malariagen_data/af1.py | 2 ++ malariagen_data/ag3.py | 2 ++ malariagen_data/anopheles.py | 44 ++++-------------------------------- 3 files changed, 9 insertions(+), 39 deletions(-) diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py index 0fe6c4113..683f9eaa2 100644 --- a/malariagen_data/af1.py +++ b/malariagen_data/af1.py @@ -13,6 +13,7 @@ H12_CALIBRATION_CACHE_NAME = "af1_h12_calibration_v1" H12_GWSS_CACHE_NAME = "af1_h12_gwss_v1" G123_GWSS_CACHE_NAME = "af1_g123_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "af1_g123_gwss_v1" G123_CALIBRATION_CACHE_NAME = "af1_g123_calibration_v1" H1X_GWSS_CACHE_NAME = "af1_h1x_gwss_v1" IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1" @@ -75,6 +76,7 @@ class Af1(AnophelesDataResource): _h12_calibration_cache_name = H12_CALIBRATION_CACHE_NAME _h12_gwss_cache_name = H12_GWSS_CACHE_NAME _g123_gwss_cache_name = G123_GWSS_CACHE_NAME + _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 1f97c4ae6..405796a47 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -42,6 +42,7 @@ H12_GWSS_CACHE_NAME = "ag3_h12_gwss_v1" G123_CALIBRATION_CACHE_NAME = "ag3_g123_calibration_v1" G123_GWSS_CACHE_NAME = "ag3_g123_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" H1X_GWSS_CACHE_NAME = "ag3_h1x_gwss_v1" IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1" XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" @@ -140,6 +141,7 @@ class Ag3(AnophelesDataResource): _h12_calibration_cache_name = H12_CALIBRATION_CACHE_NAME _h12_gwss_cache_name = H12_GWSS_CACHE_NAME _g123_gwss_cache_name = G123_GWSS_CACHE_NAME + _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index f92c9165d..7b214a81d 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -180,6 +180,11 @@ def _h12_gwss_cache_name(self): def _g123_gwss_cache_name(self): raise NotImplementedError("Must override _g123_gwss_cache_name") + @property + @abstractmethod + def _xpehh_gwss_cache_name(self): + raise NotImplementedError("Must override _xpehh_gwss_cache_name") + @property @abstractmethod def _h1x_gwss_cache_name(self): @@ -190,45 +195,6 @@ def _h1x_gwss_cache_name(self): def _ihs_gwss_cache_name(self): raise NotImplementedError("Must override _ihs_gwss_cache_name") - # @property - # @abstractmethod - # def _xpehh_gwss_cache_name(self): - # raise NotImplementedError("Must override _xpehh_gwss_cache_name") - - # @property - # @abstractmethod - # def _site_annotations_zarr_path(self): - # raise NotImplementedError("Must override _site_annotations_zarr_path") - - # # Start of @abstractmethod - - # @property - # @abstractmethod - # def site_mask_ids(self): - # """Identifiers for the different site masks that are available. - # These are values than can be used for the `site_mask` parameter in any - # method making using of SNP data. - - # """ - # raise NotImplementedError("Must override _site_mask_ids") - - # @property - # @abstractmethod - # def _default_site_mask(self): - # raise NotImplementedError("Must override _default_site_mask") - - # def _prep_site_mask_param(self, *, site_mask): - # if site_mask is None: - # # allowed - # pass - # elif site_mask == DEFAULT: - # site_mask = self._default_site_mask - # elif site_mask not in self.site_mask_ids: - # raise ValueError( - # f"Invalid site mask, must be one of f{self.site_mask_ids}." - # ) - # return site_mask - @abstractmethod def _transcript_to_gene_name(self, transcript): # children may have different manual overrides. From d381c8d4fd78f3e42c36f806adde5f66fb8331c0 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Thu, 15 Jun 2023 08:08:12 +0200 Subject: [PATCH 13/26] Changed parameter names to be more uniform --- malariagen_data/anopheles.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 7b214a81d..968cb0d3c 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -5194,8 +5194,8 @@ def _xpehh_gwss( contig, analysis, sample_sets, - sample_query1, - sample_query2, + cohort1_query, + cohort2_query, window_size, percentiles, standardize, @@ -5216,7 +5216,7 @@ def _xpehh_gwss( ds_haps1 = self.haplotypes( region=contig, analysis=analysis, - sample_query=sample_query1, + sample_query=cohort1_query, sample_sets=sample_sets, min_cohort_size=min_cohort_size, max_cohort_size=max_cohort_size, @@ -5226,7 +5226,7 @@ def _xpehh_gwss( ds_haps2 = self.haplotypes( region=contig, analysis=analysis, - sample_query=sample_query2, + sample_query=cohort2_query, sample_sets=sample_sets, min_cohort_size=min_cohort_size, max_cohort_size=max_cohort_size, @@ -5308,8 +5308,8 @@ def plot_xpehh_gwss_track( contig: base_params.contig, analysis: hap_params.analysis = DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, - sample_query1: Optional[base_params.sample_query] = None, - sample_query2: Optional[base_params.sample_query] = None, + cohort1_query: Optional[base_params.sample_query] = None, + cohort2_query: Optional[base_params.sample_query] = None, window_size: xpehh_params.window_size = xpehh_params.window_size_default, percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, standardize: xpehh_params.standardize = True, @@ -5357,8 +5357,8 @@ def plot_xpehh_gwss_track( use_threads=use_threads, min_cohort_size=min_cohort_size, max_cohort_size=max_cohort_size, - sample_query1=sample_query1, - sample_query2=sample_query2, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, sample_sets=sample_sets, random_seed=random_seed, ) @@ -5374,10 +5374,10 @@ def plot_xpehh_gwss_track( dimensions="width", maintain_focus=False ) if title is None: - if sample_query1 is None or sample_query2 is None: + if cohort1_query is None or cohort2_query is None: title = "XP-EHH" else: - title = "'" + sample_query1 + "' and '" + sample_query2 + "'" + title = f"Cohort 1: {cohort1_query}\nCohort 2: {cohort2_query}" fig = bokeh.plotting.figure( title=title, tools=["xpan", "xzoom_in", "xzoom_out", xwheel_zoom, "reset"], From 23a81a4a4cb9eb58137d1616428855a12930e234 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Thu, 15 Jun 2023 08:19:06 +0200 Subject: [PATCH 14/26] Adds show paramater to plot_xpehh_gwss() --- malariagen_data/anopheles.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 968cb0d3c..2710d0c5c 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -4420,8 +4420,8 @@ def plot_xpehh_gwss( contig: base_params.contig, analysis: hap_params.analysis = DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, - sample_query1: Optional[base_params.sample_query] = None, - sample_query2: Optional[base_params.sample_query] = None, + cohort1_query: Optional[base_params.sample_query] = None, + cohort2_query: Optional[base_params.sample_query] = None, window_size: xpehh_params.window_size = xpehh_params.window_size_default, percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, standardize: xpehh_params.standardize = True, @@ -4448,14 +4448,16 @@ def plot_xpehh_gwss( width: gplt_params.width = gplt_params.width_default, track_height: gplt_params.track_height = 170, genes_height: gplt_params.genes_height = gplt_params.genes_height_default, - ) -> None: + show: gplt_params.show = True, + output_backend: gplt_params.output_backend = gplt_params.output_backend_default, + ) -> gplt_params.figure: # gwss track fig1 = self.plot_xpehh_gwss_track( contig=contig, analysis=analysis, sample_sets=sample_sets, - sample_query1=sample_query1, - sample_query2=sample_query2, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, window_size=window_size, percentiles=percentiles, palette=palette, @@ -4502,7 +4504,11 @@ def plot_xpehh_gwss( sizing_mode=sizing_mode, ) - bokeh.plotting.show(fig) + if show: + bokeh.plotting.show(fig) + return None + else: + return fig @doc( summary="Run and plot iHS GWSS data.", @@ -5337,7 +5343,8 @@ def plot_xpehh_gwss_track( height: gplt_params.height = 200, show: gplt_params.show = True, x_range: Optional[gplt_params.x_range] = None, - ): + output_backend: gplt_params.output_backend = gplt_params.output_backend_default, + ) -> gplt_params.figure: # compute ihs x, xpehh = self.xpehh_gwss( contig=contig, From 2f8a59c61957930cc700df0944c77ffcbb0ec8ac Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Thu, 15 Jun 2023 08:22:41 +0200 Subject: [PATCH 15/26] Added support for output_backend --- malariagen_data/anopheles.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 2710d0c5c..a7765a993 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -4493,6 +4493,7 @@ def plot_xpehh_gwss( height=genes_height, x_range=fig1.x_range, show=False, + output_backend=output_backend, ) # combine plots into a single figure @@ -5395,6 +5396,7 @@ def plot_xpehh_gwss_track( height=height, toolbar_location="above", x_range=x_range, + output_backend=output_backend, ) if window_size: @@ -5429,8 +5431,9 @@ def plot_xpehh_gwss_track( if show: bokeh.plotting.show(fig) - - return fig + return None + else: + return fig @doc( summary=""" From 63659483f65476d0e868a2d770e85074690eecb2 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Thu, 15 Jun 2023 08:27:30 +0200 Subject: [PATCH 16/26] Updated colors for webgl --- malariagen_data/anopheles.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index a7765a993..2a8ad5384 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -5402,18 +5402,24 @@ def plot_xpehh_gwss_track( if window_size: if isinstance(percentiles, int): percentiles = (percentiles,) + # Ensure percentiles are sorted so that colors make sense. + percentiles = tuple(sorted(percentiles)) # add an empty dimension to xpehh array if 1D xpehh = np.reshape(xpehh, (xpehh.shape[0], -1)) - bokeh_palette = bokeh.palettes.all_palettes[palette] + + # select the base color palette to work from + base_palette = bokeh.palettes.all_palettes[palette][8] + + # keep only enough colours to plot the IHS tracks + bokeh_palette = base_palette[: xpehh.shape[1]] + + # reverse the colors so darkest is last + bokeh_palette = bokeh_palette[::-1] + for i in range(xpehh.shape[1]): xpehh_perc = xpehh[:, i] - if xpehh.shape[1] >= 3: - color = bokeh_palette[xpehh.shape[1]][i] - elif xpehh.shape[1] == 2: - color = bokeh_palette[3][i] - else: - color = None + color = bokeh_palette[i] # plot xpehh fig.circle( From f1f212ea8634c211a6a9312f1d4915e0aba68b74 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:37:32 +0200 Subject: [PATCH 17/26] Updated the tests with new parameter names --- malariagen_data/anopheles.py | 8 ++++---- tests/test_ag3.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 2a8ad5384..cc5e23d7b 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -5129,8 +5129,8 @@ def xpehh_gwss( contig: base_params.contig, analysis: hap_params.analysis = DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, - sample_query1: Optional[base_params.sample_query] = None, - sample_query2: Optional[base_params.sample_query] = None, + cohort1_query: Optional[base_params.sample_query] = None, + cohort2_query: Optional[base_params.sample_query] = None, window_size: xpehh_params.window_size = xpehh_params.window_size_default, percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, standardize: xpehh_params.standardize = True, @@ -5176,8 +5176,8 @@ def xpehh_gwss( # N.B., do not be tempted to convert this sample query into integer # indices using _prep_sample_selection_params, because the indices # are different in the haplotype data. - sample_query1=sample_query1, - sample_query2=sample_query2, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, min_cohort_size=min_cohort_size, max_cohort_size=max_cohort_size, random_seed=random_seed, diff --git a/tests/test_ag3.py b/tests/test_ag3.py index d8fee57ae..102ebd266 100644 --- a/tests/test_ag3.py +++ b/tests/test_ag3.py @@ -2267,8 +2267,8 @@ def test_ihs_gwss(): def test_xpehh_gwss(): ag3 = setup_ag3() - sample_query1 = "country == 'Ghana'" - sample_query2 = "country == 'Angola'" + cohort1_query = "country == 'Ghana'" + cohort2_query = "country == 'Angola'" contig = "3L" analysis = "gamb_colu" sample_sets = "3.0" @@ -2277,8 +2277,8 @@ def test_xpehh_gwss(): x, xpehh = ag3.xpehh_gwss( contig=contig, analysis=analysis, - sample_query1=sample_query1, - sample_query2=sample_query2, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, sample_sets=sample_sets, window_size=window_size, max_cohort_size=20, From 256ecfa39c351e175c1b03934069161aed87a706 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Fri, 16 Jun 2023 10:43:50 +0200 Subject: [PATCH 18/26] Correct bad copy/paste Co-authored-by: Alistair Miles --- malariagen_data/af1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py index 683f9eaa2..47b8b8c98 100644 --- a/malariagen_data/af1.py +++ b/malariagen_data/af1.py @@ -13,7 +13,7 @@ H12_CALIBRATION_CACHE_NAME = "af1_h12_calibration_v1" H12_GWSS_CACHE_NAME = "af1_h12_gwss_v1" G123_GWSS_CACHE_NAME = "af1_g123_gwss_v1" -XPEHH_GWSS_CACHE_NAME = "af1_g123_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1" G123_CALIBRATION_CACHE_NAME = "af1_g123_calibration_v1" H1X_GWSS_CACHE_NAME = "af1_h1x_gwss_v1" IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1" From 282013c53fac3335fa7c655caf4f879eae8c4fe9 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Fri, 16 Jun 2023 10:45:09 +0200 Subject: [PATCH 19/26] Corrected missing parameter Co-authored-by: Alistair Miles --- malariagen_data/anopheles.py | 1 + 1 file changed, 1 insertion(+) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index cc5e23d7b..30c8119fa 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -4481,6 +4481,7 @@ def plot_xpehh_gwss( height=track_height, show=False, x_range=None, + output_backend=output_backend, ) fig1.xaxis.visible = False From 7ed2ea67f4773805a8be311493ca54c55b5de4db Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Fri, 16 Jun 2023 11:00:14 +0200 Subject: [PATCH 20/26] A few lines had escaped deletion --- malariagen_data/ag3.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 405796a47..59b899415 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -145,11 +145,6 @@ class Ag3(AnophelesDataResource): _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME - _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME - site_mask_ids = ("gamb_colu_arab", "gamb_colu", "arab") - _default_site_mask = DEFAULT_SITE_MASK - phasing_analysis_ids = ("gamb_colu_arab", "gamb_colu", "arab") - _default_phasing_analysis = "gamb_colu_arab" def __init__( self, From ac91865a64e6158a77eab1deff64dd3b464c6290 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 20 Jun 2023 15:59:07 +0200 Subject: [PATCH 21/26] Unremoved an important line --- malariagen_data/ag3.py | 2 -- malariagen_data/anoph/xpehh_params.py | 18 ------------ malariagen_data/anopheles.py | 40 ++------------------------- 3 files changed, 3 insertions(+), 57 deletions(-) diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 59b899415..590e76ba6 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -45,8 +45,6 @@ XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" H1X_GWSS_CACHE_NAME = "ag3_h1x_gwss_v1" IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1" -XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" -DEFAULT_SITE_MASK = "gamb_colu_arab" def _setup_aim_palettes(): diff --git a/malariagen_data/anoph/xpehh_params.py b/malariagen_data/anoph/xpehh_params.py index ca2a940b3..d8dab0016 100644 --- a/malariagen_data/anoph/xpehh_params.py +++ b/malariagen_data/anoph/xpehh_params.py @@ -25,24 +25,6 @@ """, ] percentiles_default: percentiles = (50, 75, 100) -standardize: TypeAlias = Annotated[ - bool, "If True, standardize XP-EHH values by alternate allele counts." -] -standardization_bins: TypeAlias = Annotated[ - Tuple[float, ...], - "If provided, use these allele count bins to standardize XP-EHH values.", -] -standardization_n_bins: TypeAlias = Annotated[ - int, - """ - Number of allele count bins to use for standardization. - Overrides standardization_bins. - """, -] -standardization_n_bins_default: standardization_n_bins = 20 -standardization_diagnostics: TypeAlias = Annotated[ - bool, "If True, plot some diagnostics about the standardization." -] filter_min_maf: TypeAlias = Annotated[ float, """ diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 30c8119fa..7f80543a1 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -4413,7 +4413,7 @@ def plot_ihs_gwss_track( @check_types @doc( - summary="Run and plot iHS GWSS data.", + summary="Run and plot XP-EHH GWSS data.", ) def plot_xpehh_gwss( self, @@ -5134,10 +5134,6 @@ def xpehh_gwss( cohort2_query: Optional[base_params.sample_query] = None, window_size: xpehh_params.window_size = xpehh_params.window_size_default, percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, - standardize: xpehh_params.standardize = True, - standardization_bins: Optional[xpehh_params.standardization_bins] = None, - standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default, - standardization_diagnostics: xpehh_params.standardization_diagnostics = False, filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, map_pos: Optional[xpehh_params.map_pos] = None, min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, @@ -5162,10 +5158,6 @@ def xpehh_gwss( analysis=self._prep_phasing_analysis_param(analysis=analysis), window_size=window_size, percentiles=percentiles, - standardize=standardize, - standardization_bins=standardization_bins, - standardization_n_bins=standardization_n_bins, - standardization_diagnostics=standardization_diagnostics, filter_min_maf=filter_min_maf, map_pos=map_pos, min_ehh=min_ehh, @@ -5206,10 +5198,6 @@ def _xpehh_gwss( cohort2_query, window_size, percentiles, - standardize, - standardization_bins, - standardization_n_bins, - standardization_diagnostics, filter_min_maf, map_pos, min_ehh, @@ -5270,7 +5258,6 @@ def _xpehh_gwss( h2=ht2, pos=pos, map_pos=map_pos, - # min_maf=compute_min_maf, min_ehh=min_ehh, include_edges=include_edges, max_gap=max_gap, @@ -5285,19 +5272,6 @@ def _xpehh_gwss( ac1 = ac1[na_mask] ac2 = ac2[na_mask] - # take absolute value - xp = np.fabs(xp) - - # Update to take into account both sets - if standardize: - xp, _ = allel.standardize_by_allele_count( - score=xp, - aac=ac1[:, 1], - bins=standardization_bins, - n_bins=standardization_n_bins, - diagnostics=standardization_diagnostics, - ) - if window_size: xp = allel.moving_statistic( xp, statistic=np.percentile, size=window_size, q=percentiles @@ -5320,10 +5294,6 @@ def plot_xpehh_gwss_track( cohort2_query: Optional[base_params.sample_query] = None, window_size: xpehh_params.window_size = xpehh_params.window_size_default, percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, - standardize: xpehh_params.standardize = True, - standardization_bins: Optional[xpehh_params.standardization_bins] = None, - standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default, - standardization_diagnostics: xpehh_params.standardization_diagnostics = False, filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, map_pos: Optional[xpehh_params.map_pos] = None, min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, @@ -5347,16 +5317,12 @@ def plot_xpehh_gwss_track( x_range: Optional[gplt_params.x_range] = None, output_backend: gplt_params.output_backend = gplt_params.output_backend_default, ) -> gplt_params.figure: - # compute ihs + # compute xpehh x, xpehh = self.xpehh_gwss( contig=contig, analysis=analysis, window_size=window_size, percentiles=percentiles, - standardize=standardize, - standardization_bins=standardization_bins, - standardization_n_bins=standardization_n_bins, - standardization_diagnostics=standardization_diagnostics, filter_min_maf=filter_min_maf, map_pos=map_pos, min_ehh=min_ehh, @@ -5412,7 +5378,7 @@ def plot_xpehh_gwss_track( # select the base color palette to work from base_palette = bokeh.palettes.all_palettes[palette][8] - # keep only enough colours to plot the IHS tracks + # keep only enough colours to plot the XPEHH tracks bokeh_palette = base_palette[: xpehh.shape[1]] # reverse the colors so darkest is last From 966df02c4003fc4343ab49fea0901603b74d75dd Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 20 Jun 2023 16:17:17 +0200 Subject: [PATCH 22/26] Forgot the main file --- malariagen_data/anopheles.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 7f80543a1..fc0bc49e9 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -4424,10 +4424,6 @@ def plot_xpehh_gwss( cohort2_query: Optional[base_params.sample_query] = None, window_size: xpehh_params.window_size = xpehh_params.window_size_default, percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, - standardize: xpehh_params.standardize = True, - standardization_bins: Optional[xpehh_params.standardization_bins] = None, - standardization_n_bins: xpehh_params.standardization_n_bins = xpehh_params.standardization_n_bins_default, - standardization_diagnostics: xpehh_params.standardization_diagnostics = False, filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, map_pos: Optional[xpehh_params.map_pos] = None, min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, @@ -4461,10 +4457,6 @@ def plot_xpehh_gwss( window_size=window_size, percentiles=percentiles, palette=palette, - standardize=standardize, - standardization_bins=standardization_bins, - standardization_n_bins=standardization_n_bins, - standardization_diagnostics=standardization_diagnostics, filter_min_maf=filter_min_maf, map_pos=map_pos, min_ehh=min_ehh, @@ -5372,13 +5364,13 @@ def plot_xpehh_gwss_track( # Ensure percentiles are sorted so that colors make sense. percentiles = tuple(sorted(percentiles)) - # add an empty dimension to xpehh array if 1D + # add an empty dimension to XP-EHH array if 1D xpehh = np.reshape(xpehh, (xpehh.shape[0], -1)) # select the base color palette to work from base_palette = bokeh.palettes.all_palettes[palette][8] - # keep only enough colours to plot the XPEHH tracks + # keep only enough colours to plot the XP-EHH tracks bokeh_palette = base_palette[: xpehh.shape[1]] # reverse the colors so darkest is last @@ -5388,7 +5380,7 @@ def plot_xpehh_gwss_track( xpehh_perc = xpehh[:, i] color = bokeh_palette[i] - # plot xpehh + # plot XP-EHH fig.circle( x=x, y=xpehh_perc, @@ -5399,7 +5391,7 @@ def plot_xpehh_gwss_track( ) # tidy up the plot - fig.yaxis.axis_label = "xpehh" + fig.yaxis.axis_label = "XP-EHH" self._bokeh_style_genome_xaxis(fig, contig) if show: From 8aad3b541830bc7f761c3baf0e9b61b9116a3c81 Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 20 Jun 2023 17:20:43 +0200 Subject: [PATCH 23/26] update the test --- tests/test_ag3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ag3.py b/tests/test_ag3.py index 102ebd266..d916e23a3 100644 --- a/tests/test_ag3.py +++ b/tests/test_ag3.py @@ -2293,7 +2293,7 @@ def test_xpehh_gwss(): # check some values assert_allclose(x[0], 467448.348) - assert_allclose(xpehh[:, 2][100], 2.741348351867086) + assert_allclose(xpehh[:, 2][100], 0.4817561326426265) def test_g123_gwss(): From fdde74f04f60cc609680e7c47b586becefd10bab Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Tue, 20 Jun 2023 20:50:15 +0200 Subject: [PATCH 24/26] Added a plot notebook as an example --- notebooks/plot_xpehh_gwss.ipynb | 178 ++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 notebooks/plot_xpehh_gwss.ipynb diff --git a/notebooks/plot_xpehh_gwss.ipynb b/notebooks/plot_xpehh_gwss.ipynb new file mode 100644 index 000000000..d6d6dfe46 --- /dev/null +++ b/notebooks/plot_xpehh_gwss.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "32067e1d", + "metadata": {}, + "outputs": [], + "source": [ + "import malariagen_data\n", + "\n", + "ag3 = malariagen_data.Ag3(\n", + " \"simplecache::gs://vo_agam_release\",\n", + " simplecache=dict(cache_storage=\"../gcs_cache\"),\n", + ")\n", + "ag3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "966049ec", + "metadata": {}, + "outputs": [], + "source": [ + "af1 = malariagen_data.Af1(\n", + " \"simplecache::gs://vo_afun_release\",\n", + " simplecache=dict(cache_storage=\"../gcs_cache\"),\n", + ")\n", + "af1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "838eddc4", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss_track(\n", + " contig=\"X\",\n", + " window_size=1_000,\n", + " analysis=\"gamb_colu\",\n", + " percentiles=(50, 60, 90, 100),\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e780aa23", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss_track(\n", + " contig=\"X\",\n", + " window_size=1_000,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + " filter_min_maf=0.1,\n", + " width=700,\n", + " sizing_mode=\"fixed\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b26a9654", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss_track(\n", + " contig=\"X\",\n", + " window_size=100,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89cf50ac", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss_track(\n", + " contig=\"X\",\n", + " window_size=1_000,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " min_ehh=0.1,\n", + " max_cohort_size=None,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78b452f5", + "metadata": {}, + "outputs": [], + "source": [ + "af1.sample_metadata()[\"cohort_admin1_year\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "079a3308", + "metadata": {}, + "outputs": [], + "source": [ + "af1.plot_xpehh_gwss_track(\n", + " contig=\"X\",\n", + " window_size=2_000,\n", + " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", + " cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n", + " max_cohort_size=20,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6290896", + "metadata": {}, + "outputs": [], + "source": [ + "af1.plot_xpehh_gwss_track(\n", + " contig=\"X\",\n", + " window_size=200,\n", + " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", + " cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n", + " max_cohort_size=20,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5aed96d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From afad321f30817e1e027c91495f6b2ff95ce8af79 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 22 Jun 2023 12:35:12 +0100 Subject: [PATCH 25/26] remove cruft --- malariagen_data/anopheles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index fc0bc49e9..858faf6fd 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -5231,7 +5231,6 @@ def _xpehh_gwss( ac2 = ht2.count_alleles(max_allele=1) pos = ds_haps1["variant_position"].values - # To be modified to take both into account if filter_min_maf > 0: ac = ac1 + ac2 af = ac.to_frequencies() From 3cf92c16bdd7b8d96f84df4a5be20d06b1e132ea Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 22 Jun 2023 12:45:50 +0100 Subject: [PATCH 26/26] tweak examples in notebook --- notebooks/plot_xpehh_gwss.ipynb | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/notebooks/plot_xpehh_gwss.ipynb b/notebooks/plot_xpehh_gwss.ipynb index d6d6dfe46..7d0b3005c 100644 --- a/notebooks/plot_xpehh_gwss.ipynb +++ b/notebooks/plot_xpehh_gwss.ipynb @@ -37,7 +37,7 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.plot_xpehh_gwss_track(\n", + "ag3.plot_xpehh_gwss(\n", " contig=\"X\",\n", " window_size=1_000,\n", " analysis=\"gamb_colu\",\n", @@ -75,7 +75,7 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.plot_xpehh_gwss_track(\n", + "ag3.plot_xpehh_gwss(\n", " contig=\"X\",\n", " window_size=100,\n", " analysis=\"gamb_colu\",\n", @@ -85,6 +85,23 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "181227f6", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss(\n", + " contig=\"2RL\",\n", + " window_size=100,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -92,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.plot_xpehh_gwss_track(\n", + "ag3.plot_xpehh_gwss(\n", " contig=\"X\",\n", " window_size=1_000,\n", " analysis=\"gamb_colu\",\n", @@ -120,7 +137,7 @@ "metadata": {}, "outputs": [], "source": [ - "af1.plot_xpehh_gwss_track(\n", + "af1.plot_xpehh_gwss(\n", " contig=\"X\",\n", " window_size=2_000,\n", " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", @@ -136,8 +153,8 @@ "metadata": {}, "outputs": [], "source": [ - "af1.plot_xpehh_gwss_track(\n", - " contig=\"X\",\n", + "af1.plot_xpehh_gwss(\n", + " contig=\"2RL\",\n", " window_size=200,\n", " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", " cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n", @@ -170,7 +187,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.6" } }, "nbformat": 4,