Skip to content

Commit

Permalink
Add Ag3.haplotypes() implementation and tests (#54)
Browse files Browse the repository at this point in the history
* add Ag3.haplotypes() implementation and tests

* fix docstring

* fix linting? (I feel a CI thrash coming on...)

* ci thrash

* ci thrash - this time for sure

* prep for 0.9.0 release [ci skip]
  • Loading branch information
alimanfoo authored Jul 29, 2021
1 parent f3c3404 commit 149398a
Show file tree
Hide file tree
Showing 6 changed files with 298 additions and 12 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
- uses: pre-commit/[email protected]
with:
python-version: 3.7
- uses: pre-commit/[email protected]
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/keewis/blackdoc
rev: v0.3.3
rev: v0.3.4
hooks:
- id: blackdoc
- repo: https://gitlab.com/pycqa/flake8
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ $ poetry publish
## Release notes


### 0.9.0

* Add `Ag3.haplotypes()` and supporting functions `Ag3.open_haplotypes()`
and `Ag3.open_haplotype_sites()`.


### 0.8.0

* Add site filter columns to dataframes returned by
Expand Down
209 changes: 200 additions & 9 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def __init__(self, url, **kwargs):
self._cache_cnv_hmm = dict()
self._cache_cnv_coverage_calls = dict()
self._cache_cnv_discordant_read_calls = dict()
self._cache_haplotypes = dict()
self._cache_haplotype_sites = dict()

@property
def releases(self):
Expand Down Expand Up @@ -347,7 +349,7 @@ def site_filters(
field="filter_pass",
analysis="dt_20200416",
inline_array=True,
chunks="auto",
chunks="native",
):
"""Access SNP site filters.
Expand Down Expand Up @@ -400,7 +402,7 @@ def snp_sites(
site_mask=None,
site_filters="dt_20200416",
inline_array=True,
chunks="auto",
chunks="native",
):
"""Access SNP site data (positions and alleles).
Expand Down Expand Up @@ -480,7 +482,7 @@ def snp_genotypes(
site_mask=None,
site_filters="dt_20200416",
inline_array=True,
chunks="auto",
chunks="native",
):
"""Access SNP genotypes and associated data.
Expand Down Expand Up @@ -548,7 +550,7 @@ def open_genome(self):
self._cache_genome = zarr.open_consolidated(store=store)
return self._cache_genome

def genome_sequence(self, contig, inline_array=True, chunks="auto"):
def genome_sequence(self, contig, inline_array=True, chunks="native"):
"""Access the reference genome sequence.
Parameters
Expand Down Expand Up @@ -892,7 +894,7 @@ def site_annotations(
site_mask=None,
site_filters="dt_20200416",
inline_array=True,
chunks="auto",
chunks="native",
):
"""Load site annotations.
Expand Down Expand Up @@ -1006,7 +1008,7 @@ def snp_calls(
site_mask=None,
site_filters="dt_20200416",
inline_array=True,
chunks="auto",
chunks="native",
):
"""Access SNP sites, site filters and genotype calls.
Expand Down Expand Up @@ -1175,7 +1177,7 @@ def cnv_hmm(
contig,
sample_sets="v3_wild",
inline_array=True,
chunks="auto",
chunks="native",
):
"""Access CNV HMM data.
Expand Down Expand Up @@ -1270,7 +1272,7 @@ def cnv_coverage_calls(
sample_set,
analysis,
inline_array=True,
chunks="auto",
chunks="native",
):
"""Access CNV HMM data.
Expand Down Expand Up @@ -1471,7 +1473,7 @@ def cnv_discordant_read_calls(
contig,
sample_sets="v3_wild",
inline_array=True,
chunks="auto",
chunks="native",
):
"""Access CNV discordant read calls data.
Expand Down Expand Up @@ -1672,6 +1674,195 @@ def gene_cnv_frequencies(self, contig, cohorts=None, sample_sets="v3_wild"):

return df

def open_haplotypes(self, sample_set, analysis):
    """Open haplotypes zarr.

    Parameters
    ----------
    sample_set : str
        Sample set identifier, e.g., "AG1000G-AO".
    analysis : {"arab", "gamb_colu", "gamb_colu_arab"}
        Which phasing analysis to use.

    Returns
    -------
    root : zarr.hierarchy.Group
        The zarr root group, or None if this sample set has no data for
        the requested analysis.

    """
    cache_key = (sample_set, analysis)
    if cache_key not in self._cache_haplotypes:
        release = self._lookup_release(sample_set=sample_set)
        path = f"{self._path}/{release}/snp_haplotypes/{sample_set}/{analysis}/zarr"
        store = SafeStore(FSMap(root=path, fs=self._fs, check=False, create=False))
        # Not every sample set was phased in every analysis; such
        # combinations have no consolidated metadata, so cache None.
        if ".zmetadata" in store:
            root = zarr.open_consolidated(store=store)
        else:
            root = None
        self._cache_haplotypes[cache_key] = root
    return self._cache_haplotypes[cache_key]

def open_haplotype_sites(self, analysis):
    """Open haplotype sites zarr.

    Parameters
    ----------
    analysis : {"arab", "gamb_colu", "gamb_colu_arab"}
        Which phasing analysis to use.

    Returns
    -------
    root : zarr.hierarchy.Group

    """
    # Sites are shared across sample sets, keyed by analysis only.
    if analysis not in self._cache_haplotype_sites:
        path = f"{self._path}/v3/snp_haplotypes/sites/{analysis}/zarr"
        store = SafeStore(FSMap(root=path, fs=self._fs, check=False, create=False))
        self._cache_haplotype_sites[analysis] = zarr.open_consolidated(store=store)
    return self._cache_haplotype_sites[analysis]

def _haplotypes_dataset(self, contig, sample_set, analysis, inline_array, chunks):
    """Build an xarray.Dataset of phased variants and genotype calls for a
    single sample set, or return None if the sample set has no data for the
    requested analysis."""

    # open zarr hierarchies for calls and sites
    calls_root = self.open_haplotypes(sample_set=sample_set, analysis=analysis)
    sites_root = self.open_haplotype_sites(analysis=analysis)

    # some sample sets have no data for a given analysis, handle this
    if calls_root is None:
        return None

    coords = dict()
    data_vars = dict()

    # variant_position
    pos_z = sites_root[f"{contig}/variants/POS"]
    coords["variant_position"] = (
        [DIM_VARIANT],
        from_zarr(pos_z, inline_array=inline_array, chunks=chunks),
    )

    # variant_contig - constant array identifying the chromosome arm by index
    coords["variant_contig"] = (
        [DIM_VARIANT],
        da.full_like(pos_z, fill_value=self.contigs.index(contig), dtype="u1"),
    )

    # variant_allele - REF and ALT stacked into a (variants, 2) array
    ref_arr = from_zarr(
        sites_root[f"{contig}/variants/REF"], inline_array=inline_array, chunks=chunks
    )
    alt_arr = from_zarr(
        sites_root[f"{contig}/variants/ALT"], inline_array=inline_array, chunks=chunks
    )
    data_vars["variant_allele"] = (
        [DIM_VARIANT, DIM_ALLELE],
        da.hstack([ref_arr[:, None], alt_arr[:, None]]),
    )

    # call_genotype - phased genotype calls
    data_vars["call_genotype"] = (
        [DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY],
        from_zarr(
            calls_root[f"{contig}/calldata/GT"],
            inline_array=inline_array,
            chunks=chunks,
        ),
    )

    # sample identifiers
    coords["sample_id"] = (
        [DIM_SAMPLE],
        from_zarr(calls_root["samples"], inline_array=inline_array, chunks=chunks),
    )

    # assemble the dataset
    attrs = {"contigs": self.contigs}
    return xarray.Dataset(data_vars=data_vars, coords=coords, attrs=attrs)

def haplotypes(
    self,
    contig,
    analysis,
    sample_sets="v3_wild",
    inline_array=True,
    chunks="native",
):
    """Access haplotype data.

    Parameters
    ----------
    contig : str
        Chromosome arm, e.g., "3R".
    analysis : {"arab", "gamb_colu", "gamb_colu_arab"}
        Which phasing analysis to use. If analysing only An. arabiensis, the "arab" analysis
        is best. If analysing only An. gambiae and An. coluzzii, the "gamb_colu" analysis is
        best. Otherwise use the "gamb_colu_arab" analysis.
    sample_sets : str or list of str
        Can be a sample set identifier (e.g., "AG1000G-AO") or a list of sample set
        identifiers (e.g., ["AG1000G-BF-A", "AG1000G-BF-B"]) or a release identifier (e.g.,
        "v3") or a list of release identifiers.
    inline_array : bool, optional
        Passed through to dask.array.from_array().
    chunks : str, optional
        If 'auto' let dask decide chunk size. If 'native' use native zarr chunks.
        Also can be a target size, e.g., '200 MiB'.

    Returns
    -------
    ds : xarray.Dataset

    Raises
    ------
    ValueError
        If no sample set has data for the requested analysis.

    """

    sample_sets = self._prep_sample_sets_arg(sample_sets=sample_sets)

    if isinstance(sample_sets, str):
        # single sample set requested
        ds = self._haplotypes_dataset(
            contig=contig,
            sample_set=sample_sets,
            analysis=analysis,
            inline_array=inline_array,
            chunks=chunks,
        )

    else:
        # multiple sample sets requested; build one dataset per sample set,
        # dropping sets with no data for this analysis, then concatenate
        # along the samples dimension
        datasets = []
        for sample_set in sample_sets:
            d = self._haplotypes_dataset(
                contig=contig,
                sample_set=sample_set,
                analysis=analysis,
                inline_array=inline_array,
                chunks=chunks,
            )
            if d is not None:
                datasets.append(d)

        if datasets:
            ds = xarray.concat(
                datasets,
                dim=DIM_SAMPLE,
                data_vars="minimal",
                coords="minimal",
                compat="override",
                join="override",
            )
        else:
            ds = None

    # if no samples at all, raise
    if ds is None:
        raise ValueError(
            f"no samples available for analysis {analysis!r} and sample sets {sample_sets!r}"
        )

    return ds


@numba.njit("Tuple((int8, int64))(int8[:], int8)")
def _cn_mode_1d(a, vmax):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "malariagen_data"
version = "0.8.0"
version = "0.9.0"
description = "A package for accessing MalariaGEN public data."
authors = ["Alistair Miles <[email protected]>", "Chris Clarkson <[email protected]>"]
license = "MIT"
Expand Down
Loading

0 comments on commit 149398a

Please sign in to comment.