Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filter_samples_ht and naive_coalesce_partitions to v4 genomes VDS code #655

Merged
merged 4 commits
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions gnomad_qc/v3/resources/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def get_gnomad_v3_vds(
split_reference_blocks: bool = True,
entries_to_keep: Optional[List[str]] = None,
annotate_het_non_ref: bool = False,
naive_coalesce_partitions: Optional[int] = None,
filter_samples_ht: Optional[hl.Table] = None,
) -> hl.vds.VariantDataset:
"""
Get gnomAD VariantDataset with desired filtering and metadata annotations.
Expand Down Expand Up @@ -63,6 +65,9 @@ def get_gnomad_v3_vds(
of the local entries (e.g. 'LGT') to keep.
:param annotate_het_non_ref: Whether to annotate non reference heterozygotes (as
'_het_non_ref') to the variant data. Default is False.
:param naive_coalesce_partitions: Optional argument to coalesce the VDS to a
specific number of partitions using naive coalesce.
:param filter_samples_ht: Optional Table of samples to filter the VDS to.
:return: gnomAD v3 dataset with chosen annotations and filters.
"""
if test:
Expand All @@ -89,6 +94,12 @@ def get_gnomad_v3_vds(
logger.info("Filtering to chromosome(s) %s...", chrom)
vds = hl.vds.filter_chromosomes(vds, keep=chrom)

if naive_coalesce_partitions:
vds = hl.vds.VariantDataset(
vds.reference_data.naive_coalesce(naive_coalesce_partitions),
vds.variant_data.naive_coalesce(naive_coalesce_partitions),
)

if filter_partitions:
logger.info("Filtering to %s partitions...", len(filter_partitions))
vds = hl.vds.VariantDataset(
Expand All @@ -114,6 +125,13 @@ def get_gnomad_v3_vds(
keep=False,
)

if filter_samples_ht:
logger.info(
"Filtering VDS to %d samples in provided Table...",
filter_samples_ht.count(),
)
vds = hl.vds.filter_samples(vds, filter_samples_ht)

if samples_meta or release_only:
meta_ht = meta.ht()
if release_only:
Expand Down
7 changes: 7 additions & 0 deletions gnomad_qc/v4/resources/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,8 @@ def get_gnomad_v4_genomes_vds(
split_reference_blocks: bool = True,
entries_to_keep: Optional[List[str]] = None,
annotate_het_non_ref: bool = False,
naive_coalesce_partitions: Optional[int] = None,
filter_samples_ht: Optional[hl.Table] = None,
) -> hl.vds.VariantDataset:
"""
Get gnomAD v4 genomes VariantDataset with desired filtering and metadata annotations.
Expand Down Expand Up @@ -438,6 +440,9 @@ def get_gnomad_v4_genomes_vds(
of the local entries (e.g. 'LGT') to keep.
:param annotate_het_non_ref: Whether to annotate non reference heterozygotes (as
'_het_non_ref') to the variant data. Default is False.
:param naive_coalesce_partitions: Optional argument to coalesce the VDS to a
specific number of partitions using naive coalesce.
:param filter_samples_ht: Optional Table of samples to filter the VDS to.
:return: gnomAD v4 genomes VariantDataset with chosen annotations and filters.
"""
vds = v3_basics.get_gnomad_v3_vds(
Expand All @@ -455,6 +460,8 @@ def get_gnomad_v4_genomes_vds(
split_reference_blocks=split_reference_blocks,
entries_to_keep=entries_to_keep,
annotate_het_non_ref=annotate_het_non_ref,
naive_coalesce_partitions=naive_coalesce_partitions,
filter_samples_ht=filter_samples_ht,
)

if annotate_meta or release_only:
Expand Down