Skip to content

Commit 6f0e887

Browse files
committed
feat(data-pipelines): add steps to prepare v4 variants ht for public release
1 parent 597f61d commit 6f0e887

File tree

3 files changed

+25
-2
lines changed

3 files changed

+25
-2
lines changed

data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans
8585

8686
if mane_transcripts_path:
8787
mane_transcripts = hl.read_table(mane_transcripts_path)
88-
mane_transcripts_version = hl.eval(mane_transcripts.globals.version)
88+
mane_select_transcripts_version = hl.eval(mane_transcripts.globals.version)
8989

9090
mane_transcripts = hl.dict([(row.gene_id, row.drop("gene_id")) for row in mane_transcripts.collect()])
9191

@@ -138,7 +138,7 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans
138138
)
139139

140140
ds = ds.annotate(transcript_consequences=transcript_consequences).drop("vep")
141-
ds = ds.annotate_globals(mane_transcripts_version=mane_transcripts_version)
141+
ds = ds.annotate_globals(mane_select_version=mane_select_transcripts_version)
142142

143143
else:
144144
transcript_consequences = hl.sorted(

data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,17 @@ def freq_joint(ds, subset=None, pop=None, sex=None, raw=False):
391391
return ds
392392

393393

394+
def prepare_table_for_release(variants_table_path):
    """Build the public-release version of the browser variants table.

    Reads the Hail table at ``variants_table_path`` and returns a table that:

    * has ``faf95`` and ``faf99`` removed from the ``exomes`` and ``genomes``
      structs, and ``faf99_joint`` and ``faf95_joint`` removed from the
      ``joint`` struct;
    * carries a single global field, ``mane_select_version``.

    Args:
        variants_table_path: Path of the variants Hail table to read.

    Returns:
        The trimmed Hail table (not written; the caller persists it).

    Raises:
        ValueError: If the input table has neither a ``mane_select_version``
            nor a ``mane_transcripts_version`` global.
    """
    ds = hl.read_table(variants_table_path)

    ds = ds.annotate(
        exomes=ds.exomes.drop("faf95", "faf99"),
        genomes=ds.genomes.drop("faf95", "faf99"),
        joint=ds.joint.drop("faf99_joint", "faf95_joint"),
    )

    # The transcript-consequence annotation step now writes this global as
    # `mane_select_version`; tables built before that rename still carry
    # `mane_transcripts_version`. Accept either so both kinds of input work.
    global_fields = ds.globals.dtype.fields
    if "mane_select_version" in global_fields:
        version_field = "mane_select_version"
    elif "mane_transcripts_version" in global_fields:
        version_field = "mane_transcripts_version"
    else:
        raise ValueError(
            "variants table has no MANE Select version global; expected "
            "'mane_select_version' or 'mane_transcripts_version', found: "
            f"{list(global_fields)}"
        )

    # select_globals keeps only the fields named here, so every other global
    # is dropped from the released table.
    ds = ds.select_globals(mane_select_version=ds.globals[version_field])
    return ds
403+
404+
394405
def prepare_gnomad_v4_variants(exome_variants_path: str, genome_variants_path: str, variants_joint_frequency_path: str):
395406
exome_variants = prepare_gnomad_v4_variants_helper(exome_variants_path, "exome")
396407
genome_variants = prepare_gnomad_v4_variants_helper(genome_variants_path, "genome")

data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from data_pipeline.datasets.gnomad_v4.gnomad_v4_variants import (
1010
prepare_gnomad_v4_variants,
11+
prepare_table_for_release,
1112
)
1213

1314

@@ -102,6 +103,17 @@
102103
},
103104
)
104105

106+
# removes several duplicated values, as well as constraint to prepare for release to the general public
107+
# naming scheme follows methods naming scheme for consistency
108+
pipeline.add_task(
109+
name="prepare_table_for_release",
110+
task_function=prepare_table_for_release,
111+
output_path=f"{output_sub_dir}/gnomad.browser.v4.1.sites.ht",
112+
inputs={
113+
"variants_table_path": pipeline.get_task("annotate_vrs_ids"),
114+
},
115+
)
116+
105117
###############################################
106118
# Outputs
107119
###############################################

0 commit comments

Comments
 (0)