Skip to content

Commit 94cea0c

Browse files
committed
add pipeline steps to prepare release browser hts
1 parent abbcaec commit 94cea0c

File tree

6 files changed

+78
-4
lines changed

6 files changed

+78
-4
lines changed

data-pipeline/src/data_pipeline/data_types/gene.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,16 @@ def import_hgnc(path):
226226
return ds
227227

228228

229+
def prepare_gene_table_for_release(genes_path, keep_mane_version_global_annotation):
    """Read the genes table and trim its globals for release.

    Args:
        genes_path: Path of the Hail table to read.
        keep_mane_version_global_annotation: When truthy, the returned table
            keeps a single global, ``mane_select_version``, copied from
            ``annotations.mane_select_transcript.version`` in the input
            table's globals. Otherwise no global is selected.

    Returns:
        The table with its globals replaced as described above.
    """
    genes = hl.read_table(genes_path)

    if not keep_mane_version_global_annotation:
        # Selecting nothing strips every global annotation from the table.
        return genes.select_globals()

    mane_select_annotation = genes.index_globals()["annotations"]["mane_select_transcript"]
    return genes.select_globals(mane_select_version=mane_select_annotation["version"])
237+
238+
229239
def prepare_genes(gencode_path, hgnc_path, reference_genome):
230240
genes = import_gencode(gencode_path, reference_genome)
231241

data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans
8585

8686
if mane_transcripts_path:
8787
mane_transcripts = hl.read_table(mane_transcripts_path)
88-
mane_transcripts_version = hl.eval(mane_transcripts.globals.version)
88+
mane_select_transcripts_version = hl.eval(mane_transcripts.globals.version)
8989

9090
mane_transcripts = hl.dict([(row.gene_id, row.drop("gene_id")) for row in mane_transcripts.collect()])
9191

@@ -138,7 +138,7 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans
138138
)
139139

140140
ds = ds.annotate(transcript_consequences=transcript_consequences).drop("vep")
141-
ds = ds.annotate_globals(mane_transcripts_version=mane_transcripts_version)
141+
ds = ds.annotate_globals(mane_select_version=mane_select_transcripts_version)
142142

143143
else:
144144
transcript_consequences = hl.sorted(

data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_constraint.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,9 @@ def prepare_gnomad_v4_constraint(path):
4343
ds = ds.key_by("transcript_id")
4444

4545
return ds
46+
47+
48+
def remove_gnomad_v4_constraint(genes_path):
    """Read the table at ``genes_path`` and return it without ``gnomad_constraint``."""
    return hl.read_table(genes_path).drop("gnomad_constraint")

data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,17 @@ def freq_joint(ds, subset=None, pop=None, sex=None, raw=False):
391391
return ds
392392

393393

394+
def prepare_table_for_release(variants_table_path):
    """Read the variants table and strip fields not intended for release.

    Drops ``faf95``/``faf99`` from ``exomes`` and ``genomes``, drops
    ``faf99_joint``/``faf95_joint`` from ``joint``, and replaces the table
    globals with a single ``mane_select_version`` global.

    Args:
        variants_table_path: Path of the Hail table to read.

    Returns:
        The trimmed table.
    """
    ds = hl.read_table(variants_table_path)
    ds = ds.annotate(
        exomes=ds.exomes.drop("faf95", "faf99"),
        genomes=ds.genomes.drop("faf95", "faf99"),
        joint=ds.joint.drop("faf99_joint", "faf95_joint"),
    )

    # annotate_transcript_consequences now writes this global as
    # `mane_select_version`; tables produced before that rename still carry
    # `mane_transcripts_version`. Accept either so both kinds of input work.
    if "mane_select_version" in ds.globals.dtype.fields:
        mane_select_version = ds.globals.mane_select_version
    else:
        mane_select_version = ds.globals.mane_transcripts_version

    ds = ds.select_globals(mane_select_version=mane_select_version)
    return ds
403+
404+
394405
def prepare_gnomad_v4_variants(exome_variants_path: str, genome_variants_path: str, variants_joint_frequency_path: str):
395406
exome_variants = prepare_gnomad_v4_variants_helper(exome_variants_path, "exome")
396407
genome_variants = prepare_gnomad_v4_variants_helper(genome_variants_path, "genome")

data-pipeline/src/data_pipeline/pipelines/genes.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from data_pipeline.helpers import annotate_table
66

7-
from data_pipeline.data_types.gene import prepare_genes
7+
from data_pipeline.data_types.gene import prepare_genes, prepare_gene_table_for_release
88
from data_pipeline.data_types.canonical_transcript import get_canonical_transcripts
99
from data_pipeline.data_types.mane_select_transcript import import_mane_select_transcripts
1010
from data_pipeline.data_types.transcript import (
@@ -27,7 +27,10 @@
2727
)
2828
from data_pipeline.data_types.gene import reject_par_y_genes
2929

30-
from data_pipeline.datasets.gnomad_v4.gnomad_v4_constraint import prepare_gnomad_v4_constraint
30+
from data_pipeline.datasets.gnomad_v4.gnomad_v4_constraint import (
31+
prepare_gnomad_v4_constraint,
32+
remove_gnomad_v4_constraint,
33+
)
3134

3235
pipeline = Pipeline()
3336

@@ -319,6 +322,17 @@ def annotate_with_preferred_transcript(table_path):
319322
},
320323
)
321324

325+
# naming scheme follows methods naming scheme for consistency
326+
pipeline.add_task(
    name="prepare_grch37_genes_table_for_public_release",
    task_function=prepare_gene_table_for_release,
    output_path=f"/{genes_subdir}/gnomad.browser.GRCh37.GENCODEv19.ht",
    inputs={
        "genes_path": pipeline.get_task("annotate_grch37_genes_step_5"),
        # NOTE(review): this is a boolean flag, not a table path, yet it is
        # passed alongside the path inputs; confirm add_task forwards
        # non-path input values to the task function unchanged.
        "keep_mane_version_global_annotation": False,
    },
)
335+
322336
pipeline.add_task(
323337
"annotate_grch38_genes_step_1",
324338
annotate_table,
@@ -373,6 +387,27 @@ def annotate_with_constraint(genes_path, constraint_path):
373387
},
374388
)
375389

390+
# Writes a copy of the step-5 GRCh38 genes table without `gnomad_constraint`.
pipeline.add_task(
    name="remove_grch38_genes_constraint_for_release",
    task_function=remove_gnomad_v4_constraint,
    output_path=f"/{genes_subdir}/genes_grch38_annotate_5_removed_constraint",
    inputs={"genes_path": pipeline.get_task("annotate_grch38_genes_step_5")},
)
398+
399+
400+
# naming scheme follows methods naming scheme for consistency
401+
pipeline.add_task(
    name="prepare_grch38_genes_table_for_public_release",
    task_function=prepare_gene_table_for_release,
    output_path=f"/{genes_subdir}/gnomad.browser.GRCh38.GENCODEv39.ht",
    inputs={
        # Built from the constraint-free table, not directly from step 5.
        "genes_path": pipeline.get_task("remove_grch38_genes_constraint_for_release"),
        # NOTE(review): this is a boolean flag, not a table path, yet it is
        # passed alongside the path inputs; confirm add_task forwards
        # non-path input values to the task function unchanged.
        "keep_mane_version_global_annotation": True,
    },
)
410+
376411
###############################################
377412
# Extract transcripts
378413
###############################################

data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from data_pipeline.datasets.gnomad_v4.gnomad_v4_variants import (
1010
prepare_gnomad_v4_variants,
11+
prepare_table_for_release,
1112
)
1213

1314

@@ -102,6 +103,17 @@
102103
},
103104
)
104105

106+
# removes several duplicated values (the faf95/faf99 fields) and every global except the MANE Select version, to prepare the table for release to the general public
107+
# naming scheme follows methods naming scheme for consistency
108+
pipeline.add_task(
    "prepare_table_for_release",  # task name
    prepare_table_for_release,  # task function
    f"{output_sub_dir}/gnomad.browser.v4.1.sites.ht",  # output path
    {"variants_table_path": pipeline.get_task("annotate_vrs_ids")},  # inputs
)
116+
105117
###############################################
106118
# Outputs
107119
###############################################

0 commit comments

Comments
 (0)