Genes data pipeline: update field mapping for variant cooccurrence (#1403)
broadinstitute · Feb 14, 2024 · 69dbfc6 · 69dbfc6
1 parent 025d12a
commit 69dbfc6
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 32 deletions.
diff --git a/data-pipeline/src/data_pipeline/datasets/clinvar.py b/data-pipeline/src/data_pipeline/datasets/clinvar.py
@@ -195,9 +195,11 @@ def import_clinvar_xml(clinvar_xml_path):
                             [
                                 locations["GRCh37"]["locus"] if "GRCh37" in locations else "NA",
                                 json.dumps(locations["GRCh37"]["alleles"]) if "GRCh37" in locations else "NA",
-                                "chr" + locations["GRCh38"]["locus"].replace("MT", "M")
-                                if "GRCh38" in locations
-                                else "NA",
+                                (
+                                    "chr" + locations["GRCh38"]["locus"].replace("MT", "M")
+                                    if "GRCh38" in locations
+                                    else "NA"
+                                ),
                                 json.dumps(locations["GRCh38"]["alleles"]) if "GRCh38" in locations else "NA",
                                 json.dumps(variant),
                             ]

diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py
@@ -443,9 +443,11 @@ def prepare_gnomad_v3_short_tandem_repeats(path):
                         "repeat_unit": repeat_unit,
                         # Loci with only one repeat unit do not have a RepeatUnitClassification field.
                         # In those cases, the repeat unit is pathogenic.
-                        "classification": locus["RepeatUnitClassification"].get(repeat_unit, "unknown").lower()
-                        if "RepeatUnitClassification" in locus
-                        else "pathogenic",
+                        "classification": (
+                            locus["RepeatUnitClassification"].get(repeat_unit, "unknown").lower()
+                            if "RepeatUnitClassification" in locus
+                            else "pathogenic"
+                        ),
                     }
                     for repeat_unit in (
                         set(k.split("/")[2] for k in locus["AlleleCountHistogram"].keys())

diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_variants.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_variants.py
@@ -102,12 +102,14 @@ def subset_filter(subset):
                                 id="_".join(filter(bool, [pop, sex])),
                                 ac=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AC, 0),
                                 an=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AN, 0),
-                                hemizygote_count=0
-                                if sex == "XX"
-                                else hl.if_else(
-                                    ds.in_autosome_or_par,
-                                    0,
-                                    hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+                                hemizygote_count=(
+                                    0
+                                    if sex == "XX"
+                                    else hl.if_else(
+                                        ds.in_autosome_or_par,
+                                        0,
+                                        hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+                                    )
                                 ),
                                 homozygote_count=hl.or_else(
                                     freq(ds, subset=subset, pop=pop, sex=sex).homozygote_count, 0

diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/gnomad_v4_variants.py
@@ -105,12 +105,14 @@ def subset_filter(subset):
                                 id="_".join(filter(bool, [pop, sex])),
                                 ac=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AC, 0),
                                 an=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AN, 0),
-                                hemizygote_count=0
-                                if sex == "XX"
-                                else hl.if_else(
-                                    ds.in_autosome_or_par,
-                                    0,
-                                    hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+                                hemizygote_count=(
+                                    0
+                                    if sex == "XX"
+                                    else hl.if_else(
+                                        ds.in_autosome_or_par,
+                                        0,
+                                        hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+                                    )
                                 ),
                                 homozygote_count=hl.or_else(
                                     freq(ds, subset=subset, pop=pop, sex=sex).homozygote_count, 0

diff --git a/data-pipeline/src/data_pipeline/pipelines/variant_cooccurrence_counts.py b/data-pipeline/src/data_pipeline/pipelines/variant_cooccurrence_counts.py
@@ -5,16 +5,16 @@
 
 AF_CUTOFF_MAPPING = hl.literal(
     {
-        "1.0000e-02": "af_cutoff_0_01",
-        "1.0000e-03": "af_cutoff_0_001",
-        "1.0000e-04": "af_cutoff_0_0001",
-        "1.0000e-05": "af_cutoff_0_00001",
-        "1.5000e-02": "af_cutoff_0_015",
-        "2.0000e-02": "af_cutoff_0_02",
-        "5.0000e-02": "af_cutoff_0_05",
-        "5.0000e-03": "af_cutoff_0_005",
-        "5.0000e-04": "af_cutoff_0_0005",
-        "5.0000e-05": "af_cutoff_0_00005",
+        "0.01": "af_cutoff_0_01",
+        "0.001": "af_cutoff_0_001",
+        "0.0001": "af_cutoff_0_0001",
+        "0.00001": "af_cutoff_0_00001",
+        "0.015": "af_cutoff_0_015",
+        "0.02": "af_cutoff_0_02",
+        "0.05": "af_cutoff_0_05",
+        "0.005": "af_cutoff_0_005",
+        "0.0005": "af_cutoff_0_0005",
+        "0.00005": "af_cutoff_0_00005",
     }
 )
 
@@ -47,10 +47,10 @@ def prepare_variant_cooccurrence_counts(tsv_path, field_name_map):
 
 def prepare_heterozygous_variant_cooccurrence_counts():
     field_name_map = {
-        "in_cis": "n_same_hap_without_chet_or_unphased",
-        "in_trans": "n_chet",
-        "unphased": "n_unphased_without_chet",
-        "two_het_total": "n_any_het_het",
+        "in_cis": "n_in_cis_without_in_trans_and_unphased",
+        "in_trans": "n_in_trans",
+        "unphased": "n_unphased_without_in_trans",
+        "two_het_total": "n_two_het",
     }
     return prepare_variant_cooccurrence_counts(TWO_HET_DATA_PATH, field_name_map)