Skip to content

Commit

Permalink
Genes data pipeline: update field mapping for variant cooccurrence (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
nadeaujoshua authored Feb 14, 2024
1 parent 025d12a commit 69dbfc6
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 32 deletions.
8 changes: 5 additions & 3 deletions data-pipeline/src/data_pipeline/datasets/clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,11 @@ def import_clinvar_xml(clinvar_xml_path):
[
locations["GRCh37"]["locus"] if "GRCh37" in locations else "NA",
json.dumps(locations["GRCh37"]["alleles"]) if "GRCh37" in locations else "NA",
- "chr" + locations["GRCh38"]["locus"].replace("MT", "M")
- if "GRCh38" in locations
- else "NA",
+ (
+ "chr" + locations["GRCh38"]["locus"].replace("MT", "M")
+ if "GRCh38" in locations
+ else "NA"
+ ),
json.dumps(locations["GRCh38"]["alleles"]) if "GRCh38" in locations else "NA",
json.dumps(variant),
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,11 @@ def prepare_gnomad_v3_short_tandem_repeats(path):
"repeat_unit": repeat_unit,
# Loci with only one repeat unit do not have a RepeatUnitClassification field.
# In those cases, the repeat unit is pathogenic.
- "classification": locus["RepeatUnitClassification"].get(repeat_unit, "unknown").lower()
- if "RepeatUnitClassification" in locus
- else "pathogenic",
+ "classification": (
+ locus["RepeatUnitClassification"].get(repeat_unit, "unknown").lower()
+ if "RepeatUnitClassification" in locus
+ else "pathogenic"
+ ),
}
for repeat_unit in (
set(k.split("/")[2] for k in locus["AlleleCountHistogram"].keys())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,14 @@ def subset_filter(subset):
id="_".join(filter(bool, [pop, sex])),
ac=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AC, 0),
an=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AN, 0),
- hemizygote_count=0
- if sex == "XX"
- else hl.if_else(
- ds.in_autosome_or_par,
- 0,
- hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+ hemizygote_count=(
+ 0
+ if sex == "XX"
+ else hl.if_else(
+ ds.in_autosome_or_par,
+ 0,
+ hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+ )
),
homozygote_count=hl.or_else(
freq(ds, subset=subset, pop=pop, sex=sex).homozygote_count, 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,14 @@ def subset_filter(subset):
id="_".join(filter(bool, [pop, sex])),
ac=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AC, 0),
an=hl.or_else(freq(ds, subset=subset, pop=pop, sex=sex).AN, 0),
- hemizygote_count=0
- if sex == "XX"
- else hl.if_else(
- ds.in_autosome_or_par,
- 0,
- hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+ hemizygote_count=(
+ 0
+ if sex == "XX"
+ else hl.if_else(
+ ds.in_autosome_or_par,
+ 0,
+ hl.or_else(freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
+ )
),
homozygote_count=hl.or_else(
freq(ds, subset=subset, pop=pop, sex=sex).homozygote_count, 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@

AF_CUTOFF_MAPPING = hl.literal(
{
- "1.0000e-02": "af_cutoff_0_01",
- "1.0000e-03": "af_cutoff_0_001",
- "1.0000e-04": "af_cutoff_0_0001",
- "1.0000e-05": "af_cutoff_0_00001",
- "1.5000e-02": "af_cutoff_0_015",
- "2.0000e-02": "af_cutoff_0_02",
- "5.0000e-02": "af_cutoff_0_05",
- "5.0000e-03": "af_cutoff_0_005",
- "5.0000e-04": "af_cutoff_0_0005",
- "5.0000e-05": "af_cutoff_0_00005",
+ "0.01": "af_cutoff_0_01",
+ "0.001": "af_cutoff_0_001",
+ "0.0001": "af_cutoff_0_0001",
+ "0.00001": "af_cutoff_0_00001",
+ "0.015": "af_cutoff_0_015",
+ "0.02": "af_cutoff_0_02",
+ "0.05": "af_cutoff_0_05",
+ "0.005": "af_cutoff_0_005",
+ "0.0005": "af_cutoff_0_0005",
+ "0.00005": "af_cutoff_0_00005",
}
)

Expand Down Expand Up @@ -47,10 +47,10 @@ def prepare_variant_cooccurrence_counts(tsv_path, field_name_map):

def prepare_heterozygous_variant_cooccurrence_counts():
field_name_map = {
- "in_cis": "n_same_hap_without_chet_or_unphased",
- "in_trans": "n_chet",
- "unphased": "n_unphased_without_chet",
- "two_het_total": "n_any_het_het",
+ "in_cis": "n_in_cis_without_in_trans_and_unphased",
+ "in_trans": "n_in_trans",
+ "unphased": "n_unphased_without_in_trans",
+ "two_het_total": "n_two_het",
}
return prepare_variant_cooccurrence_counts(TWO_HET_DATA_PATH, field_name_map)

Expand Down

0 comments on commit 69dbfc6

Please sign in to comment.