From 16cc35a1624a970e505f063c5e1cfc34d49d71c9 Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 15 Jul 2024 13:39:43 -0400 Subject: [PATCH 01/36] Update STR data pipeline for new data Major changes here are: * Instead of a single `reference_region`, STRs now have a list of `reference_regions` with a single one designated the `main_reference_region` * Allele size distributions and genotype distributions were previously represented with an attempt to represent multidimensional data with a number of nested structs, which was serviceable when there were only one or two dimensions we might want to filter on, but was getting increasingly convoluted. Since this new data expands the number of dimensions further, rather than build on the former schema and confuse things more, these distributions are now represented with a flattened list of structs each of which represents a single subset of the distribution. --- .../gnomad_v3_short_tandem_repeats.py | 529 ++++-------------- .../pipelines/export_to_elasticsearch.py | 2 +- .../gnomad_v3_short_tandem_repeats.py | 4 +- 3 files changed, 106 insertions(+), 429 deletions(-) diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py index e27fa872d..901fca3e5 100644 --- a/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py +++ b/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py @@ -1,6 +1,4 @@ -import itertools import json -from collections import defaultdict import hail as hl @@ -8,12 +6,8 @@ def _parse_region_id(region_id): [chrom, position_range] = region_id.split(":") chrom = chrom[3:] - [start, stop] = map(int, position_range.split("-")) - return { - "chrom": chrom, - "start": start, - "stop": stop, - } + [start, stop] = list(map(int, position_range.split("-"))) + return {"chrom": chrom, "start": start, "stop": stop, "reference_genome": "GRCh38"} def _prepare_histogram(histogram): @@ -23,161 +17,6 @@ def _prepare_histogram(histogram): ) -def _population_sort_key(pop): - pop_id = pop["id"] - if pop_id == "XX" or pop_id == "XY": - return ("zzz", pop_id) - - if "_" in pop_id: - return tuple(pop_id.split("_")) - - return (pop_id, "") - - -def _get_total_histogram(histogram): - total = defaultdict(int) - for v in histogram.values(): - for k, n in v.items(): - total[k] += n - - return total - - -def _prepare_allele_size_distribution_populations(locus): - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountHistogram"].keys())) - - distributions = sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[0] == population - } - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[0] == population and k.split("/")[1] == "XX" - } - ) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[0] == population and k.split("/")[1] == "XY" - } - ) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_histogram( - _get_total_histogram( - {k: v for k, v in locus["AlleleCountHistogram"].items() if k.split("/")[1] == sex} - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - -def _prepare_allele_size_distribution_repeat_units(locus): - repeat_units = sorted(set(key.split("/")[2] for key in locus["AlleleCountHistogram"].keys())) - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountHistogram"].keys())) - - distributions = sorted( - [ - { - "repeat_unit": repeat_unit, - "distribution": _prepare_histogram( - _get_total_histogram( - {k: v for k, v in locus["AlleleCountHistogram"].items() if k.split("/")[2] == repeat_unit} - ) - ), - "populations": sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[2] == repeat_unit and k.split("/")[0] == population - } - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_histogram( - locus["AlleleCountHistogram"].get(f"{population}/XX/{repeat_unit}", {}) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_histogram( - locus["AlleleCountHistogram"].get(f"{population}/XY/{repeat_unit}", {}) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[2] == repeat_unit and k.split("/")[1] == sex - } - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ), - } - for repeat_unit in repeat_units - ], - key=lambda r: (len(r["repeat_unit"]), r["repeat_unit"]), - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - def _prepare_age_distribution(locus): age_bins = [ ("<20", None, 20), @@ -205,192 +44,6 @@ def _prepare_age_distribution(locus): ] -def _prepare_genotype_distribution_histogram(histogram): - return sorted( - ([*(int(n) for n in n_repeats.split("/")), n_samples] for n_repeats, n_samples in histogram.items()), - key=lambda value: (value[0], value[1]), - ) - - -def _filter_genotype_distribution_histogram(histogram, repeat_units=None, population=None, sex=None): - predicates = [] - if repeat_units: - predicates.append( - lambda key: tuple(sorted(key.split("/")[2:4])) in (repeat_units, tuple(reversed(repeat_units))) - ) - if population: - predicates.append(lambda key: key.split("/")[0] == population) - if sex: - predicates.append(lambda key: key.split("/")[1] == sex) - - filtered_histogram = {k: v for k, v in histogram.items() if all(predicate(k) for predicate in predicates)} - - if not repeat_units: - return filtered_histogram - - return dict( - itertools.chain( - ((k, v) for k, v in filtered_histogram.items() if tuple(k.split("/")[2:4]) == repeat_units), - ( - (f"{k}-reversed", {"/".join(reversed(vk.split("/"))): vv for vk, vv in v.items()}) - for k, v in filtered_histogram.items() - if tuple(k.split("/")[2:4]) == tuple(reversed(repeat_units)) - and tuple(k.split("/")[2:4]) != repeat_units - ), - ) - ) - - -def _prepare_genotype_distribution_populations(locus): - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountScatterPlot"].keys())) - - distributions = sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], population=population - ) - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], population=population, sex="XX" - ) - ) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], population=population, sex="XY" - ) - ) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram(locus["AlleleCountScatterPlot"], sex=sex) - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - -def _prepare_genotype_distribution_repeat_units(locus): - repeat_unit_pairs = sorted( - set(tuple(sorted(key.split("/")[2:4])) for key in locus["AlleleCountScatterPlot"].keys()) - ) - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountScatterPlot"].keys())) - - distributions = sorted( - [ - { - "repeat_units": list(repeat_unit_pair), - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], repeat_units=repeat_unit_pair - ) - ) - ), - "populations": sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], - repeat_units=repeat_unit_pair, - population=population, - ) - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], - repeat_units=repeat_unit_pair, - population=population, - sex="XX", - ) - ) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], - repeat_units=repeat_unit_pair, - population=population, - sex="XY", - ) - ) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], repeat_units=repeat_unit_pair, sex=sex - ) - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ), - } - for repeat_unit_pair in repeat_unit_pairs - ], - key=lambda r: ( - len(r["repeat_units"][0]), - len(r["repeat_units"][1]), - r["repeat_units"][0], - r["repeat_units"][1], - ), - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - def _prepare_disease_repeat_size_classifications(disease): ranges = [] @@ -414,6 +67,63 @@ def _prepare_disease_repeat_size_classifications(disease): } +def _parse_allele_count_histogram_section(cohort_key, distribution): + [ancestry_group, sex, repunit, quality_description, q_score] = cohort_key.split("/") + return { + "ancestry_group": ancestry_group, + "sex": sex, + "repunit": repunit, + "quality_description": quality_description.lower(), + "q_score": float(q_score), + "distribution": [{"repunit_count": int(k), "frequency": v} for k, v in distribution.items()], + } + + +def _prepare_allele_size_distribution(allele_count_histogram): + return [_parse_allele_count_histogram_section(k, v) for (k, v) in allele_count_histogram.items()] + + +def _parse_allele_scatter_plot_item(item): + (key, value) = item + [short_allele_repunit_count, long_allele_repunit_count] = key.split("/") + return { + "short_allele_repunit_count": int(short_allele_repunit_count), + "long_allele_repunit_count": int(long_allele_repunit_count), + "frequency": value, + } + + +def _parse_allele_scatter_plot_distribution(distribution): + return list(map(_parse_allele_scatter_plot_item, distribution.items())) + + +def _parse_allele_scatter_plot_histogram_section(cohort_key, distribution): + [ancestry_group, sex, short_allele_repunit, long_allele_repunit, quality_description, q_score] = cohort_key.split( + "/" + ) + return { + "ancestry_group": ancestry_group, + "sex": sex, + "short_allele_repunit": short_allele_repunit, + "long_allele_repunit": long_allele_repunit, + "quality_description": quality_description, + "q_score": float(q_score), + "distribution": _parse_allele_scatter_plot_distribution(distribution), + } + + +def _prepare_genotype_distribution(allele_scatter_plot_histogram): + return [_parse_allele_scatter_plot_histogram_section(k, v) for k, v in allele_scatter_plot_histogram.items()] + + +def _parse_reference_regions(regions): + # "regions" may be a single string or list of strings + + if type(regions) == str: + return [_parse_region_id(regions)] + return list(map(_parse_region_id, regions)) + + def prepare_gnomad_v3_short_tandem_repeats(path): with hl.hadoop_open(path) as input_file: data = json.load(input_file) @@ -435,7 +145,8 @@ def prepare_gnomad_v3_short_tandem_repeats(path): for disease in locus["Diseases"] ], "stripy_id": locus["STRipyName"] if "STRipyName" in locus else None, - "reference_region": {"reference_genome": "GRCh38", **_parse_region_id(locus["ReferenceRegion"])}, + "main_reference_region": _parse_region_id(locus["MainReferenceRegion"]), + "reference_regions": _parse_reference_regions(locus["ReferenceRegion"]), "reference_repeat_unit": locus["ReferenceRepeatUnit"], "repeat_units": sorted( ( @@ -456,18 +167,8 @@ def prepare_gnomad_v3_short_tandem_repeats(path): ), key=lambda r: (len(r["repeat_unit"]), r["repeat_unit"]), ), - "allele_size_distribution": { - "distribution": _prepare_histogram(_get_total_histogram(locus["AlleleCountHistogram"])), - "populations": _prepare_allele_size_distribution_populations(locus), - "repeat_units": _prepare_allele_size_distribution_repeat_units(locus), - }, - "genotype_distribution": { - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram(locus["AlleleCountScatterPlot"]) - ), - "populations": _prepare_genotype_distribution_populations(locus), - "repeat_units": _prepare_genotype_distribution_repeat_units(locus), - }, + "allele_size_distribution": _prepare_allele_size_distribution(locus["AlleleCountHistogram"]), + "genotype_distribution": _prepare_genotype_distribution(locus["AlleleCountScatterPlot"]), "age_distribution": _prepare_age_distribution(locus), "adjacent_repeats": sorted( [ @@ -482,20 +183,12 @@ def prepare_gnomad_v3_short_tandem_repeats(path): set(k.split("/")[2] for k in adjacent_repeat["AlleleCountHistogram"].keys()), key=lambda repeat_unit: (len(repeat_unit), repeat_unit), ), - "allele_size_distribution": { - "distribution": _prepare_histogram( - _get_total_histogram(adjacent_repeat["AlleleCountHistogram"]) - ), - "populations": _prepare_allele_size_distribution_populations(adjacent_repeat), - "repeat_units": _prepare_allele_size_distribution_repeat_units(adjacent_repeat), - }, - "genotype_distribution": { - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram(adjacent_repeat["AlleleCountScatterPlot"]) - ), - "populations": _prepare_genotype_distribution_populations(adjacent_repeat), - "repeat_units": _prepare_genotype_distribution_repeat_units(adjacent_repeat), - }, + "allele_size_distribution": _prepare_allele_size_distribution( + adjacent_repeat["AlleleCountHistogram"] + ), + "genotype_distribution": _prepare_genotype_distribution( + adjacent_repeat["AlleleCountScatterPlot"] + ), "age_distribution": _prepare_age_distribution(adjacent_repeat), } for adjacent_repeat_id, adjacent_repeat in locus.get("AdjacentRepeats", {}).items() @@ -506,7 +199,31 @@ def prepare_gnomad_v3_short_tandem_repeats(path): for locus in data.values() ] - return hl.Table.parallelize( + allele_size_distribution_schema = hl.tarray( + hl.tstruct( + ancestry_group=hl.tstr, + sex=hl.tstr, + repunit=hl.tstr, + quality_description=hl.tstr, + q_score=hl.tfloat, + distribution=hl.tarray(hl.tstruct(repunit_count=hl.tint, frequency=hl.tint)), + ) + ) + genotype_distribution_schema = hl.tarray( + hl.tstruct( + ancestry_group=hl.tstr, + sex=hl.tstr, + short_allele_repunit=hl.tstr, + long_allele_repunit=hl.tstr, + quality_description=hl.tstr, + q_score=hl.tfloat, + distribution=hl.tarray( + hl.tstruct(short_allele_repunit_count=hl.tint, long_allele_repunit_count=hl.tint, frequency=hl.tfloat) + ), + ) + ) + + ds = hl.Table.parallelize( ds, hl.tstruct( id=hl.tstr, @@ -521,31 +238,14 @@ def prepare_gnomad_v3_short_tandem_repeats(path): notes=hl.tstr, ) ), - reference_region=hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint), + main_reference_region=hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint), + reference_regions=hl.tarray( + hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint) + ), reference_repeat_unit=hl.tstr, repeat_units=hl.tarray(hl.tstruct(repeat_unit=hl.tstr, classification=hl.tstr)), - allele_size_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_unit=hl.tstr, - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - ) - ), - ), - genotype_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_units=hl.tarray(hl.tstr), - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - ) - ), - ), + allele_size_distribution=allele_size_distribution_schema, + genotype_distribution=genotype_distribution_schema, age_distribution=hl.tarray( hl.tstruct(age_range=hl.tarray(hl.tint), distribution=hl.tarray(hl.tarray(hl.tint))) ), @@ -556,32 +256,8 @@ def prepare_gnomad_v3_short_tandem_repeats(path): reference_region=hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint), reference_repeat_unit=hl.tstr, repeat_units=hl.tarray(hl.tstr), - allele_size_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_unit=hl.tstr, - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray( - hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint))) - ), - ) - ), - ), - genotype_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_units=hl.tarray(hl.tstr), - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray( - hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint))) - ), - ) - ), - ), + allele_size_distribution=allele_size_distribution_schema, + genotype_distribution=genotype_distribution_schema, age_distribution=hl.tarray( hl.tstruct(age_range=hl.tarray(hl.tint), distribution=hl.tarray(hl.tarray(hl.tint))) ), @@ -590,3 +266,4 @@ def prepare_gnomad_v3_short_tandem_repeats(path): ), n_partitions=1, ) + return ds diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py index 9ddfa8b7b..e91b9d2d3 100644 --- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py +++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py @@ -284,7 +284,7 @@ def add_liftover_document_id(ds): ), "args": { "index": "gnomad_v3_short_tandem_repeats", - "index_fields": ["id", "gene.ensembl_id", "reference_region"], + "index_fields": ["id", "gene.ensembl_id", "main_reference_region"], "id_field": "id", "num_shards": 1, }, diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py index 04848c871..f303e7ed7 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py @@ -8,8 +8,8 @@ pipeline.add_task( "prepare_short_tandem_repeats", prepare_gnomad_v3_short_tandem_repeats, - "/gnomad_v3/gnomad_v3_short_tandem_repeats.ht", - {"path": "gs://gcp-public-data--gnomad/release/3.1.3/json/gnomAD_STR_distributions__2022_01_20.json.gz"}, + "/gnomad_v4/gnomad_v4_short_tandem_repeats.ht", + {"path": "gs://gnomad-browser-data-pipeline/phil-scratch/gnomAD_STR_distributions__gnomad-v2__2024_06_28.json"}, ) ############################################### From 1395bf013b113b31ac2482f7ebe7baa8a697bc51 Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 15 Jul 2024 16:27:14 -0400 Subject: [PATCH 02/36] DONTMERGE rig index --- graphql-api/src/queries/short-tandem-repeat-queries.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphql-api/src/queries/short-tandem-repeat-queries.ts b/graphql-api/src/queries/short-tandem-repeat-queries.ts index d4b1eb2d2..e883835d4 100644 --- a/graphql-api/src/queries/short-tandem-repeat-queries.ts +++ b/graphql-api/src/queries/short-tandem-repeat-queries.ts @@ -5,7 +5,8 @@ import { fetchAllSearchResults } from './helpers/elasticsearch-helpers' const SHORT_TANDEM_REPEAT_INDICES = { gnomad_r3: 'gnomad_v3_short_tandem_repeats', - gnomad_r4: 'gnomad_v3_short_tandem_repeats', + // TK + gnomad_r4: 'gnomad_v3_short_tandem_repeats-2024-07-15--17-34', } const SUMMARY_FIELDS = [ From 68b7e2c8b903295ff7794e8ff2e77e97ed2c457b Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 15 Jul 2024 16:45:50 -0400 Subject: [PATCH 03/36] Update GraphQL schema for STRs to reflect new ES schema --- .../graphql/types/short-tandem-repeat.graphql | 66 +++++++++---------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/graphql-api/src/graphql/types/short-tandem-repeat.graphql b/graphql-api/src/graphql/types/short-tandem-repeat.graphql index 52dd64ae0..2ba0f2caf 100644 --- a/graphql-api/src/graphql/types/short-tandem-repeat.graphql +++ b/graphql-api/src/graphql/types/short-tandem-repeat.graphql @@ -31,43 +31,39 @@ type ShortTandemRepeatRepeatUnit { classification: String! } -type ShortTandemRepeatAlleleSizeDistributionPopulation { - id: String! - distribution: [[Int!]!]! -} - -type ShortTandemRepeatAlleleSizeDistributionRepeatUnit { - repeat_unit: String! - distribution: [[Int!]!]! - populations: [ShortTandemRepeatAlleleSizeDistributionPopulation!]! -} - -type ShortTandemRepeatAlleleSizeDistribution { +type ShortTandemRepeatAgeDistributionBin { + age_range: [Int]! distribution: [[Int!]!]! - populations: [ShortTandemRepeatAlleleSizeDistributionPopulation!]! - repeat_units: [ShortTandemRepeatAlleleSizeDistributionRepeatUnit!]! } -type ShortTandemRepeatGenotypeDistributionPopulation { - id: String! - distribution: [[Int!]!]! +type ShortTandemRepeatAlleleSizeItem { + repunit_count: Int! + frequency: Int! } -type ShortTandemRepeatGenotypeDistributionRepeatUnit { - repeat_units: [String!]! - distribution: [[Int!]!]! - populations: [ShortTandemRepeatGenotypeDistributionPopulation!]! +type ShortTandemRepeatAlleleSizeDistributionCohort { + ancestry_group: String! + sex: String! + repunit: String! + quality_description: String! + q_score: Float! + distribution: [ShortTandemRepeatAlleleSizeItem!]! } -type ShortTandemRepeatGenotypeDistribution { - distribution: [[Int!]!]! - populations: [ShortTandemRepeatGenotypeDistributionPopulation!]! - repeat_units: [ShortTandemRepeatGenotypeDistributionRepeatUnit!]! +type ShortTandemRepeatGenotypeItem { + short_allele_repunit_count: Int! + long_allele_repunit_count: Int! + frequency: Int! } -type ShortTandemRepeatAgeDistributionBin { - age_range: [Int]! - distribution: [[Int!]!]! +type ShortTandemRepeatGenotypeDistributionCohort { + ancestry_group: String! + sex: String! + short_allele_repunit: String! + long_allele_repunit: String! + quality_description: String! + q_score: Float! + distribution: [ShortTandemRepeatGenotypeItem!]! } type ShortTandemRepeatAdjacentRepeat { @@ -75,8 +71,8 @@ type ShortTandemRepeatAdjacentRepeat { reference_region: ShortTandemRepeatReferenceRegion! reference_repeat_unit: String! repeat_units: [String!]! - allele_size_distribution: ShortTandemRepeatAlleleSizeDistribution! - genotype_distribution: ShortTandemRepeatGenotypeDistribution! + allele_size_distribution: [ShortTandemRepeatAlleleSizeDistributionCohort!]! + genotype_distribution: [ShortTandemRepeatGenotypeDistributionCohort!]! age_distribution: [ShortTandemRepeatAgeDistributionBin!] } @@ -85,7 +81,8 @@ type ShortTandemRepeat { gene: ShortTandemRepeatGene! associated_diseases: [ShortTandemRepeatAssociatedDisease!]! stripy_id: String! - reference_region: ShortTandemRepeatReferenceRegion! + main_reference_region: ShortTandemRepeatReferenceRegion! + reference_regions: [ShortTandemRepeatReferenceRegion!]! reference_repeat_unit: String! } @@ -94,11 +91,12 @@ type ShortTandemRepeatDetails { gene: ShortTandemRepeatGene! associated_diseases: [ShortTandemRepeatAssociatedDisease!]! stripy_id: String - reference_region: ShortTandemRepeatReferenceRegion! + main_reference_region: ShortTandemRepeatReferenceRegion! + reference_regions: [ShortTandemRepeatReferenceRegion!]! reference_repeat_unit: String! repeat_units: [ShortTandemRepeatRepeatUnit!]! - allele_size_distribution: ShortTandemRepeatAlleleSizeDistribution! - genotype_distribution: ShortTandemRepeatGenotypeDistribution! + allele_size_distribution: [ShortTandemRepeatAlleleSizeDistributionCohort!]! + genotype_distribution: [ShortTandemRepeatGenotypeDistributionCohort!]! age_distribution: [ShortTandemRepeatAgeDistributionBin!] adjacent_repeats: [ShortTandemRepeatAdjacentRepeat!]! } From 900e7465a83e0f5752ff993b96940cf9916c59de Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Wed, 17 Jul 2024 15:39:28 -0400 Subject: [PATCH 04/36] WIP pop options refactored --- browser/src/GenePage/GenePageContainer.tsx | 2 +- .../src/RegionPage/RegionPageContainer.tsx | 2 +- .../ShortTandemRepeatPage.tsx | 244 +++++++++--------- .../ShortTandemRepeatPopulationOptions.tsx | 71 +++-- dataset-metadata/gnomadPopulations.ts | 9 +- 5 files changed, 155 insertions(+), 173 deletions(-) diff --git a/browser/src/GenePage/GenePageContainer.tsx b/browser/src/GenePage/GenePageContainer.tsx index e78711c20..6d0ecdb1b 100644 --- a/browser/src/GenePage/GenePageContainer.tsx +++ b/browser/src/GenePage/GenePageContainer.tsx @@ -263,7 +263,7 @@ query ${operationName}($geneId: String, $geneSymbol: String, $referenceGenome: R } } ` - +// TK check type Props = { datasetId: DatasetId geneIdOrSymbol: string diff --git a/browser/src/RegionPage/RegionPageContainer.tsx b/browser/src/RegionPage/RegionPageContainer.tsx index 45da7c9a5..e468a7268 100644 --- a/browser/src/RegionPage/RegionPageContainer.tsx +++ b/browser/src/RegionPage/RegionPageContainer.tsx @@ -47,7 +47,7 @@ const query = ` } } ` - +// TK check type Props = { datasetId: DatasetId regionId: string diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx index 833e19460..7b4d7edda 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx @@ -25,48 +25,53 @@ import { getGenotypeDistributionPlotAxisLabels, } from './shortTandemRepeatHelpers' import ShortTandemRepeatAdjacentRepeatSection from './ShortTandemRepeatAdjacentRepeatSection' +import { AncestryGroupId } from '@gnomad/dataset-metadata/gnomadPopulations' -type ShortTandemRepeatRepeatUnit = { - repeat_unit: string - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] +type ShortTandemRepeatReferenceRegion = { + chrom: string + start: number + stop: number +} + +type AlleleSizeDistributionItem = { + repunit_count: number + frequency: number +} + +export type Sex = '' | 'XX' | 'XY' + +type AlleleSizeDistributionCohort = { + ancestry_group: AncestryGroupId + sex: Sex + repunit: string + quality_description: string + q_score: number + distribution: AlleleSizeDistributionItem[] +} + +type GenotypeDistributionItem = { + short_allele_repunit_count: number + long_allele_repunit_count: number + frequency: number +} + +type GenotypeDistributionCohort = { + ancestry_group: string + sex: Sex + short_allele_repunit: string + long_allele_repunit: string + quality_description: string + q_score: number + distribution: GenotypeDistributionItem[] } export type ShortTandemRepeatAdjacentRepeat = { id: string - reference_region: { - chrom: string - start: number - stop: number - } + reference_region: ShortTandemRepeatReferenceRegion reference_repeat_unit: string repeat_units: string[] - allele_size_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: ShortTandemRepeatRepeatUnit[] - } - genotype_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: { - repeat_units: string[] - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - }[] - } + allele_size_distribution: AlleleSizeDistributionCohort[] + genotype_distribution: GenotypeDistributionCohort[] } export type ShortTandemRepeat = { @@ -89,39 +94,15 @@ export type ShortTandemRepeat = { notes: string | null }[] stripy_id: string | null - reference_region: { - chrom: string - start: number - stop: number - } + main_reference_region: ShortTandemRepeatReferenceRegion + reference_regions: ShortTandemRepeatReferenceRegion[] reference_repeat_unit: string repeat_units: { repeat_unit: string classification: string }[] - allele_size_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: ShortTandemRepeatRepeatUnit[] - } - genotype_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: { - repeat_units: string[] - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - }[] - } + allele_size_distribution: AlleleSizeDistributionCohort[] + genotype_distribution: GenotypeDistributionCohort[] adjacent_repeats: ShortTandemRepeatAdjacentRepeat[] } @@ -160,31 +141,44 @@ type ShortTandemRepeatPageProps = { shortTandemRepeat: ShortTandemRepeat } -const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepeatPageProps) => { - const [selectedRepeatUnit, setSelectedRepeatUnit] = useState( - shortTandemRepeat.allele_size_distribution.repeat_units.length === 1 - ? shortTandemRepeat.allele_size_distribution.repeat_units[0].repeat_unit - : '' - ) - - const [selectedPopulationId, setSelectedPopulationId] = useState('') - const [selectedScaleType, setSelectedScaleType] = useState('linear') +type ScaleType = 'linear' | 'log' +const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepeatPageProps) => { + const { allele_size_distribution, genotype_distribution } = shortTandemRepeat + + const alleleSizeDistributionRepunits = allele_size_distribution + .map((cohort) => cohort.repunit) + .sort() + const genotypeDistributionRepunitPairs = genotype_distribution + .map((cohort) => [cohort.short_allele_repunit, cohort.long_allele_repunit].join(', ')) + .sort() + + const defaultAlleleSizeRepunit = + alleleSizeDistributionRepunits.length === 1 ? alleleSizeDistributionRepunits[0] : '' + const defaultGenotypeDistributionRepunits = + genotypeDistributionRepunitPairs.length === 1 ? genotypeDistributionRepunitPairs[0] : '' + const defaultDisease = shortTandemRepeat.associated_diseases[0].name + + const [selectedAncestryGroup, setSelectedAncestryGroup] = useState('') + const [selectedSex, setSelectedSex] = useState('') + const [selectedAlleleSizeRepeatUnit, setSelectedAlleleSizeRepeatUnit] = + useState(defaultAlleleSizeRepunit) const [selectedGenotypeDistributionRepeatUnits, setSelectedGenotypeDistributionRepeatUnits] = - useState( - shortTandemRepeat.genotype_distribution.repeat_units.length === 1 - ? shortTandemRepeat.genotype_distribution.repeat_units[0].repeat_units.join(' / ') - : '' + useState(defaultGenotypeDistributionRepunits) + const [selectedDisease, setSelectedDisease] = useState(defaultDisease) + const [selectedScaleType, setSelectedScaleType] = useState('linear') + const [showAdjacentRepeats, setShowAdjacentRepeats] = useState(false) + + const ancestryGroups = [ + ...new Set(shortTandemRepeat.allele_size_distribution.map((cohort) => cohort.ancestry_group)), + ].sort() + + const maxAlleleSizeDistributionRepeats = Math.max( + ...shortTandemRepeat.allele_size_distribution.flatMap((cohort) => + cohort.distribution.map((item) => item.repunit_count) ) - - const [selectedDisease, setSelectedDisease] = useState( - shortTandemRepeat.associated_diseases[0].name ) - const [showAdjacentRepeats, setShowAdjacentRepeats] = useState(false) - - const populationIds = shortTandemRepeat.allele_size_distribution.populations.map((pop) => pop.id) - const allRepeatUnitsByClassification: Record = {} shortTandemRepeat.repeat_units.forEach((repeatUnit) => { if (allRepeatUnitsByClassification[repeatUnit.classification] === undefined) { @@ -196,9 +190,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe // This uses repeat units from shortTandemRepeat.allele_size_distribution.repeat_units because // shortTandemRepeat.repeat_units may include repeat units that do not appear in gnomAD. const repeatUnitsFoundInGnomad = new Set( - shortTandemRepeat.allele_size_distribution.repeat_units.map( - (repeatUnit) => repeatUnit.repeat_unit - ) + shortTandemRepeat.allele_size_distribution.map((cohort) => cohort.repunit) ) const repeatUnitsFoundInGnomadByClassification: Record = {} @@ -283,25 +275,22 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe {' '} + + diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx index e2ad9d95c..929f4c81c 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx @@ -1,4 +1,3 @@ -import { max, min } from 'd3-array' import React, { useState } from 'react' import styled from 'styled-components' @@ -51,7 +50,7 @@ export type AlleleSizeDistributionCohort = { distribution: AlleleSizeDistributionItem[] } -type GenotypeDistributionItem = { +export type GenotypeDistributionItem = { short_allele_repunit_count: number long_allele_repunit_count: number frequency: number @@ -123,21 +122,6 @@ const FlexWrapper = styled.div` width: 100%; ` -const parseCombinedPopulationId = (combinedPopulationId: any) => { - let population - let sex - if (combinedPopulationId.includes('_')) { - ;[population, sex] = combinedPopulationId.split('_') - } else if (combinedPopulationId === 'XX' || combinedPopulationId === 'XY') { - population = null - sex = combinedPopulationId - } else { - population = combinedPopulationId - sex = null - } - return { population, sex } -} - type ShortTandemRepeatPageProps = { datasetId: DatasetId shortTandemRepeat: ShortTandemRepeat @@ -148,12 +132,16 @@ export type ScaleType = 'linear' | 'log' const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepeatPageProps) => { const { allele_size_distribution, genotype_distribution } = shortTandemRepeat - const alleleSizeDistributionRepunits = allele_size_distribution - .map((cohort) => cohort.repunit) - .sort() - const genotypeDistributionRepunitPairs = genotype_distribution - .map((cohort) => [cohort.short_allele_repunit, cohort.long_allele_repunit].join(', ')) - .sort() + const alleleSizeDistributionRepunits = [ + ...new Set(allele_size_distribution.map((cohort) => cohort.repunit)), + ].sort() + const genotypeDistributionRepunitPairs = [ + ...new Set( + genotype_distribution.map((cohort) => + [cohort.short_allele_repunit, cohort.long_allele_repunit].join(' / ') + ) + ), + ].sort() const defaultAlleleSizeRepunit = alleleSizeDistributionRepunits.length === 1 ? alleleSizeDistributionRepunits[0] : '' @@ -166,7 +154,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe const [selectedAlleleSizeRepeatUnit, setSelectedAlleleSizeRepeatUnit] = useState(defaultAlleleSizeRepunit) const [selectedGenotypeDistributionRepeatUnits, setSelectedGenotypeDistributionRepeatUnits] = - useState(defaultGenotypeDistributionRepunits) + useState(defaultGenotypeDistributionRepunits) const [selectedDisease, setSelectedDisease] = useState(defaultDisease) const [selectedScaleType, setSelectedScaleType] = useState('linear') const [showAdjacentRepeats, setShowAdjacentRepeats] = useState(false) @@ -181,6 +169,9 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe ) ) + const maxGenotypeDistributionShortAlleleRepeats = 0 // TK + const maxGenotypeDistributionLongAlleleRepeats = 0 // TK + const allRepeatUnitsByClassification: Record = {} shortTandemRepeat.repeat_units.forEach((repeatUnit) => { if (allRepeatUnitsByClassification[repeatUnit.classification] === undefined) { @@ -447,17 +438,12 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe Genotype Distribution - max(d.slice(0, 2)) - ), - max(shortTandemRepeat.genotype_distribution.distribution, (d: any) => - min(d.slice(0, 2)) - ), + maxGenotypeDistributionLongAlleleRepeats, + maxGenotypeDistributionShortAlleleRepeats, ]} genotypeDistribution={getSelectedGenotypeDistribution(shortTandemRepeat, { selectedRepeatUnits: selectedGenotypeDistributionRepeatUnits, @@ -498,6 +484,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe /> @@ -566,7 +553,6 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe Age Distribution { + // TK figure out what's up with classification const itemsByRepunitCount: Record = shortTandemRepeatOrAdjacentRepeat.allele_size_distribution.reduce((acc, cohort) => { if (selectedAncestryGroup !== '' && cohort.ancestry_group !== selectedAncestryGroup) { From 798e8f4237c86fa360fcea364f1ffefc3c0c245a Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 22 Jul 2024 16:34:14 -0400 Subject: [PATCH 07/36] WIP age dist --- .../ShortTandemRepeatAgeDistributionPlot.tsx | 71 ++++++------------- .../ShortTandemRepeatPage.tsx | 16 ++++- 2 files changed, 35 insertions(+), 52 deletions(-) diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx index 8fa875668..f93abbac4 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx @@ -1,12 +1,12 @@ import { max } from 'd3-array' import { scaleBand, scaleLog } from 'd3-scale' -import PropTypes from 'prop-types' import React from 'react' import { withSize } from 'react-sizeme' import styled from 'styled-components' import { AxisBottom, AxisLeft } from '@visx/axis' import { TooltipAnchor } from '@gnomad/ui' +import { PlotRange, AgeDistributionItem } from './ShortTandemRepeatPage' // The 100% width/height container is necessary the component // to size to fit its container vs staying at its initial size. @@ -19,9 +19,9 @@ const GraphWrapper = styled.div` const labelProps = { fontSize: 14, textAnchor: 'middle', -} +} as const -const ageRangeLabel = (ageRange: any) => { +const ageRangeLabel = (ageRange: [number | null, number | null]) => { const [minAge, maxAge] = ageRange if (minAge === null) { @@ -33,9 +33,15 @@ const ageRangeLabel = (ageRange: any) => { return `${minAge}-${maxAge}` } +type Props = { + ageDistribution: AgeDistributionItem[] + maxRepeats: number + ranges: PlotRange[] + size: { width: number } +} + const ShortTandemRepeatAgeDistributionPlot = withSize()( - // @ts-expect-error TS(2339) FIXME: Property 'ageDistribution' does not exist on type ... Remove this comment to see the full error message - ({ ageDistribution, maxRepeats, ranges, size: { width } }) => { + ({ ageDistribution, maxRepeats, ranges = [], size: { width } }: Props) => { const height = Math.min(width, 300) const margin = { @@ -76,22 +82,19 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( } }) - ageDistribution.forEach((ageBin: any, yBinIndex: any) => { - // @ts-expect-error TS(7031) FIXME: Binding element 'repeats' implicitly has an 'any' ... Remove this comment to see the full error message + ageDistribution.forEach((ageBin, yBinIndex) => { ageBin.distribution.forEach(([repeats, nAlleles]) => { const xBinIndex = Math.floor(repeats / xBinSize) data[xBinIndex * yNumBins + yBinIndex].count += nAlleles }) }) - const xScale = scaleBand() - // @ts-expect-error TS(2345) FIXME: Argument of type 'number[]' is not assignable to p... Remove this comment to see the full error message + const xScale = scaleBand() .domain(Array.from(Array(xNumBins).keys())) .range([0, plotWidth]) const xBandwidth = xScale.bandwidth() - const yScale = scaleBand() - // @ts-expect-error TS(2345) FIXME: Argument of type 'number[]' is not assignable to p... Remove this comment to see the full error message + const yScale = scaleBand() .domain(Array.from(Array(yNumBins).keys())) .range([plotHeight, 0]) const yBandwidth = yScale.bandwidth() @@ -116,11 +119,7 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( } const opacityScale = scaleLog() - // @ts-expect-error TS(2345) FIXME: Argument of type '(string | number | undefined)[]'... Remove this comment to see the full error message - .domain([ - 1, - max(ageDistribution, (ageBin: any) => max(ageBin.distribution, (d: any) => d[1])), - ]) + .domain([1, max(ageDistribution, (ageBin) => max(ageBin.distribution, (d: any) => d[1]))]) .range([0.1, 1]) return ( @@ -129,7 +128,6 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( {ranges - .filter((range: any) => range.start !== range.stop) - .filter((range: any) => range.start <= maxRepeats) - .map((range: any, rangeIndex: any) => { + .filter((range) => range.start !== range.stop) + .filter((range) => range.start <= maxRepeats) + .map((range, rangeIndex) => { const startBinIndex = Math.floor(range.start / xBinSize) const startX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(startBinIndex) + + (xScale(startBinIndex) || 0) + ((range.start - startBinIndex * xBinSize) / xBinSize) * xBandwidth let stopX if (range.stop <= maxRepeats) { const stopBinIndex = Math.floor(range.stop / xBinSize) stopX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(stopBinIndex) + + (xScale(stopBinIndex) || 0) + ((range.stop - stopBinIndex * xBinSize) / xBinSize) * xBandwidth } else { stopX = plotWidth @@ -306,27 +300,4 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( ShortTandemRepeatAgeDistributionPlot.displayName = 'ShortTandemRepeatAgeDistributionPlot' -ShortTandemRepeatAgeDistributionPlot.propTypes = { - // @ts-expect-error TS(2322) FIXME: Type '{ ageDistribution: PropTypes.Requireable<(Pr... Remove this comment to see the full error message - ageDistribution: PropTypes.arrayOf( - PropTypes.shape({ - age_range: PropTypes.arrayOf(PropTypes.number).isRequired, - distribution: PropTypes.arrayOf(PropTypes.arrayOf(PropTypes.number)).isRequired, - }) - ), - maxRepeats: PropTypes.number.isRequired, - ranges: PropTypes.arrayOf( - PropTypes.shape({ - start: PropTypes.number.isRequired, - stop: PropTypes.number.isRequired, - label: PropTypes.string.isRequired, - }) - ), -} - -ShortTandemRepeatAgeDistributionPlot.defaultProps = { - // @ts-expect-error TS(2322) FIXME: Type '{ ranges: never[]; }' is not assignable to t... Remove this comment to see the full error message - ranges: [], -} - export default ShortTandemRepeatAgeDistributionPlot diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx index 929f4c81c..561c4de89 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx @@ -66,6 +66,11 @@ type GenotypeDistributionCohort = { distribution: GenotypeDistributionItem[] } +export type AgeDistributionItem = { + age_range: [number | null, number | null] + distribution: number[][] +} + export type ShortTandemRepeatAdjacentRepeat = { id: string reference_region: ShortTandemRepeatReferenceRegion @@ -75,6 +80,12 @@ export type ShortTandemRepeatAdjacentRepeat = { genotype_distribution: GenotypeDistributionCohort[] } +export type PlotRange = { + label: string + start: number + stop: number +} + export type ShortTandemRepeat = { id: string gene: { @@ -104,6 +115,7 @@ export type ShortTandemRepeat = { }[] allele_size_distribution: AlleleSizeDistributionCohort[] genotype_distribution: GenotypeDistributionCohort[] + age_distribution: AgeDistributionItem[] adjacent_repeats: ShortTandemRepeatAdjacentRepeat[] } @@ -209,7 +221,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe ? diseaseToPlot.repeat_size_classifications : [] - const plotRanges = repeatSizeClassificationsToPlot.map((classification) => { + const plotRanges: PlotRange[] = repeatSizeClassificationsToPlot.map((classification) => { return { label: classification.classification, start: classification.min !== null ? classification.min : 0, @@ -553,7 +565,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe Age Distribution From 83ae794e811c5080850136791b5dbca90424db3e Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Wed, 24 Jul 2024 17:23:42 -0400 Subject: [PATCH 08/36] WIP more refactor --- ...ShortTandemRepeatAdjacentRepeatSection.tsx | 76 +++++------ ...TandemRepeatAlleleSizeDistributionPlot.tsx | 2 +- ...emRepeatGenotypeDistributionBinDetails.tsx | 24 ++-- ...tGenotypeDistributionRepeatUnitsSelect.tsx | 36 ++--- .../ShortTandemRepeatPage.tsx | 79 +++++------ .../ShortTandemRepeatReads.tsx | 126 ++++++++---------- .../shortTandemRepeatHelpers.ts | 102 +++++++++++++- 7 files changed, 261 insertions(+), 184 deletions(-) diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx index 81c9bcde4..db9900076 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx @@ -1,12 +1,11 @@ -import { max, min } from 'd3-array' -import React, { useState } from 'react' +import React, { SetStateAction, useState, Dispatch } from 'react' import { Modal, Select } from '@gnomad/ui' import ControlSection from '../VariantPage/ControlSection' import ShortTandemRepeatPopulationOptions from './ShortTandemRepeatPopulationOptions' -import { ShortTandemRepeatAdjacentRepeat } from './ShortTandemRepeatPage' +import { ShortTandemRepeatAdjacentRepeat, ScaleType, Sex } from './ShortTandemRepeatPage' import ShortTandemRepeatAlleleSizeDistributionPlot from './ShortTandemRepeatAlleleSizeDistributionPlot' import ShortTandemRepeatGenotypeDistributionPlot from './ShortTandemRepeatGenotypeDistributionPlot' import ShortTandemRepeatGenotypeDistributionBinDetails from './ShortTandemRepeatGenotypeDistributionBinDetails' @@ -16,35 +15,40 @@ import { getSelectedAlleleSizeDistribution, getSelectedGenotypeDistribution, getGenotypeDistributionPlotAxisLabels, + maxAlleleSizeDistributionRepeats, + maxGenotypeDistributionRepeats, } from './shortTandemRepeatHelpers' +import { AncestryGroupId } from '@gnomad/dataset-metadata/gnomadPopulations' +import { Bin as GenotypeBin } from './ShortTandemRepeatGenotypeDistributionPlot' type Props = { adjacentRepeat: ShortTandemRepeatAdjacentRepeat - populationIds: string[] - selectedPopulationId: string - onSelectPopulationId: (...args: any[]) => any - selectedScaleType: string - onSelectScaleType: (...args: any[]) => any + selectedScaleType: ScaleType + selectedAncestryGroup: AncestryGroupId | '' + selectedSex: Sex | '' + ancestryGroups: AncestryGroupId[] + selectedGenotypeDistributionBin: GenotypeBin | null + setSelectedGenotypeDistributionBin: Dispatch> + setSelectedScaleType: Dispatch> + setSelectedAncestryGroup: Dispatch> + setSelectedSex: Dispatch> } const ShortTandemRepeatAdjacentRepeatSection = ({ adjacentRepeat, - populationIds, - selectedPopulationId, - onSelectPopulationId, + ancestryGroups, selectedScaleType, - onSelectScaleType, + selectedAncestryGroup, + selectedSex, + setSelectedScaleType, + setSelectedAncestryGroup, + setSelectedSex, }: Props) => { const [selectedRepeatUnit, setSelectedRepeatUnit] = useState( adjacentRepeat.repeat_units.length === 1 ? adjacentRepeat.repeat_units[0] : '' ) - const [selectedGenotypeDistributionRepeatUnits, setSelectedGenotypeDistributionRepeatUnits] = - useState( - adjacentRepeat.genotype_distribution.repeat_units.length === 1 - ? adjacentRepeat.genotype_distribution.repeat_units[0].repeat_units.join(' / ') - : '' - ) + useState(defaultGenotypeDistributionRepeatUnits) const [selectedGenotypeDistributionBin, setSelectedGenotypeDistributionBin] = useState(null) @@ -55,14 +59,10 @@ const ShortTandemRepeatAdjacentRepeatSection = ({

Allele Size Distribution