diff --git a/.github/workflows/data-pipeline-ci.yml b/.github/workflows/data-pipeline-ci.yml
index 1924adb7b..4e96e18a5 100644
--- a/.github/workflows/data-pipeline-ci.yml
+++ b/.github/workflows/data-pipeline-ci.yml
@@ -18,7 +18,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v2
with:
- python-version: 3.7
+ python-version: 3.9
- name: Use pip cache
uses: actions/cache@v2
with:
@@ -29,10 +29,10 @@ jobs:
- name: Install dependencies
run: |
pip install wheel
- pip install -r requirements-dev.txt
- pip install hail
pip install -r data-pipeline/requirements.txt
- name: Check formatting
run: black --check data-pipeline/src/data_pipeline
- - name: Run Pylint
- run: pylint --disable=fixme data-pipeline/src/data_pipeline
+ - name: Run Ruff
+ run: ruff data-pipeline/src/data_pipeline
+ - name: Run Pyright
+ run: pyright --project data-pipeline
diff --git a/browser/about/about.md b/browser/about/about.md
index b94952185..7ad782f90 100644
--- a/browser/about/about.md
+++ b/browser/about/about.md
@@ -13,7 +13,7 @@ The aggregation and release of summary data from the exomes and genomes collecte
## Stats
- v4 release is composed of 730,947 exomes and 76,215 genomes (GRCh38)
-- gnomAD v4 structural variants (SV) represent 63,057 genomes (GRCh38)
-- gnomAD v4 copy number variants (CNV) represent variants in less than 1% of 464,566 exomes (GRCh38)
+- gnomAD v4 structural variants (SV) represent 63,046 genomes (GRCh38)
+- gnomAD v4 copy number variants (CNV) represent variants in less than 1% of 464,297 exomes (GRCh38)
For more Stats on gnomAD v4 please see our [stats page](/stats)
diff --git a/browser/help/faq/constraint/how-was-the-expected-number-of-variants-determined.md b/browser/help/faq/constraint/how-was-the-expected-number-of-variants-determined.md
index cbf3603dd..54b689b89 100644
--- a/browser/help/faq/constraint/how-was-the-expected-number-of-variants-determined.md
+++ b/browser/help/faq/constraint/how-was-the-expected-number-of-variants-determined.md
@@ -2,4 +2,4 @@
question: 'How was the expected number of variants determined?'
---
-We used a mutational model that accounts for local sequence context, CpG methylation, and sequencing depth to predict the number of expected single nucleotide variants per functional class per gene. More details can be found in the help section on [gene constraint](/help/constraint) and in [Karczewski _et al._ Nature 2020](https://doi.org/10.1038/s41586-020-2308-7). Note that the expected variant counts for bases with a median depth <1 were removed from the totals. In v4, we applied our mutational model only to sites with a median depth in the exomes ≥30.
+We used a mutational model that accounts for local sequence context, CpG methylation, and sequencing depth to predict the number of expected single nucleotide variants per functional class per gene. More details can be found in the help section on [gene constraint](/help/constraint) and in [Karczewski _et al._ Nature 2020](https://doi.org/10.1038/s41586-020-2308-7). Note that the expected variant counts for bases with a median depth <1 were removed from the totals.
diff --git a/browser/help/faq/constraint/what-are-the-fields-included-in-constraint-files.md b/browser/help/faq/constraint/what-are-the-fields-included-in-constraint-files.md
index 77ac32766..f797370e0 100644
--- a/browser/help/faq/constraint/what-are-the-fields-included-in-constraint-files.md
+++ b/browser/help/faq/constraint/what-are-the-fields-included-in-constraint-files.md
@@ -2,4 +2,4 @@
question: 'What are the fields included in constraint files?'
---
-Descriptions of the fields in these files can be found in the [README file](/downloads#v4-variants) supplied with the download.
+Descriptions of the fields in these files can be found in the Supplementary Dataset 11 section on pages 74-77 of the [Supplementary Information](https://www.nature.com/articles/s41586-020-2308-7#Sec12) of [_The mutational constraint spectrum quantified from variation in 141,456 humans._ Nature 581, 434–443 (2020)](https://doi.org/10.1038/s41586-020-2308-7).
diff --git a/browser/help/faq/constraint/why-are-constraint-metrics-missing-for-this-gene-or-annotated-with-a-note.md b/browser/help/faq/constraint/why-are-constraint-metrics-missing-for-this-gene-or-annotated-with-a-note.md
index 95243432d..ed3b1b57b 100644
--- a/browser/help/faq/constraint/why-are-constraint-metrics-missing-for-this-gene-or-annotated-with-a-note.md
+++ b/browser/help/faq/constraint/why-are-constraint-metrics-missing-for-this-gene-or-annotated-with-a-note.md
@@ -2,16 +2,14 @@
question: 'Why are constraint metrics missing for this gene or annotated with a note?'
---
-Genes that were outliers in certain assessments will not have constraint metrics or will be flagged with a note warning of various error modes. Please note that these assessments were applied to the canonical transcripts of the genes. If a gene was not annotated as a protein-coding gene in GENCODE v19, we did not calculate constraint. The following list describes the reason names given in the constraint_flag column of the [constraint files](/downloads#v4-constraint):
+Genes that were outliers in certain assessments will not have constraint metrics or will be flagged with a note warning of various error modes. Please note that these assessments were applied to the canonical transcripts of the genes. If a gene was not annotated as a protein-coding gene in GENCODE v19, we did not calculate constraint. The following list describes the reason names given in the constraint_flag column of the [constraint files](/downloads#v2-constraint):
-- `no_variants`: Zero observed synonymous, missense, pLoF variants
-- `no_exp_lof`: Zero expected pLoF variants
-- `outlier_lof`: Number of pLoF variants is significantly different than expectation
-- `no_exp_mis`: Zero expected missense variants
-- `outlier_mis`: Number of missense variants is significantly different than expectation
-- `no_exp_syn`: Zero expected synonymous variants
-- `outlier_syn`: Number of synonymous variants is significantly different than expectation
+- no_variants: Zero observed synonymous, missense, pLoF variants
+- no_exp_lof: Zero expected pLoF variants
+- lof_too_many: More pLoF variants than expected
+- no_exp_mis: Zero expected missense variants
+- mis_too_many: More missense variants than expected
+- no_exp_syn: Zero expected synonymous variants
+- syn_outlier: More or fewer synonymous variants than expected
Possible reasons that one might observe the deviations listed above include mismapped reads due to homologous regions or poor quality sequencing data.
-
-Currently, constraint scores are only available for autosomes. We will release scores for chromosomes X in the near future.
diff --git a/browser/help/faq/constraint/why-are-there-fewer-variants-in-the-constraint-table-than-on-the-gene-page.md b/browser/help/faq/constraint/why-are-there-fewer-variants-in-the-constraint-table-than-on-the-gene-page.md
index d837f77f3..bbe90b096 100644
--- a/browser/help/faq/constraint/why-are-there-fewer-variants-in-the-constraint-table-than-on-the-gene-page.md
+++ b/browser/help/faq/constraint/why-are-there-fewer-variants-in-the-constraint-table-than-on-the-gene-page.md
@@ -2,4 +2,4 @@
question: 'Why are there fewer variants in the constraint table than displayed on the gene page?'
---
-We only included single nucleotide variants that were found in the MANE Select (v3 and v4 on GRCh38) or canonical (ExAC and v2 on GRCh37/hg19) transcript of the gene. On the gene page, variants found in all transcripts are displayed. Additionally, both observed and expected variant counts were removed for sites with a median depth < 30.
+We only included single nucleotide variants that were found in the canonical (ExAC and v2 on GRCh37/hg19) transcript of the gene. On the gene page, variants found in all transcripts are displayed.
diff --git a/browser/help/faq/general/what-features-are-not-yet-in-v4-and-where-can-i-find-them.md b/browser/help/faq/general/what-features-are-not-yet-in-v4-and-where-can-i-find-them.md
index 93e2b734a..46938b40f 100644
--- a/browser/help/faq/general/what-features-are-not-yet-in-v4-and-where-can-i-find-them.md
+++ b/browser/help/faq/general/what-features-are-not-yet-in-v4-and-where-can-i-find-them.md
@@ -2,7 +2,7 @@
question: 'What features are not yet in v4 and where can I find them?'
---
-The v4.0 release is a minimum viable product (MVP) release, which allows us to get the most critical piece of the gnomAD database, high quality aggregate allele frequencies and updated constraint metrics, to our users as soon as possible. It also means that a few of the existing features found in v2 or v3 are not yet included in v4 but **will be coming soon**.
+The v4.0 release is a minimum viable product (MVP) release, which allows us to get the most critical piece of the gnomAD database, high quality aggregate allele frequencies, to our users as soon as possible. It also means that a few of the existing features found in v2 or v3 are not yet included in v4 but **will be coming soon**.
Below is a list of all features not included in the v4 MVP and where to find them in our past datasets until we are able to add them to v4:
@@ -10,6 +10,7 @@ Below is a list of all features not included in the v4 MVP and where to find the
| Non MVP feature | Past versions with this data |
| ----------------------------------------------- | ----------------------------------------------------- |
+| Gene constraint | v2 gene page |
| Pext score | v2 gene page |
| Sub-genetic ancestry groups (prevously subpops) | v2 variant page |
| Multi Nucleotide (MNV) calls | v2 variant table and variant page |
diff --git a/browser/help/topics/constraint.md b/browser/help/topics/constraint.md
index 575f4bced..04fc7e8e8 100644
--- a/browser/help/topics/constraint.md
+++ b/browser/help/topics/constraint.md
@@ -3,69 +3,50 @@ id: constraint
title: 'Gene constraint'
---
-Genetic constraint is a measure of the amount that a genomic region is under negative selection. Knowing how much a gene is constrained (e.g., intolerant to mutational changes) can help prioritize variants that are more likely to have a deleterious functional impact. In order to measure a gene’s intolerance to variation, we developed a mutational model that predicts the number of variants expected to be seen in the gnomAD dataset for a given transcript based on local sequence context and CpG methylation levels. We then compare those per-transcript expectations to the observed amount of variation, and consider transcripts that are significantly depleted of their expected variation to be constrained against, or intolerant of, such variation. As detailed below, we have developed three gene-level constraint metrics. **We recommend using the `LOEUF` (loss-of-function observed / expected upper bound fraction) score displayed in the constraint table**, though we also display the probability of being loss-of-function intolerant (`pLI`) score and z-score for all genes, when available.
+With gnomAD, we have shifted from using the _probability of being loss-of-function intolerant_ (`pLI`) score developed with ExAC and now recommend using the _observed / expected_ score. For this reason, the constraint table displayed on the browser (unless the ExAC data is selected) now also shows the _observed / expected_ (`oe`) metric. It is very important to note that the scale of `oe` is very different from that of `pLI`; in particular low `oe` values are indicative of strong intolerance. In addition, while `pLI` incorporated the uncertainty around low counts (i.e., a gene with low expected count could not have a high `pLI`), `oe` does not. Therefore, the `oe` metric comes with a 90% CI. It is important to consider the confidence interval when using `oe`.
+The change from `pLI` to `oe` was motivated mainly by its easier interpretation and its continuity across the spectrum of selection. As an example, let’s take a gene with a `pLI` of 0.8: this means that this gene cannot be categorized as a highly likely haploinsufficient gene based on our data. However, it is unclear whether this value was obtained because of sample size or because there were too many loss-of-function (LoF) variants observed in the gene. In addition, if the cause was the latter, `pLI` doesn’t tell much about the overall selection against loss-of-function in this gene. On the other hand, a gene with an LoF `oe` of 0.4 can clearly be interpreted as a gene where only 40% of the expected loss-of-function variants were observed and therefore is likely under selection against LoF variants. In addition, the 90% CI allows us to clearly distinguish cases where there is a lot of uncertainty about the constraint for that gene due to sample size.
+Since `pLI` > 0.9 is widely used in research and clinical interpretation of Mendelian cases, we suggest using the upper bound of the `oe` confidence interval < 0.35 if a hard threshold is needed. Note that we also provide `pLI` values computed with gnomAD.
+Since `pLI` > 0.9 is widely used in research and clinical interpretation of Mendelian cases, we suggest using the upper bound of the `oe` confidence interval (which we term the "loss-of-function observed/expected upper bound fraction" or "LOEUF") < 0.35 if a hard threshold is needed. Note that we also provide `pLI` values computed with gnomAD.
-The sections below will review:
+The sections below give an explanation of both the _observed / expected_ and the _probability of being loss-of-function intolerant_ scores.
-- [Methods](/help/constraint#methods)
-- [Details on each of the scores](/help/constraint#scores)
- - [LOEUF: loss-of-function observed / expected upper bound fraction](/help/constraint#loeuf)
- - [pLI: probability of being loss-of-function intolerant](/help/constraint#pli)
- - [Z score](/help/constraint#z-score)
-- [An explanation of “What is the difference between the LOEUF and pLI score?](/help/constraint#loeuf-vs-pli)”
+### Observed / expected (`oe`)
-More details on these methods can be found in the supplement of [Lek _et al._ Nature 2016](https://www.nature.com/articles/nature19057) and [Karczewski _et al._ Nature 2020](https://www.nature.com/articles/s41586-020-2308-7).
-
-### Methods
-
-#### Transcripts included in the analyses
-
-We used the MANE select transcripts of protein-coding genes as defined by GENCODE v39.
-
-We excluded 165 transcripts that had zero observed variants when removing exons with a median depth < 1 as well as 31,991 transcripts that had either (1) far too many synonymous and missense variants as determined by a Z score (p < 10-4 and 10-3, respectively) or (2) far too few synonymous and missense variants as determined by a Z score (p < 10-4 and 10-3, respectively). When all outliers were removed, there were 13,313 transcripts left for analyses.
-
-#### Observed variant count
-
-The observed variant count is the number of unique single nucleotide variants in the transcript with minor allele frequency (MAF) < 0.1% and median depth in the exome samples ≥ 30. Variants with MAFs over 0.1% were not included; the rationale behind this choice is that, for pLoF variants, the total number of false positives far outweighs the number of true common variants.
-
-#### Expected variant count
-
-We calculate the expected number of variants for all bases with median depth ≥ 30 in our exome samples using a mutational model that corrects for local sequence context and CpG methylation levels. Previously, we used this same mutational model, but corrected for depth; more details on the previous approach can be found in section 4.1 of the supplement in [Karczewski _et al._ Nature 2020](https://www.nature.com/articles/s41586-020-2308-7).
-
-#### pLoF Variant types
-
-For pLoF counts, only nonsense, splice donor and acceptor site variants caused by single nucleotide changes and called as high confidence by [LOFTEE](https://gnomad.broadinstitute.org/help/vep#loftee) were counted. This is because the mutation model does not account for insertions and deletions that underlie frameshift variants.
-
-### Scores
-
-#### Observed / expected (`oe`) and the Loss-of-function Observed / expected upper bound fraction (`LOEUF`) score
-
-We have calculated the ratio of the observed / expected (`oe`) number of loss-of-function variants for all bases of sufficient depth in the MANE Select (v4 on GRCh38) or canonical (ExAC and v2 on GRCh37) and other non-Select/canonical transcript for each gene. The expected counts are based on a mutational model that takes sequence context and methylation into account.
+The constraint score shown in gnomAD is the ratio of the observed / expected (`oe`) number of loss-of-function variants in that gene. The expected counts are based on a mutational model that takes sequence context, coverage and methylation into account.
#### Interpretation
-Observed/expected (`oe`) is a continuous measure of how tolerant a gene is to a certain class of variation (e.g. loss-of-function). When a gene has a low `oe` value, it is under stronger selection for that class of variation than a gene with a higher value. Because counts depend on gene size and sample size, the precision of the `oe` values varies a lot from one gene to the next. Therefore in addition to the `oe` value, we also display the 90% confidence interval (CI) for each of the `oe` values.
+Observed/expected (`oe`) is a continuous measure of how tolerant a gene is to a certain class of variation (e.g. loss-of-function). When a gene has a low `oe` value, it is under stronger selection for that class of variation than a gene with a higher value. Because counts depend on gene size and sample size, the precision of the `oe` values varies a lot from one gene to the next. Therefore in addition to the `oe` value, we also display the 90% confidence interval (CI) for each of the `oe` values. When evaluating how constrained a gene is, it is essential to take the 90% CI into consideration.
+Although `oe` is a continuous value, we understand that it can be useful to use a threshold for certain applications. In particular, for the interpretation of Mendelian disease cases, we suggest using the upper bound of the `oe` CI < 0.35 as a threshold if needed. Again, ideally `oe` should be used as a continuous value rather than a cutoff and evaluating the `oe` 90% CI is a must.
-When evaluating how constrained a gene is, it is essential to take the 90% CI into consideration. In particular, we suggest using the upper bound of that CI, which is also known as the `LOEUF` (“loss-of-function observed/expected upper bound fraction”) score. `LOEUF` is therefore a conservative estimate of the observed/expected ratio, based on the upper bound of a Poisson-derived confidence interval around the ratio. Low `LOEUF` scores indicate strong selection against predicted loss-of-function (pLoF) variation in a given gene, while high `LOEUF` scores suggest a relatively higher tolerance to inactivation.
+### Probability of being loss-of-function intolerant (`pLI`)
-One advantage of `oe` and `LOEUF` compared to `pLI` are that they are more direct measures of biological significance, and can be easily used as continuous values. For example, a doubling of `oe` from 0.2 to 0.4 conveys that 20% vs 40% of the expected number of variants has been observed in gnomAD. By contrast, a doubling of the `pLI` score (e.g., 0.45 to 0.9) is less immediately interpretable as `pLI` is fairly dichotomous with nearly all genes having scores < 0.1 or > 0.9. Intermediate `pLI` scores (0.1-0.9) are typically an indication that the gene was too small to be confidently categorized.
+#### Overall interpretation
-Although `oe` and `LOEUF` are continuous values, we understand that it can be useful to use a threshold for certain applications. In particular, for the interpretation of Mendelian disease cases, we suggest using a `LOEUF` score < 0.6 as a threshold if needed. Again, ideally `oe` and `LOEUF` should be used as a continuous values rather than a cutoff.
+We developed metrics to measure a transcript's intolerance to variation by predicting the number of variants expected to be seen in the gnomAD dataset and comparing those expectations to the observed amount of variation. Transcripts that are significantly depleted of their expected variation are considered constrained, or intolerant, of such variation.
+More specifically, for synonymous and missense variation, we created a signed Z score of the deviation of observed counts from the expected number. Positive Z scores indicate increased constraint (intolerance to variation) and therefore that the transcript had fewer variants than expected. Negative Z scores were given to transcripts that had more variants than expected.
+For protein-truncating variation, we assume that there are three classes of genes with respect to tolerance to loss of gene function: null (where loss of both copies of the gene is tolerated), recessive (where loss of a single copy of the gene is tolerated, but not loss of both copies), and haploinsufficient (where loss of a single copy of the gene is not tolerated). We used the observed and expected variant counts to determine the probability that a given transcript is extremely intolerant of loss-of-function variation (i.e., falls into the third category). The closer pLI is to one, the more intolerant of protein-truncating variants the transcript appears to be. We consider transcripts with pLI ≥ 0.9 to be an extremely intolerant set.
+More details can be found in the supplement of [Lek _et al._ Nature 2016](https://www.nature.com/articles/nature19057).
-As mentioned above, `oe` and `LOEUF` are dependent on sample size and we note that these values are slightly higher in v4 compared to v2 for all genes. The major impact of this is that any `LOEUF` thresholds used on v2 will not give an equivalent number of genes when applied to v4. This rise in `oe` is anticipated, particularly as we are now able to sample variants with a much lower population allele frequency than before (e.g., 1 in ~125,000 individuals vs 1 in ~730,000 individuals).
+#### Transcripts included in the analyses
-#### Probability of being loss-of-function intolerant (`pLI`)
+We used the canonical transcripts of protein-coding genes as defined by GENCODE v19. We removed transcripts that lacked a methionine at the start of the coding sequence, lacked a stop codon at the end of the coding sequence, or had a coding sequence length not divisible by three, which left 19,621 transcripts. Additionally, we excluded 795 transcripts that had zero observed variants when removing exons with a median depth < 1 as well as 251 transcripts that had either (1) far too many synonymous and missense variants as determined by a Z score (p < 10⁻⁴ and 10⁻³, respectively) or (2) far too few synonymous and missense variants as determined by a Z score (p < 10⁻⁴ and 10⁻³, respectively). When all outliers were removed, there were 18,225 transcripts left for analyses.
-`pLI` is based on the underlying premise that we can assign genes to three natural categories with respect to sensitivity to loss-of-function variation: null (tolerant; where loss-of-function variation – heterozygous or homozygous - is completely tolerated by natural selection), recessive (where heterozygous variants are tolerated but homozygous ones are not), and haploinsufficient (where heterozygous loss-of-function variants are not tolerated). In order to create these metrics, we assumed that tolerant genes would have the expected amount of loss-of-function variation and then took the empirical observed/expected rate of loss-of-function variation for recessive disease genes (0.706) and severe haploinsufficient genes (0.207) to represent the average outcome of the homozygous and heterozygous intolerant scenarios, respectively. We then used an expectation-maximization (EM) algorithm to assign each transcript a probability of belonging to each category. `pLI` is the probability of belonging to the haploinsufficient class of genes. We have updated the empirical observed/expected rate of loss-of-function variants from previous releases. More details on the original formulation of pLI can be found in section 4.4 of the supplement in [Lek _et al._ Nature 2016](https://www.nature.com/articles/nature19057).
+#### Observed variant count
-#### Synonymous and missense (Z scores)
+The observed variant count is the number of unique single nucleotide variants in the transcript with 123 or fewer alternative alleles (minor allele frequency < 0.1%). Variants in exons with a median depth < 1 were removed from the total counts.
+
+#### Expected variant count
-For synonymous and missense variation, we created a signed Z score of the deviation of observed counts from the expected number. Positive Z scores indicate increased constraint (intolerance to variation) and therefore that the transcript had fewer variants than expected. Negative Z scores were given to transcripts that had more variants than expected.
+We used a depth-corrected probability of mutation for each gene to predict the expected variant counts. More details can be found in section 4.1 of the supplement in [Lek _et al._ Nature 2016](https://www.nature.com/articles/nature19057). Expected variants in exons with a median depth < 1 were removed from the total counts.
-To generate Z scores, we used a previously described, but slightly modified, sequence-context based mutational model to predict the number of expected rare (minor allele frequency < 0.1%) variants per transcript at well covered sites. We then calculated the chi-squared value for the deviation of observation from expectation for each mutational class (synonymous and missense). The square root of these values was taken and multiplied by -1 if the number of observed variants was greater than expectation or 1 if observed counts were smaller than expected. The synonymous Z scores were then corrected by dividing each score by the standard deviation of all synonymous Z scores in between -5 and 5. For the missense Z scores, we took all Z scores between -5 and 0 and created a mirrored distribution. The missense Z scores were then corrected by dividing each score by the standard deviation of these mirror distributions.
+#### Synonymous and missense Z scores
-For more information, see [Samocha _et al._ Nature Genetics 2014](https://www.nature.com/articles/ng.3050) and [Lek _et al._ Nature 2016](https://www.nature.com/articles/nature19057).
+Higher (more positive) Z scores indicate that the transcript is more intolerant of variation (more constrained).
+To generate Z scores, we used a previously described, but slightly modified, sequence-context based mutational model to predict the number of expected rare (minor allele frequency < 0.1%) variants per transcript. We then calculated the chi-squared value for the deviation of observation from expectation for each mutational class (synonymous and missense). The square root of these values was taken and multiplied by -1 if the number of observed variants was greater than expectation or 1 if observed counts were smaller than expected. The synonymous Z scores were then corrected by dividing each score by the standard deviation of all synonymous Z scores in between -5 and 5. For the missense Z scores, we took all Z scores between -5 and 0 and created a mirrored distribution. The missense Z scores were then corrected by dividing each score by the standard deviation of these mirror distributions.
+For more information, see [Samocha _et al._ Nature Genetics 2014](https://www.nature.com/articles/ng.3050) and [Lek _et al._ Nature 2016](https://www.nature.com/articles/nature19057).
-#### What is the difference between the oe/LOEUF and pLI score?
+#### pLI (probability of being loss-of-function intolerant)
-It is very important to note that `oe` (and thereby `LOEUF`) score is very different from that of `pLI`; in particular low `oe` values are indicative of strong intolerance, whereas high `pLI` scores indicate intolerance. In addition, while `pLI` incorporated the uncertainty around low counts (i.e a gene with low expected count, due to small size or low coverage, could not have a high `pLI`), `oe` does not. Therefore, the `oe` metric comes with a 90% CI. It is important to consider the confidence interval when using `oe`. The change from `pLI` to `oe` was motivated mainly by its easier interpretation and its continuity across the spectrum of selection. As an example, let’s take a gene with a `pLI` of 0.8: this means that this gene cannot be categorized as a highly likely haploinsufficient gene based on our data. However, it is unclear whether this value was obtained because of small sample or gene size or because there were too many loss-of-function (LoF) variants observed in the gene. In addition, if the cause was the latter, `pLI` doesn’t tell much about the overall selection against loss-of-function in this gene. On the other hand, a gene with an LoF `oe` of 0.4 can clearly be interpreted as a gene where only 40% of the expected loss-of-function variants were observed and therefore is likely under selection against LoF variants. In addition, the 90% CI allows us to clearly distinguish cases where there is a lot of uncertainty about the constraint for that gene due to sample size. Since `pLI` > 0.9 is widely used in research and clinical interpretation of Mendelian cases, we suggest using the upper bound of the `oe` confidence interval (which we term the "loss-of-function observed/expected upper bound fraction" or "`LOEUF`") < 0.6 if a hard threshold is needed.
+pLI scores closer to one indicate more intolerance to protein-truncating variation. For a set of transcripts intolerant of protein-truncating variation, we suggest pLI ≥ 0.9.
+pLI is based on the underlying premise that we could assign genes to three natural categories with respect to sensitivity to protein-truncating variation: null (where protein-truncating variation – heterozygous or homozygous – is completely tolerated by natural selection), recessive (where heterozygous variants are tolerated but homozygous ones are not), and haploinsufficient (where heterozygous protein-truncating variants are not tolerated). In order to create this metric, we assumed that tolerant (null) genes would have the expected amount of protein-truncating variation and then took the empirical observed/expected rate of protein-truncating variation for recessive disease genes (0.463) and severe haploinsufficient genes (0.089) to represent the average outcome of the homozygous and heterozygous intolerant scenarios, respectively. We then used an expectation-maximization (EM) algorithm to assign each transcript a probability of belonging to each category. pLI is the probability of belonging to the haploinsufficient class of genes. More details can be found in section 4.4 of the supplement in [Lek _et al._ Nature 2016](https://www.nature.com/articles/nature19057).
diff --git a/browser/help/topics/structural-variants/sv-overview.md b/browser/help/topics/structural-variants/sv-overview.md
index 85d18d358..df35cfaa0 100644
--- a/browser/help/topics/structural-variants/sv-overview.md
+++ b/browser/help/topics/structural-variants/sv-overview.md
@@ -13,7 +13,7 @@ Due to their size and disruptive nature, SVs can often result in alterations to
### Description of gnomAD SV data
-The gnomAD v4 release has two SV data sets, 1) those detected in 63,046 unrelated genomes 2) and those detected in 464,566 exomes, the latter excluding common CNVs above 1%. In the gnomAD browser, we provide site, frequency, and annotation information for 1,199,106 high-quality SVs, as well as 66,826 high-quality rare coding CNVs. As with the gnomAD short variant data set, we have removed cohorts recruited for severe pediatric disease.
+The gnomAD v4 release has two SV data sets, 1) those detected in 63,046 unrelated genomes 2) and those detected in 464,297 exomes, the latter excluding common CNVs above 1%. In the gnomAD browser, we provide site, frequency, and annotation information for 1,199,117 high-quality SVs, as well as 66,903 high-quality rare coding CNVs. As with the gnomAD short variant data set, we have removed cohorts recruited for severe pediatric disease.
We have also produced VCF and BED files for both datasets, which are available via [the gnomAD Downloads page](https://gnomad.broadinstitute.org/downloads).
diff --git a/browser/package.json b/browser/package.json
index d07ff1517..2e1ef116c 100644
--- a/browser/package.json
+++ b/browser/package.json
@@ -59,5 +59,8 @@
"jest-styled-components": "^6.0.0",
"ts-migrate": "^0.1.30",
"typescript": "^4.7.4"
+ },
+ "volta": {
+ "node": "14.21.3"
}
}
diff --git a/browser/src/ClinvarVariantsTrack/ClinvarVariantTrack.tsx b/browser/src/ClinvarVariantsTrack/ClinvarVariantTrack.tsx
index 7d48b8671..758a99fef 100644
--- a/browser/src/ClinvarVariantsTrack/ClinvarVariantTrack.tsx
+++ b/browser/src/ClinvarVariantsTrack/ClinvarVariantTrack.tsx
@@ -134,7 +134,7 @@ const ClinvarVariantTrack = ({ referenceGenome, transcripts, variants }: Props)
// @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message
includedConsequenceCategories[getCategoryFromConsequence(v.major_consequence)] &&
(!showOnlyGnomad || v.in_gnomad) &&
- (v.gold_stars >= starFilter)
+ v.gold_stars >= starFilter
)
return (
@@ -229,7 +229,7 @@ const ClinvarVariantTrack = ({ referenceGenome, transcripts, variants }: Props)