From 148f1fdbbfd7ab7fa1170d94f7c5064bff54a8ff Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 10 Jul 2024 10:54:49 +0100 Subject: [PATCH 01/13] Remove references to multiple cohorts --- docs/how-to/samplesheet.rst | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/how-to/samplesheet.rst b/docs/how-to/samplesheet.rst index 49cb648f..ede58d8b 100644 --- a/docs/how-to/samplesheet.rst +++ b/docs/how-to/samplesheet.rst @@ -61,12 +61,11 @@ There are four mandatory columns: Notes ~~~~~ -.. note:: Multiple samplesheet rows are typically only needed if: - - - The target genomes are split to have a one file per chromosome - - You're working with multiple cohorts simultaneously +.. danger:: Always include every target genome chromosome in your samplesheet unless you're certain that missing chromosomes aren't in the scoring files + +.. note:: Multiple samplesheet rows are typically only needed if the target genomes are split to have one file per chromosome -.. danger:: All samplesets have to be in the same genome build (either GRCh37 or +.. danger:: All target genome files have to be in the same genome build (either GRCh37 or GRCh38) which is specified using the ``--target_build [GRCh3#]`` command. All scoring files are downloaded or mapped to match the specified genome build, no liftover/re-mapping of the genotyping data is performed @@ -90,10 +89,7 @@ There is one optional column: imputation tools (Michigan or TopMed Imputation Servers) that output dosages for the ALT allele(s): to extract these data users should enter ``DS`` in this column. -An example of a samplesheet with two VCF datasets where you'd like to import -different genotypes from each is below: - -.. list-table:: Example samplesheet with genotype field set +.. 
list-table:: Example samplesheet with genotype field set to hard-calls (default) :header-rows: 1 * - sampleset @@ -106,6 +102,15 @@ different genotypes from each is below: - 22 - vcf - ``GT`` + +.. list-table:: Example samplesheet with genotype field set to dosage + :header-rows: 1 + + * - sampleset + - path_prefix + - chrom + - format + - vcf_genotype_field * - cineca_imputed - path/to/vcf_imputed - 22 From 860b12fa0ffc0c636fb2fecbc3a584288dc9f126 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 24 Jul 2024 11:07:20 +0100 Subject: [PATCH 02/13] remove duplicated changelog --- docs/_templates/globaltoc.html | 2 +- docs/changelog.rst | 332 --------------------------------- 2 files changed, 1 insertion(+), 333 deletions(-) delete mode 100644 docs/changelog.rst diff --git a/docs/_templates/globaltoc.html b/docs/_templates/globaltoc.html index f54562ec..048f9eef 100644 --- a/docs/_templates/globaltoc.html +++ b/docs/_templates/globaltoc.html @@ -22,7 +22,7 @@

Contents

About the project

diff --git a/docs/changelog.rst b/docs/changelog.rst deleted file mode 100644 index 676c6a06..00000000 --- a/docs/changelog.rst +++ /dev/null @@ -1,332 +0,0 @@ -:orphan: - -Changelog ---------- - -Versions follow `semantic versioning`_ (``major.minor.patch``). Breaking changes -will only occur in major versions with changes noted in this changelog. - -.. _`semantic versioning`: https://semver.org/ - -pgsc_calc v2.0.0-beta (2024-06-19) -------------------------------------- - -Graduating to beta with the release of `our preprint `_ 🎉 - -Improvements - -* https://github.com/PGScatalog/pygscatalog/pull/23 - -* https://github.com/PGScatalog/pygscatalog/pull/22 - -* https://github.com/PGScatalog/pgsc_calc/pull/311 - -* Publish dependencies to Bioconda to improve conda profile UX - - * https://anaconda.org/bioconda/fraposa-pgsc - - * https://anaconda.org/bioconda/pgscatalog.core - - * https://anaconda.org/bioconda/pgscatalog.match - - * https://anaconda.org/bioconda/pgscatalog.calc - -Bug fixes - -* Fix for https://github.com/PGScatalog/pygscatalog/issues/21 - -* Closes https://github.com/PGScatalog/pgsc_calc/pull/301 - -* Specify modules explicitly to fix https://github.com/PGScatalog/pgsc_calc/pull/312 - -* Fix bim input to `pgscatalog-aggregate` https://github.com/PGScatalog/pgsc_calc/pull/319 - - -pgsc_calc v2.0.0-alpha.6 (2024-05-24) -------------------------------------- - -Please note the minimum required nextflow version has been updated to v23.10.0, released in October 2023. 
- -Improvements - -* Migrate our custom python tools to new https://github.com/PGScatalog/pygscatalog packages - - * Reference / target intersection now considers allelic frequency and variant missingness to determine PCA eligibility - - * Downloads from PGS Catalog should be faster (async) - - * Package CLI and libraries `are now documented `_ - -* Update plink version to alpha 5.10 final - -* Add docs describing cloud execution - -* Add correlation test comparing calculated scores against known good scores - -* When matching variants, matching logs are now written before scorefiles to improve debugging UX - -* Improvements to PCA quality (ensuring low missingness and suitable MAF for PCA-eligble variants in target samples). - - * This could allow us to implement MAF/missingness filters for scoring file variants in the future. - -Bug fixes - -* Fix ancestry adjustment with VCFs -* Fix support for scoring files that only have one effect type column -* Fix adjusting PGS with zero variance (skip them) -* Check for reserved characters in sampleset names - -pgsc_calc v2.0.0-alpha.5 (2024-03-19) -------------------------------------- - -Improvements: - -* Automatically mount directories inside singularity containers without setting any configuration -* Improve permanent caching of ancestry processes with --genotypes_cache parameter -* resync with nf-core framework -* Refactor combine_scorefiles - -Bug fixes: - -* Fix semantic storeDir definitions causing problems cloud execution (google batch) -* Fix missing DENOM values with multiple custom scoring files (score calculation not affected) -* Fix liftover failing silently with custom scoring files (thanks Brooke!) 
- -Misc: - -* Move aggregation step out of report - -pgsc_calc v2.0.0-alpha.4 (2023-12-05) -------------------------------------- - -Improvements: - -* Give a more helpful error message when there's no valid variant matches found - -Bug fixes: - -* Fix retrying downloads from PGS Catalog -* Fix numeric sample identifiers breaking ancestry analysis -* Check for chr prefix in samplesheets and error - -pgsc_calc v2.0.0-alpha.3 (2023-10-02) -------------------------------------- - -Improvements: - -* Automatically retry scoring with more RAM on larger datasets -* Describe scoring precision in docs -* Change handling of VCFs to reduce errors when recoding -* Internal changes to improve support for custom reference panels - -Bug fixes: - -* Fix VCF input to ancestry projection subworkflow (thanks `@frahimov`_ and `@AWS-crafter`_ for patiently debugging) -* Fix scoring options when reading allelic frequencies from a reference panel (thanks `@raimondsre`_ for reporting the changes from v1.3.2 -> 2.0.0-alpha) -* Fix conda profile action - -.. _`@frahimov`: https://github.com/PGScatalog/pgsc_calc/issues/172 -.. _`@AWS-crafter`: https://github.com/PGScatalog/pgsc_calc/issues/155 -.. _`@raimondsre`: https://github.com/PGScatalog/pgsc_calc/pull/139#issuecomment-1736313211 - -pgsc_calc v2.0.0-alpha.1 (2023-08-11) -------------------------------------- - -This patch fixes a bug when running the workflow directly from github with the -test profile (i.e. without cloning first). Thanks to `@staedlern`_ for reporting the -problem. - -.. _`@staedlern`: https://github.com/PGScatalog/pgsc_calc/issues/151 - -pgsc_calc v2.0.0-alpha (2023-08-08) ------------------------------------ - -This major release features breaking changes to samplesheet structure to provide -more flexible support for extra genomic file types in the future. 
Two major new -features were implemented in this release: - -- Genetic ancestry group similarity is calculated to a population reference panel - (default: 1000 Genomes) when the ``--run_ancestry`` flag is supplied. This runs - using PCA and projection implemented in the ``fraposa_pgsc (v0.1.0)`` package. -- Calculated PGS can be adjusted for genetic ancestry using empirical PGS distributions - from the most similar reference panel population or continuous PCA-based regressions. - -These new features are optional and don't run in the default workflow. Other features -included in the release are: - -- Speed optimizations for PGS scoring (skipping allele frequency calculation) - -pgsc_calc v1.3.2 (2023-01-27) ------------------------------ - -This patch fixes a bug that made some PGS Catalog scoring files incompatible -with the pipeline. Effect weights were sometimes set to utf-8 strings instead of -floating point numbers, which caused an assertion error. Thanks to `@j0n-a`_ for -reporting the problem. - -.. _`@j0n-a`: https://github.com/PGScatalog/pgsc_calc/issues/79 - -pgsc_calc v1.3.1 (2023-01-24) ------------------------------ - -This patch fixes a bug that breaks the workflow if all variants in one or more -PGS scoring files match perfectly with the target genomes. Thanks to -`@lemieuxl`_ for reporting the problem! - -.. _`@lemieuxl`: https://github.com/PGScatalog/pgsc_calc/issues/75 - -pgsc_calc v1.3.0 (2022-11-21) ------------------------------ - -This release is focused on improving scalability. - -Features -~~~~~~~~ - -- Variant matching is made more efficient using a split - apply - combine - approach when the data is split across chromosomes. This supports parallel PGS - calculation for the largest traits (e.g. cancer, 418 PGS [avg 261,000 - variants/score) ) in the PGS Catalog on big datasets such as UK Biobank. - -- Better support for running in offline environments: - - - Internet access is only required to download scores by ID. 
Scores can be - pre-downloaded using the utils package - (https://pypi.org/project/pgscatalog-utils/) - - - Scoring file metadata is read from headers and displayed in the report - (removed API calls during report generation) - -- Implemented flag (--efo_direct) to return only PGS tagged with exact EFO term - (e.g. no PGS for child/descendant terms in the ontology) - -pgsc_calc v1.2.0 (2022-10-11) ------------------------------ - -This release is focused on improving memory and storage usage. - -Features -~~~~~~~~ - -- Allow genotype dosages to be imported from VCF to be specified in ``vcf_genotype_field`` - of samplesheet_ (default: GT / hard calls) - -- Makes use of `durable caching`_ when relabelling and recoding target genomes (``--genotypes_cache``) - -- Improvements to use less storage space: - - - All intermediate files are now compressed by default - - - Add parameter to support zstd compressed input files - -- Improved memory usage when matching variants (``pgscatalog_utils=v0.1.2`` - https://github.com/PGScatalog/pgscatalog_utils) - -- Revised interface to select scores from the PGS Catalog using flags: - ``--trait_efo`` (EFO ID / traits), ``--pgp_id`` (PGP ID / publications), ``--pgs_id`` (PGS ID, individual scores). - -.. _samplesheet: https://pgsc-calc.readthedocs.io/en/dev/reference/input.html -.. _durable caching: https://pgsc-calc.readthedocs.io/en/dev/reference/params.html#parameter-schema - -pgsc_calc v1.1.0 (2022-09-15) ------------------------------ - -The first public release of the pgsc_calc pipeline. This release adds compatibility -for every score published in the PGS Catalog. Each scoring file in the PGS Catalog -has been processed to provide consistent genomic coordinates in builds GRCh37 and GRCh38. -The pipeline has been updated to take advantage of the harmonised scoring files (see -`PGS Catalog downloads`_ for additional details). - -.. 
_PGS Catalog downloads: https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos - -Features -~~~~~~~~ - -- Many of the underlying software tools are now implemented within a ``pgscatalog_utils`` - package (``v0.1.2``, https://github.com/PGScatalog/pgscatalog_utils and - https://pypi.org/project/pgscatalog-utils/ ). The packaging allows for independent - testing and development of tools for downloading and working with the scoring files. - -- The output report has been improved to have more detailed metadata describing - the scoring files and how well the variants match the target sampleset(s). - -- Improvements to variant matching: - - More precise control of variant matching parameters is now possible, like - ignoring strand flips - - ``match_variants`` should now use less RAM by default: - - A laptop with 16GB of RAM should be able to comfortably calculate scores on - the 1000 genomes dataset - - Fast matching mode (``--fast_match``) is available if ~32GB of RAM is - available and you'd like to calculate scores for larger datasets - -- Groups of scores from the PGS Catalog can be calculated by specifying a specific - ``--trait`` (EFO ID) or ``--publication`` (PGP ID), in addition to using individual - scoring files ``--pgs_id`` (PGS ID). - -- Score validation has been integrated with the test suite - -- Support for M1 Macs with ``--platform`` parameter (docker executor only) - - -Bug fixes -~~~~~~~~~ - -- Implemented a more robust prioritisation procedure if a variant has multiple - candidate matches or duplicated IDs - -- Fixed processing multiple samplesets in parallel (e.g. 1000 Genomes + UK - Biobank) - -- When combining multiple scoring files, all variants are now kept to reflect the - correct denominator for % matching statistics. 
- -- When trying to correct for strand flips the matched effect allele wasn't being - correctly complemented - -pgsc_calc v1.0.0 (2022-05-24) --------------------------------- - -This release produces scores that should be biologically meaningful. Significant -effort has been made to validate calculate scores on different datasets. In the -next release we'll add score validation to our test suite to make sure -calculated scores stay valid in the future. - -Features -~~~~~~~~ - -- Add support for PLINK2 format (samplesheet structure changed) -- Add support for allosomes (e.g. X, Y) -- Improve PGS Catalog compatibility (e.g. missing other allele) -- Add automatic liftover of scoring files to match target genome build -- Performance improvements to support UK BioBank scale data (500,000 genomes) -- Support calculation of multiple scores in parallel -- Significantly improved test coverage (> 80%) -- Lots of other small changes to improve correctness and handling edge cases - -pgsc_calc v0.1.3dev (2022-02-04) --------------------------------- - -Features -~~~~~~~~ - -- Simplified JSON input processes -- Add first draft of documentation -- Add JSON schemas for validating input data (mostly for web platform) - -pgsc_calc v0.1.2dev (2022-01-17) --------------------------------- - -Features -~~~~~~~~ - -- Add JSON input support for web platform functionality -- Set up simple CI tests with Github actions - -pgsc_calc v0.1.1dev (2021-12-16) --------------------------------- - -Features -~~~~~~~~ - -- First public release -- Support applying a single scoring file to target genomic data in GrCh37 build From 5b5692c1ad9a5ddc08c0d37e097452b26b7b4f10 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 29 Jul 2024 17:05:17 +0100 Subject: [PATCH 03/13] update schema with support for --maf_target and --geno_miss_target --- modules/local/ancestry/intersect_variants.nf | 4 +- nextflow.config | 2 + nextflow_schema.json | 355 +++++++++++-------- 3 files changed, 208 
insertions(+), 153 deletions(-) diff --git a/modules/local/ancestry/intersect_variants.nf b/modules/local/ancestry/intersect_variants.nf index efcad0b6..d872bbf4 100644 --- a/modules/local/ancestry/intersect_variants.nf +++ b/modules/local/ancestry/intersect_variants.nf @@ -33,8 +33,8 @@ process INTERSECT_VARIANTS { pgscatalog-intersect --ref $ref_variants \ --target $variants \ --chrom $meta.chrom \ - --maf_target 0.1 \ - --geno_miss 0.1 \ + --maf_target $params.maf_target \ + --geno_miss $params.geno_miss_target \ --outdir . \ -v diff --git a/nextflow.config b/nextflow.config index 480ab095..27035ba5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,6 +43,8 @@ params { n_popcomp = 5 normalization_method = "empirical mean mean+var" n_normalization = 4 + maf_target = 0.1 + geno_miss_target = 0.1 // compatibility params liftover = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 6caed387..aefa1d31 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,33 +1,49 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/pgscatalog/pgsc_calc/master/nextflow_schema.json", "title": "pgscatalog/pgsc_calc pipeline parameters", - "description": "This pipeline applies scoring files from the PGS Catalog to target set(s) of genotyped samples", + "description": "The Polygenic Score Catalog Calculator is a nextflow pipeline for polygenic score calculation", "type": "object", - "defs": { + "definitions": { "input_output_options": { "title": "Input/output options", "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", + "required": ["input", "outdir"], "properties": { "input": { "type": "string", - "description": "Path to input samplesheet", - "format": "file-path" + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(csv|json)$", + 
"description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with a header row and four mandatory columns (sampleset, path_prefix, chrom, format), plus an optional vcf_genotype_field column.", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open", + "default": "results" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" }, "format": { "type": "string", "default": "csv", - "fa_icon": "fas fa-cog", - "description": "Format of input samplesheet", - "enum": ["csv", "json"] + "enum": ["csv", "json"], + "description": "What format is the samplesheet in? (csv/json)" }, "scorefile": { "type": "string", - "description": "Path to a scoring file in PGS Catalog format. Multiple scorefiles can be specified using wildcards (e.g., ``--scorefile \"path/to/scores/*.txt\"``)", - "fa_icon": "fas fa-file-alt", - "format": "file-path" + "description": "Path to a scoring file in PGS Catalog format. 
Multiple scorefiles can be specified using wildcards (e.g., ``--scorefile \"path/to/scores/*.txt\"``)" }, "pgs_id": { "type": "string", @@ -55,35 +71,68 @@ "type": "string", "description": "Path to a directory that can store relabelled genotypes (and the reference panel intersections and PCA with --run_ancestry) to speed up new PGS calculations on previously harmonized samples", "format": "directory-path" - }, - "outdir": { + } + } + }, + "compatibility_options": { + "title": "Compatibility options", + "type": "object", + "description": "Define parameters that control how scoring files and target genomes are made compatible with each other", + "default": "", + "properties": { + "target_build": { "type": "string", - "description": "Path to the output directory where the results will be saved.", - "fa_icon": "fas fa-folder-open", - "format": "directory-path", - "default": "results" + "description": "Genome build of target genomes", + "enum": ["GRCh37", "GRCh38"] }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "liftover": { + "type": "boolean", + "description": "Lift scoring files to match your target genomes. Requires build information in the header of the scoring files." 
+ }, + "min_lift": { + "type": "number", + "default": 0.95, + "description": "Minimum proportion of variants required to successfully remap a scoring file to a different genome build.", + "minimum": 0, + "maximum": 1 } }, - "required": ["input", "format"] + "required": ["target_build"] }, - "ancestry_options": { - "title": "Ancestry options", + "new_group_4": { + "title": "Matching options", "type": "object", - "description": "", + "description": "Define how variants are matched across scoring files and target genomes.", + "default": "", + "properties": { + "keep_multiallelic": { + "type": "boolean", + "description": "Allow matches of scoring file variants to multiallelic variants in the target dataset" + }, + "keep_ambiguous": { + "type": "boolean", + "description": "Keep matches of scoring file variants to strand ambiguous variants (e.g. A/T and C/G SNPs) in the target dataset. This assumes the scoring file and target dataset report variants on the same strand." + }, + "min_overlap": { + "type": "number", + "default": 0.75, + "minimum": 0, + "maximum": 1, + "description": "Minimum proportion of variants present in both the score file and input target genomic data" + } + } + }, + "genetic_ancestry_options": { + "title": "Genetic ancestry options", + "type": "object", + "description": "Parameters used to control genetic ancestry similarity analysis", "default": "", "properties": { "projection_method": { "type": "string", "default": "oadp", - "enum": ["oadp", "sp", "adp"], - "description": "The method for PCA prediction. oadp: most accurate. adp: accurate but slow. sp: fast but inaccurate." + "description": "The method for PCA prediction. oadp: most accurate. adp: accurate but slow. 
sp: fast but inaccurate.", + "enum": ["oadp", "sp", "adp"] }, "ancestry_method": { "type": "string", @@ -104,8 +153,8 @@ "normalization_method": { "type": "string", "default": "empirical mean mean+var", - "description": "Method used for normalisation of genetic ancestry", - "enum": ["empirical", "mean", "mean+var", "empirical mean mean+var"] + "enum": ["empirical", "mean", "mean+var", "empirical mean mean+var"], + "description": "Method used for normalisation of genetic ancestry" }, "n_normalization": { "type": "integer", @@ -116,6 +165,14 @@ "type": "boolean", "default": true, "description": "Load allelic frequencies from reference panel when scoring target genomes" + }, + "pca_maf_target": { + "type": "number", + "default": 0.1 + }, + "pca_geno_miss": { + "type": "number", + "default": 0.1 } }, "required": [ @@ -125,7 +182,9 @@ "n_popcomp", "normalization_method", "n_normalization", - "load_afreq" + "load_afreq", + "pca_maf_target", + "pca_geno_miss" ] }, "reference_options": { @@ -136,12 +195,13 @@ "properties": { "run_ancestry": { "type": "string", - "format": "file-path", - "description": "Path to reference database. Must be set if --ref_samplesheet is not set." + "description": "Path to reference database. Must be set if --ref_samplesheet is not set.", + "format": "file-path" }, "ref_samplesheet": { "type": "string", - "description": "Path to a samplesheet that describes the structure of reference data. Must be set if --ref isn't set." + "description": "Path to a samplesheet that describes the structure of reference data. Must be set if --ref isn't set.", + "format": "file-path" }, "hg19_chain": { "type": "string", @@ -153,6 +213,7 @@ "hg38_chain": { "type": "string", "description": "Path to a UCSC chain file for converting from hg38 to hg19. 
Needed if lifting over a custom scoring file.", + "pattern": ".*chain.gz$", "format": "file-path", "mimetype": "application/gzip" }, @@ -166,16 +227,16 @@ "mind_ref": { "type": "number", "default": 0.1, - "description": "Exclude samples with missing call frequencies greater than a threshold (in reference genomes)", "minimum": 0, - "maximum": 1 + "maximum": 1, + "description": "Exclude samples with missing call frequencies greater than a threshold (in reference genomes)" }, "maf_ref": { "type": "number", "default": 0.05, - "description": "Exclude variants with allele frequency lower than a threshold (in reference genomes)", "minimum": 0, - "maximum": 1 + "maximum": 1, + "description": "Exclude variants with allele frequency lower than a threshold (in reference genomes)" }, "hwe_ref": { "type": "number", @@ -191,83 +252,113 @@ }, "ld_grch37": { "type": "string", + "default": "/Users/bwingfield/Documents/projects/pgsc_calc/assets/ancestry/high-LD-regions-hg19-GRCh37.txt", "description": "Path to a file that contains areas of high linkage disequilibrium in the reference data (build GRCh37).", "format": "file-path", "mimetype": "text/plain" }, "ld_grch38": { "type": "string", + "default": "/Users/bwingfield/Documents/projects/pgsc_calc/assets/ancestry/high-LD-regions-hg38-GRCh38.txt", "description": "Path to a file that contains areas of high linkage disequilibrium in the reference data (build GRCh38).", "format": "file-path", "mimetype": "text/plain" }, "ref_format_version": { "type": "string", - "default": "v0.1" + "default": "v0.1", + "description": "Version of the default reference database" }, "ancestry_checksums": { - "type": "string" + "type": "string", + "description": "Used to validate files in the reference database when built" } - }, - "required": [ - "geno_ref", - "mind_ref", - "maf_ref", - "hwe_ref", - "indep_pairwise_ref", - "ld_grch37", - "ld_grch38" - ] + } }, - "compatibility_options": { - "title": "Compatibility options", + "developer_options": { + 
"title": "Developer options", "type": "object", - "description": "Define parameters that control how scoring files and target genomes are made compatible with each other", + "description": "Control subworkflow execution, useful for debugging", "default": "", "properties": { - "target_build": { - "type": "string", - "enum": ["GRCh37", "GRCh38"], - "description": "Genome build of target genomes" + "only_bootstrap": { + "type": "boolean", + "hidden": true }, - "liftover": { + "only_input": { "type": "boolean", - "description": "Lift scoring files to match your target genomes. Requires build information in the header of the scoring files." + "hidden": true }, - "min_lift": { - "type": "number", - "default": 0.95, - "description": "Minimum proportion of variants required to successfully remap a scoring file to a different genome build", - "minimum": 0, - "maximum": 1 + "only_compatible": { + "type": "boolean", + "hidden": true + }, + "only_match": { + "type": "boolean", + "hidden": true + }, + "only_projection": { + "type": "boolean", + "hidden": true + }, + "only_score": { + "type": "boolean", + "hidden": true + }, + "skip_ancestry": { + "type": "boolean", + "default": true, + "hidden": true } - }, - "required": ["target_build"] + } }, - "matching_options": { - "title": "Matching options", + "institutional_config_options": { + "title": "Institutional config options", "type": "object", - "description": "Define how variants are matched across scoring files and target genomes", - "default": "", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", "properties": { - "keep_multiallelic": { - "type": "boolean", - "description": "Allow matches of scoring file variants to multiallelic variants in the target dataset" + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "keep_ambiguous": { - "type": "boolean", - "description": "Keep matches of scoring file variants to strand ambiguous variants (e.g. A/T and C/G SNPs) in the target dataset. This assumes the scoring file and target dataset report variants on the same strand." + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. 
If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", + "fa_icon": "fas fa-users-cog" }, - "min_overlap": { - "type": "number", - "default": 0.75, - "description": "Minimum proportion of variants present in both the score file and input target genomic data", - "fa_icon": "fas fa-cog", - "minimum": 0, - "maximum": 1 + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" } - }, - "fa_icon": "fas fa-user-cog" + } }, "max_job_request_options": { "title": "Max job request options", @@ -298,7 +389,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" } @@ -317,6 +408,12 @@ "fa_icon": "fas fa-question-circle", "hidden": true }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, "publish_dir_mode": { "type": "string", "default": "copy", @@ -352,88 +449,44 @@ "description": "Do not use coloured log outputs.", "fa_icon": "fas fa-palette", "hidden": true + }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", + "hidden": true } } } }, "allOf": [ { - "$ref": "#/defs/input_output_options" + "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/defs/ancestry_options" + "$ref": "#/definitions/compatibility_options" }, { - "$ref": "#/defs/reference_options" + "$ref": "#/definitions/new_group_4" }, { - "$ref": "#/defs/compatibility_options" + "$ref": "#/definitions/genetic_ancestry_options" }, { - "$ref": "#/defs/matching_options" + "$ref": "#/definitions/reference_options" }, { - "$ref": "#/defs/max_job_request_options" + "$ref": "#/definitions/developer_options" }, { - "$ref": "#/defs/generic_options" - } - ], - "properties": { - "only_bootstrap": { - "type": "boolean", - "hidden": true - }, - "only_input": { - "type": "boolean", - "hidden": true - }, - "only_compatible": { - "type": "boolean", - "hidden": true - }, - "only_match": { - "type": "boolean", - "hidden": true - }, - "only_projection": { - "type": "boolean", - "hidden": true + "$ref": "#/definitions/institutional_config_options" }, - "only_score": { - "type": "boolean", - "hidden": true - }, - "skip_ancestry": { - "type": "boolean", - "default": true, - "hidden": true - }, - "hook_url": { - "type": "string" - }, - "version": { - "type": "boolean" - }, - "config_profile_name": { - "type": "string" - }, - "config_profile_description": { - "type": "string" - }, 
- "custom_config_version": { - "type": "string", - "default": "master" - }, - "custom_config_base": { - "type": "string", - "default": "https://raw.githubusercontent.com/nf-core/configs/master" - }, - "config_profile_contact": { - "type": "string" + { + "$ref": "#/definitions/max_job_request_options" }, - "config_profile_url": { - "type": "string" + { + "$ref": "#/definitions/generic_options" } - } + ] } From 0213dd1a9e8a07367365779e50affd8f66a5f4a0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 29 Jul 2024 17:15:34 +0100 Subject: [PATCH 04/13] update directory path --- docs/how-to/multiple.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/how-to/multiple.rst b/docs/how-to/multiple.rst index e8f44889..84d98f46 100644 --- a/docs/how-to/multiple.rst +++ b/docs/how-to/multiple.rst @@ -133,7 +133,7 @@ Congratulations, you've now calculated multiple scores in parallel! combine scores in the PGS Catalog with your own custom scores After the workflow executes successfully, the calculated scores and a summary -report should be available in the ``results/make/`` directory by default. If +report should be available in the ``results/`` directory by default. If you're interested in more information, see :ref:`interpret`. If the workflow didn't execute successfully, have a look at the From 59a7b63584864ade8a6eadaa2c8ddc98f6ae727c Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 29 Jul 2024 18:32:38 +0100 Subject: [PATCH 05/13] Fix and add schema descriptions. 
--- nextflow_schema.json | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index aefa1d31..58ab7627 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -166,13 +166,15 @@ "default": true, "description": "Load allelic frequencies from reference panel when scoring target genomes" }, - "pca_maf_target": { + "maf_target": { "type": "number", - "default": 0.1 + "default": 0.1, + "description": "Minimum MAF threshold in TARGET samples for variants to be included in the PCA." }, - "pca_geno_miss": { + "geno_miss_target": { "type": "number", - "default": 0.1 + "default": 0.1, + "description": "Maximum genotype missingness threshold in TARGET samples for variants to be included in the PCA." } }, "required": [ From 18e14e08da03691cfd621789d8211a0ef25baa0b Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 29 Jul 2024 18:39:26 +0100 Subject: [PATCH 06/13] Clarifying changes to PCA variant parameters. --- nextflow_schema.json | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 58ab7627..b1353129 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -125,7 +125,7 @@ "genetic_ancestry_options": { "title": "Genetic ancestry options", "type": "object", - "description": "Parameters used to control genetic ancestry similarity analysis", + "description": "Parameters used to control genetic ancestry similarity analysis on TARGET samples and variants included in PCA", "default": "", "properties": { "projection_method": { @@ -169,12 +169,16 @@ "maf_target": { "type": "number", "default": 0.1, - "description": "Minimum MAF threshold in TARGET samples for variants to be included in the PCA." 
+ "description": "Minimum MAF threshold in TARGET samples for variants to be included in the PCA.", + "minimum": 0, + "maximum": 1 }, "geno_miss_target": { "type": "number", "default": 0.1, - "description": "Maximum genotype missingness threshold in TARGET samples for variants to be included in the PCA." + "description": "Maximum genotype missingness threshold in TARGET samples for variants to be included in the PCA.", + "minimum": 0, + "maximum": 1 } }, "required": [ @@ -192,7 +196,7 @@ "reference_options": { "title": "Reference options", "type": "object", - "description": "Define how reference genomes are defined and processed", + "description": "Define how genomes and variants in REFERENCE panel are defined and processed for PCA", "default": "", "properties": { "run_ancestry": { @@ -222,7 +226,7 @@ "geno_ref": { "type": "number", "default": 0.1, - "description": "Exclude variants with missing call frequencies greater than a threshold (in reference genomes)", + "description": "Exclude VARIANTS with percentage of missing genotype calls greater than a threshold (in reference genomes)", "minimum": 0, "maximum": 1 }, @@ -231,14 +235,14 @@ "default": 0.1, "minimum": 0, "maximum": 1, - "description": "Exclude samples with missing call frequencies greater than a threshold (in reference genomes)" + "description": "Exclude SAMPLES with percentage of missing genotype calls greater than a threshold (in reference genomes)" }, "maf_ref": { "type": "number", "default": 0.05, "minimum": 0, "maximum": 1, - "description": "Exclude variants with allele frequency lower than a threshold (in reference genomes)" + "description": "Exclude variants with minor allele frequency (MAF) lower than a threshold (in reference genomes)" }, "hwe_ref": { "type": "number", From f5e507dc415803989cfa00b80471ef8fe6916a60 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 30 Jul 2024 09:42:05 +0100 Subject: [PATCH 07/13] fix group name (matching options) --- nextflow_schema.json | 8 ++++---- 
1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b1353129..5b6f74d4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -26,7 +26,7 @@ "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open", - "default": "results" + "default": "results/" }, "email": { "type": "string", @@ -99,8 +99,8 @@ }, "required": ["target_build"] }, - "new_group_4": { - "title": "New Group 4", + "matching_options": { + "title": "Matching options", "type": "object", "description": "Define how variants are matched across scoring files and target genomes.", "default": "", @@ -474,7 +474,7 @@ "$ref": "#/definitions/compatibility_options" }, { - "$ref": "#/definitions/new_group_4" + "$ref": "#/definitions/matching_options" }, { "$ref": "#/definitions/genetic_ancestry_options" From 34539187afa82bd3b880f5180063d89f8f353835 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 30 Jul 2024 12:01:34 +0100 Subject: [PATCH 08/13] docs review --- docs/conf.py | 2 +- docs/explanation/match.rst | 4 +- docs/explanation/output.rst | 2 + docs/how-to/bigjob.rst | 162 ++++++++++++++++++++++++------- docs/how-to/cache.rst | 25 ++--- docs/how-to/calculate_custom.rst | 2 +- docs/how-to/offline.rst | 8 +- docs/how-to/prepare.rst | 2 + docs/how-to/samplesheet.rst | 2 +- docs/index.rst | 7 +- 10 files changed, 162 insertions(+), 54 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 86057e0a..8e6ac019 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ project = 'Polygenic Score (PGS) Catalog Calculator' copyright = 'Polygenic Score (PGS) Catalog team (licensed under Apache License V2)' -# author = 'Polygenic Score (PGS) Catalog team' +author = 'Polygenic Score (PGS) Catalog team' # -- General configuration --------------------------------------------------- diff --git 
a/docs/explanation/match.rst b/docs/explanation/match.rst index a072eb65..85449b4c 100644 --- a/docs/explanation/match.rst +++ b/docs/explanation/match.rst @@ -37,6 +37,8 @@ When you evaluate the predictive performance of a score with low match rates it If you reduce ``--min_overlap`` then the calculator will output scores calculated with the remaining variants, **but these scores may not be representative of the original data submitted to the PGS Catalog.** +.. _wgs: + Are your target genomes imputed? Are they WGS? ---------------------------------------------- @@ -49,7 +51,7 @@ In the future we plan to improve support for WGS. Did you set the correct genome build? ------------------------------------- -The calculator will automatically grab scoring files in the correct genome build from the PGS Catalog. If match rates are low it may be because you have specified the wrong genome build. If you're using custom scoring files and the match rate is low it is possible that the `--liftover` command may have been omitted. +The calculator will automatically grab scoring files in the correct genome build from the PGS Catalog. If match rates are low it may be because you have specified the wrong genome build. If you're using custom scoring files and the match rate is low it is possible that the ``--liftover`` command may have been omitted. I'm still getting match rate errors. How do I figure out what's wrong? ---------------------------------------------------------------------- diff --git a/docs/explanation/output.rst b/docs/explanation/output.rst index 4aff03fe..37324e12 100644 --- a/docs/explanation/output.rst +++ b/docs/explanation/output.rst @@ -23,6 +23,7 @@ Calculated scores are stored in a gzipped-text space-delimted text file called seperate row (``length = n_samples*n_pgs``), and there will be at least four columns with the following headers: - ``sampleset``: the name of the input sampleset, or ``reference`` for the panel. 
+- ``FID``: the family identifier of each sample within the dataset (may be the same as IID). - ``IID``: the identifier of each sample within the dataset. - ``PGS``: the accession ID of the PGS being reported. - ``SUM``: reports the weighted sum of *effect_allele* dosages multiplied by their *effect_weight* @@ -56,6 +57,7 @@ describing the analysis of the target samples in relation to the reference panel following headers: - ``sampleset``: the name of the input sampleset, or ``reference`` for the panel. +- ``FID``: the family identifier of each sample within the dataset (may be the same as IID). - ``IID``: the identifier of each sample within the dataset. - ``[PC1 ... PCN]``: The projection of the sample within the PCA space defined by the reference panel. There will be as many PC columns as there are PCs calculated (default: 10). diff --git a/docs/how-to/bigjob.rst b/docs/how-to/bigjob.rst index 68380088..8940b616 100644 --- a/docs/how-to/bigjob.rst +++ b/docs/how-to/bigjob.rst @@ -74,43 +74,132 @@ limits. .. warning:: You'll probably want to use ``-profile singularity`` on a HPC. The pipeline requires Singularity v3.7 minimum. -However, in general you will have to adjust the ``executor`` options and job resource -allocations (e.g. ``process_low``). Here's an example for an LSF cluster: +Here's an example configuration running about 100 scores in parallel +on UK Biobank with a SLURM cluster: .. 
code-block:: text process { - queue = 'short' - clusterOptions = '' - scratch = true + errorStrategy = 'retry' + maxRetries = 3 + maxErrors = '-1' + executor = 'slurm' + + withName: 'DOWNLOAD_SCOREFILES' { + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 1.hour * task.attempt } + } - withLabel:process_low { - cpus = 2 - memory = 8.GB - time = 1.h + withName: 'COMBINE_SCOREFILES' { + cpus = 1 + memory = { 8.GB * task.attempt } + time = { 2.hour * task.attempt } } - withLabel:process_medium { - cpus = 8 - memory = 64.GB - time = 4.h + + withName: 'PLINK2_MAKEBED' { + cpus = 2 + memory = { 8.GB * task.attempt } + time = { 1.hour * task.attempt } } - } - executor { - name = 'lsf' - jobName = { "$task.hash" } - } + withName: 'RELABEL_IDS' { + cpus = 1 + memory = { 16.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'PLINK2_ORIENT' { + cpus = 2 + memory = { 8.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'DUMPSOFTWAREVERSIONS' { + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'ANCESTRY_ANALYSIS' { + cpus = { 1 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'SCORE_REPORT' { + cpus = 2 + memory = { 8.GB * task.attempt } + time = { 1.hour * task.attempt } + } -In SLURM, queue is equivalent to a partition. Specific cluster parameters can be -provided by modifying ``clusterOptions``. You should change ``cpus``, -``memory``, and ``time`` to match the amount of resources used. Assuming the -configuration file you set up is saved as ``my_custom.config`` in your current -working directory, you're ready to run pgsc_calc. 
Instead of running nextflow -directly on the shell, save a bash script (``run_pgscalc.sh``) to a file -instead: + withName: 'EXTRACT_DATABASE' { + cpus = 1 + memory = { 8.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'PLINK2_RELABELPVAR' { + cpus = 2 + memory = { 16.GB * task.attempt } + time = { 2.hour * task.attempt } + } + + withName: 'INTERSECT_VARIANTS' { + cpus = 2 + memory = { 8.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'MATCH_VARIANTS' { + cpus = 2 + memory = { 32.GB * task.attempt } + time = { 6.hour * task.attempt } + } + + withName: 'FILTER_VARIANTS' { + cpus = 2 + memory = { 16.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'MATCH_COMBINE' { + cpus = 4 + memory = { 64.GB * task.attempt } + time = { 6.hour * task.attempt } + } + + withName: 'FRAPOSA_PCA' { + cpus = 2 + memory = { 8.GB * task.attempt } + time = { 1.hour * task.attempt } + } + + withName: 'PLINK2_SCORE' { + cpus = 2 + memory = { 8.GB * task.attempt } + time = { 12.hour * task.attempt } + } + + withName: 'SCORE_AGGREGATE' { + cpus = 2 + memory = { 16.GB * task.attempt } + time = { 4.hour * task.attempt } + } + } + +Assuming the configuration file you set up is saved as +``my_custom.config`` in your current working directory, you're ready +to run pgsc_calc. Instead of running nextflow directly on the shell, +save a bash script (``run_pgscalc.sh``) to a file instead: .. code-block:: bash - + + #SBATCH -J ukbiobank_pgs + #SBATCH -c 1 + #SBATCH -t 24:00:00 + #SBATCH --mem=2G + export NXF_ANSI_LOG=false export NXF_OPTS="-Xms500M -Xmx2G" @@ -126,20 +215,23 @@ instead: .. note:: The name of the nextflow and singularity modules will be different in your local environment - .. warning:: Make sure to copy input data to fast storage, and run the pipeline - on the same fast storage area. You might include these steps in your - bash script. Ask your sysadmin for help if you're not sure what this - means. +.. 
warning:: Make sure to copy input data to fast storage, and run the + pipeline on the same fast storage area. You might include + these steps in your bash script. Ask your sysadmin for + help if you're not sure what this means. .. code-block:: console - $ bsub -M 2GB -q short -o output.txt < run_pgscalc.sh - + $ sbatch run_pgscalc.sh + This will submit a nextflow driver job, which will submit additional jobs for -each process in the workflow. The nextflow driver requires up to 4GB of RAM -(bsub's ``-M`` parameter) and 2 CPUs to use (see a guide for `HPC users`_ here). +each process in the workflow. The nextflow driver requires up to 4GB of RAM and 2 CPUs to use (see a guide for `HPC users`_ here). -.. _`LSF and PBS`: https://nextflow.io/docs/latest/executor.html#slurm .. _`HPC users`: https://www.nextflow.io/blog/2021/5_tips_for_hpc_users.html .. _`a nextflow profile`: https://github.com/nf-core/configs + +Cloud deployments +----------------- + +We've deployed the calculator to Google Cloud Batch but some :doc:`special configuration is required`. diff --git a/docs/how-to/cache.rst b/docs/how-to/cache.rst index 6cad3a0b..b4f08697 100644 --- a/docs/how-to/cache.rst +++ b/docs/how-to/cache.rst @@ -1,23 +1,26 @@ .. _cache: -How do I speed up `pgsc_calc` computation times and avoid re-running code? -========================================================================== +How do I speed up computation times and avoid re-running code? +============================================================== -If you intend to run `pgsc_calc` multiple times on the same target samples (e.g. +If you intend to run ``pgsc_calc`` multiple times on the same target samples (e.g.
on different sets of PGS, with different variant matching flags) it is worth cacheing information on invariant steps of the pipeline: - Genotype harmonzation (variant relabeling steps) -- Steps of `--run_ancestry` that: match variants between the target and reference panel and +- Steps of ``--run_ancestry`` that: match variants between the target and reference panel and generate PCA loadings that can be used to adjust the PGS for ancestry. -To do this you must specify a directory that can store these information across runs using the -`--genotypes_cache` flag to the nextflow command (also see :ref:`param ref`). Future runs of the -pipeline that use the same cache directory should then skip these steps and proceed to run only the -steps needed to calculate new PGS. This is slightly different than using the `-resume command in -nextflow `_ which mainly checks the -`work` directory and is more often used for restarting the pipeline when a specific step has failed -(e.g. for exceeding memory limits). +To do this you must specify a directory that can store this +information across runs using the ``--genotypes_cache`` flag to the +nextflow command (also see :ref:`param ref`). Future runs of the +pipeline that use the same cache directory should then skip these +steps and proceed to run only the steps needed to calculate new PGS. +This is slightly different than using the `-resume command in nextflow +`_ +which mainly checks the ``work`` directory and is more often used for +restarting the pipeline when a specific step has failed (e.g. for +exceeding memory limits). .. warning:: Always use a new cache directory for different samplesets, as redundant names may clash across runs.
diff --git a/docs/how-to/calculate_custom.rst b/docs/how-to/calculate_custom.rst index 77333dee..5d7f17b4 100644 --- a/docs/how-to/calculate_custom.rst +++ b/docs/how-to/calculate_custom.rst @@ -26,7 +26,7 @@ minimal header in the following format: Header:: #pgs_name=metaGRS_CAD - #pgs_name=metaGRS_CAD + #pgs_id=metaGRS_CAD #trait_reported=Coronary artery disease #genome_build=GRCh37 diff --git a/docs/how-to/offline.rst b/docs/how-to/offline.rst index a77bf118..ca9e8da4 100644 --- a/docs/how-to/offline.rst +++ b/docs/how-to/offline.rst @@ -127,8 +127,12 @@ panel too. See :ref:`norm`. Download scoring files ---------------------- -It's best to manually download scoring files from the PGS Catalog in the correct -genome build. Using PGS001229 as an example: +.. tip:: Use our CLI application ``pgscatalog-download`` to `download multiple scoring`_ files in parallel and in the correct genome build + +.. _download multiple scoring: https://pygscatalog.readthedocs.io/en/latest/how-to/guides/download.html + +You'll need to preload scoring files in the correct genome build. +Using PGS001229 as an example: https://ftp.ebi.ac.uk/pub/databases/spot/pgs/scores/PGS001229/ScoringFiles/ diff --git a/docs/how-to/prepare.rst b/docs/how-to/prepare.rst index d74427bc..ec174c4f 100644 --- a/docs/how-to/prepare.rst +++ b/docs/how-to/prepare.rst @@ -52,6 +52,8 @@ VCF from WGS See https://github.com/PGScatalog/pgsc_calc/discussions/123 for discussion about tools to convert the VCF files into ones suitable for calculating PGS. +If you input WGS data to the calculator without following the steps above then you will probably encounter match rate errors.
For more information, see: :ref:`wgs` + ``plink`` binary fileset (bfile) -------------------------------- diff --git a/docs/how-to/samplesheet.rst b/docs/how-to/samplesheet.rst index ede58d8b..f94a84e6 100644 --- a/docs/how-to/samplesheet.rst +++ b/docs/how-to/samplesheet.rst @@ -27,7 +27,7 @@ download here <../../assets/examples/samplesheet.csv>`. There are four mandatory columns: -- **sampleset**: A text string (no spaces, or reserved characters [ '.' or '_' ]) referring +- **sampleset**: A text string (no spaces, or reserved characters [ ``.`` or ``_`` ]) referring to the name of a :term:`target dataset` of genotyping data containing at least one sample/individual (however cohort datasets will often contain many individuals with combined genotyped/imputed data). Data from a sampleset may be input as a single file, diff --git a/docs/index.rst b/docs/index.rst index dca0cb76..bec94718 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -54,7 +54,7 @@ The workflow relies on open source scientific software, including: A full description of included software is described in :ref:`containers`. .. _PLINK 2: https://www.cog-genomics.org/plink/2.0/ -.. _PGS Catalog Utilities: https://github.com/PGScatalog/pgscatalog_utils +.. _PGS Catalog Utilities: https://github.com/PGScatalog/pygscatalog .. _FRAPOSA: https://github.com/PGScatalog/fraposa_pgsc @@ -120,7 +120,10 @@ Documentation Changelog --------- -The :doc:`Changelog page` describes fixes and enhancements for each version. +The `Changelog page`_ describes fixes and enhancements for each version. + +.. _`Changelog page`: https://github.com/PGScatalog/pgsc_calc/releases + Features under development -------------------------- From 23551621c8191ad4b9472681ab224a5c79c67316 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 30 Jul 2024 14:05:52 +0100 Subject: [PATCH 09/13] Documenting new target MAF/missingness variant filters in explainer. 
--- docs/explanation/geneticancestry.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/explanation/geneticancestry.rst b/docs/explanation/geneticancestry.rst index fe8660f5..e703f6ab 100644 --- a/docs/explanation/geneticancestry.rst +++ b/docs/explanation/geneticancestry.rst @@ -130,7 +130,15 @@ how-to guide), and has the following steps: for variant-level QC (SNPs in Hardy–Weinberg equilibrium [p > 1e-04] that are bi-allelic and non-ambiguous, with low missingness [<10%], and minor allele frequency [MAF > 5%]) and sample-quality (missingness <10%). LD-pruning is then applied to the variants and sample passing these checks (r\ :sup:`2` threshold = 0.05), excluding - complex regions with high LD (e.g. MHC). These methods are implemented in the ``FILTER_VARIANTS`` module. + complex regions with high LD (e.g. MHC). These methods are implemented in the ``FILTER_VARIANTS`` module, and + the default settings can be changed (see :doc:`schema (Reference options) `). + + 1. **Additional variant filters on TARGET samples**: in ``v2.0.0-beta`` we introduced the ability to filter + target sample variants using minimum MAF [default 10%] and maximum genotype missingness [default 10%] to + improve PCA robustness when using imputed genotype data (see :doc:`schema (Ancestry options) `). + *Note:* these parameters may need to be adjusted depending on your input data (currently optimized for large + cohorts like UKB); for individual samples we recommend the MAF filter to be lowered (``--pca_maf_target 0``) + to ensure homozygous reference calls are included. 2.
**PCA**: the LD-pruned variants of the unrelated samples passing QC are then used to define the PCA space of the reference panel (default: 10 PCs) using `FRAPOSA`_ (Fast and Robust Ancestry Prediction by using Online singular From d9bca8b0079c2e24c92135f5ec7b3f0c63165e43 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 30 Jul 2024 15:10:43 +0100 Subject: [PATCH 10/13] remove biocontainers from citations --- CITATIONS.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 128445bc..7cba0759 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -33,9 +33,6 @@ * [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. -* [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) - > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. 
- * [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) * [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) From fbe96fafb2df0ca8d1a6315b41032d4ca0771f32 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 30 Jul 2024 15:18:01 +0100 Subject: [PATCH 11/13] update link from pgscatalog_utils to pygscatalog --- docs/_templates/globaltoc.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_templates/globaltoc.html b/docs/_templates/globaltoc.html index 048f9eef..b7cad074 100644 --- a/docs/_templates/globaltoc.html +++ b/docs/_templates/globaltoc.html @@ -35,7 +35,7 @@

Useful links

  • Issue tracker
  • Discussion board
  • -
  • pgscatalog_utils Github
  • +
  • pgscatalog-utils GitHub

  • From 94dadff221c819cf99347b263cd66daec343650b Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 30 Jul 2024 17:58:31 +0100 Subject: [PATCH 12/13] Re-align param names and module with schema --- modules/local/ancestry/intersect_variants.nf | 4 ++-- nextflow_schema.json | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/local/ancestry/intersect_variants.nf b/modules/local/ancestry/intersect_variants.nf index d872bbf4..e5c2efe3 100644 --- a/modules/local/ancestry/intersect_variants.nf +++ b/modules/local/ancestry/intersect_variants.nf @@ -33,8 +33,8 @@ process INTERSECT_VARIANTS { pgscatalog-intersect --ref $ref_variants \ --target $variants \ --chrom $meta.chrom \ - --maf_target $params.maf_target \ - --geno_miss $params.geno_miss_target \ + --maf_target $params.pca_maf_target \ + --geno_miss $params.pca_geno_miss_target \ --outdir . \ -v diff --git a/nextflow_schema.json b/nextflow_schema.json index 18a88128..174ba51b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -166,14 +166,14 @@ "default": true, "description": "Load allelic frequencies from reference panel when scoring target genomes" }, - "maf_target": { + "pca_maf_target": { "type": "number", "default": 0.1, "description": "Minimum MAF threshold in TARGET samples for variants to be included in the PCA.", "minimum": 0, "maximum": 1 }, - "geno_miss_target": { + "pca_geno_miss_target": { "type": "number", "default": 0.1, "description": "Maximum genotype missingness threshold in TARGET samples for variants to be included in the PCA.", @@ -190,7 +190,7 @@ "n_normalization", "load_afreq", "pca_maf_target", - "pca_geno_miss" + "pca_geno_miss_target" ] }, "reference_options": { From 70471cf7c411865defa3933cf36e8316eceabc69 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 30 Jul 2024 18:06:58 +0100 Subject: [PATCH 13/13] Add to conf --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 
d18f0d3e..c4abafc8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,8 +43,8 @@ params { n_popcomp = 5 normalization_method = "empirical mean mean+var" n_normalization = 4 - maf_target = 0.1 - geno_miss_target = 0.1 + pca_maf_target = 0.1 + pca_geno_miss_target = 0.1 // compatibility params liftover = false