diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea27a5843..4ecfbfe33 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,6 +2,7 @@ "name": "nfcore", "image": "nfcore/gitpod:latest", "remoteUser": "gitpod", + "runArgs": ["--privileged"], // Configure tool-specific properties. "customizations": { diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index f40da9db0..ec7e537b7 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -9,7 +9,9 @@ Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) -> If you need help using or modifying nf-core/rnaseq then the best place to ask is on the nf-core Slack [#rnaseq](https://nfcore.slack.com/channels/rnaseq) channel ([join our Slack here](https://nf-co.re/join/slack)). +:::info +If you need help using or modifying nf-core/rnaseq then the best place to ask is on the nf-core Slack [#rnaseq](https://nfcore.slack.com/channels/rnaseq) channel ([join our Slack here](https://nf-co.re/join/slack)). +::: ## Contribution workflow @@ -116,4 +118,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 3ce613833..60b5e3f4e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -42,7 +42,7 @@ body: attributes: label: System information description: | - * Nextflow version _(eg. 22.10.1)_ + * Nextflow version _(eg. 23.04.0)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. 
Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f5d8dcf88..6ce4d2098 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: strategy: matrix: NXF_VER: - - "22.10.1" + - "23.04.0" - "latest-everything" steps: - name: Check out pipeline code @@ -232,16 +232,19 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile test_cache,docker --aligner hisat2 ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ - salmon: - name: Test Salmon with workflow parameters + pseudo: + name: Test Pseudoaligners with workflow parameters if: ${{ (github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/rnaseq')) && !contains(github.event.head_commit.message, '[ci fast]') }} runs-on: ubuntu-latest strategy: matrix: parameters: - - "--skip_qc" - - "--skip_alignment --skip_pseudo_alignment" - - "--salmon_index false --transcript_fasta false" + - "--pseudo_aligner salmon --skip_qc" + - "--pseudo_aligner salmon --skip_alignment --skip_pseudo_alignment" + - "--pseudo_aligner salmon --salmon_index false --transcript_fasta false" + - "--pseudo_aligner kallisto --skip_qc" + - "--pseudo_aligner kallisto --skip_alignment --skip_pseudo_alignment" + - "--pseudo_aligner kallisto --kallisto_index false --transcript_fasta false" steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -280,6 +283,6 @@ jobs: wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - name: Run pipeline with Salmon and various parameters + - name: Run pipeline with Salmon or Kallisto and various parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_cache,docker --pseudo_aligner salmon ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ + nextflow run ${GITHUB_WORKSPACE} -profile test_cache,docker ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ diff --git a/.github/workflows/cloud_tests_full.yml b/.github/workflows/cloud_tests_full.yml index 4738d9045..3c4b23106 100644 --- a/.github/workflows/cloud_tests_full.yml +++ b/.github/workflows/cloud_tests_full.yml @@ -30,7 +30,7 @@ jobs: compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/rnaseq/work-${{ github.sha }}" run_name: "aws_rnaseq_full_${{ matrix.aligner }}" - profiles: test_full_aws,public_aws_ecr + profiles: test_full_aws parameters: | { "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", diff --git a/.github/workflows/cloud_tests_small.yml b/.github/workflows/cloud_tests_small.yml index 1f430067b..6f2e6d8a9 100644 --- a/.github/workflows/cloud_tests_small.yml +++ b/.github/workflows/cloud_tests_small.yml @@ -25,7 +25,7 @@ jobs: compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/rnaseq/work-${{ github.sha }}" run_name: "aws_rnaseq_small" - profiles: test,public_aws_ecr + profiles: test parameters: | { "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/rnaseq/results-test-${{ github.sha }}" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 888cb4bc3..b8bdd2143 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -78,7 +78,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.11" architecture: "x64" - name: Install dependencies diff --git 
a/.github/workflows/release-announcments.yml b/.github/workflows/release-announcments.yml new file mode 100644 index 000000000..6ad339277 --- /dev/null +++ b/.github/workflows/release-announcments.yml @@ -0,0 +1,68 @@ +name: release-announcements +# Automatic release toot and tweet announcements +on: + release: + types: [published] + workflow_dispatch: + +jobs: + toot: + runs-on: ubuntu-latest + steps: + - uses: rzr/fediverse-action@master + with: + access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} + host: "mstdn.science" # custom host if not "mastodon.social" (default) + # GitHub event payload + # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release + message: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + + send-tweet: + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: pip install tweepy==4.14.0 + - name: Send tweet + shell: python + run: | + import os + import tweepy + + client = tweepy.Client( + access_token=os.getenv("TWITTER_ACCESS_TOKEN"), + access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), + consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), + consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), + ) + tweet = os.getenv("TWEET") + client.create_tweet(text=tweet) + env: + TWEET: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} + + bsky-post: + runs-on: ubuntu-latest + steps: + - uses: zentered/bluesky-post-action@v0.0.2 + with: + post: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + env: + BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} + # diff --git a/.gitpod.yml b/.gitpod.yml index 85d95ecc8..25488dcc0 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,4 +1,9 @@ image: nfcore/gitpod:latest +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update vscode: extensions: # based on nf-core.nf-core-extensionpack diff --git a/.nf-core.yml b/.nf-core.yml index e6c9d79f0..094459361 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -3,5 +3,6 @@ lint: files_unchanged: - assets/email_template.html - assets/email_template.txt - - lib/NfcoreSchema.groovy - lib/NfcoreTemplate.groovy + - pyproject.toml + multiqc_config: false diff --git a/CHANGELOG.md b/CHANGELOG.md index daae6c447..78429e553 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,67 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17 + +### Credits + +Special thanks to the following for their contributions to the release: + +- [Adam Talbot](https://github.com/adamrtalbot) +- [hmehlan](https://github.com/hmehlan) +- [Jonathan Manning](https://github.com/pinin4fjords) +- [Júlia Mir Pedrol](https://github.com/mirpedrol) +- [Matthias Zepper](https://github.com/MatthiasZepper) +- [Maxime Garcia](https://github.com/maxulysse) +- [Steffen Möller](https://github.com/smoe) + +Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. + +### Enhancements & fixes + +- [PR #1049](https://github.com/nf-core/rnaseq/pull/1049) - Display a warning when `--extra_star_align_args` are used with `--aligner star_rsem` +- [PR #1051](https://github.com/nf-core/rnaseq/pull/1051) - Remove `public_aws_ecr` profile +- [PR #1054](https://github.com/nf-core/rnaseq/pull/1054) - Template update to nf-core/tools v2.9 +- [PR #1058](https://github.com/nf-core/rnaseq/pull/1058) - Use `nf-validation` plugin for parameter and samplesheet validation +- [PR #1068](https://github.com/nf-core/rnaseq/pull/1068) - Update `grep` version for `untar` module +- [PR #1073](https://github.com/nf-core/rnaseq/pull/1073) - Update documentation to discourage use of `--genome` +- [PR #1078](https://github.com/nf-core/rnaseq/pull/1078) - Updated pipeline template to [nf-core/tools 2.10](https://github.com/nf-core/tools/releases/tag/2.10) +- [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders +- [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10 +- [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability +- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification +- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage +- [#976](https://github.com/nf-core/rnaseq/issues/976) - Add author and licenses for all custom scripts +- [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting +- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom +- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for `filter_gtf_for_genes_in_genome.py` +- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields + +### Software dependencies + +| Dependency | Old version | New version | +| ----------------------- | ----------- | ----------- | +| `fastqc` | 0.11.9 | 0.12.1 | +| `multiqc` | 1.14 | 1.17 | +| `ucsc-bedgraphtobigwig` | 377 | 445 | + +> **NB:** Dependency has been **updated** if both old and new version information is present. +> +> **NB:** Dependency has been **added** if just the new version information is present. +> +> **NB:** Dependency has been **removed** if new version information isn't present. 
+ +### Modules / Subworkflows + +| Script | Old name | New name | +| ------------------------ | ----------------- | --------------------------- | +| `local/gtf_filter` | `GTF_GENE_FILTER` | `GTF_FILTER` | +| `local/tx2gene` | `SALMON_TX2GENE` | `TX2GENE` | +| `local/tximport` | `SALMON_TXIMPORT` | `TXIMPORT` | +| `local/quantify_salmon` | `QUANTIFY_SALMON` | `QUANTIFY_PSEUDO_ALIGNMENT` | +| `nf-core/kallisto_index` | | `KALLISTO_INDEX` | +| `nf-core/kallisto_quant` | | `KALLISTO_QUANT` | + ## [[3.12.0](https://github.com/nf-core/rnaseq/releases/tag/3.12.0)] - 2023-06-02 ### Credits @@ -21,7 +82,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes - [[#1011](https://github.com/nf-core/rnaseq/issues/1011)] - FastQ files from UMI-tools not being passed to fastp -- [[#1018](https://github.com/nf-core/rnaseq/issues/1018)] - Ability to skip both alignment and pseudo-alignment to only run pre-processing QC steps. +- [[#1018](https://github.com/nf-core/rnaseq/issues/1018)] - Ability to skip both alignment and pseudoalignment to only run pre-processing QC steps. - [PR #1016](https://github.com/nf-core/rnaseq/pull/1016) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #1025](https://github.com/nf-core/fetchngs/pull/1025) - Add `public_aws_ecr.config` to source mulled containers when using `public.ecr.aws` Docker Biocontainer registry - [PR #1038](https://github.com/nf-core/rnaseq/pull/1038) - Updated error log for count values when supplying `--additional_fasta` @@ -769,7 +830,7 @@ Major novel changes include: - Added options to skip several steps - Skip trimming using `--skipTrimming` - Skip BiotypeQC using `--skipBiotypeQC` - - Skip Alignment using `--skipAlignment` to only use pseudo-alignment using Salmon + - Skip Alignment using `--skipAlignment` to only use pseudoalignment using Salmon ### Documentation updates diff --git a/CITATIONS.md b/CITATIONS.md index 5e9d106fd..48a60b8e4 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -22,6 +22,8 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + - [featureCounts](https://pubmed.ncbi.nlm.nih.gov/24227677/) > Liao Y, Smyth GK, Shi W. featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. 2014 Apr 1;30(7):923-30. doi: 10.1093/bioinformatics/btt656. Epub 2013 Nov 13. PubMed PMID: 24227677. @@ -140,5 +142,8 @@ - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. 
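Following up on the module renames tabulated in the changelog above (e.g. `QUANTIFY_SALMON` → `QUANTIFY_PSEUDO_ALIGNMENT`): any custom Nextflow configuration that selects the old process names will need updating. A minimal, hedged sketch of such an update follows; the wildcarded process path and the Salmon option are illustrative assumptions, not taken from this diff:

```groovy
// Hypothetical custom.config: select the renamed pseudo-alignment quantification
// subworkflow (QUANTIFY_PSEUDO_ALIGNMENT, formerly QUANTIFY_SALMON). The full
// process path is an assumed example, not copied from the pipeline source.
process {
    withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' {
        ext.args = '--numBootstraps 100' // example extra Salmon argument
    }
}
```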
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f1..c089ec78c 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,18 +1,20 @@ -# Code of Conduct at nf-core (v1.0) +# Code of Conduct at nf-core (v1.4) ## Our Pledge -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: - Age +- Ability - Body size +- Caste - Familial status - Gender identity and expression - Geographical location - Level of experience - Nationality and national origins - Native language -- Physical and neurological ability +- Neurodiversity - Race or ethnicity - Religion - Sexual identity and orientation @@ -22,80 +24,133 @@ Please note that the list above is alphabetised and is therefore not ranked in a ## Preamble -> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. +:::note +This Code of Conduct (CoC) has been drafted by Renuka Kudva, Cris Tuñí, and Michael Heuer, with input from the nf-core Core Team and Susanna Marquez from the nf-core community. "We", in this document, refers to the Safety Officers and members of the nf-core Core Team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will be amended periodically to keep it up-to-date. In case of any dispute, the most current version will apply. +::: -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). + +Our Safety Officers are Saba Nafees, Cris Tuñí, and Michael Heuer. nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. -We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. +We have therefore adopted this CoC, which we require all members of our community and attendees of nf-core events to adhere to in all our workspaces at all times. Workspaces include, but are not limited to, Slack, meetings on Zoom, gather.town, YouTube live etc. 
-Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. +Our CoC will be strictly enforced and the nf-core team reserves the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. -We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. +We ask all members of our community to help maintain supportive and productive workspaces and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. -Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re +Questions, concerns, or ideas on what we can include? Contact members of the Safety Team on Slack or email safety [at] nf-co [dot] re. ## Our Responsibilities -The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. +Members of the Safety Team (the Safety Officers) are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. -The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +The Safety Team, in consultation with the nf-core core team, have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this CoC, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. -Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. +Members of the core team or the Safety Team who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and will be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply? +## When and where does this Code of Conduct apply? -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events, such as hackathons, workshops, bytesize, and collaborative workspaces on gather.town. These guidelines include, but are not limited to, the following (listed alphabetically and therefore in no order of preference): - Communicating with an official project email address. - Communicating with community members within the nf-core Slack channel. 
- Participating in hackathons organised by nf-core (both online and in-person events). -- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence, and on the nf-core gather.town workspace. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, gather.town, Jitsi, YouTube live etc. - Representing nf-core on social media. This includes both official and personal accounts. ## nf-core cares 😊 -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include, but are not limited to, the following (listed in alphabetical order): - Ask for consent before sharing another community member’s personal information (including photographs) on social media. - Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Celebrate your accomplishments! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) - Focus on what is best for the team and the community. (When in doubt, ask) -- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Accept feedback, yet be unafraid to question, deliberate, and learn. - Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communication to be kind.**) - Take breaks when you feel like you need them. -- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) +- Use welcoming and inclusive language. 
(Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack) ## nf-core frowns on 😕 -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this CoC. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces: - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. - Spamming or trolling of individuals on social media. -- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. -- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. +- Use of sexual or discriminatory imagery, comments, jokes, or unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion, or work experience. ### Online Trolling -The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the risk of online trolling. This is unacceptable — reports of such behaviour will be taken very seriously and perpetrators will be excluded from activities immediately. -All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. +All community members are **required** to ask members of the group they are working with for explicit consent prior to taking screenshots of individuals during video calls. -## Procedures for Reporting CoC violations +## Procedures for reporting CoC violations If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. -You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). +You can reach out to members of the Safety Team (Saba Nafees, Cris Tuñí, and Michael Heuer) on Slack. Alternatively, contact a member of the [nf-core core team](https://nf-co.re/about), and they will forward your concerns to the Safety Team. + +Issues directly concerning members of the Core Team or the Safety Team will be dealt with by other members of the core team and the safety manager — possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson and details will be shared in due course.
+ +All reports will be handled with the utmost discretion and confidentiality. + +You can also report any CoC violations to safety [at] nf-co [dot] re. In your email report, please do your best to include: + +- Your contact information. +- Identifying information (e.g. names, nicknames, pseudonyms) of the participant who has violated the Code of Conduct. +- The behaviour that was in violation and the circumstances surrounding the incident. +- The approximate time of the behaviour (if different than the time the report was made). +- Other people involved in the incident, if applicable. +- If you believe the incident is ongoing. +- If there is a publicly available record (e.g. mailing list record, a screenshot). +- Any additional information. + +After you file a report, one or more members of our Safety Team will contact you to follow up on your report. + +## Who will read and handle reports + +All reports will be read and handled by the members of the Safety Team at nf-core. + +If members of the Safety Team are deemed to have a conflict of interest with a report, they will be required to recuse themselves as per our Code of Conduct and will not have access to any follow-ups. + +To keep this first report confidential from any of the Safety Team members, please submit your first report by direct messaging on Slack/direct email to any of the nf-core members you are comfortable disclosing the information to, and be explicit about which member(s) you do not consent to sharing the information with. + +## Reviewing reports + +After receiving the report, members of the Safety Team will review the incident report to determine whether immediate action is required, for example, whether there is immediate threat to participants’ safety. + +The Safety Team, in consultation with members of the nf-core core team, will assess the information to determine whether the report constitutes a Code of Conduct violation, for them to decide on a course of action. + +In the case of insufficient information, one or more members of the Safety Team may contact the reporter, the reportee, or any other attendees to obtain more information. -Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. +Once additional information is gathered, the Safety Team will collectively review and decide on the best course of action to take, if any. The Safety Team reserves the right to not act on a report. -All reports will be handled with utmost discretion and confidentially. +## Confidentiality + +All reports, and any additional information included, are only shared with the team of safety officers (and possibly members of the core team, in case the safety officer is in violation of the CoC). We will respect confidentiality requests for the purpose of protecting victims of abuse. + +We will not name harassment victims, beyond discussions between the safety officer and members of the nf-core team, without the explicit consent of the individuals involved. + +## Enforcement + +Actions taken by the nf-core’s Safety Team may include, but are not limited to: + +- Asking anyone to stop a behaviour. +- Asking anyone to leave the event and online spaces either temporarily, for the remainder of the event, or permanently. +- Removing access to the gather.town and Slack, either temporarily or permanently. 
+- Communicating to all participants to reinforce our expectations for conduct and remind what is unacceptable behaviour; this may be public for practical reasons. +- Communicating to all participants that an incident has taken place and how we will act or have acted — this may be for the purpose of letting event participants know we are aware of and dealing with the incident. +- Banning anyone from participating in nf-core-managed spaces, future events, and activities, either temporarily or permanently. +- No action. ## Attribution and Acknowledgements @@ -106,6 +161,22 @@ All reports will be handled with utmost discretion and confidentially. ## Changelog -### v1.0 - March 12th, 2021 +### v1.4 - February 8th, 2022 + +- Included a new member of the Safety Team. Corrected a typographical error in the text. + +### v1.3 - December 10th, 2021 + +- Added a statement that the CoC applies to nf-core gather.town workspaces. Corrected typographical errors in the text. + +### v1.2 - November 12th, 2021 + +- Removed information specific to reporting CoC violations at the Hackathon in October 2021. + +### v1.1 - October 14th, 2021 + +- Updated with names of new Safety Officers and specific information for the hackathon in October 2021. + +### v1.0 - March 15th, 2021 - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. diff --git a/README.md b/README.md index b7f10e56a..b45db8c7a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # ![nf-core/rnaseq](docs/images/nf-core-rnaseq_logo_light.png#gh-light-mode-only) ![nf-core/rnaseq](docs/images/nf-core-rnaseq_logo_dark.png#gh-dark-mode-only) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/rnaseq/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.1400710-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.1400710) +[![GitHub Actions CI Status](https://github.com/nf-core/rnaseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/rnaseq/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/nf-core/rnaseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/rnaseq/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/rnaseq/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.1400710-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.1400710) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -38,7 +39,7 @@ 3. [`dupRadar`](https://bioconductor.org/packages/release/bioc/html/dupRadar.html) 4. [`Preseq`](http://smithlabresearch.org/software/preseq/) 5. [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) -15. 
Pseudo-alignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/); _optional_) +15. Pseudoalignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/) or ['Kallisto'](https://pachterlab.github.io/kallisto/); _optional_) 16. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/)) > **Note** @@ -49,10 +50,11 @@ ## Usage -> **Note** -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how -> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) -> with `-profile test` before running the workflow on actual data. +:::note +If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how +to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) +with `-profile test` before running the workflow on actual data. +::: First, prepare a samplesheet with your input data that looks as follows: @@ -82,14 +84,22 @@ nextflow run nf-core/rnaseq \ -profile ``` -For more details, please refer to the [usage documentation](https://nf-co.re/rnaseq/usage) and the [parameter documentation](https://nf-co.re/rnaseq/parameters). +:::warning +Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those +provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +::: + +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/rnaseq/usage) and the [parameter documentation](https://nf-co.re/rnaseq/parameters). ## Pipeline output -To see the the results of a test run with a full size dataset refer to the [results](https://nf-co.re/rnaseq/results) tab on the nf-core website pipeline page. +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/rnaseq/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/rnaseq/output). +This pipeline quantifies RNA-sequenced reads relative to genes/transcripts in the genome and normalizes the resulting data. It does not compare the samples statistically in order to assign significance in the form of FDR or P-values. For downstream analyses, the output files from this pipeline can be analysed directly in statistical environments like [R](https://www.r-project.org/), [Julia](https://julialang.org/) or via the [nf-core/differentialabundance](https://github.com/nf-core/differentialabundance/) pipeline. + ## Online videos A short talk about the history, current status and functionality on offer in this pipeline was given by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) on [8th February 2022](https://nf-co.re/events/2022/bytesize-32-nf-core-rnaseq) as part of the nf-core/bytesize series. @@ -102,7 +112,7 @@ These scripts were originally written for use at the [National Genomics Infrastr The pipeline was re-written in Nextflow DSL2 and is primarily maintained by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/). 
-The pipeline workflow diagram was designed by Sarah Guinchard ([@G-Sarah](https://github.com/G-Sarah)) and James Fellows Yates ([@jfy133](https://github.com/jfy133)). +The pipeline workflow diagram was initially designed by Sarah Guinchard ([@G-Sarah](https://github.com/G-Sarah)) and James Fellows Yates ([@jfy133](https://github.com/jfy133)); further modifications were made by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) and Maxime Garcia ([@maxulysse](https://github.com/maxulysse)). Many thanks to others who have helped out along the way too, including (but not limited to): diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index c263da31b..56c961c8e 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,17 +3,21 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/rnaseq Methods Description" section_href: "https://github.com/nf-core/rnaseq" plot_type: "html" -## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |
   <h4>Methods</h4>
-  <p>Data was processed using nf-core/rnaseq v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).</p>
+  <p>Data was processed using nf-core/rnaseq v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.</p>
   <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:</p>
   <pre><code>${workflow.commandLine}</code></pre>
+  <p>${tool_citations}</p>
   <h4>References</h4>
   <h5>Notes:</h5>
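For context on the `${tool_citations}` placeholder added in the hunk above: in nf-core template 2.10 pipelines, this string is typically assembled by a small helper in `lib/`. A rough sketch under that assumption follows; the tool list shown is illustrative, not the pipeline's definitive set of citations:

```groovy
// Sketch of the template-2.10-style helper that fills ${tool_citations}.
// The tools cited here are illustrative examples only.
public static String toolCitationText(params) {
    def citation_text = [
            "Tools used in the workflow included:",
            "FastQC (Andrews 2010),",
            "Salmon (Patro et al. 2017),",
            "MultiQC (Ewels et al. 2016)",
            "."
        ].join(' ').trim()
    return citation_text
}
```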
diff --git a/assets/multiqc/deseq2_pca_header.txt b/assets/multiqc/deseq2_pca_header.txt index 2599732af..636f9374d 100644 --- a/assets/multiqc/deseq2_pca_header.txt +++ b/assets/multiqc/deseq2_pca_header.txt @@ -2,7 +2,7 @@ #section_name: 'DESeq2 PCA plot' #description: "PCA plot between samples in the experiment. # These values are calculated using DESeq2 -# in the deseq2_qc.r script." +# in the deseq2_qc.r script." #plot_type: 'scatter' #anchor: 'deseq2_pca' #pconfig: diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 736156783..4085d95e3 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/rnaseq + This report has been generated by the nf-core/rnaseq analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-rnaseq-methods-description": order: -1000 @@ -23,6 +23,7 @@ run_modules: - hisat2 - rsem - salmon + - kallisto - samtools - picard - preseq @@ -66,6 +67,7 @@ extra_fn_clean_exts: - ".umi_dedup" - "_val" - ".markdup" + - "_primary" # Customise the module search patterns to speed up execution time # - Skip module sub-tools that we are not interested in diff --git a/assets/nf-core-rnaseq_logo_light.png b/assets/nf-core-rnaseq_logo_light.png index 5362512d3..b0c99cb83 100644 Binary files a/assets/nf-core-rnaseq_logo_light.png and b/assets/nf-core-rnaseq_logo_light.png differ diff --git a/assets/schema_input.json b/assets/schema_input.json index f2eb5ea81..1af6309dd 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -10,30 +10,35 @@ "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "errorMessage": "Sample name must be provided and cannot contain spaces", + "meta": ["id"] }, "fastq_1": { "type": "string", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, "fastq_2": { "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "type": "string", + "format": "file-path", + "exists": true, "anyOf": [ { - "type": "string", "pattern": "^\\S+\\.f(ast)?q\\.gz$" }, { - "type": "string", "maxLength": 0 } ] }, "strandedness": { "type": "string", - "errorMessage": "Strandedness must be provided and be one of 'forward', 'reverse' or 'unstranded'", - "enum": ["forward", "reverse", "unstranded"] + "errorMessage": "Strandedness must be provided and be one of 'auto', 'forward', 'reverse' or 'unstranded'", + "enum": ["forward", "reverse", "unstranded", "auto"], + "meta": ["strandedness"] } }, "required": ["sample", "fastq_1", "strandedness"] diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index 05971a5d9..000000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import errno -import argparse - - -def parse_args(args=None): - Description = "Reformat nf-core/rnaseq samplesheet file and check its contents." 
- Epilog = "Example usage: python check_samplesheet.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = f"ERROR: Please check samplesheet -> {error}" - if context != "" and context_str != "": - error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'" - print(error_str) - sys.exit(1) - - -def check_samplesheet(file_in, file_out): - """ - This function checks that the samplesheet follows the following structure: - - sample,fastq_1,fastq_2,strandedness - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,forward - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz,forward - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,,forward - - For an example see: - https://github.com/nf-core/test-datasets/blob/rnaseq/samplesheet/v3.1/samplesheet_test.csv - """ - - sample_mapping_dict = {} - with open(file_in, "r", encoding="utf-8-sig") as fin: - ## Check header - MIN_COLS = 3 - HEADER = ["sample", "fastq_1", "fastq_2", "strandedness"] - header = [x.strip('"') for x in fin.readline().strip().split(",")] - if header[: len(HEADER)] != HEADER: - print(f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}") - sys.exit(1) - - ## Check sample entries - for line in fin: - if line.strip(): - lspl = [x.strip().strip('"') for x in line.strip().split(",")] - - ## Check valid number of columns per row - if len(lspl) < len(HEADER): - print_error( - f"Invalid number of columns (minimum = {len(HEADER)})!", - "Line", - line, - ) - - num_cols = len([x for x in lspl[: len(HEADER)] if x]) - if num_cols < MIN_COLS: - print_error( - f"Invalid number of populated columns (minimum = {MIN_COLS})!", - "Line", - line, - ) - - ## Check sample name entries - sample, fastq_1, fastq_2, strandedness = lspl[: len(HEADER)] - if sample.find(" ") != -1: - print(f"WARNING: Spaces have been replaced by underscores for sample: {sample}") - sample = sample.replace(" ", "_") - if not sample: - print_error("Sample entry has not been specified!", "Line", line) - - ## Check FastQ file extension - for fastq in [fastq_1, fastq_2]: - if fastq: - if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): - print_error( - "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", - "Line", - line, - ) - - ## Check strandedness - strandednesses = ["unstranded", "forward", "reverse", "auto"] - if strandedness: - if strandedness not in strandednesses: - print_error( - f"Strandedness must be one of '{', '.join(strandednesses)}'!", - "Line", - line, - ) - else: - print_error( - f"Strandedness has not been specified! 
Must be one of {', '.join(strandednesses)}.", - "Line", - line, - ) - - ## Auto-detect paired-end/single-end - sample_info = [] ## [single_end, fastq_1, fastq_2, strandedness] - if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info = ["0", fastq_1, fastq_2, strandedness] - elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2, strandedness] - else: - print_error("Invalid combination of columns provided!", "Line", line) - - ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, strandedness ]]} - sample_info = sample_info + lspl[len(HEADER) :] - if sample not in sample_mapping_dict: - sample_mapping_dict[sample] = [sample_info] - else: - if sample_info in sample_mapping_dict[sample]: - print_error("Samplesheet contains duplicate rows!", "Line", line) - else: - sample_mapping_dict[sample].append(sample_info) - - ## Write validated samplesheet with appropriate columns - if len(sample_mapping_dict) > 0: - out_dir = os.path.dirname(file_out) - make_dir(out_dir) - with open(file_out, "w") as fout: - fout.write( - ",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"] + header[len(HEADER) :]) + "\n" - ) - for sample in sorted(sample_mapping_dict.keys()): - ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end - if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): - print_error( - f"Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!", - "Sample", - sample, - ) - - ## Check that multiple runs of the same sample are of the same strandedness - if not all(x[3] == sample_mapping_dict[sample][0][3] for x in sample_mapping_dict[sample]): - print_error( - f"Multiple runs of a sample must have the same strandedness!", - "Sample", - sample, - ) - - for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write(",".join([f"{sample}_T{idx+1}"] + val) + "\n") - else: - print_error(f"No entries to process!", "Samplesheet: {file_in}") - - -def main(args=None): - args = parse_args(args) - check_samplesheet(args.FILE_IN, args.FILE_OUT) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/deseq2_qc.r b/bin/deseq2_qc.r index d99bc7c2b..14bd48f70 100755 --- a/bin/deseq2_qc.r +++ b/bin/deseq2_qc.r @@ -1,5 +1,7 @@ #!/usr/bin/env Rscript +# Written by Harshil Patel and Gavin Kelly and released under the MIT license. + ################################################ ################################################ ## REQUIREMENTS ## diff --git a/bin/dupradar.r b/bin/dupradar.r index daf5a5798..d58b0eee1 100755 --- a/bin/dupradar.r +++ b/bin/dupradar.r @@ -1,5 +1,7 @@ #!/usr/bin/env Rscript +# Written by Phil Ewels and released under the MIT license. + # Command line argument processing args = commandArgs(trailingOnly=TRUE) if (length(args) < 5) { diff --git a/bin/fasta2gtf.py b/bin/fasta2gtf.py index 052322b24..716ee06ce 100755 --- a/bin/fasta2gtf.py +++ b/bin/fasta2gtf.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 + +# Written by Pranathi Vemuri and released under the MIT license. + """ Read a custom fasta file and create a custom GTF containing each entry """ diff --git a/bin/fastq_dir_to_samplesheet.py b/bin/fastq_dir_to_samplesheet.py index 1eb3657c6..7ceebe525 100755 --- a/bin/fastq_dir_to_samplesheet.py +++ b/bin/fastq_dir_to_samplesheet.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 +# Written by Harshil Patel and released under the MIT license. 
+ import os import sys import glob diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py new file mode 100755 index 000000000..265250627 --- /dev/null +++ b/bin/filter_gtf.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# Written by Olga Botvinnik with subsequent reworking by Jonathan Manning. Released under the MIT license. + +import logging +import argparse +import re +import statistics +from typing import Set + +# Create a logger +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger("fasta_gtf_filter") +logger.setLevel(logging.INFO) + + +def extract_fasta_seq_names(fasta_name: str) -> Set[str]: + """Extracts the sequence names from a FASTA file.""" + with open(fasta_name) as fasta: + return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")} + + +def tab_delimited(file: str) -> float: + """Check if file is tab-delimited and return median number of tabs.""" + with open(file, "r") as f: + data = f.read(1024) + return statistics.median(line.count("\t") for line in data.split("\n")) + + +def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: + """Filter GTF file based on FASTA sequence names.""" + if tab_delimited(gtf_in) != 8: + raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") + + seq_names_in_genome = extract_fasta_seq_names(fasta) + logger.info(f"Extracted chromosome sequence names from {fasta}") + logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome))) + + seq_names_in_gtf = set() + try: + with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out: + line_count = 0 + for line in gtf: + seq_name = line.split("\t")[0] + seq_names_in_gtf.add(seq_name) # Add sequence name to the set + + if seq_name in seq_names_in_genome: + if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line): + out.write(line) + line_count += 1 + + if line_count == 0: + raise ValueError("All GTF lines removed by filters") + + except IOError as e: + logger.error(f"File operation failed: {e}") + return + + logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) + logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.") + parser.add_argument("--gtf", type=str, required=True, help="GTF file") + parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file") + parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files") + parser.add_argument( + "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file" + ) + + args = parser.parse_args() + filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py deleted file mode 100755 index 9f876eaa0..000000000 --- a/bin/filter_gtf_for_genes_in_genome.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -import logging -from itertools import groupby -import argparse - -# Create a logger -logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def is_header(line): - return line[0] == ">" - - -def 
extract_fasta_seq_names(fasta_name): - """ - modified from Brent Pedersen - Correct Way To Parse A Fasta File In Python - given a fasta file. yield tuples of header, sequence - from https://www.biostars.org/p/710/ - """ - # first open the file outside - fh = open(fasta_name) - - # ditch the boolean (x[0]) and just keep the header or sequence since - # we know they alternate. - faiter = (x[1] for x in groupby(fh, is_header)) - - for i, header in enumerate(faiter): - line = next(header) - if is_header(line): - # drop the ">" - headerStr = line[1:].strip().split()[0] - yield headerStr - - -def extract_genes_in_genome(fasta, gtf_in, gtf_out): - seq_names_in_genome = set(extract_fasta_seq_names(fasta)) - logger.info("Extracted chromosome sequence names from : %s" % fasta) - logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome))) - seq_names_in_gtf = set([]) - - n_total_lines = 0 - n_lines_in_genome = 0 - with open(gtf_out, "w") as f: - with open(gtf_in) as g: - for line in g.readlines(): - n_total_lines += 1 - seq_name_gtf = line.split("\t")[0] - seq_names_in_gtf.add(seq_name_gtf) - if seq_name_gtf in seq_names_in_genome: - n_lines_in_genome += 1 - f.write(line) - logger.info( - "Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta) - ) - logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf))) - - logger.info("Wrote matching lines to %s" % gtf_out) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""") - parser.add_argument("--gtf", type=str, help="GTF file") - parser.add_argument("--fasta", type=str, help="Genome fasta file") - parser.add_argument( - "-o", - "--output", - dest="output", - default="genes_in_genome.gtf", - type=str, - help="GTF features on fasta genome sequences", - ) - - args = parser.parse_args() - extract_genes_in_genome(args.fasta, args.gtf, args.output) diff --git a/bin/mqc_features_stat.py b/bin/mqc_features_stat.py index 689a3f215..fa69b231c 100755 --- a/bin/mqc_features_stat.py +++ b/bin/mqc_features_stat.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +# Written by Senthilkumar Panneerselvam and released under the MIT license. 
+
 import argparse
 import logging
 import os
diff --git a/bin/salmon_tx2gene.py b/bin/salmon_tx2gene.py
deleted file mode 100755
index 6c2f2bef6..000000000
--- a/bin/salmon_tx2gene.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-from __future__ import print_function
-from collections import OrderedDict, defaultdict, Counter
-import logging
-import argparse
-import glob
-import os
-
-# Create a logger
-logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
-logger = logging.getLogger(__file__)
-logger.setLevel(logging.INFO)
-
-
-def read_top_transcript(salmon):
-    txs = set()
-    fn = glob.glob(os.path.join(salmon, "*", "quant.sf"))[0]
-    with open(fn) as inh:
-        for line in inh:
-            if line.startswith("Name"):
-                continue
-            txs.add(line.split()[0])
-            if len(txs) > 100:
-                break
-    logger.info("Transcripts found in FASTA: %s" % txs)
-    return txs
-
-
-def tx2gene(gtf, salmon, gene_id, extra, out):
-    txs = read_top_transcript(salmon)
-    votes = Counter()
-    gene_dict = defaultdict(list)
-    with open(gtf) as inh:
-        for line in inh:
-            if line.startswith("#"):
-                continue
-            cols = line.split("\t")
-            attr_dict = OrderedDict()
-            for gff_item in cols[8].split(";"):
-                item_pair = gff_item.strip().split(" ")
-                if len(item_pair) > 1:
-                    value = item_pair[1].strip().replace('"', "")
-                    if value in txs:
-                        votes[item_pair[0].strip()] += 1
-
-                    attr_dict[item_pair[0].strip()] = value
-            try:
-                gene_dict[attr_dict[gene_id]].append(attr_dict)
-            except KeyError:
-                continue
-
-    if not votes:
-        logger.warning("No attribute in GTF matching transcripts")
-        return None
-
-    txid = votes.most_common(1)[0][0]
-    logger.info("Attributed found to be transcript: %s" % txid)
-    seen = set()
-    with open(out, "w") as outh:
-        for gene in gene_dict:
-            for row in gene_dict[gene]:
-                if txid not in row:
-                    continue
-                if (gene, row[txid]) not in seen:
-                    seen.add((gene, row[txid]))
-                    if not extra in row:
-                        extra_id = gene
-                    else:
-                        extra_id = row[extra]
-                    print("%s\t%s\t%s" % (row[txid], gene, extra_id), file=outh)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="""Get tx to gene names for tximport""")
-    parser.add_argument("--gtf", type=str, help="GTF file")
-    parser.add_argument("--salmon", type=str, help="output of salmon")
-    parser.add_argument("--id", type=str, help="gene id in the gtf file")
-    parser.add_argument("--extra", type=str, help="extra id in the gtf file")
-    parser.add_argument(
-        "-o",
-        "--output",
-        dest="output",
-        default="tx2gene.tsv",
-        type=str,
-        help="file with output",
-    )
-
-    args = parser.parse_args()
-    tx2gene(args.gtf, args.salmon, args.id, args.extra, args.output)
diff --git a/bin/salmon_summarizedexperiment.r b/bin/summarizedexperiment.r
similarity index 87%
rename from bin/salmon_summarizedexperiment.r
rename to bin/summarizedexperiment.r
index 5ebdd317b..767d542c7 100755
--- a/bin/salmon_summarizedexperiment.r
+++ b/bin/summarizedexperiment.r
@@ -1,19 +1,21 @@
 #!/usr/bin/env Rscript
 
+# Written by Lorena Pantano and released under the MIT license.
+
 library(SummarizedExperiment)
 
-## Create SummarizedExperiment (se) object from Salmon counts
+## Create SummarizedExperiment (se) object from counts
 
 args <- commandArgs(trailingOnly = TRUE)
-if (length(args) < 2) {
-    stop("Usage: salmon_se.r <coldata> <counts_fn> <tpm_fn>", call. = FALSE)
+if (length(args) < 3) {
+    stop("Usage: summarizedexperiment.r <coldata> <counts_fn> <tpm_fn> <tx2gene>", call.
= FALSE) } coldata <- args[1] counts_fn <- args[2] tpm_fn <- args[3] +tx2gene <- args[4] -tx2gene <- "salmon_tx2gene.tsv" info <- file.info(tx2gene) if (info$size == 0) { tx2gene <- NULL diff --git a/bin/tx2gene.py b/bin/tx2gene.py new file mode 100755 index 000000000..8e0c1c6a5 --- /dev/null +++ b/bin/tx2gene.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python + +# Written by Lorena Pantano with subsequent reworking by Jonathan Manning. Released under the MIT license. + +import logging +import argparse +import glob +import os +from collections import Counter, defaultdict, OrderedDict +from collections.abc import Set +from typing import Dict + +# Configure logging +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]: + """ + Read the top 100 transcripts from the quantification file. + + Parameters: + quant_dir (str): Directory where quantification files are located. + file_pattern (str): Pattern to match quantification files. + + Returns: + set: A set containing the top 100 transcripts. + """ + try: + # Find the quantification file within the directory + quant_file_path = glob.glob(os.path.join(quant_dir, "*", file_pattern))[0] + with open(quant_file_path, "r") as file_handle: + # Read the file and extract the top 100 transcripts + return {line.split()[0] for i, line in enumerate(file_handle) if i > 0 and i <= 100} + except IndexError: + # Log an error and raise a FileNotFoundError if the quant file does not exist + logger.error("No quantification files found.") + raise FileNotFoundError("Quantification file not found.") + + +def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str: + """ + Discover the attribute in the GTF that corresponds to transcripts, prioritizing 'transcript_id'. + + Parameters: + gtf_file (str): Path to the GTF file. + transcripts (Set[str]): A set of transcripts to match in the GTF file. + + Returns: + str: The attribute name that corresponds to transcripts in the GTF file. + """ + votes = Counter() + with open(gtf_file) as inh: + # Read GTF file, skipping header lines + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\t") + # Parse attribute column and update votes for each attribute found + attributes = dict(item.strip().split(" ", 1) for item in cols[8].split(";") if item.strip()) + votes.update(key for key, value in attributes.items() if value.strip('"') in transcripts) + + if not votes: + # Log a warning if no matching attribute is found + logger.warning("No attribute in GTF matching transcripts") + return "" + + # Check if 'transcript_id' is among the attributes with the highest votes + if "transcript_id" in votes and votes["transcript_id"] == max(votes.values()): + logger.info("Attribute 'transcript_id' corresponds to transcripts.") + return "transcript_id" + + # If 'transcript_id' isn't the highest, determine the most common attribute that matches the transcripts + attribute, _ = votes.most_common(1)[0] + logger.info(f"Attribute '{attribute}' corresponds to transcripts.") + return attribute + + +def parse_attributes(attributes_text: str) -> Dict[str, str]: + """ + Parse the attributes column of a GTF file. + + :param attributes_text: The attributes column as a string. + :return: A dictionary of the attributes. 
+ """ + # Split the attributes string by semicolon and strip whitespace + attributes = attributes_text.strip().split(";") + attr_dict = OrderedDict() + + # Iterate over each attribute pair + for attribute in attributes: + # Split the attribute into key and value, ensuring there are two parts + parts = attribute.strip().split(" ", 1) + if len(parts) == 2: + key, value = parts + # Remove any double quotes from the value + value = value.replace('"', "") + attr_dict[key] = value + + return attr_dict + + +def map_transcripts_to_gene( + quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str +) -> bool: + """ + Map transcripts to gene names and write the output to a file. + + Parameters: + quant_type (str): The quantification method used (e.g., 'salmon'). + gtf_file (str): Path to the GTF file. + quant_dir (str): Directory where quantification files are located. + gene_id (str): The gene ID attribute in the GTF file. + extra_id_field (str): Additional ID field in the GTF file. + output_file (str): The output file path. + + Returns: + bool: True if the operation was successful, False otherwise. + """ + # Read the top transcripts based on quantification type + transcripts = read_top_transcripts(quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv") + # Discover the attribute that corresponds to transcripts in the GTF + transcript_attribute = discover_transcript_attribute(gtf_file, transcripts) + + if not transcript_attribute: + # If no attribute is found, return False + return False + + # Open GTF and output file to write the mappings + # Initialize the set to track seen combinations + seen = set() + + with open(gtf_file) as inh, open(output_file, "w") as output_handle: + # Parse each line of the GTF, mapping transcripts to genes + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\t") + attr_dict = parse_attributes(cols[8]) + if gene_id in attr_dict and transcript_attribute in attr_dict: + # Create a unique identifier for the transcript-gene combination + transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id]) + + # Check if the combination has already been seen + if transcript_gene_pair not in seen: + # If it's a new combination, write it to the output and add to the seen set + extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id]) + output_handle.write(f"{attr_dict[transcript_attribute]}\t{attr_dict[gene_id]}\t{extra_id}\n") + seen.add(transcript_gene_pair) + + return True + + +# Main function to parse arguments and call the mapping function +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Map transcripts to gene names for tximport.") + parser.add_argument("--quant_type", type=str, help="Quantification type", default="salmon") + parser.add_argument("--gtf", type=str, help="GTF file", required=True) + parser.add_argument("--quants", type=str, help="Output of quantification", required=True) + parser.add_argument("--id", type=str, help="Gene ID in the GTF file", required=True) + parser.add_argument("--extra", type=str, help="Extra ID in the GTF file") + parser.add_argument("-o", "--output", dest="output", default="tx2gene.tsv", type=str, help="File with output") + + args = parser.parse_args() + if not map_transcripts_to_gene(args.quant_type, args.gtf, args.quants, args.id, args.extra, args.output): + logger.error("Failed to map transcripts to genes.") diff --git a/bin/salmon_tximport.r b/bin/tximport.r similarity index 77% rename from bin/salmon_tximport.r 
rename to bin/tximport.r
index e4416080d..ee3d8b212 100755
--- a/bin/salmon_tximport.r
+++ b/bin/tximport.r
@@ -1,29 +1,34 @@
 #!/usr/bin/env Rscript
 
+# Written by Lorena Pantano and released under the MIT license.
+
 library(SummarizedExperiment)
 library(tximport)
 
 args = commandArgs(trailingOnly=TRUE)
-if (length(args) < 2) {
-    stop("Usage: salmon_tximport.r <coldata> <path> <sample_name>", call.=FALSE)
+if (length(args) < 4) {
+    stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE)
 }
 
 coldata = args[1]
 path = args[2]
 sample_name = args[3]
+quant_type = args[4]
+tx2gene_path = args[5]
 
 prefix = sample_name
-tx2gene = "salmon_tx2gene.tsv"
-info = file.info(tx2gene)
+
+info = file.info(tx2gene_path)
 if (info$size == 0) {
     tx2gene = NULL
 } else {
-    rowdata = read.csv(tx2gene, sep="\t", header = FALSE)
+    rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE)
     colnames(rowdata) = c("tx", "gene_id", "gene_name")
     tx2gene = rowdata[,1:2]
 }
 
-fns = list.files(path, pattern = "quant.sf", recursive = T, full.names = T)
+pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
+fns = list.files(path, pattern = pattern, recursive = T, full.names = T)
 names = basename(dirname(fns))
 names(fns) = names
@@ -32,11 +37,13 @@
 if (file.exists(coldata)) {
     coldata = coldata[match(names, coldata[,1]),]
     coldata = cbind(files = fns, coldata)
 } else {
-    message("ColData not avaliable ", coldata)
+    message("ColData not available: ", coldata)
     coldata = data.frame(files = fns, names = names)
 }
 
-txi = tximport(fns, type = "salmon", txOut = TRUE)
+dropInfReps = quant_type == "kallisto"
+
+txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
 rownames(coldata) = coldata[["names"]]
 extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]]))
 if (length(extra) > 0) {
@@ -49,8 +56,8 @@ se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = tx
                           rowData = rowdata)
 if (!is.null(tx2gene)) {
     gi = summarizeToGene(txi, tx2gene = tx2gene)
-    gi.ls = summarizeToGene(txi, tx2gene = tx2gene,countsFromAbundance="lengthScaledTPM")
-    gi.s = summarizeToGene(txi, tx2gene = tx2gene,countsFromAbundance="scaledTPM")
+    gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
+    gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
     growdata = unique(rowdata[,2:3])
     growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),]
     rownames(growdata) = growdata[["tx"]]
@@ -78,9 +85,10 @@ if(exists("gse")){
     write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
 }
 
-write.table(build_table(se,"abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
+write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
 write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
 
 # Print sessioninfo to standard out
 citation("tximeta")
 sessionInfo()
+
diff --git a/conf/modules.config b/conf/modules.config
index 3c31fb469..6e28f549b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -23,14 +23,6 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
 
-    withName: 'SAMPLESHEET_CHECK' {
-        publishDir = [
-            path: { "${params.outdir}/pipeline_info" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ?
null : filename }
-        ]
-    }
-
    withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
@@ -49,8 +41,7 @@ process {
        publishDir = [
            path: { "${params.outdir}/genome" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }
@@ -62,8 +53,7 @@ process {
        publishDir = [
            path: { "${params.outdir}/genome/index" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }
@@ -72,8 +62,7 @@
        publishDir = [
            path: { "${params.outdir}/genome" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }
@@ -81,8 +70,7 @@
        publishDir = [
            path: { "${params.outdir}/genome/index" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }
@@ -91,8 +79,16 @@
        publishDir = [
            path: { "${params.outdir}/genome/index" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
+        ]
+    }
+
+    withName: 'KALLISTO_INDEX' {
+        ext.args = { params.gencode ? '--gencode' : '' }
+        publishDir = [
+            path: { "${params.outdir}/genome/index" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }
@@ -101,8 +97,7 @@
        publishDir = [
            path: { "${params.outdir}/genome/index" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }
@@ -110,8 +105,7 @@
        publishDir = [
            path: { "${params.outdir}/genome" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }
@@ -119,17 +113,16 @@
        publishDir = [
            path: { "${params.outdir}/genome" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
        ]
    }

-    withName: 'GTF_GENE_FILTER' {
+    withName: 'GTF_FILTER' {
+        ext.args = { params.skip_gtf_transcript_filter ? '--skip_transcript_id_check' : '' }
        publishDir = [
            path: { "${params.outdir}/genome" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: params.save_reference
+            saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ?
filename : null } ] } @@ -137,8 +130,7 @@ process { publishDir = [ path: { "${params.outdir}/genome" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.save_reference + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } ] } @@ -146,8 +138,7 @@ process { publishDir = [ path: { "${params.outdir}/fastq" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.save_merged_fastq + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_merged_fastq ? filename : null } ] } } @@ -159,8 +150,7 @@ if (!params.skip_bbsplit && params.bbsplit_fasta_list) { publishDir = [ path: { "${params.outdir}/genome/index" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.save_reference + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } ] } } @@ -175,9 +165,6 @@ process { ext.args = '--record-count 1000000 --seed 1' ext.prefix = { "${meta.id}.subsampled" } publishDir = [ - path: { "${params.outdir}/sample_fastq/fastq" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: false ] } @@ -185,9 +172,6 @@ process { withName: '.*:FASTQ_SUBSAMPLE_FQ_SALMON:SALMON_QUANT' { ext.args = '--skipQuant' publishDir = [ - path: { "${params.outdir}/sample_fastq/salmon" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('_meta_info.json') ? null : filename }, enabled: false ] } @@ -244,7 +228,7 @@ if (!params.skip_trimming) { path: { "${params.outdir}/${params.trimmer}" }, mode: params.publish_dir_mode, pattern: "*.fq.gz", - enabled: params.save_trimmed + saveAs: { params.save_trimmed ? it : null } ], [ path: { "${params.outdir}/${params.trimmer}" }, @@ -259,7 +243,7 @@ if (!params.skip_trimming) { if (params.trimmer == 'fastp') { process { withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTP' { - ext.args = params.extra_fastp_args ?: '' + ext.args = { params.extra_fastp_args ?: '' } publishDir = [ [ path: { "${params.outdir}/${params.trimmer}" }, @@ -275,7 +259,7 @@ if (!params.skip_trimming) { path: { "${params.outdir}/${params.trimmer}" }, mode: params.publish_dir_mode, pattern: "*.fastq.gz", - enabled: params.save_trimmed + saveAs: { params.save_trimmed ? it : null } ] ] } @@ -286,12 +270,12 @@ if (!params.skip_trimming) { if (params.with_umi && !params.skip_umi_extract) { process { withName: 'UMITOOLS_EXTRACT' { - ext.args = [ + ext.args = { [ params.umitools_extract_method ? "--extract-method=${params.umitools_extract_method}" : '', params.umitools_bc_pattern ? "--bc-pattern='${params.umitools_bc_pattern}'" : '', params.umitools_bc_pattern2 ? "--bc-pattern2='${params.umitools_bc_pattern2}'" : '', params.umitools_umi_separator ? "--umi-separator='${params.umitools_umi_separator}'" : '' - ].join(' ').trim() + ].join(' ').trim() } publishDir = [ [ path: { "${params.outdir}/umitools" }, @@ -302,7 +286,7 @@ if (params.with_umi && !params.skip_umi_extract) { path: { "${params.outdir}/umitools" }, mode: params.publish_dir_mode, pattern: "*.fastq.gz", - enabled: params.save_umi_intermeds + saveAs: { params.save_umi_intermeds ? 
it : null } ] ] } @@ -327,7 +311,7 @@ if (!params.skip_bbsplit) { path: { "${params.outdir}/bbsplit" }, mode: params.publish_dir_mode, pattern: '*.fastq.gz', - enabled: params.save_bbsplit_reads + saveAs: { params.save_bbsplit_reads ? it : null } ] ] } @@ -348,7 +332,7 @@ if (params.remove_ribo_rna) { path: { "${params.outdir}/sortmerna" }, mode: params.publish_dir_mode, pattern: "*.fastq.gz", - enabled: params.save_non_ribo_reads + saveAs: { params.save_non_ribo_reads ? it : null } ] ] } @@ -376,25 +360,21 @@ if (!params.skip_alignment) { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: "*.bam", - enabled: ( ['star_salmon','hisat2'].contains(params.aligner) && - ( params.save_align_intermeds || - ( !params.with_umi && params.skip_markduplicates ) - ) - ) || params.save_align_intermeds || params.skip_markduplicates + saveAs: { ( ['star_salmon','hisat2'].contains(params.aligner) && + ( params.save_align_intermeds || ( !params.with_umi && params.skip_markduplicates ) ) + ) || params.save_align_intermeds || params.skip_markduplicates ? it : null } ] } withName: 'NFCORE_RNASEQ:RNASEQ:.*:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { - ext.args = params.bam_csi_index ? '-c' : '' + ext.args = { params.bam_csi_index ? '-c' : '' } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: "*.{bai,csi}", - enabled: ( ['star_salmon','hisat2'].contains(params.aligner) && - ( params.save_align_intermeds || - ( !params.with_umi && params.skip_markduplicates ) - ) - ) || params.save_align_intermeds || params.skip_markduplicates + saveAs: { ( ['star_salmon','hisat2'].contains(params.aligner) && + ( params.save_align_intermeds || ( !params.with_umi && params.skip_markduplicates ) ) + ) || params.save_align_intermeds || params.skip_markduplicates ? it : null } ] } } @@ -419,7 +399,7 @@ if (!params.skip_alignment) { } withName: '.*:BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX' { - ext.args = params.bam_csi_index ? '-c' : '' + ext.args = { params.bam_csi_index ? '-c' : '' } ext.prefix = { "${meta.id}.markdup.sorted" } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, @@ -458,27 +438,19 @@ if (!params.skip_alignment) { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bam', - enabled: ( - params.save_align_intermeds || - params.with_umi || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? it : null } ] ] } withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME:SAMTOOLS_INDEX' { - ext.args = params.bam_csi_index ? '-c' : '' + ext.args = { params.bam_csi_index ? '-c' : '' } ext.prefix = { "${meta.id}.umi_dedup.sorted" } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.{bai,csi}', - enabled: ( - params.save_align_intermeds || - params.with_umi || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? 
it : null } ] } @@ -498,7 +470,6 @@ if (!params.skip_alignment) { withName: 'BEDTOOLS_GENOMECOV' { ext.args = '-split -du' publishDir = [ - path: { "${params.outdir}/bedtools/${meta.id}" }, enabled: false ] } @@ -506,7 +477,6 @@ if (!params.skip_alignment) { withName: '.*:BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_FORWARD:UCSC_BEDCLIP' { ext.prefix = { "${meta.id}.clip.forward" } publishDir = [ - path: { "${params.outdir}/${params.aligner}" }, enabled: false ] } @@ -523,7 +493,6 @@ if (!params.skip_alignment) { withName: '.*:BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_REVERSE:UCSC_BEDCLIP' { ext.prefix = { "${meta.id}.clip.reverse" } publishDir = [ - path: { "${params.outdir}/${params.aligner}" }, enabled: false ] } @@ -542,10 +511,10 @@ if (!params.skip_alignment) { if (!params.skip_stringtie) { process { withName: 'STRINGTIE_STRINGTIE' { - ext.args = [ + ext.args = { [ '-v', params.stringtie_ignore_gtf ? '' : '-e' - ].join(' ').trim() + ].join(' ').trim() } publishDir = [ path: { "${params.outdir}/${params.aligner}/stringtie" }, mode: params.publish_dir_mode, @@ -563,7 +532,7 @@ if (!params.skip_alignment) { if (!params.skip_alignment && params.aligner == 'star_salmon') { process { withName: '.*:ALIGN_STAR:STAR_ALIGN|.*:ALIGN_STAR:STAR_ALIGN_IGENOMES' { - ext.args = [ + ext.args = { [ '--quantMode TranscriptomeSAM', '--twopassMode Basic', '--outSAMtype BAM Unsorted', @@ -576,7 +545,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { '--outSAMstrandField intronMotif', params.save_unaligned ? '--outReadsUnmapped Fastx' : '', params.extra_star_align_args ? params.extra_star_align_args.split("\\s(?=--)") : '' - ].flatten().unique(false).join(' ').trim() + ].flatten().unique(false).join(' ').trim() } publishDir = [ [ path: { "${params.outdir}/${params.aligner}/log" }, @@ -587,19 +556,19 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bam', - enabled: params.save_align_intermeds + saveAs: { params.save_align_intermeds ? it : null } ], [ path: { "${params.outdir}/${params.aligner}/unmapped" }, mode: params.publish_dir_mode, pattern: '*.fastq.gz', - enabled: params.save_unaligned + saveAs: { params.save_unaligned ? 
it : null } ] ] } withName: '.*:QUANTIFY_STAR_SALMON:SALMON_QUANT' { - ext.args = params.extra_salmon_quant_args ?: '' + ext.args = { params.extra_salmon_quant_args ?: '' } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, @@ -607,7 +576,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ] } - withName: '.*:QUANTIFY_STAR_SALMON:SALMON_TX2GENE' { + withName: '.*:QUANTIFY_STAR_SALMON:TX2GENE' { publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, @@ -615,7 +584,8 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ] } - withName: '.*:QUANTIFY_STAR_SALMON:SALMON_TXIMPORT' { + withName: '.*:QUANTIFY_STAR_SALMON:TXIMPORT' { + ext.prefix = { "${quant_type}.merged" } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, @@ -623,7 +593,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ] } - withName: '.*:QUANTIFY_STAR_SALMON:SALMON_SE_.*' { + withName: '.*:QUANTIFY_STAR_SALMON:SE_.*' { publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, @@ -641,10 +611,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bam', - enabled: ( - params.save_align_intermeds || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ] } @@ -660,10 +627,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bam', - enabled: ( - params.save_align_intermeds || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ] ] } @@ -674,10 +638,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bam', - enabled: ( - params.save_align_intermeds || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ] } @@ -686,10 +647,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bai', - enabled: ( - params.save_align_intermeds || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ] } @@ -699,10 +657,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}/samtools_stats" }, mode: params.publish_dir_mode, pattern: '*.{stats,flagstat,idxstats}', - enabled: ( - params.save_align_intermeds || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ] } @@ -723,10 +678,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bam', - enabled: ( - params.save_align_intermeds || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? 
it : null } ] ] } @@ -736,10 +688,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bai', - enabled: ( - params.save_align_intermeds || - params.save_umi_intermeds - ) + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ] } @@ -757,13 +706,12 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { if (!params.skip_qc & !params.skip_deseq2_qc) { process { withName: 'DESEQ2_QC_STAR_SALMON' { - ext.args = [ + ext.args = { [ "--id_col 1", "--sample_suffix ''", - "--outprefix deseq2", "--count_col 3", params.deseq2_vst ? '--vst TRUE' : '' - ].join(' ').trim() + ].join(' ').trim() } ext.args2 = 'star_salmon' publishDir = [ path: { "${params.outdir}/${params.aligner}/deseq2_qc" }, @@ -799,7 +747,7 @@ if (!params.skip_alignment && params.aligner == 'star_rsem') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: "*.bam", - enabled: params.save_align_intermeds + saveAs: { params.save_align_intermeds ? it : null } ], [ path: { "${params.outdir}/${params.aligner}/log" }, @@ -821,13 +769,12 @@ if (!params.skip_alignment && params.aligner == 'star_rsem') { if (!params.skip_qc & !params.skip_deseq2_qc) { process { withName: 'DESEQ2_QC_RSEM' { - ext.args = [ + ext.args = { [ "--id_col 1", "--sample_suffix ''", - "--outprefix deseq2", "--count_col 3", params.deseq2_vst ? '--vst TRUE' : '' - ].join(' ').trim() + ].join(' ').trim() } ext.args2 = 'star_rsem' publishDir = [ path: { "${params.outdir}/${params.aligner}/deseq2_qc" }, @@ -857,13 +804,13 @@ if (!params.skip_alignment && params.aligner == 'hisat2') { path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, pattern: '*.bam', - enabled: params.save_align_intermeds + saveAs: { params.save_align_intermeds ? it : null } ], [ path: { "${params.outdir}/${params.aligner}/unmapped" }, mode: params.publish_dir_mode, pattern: '*.fastq.gz', - enabled: params.save_unaligned + saveAs: { params.save_unaligned ? it : null } ] ] } @@ -944,11 +891,11 @@ if (!params.skip_alignment && !params.skip_qc) { if (!params.skip_biotype_qc && params.featurecounts_group_type) { process { withName: 'SUBREAD_FEATURECOUNTS' { - ext.args = [ + ext.args = { [ '-B -C', params.gencode ? "-g gene_type" : "-g $params.featurecounts_group_type", "-t $params.featurecounts_feature_type" - ].join(' ').trim() + ].join(' ').trim() } publishDir = [ path: { "${params.outdir}/${params.aligner}/featurecounts" }, mode: params.publish_dir_mode, @@ -1120,7 +1067,8 @@ if (!params.skip_alignment && !params.skip_qc) { if (!params.skip_multiqc) { process { withName: 'MULTIQC' { - ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' + ext.args = { params.multiqc_title ? 
"--title \"$params.multiqc_title\"" : '' } + ext.prefix = "multiqc_report" publishDir = [ path: { [ "${params.outdir}/multiqc", @@ -1134,21 +1082,39 @@ if (!params.skip_multiqc) { } // -// Salmon pseudo-alignment options +// Salmon/ Kallisto pseudoalignment options // if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { process { - withName: '.*:QUANTIFY_SALMON:SALMON_QUANT' { - ext.args = params.extra_salmon_quant_args ?: '' + + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' { + ext.args = { params.extra_salmon_quant_args ?: '' } publishDir = [ path: { "${params.outdir}/${params.pseudo_aligner}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('_meta_info.json') ? null : filename } ] } + } +} - withName: '.*:QUANTIFY_SALMON:SALMON_TX2GENE' { +if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'kallisto') { + process { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:KALLISTO_QUANT' { + ext.args = params.extra_kallisto_quant_args ?: '' + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('.run_info.json') || filename.endsWith('.log') ? null : filename } + ] + } + } +} + +if (!params.skip_pseudo_alignment) { + process { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TX2GENE' { publishDir = [ path: { "${params.outdir}/${params.pseudo_aligner}" }, mode: params.publish_dir_mode, @@ -1156,7 +1122,8 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { ] } - withName: '.*:QUANTIFY_SALMON:SALMON_TXIMPORT' { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TXIMPORT' { + ext.prefix = { "${quant_type}.merged" } publishDir = [ path: { "${params.outdir}/${params.pseudo_aligner}" }, mode: params.publish_dir_mode, @@ -1164,7 +1131,7 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { ] } - withName: '.*:QUANTIFY_SALMON:SALMON_SE_.*' { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SE_.*' { publishDir = [ path: { "${params.outdir}/${params.pseudo_aligner}" }, mode: params.publish_dir_mode, @@ -1175,15 +1142,14 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { if (!params.skip_qc & !params.skip_deseq2_qc) { process { - withName: 'DESEQ2_QC_SALMON' { - ext.args = [ + withName: 'DESEQ2_QC_PSEUDO' { + ext.args = { [ "--id_col 1", "--sample_suffix ''", - "--outprefix deseq2", "--count_col 3", params.deseq2_vst ? '--vst TRUE' : '' - ].join(' ').trim() - ext.args2 = 'salmon' + ].join(' ').trim() } + ext.args2 = { params.pseudo_aligner } publishDir = [ path: { "${params.outdir}/${params.pseudo_aligner}/deseq2_qc" }, mode: params.publish_dir_mode, diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config deleted file mode 100644 index f584b3926..000000000 --- a/conf/public_aws_ecr.config +++ /dev/null @@ -1,72 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - AWS ECR Config -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Config to set public AWS ECR images wherever possible - This improves speed when running on AWS infrastructure. - Use this as an example template when using your own private registry. 
----------------------------------------------------------------------------------------- -*/ - -docker.registry = 'public.ecr.aws' -podman.registry = 'public.ecr.aws' - -process { - withName: 'CAT_ADDITIONAL_FASTA' { - container = 'quay.io/biocontainers/python:3.9--1' - } - withName: 'CAT_FASTQ' { - container = 'quay.io/nf-core/ubuntu:20.04' - } - withName: 'DESEQ2_QC' { - container = 'quay.io/biocontainers/mulled-v2-8849acf39a43cdd6c839a369a74c0adc823e2f91:ab110436faf952a33575c64dd74615a84011450b-0' - } - withName: 'GTF2BED' { - container = 'quay.io/biocontainers/perl:5.26.2' - } - withName: 'GTF_GENE_FILTER' { - container = 'quay.io/biocontainers/python:3.9--1' - } - withName: 'GUNZIP' { - container = 'quay.io/nf-core/ubuntu:20.04' - } - withName: 'HISAT2_ALIGN' { - container = 'quay.io/biocontainers/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' - } - withName: 'MULTIQC_CUSTOM_BIOTYPE' { - container = 'quay.io/biocontainers/python:3.9--1' - } - withName: 'PREPROCESS_TRANSCRIPTS_FASTA_GENCODE' { - container = 'quay.io/nf-core/ubuntu:20.04' - } - withName: 'RSEM_CALCULATEEXPRESSION' { - container = 'quay.io/biocontainers/mulled-v2-cf0123ef83b3c38c13e3b0696a3f285d3f20f15b:64aad4a4e144878400649e71f42105311be7ed87-0' - } - withName: 'RSEM_MERGE_COUNTS' { - container = 'quay.io/nf-core/ubuntu:20.04' - } - withName: 'RSEM_PREPAREREFERENCE' { - container = 'quay.io/biocontainers/mulled-v2-cf0123ef83b3c38c13e3b0696a3f285d3f20f15b:64aad4a4e144878400649e71f42105311be7ed87-0' - } - withName: 'SALMON_TX2GENE' { - container = 'quay.io/biocontainers/python:3.9--1' - } - withName: 'SAMPLESHEET_CHECK' { - container = 'quay.io/biocontainers/python:3.9--1' - } - withName: 'STAR_ALIGN' { - container = 'quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' - } - withName: 'STAR_ALIGN_IGENOMES' { - container = 'quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0' - } - withName: 'STAR_GENOMEGENERATE' { - container = 'quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' - } - withName: 'STAR_GENOMEGENERATE_IGENOMES' { - container = 'quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0' - } - withName: 'UNTAR' { - container = 'quay.io/nf-core/ubuntu:20.04' - } -} diff --git a/conf/test.config b/conf/test.config index d0430eacb..f9154ba31 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,19 +21,19 @@ params { // Input data - input = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.10/samplesheet_test.csv" + input = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/samplesheet/v3.10/samplesheet_test.csv" // Genome references - fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fasta" - gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf.gz" - gff = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gff.gz" - transcript_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/transcriptome.fasta" - additional_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/gfp.fa.gz" - - bbsplit_fasta_list = 
"https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/bbsplit_fasta_list.txt" - hisat2_index = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/hisat2.tar.gz" - salmon_index = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/salmon.tar.gz" - rsem_index = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/rsem.tar.gz" + fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/genome.fasta" + gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/genes_with_empty_tid.gtf.gz" + gff = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/genes.gff.gz" + transcript_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/transcriptome.fasta" + additional_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/gfp.fa.gz" + + bbsplit_fasta_list = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/bbsplit_fasta_list.txt" + hisat2_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/hisat2.tar.gz" + salmon_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/salmon.tar.gz" + rsem_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/rsem.tar.gz" // Other parameters skip_bbsplit = false diff --git a/docs/images/nf-core-rnaseq_metro_map_grey.png b/docs/images/nf-core-rnaseq_metro_map_grey.png index 0a3645f81..0dbf23f81 100644 Binary files a/docs/images/nf-core-rnaseq_metro_map_grey.png and b/docs/images/nf-core-rnaseq_metro_map_grey.png differ diff --git a/docs/images/nf-core-rnaseq_metro_map_grey.svg b/docs/images/nf-core-rnaseq_metro_map_grey.svg index 2e2a07374..b17e5844c 100644 --- a/docs/images/nf-core-rnaseq_metro_map_grey.svg +++ b/docs/images/nf-core-rnaseq_metro_map_grey.svg @@ -7,7 +7,7 @@ viewBox="0 0 646.4851 269.92565" version="1.1" id="svg8" - inkscape:version="1.2.2 (1:1.2.2+202212051552+b0a8486541)" + inkscape:version="1.3 (1:1.3+202307231459+0e150ed6c4)" sodipodi:docname="nf-core-rnaseq_metro_map_grey.svg" inkscape:export-filename="nf-core-rnaseq_metro_map_grey.png" inkscape:export-xdpi="89" @@ -655,14 +655,14 @@ borderopacity="1.0" inkscape:pageopacity="0.0" inkscape:pageshadow="2" - inkscape:zoom="0.40305088" - inkscape:cx="916.75771" - inkscape:cy="282.8427" + inkscape:zoom="1.14" + inkscape:cx="1333.7719" + inkscape:cy="345.17543" inkscape:document-units="mm" inkscape:current-layer="layer1" showgrid="false" inkscape:window-width="1920" - inkscape:window-height="1052" + inkscape:window-height="1128" inkscape:window-x="0" inkscape:window-y="0" inkscape:window-maximized="1" @@ -684,7 +684,11 @@ type="xygrid" id="grid66457" originx="64.401789" - originy="-15.824357" />image/svg+xml23SalmonKallistoMultiQCsubsamplefastq(fq)TSVBIGWIGHTMLPseudo-aligner: Salmon, Quantification: SalmonAligner: HISAT2, Quantification: NoneAligner: STAR, Quantification: Salmon (default)Pseudo-aligner: Kallisto, Quantification: KallistoAligner: STAR, Quantification: RSEMMETHOD1. Pre-processing4. 
diff --git a/docs/output.md b/docs/output.md
index aee68700f..a4c559b3b 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -41,8 +41,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
   - [featureCounts](#featurecounts) - Read counting relative to gene biotype
   - [DESeq2](#deseq2) - PCA plot and sample pairwise distance heatmap and dendrogram
 - [MultiQC](#multiqc) - Present QC for raw reads, alignment, read counting and sample similarity
-- [Pseudo-alignment and quantification](#pseudo-alignment-and-quantification)
-  - [Salmon](#salmon) - Wicked fast gene and isoform quantification relative to the transcriptome
+- [Pseudoalignment and quantification](#pseudoalignment-and-quantification)
+  - [Salmon](#pseudoalignment) - Wicked fast gene and isoform quantification relative to the transcriptome
+  - [Kallisto](#pseudoalignment) - Near-optimal probabilistic RNA-seq quantification
 - [Workflow reporting and genomes](#workflow-reporting-and-genomes)
   - [Reference genome files](#reference-genome-files) - Saving reference genome indices/files
   - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -180,7 +181,7 @@ When `--remove_ribo_rna` is specified, the pipeline uses [SortMeRNA](https://git
 
 ## Alignment and quantification
 
-### STAR and Salmon
+### STAR, Salmon and Kallisto
 <details markdown="1">
 <summary>Output files</summary>
@@ -199,12 +200,12 @@ When `--remove_ribo_rna` is specified, the pipeline uses [SortMeRNA](https://git
 
 [STAR](https://github.com/alexdobin/STAR) is a read aligner designed for splice aware mapping typical of RNA sequencing data. STAR stands for *S*pliced *T*ranscripts *A*lignment to a *R*eference, and has been shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. Using `--aligner star_salmon` is the default alignment and quantification option.
 
-[Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) from [Ocean Genomics](https://oceangenomics.com/) is a tool for wicked-fast transcript quantification from RNA-seq data. It requires a set of target transcripts (either from a reference or de-novo assembly) in order to perform quantification. All you need to run Salmon is a FASTA file containing your reference transcripts and a set of FASTA/FASTQ/BAM file(s) containing your reads. The transcriptome-level BAM files generated by STAR are provided to Salmon for downstream quantification. You can of course also provide FASTQ files directly as input to Salmon in order to pseudo-align and quantify your data by providing the `--pseudo_aligner salmon` parameter. The results generated by the pipeline are exactly the same whether you provide BAM or FASTQ input so please see the [Salmon](#salmon) results section for more details.
-
 The STAR section of the MultiQC report shows a bar plot with alignment rates: good samples should have most reads as _Uniquely mapped_ and few _Unmapped_ reads.
 
 ![MultiQC - STAR alignment scores plot](images/mqc_star.png)
 
+[Salmon](https://salmon.readthedocs.io/en/latest/salmon.html), from [Ocean Genomics](https://oceangenomics.com/), and [Kallisto](https://pachterlab.github.io/kallisto/), from the Pachter Lab, are provided as options for pseudoalignment. Both quantify reads against an index generated from a reference set of target transcripts. By default, the transcriptome-level BAM files generated by STAR are provided to Salmon for downstream quantification; Kallisto is not an option here, since it does not accept BAM input. You can, however, provide FASTQ files directly as input to either Salmon or Kallisto in order to pseudoalign and quantify your data by providing the `--pseudo_aligner salmon` or `--pseudo_aligner kallisto` parameter. See the [Salmon](#pseudoalignment) and [Kallisto](#pseudoalignment) results sections for more details.
+
 ### STAR via RSEM
@@ -668,25 +669,34 @@ The plot on the left hand side shows the standard PC plot - notice the variable
 
 Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQC, Cutadapt, SortMeRNA, STAR, RSEM, HISAT2, Salmon, SAMtools, Picard, RSeQC, Qualimap, Preseq and featureCounts. Additionally, various custom content has been added to the report to assess the output of dupRadar, DESeq2 and featureCounts biotypes, and to highlight samples failing a minimum mapping threshold or those that failed to match the strand-specificity provided in the input samplesheet. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
 
-## Pseudo-alignment and quantification
+## Pseudoalignment and quantification
 
-### Salmon
+### Pseudoalignment
+
+The principal output files are the same between Salmon and Kallisto:
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `<pseudo_aligner>/`
+  - `<pseudo_aligner>.merged.gene_counts.tsv`: Matrix of gene-level raw counts across all samples.
+  - `<pseudo_aligner>.merged.gene_tpm.tsv`: Matrix of gene-level TPM values across all samples.
+  - `<pseudo_aligner>.merged.gene_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated counts (`counts`) and transcript length (`length`) in the assays slot for genes.
+  - `<pseudo_aligner>.merged.gene_counts_scaled.tsv`: Matrix of gene-level library size-scaled counts across all samples.
+  - `<pseudo_aligner>.merged.gene_counts_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated library size-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes.
+  - `<pseudo_aligner>.merged.gene_counts_length_scaled.tsv`: Matrix of gene-level length-scaled counts across all samples.
+  - `<pseudo_aligner>.merged.gene_counts_length_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated length-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes.
+  - `<pseudo_aligner>.merged.transcript_counts.tsv`: Matrix of isoform-level raw counts across all samples.
+  - `<pseudo_aligner>.merged.transcript_tpm.tsv`: Matrix of isoform-level TPM values across all samples.
+  - `<pseudo_aligner>.merged.transcript_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated isoform-level raw counts (`counts`) and transcript length (`length`) in the assays slot for transcripts.
+  - `tx2gene.tsv`: Tab-delimited file containing gene to transcript ID mappings.
+
+</details>
+
+An additional subset of files is distinct to each tool. For Salmon:
 <summary>Output files</summary>
 
-- `salmon/`
-  - `salmon.merged.gene_counts.tsv`: Matrix of gene-level raw counts across all samples.
-  - `salmon.merged.gene_tpm.tsv`: Matrix of gene-level TPM values across all samples.
-  - `salmon.merged.gene_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated counts (`counts`) and transcript length (`length`) in the assays slot for genes.
-  - `salmon.merged.gene_counts_scaled.tsv`: Matrix of gene-level library size-scaled counts across all samples.
-  - `salmon.merged.gene_counts_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated library size-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes.
-  - `salmon.merged.gene_counts_length_scaled.tsv`: Matrix of gene-level length-scaled counts across all samples.
-  - `salmon.merged.gene_counts_length_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated length-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes.
-  - `salmon.merged.transcript_counts.tsv`: Matrix of isoform-level raw counts across all samples.
-  - `salmon.merged.transcript_tpm.tsv`: Matrix of isoform-level TPM values across all samples.
-  - `salmon.merged.transcript_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated isoform-level raw counts (`counts`) and transcript length (`length`) in the assays slot for transcripts.
-  - `salmon_tx2gene.tsv`: Tab-delimited file containing gene to transcripts ids mappings.
 - `salmon/<SAMPLE>/`
   - `aux_info/`: Auxiliary info e.g. versions and number of mapped reads.
   - `cmd_info.json`: Information about the Salmon quantification command, version and options.
@@ -695,14 +705,25 @@ Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQ
   - `logs/`: Contains the file `salmon_quant.log` giving a record of Salmon's quantification.
   - `quant.genes.sf`: Salmon _gene_-level quantification of the sample, including feature length, effective length, TPM, and number of reads.
   - `quant.sf`: Salmon _transcript_-level quantification of the sample, including feature length, effective length, TPM, and number of reads.
+</details>
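
The `tx2gene.tsv` in the shared output list above is a plain three-column table — transcript ID, gene ID, gene name, which are the column names `bin/tximport.r` assigns when reading it — so it is straightforward to consume outside the pipeline. A minimal, illustrative Python sketch (the input path is assumed to be in the working directory):

```python
# Illustrative only: load the three-column tx2gene.tsv written by
# bin/tx2gene.py (transcript ID, gene ID, gene name; no header line)
# into a transcript -> gene lookup table.
import csv


def load_tx2gene(path: str) -> dict:
    """Map transcript ID -> (gene ID, gene name)."""
    mapping = {}
    with open(path, newline="") as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if len(row) >= 3:
                mapping[row[0]] = (row[1], row[2])
    return mapping


if __name__ == "__main__":
    tx2gene = load_tx2gene("tx2gene.tsv")
    print(f"{len(tx2gene)} transcripts mapped to genes")
```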
-
+... and Kallisto:
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `kallisto/<SAMPLE>/`
+  - `abundance.h5`: a HDF5 binary file containing run info, abundance estimates, bootstrap estimates, and transcript length information. This file can be read in by [sleuth](https://pachterlab.github.io/sleuth/about).
+  - `abundance.tsv`: a plaintext file of the abundance estimates. It does not contain bootstrap estimates.
+  - `run_info.json`: a JSON file containing information about the run.
+  - `kallisto_quant.log`: standard output from the Kallisto process per sample.
+</details>
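
Because `abundance.tsv` is plaintext, per-sample Kallisto estimates can be inspected without sleuth. A small sketch, assuming the usual Kallisto header (`target_id`, `length`, `eff_length`, `est_counts`, `tpm`) and the per-sample layout described above — verify both against your own output. This mirrors the file-name branching in `bin/tx2gene.py` and `bin/tximport.r`, where Salmon samples yield `quant.sf` and Kallisto samples yield `abundance.tsv`:

```python
# Sketch: locate per-sample quantification files and summarise one of them.
# Assumes quant_dir/<sample>/{quant.sf|abundance.tsv}, as in this pipeline's
# output layout; the abundance.tsv column names are an assumption to check.
import csv
import glob
import os


def find_quant_files(quant_dir: str, quant_type: str) -> list:
    """Salmon writes quant.sf per sample; Kallisto writes abundance.tsv."""
    pattern = "quant.sf" if quant_type == "salmon" else "abundance.tsv"
    return sorted(glob.glob(os.path.join(quant_dir, "*", pattern)))


def total_est_counts(abundance_tsv: str) -> float:
    """Sum Kallisto's estimated counts for one sample."""
    total = 0.0
    with open(abundance_tsv, newline="") as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            total += float(row["est_counts"])
    return total


if __name__ == "__main__":
    for path in find_quant_files("kallisto", "kallisto"):
        print(path, total_est_counts(path))
```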
-As described in the [STAR and Salmon](#star-and-salmon) section, you can choose to pseudo-align and quantify your data with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) by providing the `--pseudo_aligner salmon` parameter. By default, Salmon is run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon in isolation. If Salmon is run in isolation, the outputs mentioned above will be found in a folder named `salmon`. If Salmon is run alongside STAR, the folder will be named `star_salmon`.
+As described in the [STAR and Salmon](#star-and-salmon) section, you can choose to pseudoalign and quantify your data with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by providing the `--pseudo_aligner` parameter. By default, Salmon is run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. If Salmon or Kallisto is run in isolation, the outputs mentioned above will be found in a folder named `salmon` or `kallisto`. If Salmon is run alongside STAR, the folder will be named `star_salmon`.
 
 Transcripts with large inferential uncertainty won't be assigned the exact number of reads reproducibly every time Salmon is run. Read more about this on the [nf-core/rnaseq](https://github.com/nf-core/rnaseq/issues/585) and [salmon](https://github.com/COMBINE-lab/salmon/issues/613) GitHub repos.
 
-The [tximport](https://bioconductor.org/packages/release/bioc/html/tximport.html) package is used in this pipeline to summarise the results generated by Salmon into matrices for use with downstream differential analysis packages. We use tximport with different options to summarize count and TPM quantifications at the gene- and transcript-level. Please see [#499](https://github.com/nf-core/rnaseq/issues/499) for discussion and links regarding which counts are suitable for different types of analysis.
+The [tximport](https://bioconductor.org/packages/release/bioc/html/tximport.html) package is used in this pipeline to summarise the results generated by Salmon or Kallisto into matrices for use with downstream differential analysis packages. We use tximport with different options to summarize count and TPM quantifications at the gene- and transcript-level. Please see [#499](https://github.com/nf-core/rnaseq/issues/499) for discussion and links regarding which counts are suitable for different types of analysis.
 
 According to the `tximport` documentation you can do one of the following:
@@ -728,6 +749,7 @@ According to the `tximport` documentation you can do one of the following:
   - `hisat2/`: Directory containing HISAT2 indices.
   - `rsem/`: Directory containing STAR and RSEM indices.
   - `salmon/`: Directory containing Salmon indices.
+  - `kallisto/`: Directory containing Kallisto indices.
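
For intuition on the `countsFromAbundance` options that `bin/tximport.r` passes to `summarizeToGene()` above, the toy sketch below mirrors what the tximport documentation describes: `scaledTPM` rescales TPMs so they sum to the library size, while `lengthScaledTPM` first weights TPMs by average feature length and then rescales. It is an illustration with made-up numbers, not the pipeline's actual R code path:

```python
# Toy sketch of tximport's countsFromAbundance scalings:
#   scaledTPM:       rescale TPMs so they sum to the library size
#   lengthScaledTPM: weight TPMs by feature length first, then rescale
def scaled_tpm(tpm, library_size):
    total = sum(tpm)
    return [x / total * library_size for x in tpm]


def length_scaled_tpm(tpm, lengths, library_size):
    weighted = [x * length for x, length in zip(tpm, lengths)]
    total = sum(weighted)
    return [w / total * library_size for w in weighted]


if __name__ == "__main__":
    tpm = [10.0, 40.0, 50.0]           # toy gene-level TPMs
    lengths = [1000.0, 2000.0, 500.0]  # toy average transcript lengths
    print(scaled_tpm(tpm, 1_000_000))
    print(length_scaled_tpm(tpm, lengths, 1_000_000))
```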
@@ -742,6 +764,7 @@ A number of genome-specific files are generated by the pipeline because they are
   - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
   - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
   - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
+  - Parameters used by the pipeline run: `params.json`.
diff --git a/docs/usage.md b/docs/usage.md
index 856e00c7f..524acba9b 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -65,17 +65,19 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p
 
 By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) (i.e. `--aligner star_salmon`) to map the raw FastQ reads to the reference genome, project the alignments onto the transcriptome and to perform the downstream BAM-level quantification with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). STAR is fast but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. Since the [RSEM](https://github.com/deweylab/RSEM) (i.e. `--aligner star_rsem`) workflow in the pipeline also uses STAR you should use the [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) aligner (i.e. `--aligner hisat2`) if you have memory limitations.
 
-You also have the option to pseudo-align and quantify your data with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) by providing the `--pseudo_aligner salmon` parameter. Salmon will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet, however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html).
+You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by passing `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively.
+
+The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet; however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). Similarly, strandedness is taken from the sample sheet or calculated automatically, and passed to Kallisto on a per-library basis, but you can apply a global override by setting the Kallisto strandedness parameters in `--extra_kallisto_quant_args`, e.g. `--extra_kallisto_quant_args '--fr-stranded'`; see the [Kallisto documentation](https://pachterlab.github.io/kallisto/manual) for the available options.

 When running Salmon in mapping-based mode via `--pseudo_aligner salmon`, the entire genome of the organism is used by default for the decoy-aware transcriptome when creating the indices (see the second bulleted option in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)).

-Two additional parameters `--extra_star_align_args` and `--extra_salmon_quant_args` were added in v3.10 of the pipeline that allow you to append any custom parameters to the STAR align and Salmon quant commands, respectively. Note, the `--seqBias` and `--gcBias` are not provided to Salmon quant by default so you can provide these via `--extra_salmon_quant_args '--seqBias --gcBias'` if required.
+Two additional parameters, `--extra_star_align_args` and `--extra_salmon_quant_args`, were added in v3.10 of the pipeline to allow you to append any custom parameters to the STAR align and Salmon quant commands, respectively. Note that `--seqBias` and `--gcBias` are not provided to Salmon quant by default, so you can provide these via `--extra_salmon_quant_args '--seqBias --gcBias'` if required. You can now also supply additional arguments to Kallisto via `--extra_kallisto_quant_args`.
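+
+For example, a minimal sketch enabling Salmon's bias models (the samplesheet and reference paths are again placeholders):
+
+```bash
+nextflow run nf-core/rnaseq \
+    --input samplesheet.csv \
+    --outdir ./results \
+    --fasta genome.fa \
+    --gtf genes.gtf \
+    --pseudo_aligner salmon \
+    --extra_salmon_quant_args '--seqBias --gcBias' \
+    -profile docker
+```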

-> **NB:** You can use `--skip_alignment --skip_pseudo_alignment` if you only want to run the pre-processing QC steps in the pipeline like FastQ, trimming etc. This will skip alignment, pseudo-alignment and any post-alignment processing steps.
+> **NB:** You can use `--skip_alignment --skip_pseudo_alignment` if you only want to run the pre-processing QC steps in the pipeline (e.g. FastQC and trimming). This will skip alignment, pseudoalignment and any post-alignment processing steps.

 ## Quantification options

-The current options align with STAR and quantify using either Salmon (`--aligner star_salmon`) / RSEM (`--aligner star_rsem`). You also have the option to pseudo-align and quantify your data with Salmon by providing the `--pseudo_aligner salmon` parameter.
+The current options align with STAR and quantify using either Salmon (`--aligner star_salmon`) or RSEM (`--aligner star_rsem`). You also have the option to pseudoalign and quantify your data with Salmon or Kallisto by providing the `--pseudo_aligner salmon` or `--pseudo_aligner kallisto` parameter, respectively.

 Since v3.0 of the pipeline, featureCounts is no longer used to perform gene/transcript quantification, however it is still used to generate QC metrics based on [biotype](http://www.ensembl.org/info/genome/genebuild/biotypes.html) information available within GFF/GTF genome annotation files. This decision was made primarily because of the limitations of featureCounts to appropriately quantify gene expression data. Please see [Zhao et al., 2015](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0141910#pone-0141910-t001) and [Soneson et al., 2015](https://f1000research.com/articles/4-1521/v1).

@@ -91,9 +93,9 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identical

 #### Examples:

-| UMI type     | Source                                                                                                                                                                                                          | Pipeline parameters                                                                                            |
-| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
-| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"`                                                      |
+| UMI type     | Source                                                                                                                                                                                                           | Pipeline parameters                                                                                            |
+| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
+| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf)  | `--with_umi --skip_umi_extract --umitools_umi_separator ":"`                                                      |
 | In sequence  | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries)       | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{6})(?P<discard_1>.{4}).*"`      |
 | In sequence  | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)<br>> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"`<br>Optional: `--clip_r2 9 --three_prime_clip_r2 12` |
 | In sequence  | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P<umi_1>.{8})(?P<discard_1>.{6}).*"` |

@@ -131,29 +133,48 @@ If unique molecular identifiers were used to prepare the library, add the following command line arguments:

 --umitools_bc_pattern "^(?P<umi_1>.{6})(?P<discard_1>.{4}).*"
 ```

-## Reference genome files
+## Reference genome options

 Please refer to the [nf-core website](https://nf-co.re/usage/reference_genomes) for general usage docs and guidelines regarding reference genomes.

-The minimum reference genome requirements for this pipeline are a FASTA and GTF file, all other files required to run the pipeline can be generated from these files. However, it is more storage and compute friendly if you are able to re-use reference genome files as efficiently as possible. It is recommended to use the `--save_reference` parameter if you are using the pipeline to build new indices (e.g. custom genomes that are unavailable on [AWS iGenomes](https://nf-co.re/usage/reference_genomes#custom-genomes)) so that you can save them somewhere locally. The index building step can be quite a time-consuming process and it permits their reuse for future runs of the pipeline to save disk space. You can then either provide the appropriate reference genome files on the command-line via the appropriate parameters (e.g. `--star_index '/path/to/STAR/index/'`) or via a custom config file. Another option is to run the pipeline once with `--save_reference --skip_alignment --skip_pseudo_alignment` to generate and save all of the required reference files and indices to the results directory. You can then move the reference files in `/genome/` to a more permanent location and use these paths to override the relevant parameters in the pipeline e.g. `--star_index`.
+### Explicit reference file specification (recommended)
+
+The minimum reference genome requirements for this pipeline are a FASTA and GTF file; all other files required to run the pipeline can be generated from these. For example, the latest reference files for human can be derived from Ensembl like:
+
+```
+latest_release=$(curl -s 'http://rest.ensembl.org/info/software?content-type=application/json' | grep -o '"release":[0-9]*' | cut -d: -f2)
+wget -L ftp://ftp.ensembl.org/pub/release-${latest_release}/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz
+wget -L ftp://ftp.ensembl.org/pub/release-${latest_release}/gtf/homo_sapiens/Homo_sapiens.GRCh38.${latest_release}.gtf.gz
+```
+
+These files can then be specified to the workflow with the `--fasta` and `--gtf` parameters (see the example command after the notes below).
+
+Notes:
+
+- Compressed reference files are supported by the pipeline, i.e. standard files with the `.gz` extension and index folders with the `.tar.gz` extension.
-- If `--genome` is provided then the FASTA and GTF files (and existing indices) will be automatically obtained from AWS-iGenomes unless these have already been downloaded locally in the path specified by `--igenomes_base`.
 - If `--gff` is provided as input then this will be converted to a GTF file, or the latter will be used if both are provided.
 - If `--gene_bed` is not provided then it will be generated from the GTF file.
 - If `--additional_fasta` is provided then the features in this file (e.g. ERCC spike-ins) will be automatically concatenated onto both the reference FASTA file as well as the GTF annotation before building the appropriate indices.
+- When using `--aligner star_rsem`, both the STAR and RSEM indices should be present in the path specified by `--rsem_index` (see [#568](https://github.com/nf-core/rnaseq/issues/568)).
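+
+For example, a minimal sketch using the Ensembl files downloaded above (the release number `110` stands in for whatever `latest_release` resolved to, and the samplesheet path is a placeholder):
+
+```bash
+nextflow run nf-core/rnaseq \
+    --input samplesheet.csv \
+    --outdir ./results \
+    --fasta Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz \
+    --gtf Homo_sapiens.GRCh38.110.gtf.gz \
+    -profile docker
+```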
+
+#### Indices
+
+By default, indices are generated dynamically by the workflow for tools such as STAR and Salmon. Since indexing is an expensive process in time and resources, you should ensure that it is only done once, by retaining the indices generated from each batch of reference files:

-When using `--aligner star_rsem`, both the STAR and RSEM indices should be present in the path specified by `--rsem_index` (see [#568](https://github.com/nf-core/rnaseq/issues/568)).
+- the `--save_reference` parameter will save your indices in your results directory
+- the `--skip_alignment --skip_pseudo_alignment` parameters will disable the other processes if you'd like to do an 'indexing only' workflow run

-> **NB:** Compressed reference files are also supported by the pipeline i.e. standard files with the `.gz` extension and indices folders with the `tar.gz` extension.
+Once you have the indices from a workflow run, you should save them somewhere central and reuse them in subsequent runs using custom config files or command line parameters such as `--star_index '/path/to/STAR/index/'` (see the sketch below).

-As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default) the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)).
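+
+As an illustrative sketch (all paths are placeholders), an 'indexing only' run followed by a run that reuses the saved STAR index might look like:
+
+```bash
+# One-off run: build and save the indices without aligning anything
+nextflow run nf-core/rnaseq \
+    --input samplesheet.csv \
+    --outdir ./reference_results \
+    --fasta genome.fa \
+    --gtf genes.gtf \
+    --save_reference \
+    --skip_alignment \
+    --skip_pseudo_alignment \
+    -profile docker
+
+# Subsequent runs: reuse the index saved to a permanent location
+nextflow run nf-core/rnaseq \
+    --input samplesheet.csv \
+    --outdir ./results \
+    --fasta genome.fa \
+    --gtf genes.gtf \
+    --star_index '/path/to/STAR/index/' \
+    -profile docker
+```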
+#### Gencode

 If you are using [GENCODE](https://www.gencodegenes.org/) reference genome files please specify the `--gencode` parameter because the format of these files is slightly different to ENSEMBL genome files:

 - The `--gtf_group_features_type` parameter will automatically be set to `gene_type` as opposed to `gene_biotype`.
 - If you are running Salmon, the `--gencode` flag will also be passed to the index building step to overcome parsing issues resulting from the transcript IDs in GENCODE fasta files being separated by vertical pipes (`|`) instead of spaces (see [this issue](https://github.com/COMBINE-lab/salmon/issues/15)).

-## Prokaryotic genome annotations
+#### Prokaryotic genome annotations

 This pipeline uses featureCounts to generate QC metrics based on [biotype](http://www.ensembl.org/info/genome/genebuild/biotypes.html) information available within GFF/GTF genome annotation files. The format of these annotation files can vary significantly depending on the source of the annotation and the type of organism. The default settings in the pipeline are tailored towards Ensembl GTF annotations available for eukaryotic genomes. Prokaryotic genome annotations tend to be distributed in GFF format, which is structured differently in terms of the feature naming conventions. There are a number of ways you can tune the behaviour of the pipeline to cater for differences/absence of biotype information:

@@ -164,14 +185,41 @@ This pipeline uses featureCounts to generate QC metrics based on [biotype](http:

 Please get in touch with us on the #rnaseq channel in the [nf-core Slack workspace](https://nf-co.re/join) if you are having problems or need any advice.

+### iGenomes (not recommended)
+
+If the `--genome` parameter is provided (e.g. `--genome GRCh37`) then the FASTA and GTF files (and existing indices) will be automatically obtained from AWS-iGenomes unless these have already been downloaded locally in the path specified by `--igenomes_base`.
+
+However, this is no longer recommended because:
+
+- Gene annotations in iGenomes are extremely out of date. This can be particularly problematic for RNA-seq analysis, which relies on accurate gene annotation.
+- Some iGenomes references (e.g. GRCh38) point to annotation files that use gene symbols as the primary identifier. This can cause issues for downstream analysis, such as the nf-core [differential abundance](https://nf-co.re/differentialabundance) workflow, where a conventional gene identifier distinct from the symbol is expected.
+
+Notes:
+
+- As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default), the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)).
+
+### GTF filtering
+
+By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline, this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.
+
 ## Running the pipeline

 The typical command for running the pipeline is as follows:

 ```bash
-nextflow run nf-core/rnaseq --input <SAMPLESHEET> --outdir <OUTDIR> --genome GRCh37 -profile docker
+nextflow run \
+    nf-core/rnaseq \
+    --input <SAMPLESHEET> \
+    --outdir <OUTDIR> \
+    --gtf <GTF> \
+    --fasta <FASTA> \
+    --igenomes_ignore \
+    --genome null \
+    -profile docker
 ```

+> **NB:** Loading the iGenomes configuration remains the default for consistency with other workflows, but it should be disabled when not using iGenomes by applying the recommended usage above (i.e. `--igenomes_ignore --genome null`).
+
 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.

 Note that the pipeline will create the following files in your working directory:

@@ -187,7 +235,9 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than

 Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.

-> ⚠️ Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+:::warning
+Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+:::

 The above pipeline run specified with a params file in yaml format:

@@ -224,11 +274,15 @@ This version number will be logged in reports when you run the pipeline, so that

 To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter (see the sketch below).
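+
+As a minimal sketch (all parameter values are placeholders), such a parameter file could be written and used like:
+
+```bash
+cat > params.yaml << 'EOF'
+input: 'samplesheet.csv'
+outdir: './results'
+fasta: 'genome.fa'
+gtf: 'genes.gtf'
+EOF
+
+nextflow run nf-core/rnaseq -params-file params.yaml -profile docker
+```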
-> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
+:::tip
+If you wish to share such a profile (e.g. to upload it as supplementary material for an academic publication), make sure NOT to include cluster-specific paths to files or institution-specific profiles.
+:::

 ## Core Nextflow arguments

-> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).
+:::note
+These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).
+:::

 ### `-profile`

@@ -236,7 +290,9 @@ Use this parameter to choose a configuration profile. Profiles can give configuration

 Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.

-> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.
+:::info
+We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility; however, when this is not possible, Conda is also supported.
+:::

 The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).

diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy
deleted file mode 100755
index 9b34804d6..000000000
--- a/lib/NfcoreSchema.groovy
+++ /dev/null
@@ -1,530 +0,0 @@
-//
-// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template.
-// - -import nextflow.Nextflow -import org.everit.json.schema.Schema -import org.everit.json.schema.loader.SchemaLoader -import org.everit.json.schema.ValidationException -import org.json.JSONObject -import org.json.JSONTokener -import org.json.JSONArray -import groovy.json.JsonSlurper -import groovy.json.JsonBuilder - -class NfcoreSchema { - - // - // Resolve Schema path relative to main workflow directory - // - public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { - return "${workflow.projectDir}/${schema_filename}" - } - - // - // Function to loop over all parameters defined in schema and check - // whether the given parameters adhere to the specifications - // - /* groovylint-disable-next-line UnusedPrivateMethodParameter */ - public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { - def has_error = false - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Check for nextflow core params and unexpected params - def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text - def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') - def nf_params = [ - // Options for base `nextflow` command - 'bg', - 'c', - 'C', - 'config', - 'd', - 'D', - 'dockerize', - 'h', - 'log', - 'q', - 'quiet', - 'syslog', - 'v', - - // Options for `nextflow run` command - 'ansi', - 'ansi-log', - 'bg', - 'bucket-dir', - 'c', - 'cache', - 'config', - 'dsl2', - 'dump-channels', - 'dump-hashes', - 'E', - 'entry', - 'latest', - 'lib', - 'main-script', - 'N', - 'name', - 'offline', - 'params-file', - 'pi', - 'plugins', - 'poll-interval', - 'pool-size', - 'profile', - 'ps', - 'qs', - 'queue-size', - 'r', - 'resume', - 'revision', - 'stdin', - 'stub', - 'stub-run', - 'test', - 'w', - 'with-apptainer', - 'with-charliecloud', - 'with-conda', - 'with-dag', - 'with-docker', - 'with-mpi', - 'with-notification', - 'with-podman', - 'with-report', - 'with-singularity', - 'with-timeline', - 'with-tower', - 'with-trace', - 'with-weblog', - 'without-docker', - 'without-podman', - 'work-dir' - ] - def unexpectedParams = [] - - // Collect expected parameters from the schema - def expectedParams = [] - def enums = [:] - for (group in schemaParams) { - for (p in group.value['properties']) { - expectedParams.push(p.key) - if (group.value['properties'][p.key].containsKey('enum')) { - enums[p.key] = group.value['properties'][p.key]['enum'] - } - } - } - - for (specifiedParam in params.keySet()) { - // nextflow params - if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" - has_error = true - } - // unexpected params - def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' - def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } - def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() - def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) - if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { - // Temporarily remove camelCase/camel-case params #1035 - def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} - if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ - unexpectedParams.push(specifiedParam) - } - } - } - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Validate parameters against the schema - InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() - JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) - - // Remove anything that's in params.schema_ignore_params - raw_schema = removeIgnoredParams(raw_schema, params) - - Schema schema = SchemaLoader.load(raw_schema) - - // Clean the parameters - def cleanedParams = cleanParameters(params) - - // Convert to JSONObject - def jsonParams = new JsonBuilder(cleanedParams) - JSONObject params_json = new JSONObject(jsonParams.toString()) - - // Validate - try { - schema.validate(params_json) - } catch (ValidationException e) { - println '' - log.error 'ERROR: Validation of pipeline parameters failed!' - JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log, enums) - println '' - has_error = true - } - - // Check for unexpected parameters - if (unexpectedParams.size() > 0) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - println '' - def warn_msg = 'Found unexpected parameters:' - for (unexpectedParam in unexpectedParams) { - warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" - } - log.warn warn_msg - log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" - println '' - } - - if (has_error) { - Nextflow.error('Exiting!') - } - } - - // - // Beautify parameters for --help - // - public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - Integer num_hidden = 0 - String output = '' - output += 'Typical pipeline command:\n\n' - output += " ${colors.cyan}${command}${colors.reset}\n\n" - Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - Integer max_chars = paramsMaxChars(params_map) + 1 - Integer desc_indent = max_chars + 14 - Integer dec_linewidth = 160 - desc_indent - for (group in params_map.keySet()) { - Integer num_params = 0 - String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (group_params.get(param).hidden && !params.show_hidden_params) { - num_hidden += 1 - continue; - } - def type = '[' + group_params.get(param).type + ']' - def description = group_params.get(param).description - def 
defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : '' - def description_default = description + colors.dim + defaultValue + colors.reset - // Wrap long description texts - // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap - if (description_default.length() > dec_linewidth){ - List olines = [] - String oline = "" // " " * indent - description_default.split(" ").each() { wrd -> - if ((oline.size() + wrd.size()) <= dec_linewidth) { - oline += wrd + " " - } else { - olines += oline - oline = wrd + " " - } - } - olines += oline - description_default = olines.join("\n" + " " * desc_indent) - } - group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' - num_params += 1 - } - group_output += '\n' - if (num_params > 0){ - output += group_output - } - } - if (num_hidden > 0){ - output += colors.dim + "!! Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset - } - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Groovy Map summarising parameters/workflow options used by the pipeline - // - public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { - // Get a selection of core Nextflow workflow options - def Map workflow_summary = [:] - if (workflow.revision) { - workflow_summary['revision'] = workflow.revision - } - workflow_summary['runName'] = workflow.runName - if (workflow.containerEngine) { - workflow_summary['containerEngine'] = workflow.containerEngine - } - if (workflow.container) { - workflow_summary['container'] = workflow.container - } - workflow_summary['launchDir'] = workflow.launchDir - workflow_summary['workDir'] = workflow.workDir - workflow_summary['projectDir'] = workflow.projectDir - workflow_summary['userName'] = workflow.userName - workflow_summary['profile'] = workflow.profile - workflow_summary['configFiles'] = workflow.configFiles.join(', ') - - // Get pipeline parameters defined in JSON Schema - def Map params_summary = [:] - def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - for (group in params_map.keySet()) { - def sub_params = new LinkedHashMap() - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (params.containsKey(param)) { - def params_value = params.get(param) - def schema_value = group_params.get(param).default - def param_type = group_params.get(param).type - if (schema_value != null) { - if (param_type == 'string') { - if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { - def sub_string = schema_value.replace('\$projectDir', '') - sub_string = sub_string.replace('\${projectDir}', '') - if (params_value.contains(sub_string)) { - schema_value = params_value - } - } - if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { - def sub_string = schema_value.replace('\$params.outdir', '') - sub_string = sub_string.replace('\${params.outdir}', '') - if ("${params.outdir}${sub_string}" == params_value) { - schema_value = params_value - } - } - } - } - - // We have a default in the schema, and this isn't it - if (schema_value != null && params_value != schema_value) { - sub_params.put(param, params_value) - } - // No default in the schema, and this isn't empty - else if (schema_value == null && 
params_value != "" && params_value != null && params_value != false) { - sub_params.put(param, params_value) - } - } - } - params_summary.put(group, sub_params) - } - return [ 'Core Nextflow options' : workflow_summary ] << params_summary - } - - // - // Beautify parameters for summary and return as string - // - public static String paramsSummaryLog(workflow, params) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - String output = '' - def params_map = paramsSummaryMap(workflow, params) - def max_chars = paramsMaxChars(params_map) - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - if (group_params) { - output += colors.bold + group + colors.reset + '\n' - for (param in group_params.keySet()) { - output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' - } - output += '\n' - } - } - output += "!! Only displaying parameters that differ from the pipeline defaults !!\n" - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Loop over nested exceptions and print the causingException - // - private static void printExceptions(ex_json, params_json, log, enums, limit=5) { - def causingExceptions = ex_json['causingExceptions'] - if (causingExceptions.length() == 0) { - def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ - // Missing required param - if (m.matches()) { - log.error "* Missing required parameter: --${m[0][1]}" - } - // Other base-level error - else if (ex_json['pointerToViolation'] == '#') { - log.error "* ${ex_json['message']}" - } - // Error with specific param - else { - def param = ex_json['pointerToViolation'] - ~/^#\// - def param_val = params_json[param].toString() - if (enums.containsKey(param)) { - def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" - if (enums[param].size() > limit) { - log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... 
)" - } else { - log.error "${error_msg}: ${enums[param].join(', ')})" - } - } else { - log.error "* --${param}: ${ex_json['message']} (${param_val})" - } - } - } - for (ex in causingExceptions) { - printExceptions(ex, params_json, log, enums) - } - } - - // - // Remove an element from a JSONArray - // - private static JSONArray removeElement(json_array, element) { - def list = [] - int len = json_array.length() - for (int i=0;i - if(raw_schema.keySet().contains('definitions')){ - raw_schema.definitions.each { definition -> - for (key in definition.keySet()){ - if (definition[key].get("properties").keySet().contains(ignore_param)){ - // Remove the param to ignore - definition[key].get("properties").remove(ignore_param) - // If the param was required, change this - if (definition[key].has("required")) { - def cleaned_required = removeElement(definition[key].required, ignore_param) - definition[key].put("required", cleaned_required) - } - } - } - } - } - if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { - raw_schema.get("properties").remove(ignore_param) - } - if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { - def cleaned_required = removeElement(raw_schema.required, ignore_param) - raw_schema.put("required", cleaned_required) - } - } - return raw_schema - } - - // - // Clean and check parameters relative to Nextflow native classes - // - private static Map cleanParameters(params) { - def new_params = params.getClass().newInstance(params) - for (p in params) { - // remove anything evaluating to false - if (!p['value']) { - new_params.remove(p.key) - } - // Cast MemoryUnit to String - if (p['value'].getClass() == nextflow.util.MemoryUnit) { - new_params.replace(p.key, p['value'].toString()) - } - // Cast Duration to String - if (p['value'].getClass() == nextflow.util.Duration) { - new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) - } - // Cast LinkedHashMap to String - if (p['value'].getClass() == LinkedHashMap) { - new_params.replace(p.key, p['value'].toString()) - } - } - return new_params - } - - // - // This function tries to read a JSON params file - // - private static LinkedHashMap paramsLoad(String json_schema) { - def params_map = new LinkedHashMap() - try { - params_map = paramsRead(json_schema) - } catch (Exception e) { - println "Could not read parameters settings from JSON. $e" - params_map = new LinkedHashMap() - } - return params_map - } - - // - // Method to actually read in JSON file using Groovy. - // Group (as Key), values are all parameters - // - Parameter1 as Key, Description as Value - // - Parameter2 as Key, Description as Value - // .... 
- // Group - // - - private static LinkedHashMap paramsRead(String json_schema) throws Exception { - def json = new File(json_schema).text - def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') - def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') - /* Tree looks like this in nf-core schema - * definitions <- this is what the first get('definitions') gets us - group 1 - title - description - properties - parameter 1 - type - description - parameter 2 - type - description - group 2 - title - description - properties - parameter 1 - type - description - * properties <- parameters can also be ungrouped, outside of definitions - parameter 1 - type - description - */ - - // Grouped params - def params_map = new LinkedHashMap() - schema_definitions.each { key, val -> - def Map group = schema_definitions."$key".properties // Gets the property object of the group - def title = schema_definitions."$key".title - def sub_params = new LinkedHashMap() - group.each { innerkey, value -> - sub_params.put(innerkey, value) - } - params_map.put(title, sub_params) - } - - // Ungrouped params - def ungrouped_params = new LinkedHashMap() - schema_properties.each { innerkey, value -> - ungrouped_params.put(innerkey, value) - } - params_map.put("Other parameters", ungrouped_params) - - return params_map - } - - // - // Get maximum number of characters across all parameter names - // - private static Integer paramsMaxChars(params_map) { - Integer max_chars = 0 - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (param.size() > max_chars) { - max_chars = param.size() - } - } - } - return max_chars - } -} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index bce1492b9..cecbacf29 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -3,6 +3,7 @@ // import org.yaml.snakeyaml.Yaml +import groovy.json.JsonOutput class NfcoreTemplate { @@ -151,7 +152,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -245,6 +246,21 @@ class NfcoreTemplate { } } + // + // Dump pipeline parameters in a json file + // + public static void dump_parameters(workflow, params) { + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + + def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + def output_pf = new File(output_d, "params_${timestamp}.json") + def jsonStr = JsonOutput.toJson(params) + output_pf.text = JsonOutput.prettyPrint(jsonStr) + } + // // Print pipeline summary on completion // diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 8678e2614..4d94dfe08 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -19,40 +19,10 @@ class WorkflowMain { " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Generate help string - // - public static String help(workflow, params) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --outdir --genome GRCh37 -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Generate parameter summary log string - // - public static String paramsSummaryLog(workflow, params) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } - // // Validate parameters and print summary to screen // public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params) - System.exit(0) - } // Print workflow version and exit on --version if (params.version) { @@ -61,17 +31,9 @@ class WorkflowMain { System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params) - // Warn about using custom configs to provide pipeline parameters NfcoreTemplate.warnParamsProvidedInConfig(workflow, log) - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) @@ -82,12 +44,8 @@ class WorkflowMain { // Check AWS batch settings NfcoreTemplate.awsBatch(workflow, params) - - // Check input has been provided - if (!params.input) { - Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") - } } + // // Get attribute from genome config file e.g. 
fasta // diff --git a/lib/WorkflowRnaseq.groovy b/lib/WorkflowRnaseq.groovy index 08d684e77..25e52ffe4 100755 --- a/lib/WorkflowRnaseq.groovy +++ b/lib/WorkflowRnaseq.groovy @@ -11,14 +11,9 @@ class WorkflowRnaseq { // // Check and validate parameters // - public static void initialise(params, log, valid_params) { + public static void initialise(params, log) { genomeExistsError(params, log) - - if (!params.fasta) { - Nextflow.error("Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file.") - } - if (!params.gtf && !params.gff) { Nextflow.error("No GTF or GFF3 annotation specified! The pipeline requires at least one of these files.") } @@ -54,27 +49,13 @@ class WorkflowRnaseq { } } - if (!params.skip_trimming) { - if (!valid_params['trimmers'].contains(params.trimmer)) { - Nextflow.error("Invalid option: '${params.trimmer}'. Valid options for '--trimmer': ${valid_params['trimmers'].join(', ')}.") - } - } - - if (!params.skip_alignment) { - if (!valid_params['aligners'].contains(params.aligner)) { - Nextflow.error("Invalid option: '${params.aligner}'. Valid options for '--aligner': ${valid_params['aligners'].join(', ')}.") - } - } else { + if (params.skip_alignment) { skipAlignmentWarn(log) } if (!params.skip_pseudo_alignment && params.pseudo_aligner) { - if (!valid_params['pseudoaligners'].contains(params.pseudo_aligner)) { - Nextflow.error("Invalid option: '${params.pseudo_aligner}'. Valid options for '--pseudo_aligner': ${valid_params['pseudoaligners'].join(', ')}.") - } else { - if (!(params.salmon_index || params.transcript_fasta || (params.fasta && (params.gtf || params.gff)))) { - Nextflow.error("To use `--pseudo_aligner 'salmon'`, you must provide either --salmon_index or --transcript_fasta or both --fasta and --gtf / --gff.") - } + if (!(params.salmon_index || params.transcript_fasta || (params.fasta && (params.gtf || params.gff)))) { + Nextflow.error("To use `--pseudo_aligner 'salmon'`, you must provide either --salmon_index or --transcript_fasta or both --fasta and --gtf / --gff.") } } @@ -86,6 +67,9 @@ class WorkflowRnaseq { if (params.rsem_index && params.star_index) { rsemStarIndexWarn(log) } + if (params.aligner == 'star_rsem' && params.extra_star_align_args) { + rsemStarExtraArgumentsWarn(log) + } } // Warn if --additional_fasta provided with aligner index @@ -106,10 +90,32 @@ class WorkflowRnaseq { } // Check which RSeQC modules we are running + def valid_rseqc_modules = ['bam_stat', 'inner_distance', 'infer_experiment', 'junction_annotation', 'junction_saturation', 'read_distribution', 'read_duplication', 'tin'] def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : [] - if ((valid_params['rseqc_modules'] + rseqc_modules).unique().size() != valid_params['rseqc_modules'].size()) { - Nextflow.error("Invalid option: ${params.rseqc_modules}. Valid options for '--rseqc_modules': ${valid_params['rseqc_modules'].join(', ')}") + if ((valid_rseqc_modules + rseqc_modules).unique().size() != valid_rseqc_modules.size()) { + Nextflow.error("Invalid option: ${params.rseqc_modules}. 
Valid options for '--rseqc_modules': ${valid_rseqc_modules.join(', ')}") + } + } + + // + // Function to validate channels from input samplesheet + // + public static ArrayList validateInput(input) { + def (metas, fastqs) = input[1..2] + + // Check that multiple runs of the same sample are of the same strandedness + def strandedness_ok = metas.collect{ it.strandedness }.unique().size == 1 + if (!strandedness_ok) { + Nextflow.error("Please check input samplesheet -> Multiple runs of a sample must have the same strandedness!: ${metas[0].id}") } + + // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end + def endedness_ok = metas.collect{ it.single_end }.unique().size == 1 + if (!endedness_ok) { + Nextflow.error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") + } + + return [ metas[0], fastqs ] } // @@ -267,26 +273,52 @@ class WorkflowRnaseq { } // - // Create MultiQC tsv custom content from a list of values + // Generate methods description for MultiQC // - public static String multiqcTsvFromList(tsv_data, header) { - def tsv_string = "" - if (tsv_data.size() > 0) { - tsv_string += "${header.join('\t')}\n" - tsv_string += tsv_data.join('\n') - } - return tsv_string + public static String toolCitationText(params) { + // TODO nf-core: Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text } - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + public static String toolBibliographyText(params) { + // TODO Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "
<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", + "
<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" + ].join(' ').trim() + + return reference_text + } + + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { // Convert to a named map so can be used as with familiar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta["manifest_map"] = run_workflow.manifest.toMap() + // Pipeline DOI meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "
<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>" + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() @@ -295,6 +327,18 @@ return description_html } + // + // Create MultiQC tsv custom content from a list of values + // + public static String multiqcTsvFromList(tsv_data, header) { + def tsv_string = "" + if (tsv_data.size() > 0) { + tsv_string += "${header.join('\t')}\n" + tsv_string += tsv_data.join('\n') + } + return tsv_string + } + // // Exit pipeline if incorrect --genome key provided // @@ -394,6 +438,19 @@ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } + // + // Print a warning if using '--aligner star_rsem' and providing '--extra_star_align_args' + // + private static void rsemStarExtraArgumentsWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + "  No additional arguments can be passed to STAR when using RSEM.\n" + + "  Because RSEM enforces its own parameters for STAR, any extra arguments\n" + + "  to STAR will be ignored. Alternatively, choose the STAR+Salmon route.\n\n" + + "  This warning has been generated because you have provided both\n" + + "  '--aligner star_rsem' and '--extra_star_align_args'.\n\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + // // Print a warning if using '--additional_fasta' and '--<ALIGNER>_index' // diff --git a/main.nf b/main.nf index 19f53b245..69ae6c3b3 100755 --- a/main.nf +++ b/main.nf @@ -28,6 +28,7 @@ params.star_index = WorkflowMain.getGenomeAttribute(params, 'star') params.hisat2_index = WorkflowMain.getGenomeAttribute(params, 'hisat2') params.rsem_index = WorkflowMain.getGenomeAttribute(params, 'rsem') params.salmon_index = WorkflowMain.getGenomeAttribute(params, 'salmon') +params.kallisto_index = WorkflowMain.getGenomeAttribute(params, 'kallisto') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -35,6 +36,22 @@ params.salmon_index = WorkflowMain.getGenomeAttribute(params, 'salmon') ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + WorkflowMain.initialise(workflow, params, log) /* diff --git a/modules.json b/modules.json index 030893337..74bd0d197 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "bbmap/bbsplit": { "branch": "master", - "git_sha": "e228790f2957152ad2534e39abd7b3878963e89d", + "git_sha": "de3e6fc949dcffb8d3508c015f435ace5773ff08", "installed_by": ["modules"] }, "cat/fastq": { @@ -17,7 +17,7 @@ },
"custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", "installed_by": ["modules"] }, "custom/getchromsizes": { @@ -32,7 +32,7 @@ }, "fastqc": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "102cc9b709a6da9f7cee2373563ab1464fca9c0a", "installed_by": ["fastq_fastqc_umitools_trimgalore", "fastq_fastqc_umitools_fastp"] }, "fq/subsample": { @@ -47,17 +47,17 @@ }, "gunzip": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", "installed_by": ["modules"] }, "hisat2/align": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "a1881f6374506f9e031b7af814768cdb44a6a7d3", "installed_by": ["fastq_align_hisat2"] }, "hisat2/build": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "f2f48836bf5c59434966a6c3b2211b29363f31ab", "installed_by": ["modules"] }, "hisat2/extractsplicesites": { @@ -65,9 +65,19 @@ "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": ["modules"] }, + "kallisto/index": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "kallisto/quant": { + "branch": "master", + "git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad", + "installed_by": ["modules"] + }, "picard/markduplicates": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "2ee934606f1fdf7fc1cb05d6e8abc13bec8ab448", "installed_by": ["bam_markduplicates_picard"] }, "preseq/lcextrap": { @@ -77,7 +87,7 @@ }, "qualimap/rnaseq": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "4657d98bc9f565e067c4d924126ce107056f5e2f", "installed_by": ["modules"] }, "rsem/calculateexpression": { @@ -137,17 +147,17 @@ }, "salmon/quant": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "c5b528d0a51c31621b485ab3bcc008f483619ea6", "installed_by": ["modules", "fastq_subsample_fq_salmon"] }, "samtools/flagstat": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "570ec5bcfe19c49e16c9ca35a7a116563af6cc1c", "installed_by": ["bam_stats_samtools"] }, "samtools/idxstats": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "e662ab16e0c11f1e62983e21de9871f59371a639", "installed_by": ["bam_stats_samtools"] }, "samtools/index": { @@ -176,12 +186,12 @@ }, "star/align": { "branch": "master", - "git_sha": "57d75dbac06812c59798a48585032f6e50bb1914", + "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", "installed_by": ["modules"] }, "star/genomegenerate": { "branch": "master", - "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", "installed_by": ["modules"] }, "stringtie/stringtie": { @@ -201,17 +211,17 @@ }, "ucsc/bedclip": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "240937a2a9c30298110753292be041188891f2cb", "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] }, "ucsc/bedgraphtobigwig": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "66290981ab6038ea86177ade40b9449bc790b0ce", "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] }, "umitools/dedup": { "branch": "master", - "git_sha": 
"911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "7297204bf49273300a3dbfa4b7a4027c8683f1bd", "installed_by": ["bam_dedup_stats_samtools_umitools"] }, "umitools/extract": { @@ -221,7 +231,7 @@ }, "untar": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", "installed_by": ["modules"] } } @@ -230,12 +240,12 @@ "nf-core": { "bam_dedup_stats_samtools_umitools": { "branch": "master", - "git_sha": "e228790f2957152ad2534e39abd7b3878963e89d", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] }, "bam_markduplicates_picard": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] }, "bam_rseqc": { @@ -245,41 +255,41 @@ }, "bam_sort_stats_samtools": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["fastq_align_hisat2"] }, "bam_stats_samtools": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": [ - "bam_markduplicates_picard", "bam_sort_stats_samtools", + "bam_markduplicates_picard", "bam_dedup_stats_samtools_umitools" ] }, "bedgraph_bedclip_bedgraphtobigwig": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] }, "fastq_align_hisat2": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] }, "fastq_fastqc_umitools_fastp": { "branch": "master", - "git_sha": "48dbb403fb2849b3d2c6c2e3eaaedbcca799428d", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] }, "fastq_fastqc_umitools_trimgalore": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] }, "fastq_subsample_fq_salmon": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] } } diff --git a/modules/local/bedtools_genomecov.nf b/modules/local/bedtools_genomecov/main.nf similarity index 100% rename from modules/local/bedtools_genomecov.nf rename to modules/local/bedtools_genomecov/main.nf diff --git a/modules/local/cat_additional_fasta.nf b/modules/local/cat_additional_fasta/main.nf similarity index 100% rename from modules/local/cat_additional_fasta.nf rename to modules/local/cat_additional_fasta/main.nf diff --git a/modules/local/deseq2_qc.nf b/modules/local/deseq2_qc/main.nf similarity index 97% rename from modules/local/deseq2_qc.nf rename to modules/local/deseq2_qc/main.nf index 7b3fac57b..466194934 100644 --- a/modules/local/deseq2_qc.nf +++ b/modules/local/deseq2_qc/main.nf @@ -32,11 +32,13 @@ process DESEQ2_QC { def args2 = task.ext.args2 ?: '' def label_lower = args2.toLowerCase() def label_upper = args2.toUpperCase() + prefix = task.ext.prefix ?: "deseq2" """ deseq2_qc.r \\ --count_file $counts \\ --outdir ./ \\ --cores $task.cpus \\ + --outprefix $prefix \\ $args if [ -f "R_sessionInfo.log" ]; then diff --git a/modules/local/dupradar.nf b/modules/local/dupradar/main.nf similarity index 100% rename from 
modules/local/dupradar.nf rename to modules/local/dupradar/main.nf diff --git a/modules/local/gtf2bed.nf b/modules/local/gtf2bed/main.nf similarity index 100% rename from modules/local/gtf2bed.nf rename to modules/local/gtf2bed/main.nf diff --git a/modules/local/gtf_gene_filter.nf b/modules/local/gtf_filter/main.nf similarity index 67% rename from modules/local/gtf_gene_filter.nf rename to modules/local/gtf_filter/main.nf index cd8e16adb..d14e8ff42 100644 --- a/modules/local/gtf_gene_filter.nf +++ b/modules/local/gtf_filter/main.nf @@ -1,4 +1,4 @@ -process GTF_GENE_FILTER { +process GTF_FILTER { tag "$fasta" conda "conda-forge::python=3.9.5" @@ -11,18 +11,18 @@ process GTF_GENE_FILTER { path gtf output: - path "*.gtf" , emit: gtf - path "versions.yml", emit: versions + path "*.filtered.gtf", emit: genome_gtf + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when - script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/ + script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/ """ - filter_gtf_for_genes_in_genome.py \\ + filter_gtf.py \\ --gtf $gtf \\ --fasta $fasta \\ - -o ${fasta.baseName}_genes.gtf + --prefix ${fasta.baseName} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/multiqc.nf b/modules/local/multiqc/main.nf similarity index 89% rename from modules/local/multiqc.nf rename to modules/local/multiqc/main.nf index 6beb43310..a59d8a533 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_1' : + 'biocontainers/multiqc:1.17--pyhdfd78af_1' }" input: path multiqc_config @@ -23,7 +23,7 @@ process MULTIQC { path ('star/*') path ('hisat2/*') path ('rsem/*') - path ('salmon/*') + path ('pseudoalignment/*') path ('samtools/stats/*') path ('samtools/flagstat/*') path ('samtools/idxstats/*') @@ -57,8 +57,10 @@ process MULTIQC { script: def args = task.ext.args ?: '' def custom_config = params.multiqc_config ? 
"--config $multiqc_custom_config" : '' + prefix = task.ext.prefix ?: "multiqc_report" """ multiqc \\ + -n ${prefix}.html \\ -f \\ $args \\ $custom_config \\ diff --git a/modules/local/multiqc_custom_biotype.nf b/modules/local/multiqc_custom_biotype/main.nf similarity index 100% rename from modules/local/multiqc_custom_biotype.nf rename to modules/local/multiqc_custom_biotype/main.nf diff --git a/modules/local/preprocess_transcripts_fasta_gencode.nf b/modules/local/preprocess_transcripts_fasta_gencode/main.nf similarity index 100% rename from modules/local/preprocess_transcripts_fasta_gencode.nf rename to modules/local/preprocess_transcripts_fasta_gencode/main.nf diff --git a/modules/local/rsem_merge_counts.nf b/modules/local/rsem_merge_counts/main.nf similarity index 100% rename from modules/local/rsem_merge_counts.nf rename to modules/local/rsem_merge_counts/main.nf diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index f8251de88..000000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,31 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.9.5" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'biocontainers/python:3.9--1' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/ - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/star_align_igenomes.nf b/modules/local/star_align_igenomes/main.nf similarity index 100% rename from modules/local/star_align_igenomes.nf rename to modules/local/star_align_igenomes/main.nf diff --git a/modules/local/star_genomegenerate_igenomes.nf b/modules/local/star_genomegenerate_igenomes/main.nf similarity index 100% rename from modules/local/star_genomegenerate_igenomes.nf rename to modules/local/star_genomegenerate_igenomes/main.nf diff --git a/modules/local/salmon_summarizedexperiment.nf b/modules/local/summarizedexperiment/main.nf similarity index 91% rename from modules/local/salmon_summarizedexperiment.nf rename to modules/local/summarizedexperiment/main.nf index 2278ea14e..f278e2301 100644 --- a/modules/local/salmon_summarizedexperiment.nf +++ b/modules/local/summarizedexperiment/main.nf @@ -1,4 +1,4 @@ -process SALMON_SUMMARIZEDEXPERIMENT { +process SUMMARIZEDEXPERIMENT { tag "$tx2gene" label "process_medium" @@ -21,10 +21,11 @@ process SALMON_SUMMARIZEDEXPERIMENT { script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/ """ - salmon_summarizedexperiment.r \\ + summarizedexperiment.r \\ NULL \\ $counts \\ - $tpm + $tpm \\ + $tx2gene cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/salmon_tx2gene.nf b/modules/local/tx2gene/main.nf similarity index 83% rename from modules/local/salmon_tx2gene.nf rename to modules/local/tx2gene/main.nf index b6e9df662..150e74afb 100644 --- a/modules/local/salmon_tx2gene.nf +++ b/modules/local/tx2gene/main.nf @@ -1,4 +1,4 @@ -process SALMON_TX2GENE { +process TX2GENE { tag "$gtf" label "process_low" @@ -8,7 +8,8 @@ process SALMON_TX2GENE { 
'biocontainers/python:3.9--1' }" input: - path ("salmon/*") + path ("quants/*") + val quant_type path gtf output: @@ -20,12 +21,13 @@ process SALMON_TX2GENE { script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/ """ - salmon_tx2gene.py \\ + tx2gene.py \\ + --quant_type $quant_type \\ --gtf $gtf \\ - --salmon salmon \\ + --quants quants \\ --id $params.gtf_group_features \\ --extra $params.gtf_extra_attributes \\ - -o salmon_tx2gene.tsv + -o tx2gene.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/salmon_tximport.nf b/modules/local/tximport/main.nf similarity index 86% rename from modules/local/salmon_tximport.nf rename to modules/local/tximport/main.nf index 181c897cd..56d826a7d 100644 --- a/modules/local/salmon_tximport.nf +++ b/modules/local/tximport/main.nf @@ -1,4 +1,4 @@ -process SALMON_TXIMPORT { +process TXIMPORT { label "process_medium" conda "bioconda::bioconductor-tximeta=1.12.0" @@ -7,8 +7,9 @@ process SALMON_TXIMPORT { 'biocontainers/bioconductor-tximeta:1.12.0--r41hdfd78af_0' }" input: - path ("salmon/*") + path ("quants/*") path tx2gene + val quant_type output: path "*gene_tpm.tsv" , emit: tpm_gene @@ -23,11 +24,14 @@ process SALMON_TXIMPORT { task.ext.when == null || task.ext.when script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/ + prefix = task.ext.prefix ?: "${quant_type}.merged" """ - salmon_tximport.r \\ + tximport.r \\ NULL \\ - salmon \\ - salmon.merged + quants \\ + $prefix \\ + $quant_type \\ + $tx2gene cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/umitools_prepareforrsem.nf b/modules/local/umitools_prepareforrsem/main.nf similarity index 100% rename from modules/local/umitools_prepareforrsem.nf rename to modules/local/umitools_prepareforrsem/main.nf diff --git a/modules/nf-core/bbmap/bbsplit/main.nf b/modules/nf-core/bbmap/bbsplit/main.nf index 67190ff1d..f82bb9e47 100644 --- a/modules/nf-core/bbmap/bbsplit/main.nf +++ b/modules/nf-core/bbmap/bbsplit/main.nf @@ -1,4 +1,5 @@ process BBMAP_BBSPLIT { + tag "$meta.id" label 'process_high' label 'error_retry' diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 000000000..f0c63f698 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index ebc872733..7685b33cd 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index c32657de7..5f15a5fde 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: @@ -16,7 +16,6 @@ input: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -30,7 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 000000000..eec1db10a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 000000000..4274ed57a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,27 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ], + "1": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "2": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "mqc_yml": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "versions": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "yml": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ] + } + ], + "timestamp": "2023-11-03T14:43:22.157011" + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 000000000..405aa24ae --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 07d5e4331..67209f793 100644 --- 
a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -2,10 +2,10 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "bioconda::fastqc=0.12.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) @@ -29,7 +29,11 @@ process FASTQC { printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done - fastqc $args --threads $task.cpus $renamed_files + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test new file mode 100644 index 000000000..badb67161 --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -0,0 +1,40 @@ +nextflow_process { + + name "Test Process FASTQC" + script "modules/nf-core/fastqc/main.nf" + process "FASTQC" + tag "fastqc" + tag "modules_nfcore" + + test("Single-Read") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
+ //     Mon 2 Oct 2023
+ //     test.gz
    + // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, + { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + ) + } + } +} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 000000000..636a32cea --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-10-09T23:40:54+0000" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf index e7189d2fb..73bf08cde 100644 --- a/modules/nf-core/gunzip/main.nf +++ b/modules/nf-core/gunzip/main.nf @@ -21,10 +21,14 @@ process GUNZIP { def args = task.ext.args ?: '' gunzip = archive.toString() - '.gz' """ - gunzip \\ - -f \\ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ $args \\ - $archive + $archive \\ + > $gunzip cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/hisat2/align/main.nf b/modules/nf-core/hisat2/align/main.nf index e69204692..db8e8bb6d 100644 --- a/modules/nf-core/hisat2/align/main.nf +++ b/modules/nf-core/hisat2/align/main.nf @@ -33,6 +33,7 @@ process HISAT2_ALIGN { } else if (meta.strandedness == 'reverse') { strandedness = meta.single_end ? '--rna-strandness R' : '--rna-strandness RF' } + ss = "$splicesites" ? "--known-splicesite-infile $splicesites" : '' def seq_center = params.seq_center ? "--rg-id ${prefix} --rg SM:$prefix --rg CN:${params.seq_center.replaceAll('\\s','_')}" : "--rg-id ${prefix} --rg SM:$prefix" if (meta.single_end) { def unaligned = params.save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' @@ -42,7 +43,7 @@ process HISAT2_ALIGN { -x \$INDEX \\ -U $reads \\ $strandedness \\ - --known-splicesite-infile $splicesites \\ + $ss \\ --summary-file ${prefix}.hisat2.summary.log \\ --threads $task.cpus \\ $seq_center \\ @@ -65,7 +66,7 @@ process HISAT2_ALIGN { -1 ${reads[0]} \\ -2 ${reads[1]} \\ $strandedness \\ - --known-splicesite-infile $splicesites \\ + $ss \\ --summary-file ${prefix}.hisat2.summary.log \\ --threads $task.cpus \\ $seq_center \\ diff --git a/modules/nf-core/hisat2/build/main.nf b/modules/nf-core/hisat2/build/main.nf index 5ad36a548..90f8efcc6 100644 --- a/modules/nf-core/hisat2/build/main.nf +++ b/modules/nf-core/hisat2/build/main.nf @@ -37,9 +37,9 @@ process HISAT2_BUILD { def hisat2_build_memory = params.hisat2_build_memory ? (params.hisat2_build_memory as nextflow.util.MemoryUnit).toGiga() : 0 if (avail_mem >= hisat2_build_memory) { log.info "[HISAT2 index build] At least ${hisat2_build_memory} GB available, so using splice sites and exons to build HISAT2 index" - extract_exons = "hisat2_extract_exons.py $gtf > ${gtf.baseName}.exons.txt" - ss = "--ss $splicesites" - exon = "--exon ${gtf.baseName}.exons.txt" + extract_exons = gtf ? "hisat2_extract_exons.py $gtf > ${gtf.baseName}.exons.txt" : "" + ss = splicesites ? "--ss $splicesites" : "" + exon = gtf ? 
"--exon ${gtf.baseName}.exons.txt" : "" } else { log.info "[HISAT2 index build] Less than ${hisat2_build_memory} GB available, so NOT using splice sites and exons to build HISAT2 index." log.info "[HISAT2 index build] Use --hisat2_build_memory [small number] to skip this check." diff --git a/modules/nf-core/kallisto/index/environment.yml b/modules/nf-core/kallisto/index/environment.yml new file mode 100644 index 000000000..471b006a8 --- /dev/null +++ b/modules/nf-core/kallisto/index/environment.yml @@ -0,0 +1,7 @@ +name: kallisto_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kallisto=0.48.0 diff --git a/modules/nf-core/kallisto/index/main.nf b/modules/nf-core/kallisto/index/main.nf new file mode 100644 index 000000000..28a47dbeb --- /dev/null +++ b/modules/nf-core/kallisto/index/main.nf @@ -0,0 +1,44 @@ +process KALLISTO_INDEX { + tag "$fasta" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kallisto:0.48.0--h15996b6_2': + 'biocontainers/kallisto:0.48.0--h15996b6_2' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("kallisto") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + kallisto \\ + index \\ + $args \\ + -i kallisto \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ + + stub: + """ + touch kallisto + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallisto/index/meta.yml b/modules/nf-core/kallisto/index/meta.yml new file mode 100644 index 000000000..d366aeb45 --- /dev/null +++ b/modules/nf-core/kallisto/index/meta.yml @@ -0,0 +1,41 @@ +name: kallisto_index +description: Create kallisto index +keywords: + - kallisto + - kallisto/index + - index +tools: + - kallisto: + description: Quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. + homepage: https://pachterlab.github.io/kallisto/ + documentation: https://pachterlab.github.io/kallisto/manual + tool_dev_url: https://github.com/pachterlab/kallisto + licence: ["BSD-2-Clause"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: genome fasta file + pattern: "*.{fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - index: + type: directory + description: Kallisto genome index + pattern: "*.idx" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ggabernet" +maintainers: + - "@ggabernet" diff --git a/modules/nf-core/kallisto/index/tests/main.nf.test b/modules/nf-core/kallisto/index/tests/main.nf.test new file mode 100644 index 000000000..97933d697 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process KALLISTO_INDEX" + script "../main.nf" + process "KALLISTO_INDEX" + tag "modules" + tag "modules_nfcore" + tag "kallisto" + tag "kallisto/index" + + test("homo_sapiens genome_fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test_fasta' ], // meta map + [ file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/kallisto/index/tests/main.nf.test.snap b/modules/nf-core/kallisto/index/tests/main.nf.test.snap new file mode 100644 index 000000000..c0f45ac45 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "homo_sapiens genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test_fasta" + }, + "kallisto:md5,2dab84e1456201beca5a43f4c514d67c" + ] + ], + "1": [ + "versions.yml:md5,178f9b57d4228edc356911d571b958a4" + ], + "index": [ + [ + { + "id": "test_fasta" + }, + "kallisto:md5,2dab84e1456201beca5a43f4c514d67c" + ] + ], + "versions": [ + "versions.yml:md5,178f9b57d4228edc356911d571b958a4" + ] + } + ], + "timestamp": "2023-11-02T09:58:48.83625986" + } +} \ No newline at end of file diff --git a/modules/nf-core/kallisto/index/tests/tags.yml b/modules/nf-core/kallisto/index/tests/tags.yml new file mode 100644 index 000000000..9f47b88a1 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/tags.yml @@ -0,0 +1,2 @@ +kallisto/index: + - modules/nf-core/kallisto/index/** diff --git a/modules/nf-core/kallisto/quant/environment.yml b/modules/nf-core/kallisto/quant/environment.yml new file mode 100644 index 000000000..c2d6306bb --- /dev/null +++ b/modules/nf-core/kallisto/quant/environment.yml @@ -0,0 +1,7 @@ +name: kallisto_quant +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kallisto=0.48.0 diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf new file mode 100644 index 000000000..f5444d791 --- /dev/null +++ b/modules/nf-core/kallisto/quant/main.nf @@ -0,0 +1,78 @@ +process KALLISTO_QUANT { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kallisto:0.48.0--h15996b6_2': + 'biocontainers/kallisto:0.48.0--h15996b6_2' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + path gtf + path chromosomes + val fragment_length + val fragment_length_sd + + output: + tuple val(meta), path("${prefix}") , emit: results + tuple val(meta), path("*.run_info.json") , emit: json_info + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def gtf_input = gtf ? 
"--gtf ${gtf}" : '' + def chromosomes_input = chromosomes ? "--chromosomes ${chromosomes}" : '' + + def single_end_params = '' + if (meta.single_end) { + if (!(fragment_length =~ /^\d+$/)) { + error "fragment_length must be set and numeric for single-end data" + } + if (!(fragment_length_sd =~ /^\d+$/)) { + error "fragment_length_sd must be set and numeric for single-end data" + } + single_end_params = "--single --fragment-length=${fragment_length} --sd=${fragment_length_sd}" + } + + def strandedness = '' + if (!args.contains('--fr-stranded') && !args.contains('--rf-stranded')) { + strandedness = (meta.strandedness == 'forward') ? '--fr-stranded' : + (meta.strandedness == 'reverse') ? '--rf-stranded' : '' + } + + """ + mkdir -p $prefix && kallisto quant \\ + --threads ${task.cpus} \\ + --index ${index} \\ + ${gtf_input} \\ + ${chromosomes_input} \\ + ${single_end_params} \\ + ${strandedness} \\ + ${args} \\ + -o $prefix \\ + ${reads} 2> >(tee -a ${prefix}/kallisto_quant.log >&2) + + cp ${prefix}/kallisto_quant.log ${prefix}.log + cp ${prefix}/run_info.json ${prefix}.run_info.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" ) + END_VERSIONS + """ + + stub: + """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallisto/quant/meta.yml b/modules/nf-core/kallisto/quant/meta.yml new file mode 100644 index 000000000..d5100290f --- /dev/null +++ b/modules/nf-core/kallisto/quant/meta.yml @@ -0,0 +1,77 @@ +name: "kallisto_quant" +description: Computes equivalence classes for reads and quantifies abundances +keywords: + - quant + - kallisto + - pseudoalignment +tools: + - "kallisto": + description: "Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads." + homepage: https://pachterlab.github.io/kallisto/ + documentation: https://pachterlab.github.io/kallisto/manual + tool_dev_url: https://github.com/pachterlab/kallisto + doi: "10.1038/nbt.3519" + licence: ["BSD_2_clause"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fastq,fastq.gz}" + - index: + type: file + description: Kallisto genome index. + pattern: "*.idx" + - gtf: + type: file + description: Optional gtf file for translation of transcripts into genomic coordinates. + pattern: "*.gtf" + - chromosomes: + type: file + description: Optional tab separated file with chromosome names and lengths. + pattern: "*.tsv" + - fragment_length: + type: integer + description: For single-end mode only, the estimated average fragment length. + - fragment_length_sd: + type: integer + description: For single-end mode only, the estimated standard deviation of the fragment length. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: File containing log information from running kallisto quant + pattern: "*.log.txt" + - abundance: + type: file + description: Plaintext file of the abundance estimates + pattern: "abundance.tsv" + - abundance_hdf5: + type: file + description: | + A HDF5 binary file containing run info, abundance estimates, bootstrap + estimates, and transcript length information + pattern: "abundance.h5" + - run_info: + type: file + description: A json file containing information about the run + pattern: "run_info.json" +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/nf-core/kallisto/quant/tests/main.nf.test b/modules/nf-core/kallisto/quant/tests/main.nf.test new file mode 100644 index 000000000..f716e5e6d --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/main.nf.test @@ -0,0 +1,92 @@ +nextflow_process { + + name "Test Process KALLISTO_QUANT" + script "../main.nf" + process "KALLISTO_QUANT" + tag "modules" + tag "modules_nfcore" + tag "kallisto" + tag "kallisto/quant" + + setup { + run("KALLISTO_INDEX") { + script "../../index/main.nf" + process { + """ + input[0] = [ + [ id:'test_fasta' ], // meta map + [ file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true) ] + ] + """ + } + } + } + + test("sarscov2 single-end") { + config "./single_end.config" + + when { + params{ + outdir = "$outputDir" + } + + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = KALLISTO_INDEX.out.index + input[2] = [] + input[3] = [] + input[4] = 150 + input[5] = 75 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(path("$outputDir/kallisto/test/abundance.tsv")).match("abundance_tsv_single") }, + { assert snapshot(process.out.log).match("se_log") }, + { assert snapshot(process.out.versions).match("se_versions") } + ) + } + } + + test("sarscov2 paired-end") { + config "./paired_end.config" + + when { + params{ + outdir = "$outputDir" + } + + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = KALLISTO_INDEX.out.index + input[2] = [] + input[3] = [] + input[4] = [] + input[5] = [] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(path("$outputDir/kallisto/test/abundance.tsv")).match("abundance_tsv_paired") }, + { assert snapshot(process.out.log).match("pe_log") }, + { assert snapshot(process.out.versions).match("pe_versions") } + ) + } + } +} diff --git a/modules/nf-core/kallisto/quant/tests/main.nf.test.snap b/modules/nf-core/kallisto/quant/tests/main.nf.test.snap new file mode 100644 index 000000000..d59d7f06f --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/main.nf.test.snap @@ -0,0 +1,58 @@ +{ + "pe_log": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.log:md5,8a5987f8e779cd12ca708e2212f771f5" + ] + ] + ], + "timestamp": "2023-11-02T09:16:05.163403975" + }, + "se_versions": { + "content": [ + [ + "versions.yml:md5,f981ad0cc089194a8fb00a47948eea94" + ] + ], + "timestamp": "2023-10-30T22:11:17.982220232" + }, + "abundance_tsv_paired": { + 
"content": [ + "abundance.tsv:md5,f0a9a2543f8fc0c8442be0a939d70f66" + ], + "timestamp": "2023-11-02T09:16:05.157883165" + }, + "se_log": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.log:md5,9c166f0c50cd4fdbdbf1bff9d5d8aba2" + ] + ] + ], + "timestamp": "2023-10-30T22:11:17.973230763" + }, + "abundance_tsv_single": { + "content": [ + "abundance.tsv:md5,8a4afe91e6a75b4e619daaf664eb7d9b" + ], + "timestamp": "2023-11-02T09:01:51.48615229" + }, + "pe_versions": { + "content": [ + [ + "versions.yml:md5,f981ad0cc089194a8fb00a47948eea94" + ] + ], + "timestamp": "2023-11-02T09:16:05.168753684" + } +} diff --git a/modules/nf-core/kallisto/quant/tests/paired_end.config b/modules/nf-core/kallisto/quant/tests/paired_end.config new file mode 100644 index 000000000..8730f1c4b --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/paired_end.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} diff --git a/modules/nf-core/kallisto/quant/tests/single_end.config b/modules/nf-core/kallisto/quant/tests/single_end.config new file mode 100644 index 000000000..7022246bc --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/single_end.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} diff --git a/modules/nf-core/kallisto/quant/tests/tags.yml b/modules/nf-core/kallisto/quant/tests/tags.yml new file mode 100644 index 000000000..460936583 --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/tags.yml @@ -0,0 +1,3 @@ +kallisto/quant: + - modules/nf-core/kallisto/index/** + - modules/nf-core/kallisto/quant/** diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf index facd7efb9..ebfa0864d 100644 --- a/modules/nf-core/picard/markduplicates/main.nf +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -30,6 +30,9 @@ process PICARD_MARKDUPLICATES { } else { avail_mem = (task.memory.mega*0.8).intValue() } + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ picard \\ -Xmx${avail_mem}M \\ @@ -48,6 +51,7 @@ process PICARD_MARKDUPLICATES { stub: def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" """ touch ${prefix}.bam touch ${prefix}.bam.bai diff --git a/modules/nf-core/qualimap/rnaseq/main.nf b/modules/nf-core/qualimap/rnaseq/main.nf index 2c0e4105f..044c983fc 100644 --- a/modules/nf-core/qualimap/rnaseq/main.nf +++ b/modules/nf-core/qualimap/rnaseq/main.nf @@ -9,7 +9,7 @@ process QUALIMAP_RNASEQ { input: tuple val(meta), path(bam) - path gtf + tuple val(meta2), path(gtf) output: tuple val(meta), path("${prefix}"), emit: results diff --git a/modules/nf-core/qualimap/rnaseq/meta.yml b/modules/nf-core/qualimap/rnaseq/meta.yml new file mode 100644 index 000000000..7738f08d8 --- /dev/null +++ b/modules/nf-core/qualimap/rnaseq/meta.yml @@ -0,0 +1,52 @@ +name: qualimap_rnaseq +description: Evaluate alignment data +keywords: + - quality control + - qc + - rnaseq +tools: + - qualimap: + description: | + Qualimap 2 is a platform-independent application written in + Java and R that provides both a Graphical User Interface and + a command-line interface to facilitate the quality control of + alignment sequencing data and its derivatives like feature counts. 
+ homepage: http://qualimap.bioinfo.cipf.es/ + documentation: http://qualimap.conesalab.org/doc_html/index.html + doi: 10.1093/bioinformatics/bts503 + licence: ["GPL-2.0-only"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - gtf: + type: file + description: GTF file of the reference genome + pattern: "*.{gtf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - results: + type: directory + description: Qualimap results dir + pattern: "*/*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@FriederikeHanssen" diff --git a/modules/nf-core/salmon/quant/main.nf b/modules/nf-core/salmon/quant/main.nf index e356af26d..f11b6c2e2 100644 --- a/modules/nf-core/salmon/quant/main.nf +++ b/modules/nf-core/salmon/quant/main.nf @@ -28,7 +28,9 @@ process SALMON_QUANT { prefix = task.ext.prefix ?: "${meta.id}" def reference = "--index $index" - def input_reads = meta.single_end ? "-r $reads" : "-1 ${reads[0]} -2 ${reads[1]}" + def reads1 = [], reads2 = [] + meta.single_end ? [reads].flatten().each{reads1 << it} : reads.eachWithIndex{ v, ix -> ( ix & 1 ? reads2 : reads1) << v } + def input_reads = meta.single_end ? "-r ${reads1.join(" ")}" : "-1 ${reads1.join(" ")} -2 ${reads2.join(" ")}" if (alignment_mode) { reference = "-t $transcript_fasta" input_reads = "-a $reads" diff --git a/modules/nf-core/salmon/quant/meta.yml b/modules/nf-core/salmon/quant/meta.yml index ea01e0df8..e809ade2b 100644 --- a/modules/nf-core/salmon/quant/meta.yml +++ b/modules/nf-core/salmon/quant/meta.yml @@ -22,8 +22,9 @@ input: - reads: type: file description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. + List of input FastQ files for single-end or paired-end data. + Multiple single-end fastqs or pairs of paired-end fastqs are + handled. 
- index: type: directory description: Folder containing the star index files diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf index eb7e72fc6..b75707eca 100644 --- a/modules/nf-core/samtools/flagstat/main.nf +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -32,4 +32,15 @@ process SAMTOOLS_FLAGSTAT { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf index a257d7002..83c7c34b9 100644 --- a/modules/nf-core/samtools/idxstats/main.nf +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -33,4 +33,16 @@ process SAMTOOLS_IDXSTATS { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/star/align/main.nf b/modules/nf-core/star/align/main.nf index 8cb8e9a4d..d0e203848 100644 --- a/modules/nf-core/star/align/main.nf +++ b/modules/nf-core/star/align/main.nf @@ -9,8 +9,8 @@ process STAR_ALIGN { input: tuple val(meta), path(reads, stageAs: "input*/*") - path index - path gtf + tuple val(meta2), path(index) + tuple val(meta3), path(gtf) val star_ignore_sjdbgtf val seq_platform val seq_center diff --git a/modules/nf-core/star/align/meta.yml b/modules/nf-core/star/align/meta.yml index bce16d360..3d8fed0cc 100644 --- a/modules/nf-core/star/align/meta.yml +++ b/modules/nf-core/star/align/meta.yml @@ -25,10 +25,34 @@ input: description: | List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] - index: type: directory description: STAR genome index pattern: "star" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - gtf: + type: file + description: Annotation GTF file + pattern: "*.{gtf}" + - star_ignore_sjdbgtf: + type: boolean + description: Ignore annotation GTF file + - seq_platform: + type: string + description: Sequencing platform + - seq_center: + type: string + description: Sequencing center + output: - bam: type: file diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf index 2407d0066..434240427 100644 --- a/modules/nf-core/star/genomegenerate/main.nf +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -8,12 +8,12 @@ process STAR_GENOMEGENERATE { 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }" input: - path fasta - path gtf + tuple val(meta), path(fasta) + tuple val(meta2), path(gtf) output: - path "star" , emit: index - path "versions.yml", emit: versions + tuple val(meta), path("star") , emit: index + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml index 8181157a1..eba2d9cf1 100644 --- a/modules/nf-core/star/genomegenerate/meta.yml +++ b/modules/nf-core/star/genomegenerate/meta.yml @@ -15,14 +15,29 @@ tools: doi: 10.1093/bioinformatics/bts635 licence: ["MIT"] input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] - fasta: type: file description: Fasta file of the reference genome + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] - gtf: type: file description: GTF file of the reference genome output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] - index: type: directory description: Folder containing the star index files diff --git a/modules/nf-core/ucsc/bedclip/meta.yml b/modules/nf-core/ucsc/bedclip/meta.yml index ca278552a..cc5d9d715 100755 --- a/modules/nf-core/ucsc/bedclip/meta.yml +++ b/modules/nf-core/ucsc/bedclip/meta.yml @@ -1,7 +1,9 @@ name: ucsc_bedclip -description: See http://hgdownload.cse.ucsc.edu/admin/exe/ +description: Remove lines from bed file that refer to off-chromosome locations. keywords: - - sort + - bed + - genomics + - ucsc tools: - ucsc: description: Remove lines from bed file that refer to off-chromosome locations. diff --git a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf index 054924e7e..06bb47099 100644 --- a/modules/nf-core/ucsc/bedgraphtobigwig/main.nf +++ b/modules/nf-core/ucsc/bedgraphtobigwig/main.nf @@ -3,10 +3,10 @@ process UCSC_BEDGRAPHTOBIGWIG { label 'process_single' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - conda "bioconda::ucsc-bedgraphtobigwig=377" + conda "bioconda::ucsc-bedgraphtobigwig=445" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ucsc-bedgraphtobigwig:377--h446ed27_1' : - 'biocontainers/ucsc-bedgraphtobigwig:377--h446ed27_1' }" + 'https://depot.galaxyproject.org/singularity/ucsc-bedgraphtobigwig:445--h954228d_0' : + 'biocontainers/ucsc-bedgraphtobigwig:445--h954228d_0' }" input: tuple val(meta), path(bedgraph) @@ -22,7 +22,7 @@ process UCSC_BEDGRAPHTOBIGWIG { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = '377' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + def VERSION = '445' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ bedGraphToBigWig \\ $bedgraph \\ @@ -34,4 +34,16 @@ process UCSC_BEDGRAPHTOBIGWIG { ucsc: $VERSION END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '445' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch ${prefix}.bigWig + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ucsc: $VERSION + END_VERSIONS + """ } diff --git a/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml b/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml index ba8915bed..416c91e07 100755 --- a/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml +++ b/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml @@ -3,6 +3,9 @@ description: Convert a bedGraph file to bigWig format. keywords: - bedgraph - bigwig + - ucsc + - bedgraphtobigwig + - converter tools: - ucsc: description: Convert a bedGraph file to bigWig format. diff --git a/modules/nf-core/umitools/dedup/main.nf b/modules/nf-core/umitools/dedup/main.nf index 2bd95da7c..56ea04691 100644 --- a/modules/nf-core/umitools/dedup/main.nf +++ b/modules/nf-core/umitools/dedup/main.nf @@ -12,7 +12,8 @@ process UMITOOLS_DEDUP { val get_output_stats output: - tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("${prefix}.bam") , emit: bam + tuple val(meta), path("*.log") , emit: log tuple val(meta), path("*edit_distance.tsv"), optional:true, emit: tsv_edit_distance tuple val(meta), path("*per_umi.tsv") , optional:true, emit: tsv_per_umi tuple val(meta), path("*per_position.tsv") , optional:true, emit: tsv_umi_per_position @@ -23,9 +24,10 @@ process UMITOOLS_DEDUP { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" def paired = meta.single_end ? "" : "--paired" - def stats = get_output_stats ? "--output-stats $prefix" : "" + stats = get_output_stats ? "--output-stats ${prefix}" : "" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" 
if (!(args ==~ /.*--random-seed.*/)) {args += " --random-seed=100"} """ @@ -33,6 +35,7 @@ dedup \\ -I $bam \\ -S ${prefix}.bam \\ + -L ${prefix}.log \\ $stats \\ $paired \\ $args @@ -42,4 +45,19 @@ umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') END_VERSIONS """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.log + touch ${prefix}_edit_distance.tsv + touch ${prefix}_per_umi.tsv + touch ${prefix}_per_position.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/umitools/dedup/meta.yml b/modules/nf-core/umitools/dedup/meta.yml index 0719a9552..534d4c6b0 100644 --- a/modules/nf-core/umitools/dedup/meta.yml +++ b/modules/nf-core/umitools/dedup/meta.yml @@ -3,6 +3,7 @@ description: Deduplicate reads based on the mapping co-ordinate and the UMI atta keywords: - umitools - deduplication + - dedup tools: - umi_tools: description: > @@ -40,6 +41,10 @@ output: type: file description: BAM file with deduplicated UMIs. pattern: "*.{bam}" + - log: + type: file + description: File with logging information + pattern: "*.{log}" - tsv_edit_distance: type: file description: Reports the (binned) average edit distance between the UMIs at each position. diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf index 8cd1856c7..61461c391 100644 --- a/modules/nf-core/untar/main.nf +++ b/modules/nf-core/untar/main.nf @@ -2,7 +2,7 @@ process UNTAR { tag "$archive" label 'process_single' - conda "conda-forge::sed=4.7 bioconda::grep=3.4 conda-forge::tar=1.34" + conda "conda-forge::sed=4.7 conda-forge::grep=3.11 conda-forge::tar=1.34" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/nextflow.config b/nextflow.config index 09bed2da2..ab539161f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,8 @@ params { splicesites = null gtf_extra_attributes = 'gene_name' gtf_group_features = 'gene_id' + skip_gtf_filter = false + skip_gtf_transcript_filter = false featurecounts_feature_type = 'exon' featurecounts_group_type = 'gene_biotype' gencode = false @@ -66,6 +68,9 @@ params { min_mapped_reads = 5 extra_star_align_args = null extra_salmon_quant_args = null + extra_kallisto_quant_args = null + kallisto_quant_fraglen = 200 + kallisto_quant_fraglen_sd = 200 save_merged_fastq = false save_unaligned = false save_align_intermeds = false @@ -105,17 +110,14 @@ params { hook_url = null help = false version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' // Config options + config_profile_name = null + config_profile_description = null custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null config_profile_contact = null config_profile_url = null - config_profile_name = null test_data_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3' // Max resource options @@ -123,6 +125,14 @@ params { max_memory = '128.GB' max_cpus = 16 max_time = '240.h' + + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true + } // Load base.config by default for all pipelines @@ -219,6 +229,7 @@ profiles { } apptainer { apptainer.enabled = true + apptainer.autoMounts = true conda.enabled = false docker.enabled = false singularity.enabled = false @@ -228,17 +239,14 @@ profiles { } gitpod { executor.name = 'local' - executor.cpus = 16 - executor.memory = 60.GB - } - public_aws_ecr { - includeConfig 'conf/public_aws_ecr.config' + executor.cpus = 4 + executor.memory = 8.GB } test { includeConfig 'conf/test.config' } test_cache { includeConfig 'conf/test_cache.config' } test_full { includeConfig 'conf/test_full.config' } - test_full_aws { - includeConfig 'conf/test_full.config' + test_full_aws { + includeConfig 'conf/test_full.config' } test_full_gcp { includeConfig 'conf/test_full.config' @@ -250,6 +258,19 @@ profiles { } } +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' + +// Nextflow plugins +plugins { + id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} + // Load igenomes.config if required if (!params.igenomes_ignore) { includeConfig 'conf/igenomes.config' @@ -271,12 +292,6 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -// Set default registry for Docker and Podman independent of -profile -// Will not be used unless Docker / Podman are enabled -// Set to your registry if you have a mirror of containers -docker.registry = 'quay.io' -podman.registry = 'quay.io' - def trace_timestamp = new 
java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true @@ -301,8 +316,8 @@ manifest { homePage = 'https://github.com/nf-core/rnaseq' description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control.""" mainScript = 'main.nf' - nextflowVersion = '!>=22.10.1' - version = '3.12.0' + nextflowVersion = '!>=23.04.0' + version = '3.13.0' doi = 'https://doi.org/10.5281/zenodo.1400710' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 4bb08ed2d..60e6585cf 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,11 +10,12 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["outdir"], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", "schema": "assets/schema_input.json", @@ -39,129 +40,9 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" - }, - "save_merged_fastq": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Save FastQ files after merging re-sequenced libraries in the results directory." } } }, - "umi_options": { - "title": "UMI options", - "type": "object", - "description": "Options for processing reads with unique molecular identifiers", - "default": "", - "properties": { - "with_umi": { - "type": "boolean", - "fa_icon": "fas fa-barcode", - "description": "Enable UMI-based read deduplication." - }, - "umitools_extract_method": { - "type": "string", - "default": "string", - "fa_icon": "fas fa-barcode", - "description": "UMI pattern to use. Can be either 'string' (default) or 'regex'.", - "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).\n" - }, - "skip_umi_extract": { - "type": "boolean", - "fa_icon": "fas fa-compress-alt", - "description": "Skip the UMI extraction from the read in case the UMIs have been moved to the headers in advance of the pipeline run." - }, - "umitools_bc_pattern": { - "type": "string", - "fa_icon": "fas fa-barcode", - "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).", - "description": "The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6 nucleotides of the read are from the UMI." - }, - "umitools_bc_pattern2": { - "type": "string", - "fa_icon": "fas fa-barcode", - "description": "The UMI barcode pattern to use if the UMI is located in read 2." - }, - "umi_discard_read": { - "type": "integer", - "fa_icon": "fas fa-barcode", - "description": "After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively." - }, - "umitools_umi_separator": { - "type": "string", - "fa_icon": "fas fa-star-half-alt", - "description": "The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with UMI-tools and used other software." - }, - "umitools_grouping_method": { - "type": "string", - "default": "directional", - "fa_icon": "far fa-object-ungroup", - "description": "Method to use to determine read groups by subsuming those with similar UMIs. 
All methods start by identifying the reads with the same mapping position, but treat similar yet nonidentical UMIs differently.", - "enum": ["unique", "percentile", "cluster", "adjacency", "directional"] - }, - "umitools_dedup_stats": { - "type": "boolean", - "fa_icon": "fas fa-barcode", - "help_text": "It can be quite time consuming generating these output stats - see [#827](https://github.com/nf-core/rnaseq/issues/827).", - "description": "Generate output stats when running \"umi_tools dedup\"." - }, - "save_umi_intermeds": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "If this option is specified, intermediate FastQ and BAM files produced by UMI-tools are also saved in the results directory." - } - }, - "fa_icon": "fas fa-barcode" - }, - "read_filtering_options": { - "title": "Read filtering options", - "type": "object", - "description": "Options for filtering reads prior to alignment", - "default": "", - "properties": { - "bbsplit_fasta_list": { - "type": "string", - "fa_icon": "fas fa-list-alt", - "description": "Path to comma-separated file containing a list of reference genomes to filter reads against with BBSplit. You have to also explicitly set `--skip_bbsplit false` if you want to use BBSplit.", - "help_text": "The file should contain 2 columns: short name and full path to reference genome(s) e.g. \n```\nmm10,/path/to/mm10.fa\necoli,/path/to/ecoli.fa\n```" - }, - "bbsplit_index": { - "type": "string", - "fa_icon": "fas fa-bezier-curve", - "description": "Path to directory or tar.gz archive for pre-built BBSplit index.", - "help_text": "The BBSplit index will have to be built at least once with this pipeline (see `--save_reference` to save index). It can then be provided via `--bbsplit_index` for future runs." - }, - "save_bbsplit_reads": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "If this option is specified, FastQ files split by reference will be saved in the results directory." - }, - "skip_bbsplit": { - "type": "boolean", - "default": true, - "fa_icon": "fas fa-fast-forward", - "description": "Skip BBSplit for removal of non-reference genome reads." - }, - "remove_ribo_rna": { - "type": "boolean", - "fa_icon": "fas fa-trash-alt", - "description": "Enable the removal of reads derived from ribosomal RNA using SortMeRNA.", - "help_text": "Any patterns found in the sequences defined by the '--ribo_database_manifest' parameter will be used." - }, - "ribo_database_manifest": { - "type": "string", - "default": "${projectDir}/assets/rrna-db-defaults.txt", - "fa_icon": "fas fa-database", - "description": "Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA.", - "help_text": "By default, [rRNA databases](https://github.com/biocore/sortmerna/tree/master/data/rRNA_databases) defined in the SortMeRNA GitHub repo are used. You can see an example in the pipeline Github repository in `assets/rrna-default-dbs.txt`.\nPlease note that commercial/non-academic entities require [`licensing for SILVA`](https://www.arb-silva.de/silva-license-information) for these default databases." - }, - "save_non_ribo_reads": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "If this option is specified, intermediate FastQ files containing non-rRNA reads will be saved in the results directory." 
- } - }, - "fa_icon": "fas fa-trash-alt" - }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -172,11 +53,12 @@ "type": "string", "description": "Name of iGenomes reference.", "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "help_text": "If using a reference genome configured in the pipeline using iGenomes (not recommended), use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, "fasta": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", @@ -186,6 +68,7 @@ "gtf": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.gtf(\\.gz)?$", "description": "Path to GTF annotation file.", @@ -195,6 +78,7 @@ "gff": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.gff(\\.gz)?$", "fa_icon": "fas fa-code-branch", @@ -204,6 +88,7 @@ "gene_bed": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.bed(\\.gz)?$", "fa_icon": "fas fa-procedures", @@ -212,6 +97,7 @@ "transcript_fasta": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "fa_icon": "far fa-file-code", @@ -220,6 +106,7 @@ "additional_fasta": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "fa_icon": "far fa-file-code", @@ -229,6 +116,7 @@ "splicesites": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "fa_icon": "fas fa-hand-scissors", "description": "Splice sites file required for HISAT2." @@ -236,27 +124,38 @@ "star_index": { "type": "string", "format": "path", + "exists": true, "fa_icon": "fas fa-bezier-curve", "description": "Path to directory or tar.gz archive for pre-built STAR index." }, "hisat2_index": { "type": "string", "format": "path", + "exists": true, "fa_icon": "fas fa-bezier-curve", "description": "Path to directory or tar.gz archive for pre-built HISAT2 index." }, "rsem_index": { "type": "string", "format": "path", + "exists": true, "fa_icon": "fas fa-bezier-curve", "description": "Path to directory or tar.gz archive for pre-built RSEM index." }, "salmon_index": { "type": "string", "format": "path", + "exists": true, "fa_icon": "fas fa-bezier-curve", "description": "Path to directory or tar.gz archive for pre-built Salmon index." }, + "kallisto_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built Kallisto index." + }, "hisat2_build_memory": { "type": "string", "default": "200.GB", @@ -297,12 +196,6 @@ "fa_icon": "fas fa-indent", "help_text": "The feature type used from the GTF file when generating the biotype plot with featureCounts." 
}, - "save_reference": { - "type": "boolean", - "description": "If generated by the pipeline save the STAR index in the results directory.", - "help_text": "If an alignment index is generated by the pipeline use this parameter to save it to your results folder. These can then be used for future pipeline runs, reducing processing times.", - "fa_icon": "fas fa-save" - }, "igenomes_base": { "type": "string", "format": "directory-path", @@ -318,7 +211,8 @@ "hidden": true, "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." } - } + }, + "required": ["fasta"] }, "read_trimming_options": { "title": "Read trimming options", @@ -348,20 +242,105 @@ "default": 10000, "fa_icon": "fas fa-hand-paper", "description": "Minimum number of trimmed reads below which samples are removed from further processing. Some downstream steps in the pipeline will fail if this threshold is too low." + } + } + }, + "read_filtering_options": { + "title": "Read filtering options", + "type": "object", + "description": "Options for filtering reads prior to alignment", + "default": "", + "properties": { + "bbsplit_fasta_list": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "fa_icon": "fas fa-list-alt", + "description": "Path to comma-separated file containing a list of reference genomes to filter reads against with BBSplit. You have to also explicitly set `--skip_bbsplit false` if you want to use BBSplit.", + "help_text": "The file should contain 2 columns: short name and full path to reference genome(s) e.g. \n```\nmm10,/path/to/mm10.fa\necoli,/path/to/ecoli.fa\n```" }, - "skip_trimming": { + "bbsplit_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built BBSplit index.", + "help_text": "The BBSplit index will have to be built at least once with this pipeline (see `--save_reference` to save index). It can then be provided via `--bbsplit_index` for future runs." + }, + "remove_ribo_rna": { "type": "boolean", - "description": "Skip the adapter trimming step.", - "help_text": "Use this if your input FastQ files have already been trimmed outside of the workflow or if you're very confident that there is no adapter contamination in your data.", - "fa_icon": "fas fa-fast-forward" + "fa_icon": "fas fa-trash-alt", + "description": "Enable the removal of reads derived from ribosomal RNA using SortMeRNA.", + "help_text": "Any patterns found in the sequences defined by the '--ribo_database_manifest' parameter will be used." }, - "save_trimmed": { + "ribo_database_manifest": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "default": "${projectDir}/assets/rrna-db-defaults.txt", + "fa_icon": "fas fa-database", + "description": "Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA.", + "help_text": "By default, [rRNA databases](https://github.com/biocore/sortmerna/tree/master/data/rRNA_databases) defined in the SortMeRNA GitHub repo are used. You can see an example in the pipeline Github repository in `assets/rrna-default-dbs.txt`.\nPlease note that commercial/non-academic entities require [`licensing for SILVA`](https://www.arb-silva.de/silva-license-information) for these default databases." 
+ } + }, + "fa_icon": "fas fa-trash-alt" + }, + "umi_options": { + "title": "UMI options", + "type": "object", + "description": "Options for processing reads with unique molecular identifiers", + "default": "", + "properties": { + "with_umi": { "type": "boolean", - "description": "Save the trimmed FastQ files in the results directory.", - "help_text": "By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete.", - "fa_icon": "fas fa-save" + "fa_icon": "fas fa-barcode", + "description": "Enable UMI-based read deduplication." + }, + "umitools_extract_method": { + "type": "string", + "default": "string", + "fa_icon": "fas fa-barcode", + "description": "UMI extraction method to use. Can be either 'string' (default) or 'regex'.", + "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).\n" + }, + "umitools_bc_pattern": { + "type": "string", + "fa_icon": "fas fa-barcode", + "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).", + "description": "The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6 nucleotides of the read are from the UMI." + }, + "umitools_bc_pattern2": { + "type": "string", + "fa_icon": "fas fa-barcode", + "description": "The UMI barcode pattern to use if the UMI is located in read 2." + }, + "umi_discard_read": { + "type": "integer", + "fa_icon": "fas fa-barcode", + "description": "After UMI barcode extraction, discard either R1 or R2 by setting this parameter to 1 or 2, respectively." + }, + "umitools_umi_separator": { + "type": "string", + "fa_icon": "fas fa-star-half-alt", + "description": "The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with UMI-tools and used other software." + }, + "umitools_grouping_method": { + "type": "string", + "default": "directional", + "fa_icon": "far fa-object-ungroup", + "description": "Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying the reads with the same mapping position, but treat similar yet nonidentical UMIs differently.", + "enum": ["unique", "percentile", "cluster", "adjacency", "directional"] + }, + "umitools_dedup_stats": { + "type": "boolean", + "fa_icon": "fas fa-barcode", + "help_text": "It can be quite time consuming to generate these output stats - see [#827](https://github.com/nf-core/rnaseq/issues/827).", + "description": "Generate output stats when running \"umi_tools dedup\"." } - } + }, + "fa_icon": "fas fa-barcode" }, "alignment_options": { "title": "Alignment options", @@ -380,7 +359,7 @@ "type": "string", "description": "Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner'.", "fa_icon": "fas fa-hamburger", - "enum": ["salmon"] + "enum": ["salmon", "kallisto"] }, "bam_csi_index": { "type": "boolean", @@ -396,11 +375,29 @@ "type": "string", "fa_icon": "fas fa-fast-forward", "description": " Override Salmon library type inferred based on strandedness defined in meta object.", - "help_text": "See [Salmon docs](https://salmon.readthedocs.io/en/latest/library_type.html)."
+ "help_text": "See [Salmon docs](https://salmon.readthedocs.io/en/latest/library_type.html).", + "enum": [ + "A", + "IS", + "ISF", + "ISR", + "IU", + "MS", + "MSF", + "MSR", + "MU", + "OS", + "OSF", + "OSR", + "OU", + "SF", + "SR", + "U" + ] }, "min_mapped_reads": { "type": "number", - "default": 5, + "default": 5.0, "fa_icon": "fas fa-percentage", "description": "Minimum percentage of uniquely mapped reads below which samples are removed from further processing.", "help_text": "Some downstream steps in the pipeline will fail if this threshold is too low." @@ -417,7 +414,7 @@ }, "extra_star_align_args": { "type": "string", - "description": "Extra arguments to pass to STAR alignment command in addition to defaults defined by the pipeline.", + "description": "Extra arguments to pass to STAR alignment command in addition to defaults defined by the pipeline. Only available for the STAR-Salmon route.", "fa_icon": "fas fa-plus" }, "extra_salmon_quant_args": { @@ -425,53 +422,146 @@ "description": "Extra arguments to pass to Salmon quant command in addition to defaults defined by the pipeline.", "fa_icon": "fas fa-plus" }, - "save_unaligned": { + "extra_kallisto_quant_args": { + "type": "string", + "description": "Extra arguments to pass to Kallisto quant command in addition to defaults defined by the pipeline.", + "fa_icon": "fas fa-plus" + }, + "kallisto_quant_fraglen": { + "type": "integer", + "description": "In single-end mode Kallisto requires an estimated fragment length. Specify a default value for that here. TODO: use existing RSeQC results to do this dynamically.", + "default": 200, + "fa_icon": "fas fa-ruler-horizontal" + }, + "kallisto_quant_fraglen_sd": { + "type": "integer", + "description": "In single-end mode, Kallisto requires an estimated standard error for fragment length. Specify a default value for that here. TODO: use existing RSeQC results to do this dynamically.", + "default": 200, + "fa_icon": "fas fa-sort-amount-up-alt" + } + } + }, + "optional_outputs": { + "title": "Optional outputs", + "type": "object", + "description": "Additional output files produces as intermediates that can be saved", + "default": "", + "properties": { + "save_merged_fastq": { "type": "boolean", "fa_icon": "fas fa-save", - "description": "Where possible, save unaligned reads from either STAR, HISAT2 or Salmon to the results directory.", - "help_text": "This may either be in the form of FastQ or BAM files depending on the options available for that particular tool." + "description": "Save FastQ files after merging re-sequenced libraries in the results directory." }, - "save_align_intermeds": { + "save_umi_intermeds": { "type": "boolean", - "description": "Save the intermediate BAM files from the alignment step.", - "help_text": "By default, intermediate BAM files will not be saved. The final BAM files created after the appropriate filtering step are always saved to limit storage usage. Set this parameter to also save other intermediate BAM files.", + "fa_icon": "fas fa-save", + "description": "If this option is specified, intermediate FastQ and BAM files produced by UMI-tools are also saved in the results directory." + }, + "save_non_ribo_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "If this option is specified, intermediate FastQ files containing non-rRNA reads will be saved in the results directory." 
+ }, + "save_bbsplit_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "If this option is specified, FastQ files split by reference will be saved in the results directory." + }, + "save_reference": { + "type": "boolean", + "description": "If generated by the pipeline save the STAR index in the results directory.", + "help_text": "If an alignment index is generated by the pipeline use this parameter to save it to your results folder. These can then be used for future pipeline runs, reducing processing times.", "fa_icon": "fas fa-save" }, - "skip_markduplicates": { + "save_trimmed": { "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "description": "Skip picard MarkDuplicates step." + "description": "Save the trimmed FastQ files in the results directory.", + "help_text": "By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete.", + "fa_icon": "fas fa-save" }, - "skip_alignment": { + "save_align_intermeds": { "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "description": "Skip all of the alignment-based processes within the pipeline." + "description": "Save the intermediate BAM files from the alignment step.", + "help_text": "By default, intermediate BAM files will not be saved. The final BAM files created after the appropriate filtering step are always saved to limit storage usage. Set this parameter to also save other intermediate BAM files.", + "fa_icon": "fas fa-save" }, - "skip_pseudo_alignment": { + "save_unaligned": { "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "description": "Skip all of the pseudo-alignment-based processes within the pipeline." + "fa_icon": "fas fa-save", + "description": "Where possible, save unaligned reads from either STAR, HISAT2 or Salmon to the results directory.", + "help_text": "This may either be in the form of FastQ or BAM files depending on the options available for that particular tool." } } }, - "process_skipping_options": { - "title": "Process skipping options", + "quality_control": { + "title": "Quality Control", "type": "object", - "fa_icon": "fas fa-fast-forward", - "description": "Options to skip various steps within the workflow.", + "description": "Additional quality control options.", + "default": "", "properties": { + "deseq2_vst": { + "type": "boolean", + "description": "Use vst transformation instead of rlog with DESeq2.", + "help_text": "See [DESeq2 docs](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization).", + "fa_icon": "fas fa-dolly", + "default": true + }, "rseqc_modules": { "type": "string", "default": "bam_stat,inner_distance,infer_experiment,junction_annotation,junction_saturation,read_distribution,read_duplication", "fa_icon": "fas fa-chart-pie", "description": "Specify the RSeQC modules to run." + } + } + }, + "process_skipping_options": { + "title": "Process skipping options", + "type": "object", + "fa_icon": "fas fa-fast-forward", + "description": "Options to skip various steps within the workflow.", + "properties": { + "skip_gtf_filter": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.", + "help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option." 
}, - "deseq2_vst": { + "skip_gtf_transcript_filter": { "type": "boolean", - "description": "Use vst transformation instead of rlog with DESeq2.", - "help_text": "See [DESeq2 docs](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization).", - "fa_icon": "fas fa-dolly", - "default": true + "fa_icon": "fas fa-forward", + "description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline." + }, + "skip_bbsplit": { + "type": "boolean", + "default": true, + "fa_icon": "fas fa-fast-forward", + "description": "Skip BBSplit for removal of non-reference genome reads." + }, + "skip_umi_extract": { + "type": "boolean", + "fa_icon": "fas fa-compress-alt", + "description": "Skip the UMI extraction from the read in case the UMIs have been moved to the headers in advance of the pipeline run." + }, + "skip_trimming": { + "type": "boolean", + "description": "Skip the adapter trimming step.", + "help_text": "Use this if your input FastQ files have already been trimmed outside of the workflow or if you're very confident that there is no adapter contamination in your data.", + "fa_icon": "fas fa-fast-forward" + }, + "skip_alignment": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip all of the alignment-based processes within the pipeline." + }, + "skip_pseudo_alignment": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip all of the pseudoalignment-based processes within the pipeline." + }, + "skip_markduplicates": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip picard MarkDuplicates step." }, "skip_bigwig": { "type": "boolean", @@ -615,7 +705,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" } @@ -685,18 +775,27 @@ }, "multiqc_config": { "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, "multiqc_logo": { "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", "fa_icon": "fas fa-image", "hidden": true }, "multiqc_methods_description": { "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, @@ -707,12 +806,26 @@ "fa_icon": "fas fa-check-square", "hidden": true }, - "show_hidden_params": { + "validationShowHiddenParams": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." 
+ }, + "validationFailUnrecognisedParams": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", + "hidden": true, + "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." + }, + "validationLenientMode": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient more.", + "hidden": true, + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } } @@ -722,20 +835,26 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/umi_options" + "$ref": "#/definitions/reference_genome_options" }, { - "$ref": "#/definitions/read_filtering_options" + "$ref": "#/definitions/read_trimming_options" }, { - "$ref": "#/definitions/reference_genome_options" + "$ref": "#/definitions/read_filtering_options" }, { - "$ref": "#/definitions/read_trimming_options" + "$ref": "#/definitions/umi_options" }, { "$ref": "#/definitions/alignment_options" }, + { + "$ref": "#/definitions/optional_outputs" + }, + { + "$ref": "#/definitions/quality_control" + }, { "$ref": "#/definitions/process_skipping_options" }, diff --git a/pyproject.toml b/pyproject.toml index 0d62beb6f..bc01239b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ -# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Config file for Python. Mostly used to configure linting of bin/*.py with Black. # Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. [tool.black] line-length = 120 diff --git a/subworkflows/local/align_star.nf b/subworkflows/local/align_star/main.nf similarity index 91% rename from subworkflows/local/align_star.nf rename to subworkflows/local/align_star/main.nf index ffa027257..1dba0c085 100644 --- a/subworkflows/local/align_star.nf +++ b/subworkflows/local/align_star/main.nf @@ -2,15 +2,15 @@ // Alignment with STAR // -include { STAR_ALIGN } from '../../modules/nf-core/star/align/main' -include { STAR_ALIGN_IGENOMES } from '../../modules/local/star_align_igenomes' -include { BAM_SORT_STATS_SAMTOOLS } from '../nf-core/bam_sort_stats_samtools/main' +include { STAR_ALIGN } from '../../../modules/nf-core/star/align' +include { STAR_ALIGN_IGENOMES } from '../../../modules/local/star_align_igenomes' +include { BAM_SORT_STATS_SAMTOOLS } from '../../nf-core/bam_sort_stats_samtools' workflow ALIGN_STAR { take: reads // channel: [ val(meta), [ reads ] ] - index // channel: /path/to/star/index/ - gtf // channel: /path/to/genome.gtf + index // channel: [ val(meta), [ index ] ] + gtf // channel: [ val(meta), [ gtf ] ] star_ignore_sjdbgtf // boolean: when using pre-built STAR indices do not re-extract and use splice junctions from the GTF file seq_platform // string : sequencing platform seq_center // string : sequencing center diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index 172d11ea5..000000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,45 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, 
sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - meta.strandedness = row.strandedness - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta -} diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome/main.nf similarity index 69% rename from subworkflows/local/prepare_genome.nf rename to subworkflows/local/prepare_genome/main.nf index b83126cd7..0be947954 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -2,34 +2,36 @@ // Uncompress and prepare reference genome files // -include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GTF } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GENE_BED } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GFF } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GENE_BED } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../../modules/nf-core/gunzip' -include { UNTAR as UNTAR_BBSPLIT_INDEX } from '../../modules/nf-core/untar/main' -include { UNTAR as UNTAR_STAR_INDEX } from '../../modules/nf-core/untar/main' -include { UNTAR as UNTAR_RSEM_INDEX } from '../../modules/nf-core/untar/main' -include { UNTAR as UNTAR_HISAT2_INDEX } from '../../modules/nf-core/untar/main' -include { UNTAR as UNTAR_SALMON_INDEX } from '../../modules/nf-core/untar/main' +include { UNTAR as UNTAR_BBSPLIT_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_STAR_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_RSEM_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_HISAT2_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_SALMON_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_KALLISTO_INDEX } from '../../../modules/nf-core/untar' -include { CUSTOM_GETCHROMSIZES } from '../../modules/nf-core/custom/getchromsizes/main' -include { GFFREAD } from '../../modules/nf-core/gffread/main' -include { BBMAP_BBSPLIT } from '../../modules/nf-core/bbmap/bbsplit/main' -include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate/main' -include 
{ HISAT2_EXTRACTSPLICESITES } from '../../modules/nf-core/hisat2/extractsplicesites/main' -include { HISAT2_BUILD } from '../../modules/nf-core/hisat2/build/main' -include { SALMON_INDEX } from '../../modules/nf-core/salmon/index/main' -include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from '../../modules/nf-core/rsem/preparereference/main' -include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../modules/nf-core/rsem/preparereference/main' +include { CUSTOM_GETCHROMSIZES } from '../../../modules/nf-core/custom/getchromsizes' +include { GFFREAD } from '../../../modules/nf-core/gffread' +include { BBMAP_BBSPLIT } from '../../../modules/nf-core/bbmap/bbsplit' +include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate' +include { HISAT2_EXTRACTSPLICESITES } from '../../../modules/nf-core/hisat2/extractsplicesites' +include { HISAT2_BUILD } from '../../../modules/nf-core/hisat2/build' +include { SALMON_INDEX } from '../../../modules/nf-core/salmon/index' +include { KALLISTO_INDEX } from '../../../modules/nf-core/kallisto/index' +include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from '../../../modules/nf-core/rsem/preparereference' +include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../../modules/nf-core/rsem/preparereference' -include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../modules/local/preprocess_transcripts_fasta_gencode' -include { GTF2BED } from '../../modules/local/gtf2bed' -include { CAT_ADDITIONAL_FASTA } from '../../modules/local/cat_additional_fasta' -include { GTF_GENE_FILTER } from '../../modules/local/gtf_gene_filter' -include { STAR_GENOMEGENERATE_IGENOMES } from '../../modules/local/star_genomegenerate_igenomes' +include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../../modules/local/preprocess_transcripts_fasta_gencode' +include { GTF2BED } from '../../../modules/local/gtf2bed' +include { CAT_ADDITIONAL_FASTA } from '../../../modules/local/cat_additional_fasta' +include { GTF_FILTER } from '../../../modules/local/gtf_filter' +include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes' workflow PREPARE_GENOME { take: @@ -44,12 +46,14 @@ workflow PREPARE_GENOME { star_index // directory: /path/to/star/index/ rsem_index // directory: /path/to/rsem/index/ salmon_index // directory: /path/to/salmon/index/ + kallisto_index // directory: /path/to/kallisto/index/ hisat2_index // directory: /path/to/hisat2/index/ bbsplit_index // directory: /path/to/rsem/index/ gencode // boolean: whether the genome is from GENCODE is_aws_igenome // boolean: whether the genome files are from AWS iGenomes biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file prepare_tool_indices // list: tools to prepare indices for + filter_gtf // boolean: whether to filter GTF file main: @@ -68,22 +72,30 @@ workflow PREPARE_GENOME { // // Uncompress GTF annotation file or create from GFF3 if required // - if (gtf) { - if (gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] } - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) - } else { - ch_gtf = Channel.value(file(gtf)) + if (gtf || gff) { + if (gtf) { + if (gtf.endsWith('.gz')) { + ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) + } else { + ch_gtf = Channel.value(file(gtf)) + } + } else if (gff) { + if (gff.endsWith('.gz')) { + ch_gff = GUNZIP_GFF ( [ [:], gff ] 
).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) + } else { + ch_gff = Channel.value(file(gff)) + } + ch_gtf = GFFREAD ( ch_gff ).gtf + ch_versions = ch_versions.mix(GFFREAD.out.versions) } - } else if (gff) { - if (gff.endsWith('.gz')) { - ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] } - ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) - } else { - ch_gff = Channel.value(file(gff)) + + if (filter_gtf) { + GTF_FILTER ( ch_fasta, ch_gtf ) + ch_gtf = GTF_FILTER.out.genome_gtf + ch_versions = ch_versions.mix(GTF_FILTER.out.versions) } - ch_gtf = GFFREAD ( ch_gff ).gtf - ch_versions = ch_versions.mix(GFFREAD.out.versions) } // @@ -133,9 +145,8 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) } } else { - ch_filter_gtf = GTF_GENE_FILTER ( ch_fasta, ch_gtf ).gtf - ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filter_gtf ).transcript_fasta - ch_versions = ch_versions.mix(GTF_GENE_FILTER.out.versions) + ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta + ch_versions = ch_versions.mix(GTF_FILTER.out.versions) ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } @@ -191,7 +202,7 @@ workflow PREPARE_GENOME { ch_star_index = STAR_GENOMEGENERATE_IGENOMES ( ch_fasta, ch_gtf ).index ch_versions = ch_versions.mix(STAR_GENOMEGENERATE_IGENOMES.out.versions) } else { - ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf ).index + ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] } ).index.map { it[1] } ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) } } @@ -257,6 +268,24 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) } } + + // + // Uncompress Kallisto index or generate from scratch if required + // + ch_kallisto_index = Channel.empty() + if (kallisto_index) { + if (kallisto_index.endsWith('.tar.gz')) { + ch_kallisto_index = UNTAR_KALLISTO_INDEX ( [ [:], kallisto_index ] ).untar + ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) + } else { + ch_kallisto_index = Channel.value([[:], file(kallisto_index)]) + } + } else { + if ('kallisto' in prepare_tool_indices) { + ch_kallisto_index = KALLISTO_INDEX ( ch_transcript_fasta.map{[ [:], it]} ).index + ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) + } + } emit: fasta = ch_fasta // channel: path(genome.fasta) @@ -271,6 +300,6 @@ workflow PREPARE_GENOME { rsem_index = ch_rsem_index // channel: path(rsem/index/) hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) salmon_index = ch_salmon_index // channel: path(salmon/index/) - + kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/subworkflows/local/quantify_pseudo/main.nf b/subworkflows/local/quantify_pseudo/main.nf new file mode 100644 index 000000000..886b45e7b --- /dev/null +++ b/subworkflows/local/quantify_pseudo/main.nf @@ -0,0 +1,98 @@ +// +// Pseudoalignment and quantification with Salmon or Kallisto +// + +include { SALMON_QUANT } from '../../../modules/nf-core/salmon/quant' +include { KALLISTO_QUANT } from '../../../modules/nf-core/kallisto/quant' +include { TX2GENE } from '../../../modules/local/tx2gene' +include { TXIMPORT } from '../../../modules/local/tximport' + +include { SUMMARIZEDEXPERIMENT as SE_GENE } from '../../../modules/local/summarizedexperiment' +include { SUMMARIZEDEXPERIMENT as 
SE_GENE_LENGTH_SCALED } from '../../../modules/local/summarizedexperiment' +include { SUMMARIZEDEXPERIMENT as SE_GENE_SCALED } from '../../../modules/local/summarizedexperiment' +include { SUMMARIZEDEXPERIMENT as SE_TRANSCRIPT } from '../../../modules/local/summarizedexperiment' + +workflow QUANTIFY_PSEUDO_ALIGNMENT { + take: + reads // channel: [ val(meta), [ reads ] ] + index // channel: /path/to//index/ + transcript_fasta // channel: /path/to/transcript.fasta + gtf // channel: /path/to/genome.gtf + pseudo_aligner // val: kallisto or salmon + alignment_mode // bool: Run Salmon in alignment mode + lib_type // val: String to override Salmon library type + kallisto_quant_fraglen // val: Estimated fragment length required by Kallisto in single-end mode + kallisto_quant_fraglen_sd // val: Estimated standard error for fragment length required by Kallisto in single-end mode + + main: + + ch_versions = Channel.empty() + + // + // Quantify and merge counts across samples + // + // NOTE: MultiQC needs Salmon outputs, but Kallisto logs + if (pseudo_aligner == 'salmon') { + SALMON_QUANT ( reads, index, gtf, transcript_fasta, alignment_mode, lib_type ) + ch_pseudo_results = SALMON_QUANT.out.results + ch_pseudo_multiqc = ch_pseudo_results + ch_versions = ch_versions.mix(SALMON_QUANT.out.versions.first()) + } else { + KALLISTO_QUANT ( reads, index, gtf, [], kallisto_quant_fraglen, kallisto_quant_fraglen_sd) + ch_pseudo_results = KALLISTO_QUANT.out.results + ch_pseudo_multiqc = KALLISTO_QUANT.out.log + ch_versions = ch_versions.mix(KALLISTO_QUANT.out.versions.first()) + } + + TX2GENE ( ch_pseudo_results.collect{it[1]}, pseudo_aligner, gtf ) + ch_versions = ch_versions.mix(TX2GENE.out.versions) + + TXIMPORT ( ch_pseudo_results.collect{it[1]}, TX2GENE.out.tsv.collect(), pseudo_aligner ) + ch_versions = ch_versions.mix(TXIMPORT.out.versions) + + SE_GENE ( + TXIMPORT.out.counts_gene, + TXIMPORT.out.tpm_gene, + TX2GENE.out.tsv.collect() + ) + ch_versions = ch_versions.mix(SE_GENE.out.versions) + + SE_GENE_LENGTH_SCALED ( + TXIMPORT.out.counts_gene_length_scaled, + TXIMPORT.out.tpm_gene, + TX2GENE.out.tsv.collect() + ) + + SE_GENE_SCALED ( + TXIMPORT.out.counts_gene_scaled, + TXIMPORT.out.tpm_gene, + TX2GENE.out.tsv.collect() + ) + + SE_TRANSCRIPT ( + TXIMPORT.out.counts_transcript, + TXIMPORT.out.tpm_transcript, + TX2GENE.out.tsv.collect() + ) + + emit: + results = ch_pseudo_results // channel: [ val(meta), results_dir ] + multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ] + + tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ] + counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ] + counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ] + counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ] + tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ] + counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ] + + merged_gene_rds = SE_GENE.out.rds // path: *.rds + merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds + merged_gene_rds_scaled = SE_GENE_SCALED.out.rds // path: *.rds + + merged_counts_transcript = TXIMPORT.out.counts_transcript // path: *.transcript_counts.tsv + merged_tpm_transcript = TXIMPORT.out.tpm_transcript // path: *.transcript_tpm.tsv + merged_transcript_rds = SE_TRANSCRIPT.out.rds // path: *.rds + + versions = ch_versions // channel: [ versions.yml ] +} diff --git 
a/subworkflows/local/quantify_rsem.nf b/subworkflows/local/quantify_rsem/main.nf similarity index 92% rename from subworkflows/local/quantify_rsem.nf rename to subworkflows/local/quantify_rsem/main.nf index 666af33b1..4b7473634 100644 --- a/subworkflows/local/quantify_rsem.nf +++ b/subworkflows/local/quantify_rsem/main.nf @@ -2,9 +2,9 @@ // Gene/transcript quantification with RSEM // -include { RSEM_CALCULATEEXPRESSION } from '../../modules/nf-core/rsem/calculateexpression/main' -include { RSEM_MERGE_COUNTS } from '../../modules/local/rsem_merge_counts' -include { BAM_SORT_STATS_SAMTOOLS } from '../nf-core/bam_sort_stats_samtools/main' +include { RSEM_CALCULATEEXPRESSION } from '../../../modules/nf-core/rsem/calculateexpression' +include { RSEM_MERGE_COUNTS } from '../../../modules/local/rsem_merge_counts' +include { BAM_SORT_STATS_SAMTOOLS } from '../../nf-core/bam_sort_stats_samtools' workflow QUANTIFY_RSEM { take: diff --git a/subworkflows/local/quantify_salmon.nf b/subworkflows/local/quantify_salmon.nf deleted file mode 100644 index 4ab996b2c..000000000 --- a/subworkflows/local/quantify_salmon.nf +++ /dev/null @@ -1,83 +0,0 @@ -// -// Pseudo-alignment and quantification with Salmon -// - -include { SALMON_QUANT } from '../../modules/nf-core/salmon/quant/main' -include { SALMON_TX2GENE } from '../../modules/local/salmon_tx2gene' -include { SALMON_TXIMPORT } from '../../modules/local/salmon_tximport' - -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_GENE } from '../../modules/local/salmon_summarizedexperiment' -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_GENE_LENGTH_SCALED } from '../../modules/local/salmon_summarizedexperiment' -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_GENE_SCALED } from '../../modules/local/salmon_summarizedexperiment' -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_TRANSCRIPT } from '../../modules/local/salmon_summarizedexperiment' - -workflow QUANTIFY_SALMON { - take: - reads // channel: [ val(meta), [ reads ] ] - index // channel: /path/to/salmon/index/ - transcript_fasta // channel: /path/to/transcript.fasta - gtf // channel: /path/to/genome.gtf - alignment_mode // bool: Run Salmon in alignment mode - lib_type // val: String to override salmon library type - - main: - - ch_versions = Channel.empty() - - // - // Quantify and merge counts across samples - // - SALMON_QUANT ( reads, index, gtf, transcript_fasta, alignment_mode, lib_type ) - ch_versions = ch_versions.mix(SALMON_QUANT.out.versions.first()) - - SALMON_TX2GENE ( SALMON_QUANT.out.results.collect{it[1]}, gtf ) - ch_versions = ch_versions.mix(SALMON_TX2GENE.out.versions) - - SALMON_TXIMPORT ( SALMON_QUANT.out.results.collect{it[1]}, SALMON_TX2GENE.out.tsv.collect() ) - ch_versions = ch_versions.mix(SALMON_TXIMPORT.out.versions) - - SALMON_SE_GENE ( - SALMON_TXIMPORT.out.counts_gene, - SALMON_TXIMPORT.out.tpm_gene, - SALMON_TX2GENE.out.tsv.collect() - ) - ch_versions = ch_versions.mix(SALMON_SE_GENE.out.versions) - - SALMON_SE_GENE_LENGTH_SCALED ( - SALMON_TXIMPORT.out.counts_gene_length_scaled, - SALMON_TXIMPORT.out.tpm_gene, - SALMON_TX2GENE.out.tsv.collect() - ) - - SALMON_SE_GENE_SCALED ( - SALMON_TXIMPORT.out.counts_gene_scaled, - SALMON_TXIMPORT.out.tpm_gene, - SALMON_TX2GENE.out.tsv.collect() - ) - - SALMON_SE_TRANSCRIPT ( - SALMON_TXIMPORT.out.counts_transcript, - SALMON_TXIMPORT.out.tpm_transcript, - SALMON_TX2GENE.out.tsv.collect() - ) - - emit: - results = SALMON_QUANT.out.results // channel: [ val(meta), results_dir ] - - tpm_gene = SALMON_TXIMPORT.out.tpm_gene // channel: 
[ val(meta), counts ] - counts_gene = SALMON_TXIMPORT.out.counts_gene // channel: [ val(meta), counts ] - counts_gene_length_scaled = SALMON_TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ] - counts_gene_scaled = SALMON_TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ] - tpm_transcript = SALMON_TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ] - counts_transcript = SALMON_TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ] - - merged_gene_rds = SALMON_SE_GENE.out.rds // path: *.rds - merged_gene_rds_length_scaled = SALMON_SE_GENE_LENGTH_SCALED.out.rds // path: *.rds - merged_gene_rds_scaled = SALMON_SE_GENE_SCALED.out.rds // path: *.rds - - merged_counts_transcript = SALMON_TXIMPORT.out.counts_transcript // path: *.transcript_counts.tsv - merged_tpm_transcript = SALMON_TXIMPORT.out.tpm_transcript // path: *.transcript_tpm.tsv - merged_transcript_rds = SALMON_SE_TRANSCRIPT.out.rds // path: *.rds - - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml b/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml index 3af412fab..f11e7ab6f 100644 --- a/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml +++ b/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml @@ -8,12 +8,13 @@ keywords: - bam - sam - cram -modules: +components: - umitools/dedup - samtools/index - samtools/stats - samtools/idxstats - samtools/flagstat + - bam_stats_samtools input: - ch_bam_bai: description: | diff --git a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml index d5e716092..b924596d8 100644 --- a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml +++ b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml @@ -7,12 +7,13 @@ keywords: - sam - cram -modules: +components: - picard/markduplicates - samtools/index - samtools/stats - samtools/idxstats - samtools/flagstat + - bam_stats_samtools input: - ch_bam: diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml b/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml index 8dfbd58df..69c16be41 100644 --- a/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml +++ b/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml @@ -6,12 +6,13 @@ keywords: - bam - sam - cram -modules: +components: - samtools/sort - samtools/index - samtools/stats - samtools/idxstats - samtools/flagstat + - bam_stats_samtools input: - meta: type: map diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml index b05086bc2..87863b11b 100644 --- a/subworkflows/nf-core/bam_stats_samtools/meta.yml +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -7,7 +7,7 @@ keywords: - bam - sam - cram -modules: +components: - samtools/stats - samtools/idxstats - samtools/flagstat diff --git a/subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/meta.yml b/subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/meta.yml index 8d3257773..5ad4db7db 100644 --- a/subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/meta.yml +++ b/subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/meta.yml @@ -6,7 +6,7 @@ keywords: - bigwig - clip - conversion -modules: +components: - ucsc/bedclip - ucsc/bedgraphtobigwig input: diff --git a/subworkflows/nf-core/fastq_align_hisat2/meta.yml b/subworkflows/nf-core/fastq_align_hisat2/meta.yml index 36c7c78ec..2b05beb68 100644 --- 
a/subworkflows/nf-core/fastq_align_hisat2/meta.yml +++ b/subworkflows/nf-core/fastq_align_hisat2/meta.yml @@ -10,11 +10,12 @@ keywords: - bam - sam - cram -modules: +components: - hisat2/align - samtools/stats - samtools/idxstats - samtools/flagstat + - bam_sort_stats_samtools input: - meta: type: map diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf index 64ec88f23..3dbb27eae 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf @@ -14,7 +14,7 @@ import groovy.json.JsonSlurper def getFastpReadsAfterFiltering(json_file) { def Map json = (Map) new JsonSlurper().parseText(json_file.text).get('summary') - return json['after_filtering']['total_reads'].toInteger() + return json['after_filtering']['total_reads'].toLong() } workflow FASTQ_FASTQC_UMITOOLS_FASTP { @@ -100,7 +100,7 @@ workflow FASTQ_FASTQC_UMITOOLS_FASTP { .set { ch_num_trimmed_reads } ch_num_trimmed_reads - .filter { meta, reads, num_reads -> num_reads >= min_trimmed_reads.toInteger() } + .filter { meta, reads, num_reads -> num_reads >= min_trimmed_reads.toLong() } .map { meta, reads, num_reads -> [ meta, reads ] } .set { trim_reads } diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml index 8f1620093..f76b40261 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml @@ -9,7 +9,7 @@ keywords: - UMI - trimming - fastp -modules: +components: - fastqc - umitools/extract - fastp diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml index 3b1a675c3..e32e90f43 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml +++ b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml @@ -8,7 +8,7 @@ keywords: - UMI - trimming - trimgalore -modules: +components: - fastqc - umitools/extract - trimgalore diff --git a/subworkflows/nf-core/fastq_subsample_fq_salmon/meta.yml b/subworkflows/nf-core/fastq_subsample_fq_salmon/meta.yml index db96312aa..d144269ba 100644 --- a/subworkflows/nf-core/fastq_subsample_fq_salmon/meta.yml +++ b/subworkflows/nf-core/fastq_subsample_fq_salmon/meta.yml @@ -5,9 +5,10 @@ keywords: - fastq - subsample - strandedness -modules: +components: - fq/subsample - salmon/quant + - salmon/index input: - meta: type: map diff --git a/tower.yml b/tower.yml index b6983530a..21f821a53 100644 --- a/tower.yml +++ b/tower.yml @@ -21,6 +21,16 @@ reports: display: "All samples Salmon merged transcript raw counts" "**/salmon/salmon.merged.transcript_tpm.tsv": display: "All samples Salmon merged transcript TPM counts" + "**/kallisto/**/deseq2.plots.pdf": + display: "All samples Kallisto DESeq2 QC PDF plots" + "**/kallisto/kallisto.merged.gene_counts.tsv": + display: "All samples Kallisto merged gene raw counts" + "**/kallisto/kallisto.merged.gene_tpm.tsv": + display: "All samples Kallisto merged gene TPM counts" + "**/kallisto/kallisto.merged.transcript_counts.tsv": + display: "All samples Kallisto merged transcript raw counts" + "**/kallisto/kallisto.merged.transcript_tpm.tsv": + display: "All samples Kallisto merged transcript TPM counts" "**/star_rsem/**/deseq2.plots.pdf": display: "All samples STAR RSEM DESeq2 QC PDF plots" "**/star_rsem/rsem.merged.gene_counts.tsv": diff --git a/workflows/rnaseq.nf 
b/workflows/rnaseq.nf index 8c6b7260d..403194ac2 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -1,43 +1,35 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def valid_params = [ - aligners : ['star_salmon', 'star_rsem', 'hisat2'], - trimmers : ['trimgalore', 'fastp'], - pseudoaligners : ['salmon'], - rseqc_modules : ['bam_stat', 'inner_distance', 'infer_experiment', 'junction_annotation', 'junction_saturation', 'read_distribution', 'read_duplication', 'tin'] -] +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) -// Validate input parameters -WorkflowRnaseq.initialise(params, log, valid_params) +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation -// Check input path parameters to see if they exist -checkPathParamList = [ - params.input, params.multiqc_config, - params.fasta, params.transcript_fasta, params.additional_fasta, - params.gtf, params.gff, params.gene_bed, - params.ribo_database_manifest, params.splicesites, - params.star_index, params.hisat2_index, params.rsem_index, params.salmon_index -] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ -// Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' 
} +WorkflowRnaseq.initialise(params, log) // Check rRNA databases for sortmerna if (params.remove_ribo_rna) { - ch_ribo_db = file(params.ribo_database_manifest, checkIfExists: true) + ch_ribo_db = file(params.ribo_database_manifest) if (ch_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${ch_ribo_db.getName()}!"} } // Check if file with list of fastas is provided when running BBSplit if (!params.skip_bbsplit && !params.bbsplit_index && params.bbsplit_fasta_list) { - ch_bbsplit_fasta_list = file(params.bbsplit_fasta_list, checkIfExists: true) + ch_bbsplit_fasta_list = file(params.bbsplit_fasta_list) if (ch_bbsplit_fasta_list.isEmpty()) {exit 1, "File provided with --bbsplit_fasta_list is empty: ${ch_bbsplit_fasta_list.getName()}!"} } @@ -47,6 +39,25 @@ if (!params.skip_bbsplit) { prepareToolIndices << 'bbsplit' } if (!params.skip_alignment) { prepareToolIndices << params.aligner } if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner } +// Determine whether to filter the GTF or not +def filterGtf = + (( + // Condition 1: Alignment is required and aligner is set + !params.skip_alignment && params.aligner + ) || + ( + // Condition 2: Pseudoalignment is required and pseudoaligner is set + !params.skip_pseudo_alignment && params.pseudo_aligner + ) || + ( + // Condition 3: Transcript FASTA file is not provided + !params.transcript_fasta + )) && + ( + // Condition 4: --skip_gtf_filter is not provided + !params.skip_gtf_filter + ) + // Get RSeqC modules to run def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : [] if (params.bam_csi_index) { @@ -75,9 +86,9 @@ if (params.fasta && params.gtf) { */ ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) // Header files for MultiQC ch_pca_header_multiqc = file("$projectDir/assets/multiqc/deseq2_pca_header.txt", checkIfExists: true) @@ -96,21 +107,20 @@ ch_biotypes_header_multiqc = file("$projectDir/assets/multiqc/biotypes_header. 
include { BEDTOOLS_GENOMECOV } from '../modules/local/bedtools_genomecov' include { DESEQ2_QC as DESEQ2_QC_STAR_SALMON } from '../modules/local/deseq2_qc' include { DESEQ2_QC as DESEQ2_QC_RSEM } from '../modules/local/deseq2_qc' -include { DESEQ2_QC as DESEQ2_QC_SALMON } from '../modules/local/deseq2_qc' +include { DESEQ2_QC as DESEQ2_QC_PSEUDO } from '../modules/local/deseq2_qc' include { DUPRADAR } from '../modules/local/dupradar' include { MULTIQC } from '../modules/local/multiqc' include { MULTIQC_CUSTOM_BIOTYPE } from '../modules/local/multiqc_custom_biotype' -include { UMITOOLS_PREPAREFORRSEM as UMITOOLS_PREPAREFORSALMON } from '../modules/local/umitools_prepareforrsem.nf' +include { UMITOOLS_PREPAREFORRSEM as UMITOOLS_PREPAREFORSALMON } from '../modules/local/umitools_prepareforrsem' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' -include { ALIGN_STAR } from '../subworkflows/local/align_star' -include { QUANTIFY_RSEM } from '../subworkflows/local/quantify_rsem' -include { QUANTIFY_SALMON as QUANTIFY_STAR_SALMON } from '../subworkflows/local/quantify_salmon' -include { QUANTIFY_SALMON as QUANTIFY_SALMON } from '../subworkflows/local/quantify_salmon' +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' +include { ALIGN_STAR } from '../subworkflows/local/align_star' +include { QUANTIFY_RSEM } from '../subworkflows/local/quantify_rsem' +include { QUANTIFY_PSEUDO_ALIGNMENT as QUANTIFY_STAR_SALMON } from '../subworkflows/local/quantify_pseudo' +include { QUANTIFY_PSEUDO_ALIGNMENT } from '../subworkflows/local/quantify_pseudo' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -121,30 +131,30 @@ include { QUANTIFY_SALMON as QUANTIFY_SALMON } from '../subworkflows/local/ // // MODULE: Installed directly from nf-core/modules // -include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' -include { BBMAP_BBSPLIT } from '../modules/nf-core/bbmap/bbsplit/main' -include { SAMTOOLS_SORT } from '../modules/nf-core/samtools/sort/main' -include { PRESEQ_LCEXTRAP } from '../modules/nf-core/preseq/lcextrap/main' -include { QUALIMAP_RNASEQ } from '../modules/nf-core/qualimap/rnaseq/main' -include { SORTMERNA } from '../modules/nf-core/sortmerna/main' -include { STRINGTIE_STRINGTIE } from '../modules/nf-core/stringtie/stringtie/main' -include { SUBREAD_FEATURECOUNTS } from '../modules/nf-core/subread/featurecounts/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { CAT_FASTQ } from '../modules/nf-core/cat/fastq' +include { BBMAP_BBSPLIT } from '../modules/nf-core/bbmap/bbsplit' +include { SAMTOOLS_SORT } from '../modules/nf-core/samtools/sort' +include { PRESEQ_LCEXTRAP } from '../modules/nf-core/preseq/lcextrap' +include { QUALIMAP_RNASEQ } from '../modules/nf-core/qualimap/rnaseq' +include { SORTMERNA } from '../modules/nf-core/sortmerna' +include { STRINGTIE_STRINGTIE } from '../modules/nf-core/stringtie/stringtie' +include { SUBREAD_FEATURECOUNTS } from '../modules/nf-core/subread/featurecounts' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' // // SUBWORKFLOW: Consisting entirely of nf-core/modules // -include { FASTQ_SUBSAMPLE_FQ_SALMON } from '../subworkflows/nf-core/fastq_subsample_fq_salmon/main' -include { FASTQ_FASTQC_UMITOOLS_TRIMGALORE } from 
'../subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main' -include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../subworkflows/nf-core/fastq_fastqc_umitools_fastp/main' -include { FASTQ_ALIGN_HISAT2 } from '../subworkflows/nf-core/fastq_align_hisat2/main' -include { BAM_SORT_STATS_SAMTOOLS } from '../subworkflows/nf-core/bam_sort_stats_samtools/main' -include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard/main' -include { BAM_RSEQC } from '../subworkflows/nf-core/bam_rseqc/main' -include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME } from '../subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main' -include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME } from '../subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main' -include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_FORWARD } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/main' -include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_REVERSE } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig/main' +include { FASTQ_SUBSAMPLE_FQ_SALMON } from '../subworkflows/nf-core/fastq_subsample_fq_salmon' +include { FASTQ_FASTQC_UMITOOLS_TRIMGALORE } from '../subworkflows/nf-core/fastq_fastqc_umitools_trimgalore' +include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../subworkflows/nf-core/fastq_fastqc_umitools_fastp' +include { FASTQ_ALIGN_HISAT2 } from '../subworkflows/nf-core/fastq_align_hisat2' +include { BAM_SORT_STATS_SAMTOOLS } from '../subworkflows/nf-core/bam_sort_stats_samtools' +include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard' +include { BAM_RSEQC } from '../subworkflows/nf-core/bam_rseqc' +include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME } from '../subworkflows/nf-core/bam_dedup_stats_samtools_umitools' +include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME } from '../subworkflows/nf-core/bam_dedup_stats_samtools_umitools' +include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_FORWARD } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig' +include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_REVERSE } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -178,12 +188,14 @@ workflow RNASEQ { params.star_index, params.rsem_index, params.salmon_index, + params.kallisto_index, params.hisat2_index, params.bbsplit_index, params.gencode, is_aws_igenome, biotype, - prepareToolIndices + prepareToolIndices, + filterGtf ) ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) @@ -196,27 +208,30 @@ workflow RNASEQ { } // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // Create input channel from input file provided through params.input // - INPUT_CHECK ( - ch_input - ) - .reads - .map { - meta, fastq -> - new_id = meta.id - ~/_T\d+/ - [ meta + [id: new_id], fastq ] - } - .groupTuple() - .branch { - meta, fastq -> - single : fastq.size() == 1 - return [ meta, fastq.flatten() ] - multiple: fastq.size() > 1 - return [ meta, fastq.flatten() ] - } - .set { ch_fastq } - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + Channel + .fromSamplesheet("input") + .map { + meta, fastq_1, fastq_2 -> + if (!fastq_2) { + return [ 
@@ -196,27 +208,30 @@ workflow RNASEQ {
     }

     //
-    // SUBWORKFLOW: Read in samplesheet, validate and stage input files
+    // Create input channel from input file provided through params.input
     //
-    INPUT_CHECK (
-        ch_input
-    )
-    .reads
-    .map {
-        meta, fastq ->
-            new_id = meta.id - ~/_T\d+/
-            [ meta + [id: new_id], fastq ]
-    }
-    .groupTuple()
-    .branch {
-        meta, fastq ->
-            single  : fastq.size() == 1
-                return [ meta, fastq.flatten() ]
-            multiple: fastq.size() > 1
-                return [ meta, fastq.flatten() ]
-    }
-    .set { ch_fastq }
-    ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
+    Channel
+        .fromSamplesheet("input")
+        .map {
+            meta, fastq_1, fastq_2 ->
+                if (!fastq_2) {
+                    return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
+                } else {
+                    return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ]
+                }
+        }
+        .groupTuple()
+        .map {
+            WorkflowRnaseq.validateInput(it)
+        }
+        .branch {
+            meta, fastqs ->
+                single  : fastqs.size() == 1
+                    return [ meta, fastqs.flatten() ]
+                multiple: fastqs.size() > 1
+                    return [ meta, fastqs.flatten() ]
+        }
+        .set { ch_fastq }

     //
     // MODULE: Concatenate FastQ files from same sample if required
@@ -241,7 +256,7 @@ workflow RNASEQ {
         .set { ch_strand_fastq }

     //
-    // SUBWORKFLOW: Sub-sample FastQ files and pseudo-align with Salmon to auto-infer strandedness
+    // SUBWORKFLOW: Sub-sample FastQ files and pseudoalign with Salmon to auto-infer strandedness
     //
     // Return empty channel if ch_strand_fastq.auto_strand is empty so salmon index isn't created
     PREPARE_GENOME.out.fasta
@@ -388,8 +403,8 @@ workflow RNASEQ {
     if (!params.skip_alignment && params.aligner == 'star_salmon') {
         ALIGN_STAR (
             ch_filtered_reads,
-            PREPARE_GENOME.out.star_index,
-            PREPARE_GENOME.out.gtf,
+            PREPARE_GENOME.out.star_index.map { [ [:], it ] },
+            PREPARE_GENOME.out.gtf.map { [ [:], it ] },
             params.star_ignore_sjdbgtf,
             '',
             params.seq_center ?: '',
@@ -480,8 +495,11 @@ workflow RNASEQ {
             ch_dummy_file,
             PREPARE_GENOME.out.transcript_fasta,
             PREPARE_GENOME.out.gtf,
+            'salmon',
             true,
-            params.salmon_quant_libtype ?: ''
+            params.salmon_quant_libtype ?: '',
+            params.kallisto_quant_fraglen,
+            params.kallisto_quant_fraglen_sd
         )
         ch_versions = ch_versions.mix(QUANTIFY_STAR_SALMON.out.versions)
@@ -735,7 +753,7 @@ workflow RNASEQ {
     if (!params.skip_qualimap) {
         QUALIMAP_RNASEQ (
             ch_genome_bam,
-            PREPARE_GENOME.out.gtf
+            PREPARE_GENOME.out.gtf.map { [ [:], it ] }
         )
         ch_qualimap_multiqc = QUALIMAP_RNASEQ.out.results
        ch_versions = ch_versions.mix(QUALIMAP_RNASEQ.out.versions.first())
@@ -794,35 +812,47 @@ workflow RNASEQ {
     }

     //
-    // SUBWORKFLOW: Pseudo-alignment and quantification with Salmon
+    // SUBWORKFLOW: Pseudoalignment and quantification with Salmon
     //
-    ch_salmon_multiqc                   = Channel.empty()
+    ch_pseudo_multiqc                   = Channel.empty()
     ch_pseudoaligner_pca_multiqc        = Channel.empty()
     ch_pseudoaligner_clustering_multiqc = Channel.empty()
-    if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') {
-        QUANTIFY_SALMON (
+
+    if (!params.skip_pseudo_alignment) {
+
+        if (params.pseudo_aligner == 'salmon') {
+            ch_pseudo_index = PREPARE_GENOME.out.salmon_index
+        } else {
+            ch_pseudo_index = PREPARE_GENOME.out.kallisto_index
+        }
+
+        QUANTIFY_PSEUDO_ALIGNMENT (
             ch_filtered_reads,
-            PREPARE_GENOME.out.salmon_index,
+            ch_pseudo_index,
             ch_dummy_file,
             PREPARE_GENOME.out.gtf,
+            params.pseudo_aligner,
             false,
-            params.salmon_quant_libtype ?: ''
+            params.salmon_quant_libtype ?: '',
+            params.kallisto_quant_fraglen,
+            params.kallisto_quant_fraglen_sd
         )
-        ch_salmon_multiqc = QUANTIFY_SALMON.out.results
-        ch_versions = ch_versions.mix(QUANTIFY_SALMON.out.versions)
+        ch_pseudo_multiqc            = QUANTIFY_PSEUDO_ALIGNMENT.out.multiqc
+        ch_counts_gene_length_scaled = QUANTIFY_PSEUDO_ALIGNMENT.out.counts_gene_length_scaled
+        ch_versions                  = ch_versions.mix(QUANTIFY_PSEUDO_ALIGNMENT.out.versions)

         if (!params.skip_qc & !params.skip_deseq2_qc) {
-            DESEQ2_QC_SALMON (
-                QUANTIFY_SALMON.out.counts_gene_length_scaled,
+            DESEQ2_QC_PSEUDO (
+                ch_counts_gene_length_scaled,
                 ch_pca_header_multiqc,
                 ch_clustering_header_multiqc
             )
-            ch_pseudoaligner_pca_multiqc        = DESEQ2_QC_SALMON.out.pca_multiqc
-            ch_pseudoaligner_clustering_multiqc = DESEQ2_QC_SALMON.out.dists_multiqc
-            ch_versions = ch_versions.mix(DESEQ2_QC_SALMON.out.versions)
+            ch_pseudoaligner_pca_multiqc        = DESEQ2_QC_PSEUDO.out.pca_multiqc
+            ch_pseudoaligner_clustering_multiqc = DESEQ2_QC_PSEUDO.out.dists_multiqc
+            ch_versions = ch_versions.mix(DESEQ2_QC_PSEUDO.out.versions)
         }
     }
-
+
     //
     // MODULE: Pipeline reporting
     //
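
The new input handling above replaces the `INPUT_CHECK` subworkflow: nf-validation's `Channel.fromSamplesheet("input")` emits one `[ meta, fastq_1, fastq_2 ]` tuple per samplesheet row, rows are re-keyed by sample id, grouped, and branched so that samples appearing on several rows (technical replicates) can be routed to `CAT_FASTQ` for concatenation. A stand-alone sketch of that group-and-branch pattern, with hard-coded tuples standing in for the samplesheet and the `WorkflowRnaseq.validateInput` step omitted; all sample and file names are hypothetical:

// grouping_demo.nf
nextflow.enable.dsl = 2

workflow {
    Channel
        .of(
            // [ sample id, meta map, fastq list ]; WT_REP1 has two technical replicates
            [ 'WT_REP1',   [ id:'WT_REP1',   single_end:true  ], [ 'wt_L001.fastq.gz' ] ],
            [ 'WT_REP1',   [ id:'WT_REP1',   single_end:true  ], [ 'wt_L002.fastq.gz' ] ],
            [ 'RAP1_REP1', [ id:'RAP1_REP1', single_end:false ], [ 'rap1_1.fastq.gz', 'rap1_2.fastq.gz' ] ]
        )
        .groupTuple()                        // one item per id: [ id, [ metas ], [ fastq lists ] ]
        .branch { id, metas, fastqs ->
            single  : fastqs.size() == 1     // sequenced once, no concatenation needed
            multiple: fastqs.size() > 1      // technical replicates, would go to CAT_FASTQ
        }
        .set { ch_fastq }

    ch_fastq.single.view   { "single:   ${it}" }
    ch_fastq.multiple.view { "multiple: ${it}" }
}
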
@@ -837,7 +867,7 @@ workflow RNASEQ {
         workflow_summary    = WorkflowRnaseq.paramsSummaryMultiqc(workflow, summary_params)
         ch_workflow_summary = Channel.value(workflow_summary)

-        methods_description    = WorkflowRnaseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description)
+        methods_description    = WorkflowRnaseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params)
         ch_methods_description = Channel.value(methods_description)

         MULTIQC (
@@ -857,7 +887,7 @@
             ch_star_multiqc.collect{it[1]}.ifEmpty([]),
             ch_hisat2_multiqc.collect{it[1]}.ifEmpty([]),
             ch_rsem_multiqc.collect{it[1]}.ifEmpty([]),
-            ch_salmon_multiqc.collect{it[1]}.ifEmpty([]),
+            ch_pseudo_multiqc.collect{it[1]}.ifEmpty([]),
             ch_samtools_stats.collect{it[1]}.ifEmpty([]),
             ch_samtools_flagstat.collect{it[1]}.ifEmpty([]),
             ch_samtools_idxstats.collect{it[1]}.ifEmpty([]),
@@ -893,12 +923,13 @@ workflow.onComplete {
     if (params.email || params.email_on_fail) {
         NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)
     }
+
+    NfcoreTemplate.dump_parameters(workflow, params)
+    NfcoreTemplate.summary(workflow, params, log, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)
     if (params.hook_url) {
         NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
     }
-
-    NfcoreTemplate.summary(workflow, params, log, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)
 }

 /*