Merge pull request #1231 from maxulysse/sortmerna

Update sortmerna usage
nf-core · Mar 5, 2024 · 34e51dd · 34e51dd
2 parents a50212c + 04b3581
commit 34e51dd
Show file tree

Hide file tree

Showing 7 changed files with 90 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,9 +21,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [PR #1220](https://github.com/nf-core/rnaseq/pull/1220) - Initialise nf-test and add pipeline level test
 - [PR #1226](https://github.com/nf-core/rnaseq/pull/1226) - Reuse bbsplit index and don't keep overwriting ([#1225](https://github.com/nf-core/rnaseq/issues/1225))
 - [PR #1229](https://github.com/nf-core/rnaseq/pull/1229) - Template update for nf-core/tools v2.13.1
+- [PR #1231](https://github.com/nf-core/rnaseq/pull/1231) - Add `--sortmerna_index` parameter to reuse a pre-built sortmerna index or build one from the rRNA database manifest
 
 ### Parameters
 
+| Old parameter | New parameter       |
+| ------------- | ------------------- |
+|               | `--sortmerna_index` |
+
 ### Software dependencies
 
 | Dependency  | Old version | New version |

diff --git a/main.nf b/main.nf
@@ -37,6 +37,7 @@ params.gtf              = getGenomeAttribute('gtf')
 params.gff              = getGenomeAttribute('gff')
 params.gene_bed         = getGenomeAttribute('bed12')
 params.bbsplit_index    = getGenomeAttribute('bbsplit')
+params.sortmerna_index  = getGenomeAttribute('sortmerna')
 params.star_index       = getGenomeAttribute('star')
 params.hisat2_index     = getGenomeAttribute('hisat2')
 params.rsem_index       = getGenomeAttribute('rsem')
@@ -70,18 +71,21 @@ workflow NFCORE_RNASEQ {
         params.gene_bed,
         params.splicesites,
         params.bbsplit_fasta_list,
+        params.ribo_database_manifest,
         params.star_index,
         params.rsem_index,
         params.salmon_index,
         params.kallisto_index,
         params.hisat2_index,
         params.bbsplit_index,
+        params.sortmerna_index,
         params.gencode,
         params.featurecounts_group_type,
         params.aligner,
         params.pseudo_aligner,
         params.skip_gtf_filter,
         params.skip_bbsplit,
+        !params.remove_ribo_rna,
         params.skip_alignment,
         params.skip_pseudo_alignment
     )
@@ -114,7 +118,9 @@ workflow NFCORE_RNASEQ {
         PREPARE_GENOME.out.salmon_index,
         PREPARE_GENOME.out.kallisto_index,
         PREPARE_GENOME.out.bbsplit_index,
-        PREPARE_GENOME.out.splicesites
+        PREPARE_GENOME.out.sortmerna_index,
+        PREPARE_GENOME.out.splicesites,
+        // make_sortmerna_index: true only when rRNA removal is requested and no
+        // pre-built index was supplied via --sortmerna_index. The previous
+        // expression (`!x && x`) was always false.
+        !params.sortmerna_index && params.remove_ribo_rna
     )
     ch_versions = ch_versions.mix(RNASEQ.out.versions)
 

diff --git a/modules/nf-core/sortmerna/nextflow.config b/modules/nf-core/sortmerna/nextflow.config
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -267,6 +267,14 @@
                     "description": "Path to directory or tar.gz archive for pre-built BBSplit index.",
                     "help_text": "The BBSplit index will have to be built at least once with this pipeline (see `--save_reference` to save index). It can then be provided via `--bbsplit_index` for future runs."
                 },
+                "sortmerna_index": {
+                    "type": "string",
+                    "format": "path",
+                    "exists": true,
+                    "fa_icon": "fas fa-bezier-curve",
+                    "description": "Path to directory or tar.gz archive for pre-built sortmerna index.",
+                    "help_text": "The sortmerna index will have to be built at least once with this pipeline (see `--save_reference` to save index). It can then be provided via `--sortmerna_index` for future runs."
+                },
                 "remove_ribo_rna": {
                     "type": "boolean",
                     "fa_icon": "fas fa-trash-alt",

diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
@@ -10,6 +10,7 @@ include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../../modules/nf-core/gun
 include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../../modules/nf-core/gunzip'
 
 include { UNTAR as UNTAR_BBSPLIT_INDEX      } from '../../../modules/nf-core/untar'
+include { UNTAR as UNTAR_SORTMERNA_INDEX    } from '../../../modules/nf-core/untar'
 include { UNTAR as UNTAR_STAR_INDEX         } from '../../../modules/nf-core/untar'
 include { UNTAR as UNTAR_RSEM_INDEX         } from '../../../modules/nf-core/untar'
 include { UNTAR as UNTAR_HISAT2_INDEX       } from '../../../modules/nf-core/untar'
@@ -20,6 +21,7 @@ include { CUSTOM_CATADDITIONALFASTA         } from '../../../modules/nf-core/cus
 include { CUSTOM_GETCHROMSIZES              } from '../../../modules/nf-core/custom/getchromsizes'
 include { GFFREAD                           } from '../../../modules/nf-core/gffread'
 include { BBMAP_BBSPLIT                     } from '../../../modules/nf-core/bbmap/bbsplit'
+include { SORTMERNA as SORTMERNA_INDEX      } from '../../../modules/nf-core/sortmerna'
 include { STAR_GENOMEGENERATE               } from '../../../modules/nf-core/star/genomegenerate'
 include { HISAT2_EXTRACTSPLICESITES         } from '../../../modules/nf-core/hisat2/extractsplicesites'
 include { HISAT2_BUILD                      } from '../../../modules/nf-core/hisat2/build'
@@ -43,18 +45,21 @@ workflow PREPARE_GENOME {
     gene_bed                 //      file: /path/to/gene.bed
     splicesites              //      file: /path/to/splicesites.txt
     bbsplit_fasta_list       //      file: /path/to/bbsplit_fasta_list.txt
+    sortmerna_fasta_list     //      file: /path/to/sortmerna_fasta_list.txt
     star_index               // directory: /path/to/star/index/
     rsem_index               // directory: /path/to/rsem/index/
     salmon_index             // directory: /path/to/salmon/index/
     kallisto_index           // directory: /path/to/kallisto/index/
     hisat2_index             // directory: /path/to/hisat2/index/
     bbsplit_index            // directory: /path/to/bbsplit/index/
+    sortmerna_index          // directory: /path/to/sortmerna/index/
     gencode                  //   boolean: whether the genome is from GENCODE
     featurecounts_group_type //    string: The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts
     aligner                  //    string: Specifies the alignment algorithm to use - available options are 'star_salmon', 'star_rsem' and 'hisat2'
     pseudo_aligner           //    string: Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner'
     skip_gtf_filter          //   boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs
     skip_bbsplit             //   boolean: Skip BBSplit for removal of non-reference genome reads
+    skip_sortmerna           //   boolean: Skip sortmerna for removal of reads mapping to sequences in sortmerna_fasta_list
     skip_alignment           //   boolean: Skip all of the alignment-based processes within the pipeline
     skip_pseudo_alignment    //   boolean: Skip all of the pseudoalignment-based processes within the pipeline
 
@@ -188,6 +193,7 @@ workflow PREPARE_GENOME {
     //
     def prepare_tool_indices = []
     if (!skip_bbsplit) { prepare_tool_indices << 'bbsplit' }
+    if (!skip_sortmerna) { prepare_tool_indices << 'sortmerna' }
     if (!skip_alignment) { prepare_tool_indices << aligner }
     if (!skip_pseudo_alignment && pseudo_aligner) { prepare_tool_indices << pseudo_aligner }
 
@@ -218,6 +224,34 @@ workflow PREPARE_GENOME {
         }
     }
 
+    //
+    // Uncompress sortmerna index or generate from scratch if required
+    //
+    // ch_sortmerna_index stays empty when 'sortmerna' is not in prepare_tool_indices.
+    // Otherwise it holds the index given via `sortmerna_index` (untarred first if the
+    // path ends with .tar.gz), or an index built from the files listed one per line
+    // in `sortmerna_fasta_list`.
+    ch_sortmerna_index = Channel.empty()
+    if ('sortmerna' in prepare_tool_indices) {
+        if (sortmerna_index) {
+            if (sortmerna_index.endsWith('.tar.gz')) {
+                // UNTAR emits [ meta, path ]; keep only the extracted path
+                ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ( [ [:], sortmerna_index ] ).untar.map { it[1] }
+                ch_versions        = ch_versions.mix(UNTAR_SORTMERNA_INDEX.out.versions)
+            } else {
+                ch_sortmerna_index = Channel.value(file(sortmerna_index))
+            }
+        } else {
+            // Channel.fromList replaces the deprecated Channel.from factory;
+            // every manifest row must point to an existing file.
+            ch_sortmerna_fastas = Channel.fromList(file(sortmerna_fasta_list).readLines())
+                .map { row -> file(row, checkIfExists: true) }
+                .collect()
+                .map{ ['rrna_refs', it] }
+
+            // First and third inputs are empty placeholders; only the reference
+            // FASTAs are supplied to the indexing call.
+            SORTMERNA_INDEX (
+                Channel.of([[],[]]),
+                ch_sortmerna_fastas,
+                Channel.of([[],[]])
+            )
+            // .first() turns the single emitted index into a value channel
+            ch_sortmerna_index = SORTMERNA_INDEX.out.index.first()
+            ch_versions = ch_versions.mix(SORTMERNA_INDEX.out.versions)
+        }
+    }
+
     //
     // Uncompress STAR index or generate from scratch if required
     //
@@ -336,6 +370,7 @@ workflow PREPARE_GENOME {
     chrom_sizes      = ch_chrom_sizes            // channel: path(genome.sizes)
     splicesites      = ch_splicesites            // channel: path(genome.splicesites.txt)
     bbsplit_index    = ch_bbsplit_index          // channel: path(bbsplit/index/)
+    sortmerna_index  = ch_sortmerna_index        // channel: path(sortmerna/index/)
     star_index       = ch_star_index             // channel: path(star/index/)
     rsem_index       = ch_rsem_index             // channel: path(rsem/index/)
     hisat2_index     = ch_hisat2_index           // channel: path(hisat2/index/)

diff --git a/subworkflows/local/prepare_genome/nextflow.config b/subworkflows/local/prepare_genome/nextflow.config
@@ -112,3 +112,16 @@ if (!params.skip_bbsplit && params.bbsplit_fasta_list) {
         }
     }
 }
+
+// Settings for SORTMERNA_INDEX, applied only when rRNA removal is enabled
+// and a ribosomal database manifest has been provided.
+if (params.ribo_database_manifest && params.remove_ribo_rna) {
+    process {
+        withName: 'SORTMERNA_INDEX' {
+            ext.args   = '--index 1'
+            publishDir = [
+                // Published under genome/sortmerna only when --save_reference is set
+                path: { params.save_reference ? "${params.outdir}/genome/sortmerna" : params.outdir },
+                mode: params.publish_dir_mode,
+                // Never publish versions.yml; publish other files only with --save_reference
+                saveAs: { filename -> (!filename.equals('versions.yml') && params.save_reference) ? filename : null }
+            ]
+        }
+    }
+}
diff --git a/workflows/rnaseq/main.nf b/workflows/rnaseq/main.nf
@@ -44,6 +44,7 @@ include { SAMTOOLS_SORT                                        } from '../../mod
 include { PRESEQ_LCEXTRAP                                      } from '../../modules/nf-core/preseq/lcextrap'
 include { QUALIMAP_RNASEQ                                      } from '../../modules/nf-core/qualimap/rnaseq'
 include { SORTMERNA                                            } from '../../modules/nf-core/sortmerna'
+include { SORTMERNA as SORTMERNA_INDEX                         } from '../../modules/nf-core/sortmerna'
 include { STRINGTIE_STRINGTIE                                  } from '../../modules/nf-core/stringtie/stringtie'
 include { SUBREAD_FEATURECOUNTS                                } from '../../modules/nf-core/subread/featurecounts'
 include { MULTIQC                                              } from '../../modules/nf-core/multiqc'
@@ -97,7 +98,9 @@ workflow RNASEQ {
     ch_salmon_index     // channel: path(salmon/index/)
     ch_kallisto_index   // channel: [ meta, path(kallisto/index/) ]
     ch_bbsplit_index    // channel: path(bbsplit/index/)
+    ch_sortmerna_index  // channel: path(sortmerna/index/)
     ch_splicesites      // channel: path(genome.splicesites.txt)
+    make_sortmerna_index // boolean: Whether to create a sortmerna index before running sortmerna
 
     main:
 
@@ -225,14 +228,29 @@ workflow RNASEQ {
     //
     // MODULE: Remove ribosomal RNA reads
     //
+    // Check rRNA databases for sortmerna
     if (params.remove_ribo_rna) {
         ch_ribo_db = file(params.ribo_database_manifest)
-        ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines()).map { row -> file(row, checkIfExists: true) }.collect()
+        if (ch_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${ch_ribo_db.getName()}!"}
+
+        ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines())
+            .map { row -> file(row, checkIfExists: true) }
+            .collect()
+            .map{ ['rrna_refs', it] }
+
+        // Build a sortmerna index inside this workflow when the caller asks for it.
+        // When it runs, the result replaces the ch_sortmerna_index passed in.
+        if (make_sortmerna_index) {
+            // First and third inputs are empty placeholders; only the rRNA
+            // reference FASTAs are supplied to the indexing call.
+            SORTMERNA_INDEX (
+                [[],[]],
+                ch_sortmerna_fastas,
+                [[],[]]
+            )
+            // NOTE(review): SORTMERNA_INDEX.out.versions is not collected here,
+            // unlike the equivalent call in PREPARE_GENOME — confirm this is intended.
+            ch_sortmerna_index = SORTMERNA_INDEX.out.index.first()
+        }
 
         SORTMERNA (
             ch_filtered_reads,
-            ch_sortmerna_fastas.map{ it -> [ [ id:'fastas' ], it ] },
-            [[:],[]]
+            ch_sortmerna_fastas,
+            ch_sortmerna_index
         )
         .reads
         .set { ch_filtered_reads }