Merge pull request #1342 from nf-core/factor_out_preprocessing

Factor out preprocessing
nf-core · Jul 17, 2024 · 5bd04b4 · 5bd04b4
2 parents 66f3594 + 5507a6d
commit 5bd04b4
Show file tree

Hide file tree

Showing 16 changed files with 945 additions and 419 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -104,6 +104,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 - [PR #1336](https://github.com/nf-core/rnaseq/pull/1334) - Use nf-core/setup-nf-test to install nf-test from cache during CI/CD
 - [PR #1340](https://github.com/nf-core/rnaseq/pull/1340) - Remove out-of-date Azure specific guidance
 - [PR #1341](https://github.com/nf-core/rnaseq/pull/1341) - Add rename in the MultiQC report for samples without techreps
+- [PR #1342](https://github.com/nf-core/rnaseq/pull/1342) - Factor out preprocessing
 
 ### Parameters
 

diff --git a/main.nf b/main.nf
@@ -117,6 +117,7 @@ workflow NFCORE_RNASEQ {
         PREPARE_GENOME.out.salmon_index,
         PREPARE_GENOME.out.kallisto_index,
         PREPARE_GENOME.out.bbsplit_index,
+        PREPARE_GENOME.out.rrna_fastas,
         PREPARE_GENOME.out.sortmerna_index,
         PREPARE_GENOME.out.splicesites,
         !params.remove_ribo_rna && params.remove_ribo_rna

diff --git a/modules.json b/modules.json
@@ -8,7 +8,7 @@
                     "bbmap/bbsplit": {
                         "branch": "master",
                         "git_sha": "2c6b1144ed58b6184ad58fc4e6b6a90219b4bf4f",
-                        "installed_by": ["modules"]
+                        "installed_by": ["fastq_qc_trim_filter_setstrandedness", "modules"]
                     },
                     "bedtools/genomecov": {
                         "branch": "master",
@@ -18,7 +18,7 @@
                     "cat/fastq": {
                         "branch": "master",
                         "git_sha": "4fc983ad0b30e6e32696fa7d980c76c7bfe1c03e",
-                        "installed_by": ["modules"]
+                        "installed_by": ["fastq_qc_trim_filter_setstrandedness", "modules"]
                     },
                     "custom/catadditionalfasta": {
                         "branch": "master",
@@ -202,7 +202,7 @@
                     "sortmerna": {
                         "branch": "master",
                         "git_sha": "df05c8db5195867c0bc7b92c1788115b66f0d17d",
-                        "installed_by": ["modules"]
+                        "installed_by": ["fastq_qc_trim_filter_setstrandedness", "modules"]
                     },
                     "star/align": {
                         "branch": "master",
@@ -315,17 +315,22 @@
                     "fastq_fastqc_umitools_fastp": {
                         "branch": "master",
                         "git_sha": "db35d26edeafacf9906a517827df621a29adc13d",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": ["fastq_qc_trim_filter_setstrandedness", "subworkflows"]
                     },
                     "fastq_fastqc_umitools_trimgalore": {
                         "branch": "master",
                         "git_sha": "cb6defa0834eda9d6d3f967e981c819fc3e257bf",
+                        "installed_by": ["fastq_qc_trim_filter_setstrandedness", "subworkflows"]
+                    },
+                    "fastq_qc_trim_filter_setstrandedness": {
+                        "branch": "master",
+                        "git_sha": "b86de50ab60c19ab40e70a4501820f4cb307050b",
                         "installed_by": ["subworkflows"]
                     },
                     "fastq_subsample_fq_salmon": {
                         "branch": "master",
                         "git_sha": "727232afb8294b53dd9d05bfe469b70cce1675bb",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": ["fastq_qc_trim_filter_setstrandedness", "subworkflows"]
                     },
                     "quantify_pseudo_alignment": {
                         "branch": "master",

diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
@@ -228,7 +228,11 @@ workflow PREPARE_GENOME {
     // Uncompress sortmerna index or generate from scratch if required
     //
     ch_sortmerna_index = Channel.empty()
+    ch_rrna_fastas = Channel.empty()
+
     if ('sortmerna' in prepare_tool_indices) {
+        ribo_db = file(sortmerna_fasta_list)
+
         if (sortmerna_index) {
             if (sortmerna_index.endsWith('.tar.gz')) {
                 ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ( [ [:], sortmerna_index ] ).untar.map { it[1] }
@@ -237,14 +241,12 @@ workflow PREPARE_GENOME {
                 ch_sortmerna_index = Channel.value(file(sortmerna_index))
             }
         } else {
-            ch_sortmerna_fastas = Channel.from(file(sortmerna_fasta_list).readLines())
+            ch_rrna_fastas = Channel.from(ribo_db.readLines())
                 .map { row -> file(row, checkIfExists: true) }
-                .collect()
-                .map { [ 'rrna_refs', it ] }
 
             SORTMERNA_INDEX (
                 Channel.of([ [],[] ]),
-                ch_sortmerna_fastas,
+                ch_rrna_fastas.collect().map { [ 'rrna_refs', it ] },
                 Channel.of([ [],[] ])
             )
             ch_sortmerna_index = SORTMERNA_INDEX.out.index.first()
@@ -370,6 +372,7 @@ workflow PREPARE_GENOME {
     chrom_sizes      = ch_chrom_sizes            // channel: path(genome.sizes)
     splicesites      = ch_splicesites            // channel: path(genome.splicesites.txt)
     bbsplit_index    = ch_bbsplit_index          // channel: path(bbsplit/index/)
+    rrna_fastas      = ch_rrna_fastas            // channel: path(rrna.fasta), one file per line of sortmerna_fasta_list
     sortmerna_index  = ch_sortmerna_index        // channel: path(sortmerna/index/)
     star_index       = ch_star_index             // channel: path(star/index/)
     rsem_index       = ch_rsem_index             // channel: path(rsem/index/)

diff --git a/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf
@@ -20,6 +20,7 @@ include { imNotification            } from '../../nf-core/utils_nfcore_pipeline'
 include { UTILS_NFCORE_PIPELINE     } from '../../nf-core/utils_nfcore_pipeline'
 include { workflowCitation          } from '../../nf-core/utils_nfcore_pipeline'
 include { logColours                } from '../../nf-core/utils_nfcore_pipeline'
+include { calculateStrandedness     } from '../../nf-core/fastq_qc_trim_filter_setstrandedness'
 
 /*
 ========================================================================================
@@ -548,63 +549,6 @@ def biotypeInGtf(gtf_file, biotype) {
     }
 }
 
-//
-// Function to determine library type by comparing type counts. Consistent
-// between Salmon and RSeQC
-//
-def calculateStrandedness(forwardFragments, reverseFragments, unstrandedFragments, stranded_threshold=0.8, unstranded_threshold=0.1) {
-    def totalFragments = forwardFragments + reverseFragments + unstrandedFragments
-    def totalStrandedFragments = forwardFragments + reverseFragments
-
-    def library_strandedness = 'undetermined'
-    if (totalStrandedFragments > 0) {
-        def forwardProportion = forwardFragments / (totalStrandedFragments as double)
-        def reverseProportion = reverseFragments / (totalStrandedFragments as double)
-        def proportionDifference = Math.abs(forwardProportion - reverseProportion)
-
-        if (forwardProportion >= stranded_threshold) {
-            strandedness = 'forward'
-        } else if (reverseProportion >= stranded_threshold) {
-            strandedness = 'reverse'
-        } else if (proportionDifference <= unstranded_threshold) {
-            strandedness = 'unstranded'
-        }
-    }
-
-    return [
-        inferred_strandedness: strandedness,
-        forwardFragments: (forwardFragments / (totalFragments as double)) * 100,
-        reverseFragments: (reverseFragments / (totalFragments as double)) * 100,
-        unstrandedFragments: (unstrandedFragments / (totalFragments as double)) * 100
-    ]
-}
-
-//
-// Function that parses Salmon quant 'lib_format_counts.json' output file to get inferred strandedness
-//
-def getSalmonInferredStrandedness(json_file, stranded_threshold = 0.8, unstranded_threshold = 0.1) {
-    // Parse the JSON content of the file
-    def libCounts = new JsonSlurper().parseText(json_file.text)
-
-    // Calculate the counts for forward and reverse strand fragments
-    def forwardKeys = ['SF', 'ISF', 'MSF', 'OSF']
-    def reverseKeys = ['SR', 'ISR', 'MSR', 'OSR']
-
-    // Calculate unstranded fragments (IU and U)
-    // NOTE: this is here for completeness, but actually all fragments have a
-    // strandedness (even if the overall library does not), so all these values
-    // will be '0'. See
-    // https://groups.google.com/g/sailfish-users/c/yxzBDv6NB6I
-    def unstrandedKeys = ['IU', 'U', 'MU']
-
-    def forwardFragments = forwardKeys.collect { libCounts[it] ?: 0 }.sum()
-    def reverseFragments = reverseKeys.collect { libCounts[it] ?: 0 }.sum()
-    def unstrandedFragments = unstrandedKeys.collect { libCounts[it] ?: 0 }.sum()
-
-    // Use shared calculation function to determine strandedness
-    return calculateStrandedness(forwardFragments, reverseFragments, unstrandedFragments, stranded_threshold, unstranded_threshold)
-}
-
 //
 // Function that parses RSeQC infer_experiment output file to get inferred strandedness
 //

diff --git a/subworkflows/local/utils_nfcore_rnaseq_pipeline/tests/main.function.nf.test b/subworkflows/local/utils_nfcore_rnaseq_pipeline/tests/main.function.nf.test
@@ -390,115 +390,6 @@ nextflow_function {
 
     }
 
-    test("Test Function getSalmonInferredStrandedness unstranded") {
-
-        function "getSalmonInferredStrandedness"
-
-        when {
-            function {
-                """
-                import groovy.json.JsonOutput
-
-                // Define the JSON contents for the test
-                def json_contents = JsonOutput.toJson([
-                    "SF": 0,
-                    "SR": 0,
-                    "ISF": 100,
-                    "ISR": 100,
-                    "IU": 0,
-                    "U": 0
-                ])
-                def jsonFile = file("${workDir}/salmonUnstranded.json")
-                jsonFile.write(json_contents)
-
-                input[0] = jsonFile
-                input[1] = 0.8
-                input[2] = 0.1
-                """
-            }
-        }
-
-        then {
-            assertAll(
-                { assert function.success },
-                { assert snapshot(function.result).match() }
-            )
-        }
-
-    }
-
-    test("Test Function getSalmonInferredStrandedness forward") {
-
-        function "getSalmonInferredStrandedness"
-
-        when {
-            function {
-                """
-                import groovy.json.JsonOutput
-
-                def json_contents = JsonOutput.toJson([
-                    "SF": 0,
-                    "SR": 0,
-                    "ISF": 100,
-                    "ISR": 0,
-                    "IU": 0,
-                    "U": 0
-                ])
-                def jsonFile = file("${workDir}/salmonForward.json")
-                jsonFile.write(json_contents)
-
-                input[0] = jsonFile
-                input[1] = 0.8
-                input[2] = 0.1
-                """
-            }
-        }
-
-        then {
-            assertAll(
-                { assert function.success },
-                { assert snapshot(function.result).match() }
-            )
-        }
-
-    }
-
-    test("Test Function getSalmonInferredStrandedness reverse") {
-
-        function "getSalmonInferredStrandedness"
-
-        when {
-            function {
-                """
-                import groovy.json.JsonOutput
-
-                def json_contents = JsonOutput.toJson([
-                    "SF": 0,
-                    "SR": 0,
-                    "ISF": 0,
-                    "ISR": 100,
-                    "IU": 0,
-                    "U": 0
-                ])
-                def jsonFile = file("${workDir}/salmonReverse.json")
-                jsonFile.write(json_contents)
-
-                input[0] = jsonFile
-                input[1] = 0.8
-                input[2] = 0.1
-                """
-            }
-        }
-
-        then {
-            assertAll(
-                { assert function.success },
-                { assert snapshot(function.result).match() }
-            )
-        }
-
-    }
-
     test("Test Function getStarPercentMapped pass") {
 
         function "getStarPercentMapped"

diff --git a/subworkflows/local/utils_nfcore_rnaseq_pipeline/tests/main.function.nf.test.snap b/subworkflows/local/utils_nfcore_rnaseq_pipeline/tests/main.function.nf.test.snap
@@ -45,36 +45,6 @@
         },
         "timestamp": "2024-03-06T14:33:26.903306"
     },
-    "Test Function getSalmonInferredStrandedness unstranded": {
-        "content": [
-            {
-                "inferred_strandedness": "unstranded",
-                "forwardFragments": 50.0,
-                "reverseFragments": 50.0,
-                "unstrandedFragments": 0.0
-            }
-        ],
-        "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "23.10.1"
-        },
-        "timestamp": "2024-06-18T14:29:54.96715"
-    },
-    "Test Function getSalmonInferredStrandedness reverse": {
-        "content": [
-            {
-                "inferred_strandedness": "reverse",
-                "forwardFragments": 0.0,
-                "reverseFragments": 100.0,
-                "unstrandedFragments": 0.0
-            }
-        ],
-        "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "23.10.1"
-        },
-        "timestamp": "2024-06-18T14:30:11.417381"
-    },
     "Test Function checkSamplesAfterGrouping invalid strandedness": {
         "content": null,
         "meta": {
@@ -215,21 +185,6 @@
         },
         "timestamp": "2024-03-06T14:32:49.565504"
     },
-    "Test Function getSalmonInferredStrandedness forward": {
-        "content": [
-            {
-                "inferred_strandedness": "forward",
-                "forwardFragments": 100.0,
-                "reverseFragments": 0.0,
-                "unstrandedFragments": 0.0
-            }
-        ],
-        "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "23.10.1"
-        },
-        "timestamp": "2024-06-18T14:30:03.301262"
-    },
     "Test Function rsemStarIndexWarn": {
         "content": null,
         "meta": {