Skip to content

Commit

Permalink
Merge pull request #1342 from nf-core/factor_out_preprocessing
Browse files Browse the repository at this point in the history
Factor out preprocessing
  • Loading branch information
pinin4fjords committed Jul 17, 2024
2 parents 66f3594 + 5507a6d commit 5bd04b4
Show file tree
Hide file tree
Showing 16 changed files with 945 additions and 419 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
- [PR #1336](https://github.com/nf-core/rnaseq/pull/1336) - Use nf-core/setup-nf-test to install nf-test from cache during CI/CD
- [PR #1340](https://github.com/nf-core/rnaseq/pull/1340) - Remove out-of-date Azure specific guidance
- [PR #1341](https://github.com/nf-core/rnaseq/pull/1341) - Add rename in the MultiQC report for samples without techreps
- [PR #1342](https://github.com/nf-core/rnaseq/pull/1342) - Factor out preprocessing

### Parameters

Expand Down
1 change: 1 addition & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ workflow NFCORE_RNASEQ {
PREPARE_GENOME.out.salmon_index,
PREPARE_GENOME.out.kallisto_index,
PREPARE_GENOME.out.bbsplit_index,
PREPARE_GENOME.out.rrna_fastas,
PREPARE_GENOME.out.sortmerna_index,
PREPARE_GENOME.out.splicesites,
!params.remove_ribo_rna && params.remove_ribo_rna
Expand Down
15 changes: 10 additions & 5 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"bbmap/bbsplit": {
"branch": "master",
"git_sha": "2c6b1144ed58b6184ad58fc4e6b6a90219b4bf4f",
"installed_by": ["modules"]
"installed_by": ["fastq_qc_trim_filter_setstrandedness", "modules"]
},
"bedtools/genomecov": {
"branch": "master",
Expand All @@ -18,7 +18,7 @@
"cat/fastq": {
"branch": "master",
"git_sha": "4fc983ad0b30e6e32696fa7d980c76c7bfe1c03e",
"installed_by": ["modules"]
"installed_by": ["fastq_qc_trim_filter_setstrandedness", "modules"]
},
"custom/catadditionalfasta": {
"branch": "master",
Expand Down Expand Up @@ -202,7 +202,7 @@
"sortmerna": {
"branch": "master",
"git_sha": "df05c8db5195867c0bc7b92c1788115b66f0d17d",
"installed_by": ["modules"]
"installed_by": ["fastq_qc_trim_filter_setstrandedness", "modules"]
},
"star/align": {
"branch": "master",
Expand Down Expand Up @@ -315,17 +315,22 @@
"fastq_fastqc_umitools_fastp": {
"branch": "master",
"git_sha": "db35d26edeafacf9906a517827df621a29adc13d",
"installed_by": ["subworkflows"]
"installed_by": ["fastq_qc_trim_filter_setstrandedness", "subworkflows"]
},
"fastq_fastqc_umitools_trimgalore": {
"branch": "master",
"git_sha": "cb6defa0834eda9d6d3f967e981c819fc3e257bf",
"installed_by": ["fastq_qc_trim_filter_setstrandedness", "subworkflows"]
},
"fastq_qc_trim_filter_setstrandedness": {
"branch": "master",
"git_sha": "b86de50ab60c19ab40e70a4501820f4cb307050b",
"installed_by": ["subworkflows"]
},
"fastq_subsample_fq_salmon": {
"branch": "master",
"git_sha": "727232afb8294b53dd9d05bfe469b70cce1675bb",
"installed_by": ["subworkflows"]
"installed_by": ["fastq_qc_trim_filter_setstrandedness", "subworkflows"]
},
"quantify_pseudo_alignment": {
"branch": "master",
Expand Down
11 changes: 7 additions & 4 deletions subworkflows/local/prepare_genome/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,11 @@ workflow PREPARE_GENOME {
// Uncompress sortmerna index or generate from scratch if required
//
ch_sortmerna_index = Channel.empty()
ch_rrna_fastas = Channel.empty()

if ('sortmerna' in prepare_tool_indices) {
ribo_db = file(sortmerna_fasta_list)

if (sortmerna_index) {
if (sortmerna_index.endsWith('.tar.gz')) {
ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ( [ [:], sortmerna_index ] ).untar.map { it[1] }
Expand All @@ -237,14 +241,12 @@ workflow PREPARE_GENOME {
ch_sortmerna_index = Channel.value(file(sortmerna_index))
}
} else {
ch_sortmerna_fastas = Channel.from(file(sortmerna_fasta_list).readLines())
ch_rrna_fastas = Channel.from(ribo_db.readLines())
.map { row -> file(row, checkIfExists: true) }
.collect()
.map { [ 'rrna_refs', it ] }

SORTMERNA_INDEX (
Channel.of([ [],[] ]),
ch_sortmerna_fastas,
ch_rrna_fastas.collect().map { [ 'rrna_refs', it ] },
Channel.of([ [],[] ])
)
ch_sortmerna_index = SORTMERNA_INDEX.out.index.first()
Expand Down Expand Up @@ -370,6 +372,7 @@ workflow PREPARE_GENOME {
chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes)
splicesites = ch_splicesites // channel: path(genome.splicesites.txt)
bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/)
rrna_fastas = ch_rrna_fastas // channel: path(sortmerna_fasta_list)
sortmerna_index = ch_sortmerna_index // channel: path(sortmerna/index/)
star_index = ch_star_index // channel: path(star/index/)
rsem_index = ch_rsem_index // channel: path(rsem/index/)
Expand Down
58 changes: 1 addition & 57 deletions subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ include { imNotification } from '../../nf-core/utils_nfcore_pipeline'
include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline'
include { workflowCitation } from '../../nf-core/utils_nfcore_pipeline'
include { logColours } from '../../nf-core/utils_nfcore_pipeline'
include { calculateStrandedness } from '../../nf-core/fastq_qc_trim_filter_setstrandedness'

/*
========================================================================================
Expand Down Expand Up @@ -548,63 +549,6 @@ def biotypeInGtf(gtf_file, biotype) {
}
}

//
// Function to determine library type by comparing type counts. Consistent
// between Salmon and RSeQC
//
// Parameters:
//   forwardFragments     - count of fragments supporting the forward orientation
//   reverseFragments     - count of fragments supporting the reverse orientation
//   unstrandedFragments  - count of fragments with no strand assignment
//   stranded_threshold   - minimum proportion (of stranded fragments) on one strand
//                          for the library to be called 'forward' or 'reverse'
//   unstranded_threshold - maximum |forward - reverse| proportion difference for the
//                          library to be called 'unstranded'
//
// Returns a map with:
//   inferred_strandedness - 'forward', 'reverse', 'unstranded' or 'undetermined'
//   forwardFragments / reverseFragments / unstrandedFragments - each count as a
//                          percentage of the total of all three counts
//
def calculateStrandedness(forwardFragments, reverseFragments, unstrandedFragments, stranded_threshold=0.8, unstranded_threshold=0.1) {
    def totalFragments = forwardFragments + reverseFragments + unstrandedFragments
    def totalStrandedFragments = forwardFragments + reverseFragments

    // Declared locally so the 'undetermined' default is actually what gets returned
    // when no branch below matches, and so no value is shared between calls.
    def strandedness = 'undetermined'
    if (totalStrandedFragments > 0) {
        // Proportions are relative to stranded fragments only, not the grand total
        def forwardProportion = forwardFragments / (totalStrandedFragments as double)
        def reverseProportion = reverseFragments / (totalStrandedFragments as double)
        def proportionDifference = Math.abs(forwardProportion - reverseProportion)

        if (forwardProportion >= stranded_threshold) {
            strandedness = 'forward'
        } else if (reverseProportion >= stranded_threshold) {
            strandedness = 'reverse'
        } else if (proportionDifference <= unstranded_threshold) {
            strandedness = 'unstranded'
        }
        // Anything in between (e.g. a 30/70 split with default thresholds) stays 'undetermined'
    }

    // NOTE(review): with a total of 0 these are floating-point 0/0 divisions, which
    // presumably yield NaN rather than throwing - confirm this is acceptable downstream.
    return [
        inferred_strandedness: strandedness,
        forwardFragments: (forwardFragments / (totalFragments as double)) * 100,
        reverseFragments: (reverseFragments / (totalFragments as double)) * 100,
        unstrandedFragments: (unstrandedFragments / (totalFragments as double)) * 100
    ]
}

//
// Infer library strandedness from the 'lib_format_counts.json' file written by Salmon quant
//
// json_file is read via its '.text' property and parsed as JSON; the two thresholds
// are passed through unchanged to calculateStrandedness, whose result is returned.
//
def getSalmonInferredStrandedness(json_file, stranded_threshold = 0.8, unstranded_threshold = 0.1) {
    def parsedCounts = new JsonSlurper().parseText(json_file.text)

    // Sum the counts recorded under the given keys; a missing key contributes 0
    def sumCounts = { keyList ->
        keyList.collect { key -> parsedCounts[key] ?: 0 }.sum()
    }

    // Library-format keys counted as forward-strand and reverse-strand fragments
    def forwardFragments = sumCounts(['SF', 'ISF', 'MSF', 'OSF'])
    def reverseFragments = sumCounts(['SR', 'ISR', 'MSR', 'OSR'])

    // Unstranded keys (IU, U, MU).
    // NOTE: this is here for completeness, but actually all fragments have a
    // strandedness (even if the overall library does not), so all these values
    // will be '0'. See
    // https://groups.google.com/g/sailfish-users/c/yxzBDv6NB6I
    def unstrandedFragments = sumCounts(['IU', 'U', 'MU'])

    // Delegate the actual call to the calculation shared with the RSeQC path
    return calculateStrandedness(
        forwardFragments,
        reverseFragments,
        unstrandedFragments,
        stranded_threshold,
        unstranded_threshold
    )
}

//
// Function that parses RSeQC infer_experiment output file to get inferred strandedness
//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -390,115 +390,6 @@ nextflow_function {

}

// Balanced case: the JSON written below has ISF = 100 and ISR = 100 with every
// other key at 0. The stored snapshot for this test records 'unstranded' with
// a 50/50 forward/reverse split.
test("Test Function getSalmonInferredStrandedness unstranded") {

function "getSalmonInferredStrandedness"

// Inputs: [0] the JSON file written into workDir, [1] = 0.8 and [2] = 0.1
// (the function's stranded and unstranded thresholds, in that order).
when {
function {
"""
import groovy.json.JsonOutput
// Define the JSON contents for the test
def json_contents = JsonOutput.toJson([
"SF": 0,
"SR": 0,
"ISF": 100,
"ISR": 100,
"IU": 0,
"U": 0
])
def jsonFile = file("${workDir}/salmonUnstranded.json")
jsonFile.write(json_contents)
input[0] = jsonFile
input[1] = 0.8
input[2] = 0.1
"""
}
}

// Passes when the function call succeeds and its result matches the snapshot.
then {
assertAll(
{ assert function.success },
{ assert snapshot(function.result).match() }
)
}

}

// Forward case: the JSON written below has ISF = 100 and every other key at 0.
// The stored snapshot for this test records 'forward' with 100% forward fragments.
test("Test Function getSalmonInferredStrandedness forward") {

function "getSalmonInferredStrandedness"

// Inputs: [0] the JSON file written into workDir, [1] = 0.8 and [2] = 0.1
// (the function's stranded and unstranded thresholds, in that order).
when {
function {
"""
import groovy.json.JsonOutput
def json_contents = JsonOutput.toJson([
"SF": 0,
"SR": 0,
"ISF": 100,
"ISR": 0,
"IU": 0,
"U": 0
])
def jsonFile = file("${workDir}/salmonForward.json")
jsonFile.write(json_contents)
input[0] = jsonFile
input[1] = 0.8
input[2] = 0.1
"""
}
}

// Passes when the function call succeeds and its result matches the snapshot.
then {
assertAll(
{ assert function.success },
{ assert snapshot(function.result).match() }
)
}

}

// Reverse case: the JSON written below has ISR = 100 and every other key at 0.
// The stored snapshot for this test records 'reverse' with 100% reverse fragments.
test("Test Function getSalmonInferredStrandedness reverse") {

function "getSalmonInferredStrandedness"

// Inputs: [0] the JSON file written into workDir, [1] = 0.8 and [2] = 0.1
// (the function's stranded and unstranded thresholds, in that order).
when {
function {
"""
import groovy.json.JsonOutput
def json_contents = JsonOutput.toJson([
"SF": 0,
"SR": 0,
"ISF": 0,
"ISR": 100,
"IU": 0,
"U": 0
])
def jsonFile = file("${workDir}/salmonReverse.json")
jsonFile.write(json_contents)
input[0] = jsonFile
input[1] = 0.8
input[2] = 0.1
"""
}
}

// Passes when the function call succeeds and its result matches the snapshot.
then {
assertAll(
{ assert function.success },
{ assert snapshot(function.result).match() }
)
}

}

test("Test Function getStarPercentMapped pass") {

function "getStarPercentMapped"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,36 +45,6 @@
},
"timestamp": "2024-03-06T14:33:26.903306"
},
"Test Function getSalmonInferredStrandedness unstranded": {
"content": [
{
"inferred_strandedness": "unstranded",
"forwardFragments": 50.0,
"reverseFragments": 50.0,
"unstrandedFragments": 0.0
}
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "23.10.1"
},
"timestamp": "2024-06-18T14:29:54.96715"
},
"Test Function getSalmonInferredStrandedness reverse": {
"content": [
{
"inferred_strandedness": "reverse",
"forwardFragments": 0.0,
"reverseFragments": 100.0,
"unstrandedFragments": 0.0
}
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "23.10.1"
},
"timestamp": "2024-06-18T14:30:11.417381"
},
"Test Function checkSamplesAfterGrouping invalid strandedness": {
"content": null,
"meta": {
Expand Down Expand Up @@ -215,21 +185,6 @@
},
"timestamp": "2024-03-06T14:32:49.565504"
},
"Test Function getSalmonInferredStrandedness forward": {
"content": [
{
"inferred_strandedness": "forward",
"forwardFragments": 100.0,
"reverseFragments": 0.0,
"unstrandedFragments": 0.0
}
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "23.10.1"
},
"timestamp": "2024-06-18T14:30:03.301262"
},
"Test Function rsemStarIndexWarn": {
"content": null,
"meta": {
Expand Down
Loading

0 comments on commit 5bd04b4

Please sign in to comment.