diff --git a/CHANGELOG.md b/CHANGELOG.md index 4680c417a..6ef0b84d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-21 + +### Credits + +Special thanks to the following for their contributions to the release: + +- [Jonathan Manning](https://github.com/pinin4fjords) +- [Regina Hertfelder Reynolds](https://github.com/RHReynolds) +- [Matthias Zepper](https://github.com/MatthiasZepper) + +### Enhancements & fixes + +- [PR #1123](https://github.com/nf-core/rnaseq/pull/1123) - Overhaul tximport.r, output length tables +- [PR #1124](https://github.com/nf-core/rnaseq/pull/1124) - Ensure pseudoaligner is set if pseudoalignment is not skipped +- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. +- [PR #1127](https://github.com/nf-core/rnaseq/pull/1127) - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. + ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 ### Enhancements and fixes -- [[PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix +- [PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix ## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17 diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 265250627..b2215fde6 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -23,14 +23,14 @@ def extract_fasta_seq_names(fasta_name: str) -> Set[str]: def tab_delimited(file: str) -> float: """Check if file is tab-delimited and return median number of tabs.""" with open(file, "r") as f: - data = f.read(1024) + data = f.read(102400) return statistics.median(line.count("\t") for line in data.split("\n")) def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: - raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") + raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.") seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") diff --git a/bin/tximport.r b/bin/tximport.r index ee3d8b212..59a8fcd9e 100755 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -1,94 +1,143 @@ #!/usr/bin/env Rscript -# Written by Lorena Pantano and released under the MIT license. +# Script for importing and processing transcript-level quantifications. +# Written by Lorena Pantano, later modified by Jonathan Manning, and released +# under the MIT license. +# Loading required libraries library(SummarizedExperiment) library(tximport) -args = commandArgs(trailingOnly=TRUE) +# Parsing command line arguments +args <- commandArgs(trailingOnly=TRUE) if (length(args) < 4) { - stop("Usage: tximport.r ", call.=FALSE) + stop("Usage: tximport.r ", + call.=FALSE) } -coldata = args[1] -path = args[2] -sample_name = args[3] -quant_type = args[4] -tx2gene_path = args[5] - -prefix = sample_name - -info = file.info(tx2gene_path) -if (info$size == 0) { - tx2gene = NULL -} else { - rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE) - colnames(rowdata) = c("tx", "gene_id", "gene_name") - tx2gene = rowdata[,1:2] +# Assigning command line arguments to variables +coldata_path <- args[1] +path <- args[2] +prefix <- args[3] +quant_type <- args[4] +tx2gene_path <- args[5] + +## Functions + +# Build a table from a SummarizedExperiment object +build_table <- function(se.obj, slot) { + cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) } -pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") -fns = list.files(path, pattern = pattern, recursive = T, full.names = T) -names = basename(dirname(fns)) -names(fns) = names - -if (file.exists(coldata)) { - coldata = read.csv(coldata, sep="\t") - coldata = coldata[match(names, coldata[,1]),] - coldata = cbind(files = fns, coldata) -} else { - message("ColData not available: ", coldata) - coldata = data.frame(files = fns, names = names) +# Write a table to a file with given parameters +write_se_table <- function(params) { + file_name <- paste0(prefix, ".", params$suffix) + write.table(build_table(params$obj, params$slot), file_name, + sep="\t", quote=FALSE, row.names = FALSE) } -dropInfReps = quant_type == "kallisto" +# Read transcript metadata from a given path +read_transcript_info <- function(tinfo_path){ + info <- file.info(tinfo_path) + if (info$size == 0) { + stop("tx2gene file is empty") + } + + transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, + col.names = c("tx", "gene_id", "gene_name")) -txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) -rownames(coldata) = coldata[["names"]] -extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]])) -if (length(extra) > 0) { - rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) + transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] + rownames(transcript_info) <- transcript_info[["tx"]] + + list(transcript = transcript_info, + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) } -rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),] -rownames(rowdata) = rowdata[["tx"]] -se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]), - colData = DataFrame(coldata), - rowData = rowdata) -if (!is.null(tx2gene)) { - gi = summarizeToGene(txi, tx2gene = tx2gene) - gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") - gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") - growdata = unique(rowdata[,2:3]) - growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),] - rownames(growdata) = growdata[["tx"]] - gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) - gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) - gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) + +# Read and process sample/column data from a given path +read_coldata <- function(coldata_path){ + if (file.exists(coldata_path)) { + coldata <- read.csv(coldata_path, sep="\t") + coldata <- coldata[match(names, coldata[,1]),] + coldata <- cbind(files = fns, coldata) + } else { + message("ColData not available: ", coldata_path) + coldata <- data.frame(files = fns, names = names) + } + rownames(coldata) <- coldata[["names"]] } -build_table = function(se.obj, slot) { - cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) +# Create a SummarizedExperiment object with given data +create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { + SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), + colData = col_data, + rowData = row_data) } -if(exists("gse")){ - write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) +# Main script starts here + +# Define pattern for file names based on quantification type +pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") +fns <- list.files(path, pattern = pattern, recursive = T, full.names = T) +names <- basename(dirname(fns)) +names(fns) <- names +dropInfReps <- quant_type == "kallisto" + +# Import transcript-level quantifications +txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) + +# Read transcript and sample data +transcript_info <- read_transcript_info(tx2gene_path) +coldata <- read_coldata(coldata_path) + +# Create initial SummarizedExperiment object +se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], + DataFrame(coldata), transcript_info$transcript) + +# Setting parameters for writing tables +params <- list( + list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), + list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), + list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") +) + +# Process gene-level data if tx2gene mapping is available +if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) { + tx2gene <- transcript_info$tx2gene + gi <- summarizeToGene(txi, tx2gene = tx2gene) + gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") + gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") + + gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),] + rownames(gene_info) <- gene_info[["tx"]] + + col_data_frame <- DataFrame(coldata) + + # Create gene-level SummarizedExperiment objects + gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], + col_data_frame, gene_info) + gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], + col_data_frame, gene_info) + gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], + col_data_frame, gene_info) + + params <- c(params, list( + list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), + list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), + list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"), + list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"), + list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"), + list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"), + list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv") + )) } -write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) -write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) +# Writing tables for each set of parameters +done <- lapply(params, write_se_table) -# Print sessioninfo to standard out +# Output session information and citations citation("tximeta") sessionInfo() diff --git a/conf/modules.config b/conf/modules.config index 6e28f549b..efef36974 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1087,7 +1087,6 @@ if (!params.skip_multiqc) { if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { process { - withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' { ext.args = { params.extra_salmon_quant_args ?: '' } publishDir = [ @@ -1112,7 +1111,7 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'kallisto') { } } -if (!params.skip_pseudo_alignment) { +if (!params.skip_pseudo_alignment && params.pseudo_aligner) { process { withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TX2GENE' { publishDir = [ diff --git a/modules/local/tximport/main.nf b/modules/local/tximport/main.nf index 56d826a7d..cd97d139f 100644 --- a/modules/local/tximport/main.nf +++ b/modules/local/tximport/main.nf @@ -16,8 +16,10 @@ process TXIMPORT { path "*gene_counts.tsv" , emit: counts_gene path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled + path "*gene_lengths.tsv" , emit: lengths_gene path "*transcript_tpm.tsv" , emit: tpm_transcript path "*transcript_counts.tsv" , emit: counts_transcript + path "*transcript_lengths.tsv" , emit: lengths_transcript path "versions.yml" , emit: versions when: diff --git a/nextflow.config b/nextflow.config index d42914210..89f72ec5b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -317,7 +317,7 @@ manifest { description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control.""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '3.13.1' + version = '3.13.2' doi = 'https://doi.org/10.5281/zenodo.1400710' } diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 0be947954..0840c7734 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -47,7 +47,7 @@ workflow PREPARE_GENOME { rsem_index // directory: /path/to/rsem/index/ salmon_index // directory: /path/to/salmon/index/ kallisto_index // directory: /path/to/kallisto/index/ - hisat2_index // directory: /path/to/hisat2/index/ + hisat2_index // directory: /path/to/hisat2/index/ bbsplit_index // directory: /path/to/rsem/index/ gencode // boolean: whether the genome is from GENCODE is_aws_igenome // boolean: whether the genome files are from AWS iGenomes @@ -139,14 +139,13 @@ workflow PREPARE_GENOME { } else { ch_transcript_fasta = Channel.value(file(transcript_fasta)) } - if (gencode) { + if (gencode) { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta ) ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) } } else { ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta - ch_versions = ch_versions.mix(GTF_FILTER.out.versions) ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } @@ -268,7 +267,7 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) } } - + // // Uncompress Kallisto index or generate from scratch if required // diff --git a/subworkflows/local/quantify_pseudo/main.nf b/subworkflows/local/quantify_pseudo/main.nf index 886b45e7b..7887f086f 100644 --- a/subworkflows/local/quantify_pseudo/main.nf +++ b/subworkflows/local/quantify_pseudo/main.nf @@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT { results = ch_pseudo_results // channel: [ val(meta), results_dir ] multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ] - tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ] - counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ] - counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ] - counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ] - tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ] - counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ] + tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv + counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv + lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv + counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv + counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv + tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv + counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv + lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv merged_gene_rds = SE_GENE.out.rds // path: *.rds merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds diff --git a/tower.yml b/tower.yml index 21f821a53..7ce7e2dff 100644 --- a/tower.yml +++ b/tower.yml @@ -5,6 +5,10 @@ reports: display: "All samples STAR Salmon DESeq2 QC PDF plots" "**/star_salmon/salmon.merged.gene_counts.tsv": display: "All samples STAR Salmon merged gene raw counts" + "**/star_salmon/salmon.merged.gene_lengths.tsv": + display: "All samples STAR Salmon mean transcript length per gene" + "**/star_salmon/salmon.merged.transcript_lengths.tsv": + display: "All samples STAR Salmon transcript lengths" "**/star_salmon/salmon.merged.gene_tpm.tsv": display: "All samples STAR Salmon merged gene TPM counts" "**/star_salmon/salmon.merged.transcript_counts.tsv": @@ -15,6 +19,10 @@ reports: display: "All samples Salmon DESeq2 QC PDF plots" "**/salmon/salmon.merged.gene_counts.tsv": display: "All samples Salmon merged gene raw counts" + "**/salmon/salmon.merged.gene_lengths.tsv": + display: "All samples Salmon mean transcript length per gene" + "**/salmon/salmon.merged.transcript_lengths.tsv": + display: "All samples Salmon transcript lengths" "**/salmon/salmon.merged.gene_tpm.tsv": display: "All samples Salmon merged gene TPM counts" "**/salmon/salmon.merged.transcript_counts.tsv": @@ -25,6 +33,10 @@ reports: display: "All samples Kallisto DESeq2 QC PDF plots" "**/kallisto/kallisto.merged.gene_counts.tsv": display: "All samples Kallisto merged gene raw counts" + "**/kallisto/kallisto.merged.gene_lengths.tsv": + display: "All samples Kallisto mean transcript length per gene" + "**/kallisto/kallisto.merged.transcript_lengths.tsv": + display: "All samples Kallisto transcript lengths" "**/kallisto/kallisto.merged.gene_tpm.tsv": display: "All samples Kallisto merged gene TPM counts" "**/kallisto/kallisto.merged.transcript_counts.tsv": diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 403194ac2..f8def2d10 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -816,9 +816,8 @@ workflow RNASEQ { // ch_pseudo_multiqc = Channel.empty() ch_pseudoaligner_pca_multiqc = Channel.empty() - ch_pseudoaligner_clustering_multiqc = Channel.empty() - - if (!params.skip_pseudo_alignment) { + ch_pseudoaligner_clustering_multiqc = Channel.empty() + if (!params.skip_pseudo_alignment && params.pseudo_aligner) { if (params.pseudo_aligner == 'salmon') { ch_pseudo_index = PREPARE_GENOME.out.salmon_index