From 5b99b9bbf01072c88e5ea370d5ce86cd5bd9a124 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 12:13:32 +0000 Subject: [PATCH 01/29] [skip ci] overhaul tximport.r --- bin/tximport.r | 177 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 117 insertions(+), 60 deletions(-) diff --git a/bin/tximport.r b/bin/tximport.r index ee3d8b212..75ff41ef4 100755 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -7,86 +7,143 @@ library(tximport) args = commandArgs(trailingOnly=TRUE) if (length(args) < 4) { - stop("Usage: tximport.r ", call.=FALSE) + stop("Usage: tximport.r ", call.=FALSE) } -coldata = args[1] +coldata_path = args[1] path = args[2] -sample_name = args[3] +prefix = args[3] quant_type = args[4] tx2gene_path = args[5] -prefix = sample_name +## Functions -info = file.info(tx2gene_path) -if (info$size == 0) { - tx2gene = NULL -} else { - rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE) - colnames(rowdata) = c("tx", "gene_id", "gene_name") - tx2gene = rowdata[,1:2] +# Function to build a table from a SummarizedExperiment object +build_table <- function(se.obj, slot) { + cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) +} + +# Function to write a table to a file +write_se_table <- function(params) { + file_name <- paste0(prefix, ".", params$suffix) + write.table(build_table(params$obj, params$slot), file_name, sep="\t", quote=FALSE, row.names = FALSE) } +# Read and process transcript meta +read_transcript_info <- function(tinfo_path){ + + info = file.info(tinfo_path) + if (info$size == 0) { + stop("tx2gene file is empty") + } + + # Read transcript information and set column names + transcript_info = read.csv( + tinfo_path, + sep="\t", + header = FALSE, + col.names = c("tx", "gene_id", "gene_name") + ) + + # Identify and add extra transcripts + extra = setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) + transcript_info = rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + + # Reorder rows to match the order in txi and set row names + transcript_info = transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] + rownames(transcript_info) = transcript_info[["tx"]] + + # Return various formulations of the info for downstream use + list( + transcript = transcript_info, + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2] + ) +} + +# Read sample/column data +read_coldata <- function(coldata_path){ + + if (file.exists(coldata_path)) { + coldata = read.csv(coldata_path, sep="\t") + coldata = coldata[match(names, coldata[,1]),] + coldata = cbind(files = fns, coldata) + } else { + message("ColData not available: ", coldata_path) + coldata = data.frame(files = fns, names = names) + } + + rownames(coldata) = coldata[["names"]] +} + +# Function to create a SummarizedExperiment object +create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { + return(SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), + colData = col_data, + rowData = row_data)) +} + +# Read the abundance values pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") fns = list.files(path, pattern = pattern, recursive = T, full.names = T) names = basename(dirname(fns)) names(fns) = names +dropInfReps = quant_type == "kallisto" +txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) -if (file.exists(coldata)) { - coldata = read.csv(coldata, sep="\t") - coldata = coldata[match(names, coldata[,1]),] - coldata = cbind(files = fns, coldata) -} else { - message("ColData not available: ", coldata) - coldata = data.frame(files = fns, names = names) -} +# Read transcript metadata +transcript_info = read_transcript_info(tx2gene_path) -dropInfReps = quant_type == "kallisto" +# Read sample/ column data +coldata = read_coldata(coldata_path) -txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) -rownames(coldata) = coldata[["names"]] -extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]])) -if (length(extra) > 0) { - rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra)) -} -rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),] -rownames(rowdata) = rowdata[["tx"]] -se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]), - colData = DataFrame(coldata), - rowData = rowdata) -if (!is.null(tx2gene)) { - gi = summarizeToGene(txi, tx2gene = tx2gene) - gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") - gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") - growdata = unique(rowdata[,2:3]) - growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),] - rownames(growdata) = growdata[["tx"]] - gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) - gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) - gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) -} +# Process input data to summarized experiment objects -build_table = function(se.obj, slot) { - cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) -} +# Create initial SummarizedExperiment object +se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], + DataFrame(coldata), transcript_info$transcript) + +# Parameters for writing tables +params <- list( + list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), + list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), + list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") +) + +# Check if tx2gene is present in transcript_info +if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) { + tx2gene <- transcript_info$tx2gene + gi <- summarizeToGene(txi, tx2gene = tx2gene) + gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") + gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") + + # Get gene information matching the rows in gi + gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),] + rownames(gene_info) <- gene_info[["tx"]] + + # Create dataframe only once for reuse + col_data_frame <- DataFrame(coldata) -if(exists("gse")){ - write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) + # Create gene-level SummarizedExperiment objects + gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], + col_data_frame, gene_info) + gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], + col_data_frame, gene_info) + gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], + col_data_frame, gene_info) + params <- c(params, list( + list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), + list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), + list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"), + list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"), + list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"), + list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"), + list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv") + )) } -write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) -write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) +# Apply the write_se_table function to each set of parameters +lapply(params, write_se_table) # Print sessioninfo to standard out citation("tximeta") From 5f1826d4b837e7abfab8467baf9cc55ff06d4f4f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:18:47 +0000 Subject: [PATCH 02/29] Fix formatting etc in tximport script --- bin/tximport.r | 122 +++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 66 deletions(-) mode change 100755 => 100644 bin/tximport.r diff --git a/bin/tximport.r b/bin/tximport.r old mode 100755 new mode 100644 index 75ff41ef4..e81e5a7e6 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -1,127 +1,117 @@ #!/usr/bin/env Rscript +# Script for importing and processing transcript-level quantifications. # Written by Lorena Pantano and released under the MIT license. +# Loading required libraries library(SummarizedExperiment) library(tximport) -args = commandArgs(trailingOnly=TRUE) +# Parsing command line arguments +args <- commandArgs(trailingOnly=TRUE) if (length(args) < 4) { - stop("Usage: tximport.r ", call.=FALSE) + stop("Usage: tximport.r ", + call.=FALSE) } -coldata_path = args[1] -path = args[2] -prefix = args[3] -quant_type = args[4] -tx2gene_path = args[5] +# Assigning command line arguments to variables +coldata_path <- args[1] +path <- args[2] +prefix <- args[3] +quant_type <- args[4] +tx2gene_path <- args[5] ## Functions -# Function to build a table from a SummarizedExperiment object +# Build a table from a SummarizedExperiment object build_table <- function(se.obj, slot) { cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) } -# Function to write a table to a file +# Write a table to a file with given parameters write_se_table <- function(params) { file_name <- paste0(prefix, ".", params$suffix) - write.table(build_table(params$obj, params$slot), file_name, sep="\t", quote=FALSE, row.names = FALSE) + write.table(build_table(params$obj, params$slot), file_name, + sep="\t", quote=FALSE, row.names = FALSE) } -# Read and process transcript meta +# Read transcript metadata from a given path read_transcript_info <- function(tinfo_path){ - - info = file.info(tinfo_path) + info <- file.info(tinfo_path) if (info$size == 0) { stop("tx2gene file is empty") } - # Read transcript information and set column names - transcript_info = read.csv( - tinfo_path, - sep="\t", - header = FALSE, - col.names = c("tx", "gene_id", "gene_name") - ) - - # Identify and add extra transcripts - extra = setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) - transcript_info = rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) - - # Reorder rows to match the order in txi and set row names - transcript_info = transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] - rownames(transcript_info) = transcript_info[["tx"]] - - # Return various formulations of the info for downstream use - list( - transcript = transcript_info, - gene = unique(transcript_info[,2:3]), - tx2gene = transcript_info[,1:2] - ) + transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, + col.names = c("tx", "gene_id", "gene_name")) + + extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) + transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] + rownames(transcript_info) <- transcript_info[["tx"]] + + list(transcript = transcript_info, + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) } -# Read sample/column data +# Read and process sample/column data from a given path read_coldata <- function(coldata_path){ - if (file.exists(coldata_path)) { - coldata = read.csv(coldata_path, sep="\t") - coldata = coldata[match(names, coldata[,1]),] - coldata = cbind(files = fns, coldata) + coldata <- read.csv(coldata_path, sep="\t") + coldata <- coldata[match(names, coldata[,1]),] + coldata <- cbind(files = fns, coldata) } else { message("ColData not available: ", coldata_path) - coldata = data.frame(files = fns, names = names) + coldata <- data.frame(files = fns, names = names) } - - rownames(coldata) = coldata[["names"]] + rownames(coldata) <- coldata[["names"]] } -# Function to create a SummarizedExperiment object +# Create a SummarizedExperiment object with given data create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { - return(SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), - colData = col_data, - rowData = row_data)) + SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), + colData = col_data, + rowData = row_data) } -# Read the abundance values -pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") -fns = list.files(path, pattern = pattern, recursive = T, full.names = T) -names = basename(dirname(fns)) -names(fns) = names -dropInfReps = quant_type == "kallisto" -txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) +# Main script starts here -# Read transcript metadata -transcript_info = read_transcript_info(tx2gene_path) +# Define pattern for file names based on quantification type +pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") +fns <- list.files(path, pattern = pattern, recursive = T, full.names = T) +names <- basename(dirname(fns)) +names(fns) <- names +dropInfReps <- quant_type == "kallisto" -# Read sample/ column data -coldata = read_coldata(coldata_path) +# Import transcript-level quantifications +txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) -# Process input data to summarized experiment objects +# Read transcript and sample data +transcript_info <- read_transcript_info(tx2gene_path) +coldata <- read_coldata(coldata_path) # Create initial SummarizedExperiment object se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], DataFrame(coldata), transcript_info$transcript) -# Parameters for writing tables +# Setting parameters for writing tables params <- list( list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") ) -# Check if tx2gene is present in transcript_info +# Process gene-level data if tx2gene mapping is available if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) { tx2gene <- transcript_info$tx2gene gi <- summarizeToGene(txi, tx2gene = tx2gene) gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") - # Get gene information matching the rows in gi gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),] rownames(gene_info) <- gene_info[["tx"]] - # Create dataframe only once for reuse col_data_frame <- DataFrame(coldata) # Create gene-level SummarizedExperiment objects @@ -142,10 +132,10 @@ if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) )) } -# Apply the write_se_table function to each set of parameters -lapply(params, write_se_table) +# Writing tables for each set of parameters +done <- lapply(params, write_se_table) -# Print sessioninfo to standard out +# Output session information and citations citation("tximeta") sessionInfo() From 489bcb4efdc7bd58839b22b0360d26b4d80b87a8 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:43:44 +0000 Subject: [PATCH 03/29] update workflow files for new tximport script --- modules/local/tximport/main.nf | 2 ++ subworkflows/local/quantify_pseudo/main.nf | 14 ++++++++------ tower.yml | 12 ++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/modules/local/tximport/main.nf b/modules/local/tximport/main.nf index 56d826a7d..cd97d139f 100644 --- a/modules/local/tximport/main.nf +++ b/modules/local/tximport/main.nf @@ -16,8 +16,10 @@ process TXIMPORT { path "*gene_counts.tsv" , emit: counts_gene path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled + path "*gene_lengths.tsv" , emit: lengths_gene path "*transcript_tpm.tsv" , emit: tpm_transcript path "*transcript_counts.tsv" , emit: counts_transcript + path "*transcript_lengths.tsv" , emit: lengths_transcript path "versions.yml" , emit: versions when: diff --git a/subworkflows/local/quantify_pseudo/main.nf b/subworkflows/local/quantify_pseudo/main.nf index 886b45e7b..bbb4a7185 100644 --- a/subworkflows/local/quantify_pseudo/main.nf +++ b/subworkflows/local/quantify_pseudo/main.nf @@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT { results = ch_pseudo_results // channel: [ val(meta), results_dir ] multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ] - tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ] - counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ] - counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ] - counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ] - tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ] - counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ] + tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv + counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv + lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv + counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv + counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv + tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv + counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv + lengths_transcript = TXIMPORT.out.lengths_transript // path *transcript_lengths.tsv merged_gene_rds = SE_GENE.out.rds // path: *.rds merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds diff --git a/tower.yml b/tower.yml index 21f821a53..4079a8405 100644 --- a/tower.yml +++ b/tower.yml @@ -4,6 +4,10 @@ reports: "**/star_salmon/**/deseq2.plots.pdf": display: "All samples STAR Salmon DESeq2 QC PDF plots" "**/star_salmon/salmon.merged.gene_counts.tsv": + display: "All samples STAR Salmon mean transcript length per gene" + "**/star_salmon/salmon.merged.gene_lengths.tsv": + display: "All samples STAR Salmon transcript lengths" + "**/star_salmon/salmon.merged.transcript_lengths.tsv": display: "All samples STAR Salmon merged gene raw counts" "**/star_salmon/salmon.merged.gene_tpm.tsv": display: "All samples STAR Salmon merged gene TPM counts" @@ -14,6 +18,10 @@ reports: "**/salmon/**/deseq2.plots.pdf": display: "All samples Salmon DESeq2 QC PDF plots" "**/salmon/salmon.merged.gene_counts.tsv": + display: "All samples Salmon mean transcript length per gene" + "**/salmon/salmon.merged.gene_lengths.tsv": + display: "All samples Salmon transcript lengths" + "**/salmon/salmon.merged.transcript_lengths.tsv": display: "All samples Salmon merged gene raw counts" "**/salmon/salmon.merged.gene_tpm.tsv": display: "All samples Salmon merged gene TPM counts" @@ -24,6 +32,10 @@ reports: "**/kallisto/**/deseq2.plots.pdf": display: "All samples Kallisto DESeq2 QC PDF plots" "**/kallisto/kallisto.merged.gene_counts.tsv": + display: "All samples Kallisto mean transcript length per gene" + "**/kallisto/kallisto.merged.gene_lengths.tsv": + display: "All samples Kallisto transcript lengths" + "**/kallisto/kallisto.merged.transcript_lengths.tsv": display: "All samples Kallisto merged gene raw counts" "**/kallisto/kallisto.merged.gene_tpm.tsv": display: "All samples Kallisto merged gene TPM counts" From a9a831c67b04257a0fa139cfa1c9b66f55de1251 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:47:54 +0000 Subject: [PATCH 04/29] appease eclint --- bin/tximport.r | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/bin/tximport.r b/bin/tximport.r index e81e5a7e6..bf7377f9f 100644 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -10,8 +10,8 @@ library(tximport) # Parsing command line arguments args <- commandArgs(trailingOnly=TRUE) if (length(args) < 4) { - stop("Usage: tximport.r ", - call.=FALSE) + stop("Usage: tximport.r ", + call.=FALSE) } # Assigning command line arguments to variables @@ -21,7 +21,7 @@ prefix <- args[3] quant_type <- args[4] tx2gene_path <- args[5] -## Functions +## Functions # Build a table from a SummarizedExperiment object build_table <- function(se.obj, slot) { @@ -31,7 +31,7 @@ build_table <- function(se.obj, slot) { # Write a table to a file with given parameters write_se_table <- function(params) { file_name <- paste0(prefix, ".", params$suffix) - write.table(build_table(params$obj, params$slot), file_name, + write.table(build_table(params$obj, params$slot), file_name, sep="\t", quote=FALSE, row.names = FALSE) } @@ -40,9 +40,9 @@ read_transcript_info <- function(tinfo_path){ info <- file.info(tinfo_path) if (info$size == 0) { stop("tx2gene file is empty") - } + } - transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, + transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, col.names = c("tx", "gene_id", "gene_name")) extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) @@ -51,8 +51,8 @@ read_transcript_info <- function(tinfo_path){ rownames(transcript_info) <- transcript_info[["tx"]] list(transcript = transcript_info, - gene = unique(transcript_info[,2:3]), - tx2gene = transcript_info[,1:2]) + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) } # Read and process sample/column data from a given path @@ -71,8 +71,8 @@ read_coldata <- function(coldata_path){ # Create a SummarizedExperiment object with given data create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), - colData = col_data, - rowData = row_data) + colData = col_data, + rowData = row_data) } # Main script starts here @@ -93,7 +93,7 @@ coldata <- read_coldata(coldata_path) # Create initial SummarizedExperiment object se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], - DataFrame(coldata), transcript_info$transcript) + DataFrame(coldata), transcript_info$transcript) # Setting parameters for writing tables params <- list( @@ -116,11 +116,12 @@ if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) # Create gene-level SummarizedExperiment objects gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], - col_data_frame, gene_info) + col_data_frame, gene_info) gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], - col_data_frame, gene_info) + col_data_frame, gene_info) gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], - col_data_frame, gene_info) + col_data_frame, gene_info) + params <- c(params, list( list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), From b7066620049a6df0be2e2108f678171c85de1006 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:50:15 +0000 Subject: [PATCH 05/29] appease eclint --- bin/tximport.r | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/tximport.r b/bin/tximport.r index bf7377f9f..528d59e2f 100644 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -51,8 +51,8 @@ read_transcript_info <- function(tinfo_path){ rownames(transcript_info) <- transcript_info[["tx"]] list(transcript = transcript_info, - gene = unique(transcript_info[,2:3]), - tx2gene = transcript_info[,1:2]) + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) } # Read and process sample/column data from a given path From 7533e9f48da38f303bb7f65b4d74cd182bf918b0 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 14:43:01 +0000 Subject: [PATCH 06/29] Ensure pseudoaligner is set if pseudoalignment is not skipped --- nextflow.config | 2 +- workflows/rnaseq.nf | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index d42914210..6a5788e2f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,7 @@ params { // Alignment aligner = 'star_salmon' - pseudo_aligner = null + pseudo_aligner = 'salmon' seq_center = null bam_csi_index = false star_ignore_sjdbgtf = false diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 403194ac2..4547162a9 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -37,7 +37,13 @@ if (!params.skip_bbsplit && !params.bbsplit_index && params.bbsplit_fasta_list) def prepareToolIndices = [] if (!params.skip_bbsplit) { prepareToolIndices << 'bbsplit' } if (!params.skip_alignment) { prepareToolIndices << params.aligner } -if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner } +if (!params.skip_pseudo_alignment) { + if(params.pseudo_aligner) { + prepareToolIndices << params.pseudo_aligner + } else { + exit 1, "--skip_pseudo_alignment not set, but --pseudo_aligner not set" + } +} // Determine whether to filter the GTF or not def filterGtf = From 10d808666bc4b08a2912828cdc2e107e1726b342 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 14:51:40 +0000 Subject: [PATCH 07/29] Update changelog --- CHANGELOG.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4680c417a..c6687990b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-20 + +### Enhancements & fixes + +- [PR #1124](https://github.com/nf-core/rnaseq/pull/1124) - Ensure pseudoaligner is set if pseudoalignment is not skipped + ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 ### Enhancements and fixes -- [[PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix +- [PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix ## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17 From 6bc74b649398c9d2c6576b07879303d087fa9389 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 15:07:34 +0000 Subject: [PATCH 08/29] restore script executability --- bin/tximport.r | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/tximport.r diff --git a/bin/tximport.r b/bin/tximport.r old mode 100644 new mode 100755 From 3e30c263ef306235f78a7f5951af9b8e7bde9639 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 15:20:18 +0000 Subject: [PATCH 09/29] correct typo --- subworkflows/local/quantify_pseudo/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/quantify_pseudo/main.nf b/subworkflows/local/quantify_pseudo/main.nf index bbb4a7185..7887f086f 100644 --- a/subworkflows/local/quantify_pseudo/main.nf +++ b/subworkflows/local/quantify_pseudo/main.nf @@ -86,7 +86,7 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT { counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv - lengths_transcript = TXIMPORT.out.lengths_transript // path *transcript_lengths.tsv + lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv merged_gene_rds = SE_GENE.out.rds // path: *.rds merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds From 736bffac8e015ccba163870aa4c6d741d8a456e1 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 16:18:23 +0000 Subject: [PATCH 10/29] Fix tower yml --- tower.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tower.yml b/tower.yml index 4079a8405..7ce7e2dff 100644 --- a/tower.yml +++ b/tower.yml @@ -4,11 +4,11 @@ reports: "**/star_salmon/**/deseq2.plots.pdf": display: "All samples STAR Salmon DESeq2 QC PDF plots" "**/star_salmon/salmon.merged.gene_counts.tsv": - display: "All samples STAR Salmon mean transcript length per gene" + display: "All samples STAR Salmon merged gene raw counts" "**/star_salmon/salmon.merged.gene_lengths.tsv": - display: "All samples STAR Salmon transcript lengths" + display: "All samples STAR Salmon mean transcript length per gene" "**/star_salmon/salmon.merged.transcript_lengths.tsv": - display: "All samples STAR Salmon merged gene raw counts" + display: "All samples STAR Salmon transcript lengths" "**/star_salmon/salmon.merged.gene_tpm.tsv": display: "All samples STAR Salmon merged gene TPM counts" "**/star_salmon/salmon.merged.transcript_counts.tsv": @@ -18,11 +18,11 @@ reports: "**/salmon/**/deseq2.plots.pdf": display: "All samples Salmon DESeq2 QC PDF plots" "**/salmon/salmon.merged.gene_counts.tsv": - display: "All samples Salmon mean transcript length per gene" + display: "All samples Salmon merged gene raw counts" "**/salmon/salmon.merged.gene_lengths.tsv": - display: "All samples Salmon transcript lengths" + display: "All samples Salmon mean transcript length per gene" "**/salmon/salmon.merged.transcript_lengths.tsv": - display: "All samples Salmon merged gene raw counts" + display: "All samples Salmon transcript lengths" "**/salmon/salmon.merged.gene_tpm.tsv": display: "All samples Salmon merged gene TPM counts" "**/salmon/salmon.merged.transcript_counts.tsv": @@ -32,11 +32,11 @@ reports: "**/kallisto/**/deseq2.plots.pdf": display: "All samples Kallisto DESeq2 QC PDF plots" "**/kallisto/kallisto.merged.gene_counts.tsv": - display: "All samples Kallisto mean transcript length per gene" + display: "All samples Kallisto merged gene raw counts" "**/kallisto/kallisto.merged.gene_lengths.tsv": - display: "All samples Kallisto transcript lengths" + display: "All samples Kallisto mean transcript length per gene" "**/kallisto/kallisto.merged.transcript_lengths.tsv": - display: "All samples Kallisto merged gene raw counts" + display: "All samples Kallisto transcript lengths" "**/kallisto/kallisto.merged.gene_tpm.tsv": display: "All samples Kallisto merged gene TPM counts" "**/kallisto/kallisto.merged.transcript_counts.tsv": From 279fd121ae0264282aa0bde0826b37e64de131de Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 16:21:48 +0000 Subject: [PATCH 11/29] [skip ci] bump version in config --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 6a5788e2f..41bcb45d3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -317,7 +317,7 @@ manifest { description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control.""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '3.13.1' + version = '3.13.2' doi = 'https://doi.org/10.5281/zenodo.1400710' } From 8ed06f36c9702d82dd6979086caa64d44c61f0d2 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 16:51:43 +0000 Subject: [PATCH 12/29] Move release to minor version, include tximport PR --- CHANGELOG.md | 3 ++- nextflow.config | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6687990b..7f713bf2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-20 +## [[3.14.0](https://github.com/nf-core/rnaseq/releases/tag/3.14.0)] - 2023-11-21 ### Enhancements & fixes +- [PR #1123](https://github.com/nf-core/rnaseq/pull/1123) - Overhaul tximport.r, output length tables - [PR #1124](https://github.com/nf-core/rnaseq/pull/1124) - Ensure pseudoaligner is set if pseudoalignment is not skipped ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 diff --git a/nextflow.config b/nextflow.config index 41bcb45d3..933f48cf4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -317,7 +317,7 @@ manifest { description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control.""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '3.13.2' + version = '3.14.0' doi = 'https://doi.org/10.5281/zenodo.1400710' } From 8615a86bcd49c097c2b2cc680742a38d12019fe5 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 17:38:26 +0000 Subject: [PATCH 13/29] [skip ci] Back to patch release --- CHANGELOG.md | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f713bf2d..7b85720a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[3.14.0](https://github.com/nf-core/rnaseq/releases/tag/3.14.0)] - 2023-11-21 +## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-21 ### Enhancements & fixes diff --git a/nextflow.config b/nextflow.config index 933f48cf4..41bcb45d3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -317,7 +317,7 @@ manifest { description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control.""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '3.14.0' + version = '3.13.2' doi = 'https://doi.org/10.5281/zenodo.1400710' } From 249c2a9b3c601b3f2b6fd8cdcd328b4b3b9c1347 Mon Sep 17 00:00:00 2001 From: Regina Reynolds Date: Mon, 20 Nov 2023 18:28:20 +0000 Subject: [PATCH 14/29] fix: remove GTF_FILTER.out.versions --- subworkflows/local/prepare_genome/main.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 0be947954..0840c7734 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -47,7 +47,7 @@ workflow PREPARE_GENOME { rsem_index // directory: /path/to/rsem/index/ salmon_index // directory: /path/to/salmon/index/ kallisto_index // directory: /path/to/kallisto/index/ - hisat2_index // directory: /path/to/hisat2/index/ + hisat2_index // directory: /path/to/hisat2/index/ bbsplit_index // directory: /path/to/rsem/index/ gencode // boolean: whether the genome is from GENCODE is_aws_igenome // boolean: whether the genome files are from AWS iGenomes @@ -139,14 +139,13 @@ workflow PREPARE_GENOME { } else { ch_transcript_fasta = Channel.value(file(transcript_fasta)) } - if (gencode) { + if (gencode) { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta ) ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) } } else { ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta - ch_versions = ch_versions.mix(GTF_FILTER.out.versions) ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } @@ -268,7 +267,7 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) } } - + // // Uncompress Kallisto index or generate from scratch if required // From 5c50f7d8209a77f1b38c9f80c93add7ba58584ec Mon Sep 17 00:00:00 2001 From: Regina Reynolds Date: Mon, 20 Nov 2023 18:40:00 +0000 Subject: [PATCH 15/29] docs: update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4680c417a..eb5917a71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements and fixes - [[PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix +- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true (issue [#1125](https://github.com/nf-core/rnaseq/issues/1125)) ## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17 From 1398f5c63921e1c9bf457937308c1737b1c81bfc Mon Sep 17 00:00:00 2001 From: Regina Reynolds Date: Mon, 20 Nov 2023 18:41:16 +0000 Subject: [PATCH 16/29] docs: move update to dev --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb5917a71..9f90f2c37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## dev + +### Enhancements and fixes +- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true (issue [#1125](https://github.com/nf-core/rnaseq/issues/1125)) + ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 ### Enhancements and fixes - [[PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix -- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true (issue [#1125](https://github.com/nf-core/rnaseq/issues/1125)) ## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17 From 0a675cf585906c152c8c44514188ff24644a9a17 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 18:53:29 +0000 Subject: [PATCH 17/29] Fix error message --- bin/filter_gtf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 265250627..b4a662cad 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -30,7 +30,7 @@ def tab_delimited(file: str) -> float: def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: - raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") + raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.") seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") From 317e2e0206ac2f16f504fddc55553749cbce03a6 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 19:01:10 +0000 Subject: [PATCH 18/29] Fix CHANGELOG.md --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f90f2c37..9cf44327d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,13 +6,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## dev ### Enhancements and fixes -- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true (issue [#1125](https://github.com/nf-core/rnaseq/issues/1125)) +- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true +- [#1125](https://github.com/nf-core/rnaseq/issues/1125) - Pipeline fails if transcript_fasta not provided and skip_gtf_filter = true ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 ### Enhancements and fixes -- [[PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix +- [PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix ## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17 From 5e4293090651589c8705db7be85ad88288fa741e Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Mon, 20 Nov 2023 19:02:43 +0000 Subject: [PATCH 19/29] [automated] Fix linting with Prettier --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cf44327d..38d6bfb30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## dev ### Enhancements and fixes + - [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true - [#1125](https://github.com/nf-core/rnaseq/issues/1125) - Pipeline fails if transcript_fasta not provided and skip_gtf_filter = true From e1a5cca1a80bd170f3ec01e543002bcff7786cbf Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Mon, 20 Nov 2023 19:12:22 +0000 Subject: [PATCH 20/29] [automated] Fix linting with Prettier --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de0d10fec..91063f2fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - ## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-21 ### Enhancements & fixes From 89666a475d02a8dd68ab1cc5d681bbd84d59f9b1 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 19:16:22 +0000 Subject: [PATCH 21/29] Add credits --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 91063f2fd..ac5484688 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-21 +### Credits + +Special thanks to the following for their contributions to the release: + +- [Jonathan Manning](https://github.com/pinin4fjords) +- [Regina Hertfelder Reynolds](https://github.com/RHReynolds) +- [Matthias Zepper](https://github.com/MatthiasZepper) + ### Enhancements & fixes - [PR #1123](https://github.com/nf-core/rnaseq/pull/1123) - Overhaul tximport.r, output length tables From 4e478649368762b57e99cd626f35229ed2dbe32b Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 20 Nov 2023 20:51:19 +0100 Subject: [PATCH 22/29] Enlarge the sampling range for column determination in FilterGTF script. --- CHANGELOG.md | 5 +++-- bin/filter_gtf.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38d6bfb30..72092ef14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements and fixes -- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true -- [#1125](https://github.com/nf-core/rnaseq/issues/1125) - Pipeline fails if transcript_fasta not provided and skip_gtf_filter = true +- [[#1125](https://github.com/nf-core/rnaseq/issues/1125)][[#1126](https://github.com/nf-core/rnaseq/pull/1126)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. +- [[#1127](https://github.com/nf-core/rnaseq/pull/)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. + ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 265250627..b7b4e972c 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -23,14 +23,14 @@ def extract_fasta_seq_names(fasta_name: str) -> Set[str]: def tab_delimited(file: str) -> float: """Check if file is tab-delimited and return median number of tabs.""" with open(file, "r") as f: - data = f.read(1024) + data = f.read(102400) return statistics.median(line.count("\t") for line in data.split("\n")) def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: - raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") + raise ValueError("Invalid GTF file: Expected nine tab-separated columns.") seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") From 0b2da651d96691f274bea6e55a2360cb7d8e0b46 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 20 Nov 2023 21:00:15 +0100 Subject: [PATCH 23/29] Prettier on Markdown documents. --- CHANGELOG.md | 1 - docs/usage.md | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72092ef14..11e4ee828 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#1125](https://github.com/nf-core/rnaseq/issues/1125)][[#1126](https://github.com/nf-core/rnaseq/pull/1126)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. - [[#1127](https://github.com/nf-core/rnaseq/pull/)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. - ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 ### Enhancements and fixes diff --git a/docs/usage.md b/docs/usage.md index 524acba9b..894f66f32 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,9 +93,9 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identic #### Examples: -| UMI type | Source | Pipeline parameters | -| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | +| UMI type | Source | Pipeline parameters | +| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | | In sequence | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{6})(?P.{4}).*"` | | In sequence | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)
> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{12}).*"`
Optional: `--clip_r2 9 --three_prime_clip_r2 12` | | In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P.{8})(?P.{6}).*"` | From 6b85d1e9913af4d687beb981505abf07276ccb7e Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 20 Nov 2023 21:07:08 +0100 Subject: [PATCH 24/29] New version of Prettier, new changes to Markdown. Love it....not. --- docs/usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 894f66f32..524acba9b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,9 +93,9 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identic #### Examples: -| UMI type | Source | Pipeline parameters | -| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | +| UMI type | Source | Pipeline parameters | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | | In sequence | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{6})(?P.{4}).*"` | | In sequence | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)
> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{12}).*"`
Optional: `--clip_r2 9 --three_prime_clip_r2 12` | | In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P.{8})(?P.{6}).*"` | From 9250c4c26b330b3ce038e231c94bbfe2b4b2267a Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 21:04:00 +0000 Subject: [PATCH 25/29] Update CHANGELOG.md --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11e4ee828..900c3718c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements and fixes -- [[#1125](https://github.com/nf-core/rnaseq/issues/1125)][[#1126](https://github.com/nf-core/rnaseq/pull/1126)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. -- [[#1127](https://github.com/nf-core/rnaseq/pull/)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. +- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126)] [[#1125](https://github.com/nf-core/rnaseq/issues/1125)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. +- [[PR #1127](https://github.com/nf-core/rnaseq/pull/1127)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 From 1bad8d5c7b48f24a2e9a08b28c1dce48477dbc1b Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Nov 2023 09:14:59 +0000 Subject: [PATCH 26/29] [skip ci] Add Jon to script credits --- bin/tximport.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tximport.r b/bin/tximport.r index 528d59e2f..a540498d4 100755 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript # Script for importing and processing transcript-level quantifications. -# Written by Lorena Pantano and released under the MIT license. +# Written by Lorena Pantano, later modified by Jonathan Manning, and released under the MIT license. # Loading required libraries library(SummarizedExperiment) From 56edb1cd70d9a0ebe067797acff75b690148418f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Nov 2023 09:17:24 +0000 Subject: [PATCH 27/29] actually don't skip ci --- bin/tximport.r | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/tximport.r b/bin/tximport.r index a540498d4..59a8fcd9e 100755 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -1,7 +1,8 @@ #!/usr/bin/env Rscript # Script for importing and processing transcript-level quantifications. -# Written by Lorena Pantano, later modified by Jonathan Manning, and released under the MIT license. +# Written by Lorena Pantano, later modified by Jonathan Manning, and released +# under the MIT license. # Loading required libraries library(SummarizedExperiment) From 36c420413355109517e47ca2a5e1fed619de112e Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 21 Nov 2023 10:39:37 +0100 Subject: [PATCH 28/29] [skipci] Update CHANGELOG.md --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ee05f00e..6ef0b84d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,8 +17,8 @@ Special thanks to the following for their contributions to the release: - [PR #1123](https://github.com/nf-core/rnaseq/pull/1123) - Overhaul tximport.r, output length tables - [PR #1124](https://github.com/nf-core/rnaseq/pull/1124) - Ensure pseudoaligner is set if pseudoalignment is not skipped -- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126)] [[#1125](https://github.com/nf-core/rnaseq/issues/1125)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. -- [[PR #1127](https://github.com/nf-core/rnaseq/pull/1127)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. +- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. +- [PR #1127](https://github.com/nf-core/rnaseq/pull/1127) - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 From cfd9270b6e42a27e3175b1635ec9233d74652e07 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 21 Nov 2023 10:15:52 +0000 Subject: [PATCH 29/29] Always check --skip_pseudo_alignment and --pseudo_aligner are set --- conf/modules.config | 3 +-- nextflow.config | 2 +- workflows/rnaseq.nf | 13 +++---------- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6e28f549b..efef36974 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1087,7 +1087,6 @@ if (!params.skip_multiqc) { if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { process { - withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' { ext.args = { params.extra_salmon_quant_args ?: '' } publishDir = [ @@ -1112,7 +1111,7 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'kallisto') { } } -if (!params.skip_pseudo_alignment) { +if (!params.skip_pseudo_alignment && params.pseudo_aligner) { process { withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TX2GENE' { publishDir = [ diff --git a/nextflow.config b/nextflow.config index 41bcb45d3..89f72ec5b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,7 @@ params { // Alignment aligner = 'star_salmon' - pseudo_aligner = 'salmon' + pseudo_aligner = null seq_center = null bam_csi_index = false star_ignore_sjdbgtf = false diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 4547162a9..f8def2d10 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -37,13 +37,7 @@ if (!params.skip_bbsplit && !params.bbsplit_index && params.bbsplit_fasta_list) def prepareToolIndices = [] if (!params.skip_bbsplit) { prepareToolIndices << 'bbsplit' } if (!params.skip_alignment) { prepareToolIndices << params.aligner } -if (!params.skip_pseudo_alignment) { - if(params.pseudo_aligner) { - prepareToolIndices << params.pseudo_aligner - } else { - exit 1, "--skip_pseudo_alignment not set, but --pseudo_aligner not set" - } -} +if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner } // Determine whether to filter the GTF or not def filterGtf = @@ -822,9 +816,8 @@ workflow RNASEQ { // ch_pseudo_multiqc = Channel.empty() ch_pseudoaligner_pca_multiqc = Channel.empty() - ch_pseudoaligner_clustering_multiqc = Channel.empty() - - if (!params.skip_pseudo_alignment) { + ch_pseudoaligner_clustering_multiqc = Channel.empty() + if (!params.skip_pseudo_alignment && params.pseudo_aligner) { if (params.pseudo_aligner == 'salmon') { ch_pseudo_index = PREPARE_GENOME.out.salmon_index