Merge pull request #128 from drpatelh/dev

Add plots for consensus QC
nf-core · Jun 23, 2020 · 1333186 · 1333186
2 parents 466cf2c + eca1bdc
commit 1333186
Show file tree

Hide file tree

Showing 3 changed files with 211 additions and 8 deletions.
diff --git a/bin/plot_base_density.r b/bin/plot_base_density.r
@@ -0,0 +1,164 @@
+#!/usr/bin/env Rscript
+
+################################################
+################################################
+## LOAD LIBRARIES                             ##
+################################################
+################################################
+
+library(optparse)
+library(ggplot2)
+library(scales)
+library(reshape2)
+library(Biostrings)
+
+################################################
+################################################
+## VALIDATE COMMAND-LINE PARAMETERS           ##
+################################################
+################################################
+
+option_list <- list(make_option(c("-i", "--fasta_files"), type="character", default=NULL, help="Comma-separated list of fasta files", metavar="fasta_files"),
+                    make_option(c("-s", "--prefixes"), type="character", default=NULL, help="Comma-separated list of prefixes associated with fasta files to add to plots. Must be unique and in same order as fasta file input.", metavar="prefixes"),
+                    make_option(c("-o", "--output_dir"), type="character", default='./', help="Output directory", metavar="path"))
+
+opt_parser <- OptionParser(option_list=option_list)
+opt <- parse_args(opt_parser)
+
+## Check input files
+INPUT_FILES <- unique(unlist(strsplit(opt$fasta_files,",")))
+if (length(INPUT_FILES) == 0) {
+    print_help(opt_parser)
+    stop("At least one input file must be supplied", call.=FALSE)
+}
+if (!all(file.exists(INPUT_FILES))) {
+    stop(paste("The following input files don't exist:",paste(INPUT_FILES[!file.exists(INPUT_FILES)], sep='', collapse=' '), sep=' '), call.=FALSE)
+}
+
+## Check prefixes for input files
+PREFIXES <- basename(INPUT_FILES)
+if (!is.null(opt$prefixes)){
+    PREFIXES <- unique(unlist(strsplit(opt$prefixes,",")))
+    if (length(INPUT_FILES) != length(PREFIXES)) {
+        print_help(opt_parser)
+        stop("Please provide a unique prefix for each fasta file.", call.=FALSE)
+    }
+}
+
+## Check the output directory has a trailing slash, if not add one
+OUTDIR <- opt$output_dir
+if (tail(strsplit(OUTDIR,"")[[1]],1)!="/") {
+    OUTDIR <- paste(OUTDIR,"/",sep='')
+}
+## Create the directory if it doesn't already exist.
+if (!file.exists(OUTDIR)) {
+  dir.create(OUTDIR, recursive=TRUE)
+}
+
+################################################
+################################################
+## READ IN DATA                               ##
+################################################
+################################################
+
+dat <- NULL
+for (input_file in INPUT_FILES) {
+    dat <- c(dat,readDNAStringSet(input_file)[1])
+}
+
+################################################
+################################################
+## PLOTS                                      ##
+################################################
+################################################
+
+bases_std <- c("A","C","T","G")
+base_cols <- c("A" = "#009E73",
+               "C" = "#0072B2",
+               "T" = "#D55E00",
+               "G" = "#000000",
+               "N" = "#E69F00",
+               "X" = "#999999")
+
+for (idx in 1:length(dat)) {
+
+    ## Table of base counts
+    base_seq <- strsplit(toString(dat[[idx]]), "")[[1]]
+    base_tab <- data.frame(table(base_seq), stringsAsFactors=FALSE)
+    colnames(base_tab) <- c("base","freq")
+    rownames(base_tab) <- base_tab$base
+    for (base in 1:length(bases_std)) {
+        if (!any(base_tab$base %in% bases_std[base])) {
+            base_tab <- rbind(base_tab,c(bases_std[base],0))
+        }
+    }
+    base_tab$perc <- 100 *base_tab$freq / sum(base_tab$freq)
+    base_tab <- base_tab[order(base_tab$base, decreasing=FALSE),]
+    base_tab <- rbind(base_tab[c(bases_std, "N"),], base_tab[!rownames(base_tab) %in% c(bases_std, "N"),])
+    base_tab$base <- factor(base_tab$base, levels=rownames(base_tab))
+    outfile <- paste(OUTDIR, PREFIXES[idx], ".base_counts.tsv", sep='')
+    write.table(base_tab, file=outfile, col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
+
+    ## Barplot of base frequencies
+    barplot <- ggplot(base_tab, aes(x=base,y=perc)) +
+               geom_bar(stat="identity") +
+               theme_classic() +
+               scale_y_continuous(limits=c(0,100),breaks=c(0,25,50,75,100)) +
+               ylab("% Observed") +
+               xlab("Base") +
+               ggtitle(PREFIXES[idx])
+    outfile <- paste(OUTDIR, PREFIXES[idx], ".base_counts.pdf", sep='')
+    ggsave(file=outfile, barplot, width=12, height=10, units="cm")
+
+    ## Create a data frame of base coverage
+    bases <- unique(c(bases_std,"N",unique(base_seq)))
+    base_dat <- data.frame(sample=names(dat[[idx]])[1], position=1:length(base_seq), stringsAsFactors=FALSE)
+    for (base in 1:length(bases)) {
+        base_dat[,bases[base]] <- as.numeric(base_seq==bases[base])
+    }
+
+    ## Stretches of N's
+    N_rle <- Rle(base_dat[,"N"])
+    N_dat <- data.frame(start=cumsum(runLength(N_rle))[runValue(N_rle)==1], width=runLength(N_rle)[runValue(N_rle)==1])
+    outfile <- paste(OUTDIR, PREFIXES[idx], ".N_run.tsv", sep='')
+    write.table(N_dat, file=outfile, col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
+
+    ## Running mean of bp density for standard bases
+    run_k <- 1001
+    run_dat <- base_dat[,c("sample", "position", bases_std)]
+    for (base in bases_std) {
+        run_dat[,base] <- as.numeric(runmean(Rle(base_dat[,base]), k=run_k, endrule="constant"))
+    }
+    run_dat <- melt(run_dat, c(1,2))
+    colnames(run_dat)[3] <- "base"
+    run_dat$position <- run_dat$position/1000
+    lineplot <- ggplot(run_dat,aes(x=position, y=value, colour=base)) +
+                geom_line() +
+                theme_classic() +
+                theme(panel.border=element_rect(colour="black", fill=NA, size=1)) +
+                scale_y_continuous(breaks=c(0,0.25,0.50,0.75,1)) +
+                xlab("Position (Kb)") +
+                ylab(paste("Base density (running mean k=",run_k,")", sep='')) +
+                ggtitle(PREFIXES[idx]) +
+                scale_colour_manual(values=base_cols)
+    outfile <- paste(OUTDIR, PREFIXES[idx], ".ACTG_density.pdf", sep='')
+    ggsave(file=outfile, lineplot, width=18, height=10, units="cm")
+
+    ## Single base density plots, nucleotide resolution.
+    bases_other <- bases[!bases %in% bases_std]
+    for (obase in bases_other) {
+        plot_dat  <- base_dat[,c("sample", "position", obase)]
+        colnames(plot_dat)[3] <- "base"
+        plot_col <- ifelse(obase=="N", base_cols[["N"]], base_cols[["X"]])
+        lineplot <- ggplot(plot_dat, aes(x=position/1000, y=base)) +
+                    geom_line(colour=plot_col) +
+                    theme_classic() +
+                    theme(legend.position="none", panel.border=element_rect(colour="black", fill=NA, size=1)) +
+                    scale_y_continuous(breaks=c(0,1), labels=c(0,1)) +
+                    xlab("Position (Kb)") +
+                    ylab(paste(obase,"density", sep=' ')) +
+                    ggtitle(PREFIXES[idx])
+      outfile <- paste(OUTDIR, PREFIXES[idx], ".", obase, "_density.pdf", sep='')
+      ggsave(file=outfile, lineplot, width=18, height=10, units="cm")
+    }
+}
diff --git a/docs/output.md b/docs/output.md
@@ -271,11 +271,17 @@ Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-
 * `variants/varscan2/consensus/`
     * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.fa`: Consensus Fasta file generated by integrating the high frequency variants called by VarScan into the reference genome.
     * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.masked.fa`: Masked consensus Fasta file.
-* `variants/varscan2/log/`
-    * `<SAMPLE>.varscan2.log`: Log file generated from stderr by VarScan 2.
+* `variants/varscan2/consensus/base_qc/`
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.
 * `variants/varscan2/bcftools_stats/`
     * `<SAMPLE>.bcftools_stats.txt`: Statistics and counts obtained from low frequency variants VCF file.
     * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.bcftools_stats.txt`: Statistics and counts obtained from high frequency variants VCF file.
+* `variants/varscan2/log/`
+    * `<SAMPLE>.varscan2.log`: Log file generated from stderr by VarScan 2.
 * `variants/bam/mpileup/`
     * `<SAMPLE>.<SUFFIX>.mpileup`: mpileup files summarize all the data from aligned reads at a given genomic position. Each row of the mpileup file gives similar information to a single vertical column of reads as visualised in IGV.
 
@@ -302,6 +308,12 @@ Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-
 * `variants/ivar/consensus/`
     * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.fa`: Consensus Fasta file generated by iVar at the frequency threshold set by the `--max_allele_freq` parameter.
     * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.qual.txt`: File with the average quality of each base in the consensus sequence.
+* `variants/ivar/consensus/base_qc/`
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
+    * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.
 * `variants/ivar/log/`
     * `<SAMPLE>.variant.counts.log`: Variant counts for low frequency variants.
     * `<SAMPLE>.AF<MAX_ALLELE_FREQ>.variant.counts.log`: Variant counts for high frequency variants.
@@ -326,6 +338,12 @@ Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-
 * `variants/bcftools/consensus/`
     * `<SAMPLE>.consensus.fa`: Consensus Fasta file generated by integrating the variants called by BCFTools into the reference genome.
     * `<SAMPLE>.consensus.masked.fa`: Masked consensus Fasta file.
+* `variants/bcftools/consensus/base_qc/`
+    * `<SAMPLE>.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
+    * `<SAMPLE>.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
+    * `<SAMPLE>.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
+    * `<SAMPLE>.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
+    * `<SAMPLE>.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.
 * `variants/bcftools/bcftools_stats/`
     * `<SAMPLE>.bcftools_stats.txt`: Statistics and counts obtained from VCF file.
 

diff --git a/main.nf b/main.nf
@@ -1363,7 +1363,12 @@ process VARSCAN2 {
 process VARSCAN2_CONSENSUS {
     tag "$sample"
     label 'process_medium'
-    publishDir "${params.outdir}/variants/varscan2/consensus", mode: params.publish_dir_mode
+    publishDir "${params.outdir}/variants/varscan2/consensus", mode: params.publish_dir_mode,
+        saveAs: { filename ->
+                      if (filename.endsWith(".tsv")) "base_qc/$filename"
+                      else if (filename.endsWith(".pdf")) "base_qc/$filename"
+                      else filename
+                }
 
     when:
     !params.skip_variants && 'varscan2' in callers
@@ -1374,7 +1379,7 @@ process VARSCAN2_CONSENSUS {
 
     output:
     tuple val(sample), val(single_end), path("*consensus.masked.fa") into ch_varscan2_consensus
-    path "*consensus.fa"
+    path "*.{consensus.fa,tsv,pdf}"
 
     script:
     prefix = "${sample}.AF${params.max_allele_freq}"
@@ -1393,6 +1398,8 @@ process VARSCAN2_CONSENSUS {
         -fo ${prefix}.consensus.masked.fa
     header=\$(head -n 1 ${prefix}.consensus.masked.fa | sed 's/>//g')
     sed -i "s/\${header}/${sample}/g" ${prefix}.consensus.masked.fa
+
+    plot_base_density.r --fasta_files ${prefix}.consensus.masked.fa --prefixes $prefix --output_dir ./
     """
 }
 
@@ -1565,7 +1572,12 @@ process IVAR_VARIANTS {
 process IVAR_CONSENSUS {
     tag "$sample"
     label 'process_medium'
-    publishDir "${params.outdir}/variants/ivar/consensus", mode: params.publish_dir_mode
+    publishDir "${params.outdir}/variants/ivar/consensus", mode: params.publish_dir_mode,
+        saveAs: { filename ->
+                      if (filename.endsWith(".tsv")) "base_qc/$filename"
+                      else if (filename.endsWith(".pdf")) "base_qc/$filename"
+                      else filename
+                }
 
     when:
     !params.skip_variants && 'ivar' in callers
@@ -1576,14 +1588,16 @@ process IVAR_CONSENSUS {
 
     output:
     tuple val(sample), val(single_end), path("*.fa") into ch_ivar_consensus
-    path "*.txt"
+    path "*.{txt,tsv,pdf}"
 
     script:
     prefix = "${sample}.AF${params.max_allele_freq}"
     """
     cat $mpileup | ivar consensus -q $params.min_base_qual -t $params.max_allele_freq -m $params.min_coverage -n N -p ${prefix}.consensus
     header=\$(head -n1 ${prefix}.consensus.fa | sed 's/>//g')
     sed -i "s/\${header}/${sample}/g" ${prefix}.consensus.fa
+
+    plot_base_density.r --fasta_files ${prefix}.consensus.fa --prefixes $prefix --output_dir ./
     """
 }
 
@@ -1748,7 +1762,12 @@ process BCFTOOLS_VARIANTS {
 process BCFTOOLS_CONSENSUS {
     tag "$sample"
     label 'process_medium'
-    publishDir "${params.outdir}/variants/bcftools/consensus", mode: params.publish_dir_mode
+    publishDir "${params.outdir}/variants/bcftools/consensus", mode: params.publish_dir_mode,
+        saveAs: { filename ->
+                      if (filename.endsWith(".tsv")) "base_qc/$filename"
+                      else if (filename.endsWith(".pdf")) "base_qc/$filename"
+                      else filename
+                }
 
     when:
     !params.skip_variants && 'bcftools' in callers
@@ -1759,7 +1778,7 @@ process BCFTOOLS_CONSENSUS {
 
     output:
     tuple val(sample), val(single_end), path("*consensus.masked.fa") into ch_bcftools_consensus_masked
-    path "*consensus.fa"
+    path "*.{consensus.fa,tsv,pdf}"
 
     script:
     """
@@ -1778,6 +1797,8 @@ process BCFTOOLS_CONSENSUS {
     sed -i 's/${index_base}/${sample}/g' ${sample}.consensus.masked.fa
     header=\$(head -n1 ${sample}.consensus.masked.fa | sed 's/>//g')
     sed -i "s/\${header}/${sample}/g" ${sample}.consensus.masked.fa
+
+    plot_base_density.r --fasta_files ${sample}.consensus.masked.fa --prefixes $sample --output_dir ./
     """
 }