Skip to content

Commit

Permalink
Merge pull request #128 from drpatelh/dev
Browse files Browse the repository at this point in the history
Add plots for consensus QC
  • Loading branch information
drpatelh committed Jun 23, 2020
2 parents 466cf2c + eca1bdc commit 1333186
Show file tree
Hide file tree
Showing 3 changed files with 211 additions and 8 deletions.
164 changes: 164 additions & 0 deletions bin/plot_base_density.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env Rscript

################################################
################################################
## LOAD LIBRARIES ##
################################################
################################################

library(optparse)
library(ggplot2)
library(scales)
library(reshape2)
library(Biostrings)

################################################
################################################
## VALIDATE COMMAND-LINE PARAMETERS ##
################################################
################################################

option_list <- list(make_option(c("-i", "--fasta_files"), type="character", default=NULL, help="Comma-separated list of fasta files", metavar="fasta_files"),
make_option(c("-s", "--prefixes"), type="character", default=NULL, help="Comma-separated list of prefixes associated with fasta files to add to plots. Must be unique and in same order as fasta file input.", metavar="prefixes"),
make_option(c("-o", "--output_dir"), type="character", default='./', help="Output directory", metavar="path"))

opt_parser <- OptionParser(option_list=option_list)
opt <- parse_args(opt_parser)

## Check input files
INPUT_FILES <- unique(unlist(strsplit(opt$fasta_files,",")))
if (length(INPUT_FILES) == 0) {
print_help(opt_parser)
stop("At least one input file must be supplied", call.=FALSE)
}
if (!all(file.exists(INPUT_FILES))) {
stop(paste("The following input files don't exist:",paste(INPUT_FILES[!file.exists(INPUT_FILES)], sep='', collapse=' '), sep=' '), call.=FALSE)
}

## Check prefixes for input files
PREFIXES <- basename(INPUT_FILES)
if (!is.null(opt$prefixes)){
PREFIXES <- unique(unlist(strsplit(opt$prefixes,",")))
if (length(INPUT_FILES) != length(PREFIXES)) {
print_help(opt_parser)
stop("Please provide a unique prefix for each fasta file.", call.=FALSE)
}
}

## Check the output directory has a trailing slash, if not add one
OUTDIR <- opt$output_dir
if (tail(strsplit(OUTDIR,"")[[1]],1)!="/") {
OUTDIR <- paste(OUTDIR,"/",sep='')
}
## Create the directory if it doesn't already exist.
if (!file.exists(OUTDIR)) {
dir.create(OUTDIR, recursive=TRUE)
}

################################################
################################################
## READ IN DATA ##
################################################
################################################

dat <- NULL
for (input_file in INPUT_FILES) {
dat <- c(dat,readDNAStringSet(input_file)[1])
}

################################################
################################################
## PLOTS ##
################################################
################################################

bases_std <- c("A","C","T","G")
base_cols <- c("A" = "#009E73",
"C" = "#0072B2",
"T" = "#D55E00",
"G" = "#000000",
"N" = "#E69F00",
"X" = "#999999")

for (idx in 1:length(dat)) {

## Table of base counts
base_seq <- strsplit(toString(dat[[idx]]), "")[[1]]
base_tab <- data.frame(table(base_seq), stringsAsFactors=FALSE)
colnames(base_tab) <- c("base","freq")
rownames(base_tab) <- base_tab$base
for (base in 1:length(bases_std)) {
if (!any(base_tab$base %in% bases_std[base])) {
base_tab <- rbind(base_tab,c(bases_std[base],0))
}
}
base_tab$perc <- 100 *base_tab$freq / sum(base_tab$freq)
base_tab <- base_tab[order(base_tab$base, decreasing=FALSE),]
base_tab <- rbind(base_tab[c(bases_std, "N"),], base_tab[!rownames(base_tab) %in% c(bases_std, "N"),])
base_tab$base <- factor(base_tab$base, levels=rownames(base_tab))
outfile <- paste(OUTDIR, PREFIXES[idx], ".base_counts.tsv", sep='')
write.table(base_tab, file=outfile, col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)

## Barplot of base frequencies
barplot <- ggplot(base_tab, aes(x=base,y=perc)) +
geom_bar(stat="identity") +
theme_classic() +
scale_y_continuous(limits=c(0,100),breaks=c(0,25,50,75,100)) +
ylab("% Observed") +
xlab("Base") +
ggtitle(PREFIXES[idx])
outfile <- paste(OUTDIR, PREFIXES[idx], ".base_counts.pdf", sep='')
ggsave(file=outfile, barplot, width=12, height=10, units="cm")

## Create a data frame of base coverage
bases <- unique(c(bases_std,"N",unique(base_seq)))
base_dat <- data.frame(sample=names(dat[[idx]])[1], position=1:length(base_seq), stringsAsFactors=FALSE)
for (base in 1:length(bases)) {
base_dat[,bases[base]] <- as.numeric(base_seq==bases[base])
}

## Stretches of N's
N_rle <- Rle(base_dat[,"N"])
N_dat <- data.frame(start=cumsum(runLength(N_rle))[runValue(N_rle)==1], width=runLength(N_rle)[runValue(N_rle)==1])
outfile <- paste(OUTDIR, PREFIXES[idx], ".N_run.tsv", sep='')
write.table(N_dat, file=outfile, col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)

## Running mean of bp density for standard bases
run_k <- 1001
run_dat <- base_dat[,c("sample", "position", bases_std)]
for (base in bases_std) {
run_dat[,base] <- as.numeric(runmean(Rle(base_dat[,base]), k=run_k, endrule="constant"))
}
run_dat <- melt(run_dat, c(1,2))
colnames(run_dat)[3] <- "base"
run_dat$position <- run_dat$position/1000
lineplot <- ggplot(run_dat,aes(x=position, y=value, colour=base)) +
geom_line() +
theme_classic() +
theme(panel.border=element_rect(colour="black", fill=NA, size=1)) +
scale_y_continuous(breaks=c(0,0.25,0.50,0.75,1)) +
xlab("Position (Kb)") +
ylab(paste("Base density (running mean k=",run_k,")", sep='')) +
ggtitle(PREFIXES[idx]) +
scale_colour_manual(values=base_cols)
outfile <- paste(OUTDIR, PREFIXES[idx], ".ACTG_density.pdf", sep='')
ggsave(file=outfile, lineplot, width=18, height=10, units="cm")

## Single base density plots, nucleotide resolution.
bases_other <- bases[!bases %in% bases_std]
for (obase in bases_other) {
plot_dat <- base_dat[,c("sample", "position", obase)]
colnames(plot_dat)[3] <- "base"
plot_col <- ifelse(obase=="N", base_cols[["N"]], base_cols[["X"]])
lineplot <- ggplot(plot_dat, aes(x=position/1000, y=base)) +
geom_line(colour=plot_col) +
theme_classic() +
theme(legend.position="none", panel.border=element_rect(colour="black", fill=NA, size=1)) +
scale_y_continuous(breaks=c(0,1), labels=c(0,1)) +
xlab("Position (Kb)") +
ylab(paste(obase,"density", sep=' ')) +
ggtitle(PREFIXES[idx])
outfile <- paste(OUTDIR, PREFIXES[idx], ".", obase, "_density.pdf", sep='')
ggsave(file=outfile, lineplot, width=18, height=10, units="cm")
}
}
22 changes: 20 additions & 2 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -271,11 +271,17 @@ Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-
* `variants/varscan2/consensus/`
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.fa`: Consensus Fasta file generated by integrating the high frequency variants called by VarScan into the reference genome.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.masked.fa`: Masked consensus Fasta file.
* `variants/varscan2/log/`
* `<SAMPLE>.varscan2.log`: Log file generated from stderr by VarScan 2.
* `variants/varscan2/consensus/base_qc/`
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.
* `variants/varscan2/bcftools_stats/`
* `<SAMPLE>.bcftools_stats.txt`: Statistics and counts obtained from low frequency variants VCF file.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.bcftools_stats.txt`: Statistics and counts obtained from high frequency variants VCF file.
* `variants/varscan2/log/`
* `<SAMPLE>.varscan2.log`: Log file generated from stderr by VarScan 2.
* `variants/bam/mpileup/`
* `<SAMPLE>.<SUFFIX>.mpileup`: mpileup files summarize all the data from aligned reads at a given genomic position. Each row of the mpileup file gives similar information to a single vertical column of reads as visualised in IGV.

Expand All @@ -302,6 +308,12 @@ Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-
* `variants/ivar/consensus/`
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.fa`: Consensus Fasta file generated by iVar at the frequency threshold set by the `--max_allele_freq` parameter.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.consensus.qual.txt`: File with the average quality of each base in the consensus sequence.
* `variants/ivar/consensus/base_qc/`
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.
* `variants/ivar/log/`
* `<SAMPLE>.variant.counts.log`: Variant counts for low frequency variants.
* `<SAMPLE>.AF<MAX_ALLELE_FREQ>.variant.counts.log`: Variant counts for high frequency variants.
Expand All @@ -326,6 +338,12 @@ Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-
* `variants/bcftools/consensus/`
* `<SAMPLE>.consensus.fa`: Consensus Fasta file generated by integrating the variants called by BCFTools into the reference genome.
* `<SAMPLE>.consensus.masked.fa`: Masked consensus Fasta file.
* `variants/bcftools/consensus/base_qc/`
* `<SAMPLE>.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
* `<SAMPLE>.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
* `<SAMPLE>.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
* `<SAMPLE>.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
* `<SAMPLE>.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.
* `variants/bcftools/bcftools_stats/`
* `<SAMPLE>.bcftools_stats.txt`: Statistics and counts obtained from VCF file.

Expand Down
33 changes: 27 additions & 6 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -1363,7 +1363,12 @@ process VARSCAN2 {
process VARSCAN2_CONSENSUS {
tag "$sample"
label 'process_medium'
publishDir "${params.outdir}/variants/varscan2/consensus", mode: params.publish_dir_mode
publishDir "${params.outdir}/variants/varscan2/consensus", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith(".tsv")) "base_qc/$filename"
else if (filename.endsWith(".pdf")) "base_qc/$filename"
else filename
}

when:
!params.skip_variants && 'varscan2' in callers
Expand All @@ -1374,7 +1379,7 @@ process VARSCAN2_CONSENSUS {

output:
tuple val(sample), val(single_end), path("*consensus.masked.fa") into ch_varscan2_consensus
path "*consensus.fa"
path "*.{consensus.fa,tsv,pdf}"

script:
prefix = "${sample}.AF${params.max_allele_freq}"
Expand All @@ -1393,6 +1398,8 @@ process VARSCAN2_CONSENSUS {
-fo ${prefix}.consensus.masked.fa
header=\$(head -n 1 ${prefix}.consensus.masked.fa | sed 's/>//g')
sed -i "s/\${header}/${sample}/g" ${prefix}.consensus.masked.fa
plot_base_density.r --fasta_files ${prefix}.consensus.masked.fa --prefixes $prefix --output_dir ./
"""
}

Expand Down Expand Up @@ -1565,7 +1572,12 @@ process IVAR_VARIANTS {
process IVAR_CONSENSUS {
tag "$sample"
label 'process_medium'
publishDir "${params.outdir}/variants/ivar/consensus", mode: params.publish_dir_mode
publishDir "${params.outdir}/variants/ivar/consensus", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith(".tsv")) "base_qc/$filename"
else if (filename.endsWith(".pdf")) "base_qc/$filename"
else filename
}

when:
!params.skip_variants && 'ivar' in callers
Expand All @@ -1576,14 +1588,16 @@ process IVAR_CONSENSUS {

output:
tuple val(sample), val(single_end), path("*.fa") into ch_ivar_consensus
path "*.txt"
path "*.{txt,tsv,pdf}"

script:
prefix = "${sample}.AF${params.max_allele_freq}"
"""
cat $mpileup | ivar consensus -q $params.min_base_qual -t $params.max_allele_freq -m $params.min_coverage -n N -p ${prefix}.consensus
header=\$(head -n1 ${prefix}.consensus.fa | sed 's/>//g')
sed -i "s/\${header}/${sample}/g" ${prefix}.consensus.fa
plot_base_density.r --fasta_files ${prefix}.consensus.fa --prefixes $prefix --output_dir ./
"""
}

Expand Down Expand Up @@ -1748,7 +1762,12 @@ process BCFTOOLS_VARIANTS {
process BCFTOOLS_CONSENSUS {
tag "$sample"
label 'process_medium'
publishDir "${params.outdir}/variants/bcftools/consensus", mode: params.publish_dir_mode
publishDir "${params.outdir}/variants/bcftools/consensus", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith(".tsv")) "base_qc/$filename"
else if (filename.endsWith(".pdf")) "base_qc/$filename"
else filename
}

when:
!params.skip_variants && 'bcftools' in callers
Expand All @@ -1759,7 +1778,7 @@ process BCFTOOLS_CONSENSUS {

output:
tuple val(sample), val(single_end), path("*consensus.masked.fa") into ch_bcftools_consensus_masked
path "*consensus.fa"
path "*.{consensus.fa,tsv,pdf}"

script:
"""
Expand All @@ -1778,6 +1797,8 @@ process BCFTOOLS_CONSENSUS {
sed -i 's/${index_base}/${sample}/g' ${sample}.consensus.masked.fa
header=\$(head -n1 ${sample}.consensus.masked.fa | sed 's/>//g')
sed -i "s/\${header}/${sample}/g" ${sample}.consensus.masked.fa
plot_base_density.r --fasta_files ${sample}.consensus.masked.fa --prefixes $sample --output_dir ./
"""
}

Expand Down

0 comments on commit 1333186

Please sign in to comment.