From 6fe23ee794dce1fe190b7f579e49149b1694c8f0 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Thu, 22 Jun 2023 12:11:32 -0700 Subject: [PATCH 01/16] @maotian06 original code --- compare-vcf/compare_vcf.R | 293 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 compare-vcf/compare_vcf.R diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R new file mode 100644 index 00000000..d9010acb --- /dev/null +++ b/compare-vcf/compare_vcf.R @@ -0,0 +1,293 @@ +# This is the script to compare the vcf results ################ +# Initial commit: Mao Tian 09-28-2022 +## 1. Setup the environment ######################## +library('argparse'); +library('BoutrosLab.plotting.general'); +library('BoutrosLab.utilities'); +library('VennDiagram'); + +### 2. Dataset Argument Parser ######################################## +parse.args <- function() { + data.parser <- ArgumentParser(); + data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', default = 'NULL'); + data.parser$add_argument('-f', '--file', help = 'an input file that lists all the vcf.gz files', default = 'NULL'); + data.parser$add_argument('-v', '--valid_algorithm', help = 'valid algorithms, default are the four call-sSNV algorithms', default = 'NULL'); + data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test'); + data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL'); + data.parser$add_argument('--pairwise_comparsion', help = 'the option to do pairwise comparsion, deflaut is FALSE.', default = 'FALSE'); + data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'Y'); + ### Set the arg_parase + args <- data.parser$parse_args(); + if ( args$output.dir == 'NULL') { + args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/'); + } + return(args); + }; +### 3. Read Input VCF Lists ################################## +read.inputs <- function(args) { + if (args$path != 'NULL') { + vcf.list <- list.files(path = args$path, + pattern = '*.vcf.gz$', + all.files = TRUE, + full.names = TRUE, + recursive = TRUE); + } else if (args$file != 'NULL') { + vcf.list <- as.character(read.table(args$file, header = FALSE)); + } else { + print( 'No input file or input directory!'); + }; + # set the default valid algorithm, the default are four algorithms used in call-sSNV + if (args$valid_algorithm == 'NULL') { + algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2'); + args$valid_algorithm <- algorithm.list; + } else { + algorithm.list <- args$valid_algorithm + }; + # this function is used to extract the algorithm of the vcf.gz file + # need to pre-define the algorithm.list + extract.algorithm <- function(x) { + for (i in 1:length(algorithm.list)) { + if (grepl(algorithm.list[i], x)) { + return(algorithm.list[i]); + }; + }; + }; + # Strelka2 has an output with indel variants, removed it here + vcf.list <- vcf.list[ - grep('indel', vcf.list)]; + vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm)); + names(vcf.list) <- vcf.list.name; + vcf.count <- c(); + # count each vcf file variant + for (file in vcf.list) { + count <- system(paste('zcat', file, '| grep -v "^#" | wc -l', sep = ' '), intern = TRUE); + vcf.count <- c(vcf.count, count); + } + vcf.summary <- data.frame(vcf.list, vcf.list.name, vcf.count); + colnames(vcf.summary) <- c('filename', 'algorithm', 'variant.count'); + # rank vcf by variant.count + vcf.summary <- vcf.summary[order(vcf.summary$variant.count, decreasing = TRUE),]; + vcf.list <- as.character(vcf.summary$filename); + names(vcf.list) <- as.character(vcf.summary$algorithm); + system(paste('mkdir', args$output.dir)); + system(paste('mkdir', paste(args$output.dir, 'intermediate', sep = '/'))); + setwd(args$output.dir); + write.table(vcf.summary, 'vcf_summary.tsv', sep = '\t', row.names = FALSE, quote = FALSE); + return(vcf.list); + }; + +### 4. Run VCF check with system() ################################## +run.vcf.check <- function(vcf.list, args) { + docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare'; + if (args$global_comparsion == 'TRUE') { + vcf.list.text <- paste(vcf.list, collapse = ' '); + vcf.check.command <- paste(docker.command, vcf.list.text, '> global_comparison.txt', sep = ' '); + futile.logger::flog.info('Running the following command to check VCF overlap:'); + print(vcf.check.command); + system(vcf.check.command, intern = TRUE, ignore.stderr = TRUE); + + }; + if (args$pairwise_comparsion == 'TRUE') { + for (i in 1:(length(vcf.list) - 1)) { + for (j in (i + 1) : length(vcf.list)) { + file1 <- vcf.list[i]; + file2 <- vcf.list[j]; + name <- paste(names(vcf.list)[i], names(vcf.list)[j], 'pairwise', sep = '-' ); + system(paste('bash vcf_check.sh', file1, file2, name, args$output.dir, sep = ' '), intern = TRUE, ignore.stderr = TRUE); + } + }; + }; + system('mv *.txt intermediate/'); +}; + +### 5. Gather VCF check results ################################## +gather.vcf.compare.result <- function( args ){ + setwd(args$output.dir); + + if (args$global_comparsion == 'TRUE') { + futile.logger::flog.info('Reading the following file:'); + file <- list.files(path = getwd(), + pattern = 'global_comparison.txt$', + full.names = TRUE, + recursive = TRUE); + system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' ')); + # start read temp_overlap_count.txt by each line + con <- file('temp_overlap_count.txt', "r"); + overlap.count <- c(); + category <- c(); + vcf.list.name <- names(vcf.list); + while ( TRUE ) { + line = readLines(con, n = 1); + if ( length(line) == 0 ) { + break + } + line.content <- unlist(strsplit(line, '\t')); + overlap.count <- c(overlap.count, line.content[1]); + line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content); + if (length(line.content) == 2) { + filename <- line.content[2]; + algorithm <- vcf.list.name[match(filename, vcf.list)]; + } else { + algorithm <- c() + for (i in (2 : length(line.content))){ + filename <- line.content[i]; + algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]); + } + } + category <- c(category, paste(algorithm, collapse = '-')); + } + close(con) + overlap.count <- data.frame(overlap.count, category); + colnames(overlap.count) <- c('count', 'category'); + overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category)))); + overlap.count <- overlap.count[order(overlap.count$algorithm.count),]; + vcf.list.name.level <- as.numeric(factor(vcf.list.name)); + # generate compare group, such as 1, 2, 3, 12, 13, 23 + for (i in 1:nrow(overlap.count)) { + if ( overlap.count$algorithm.count[i] == 1) { + overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name); + } else { + category.list <- unlist(strsplit(overlap.count$category[i], '-')); + category.list.number <- as.character(sapply(category.list, function(x) { + return(match(x, vcf.list.name)) + })); + category.list.number <- category.list.number[order(category.list.number)]; + overlap.count$compare.group[i] <- paste(category.list.number, collapse = ''); + rm(category.list.number); + } + }; + overlap.count <- overlap.count[order(overlap.count$compare.group),]; + futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv'); + + print(overlap.count); + write.table(overlap.count, + 'overlap_count.tsv', + sep = '\t', + row.names = FALSE, + quote = FALSE); + return(overlap.count); + } + if (args$pairwise_comparsion == 'Y') { + file.list <- list.files(path = getwd(), + pattern = 'intermediate/*_vcf_comparsion.txt$', + full.names = TRUE); + for (i in 1:length(file.list)) { + futile.logger::flog.info('Reading the following file:'); + system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' ')); + overlap.df <- read.table('temp_overlap_count.txt', header = FALSE, allowEscapes = FALSE); + system(paste('cat', file.list[i], '| grep "^VN" | cut -f 3 | head -2 >filename.txt', sep = ' ')); + filename.df <- read.table('filename.txt', header = FALSE, sep = ' '); + algo1 <- unlist(strsplit(basename(filename.df$V1[1]), '-'))[1]; + algo2 <- unlist(strsplit(basename(filename.df$V1[2]), '-'))[1]; + system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2 | head -3 >temp_number.txt', sep = ' ')); + number.df <- read.table('temp_number.txt', header = FALSE); + number.df$V1 <- as.numeric(number.df$V1); + cross.area <- number.df$V1[3]; + area1 <- number.df$V1[1] + cross.area; + area2 <- number.df$V1[2] + cross.area; + venn.plot <- draw.pairwise.venn( + area1, area2, cross.area, + category = c(algo1, algo2), + cat.pos = c(0, 180), + euler.d = TRUE, + sep.dist = 0.03, + rotation.degree = 45 + ); + tiff(filename = paste(algo1, algo2, 'Pairwise-Venn-diagram.tiff', sep = '_'), + compression = "lzw"); + grid.draw(venn.plot); + dev.off(); + }; + }; + } +### 5. Plot Global Venn Plot ###################################### +plot.quad.venn.plot <- function (overlap.count, vcf.list) { + device <- pdf(file = NULL); + futile.logger::flog.info('Reading vcf_summary.tsv'); + vcf.summary <- read.table('vcf_summary.tsv', sep = '\t', header = TRUE); + if (all(vcf.summary$filename == vcf.list)) { + futile.logger::flog.info('algorithm sequence matched, run variant count check'); + } else { + futile.logger::flog.info('algorithm sequence not matched, reordering'); + vcf.summary <- vcf.summary[match(vcf.list, vcf.summary$filename),]; + if (all(vcf.summary$algorithm == vcf.list)) { + futile.logger::flog.info('reordered, run variant count check'); + } + } + # check if the sum of the overlap.count equals to the summary of vcf + futile.logger::flog.info('check if the sum of the overlap.count equals to the summary of vcf'); + overlap.count$count <- as.numeric(overlap.count$count); + for (algo in vcf.summary$algorithm) { + count.from.summary <- vcf.summary$variant.count[vcf.summary$algorithm == algo]; + count.from.overlap.sum <- sum( + overlap.count$count[grep(algo, overlap.count$category)] + ); + if (count.from.summary == count.from.overlap.sum) { + futile.logger::flog.info(paste(algo, 'Match!', sep = ' ')); + } else { + futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' ')); + } + } + u1 = vcf.summary$variant.count[1]; + u2 = vcf.summary$variant.count[2]; + u3 = vcf.summary$variant.count[3]; + u4 = vcf.summary$variant.count[4]; + u12 = overlap.count$count[match('12', overlap.count$compare.group)]; + u13 = overlap.count$count[match('13', overlap.count$compare.group)]; + u14 = overlap.count$count[match('14', overlap.count$compare.group)]; + u23 = overlap.count$count[match('23', overlap.count$compare.group)]; + u24 = overlap.count$count[match('24', overlap.count$compare.group)]; + u34 = overlap.count$count[match('34', overlap.count$compare.group)]; + u123 = overlap.count$count[match('123', overlap.count$compare.group)]; + u124 = overlap.count$count[match('124', overlap.count$compare.group)]; + u134 = overlap.count$count[match('134', overlap.count$compare.group)]; + u234 = overlap.count$count[match('234', overlap.count$compare.group)]; + u1234 = overlap.count$count[match('1234', overlap.count$compare.group)]; + area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14, u124, u123, u23, u2, u12) + print(area.vector); + # plot qual.venn.plot + venn.plot <- draw.quad.venn( + area1 = vcf.summary$variant.count[1], + area2 = vcf.summary$variant.count[2], + area3 = vcf.summary$variant.count[3], + area4 = vcf.summary$variant.count[4], + n12 = (u12 + u123 + u124 + u1234), + n13 = (u13 + u123 + u134 + u1234), + n14 = (u14 + u124 + u134 + u1234), + n23 = (u23 + u123 + u234 + u1234), + n24 = (u24 + u124 + u234 + u1234), + n34 = (u34 + u134 + u234 + u1234), + n123 = (u123 + u1234), + n124 = (u124 + u1234), + n134 = (u134 + u1234), + n234 = (u234 + u1234), + n1234 = u1234, + category = vcf.summary$algorithm, + fill = c("orange", "red", "green", "blue"), + lty = "dashed", + cex = 1, + cat.cex = 0.8, + cat.col = 'black'); + tiff( + filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', '.tiff' ), + compression = "lzw" ); + grid.rect(x = unit(1, "npc"), y = unit(1, "npc"), + width = unit(5, "npc"), height = unit(5, "npc"), + just = "centre", default.units = "npc", name = NULL, + gp=gpar(), draw = TRUE, vp = NULL); + grid.draw(venn.plot); + dev.off(); +} + + + +### Main ########################################################## + +args <- parse.args(); +vcf.list <- read.inputs(args); +run.vcf.check(vcf.list, args); +if (args$global_comparsion == 'Y') { + overlap.count <- gather.vcf.compare.result(args); +} else { + gather.vcf.compare.result(args); +}; +plot.quad.venn.plot(overlap.count, vcf.list); From b4a7b203d9350db6b4dda6dc043dcc390c08b388 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Thu, 22 Jun 2023 14:25:44 -0700 Subject: [PATCH 02/16] working locally, before paring down --- compare-vcf/compare_vcf.R | 127 ++++++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R index d9010acb..c56f04b8 100644 --- a/compare-vcf/compare_vcf.R +++ b/compare-vcf/compare_vcf.R @@ -6,6 +6,18 @@ library('BoutrosLab.plotting.general'); library('BoutrosLab.utilities'); library('VennDiagram'); +# tmp: for testing +#args <- list(); +##args$path <- 'NULL'; +##args$file <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/maotian06-compare-vcf/input-vcfs.txt'; +#args$path <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/call-sSNV-6.0.0/CPCG0000000196-T001-P01' +#args$file <- 'NULL'; +#args$valid_algorithm <- 'NULL'; +#args$dataset.id <- 'CPCG'; +#args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/'); +#args$pairwise_comparsion <- 'FALSE'; +#args$global_comparsion <- 'TRUE'; + ### 2. Dataset Argument Parser ######################################## parse.args <- function() { data.parser <- ArgumentParser(); @@ -15,7 +27,7 @@ parse.args <- function() { data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test'); data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL'); data.parser$add_argument('--pairwise_comparsion', help = 'the option to do pairwise comparsion, deflaut is FALSE.', default = 'FALSE'); - data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'Y'); + data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'TRUE'); ### Set the arg_parase args <- data.parser$parse_args(); if ( args$output.dir == 'NULL') { @@ -27,19 +39,19 @@ parse.args <- function() { read.inputs <- function(args) { if (args$path != 'NULL') { vcf.list <- list.files(path = args$path, - pattern = '*.vcf.gz$', + pattern = '*snvs.vcf.gz$', all.files = TRUE, full.names = TRUE, recursive = TRUE); } else if (args$file != 'NULL') { - vcf.list <- as.character(read.table(args$file, header = FALSE)); + vcf.list <- as.list(t(read.table(args$file, header = FALSE))); } else { print( 'No input file or input directory!'); }; # set the default valid algorithm, the default are four algorithms used in call-sSNV if (args$valid_algorithm == 'NULL') { algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2'); - args$valid_algorithm <- algorithm.list; +# args$valid_algorithm <- algorithm.list; } else { algorithm.list <- args$valid_algorithm }; @@ -53,14 +65,15 @@ read.inputs <- function(args) { }; }; # Strelka2 has an output with indel variants, removed it here - vcf.list <- vcf.list[ - grep('indel', vcf.list)]; +# vcf.list <- vcf.list[ - grep('indel', vcf.list)]; + vcf.list <- vcf.list[ - grep('intermediate', vcf.list)]; vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm)); names(vcf.list) <- vcf.list.name; vcf.count <- c(); # count each vcf file variant for (file in vcf.list) { count <- system(paste('zcat', file, '| grep -v "^#" | wc -l', sep = ' '), intern = TRUE); - vcf.count <- c(vcf.count, count); + vcf.count <- c(vcf.count, as.numeric(count)); } vcf.summary <- data.frame(vcf.list, vcf.list.name, vcf.count); colnames(vcf.summary) <- c('filename', 'algorithm', 'variant.count'); @@ -75,15 +88,15 @@ read.inputs <- function(args) { return(vcf.list); }; -### 4. Run VCF check with system() ################################## -run.vcf.check <- function(vcf.list, args) { +### 4. Run VCF compare with system() ################################## +run.vcf.compare <- function(vcf.list, args) { docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare'; if (args$global_comparsion == 'TRUE') { vcf.list.text <- paste(vcf.list, collapse = ' '); - vcf.check.command <- paste(docker.command, vcf.list.text, '> global_comparison.txt', sep = ' '); + vcf.compare.command <- paste(docker.command, vcf.list.text, '> global_comparison.txt', sep = ' '); futile.logger::flog.info('Running the following command to check VCF overlap:'); - print(vcf.check.command); - system(vcf.check.command, intern = TRUE, ignore.stderr = TRUE); + print(vcf.compare.command); + system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE); }; if (args$pairwise_comparsion == 'TRUE') { @@ -99,7 +112,7 @@ run.vcf.check <- function(vcf.list, args) { system('mv *.txt intermediate/'); }; -### 5. Gather VCF check results ################################## +### 5. Gather VCF compare results ################################## gather.vcf.compare.result <- function( args ){ setwd(args$output.dir); @@ -166,7 +179,7 @@ gather.vcf.compare.result <- function( args ){ quote = FALSE); return(overlap.count); } - if (args$pairwise_comparsion == 'Y') { + if (args$pairwise_comparsion == 'TRUE') { file.list <- list.files(path = getwd(), pattern = 'intermediate/*_vcf_comparsion.txt$', full.names = TRUE); @@ -221,46 +234,64 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) { count.from.overlap.sum <- sum( overlap.count$count[grep(algo, overlap.count$category)] ); - if (count.from.summary == count.from.overlap.sum) { - futile.logger::flog.info(paste(algo, 'Match!', sep = ' ')); - } else { - futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' ')); - } + if (count.from.summary == count.from.overlap.sum) { + futile.logger::flog.info(paste(algo, 'Match!', sep = ' ')); + } else { + futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' ')); + } } - u1 = vcf.summary$variant.count[1]; - u2 = vcf.summary$variant.count[2]; - u3 = vcf.summary$variant.count[3]; - u4 = vcf.summary$variant.count[4]; - u12 = overlap.count$count[match('12', overlap.count$compare.group)]; - u13 = overlap.count$count[match('13', overlap.count$compare.group)]; - u14 = overlap.count$count[match('14', overlap.count$compare.group)]; - u23 = overlap.count$count[match('23', overlap.count$compare.group)]; - u24 = overlap.count$count[match('24', overlap.count$compare.group)]; - u34 = overlap.count$count[match('34', overlap.count$compare.group)]; - u123 = overlap.count$count[match('123', overlap.count$compare.group)]; - u124 = overlap.count$count[match('124', overlap.count$compare.group)]; - u134 = overlap.count$count[match('134', overlap.count$compare.group)]; - u234 = overlap.count$count[match('234', overlap.count$compare.group)]; - u1234 = overlap.count$count[match('1234', overlap.count$compare.group)]; - area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14, u124, u123, u23, u2, u12) - print(area.vector); +# u1 = vcf.summary$variant.count[1]; +# u2 = vcf.summary$variant.count[2]; +# u3 = vcf.summary$variant.count[3]; +# u4 = vcf.summary$variant.count[4]; +# +# combination2 <- combn(1:4, 2); +# combination3 <- combn(1:4, 3); +# combination4 <- combn(1:4, 4); +# comb.list <- c( +# apply(combination2, 2, paste0, collapse=""), +# apply(combination3, 2, paste0, collapse=""), +# apply(combination4, 2, paste0, collapse="") +# ); + + count <- function(x) { + cnt <- overlap.count$count[match(x, overlap.count$compare.group)]; + if (is.na(cnt)) { + return(0); + } else { + return(cnt); + } + }; +# u12 = overlap.count$count[match('12', overlap.count$compare.group)]; +# u13 = overlap.count$count[match('13', overlap.count$compare.group)]; +# u14 = overlap.count$count[match('14', overlap.count$compare.group)]; +# u23 = overlap.count$count[match('23', overlap.count$compare.group)]; +# u24 = overlap.count$count[match('24', overlap.count$compare.group)]; +# u34 = overlap.count$count[match('34', overlap.count$compare.group)]; +# u123 = overlap.count$count[match('123', overlap.count$compare.group)]; +# u124 = overlap.count$count[match('124', overlap.count$compare.group)]; +# u134 = overlap.count$count[match('134', overlap.count$compare.group)]; +# u234 = overlap.count$count[match('234', overlap.count$compare.group)]; +# u1234 = overlap.count$count[match('1234', overlap.count$compare.group)]; +# area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14, u124, u123, u23, u2, u12); +# print(area.vector); # plot qual.venn.plot venn.plot <- draw.quad.venn( area1 = vcf.summary$variant.count[1], area2 = vcf.summary$variant.count[2], area3 = vcf.summary$variant.count[3], area4 = vcf.summary$variant.count[4], - n12 = (u12 + u123 + u124 + u1234), - n13 = (u13 + u123 + u134 + u1234), - n14 = (u14 + u124 + u134 + u1234), - n23 = (u23 + u123 + u234 + u1234), - n24 = (u24 + u124 + u234 + u1234), - n34 = (u34 + u134 + u234 + u1234), - n123 = (u123 + u1234), - n124 = (u124 + u1234), - n134 = (u134 + u1234), - n234 = (u234 + u1234), - n1234 = u1234, + n12 = (count('12') + count('123') + count('124') + count('1234')), + n13 = (count('13') + count('123') + count('134') + count('1234')), + n14 = (count('14') + count('124') + count('134') + count('1234')), + n23 = (count('23') + count('123') + count('234') + count('1234')), + n24 = (count('24') + count('124') + count('234') + count('1234')), + n34 = (count('34') + count('134') + count('234') + count('1234')), + n123 = (count('123') + count('1234')), + n124 = (count('124') + count('1234')), + n134 = (count('134') + count('1234')), + n234 = (count('234') + count('1234')), + n1234 = count('1234'), category = vcf.summary$algorithm, fill = c("orange", "red", "green", "blue"), lty = "dashed", @@ -284,8 +315,8 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) { args <- parse.args(); vcf.list <- read.inputs(args); -run.vcf.check(vcf.list, args); -if (args$global_comparsion == 'Y') { +run.vcf.compare(vcf.list, args); +if (args$global_comparsion == 'TRUE') { overlap.count <- gather.vcf.compare.result(args); } else { gather.vcf.compare.result(args); From 4baa1fd4926998f04540d31a43bf5674e98ccd0b Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Thu, 22 Jun 2023 15:00:07 -0700 Subject: [PATCH 03/16] only supports directory input and global comparison --- compare-vcf/compare_vcf.R | 247 ++++++++++++-------------------------- 1 file changed, 80 insertions(+), 167 deletions(-) diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R index c56f04b8..7ab5f46a 100644 --- a/compare-vcf/compare_vcf.R +++ b/compare-vcf/compare_vcf.R @@ -8,26 +8,19 @@ library('VennDiagram'); # tmp: for testing #args <- list(); -##args$path <- 'NULL'; -##args$file <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/maotian06-compare-vcf/input-vcfs.txt'; #args$path <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/call-sSNV-6.0.0/CPCG0000000196-T001-P01' -#args$file <- 'NULL'; #args$valid_algorithm <- 'NULL'; #args$dataset.id <- 'CPCG'; #args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/'); -#args$pairwise_comparsion <- 'FALSE'; -#args$global_comparsion <- 'TRUE'; ### 2. Dataset Argument Parser ######################################## parse.args <- function() { data.parser <- ArgumentParser(); - data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', default = 'NULL'); + data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', required = TRUE); data.parser$add_argument('-f', '--file', help = 'an input file that lists all the vcf.gz files', default = 'NULL'); data.parser$add_argument('-v', '--valid_algorithm', help = 'valid algorithms, default are the four call-sSNV algorithms', default = 'NULL'); data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test'); data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL'); - data.parser$add_argument('--pairwise_comparsion', help = 'the option to do pairwise comparsion, deflaut is FALSE.', default = 'FALSE'); - data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'TRUE'); ### Set the arg_parase args <- data.parser$parse_args(); if ( args$output.dir == 'NULL') { @@ -37,17 +30,13 @@ parse.args <- function() { }; ### 3. Read Input VCF Lists ################################## read.inputs <- function(args) { - if (args$path != 'NULL') { - vcf.list <- list.files(path = args$path, - pattern = '*snvs.vcf.gz$', - all.files = TRUE, - full.names = TRUE, - recursive = TRUE); - } else if (args$file != 'NULL') { - vcf.list <- as.list(t(read.table(args$file, header = FALSE))); - } else { - print( 'No input file or input directory!'); - }; + vcf.list <- list.files( + path = args$path, + pattern = '*snvs.vcf.gz$', + all.files = TRUE, + full.names = TRUE, + recursive = TRUE + ); # set the default valid algorithm, the default are four algorithms used in call-sSNV if (args$valid_algorithm == 'NULL') { algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2'); @@ -64,8 +53,6 @@ read.inputs <- function(args) { }; }; }; - # Strelka2 has an output with indel variants, removed it here -# vcf.list <- vcf.list[ - grep('indel', vcf.list)]; vcf.list <- vcf.list[ - grep('intermediate', vcf.list)]; vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm)); names(vcf.list) <- vcf.list.name; @@ -91,24 +78,11 @@ read.inputs <- function(args) { ### 4. Run VCF compare with system() ################################## run.vcf.compare <- function(vcf.list, args) { docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare'; - if (args$global_comparsion == 'TRUE') { - vcf.list.text <- paste(vcf.list, collapse = ' '); - vcf.compare.command <- paste(docker.command, vcf.list.text, '> global_comparison.txt', sep = ' '); - futile.logger::flog.info('Running the following command to check VCF overlap:'); - print(vcf.compare.command); - system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE); - - }; - if (args$pairwise_comparsion == 'TRUE') { - for (i in 1:(length(vcf.list) - 1)) { - for (j in (i + 1) : length(vcf.list)) { - file1 <- vcf.list[i]; - file2 <- vcf.list[j]; - name <- paste(names(vcf.list)[i], names(vcf.list)[j], 'pairwise', sep = '-' ); - system(paste('bash vcf_check.sh', file1, file2, name, args$output.dir, sep = ' '), intern = TRUE, ignore.stderr = TRUE); - } - }; - }; + vcf.list.text <- paste(vcf.list, collapse = ' '); + vcf.compare.command <- paste(docker.command, vcf.list.text, '> global_comparison.txt', sep = ' '); + futile.logger::flog.info('Running the following command to check VCF overlap:'); + print(vcf.compare.command); + system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE); system('mv *.txt intermediate/'); }; @@ -116,101 +90,67 @@ run.vcf.compare <- function(vcf.list, args) { gather.vcf.compare.result <- function( args ){ setwd(args$output.dir); - if (args$global_comparsion == 'TRUE') { - futile.logger::flog.info('Reading the following file:'); - file <- list.files(path = getwd(), - pattern = 'global_comparison.txt$', - full.names = TRUE, - recursive = TRUE); - system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' ')); - # start read temp_overlap_count.txt by each line - con <- file('temp_overlap_count.txt', "r"); - overlap.count <- c(); - category <- c(); - vcf.list.name <- names(vcf.list); - while ( TRUE ) { - line = readLines(con, n = 1); - if ( length(line) == 0 ) { - break - } - line.content <- unlist(strsplit(line, '\t')); - overlap.count <- c(overlap.count, line.content[1]); - line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content); - if (length(line.content) == 2) { - filename <- line.content[2]; - algorithm <- vcf.list.name[match(filename, vcf.list)]; - } else { - algorithm <- c() - for (i in (2 : length(line.content))){ - filename <- line.content[i]; - algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]); - } - } - category <- c(category, paste(algorithm, collapse = '-')); + futile.logger::flog.info('Reading the following file:'); + file <- list.files(path = getwd(), + pattern = 'global_comparison.txt$', + full.names = TRUE, + recursive = TRUE); + system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' ')); + # start read temp_overlap_count.txt by each line + con <- file('temp_overlap_count.txt', "r"); + overlap.count <- c(); + category <- c(); + vcf.list.name <- names(vcf.list); + while ( TRUE ) { + line = readLines(con, n = 1); + if ( length(line) == 0 ) { + break } - close(con) - overlap.count <- data.frame(overlap.count, category); - colnames(overlap.count) <- c('count', 'category'); - overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category)))); - overlap.count <- overlap.count[order(overlap.count$algorithm.count),]; - vcf.list.name.level <- as.numeric(factor(vcf.list.name)); - # generate compare group, such as 1, 2, 3, 12, 13, 23 - for (i in 1:nrow(overlap.count)) { - if ( overlap.count$algorithm.count[i] == 1) { - overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name); - } else { - category.list <- unlist(strsplit(overlap.count$category[i], '-')); - category.list.number <- as.character(sapply(category.list, function(x) { - return(match(x, vcf.list.name)) - })); - category.list.number <- category.list.number[order(category.list.number)]; - overlap.count$compare.group[i] <- paste(category.list.number, collapse = ''); - rm(category.list.number); + line.content <- unlist(strsplit(line, '\t')); + overlap.count <- c(overlap.count, line.content[1]); + line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content); + if (length(line.content) == 2) { + filename <- line.content[2]; + algorithm <- vcf.list.name[match(filename, vcf.list)]; + } else { + algorithm <- c() + for (i in (2 : length(line.content))){ + filename <- line.content[i]; + algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]); } - }; - overlap.count <- overlap.count[order(overlap.count$compare.group),]; - futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv'); - - print(overlap.count); - write.table(overlap.count, - 'overlap_count.tsv', - sep = '\t', - row.names = FALSE, - quote = FALSE); - return(overlap.count); } - if (args$pairwise_comparsion == 'TRUE') { - file.list <- list.files(path = getwd(), - pattern = 'intermediate/*_vcf_comparsion.txt$', - full.names = TRUE); - for (i in 1:length(file.list)) { - futile.logger::flog.info('Reading the following file:'); - system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' ')); - overlap.df <- read.table('temp_overlap_count.txt', header = FALSE, allowEscapes = FALSE); - system(paste('cat', file.list[i], '| grep "^VN" | cut -f 3 | head -2 >filename.txt', sep = ' ')); - filename.df <- read.table('filename.txt', header = FALSE, sep = ' '); - algo1 <- unlist(strsplit(basename(filename.df$V1[1]), '-'))[1]; - algo2 <- unlist(strsplit(basename(filename.df$V1[2]), '-'))[1]; - system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2 | head -3 >temp_number.txt', sep = ' ')); - number.df <- read.table('temp_number.txt', header = FALSE); - number.df$V1 <- as.numeric(number.df$V1); - cross.area <- number.df$V1[3]; - area1 <- number.df$V1[1] + cross.area; - area2 <- number.df$V1[2] + cross.area; - venn.plot <- draw.pairwise.venn( - area1, area2, cross.area, - category = c(algo1, algo2), - cat.pos = c(0, 180), - euler.d = TRUE, - sep.dist = 0.03, - rotation.degree = 45 - ); - tiff(filename = paste(algo1, algo2, 'Pairwise-Venn-diagram.tiff', sep = '_'), - compression = "lzw"); - grid.draw(venn.plot); - dev.off(); - }; - }; + category <- c(category, paste(algorithm, collapse = '-')); + } + close(con) + overlap.count <- data.frame(overlap.count, category); + colnames(overlap.count) <- c('count', 'category'); + overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category)))); + overlap.count <- overlap.count[order(overlap.count$algorithm.count),]; + vcf.list.name.level <- as.numeric(factor(vcf.list.name)); + # generate compare group, such as 1, 2, 3, 12, 13, 23 + for (i in 1:nrow(overlap.count)) { + if ( overlap.count$algorithm.count[i] == 1) { + overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name); + } else { + category.list <- unlist(strsplit(overlap.count$category[i], '-')); + category.list.number <- as.character(sapply(category.list, function(x) { + return(match(x, vcf.list.name)) + })); + category.list.number <- category.list.number[order(category.list.number)]; + overlap.count$compare.group[i] <- paste(category.list.number, collapse = ''); + rm(category.list.number); + } + }; + overlap.count <- overlap.count[order(overlap.count$compare.group),]; + futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv'); + + print(overlap.count); + write.table(overlap.count, + 'overlap_count.tsv', + sep = '\t', + row.names = FALSE, + quote = FALSE); + return(overlap.count); } ### 5. Plot Global Venn Plot ###################################### plot.quad.venn.plot <- function (overlap.count, vcf.list) { @@ -240,19 +180,6 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) { futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' ')); } } -# u1 = vcf.summary$variant.count[1]; -# u2 = vcf.summary$variant.count[2]; -# u3 = vcf.summary$variant.count[3]; -# u4 = vcf.summary$variant.count[4]; -# -# combination2 <- combn(1:4, 2); -# combination3 <- combn(1:4, 3); -# combination4 <- combn(1:4, 4); -# comb.list <- c( -# apply(combination2, 2, paste0, collapse=""), -# apply(combination3, 2, paste0, collapse=""), -# apply(combination4, 2, paste0, collapse="") -# ); count <- function(x) { cnt <- overlap.count$count[match(x, overlap.count$compare.group)]; @@ -262,19 +189,6 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) { return(cnt); } }; -# u12 = overlap.count$count[match('12', overlap.count$compare.group)]; -# u13 = overlap.count$count[match('13', overlap.count$compare.group)]; -# u14 = overlap.count$count[match('14', overlap.count$compare.group)]; -# u23 = overlap.count$count[match('23', overlap.count$compare.group)]; -# u24 = overlap.count$count[match('24', overlap.count$compare.group)]; -# u34 = overlap.count$count[match('34', overlap.count$compare.group)]; -# u123 = overlap.count$count[match('123', overlap.count$compare.group)]; -# u124 = overlap.count$count[match('124', overlap.count$compare.group)]; -# u134 = overlap.count$count[match('134', overlap.count$compare.group)]; -# u234 = overlap.count$count[match('234', overlap.count$compare.group)]; -# u1234 = overlap.count$count[match('1234', overlap.count$compare.group)]; -# area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14, u124, u123, u23, u2, u12); -# print(area.vector); # plot qual.venn.plot venn.plot <- draw.quad.venn( area1 = vcf.summary$variant.count[1], @@ -297,14 +211,17 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) { lty = "dashed", cex = 1, cat.cex = 0.8, - cat.col = 'black'); + cat.col = 'black' + ); tiff( filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', '.tiff' ), - compression = "lzw" ); + compression = "lzw" + ); grid.rect(x = unit(1, "npc"), y = unit(1, "npc"), - width = unit(5, "npc"), height = unit(5, "npc"), - just = "centre", default.units = "npc", name = NULL, - gp=gpar(), draw = TRUE, vp = NULL); + width = unit(5, "npc"), height = unit(5, "npc"), + just = "centre", default.units = "npc", name = NULL, + gp=gpar(), draw = TRUE, vp = NULL + ); grid.draw(venn.plot); dev.off(); } @@ -316,9 +233,5 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) { args <- parse.args(); vcf.list <- read.inputs(args); run.vcf.compare(vcf.list, args); -if (args$global_comparsion == 'TRUE') { - overlap.count <- gather.vcf.compare.result(args); -} else { - gather.vcf.compare.result(args); -}; +overlap.count <- gather.vcf.compare.result(args); plot.quad.venn.plot(overlap.count, vcf.list); From 4cde0646a6d7dbc476c52a33e5cff3c7c7faed0f Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Thu, 22 Jun 2023 15:02:03 -0700 Subject: [PATCH 04/16] minor edit --- compare-vcf/compare_vcf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R index 7ab5f46a..9a7b0ac2 100644 --- a/compare-vcf/compare_vcf.R +++ b/compare-vcf/compare_vcf.R @@ -214,7 +214,7 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) { cat.col = 'black' ); tiff( - filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', '.tiff' ), + filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', 'tiff' ), compression = "lzw" ); grid.rect(x = unit(1, "npc"), y = unit(1, "npc"), From b485057d4d219b692350396fac48919c3dd9ec29 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Tue, 27 Jun 2023 16:36:04 -0700 Subject: [PATCH 05/16] with BEDtools, not needed --- config/default.config | 2 ++ config/methods.config | 2 +- main.nf | 4 +-- metadata.yaml | 2 +- module/intersect-processes.nf | 54 +++++++++++++++++++++++++++++++++-- module/intersect.nf | 36 +++++++++++++++++------ 6 files changed, 86 insertions(+), 14 deletions(-) diff --git a/config/default.config b/config/default.config index 3ba1aa6b..ad9e3b7c 100644 --- a/config/default.config +++ b/config/default.config @@ -23,6 +23,7 @@ params { manta_version = "1.6.0" MuSE_version = "2.0" BCFtools_version = "1.15.1" + BEDtools_version = "2.29.2" docker_image_samtools = "${-> params.docker_container_registry}/samtools:${params.samtools_version}" docker_image_validate_params = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" docker_image_GATK = "broadinstitute/gatk:${params.GATK_version}" @@ -32,6 +33,7 @@ params { docker_image_manta = "${-> params.docker_container_registry}/manta:${params.manta_version}" docker_image_MuSE = "${-> params.docker_container_registry}/muse:${params.MuSE_version}" docker_image_BCFtools = "${-> params.docker_container_registry}/bcftools:${params.BCFtools_version}" + docker_image_BEDtools = "${-> params.docker_container_registry}/bedtools:${params.BEDtools_version}" } diff --git a/config/methods.config b/config/methods.config index dbf24712..d28e089c 100644 --- a/config/methods.config +++ b/config/methods.config @@ -129,7 +129,7 @@ methods { if (params.containsKey("call_region") && params.call_region) { params.use_call_region = true } else { - params.call_region = "${params.work_dir}/NO_FILE.bed.gz" + // params.call_region = "${params.work_dir}/NO_FILE.bed.gz" params.use_call_region = false } params.call_region_index = "${params.call_region}.tbi" diff --git a/main.nf b/main.nf index 1cda90bc..b0c18721 100755 --- a/main.nf +++ b/main.nf @@ -214,13 +214,13 @@ workflow { .mix(strelka2_vcf_ch) .mix(mutect2_vcf_ch) .mix(muse_vcf_ch)) - .collect() + // .collect() tool_indices = (somaticsniper_idx_ch .mix(strelka2_idx_ch) .mix(mutect2_idx_ch) .mix(muse_idx_ch)) - .collect() + // .collect() intersect( tool_vcfs, diff --git a/metadata.yaml b/metadata.yaml index ab371712..da9b8577 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -5,4 +5,4 @@ Maintainers: ['maotian@mednet.ucla.edu'] Contributors: ['Mao Tian', 'Bugh Caden', 'Helena Winata', 'Yash Patel', 'Sorel Fitz-Gibbon'] Languages: ['Docker', 'Nextflow'] Dependencies: ['Docker', 'Nextflow'] -Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.15.1] +Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.15.1, BEDtools v2.29.2] diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf index 28ff1ee8..841b01e9 100644 --- a/module/intersect-processes.nf +++ b/module/intersect-processes.nf @@ -4,7 +4,33 @@ log.info """\ ==================================== Docker Images: - docker_image_BCFtools: ${params.docker_image_BCFtools} +- docker_image_BEDtools: ${params.docker_image_BEDtools} + """ +process trim_VCF_BCFtools { + container params.docker_image_BCFtools + publishDir path: "${params.workflow_output_dir}/intermediate", + mode: "copy", + pattern: "*_trim-SNV.vcf.gz*" + publishDir path: "${params.workflow_log_output_dir}", + mode: "copy", + pattern: ".command.*", + saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" } + + input: + path vcf + path index + path call_region + + output: + path "*_trim-SNV.vcf.gz", emit: trimmed_vcf + + script: + """ + set -euo pipefail + bcftools view --regions-file ${params.call_region} --output "${vcf.split('_')[-1]}}_SNV-trim.vcf.gz" ${vcf} + """ +} process intersect_VCFs_BCFtools { container params.docker_image_BCFtools @@ -30,12 +56,36 @@ process intersect_VCFs_BCFtools { path "isec-2-or-more" script: - vcf_list = vcfs.join(' ') - """ set -euo pipefail bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list} awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done """ } + +process intersect_VCFs_BEDtools { + container params.docker_image_BEDtools + publishDir path: "${params.workflow_output_dir}/output", + mode: "copy", + pattern: "multiinter.out" + publishDir path: "${params.workflow_log_output_dir}", + mode: "copy", + pattern: ".command.*", + saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" } + + input: + path vcfs + path indices + + output: + path "multiinter.out" + + script: + vcf_list = vcfs.join(' ') + name_list = vcfs.collect { it.getName().split('_')[0] }.join(' ') + """ + set -euo pipefail + bedtools multiinter -i ${vcf_list} -header -names ${name_list} > multiinter.out + """ +} diff --git a/module/intersect.nf b/module/intersect.nf index 534fd41f..6059ce36 100644 --- a/module/intersect.nf +++ b/module/intersect.nf @@ -1,31 +1,51 @@ include { generate_sha512sum } from './common' +include { trim_VCF_BCFtools } from './intersect-processes.nf' include { intersect_VCFs_BCFtools } from './intersect-processes.nf' +include { intersect_VCFs_BEDtools } from './intersect-processes.nf' include { compress_index_VCF } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams( options: [ output_dir: params.workflow_output_dir, log_output_dir: params.workflow_log_output_dir, bgzip_extra_args: params.bgzip_extra_args, - tabix_extra_args: params.tabix_extra_args + tabix_extra_args: params.tabix_extra_args, + is_output_file: false ]) - workflow intersect { take: tool_vcfs tool_indices main: - intersect_VCFs_BCFtools( + trim_VCF_BCFtools( tool_vcfs, - tool_indices - ) + tool_indices, + params.call_region + ) + trim_VCF_BCFtools.out.trimmed_vcf + .map { it -> ["${file(it).getName().split('_')[0]}", it] } + .set { inputs_to_compress } + compress_index_VCF(inputs_to_compress) + trimmed_vcfs = compress_index_VCF.out.index_out + .map{ it -> ["${it[1]}"] } + .collect() + trimmed_indices = compress_index_VCF.out.index_out + .map{ it -> ["${it[2]}"] } + .collect() + intersect_VCFs_BCFtools( + trimmed_vcfs, + trimmed_indices + ) + intersect_VCFs_BEDtools( + trimmed_vcfs, + trimmed_indices + ) file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf .flatten() .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-vcf", it]} .mix(intersect_VCFs_BCFtools.out.consensus_idx .flatten() - .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-idx", it]}) + .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-idx", it]} + ) generate_sha512sum(file_for_sha512) - emit: - intersect_VCFs_BCFtools.out.consensus_vcf } From c52d327ca36dc3260eb02e5ce5159bdf3ef38261 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Wed, 28 Jun 2023 12:32:49 -0700 Subject: [PATCH 06/16] switch to one extra bcftools line, no bedtools --- config/default.config | 4 +-- main.nf | 4 +-- metadata.yaml | 2 +- module/intersect-processes.nf | 62 +++++------------------------------ module/intersect.nf | 33 ++----------------- 5 files changed, 16 insertions(+), 89 deletions(-) diff --git a/config/default.config b/config/default.config index ad9e3b7c..0f732270 100644 --- a/config/default.config +++ b/config/default.config @@ -22,8 +22,7 @@ params { strelka2_version = "2.9.10" manta_version = "1.6.0" MuSE_version = "2.0" - BCFtools_version = "1.15.1" - BEDtools_version = "2.29.2" + BCFtools_version = "1.17" docker_image_samtools = "${-> params.docker_container_registry}/samtools:${params.samtools_version}" docker_image_validate_params = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" docker_image_GATK = "broadinstitute/gatk:${params.GATK_version}" @@ -33,7 +32,6 @@ params { docker_image_manta = "${-> params.docker_container_registry}/manta:${params.manta_version}" docker_image_MuSE = "${-> params.docker_container_registry}/muse:${params.MuSE_version}" docker_image_BCFtools = "${-> params.docker_container_registry}/bcftools:${params.BCFtools_version}" - docker_image_BEDtools = "${-> params.docker_container_registry}/bedtools:${params.BEDtools_version}" } diff --git a/main.nf b/main.nf index b0c18721..1cda90bc 100755 --- a/main.nf +++ b/main.nf @@ -214,13 +214,13 @@ workflow { .mix(strelka2_vcf_ch) .mix(mutect2_vcf_ch) .mix(muse_vcf_ch)) - // .collect() + .collect() tool_indices = (somaticsniper_idx_ch .mix(strelka2_idx_ch) .mix(mutect2_idx_ch) .mix(muse_idx_ch)) - // .collect() + .collect() intersect( tool_vcfs, diff --git a/metadata.yaml b/metadata.yaml index da9b8577..aea10576 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -5,4 +5,4 @@ Maintainers: ['maotian@mednet.ucla.edu'] Contributors: ['Mao Tian', 'Bugh Caden', 'Helena Winata', 'Yash Patel', 'Sorel Fitz-Gibbon'] Languages: ['Docker', 'Nextflow'] Dependencies: ['Docker', 'Nextflow'] -Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.15.1, BEDtools v2.29.2] +Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.17] diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf index 841b01e9..7bf8696d 100644 --- a/module/intersect-processes.nf +++ b/module/intersect-processes.nf @@ -4,34 +4,8 @@ log.info """\ ==================================== Docker Images: - docker_image_BCFtools: ${params.docker_image_BCFtools} -- docker_image_BEDtools: ${params.docker_image_BEDtools} """ -process trim_VCF_BCFtools { - container params.docker_image_BCFtools - publishDir path: "${params.workflow_output_dir}/intermediate", - mode: "copy", - pattern: "*_trim-SNV.vcf.gz*" - publishDir path: "${params.workflow_log_output_dir}", - mode: "copy", - pattern: ".command.*", - saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" } - - input: - path vcf - path index - path call_region - - output: - path "*_trim-SNV.vcf.gz", emit: trimmed_vcf - - script: - """ - set -euo pipefail - bcftools view --regions-file ${params.call_region} --output "${vcf.split('_')[-1]}}_SNV-trim.vcf.gz" ${vcf} - """ -} - process intersect_VCFs_BCFtools { container params.docker_image_BCFtools publishDir path: "${params.workflow_output_dir}/output", @@ -40,6 +14,9 @@ process intersect_VCFs_BCFtools { publishDir path: "${params.workflow_output_dir}/output", mode: "copy", pattern: "isec-2-or-more" + publishDir path: "${params.workflow_output_dir}/output", + mode: "copy", + pattern: "isec-1-or-more/*.txt" publishDir path: "${params.workflow_log_output_dir}", mode: "copy", pattern: ".command.*", @@ -48,44 +25,23 @@ process intersect_VCFs_BCFtools { input: path vcfs path indices + path call_region + path call_region_index output: path "*.vcf.gz", emit: consensus_vcf path "*.vcf.gz.tbi", emit: consensus_idx path ".command.*" path "isec-2-or-more" + path "isec-1-or-more/sites.txt" + path "isec-1-or-more/README.txt" script: vcf_list = vcfs.join(' ') """ set -euo pipefail - bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list} + bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${call_region} ${vcf_list} awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done + bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${call_region} ${vcf_list} """ } - -process intersect_VCFs_BEDtools { - container params.docker_image_BEDtools - publishDir path: "${params.workflow_output_dir}/output", - mode: "copy", - pattern: "multiinter.out" - publishDir path: "${params.workflow_log_output_dir}", - mode: "copy", - pattern: ".command.*", - saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" } - - input: - path vcfs - path indices - - output: - path "multiinter.out" - - script: - vcf_list = vcfs.join(' ') - name_list = vcfs.collect { it.getName().split('_')[0] }.join(' ') - """ - set -euo pipefail - bedtools multiinter -i ${vcf_list} -header -names ${name_list} > multiinter.out - """ -} diff --git a/module/intersect.nf b/module/intersect.nf index 6059ce36..2e99f7a5 100644 --- a/module/intersect.nf +++ b/module/intersect.nf @@ -1,15 +1,5 @@ include { generate_sha512sum } from './common' -include { trim_VCF_BCFtools } from './intersect-processes.nf' include { intersect_VCFs_BCFtools } from './intersect-processes.nf' -include { intersect_VCFs_BEDtools } from './intersect-processes.nf' -include { compress_index_VCF } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams( - options: [ - output_dir: params.workflow_output_dir, - log_output_dir: params.workflow_log_output_dir, - bgzip_extra_args: params.bgzip_extra_args, - tabix_extra_args: params.tabix_extra_args, - is_output_file: false - ]) workflow intersect { take: @@ -17,28 +7,11 @@ workflow intersect { tool_indices main: - trim_VCF_BCFtools( + intersect_VCFs_BCFtools( tool_vcfs, tool_indices, - params.call_region - ) - trim_VCF_BCFtools.out.trimmed_vcf - .map { it -> ["${file(it).getName().split('_')[0]}", it] } - .set { inputs_to_compress } - compress_index_VCF(inputs_to_compress) - trimmed_vcfs = compress_index_VCF.out.index_out - .map{ it -> ["${it[1]}"] } - .collect() - trimmed_indices = compress_index_VCF.out.index_out - .map{ it -> ["${it[2]}"] } - .collect() - intersect_VCFs_BCFtools( - trimmed_vcfs, - trimmed_indices - ) - intersect_VCFs_BEDtools( - trimmed_vcfs, - trimmed_indices + params.call_region, + params.call_region_index ) file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf .flatten() From 97336dfea0792699a896f69a2b87e6eed537db0f Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Wed, 28 Jun 2023 12:42:10 -0700 Subject: [PATCH 07/16] remove compare_vcf.R --- compare-vcf/compare_vcf.R | 237 -------------------------------------- 1 file changed, 237 deletions(-) delete mode 100644 compare-vcf/compare_vcf.R diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R deleted file mode 100644 index 9a7b0ac2..00000000 --- a/compare-vcf/compare_vcf.R +++ /dev/null @@ -1,237 +0,0 @@ -# This is the script to compare the vcf results ################ -# Initial commit: Mao Tian 09-28-2022 -## 1. Setup the environment ######################## -library('argparse'); -library('BoutrosLab.plotting.general'); -library('BoutrosLab.utilities'); -library('VennDiagram'); - -# tmp: for testing -#args <- list(); -#args$path <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/call-sSNV-6.0.0/CPCG0000000196-T001-P01' -#args$valid_algorithm <- 'NULL'; -#args$dataset.id <- 'CPCG'; -#args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/'); - -### 2. Dataset Argument Parser ######################################## -parse.args <- function() { - data.parser <- ArgumentParser(); - data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', required = TRUE); - data.parser$add_argument('-f', '--file', help = 'an input file that lists all the vcf.gz files', default = 'NULL'); - data.parser$add_argument('-v', '--valid_algorithm', help = 'valid algorithms, default are the four call-sSNV algorithms', default = 'NULL'); - data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test'); - data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL'); - ### Set the arg_parase - args <- data.parser$parse_args(); - if ( args$output.dir == 'NULL') { - args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/'); - } - return(args); - }; -### 3. Read Input VCF Lists ################################## -read.inputs <- function(args) { - vcf.list <- list.files( - path = args$path, - pattern = '*snvs.vcf.gz$', - all.files = TRUE, - full.names = TRUE, - recursive = TRUE - ); - # set the default valid algorithm, the default are four algorithms used in call-sSNV - if (args$valid_algorithm == 'NULL') { - algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2'); -# args$valid_algorithm <- algorithm.list; - } else { - algorithm.list <- args$valid_algorithm - }; - # this function is used to extract the algorithm of the vcf.gz file - # need to pre-define the algorithm.list - extract.algorithm <- function(x) { - for (i in 1:length(algorithm.list)) { - if (grepl(algorithm.list[i], x)) { - return(algorithm.list[i]); - }; - }; - }; - vcf.list <- vcf.list[ - grep('intermediate', vcf.list)]; - vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm)); - names(vcf.list) <- vcf.list.name; - vcf.count <- c(); - # count each vcf file variant - for (file in vcf.list) { - count <- system(paste('zcat', file, '| grep -v "^#" | wc -l', sep = ' '), intern = TRUE); - vcf.count <- c(vcf.count, as.numeric(count)); - } - vcf.summary <- data.frame(vcf.list, vcf.list.name, vcf.count); - colnames(vcf.summary) <- c('filename', 'algorithm', 'variant.count'); - # rank vcf by variant.count - vcf.summary <- vcf.summary[order(vcf.summary$variant.count, decreasing = TRUE),]; - vcf.list <- as.character(vcf.summary$filename); - names(vcf.list) <- as.character(vcf.summary$algorithm); - system(paste('mkdir', args$output.dir)); - system(paste('mkdir', paste(args$output.dir, 'intermediate', sep = '/'))); - setwd(args$output.dir); - write.table(vcf.summary, 'vcf_summary.tsv', sep = '\t', row.names = FALSE, quote = FALSE); - return(vcf.list); - }; - -### 4. Run VCF compare with system() ################################## -run.vcf.compare <- function(vcf.list, args) { - docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare'; - vcf.list.text <- paste(vcf.list, collapse = ' '); - vcf.compare.command <- paste(docker.command, vcf.list.text, '> global_comparison.txt', sep = ' '); - futile.logger::flog.info('Running the following command to check VCF overlap:'); - print(vcf.compare.command); - system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE); - system('mv *.txt intermediate/'); -}; - -### 5. Gather VCF compare results ################################## -gather.vcf.compare.result <- function( args ){ - setwd(args$output.dir); - - futile.logger::flog.info('Reading the following file:'); - file <- list.files(path = getwd(), - pattern = 'global_comparison.txt$', - full.names = TRUE, - recursive = TRUE); - system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' ')); - # start read temp_overlap_count.txt by each line - con <- file('temp_overlap_count.txt', "r"); - overlap.count <- c(); - category <- c(); - vcf.list.name <- names(vcf.list); - while ( TRUE ) { - line = readLines(con, n = 1); - if ( length(line) == 0 ) { - break - } - line.content <- unlist(strsplit(line, '\t')); - overlap.count <- c(overlap.count, line.content[1]); - line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content); - if (length(line.content) == 2) { - filename <- line.content[2]; - algorithm <- vcf.list.name[match(filename, vcf.list)]; - } else { - algorithm <- c() - for (i in (2 : length(line.content))){ - filename <- line.content[i]; - algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]); - } - } - category <- c(category, paste(algorithm, collapse = '-')); - } - close(con) - overlap.count <- data.frame(overlap.count, category); - colnames(overlap.count) <- c('count', 'category'); - overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category)))); - overlap.count <- overlap.count[order(overlap.count$algorithm.count),]; - vcf.list.name.level <- as.numeric(factor(vcf.list.name)); - # generate compare group, such as 1, 2, 3, 12, 13, 23 - for (i in 1:nrow(overlap.count)) { - if ( overlap.count$algorithm.count[i] == 1) { - overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name); - } else { - category.list <- unlist(strsplit(overlap.count$category[i], '-')); - category.list.number <- as.character(sapply(category.list, function(x) { - return(match(x, vcf.list.name)) - })); - category.list.number <- category.list.number[order(category.list.number)]; - overlap.count$compare.group[i] <- paste(category.list.number, collapse = ''); - rm(category.list.number); - } - }; - overlap.count <- overlap.count[order(overlap.count$compare.group),]; - futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv'); - - print(overlap.count); - write.table(overlap.count, - 'overlap_count.tsv', - sep = '\t', - row.names = FALSE, - quote = FALSE); - return(overlap.count); - } -### 5. Plot Global Venn Plot ###################################### -plot.quad.venn.plot <- function (overlap.count, vcf.list) { - device <- pdf(file = NULL); - futile.logger::flog.info('Reading vcf_summary.tsv'); - vcf.summary <- read.table('vcf_summary.tsv', sep = '\t', header = TRUE); - if (all(vcf.summary$filename == vcf.list)) { - futile.logger::flog.info('algorithm sequence matched, run variant count check'); - } else { - futile.logger::flog.info('algorithm sequence not matched, reordering'); - vcf.summary <- vcf.summary[match(vcf.list, vcf.summary$filename),]; - if (all(vcf.summary$algorithm == vcf.list)) { - futile.logger::flog.info('reordered, run variant count check'); - } - } - # check if the sum of the overlap.count equals to the summary of vcf - futile.logger::flog.info('check if the sum of the overlap.count equals to the summary of vcf'); - overlap.count$count <- as.numeric(overlap.count$count); - for (algo in vcf.summary$algorithm) { - count.from.summary <- vcf.summary$variant.count[vcf.summary$algorithm == algo]; - count.from.overlap.sum <- sum( - overlap.count$count[grep(algo, overlap.count$category)] - ); - if (count.from.summary == count.from.overlap.sum) { - futile.logger::flog.info(paste(algo, 'Match!', sep = ' ')); - } else { - futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' ')); - } - } - - count <- function(x) { - cnt <- overlap.count$count[match(x, overlap.count$compare.group)]; - if (is.na(cnt)) { - return(0); - } else { - return(cnt); - } - }; - # plot qual.venn.plot - venn.plot <- draw.quad.venn( - area1 = vcf.summary$variant.count[1], - area2 = vcf.summary$variant.count[2], - area3 = vcf.summary$variant.count[3], - area4 = vcf.summary$variant.count[4], - n12 = (count('12') + count('123') + count('124') + count('1234')), - n13 = (count('13') + count('123') + count('134') + count('1234')), - n14 = (count('14') + count('124') + count('134') + count('1234')), - n23 = (count('23') + count('123') + count('234') + count('1234')), - n24 = (count('24') + count('124') + count('234') + count('1234')), - n34 = (count('34') + count('134') + count('234') + count('1234')), - n123 = (count('123') + count('1234')), - n124 = (count('124') + count('1234')), - n134 = (count('134') + count('1234')), - n234 = (count('234') + count('1234')), - n1234 = count('1234'), - category = vcf.summary$algorithm, - fill = c("orange", "red", "green", "blue"), - lty = "dashed", - cex = 1, - cat.cex = 0.8, - cat.col = 'black' - ); - tiff( - filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', 'tiff' ), - compression = "lzw" - ); - grid.rect(x = unit(1, "npc"), y = unit(1, "npc"), - width = unit(5, "npc"), height = unit(5, "npc"), - just = "centre", default.units = "npc", name = NULL, - gp=gpar(), draw = TRUE, vp = NULL - ); - grid.draw(venn.plot); - dev.off(); -} - - - -### Main ########################################################## - -args <- parse.args(); -vcf.list <- read.inputs(args); -run.vcf.compare(vcf.list, args); -overlap.count <- gather.vcf.compare.result(args); -plot.quad.venn.plot(overlap.count, vcf.list); From 5887e59f5cc2655a9ade44d7ad97e636c97d289d Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Wed, 28 Jun 2023 13:50:15 -0700 Subject: [PATCH 08/16] update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4457175b..b493dfbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Add regions filter to variant intersections, limiting output to main chromosome variants +- Add second BCFtools step to create full presence/absence variant table (including private) - Add workflow to create a `consensus.vcf` that includes SNVs found by two or more variant callers - Add `fix_sample_names_VCF`, tumor and normal sample IDs from input BAMs used in output VCFs - Add `split_VCF_bcftools` to `Mutect2` workflow, separating SNVs, MNVs and Indels ### Changed +- Update to BCFtools v1.17 - Update to use sample ID from input BAM files (single tumor/normal BAM input only) - Use BCFtools to compress PASS variants instead of bgzip - Use BCFtools to extract PASS variants instead of awk From a9f6f9e857b9fb5875bb8a5b71981476749c87c0 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Mon, 10 Jul 2023 15:19:59 -0700 Subject: [PATCH 09/16] restructuring call_regions code --- config/methods.config | 13 ++++++++----- config/schema.yaml | 7 +++---- config/template.config | 3 ++- module/intersect.nf | 4 ++-- module/mutect2.nf | 3 +-- module/strelka2-processes.nf | 4 ++-- module/strelka2.nf | 8 ++++---- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/config/methods.config b/config/methods.config index d28e089c..a32c7183 100644 --- a/config/methods.config +++ b/config/methods.config @@ -125,14 +125,16 @@ methods { } } + set_canonical_regions_params { + params.canonical_regions_index = "${params.canonical_regions}.tbi" + } + set_strelka2_params = { - if (params.containsKey("call_region") && params.call_region) { - params.use_call_region = true + if (params.containsKey("use_canonical_regions") && params.use_canonical_regions) { + params.use_canonical_regions = true } else { - // params.call_region = "${params.work_dir}/NO_FILE.bed.gz" - params.use_call_region = false + params.use_canonical_regions = false } - params.call_region_index = "${params.call_region}.tbi" } set_mutect2_params = { @@ -198,6 +200,7 @@ methods { retry.setup_retry() methods.set_env() methods.set_sample_params() + methods.set_canonical_regions_params() methods.set_strelka2_params() methods.set_mutect2_params() methods.set_output_directory() diff --git a/config/schema.yaml b/config/schema.yaml index ca2ded4c..7f01d6b9 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -46,11 +46,10 @@ exome: required: false default: false help: 'The exome option when running manta and strelka2' -call_region: +canonical_regions: type: 'Path' - required: false - default: '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz' - help: 'A call region bed file for strelka2 to save runtime or for a targeted region' + required: true + help: 'canonical regions used by mutect2, strelka2 and intersect' split_intervals_extra_args: type: 'String' required: false diff --git a/config/template.config b/config/template.config index 1b2cd833..b6dcd1e6 100644 --- a/config/template.config +++ b/config/template.config @@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config" params { algorithm = [] // 'somaticsniper', 'strelka2', 'mutect2', 'muse' reference = '' + canonical_regions = '${projectDir}/config/hg38_chromosomes_canonical.bed' output_dir = '' dataset_id = '' // set params.exome to TRUE will add the '--exome' option when running manta and strelka2 @@ -25,7 +26,7 @@ params { tabix_extra_args = '' // strelka2 options - call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz' + use_canonical_regions = true // mutect2 options split_intervals_extra_args = '' diff --git a/module/intersect.nf b/module/intersect.nf index 2e99f7a5..e1694081 100644 --- a/module/intersect.nf +++ b/module/intersect.nf @@ -10,8 +10,8 @@ workflow intersect { intersect_VCFs_BCFtools( tool_vcfs, tool_indices, - params.call_region, - params.call_region_index + params.canonical_regions, + params.canonical_regions_index ) file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf .flatten() diff --git a/module/mutect2.nf b/module/mutect2.nf index bf6d6246..3b03be62 100644 --- a/module/mutect2.nf +++ b/module/mutect2.nf @@ -36,8 +36,7 @@ workflow mutect2 { if (params.intervals) { intervals = params.intervals } else { - intervals = "${projectDir}/config/hg38_chromosomes_canonical.list" - + intervals = params.canonical_regions // process non-canonical chromosome regions seperately // as this region requires more memory than the canonical regions call_sSNVInNonAssembledChromosomes_Mutect2( diff --git a/module/strelka2-processes.nf b/module/strelka2-processes.nf index 304b90e2..74bd4d84 100644 --- a/module/strelka2-processes.nf +++ b/module/strelka2-processes.nf @@ -38,7 +38,7 @@ process call_sIndel_Manta { script: exome = params.exome ? "--exome" : "" - call_region_command = params.use_call_region ? "--callRegions ${call_region}" : "" + call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : "" """ configManta.py \ --normalBam $normal \ @@ -82,7 +82,7 @@ process call_sSNV_Strelka2 { script: exome = params.exome ? "--exome" : "" - call_region_command = params.use_call_region ? "--callRegions ${call_region}" : "" + call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : "" """ set -euo pipefail configureStrelkaSomaticWorkflow.py \ diff --git a/module/strelka2.nf b/module/strelka2.nf index 92cf2d20..c1b3ab9f 100644 --- a/module/strelka2.nf +++ b/module/strelka2.nf @@ -26,8 +26,8 @@ workflow strelka2 { normal_index, params.reference, "${params.reference}.fai", - params.call_region, - params.call_region_index + params.canonical_regions, + params.canonical_regions_index ) call_sSNV_Strelka2( tumor_bam, @@ -37,8 +37,8 @@ workflow strelka2 { params.reference, "${params.reference}.fai", call_sIndel_Manta.out[0], - params.call_region, - params.call_region_index + params.canonical_regions, + params.canonical_regions_index ) filter_VCF_BCFtools(call_sSNV_Strelka2.out.snvs_vcf .mix(call_sSNV_Strelka2.out.indels_vcf)) From efe7030adaf4b22849aaf81c550b12358f6cae5b Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Fri, 14 Jul 2023 15:37:59 -0700 Subject: [PATCH 10/16] reconfigure strelka2 call_regions to general intersect_regions --- CHANGELOG.md | 1 + config/methods.config | 15 +++------------ config/schema.yaml | 8 ++++++-- config/template.config | 4 ++-- main.nf | 8 ++++---- module/intersect-processes.nf | 8 ++++---- module/intersect.nf | 4 ++-- module/mutect2.nf | 2 +- module/strelka2-processes.nf | 4 ++-- module/strelka2.nf | 9 ++++----- test/config/a_mini-all-tools.config | 5 +++-- test/config/a_mini-muse.config | 3 ++- test/config/a_mini-mutect2.config | 3 ++- test/config/a_mini-somaticsniper.config | 3 ++- test/config/a_mini-strelka2.config | 3 ++- 15 files changed, 40 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a80fd738..14420e12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `split_VCF_bcftools` to `Mutect2` workflow, separating SNVs, MNVs and Indels ### Changed +- reconfigure call_regions to intersect_regions - Update to BCFtools v1.17 - Update `MuSE` to `v2.0.2` - Update to use sample ID from input BAM files (single tumor/normal BAM input only) diff --git a/config/methods.config b/config/methods.config index a32c7183..47646663 100644 --- a/config/methods.config +++ b/config/methods.config @@ -125,16 +125,8 @@ methods { } } - set_canonical_regions_params { - params.canonical_regions_index = "${params.canonical_regions}.tbi" - } - - set_strelka2_params = { - if (params.containsKey("use_canonical_regions") && params.use_canonical_regions) { - params.use_canonical_regions = true - } else { - params.use_canonical_regions = false - } + set_intersect_regions_params = { + params.intersect_regions_index = "${params.intersect_regions}.tbi" } set_mutect2_params = { @@ -200,8 +192,7 @@ methods { retry.setup_retry() methods.set_env() methods.set_sample_params() - methods.set_canonical_regions_params() - methods.set_strelka2_params() + methods.set_intersect_regions_params() methods.set_mutect2_params() methods.set_output_directory() methods.set_pipeline_log() diff --git a/config/schema.yaml b/config/schema.yaml index 7f01d6b9..e63ebdea 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -46,10 +46,14 @@ exome: required: false default: false help: 'The exome option when running manta and strelka2' -canonical_regions: +intersect_regions: type: 'Path' required: true - help: 'canonical regions used by mutect2, strelka2 and intersect' + help: 'call regions used by mutect2, strelka2 and intersect' +use_intersect_regions: + type: 'Bool' + required: true + default: true split_intervals_extra_args: type: 'String' required: false diff --git a/config/template.config b/config/template.config index b6dcd1e6..3b7606cf 100644 --- a/config/template.config +++ b/config/template.config @@ -13,7 +13,7 @@ includeConfig "${projectDir}/config/methods.config" params { algorithm = [] // 'somaticsniper', 'strelka2', 'mutect2', 'muse' reference = '' - canonical_regions = '${projectDir}/config/hg38_chromosomes_canonical.bed' + intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz' output_dir = '' dataset_id = '' // set params.exome to TRUE will add the '--exome' option when running manta and strelka2 @@ -26,7 +26,7 @@ params { tabix_extra_args = '' // strelka2 options - use_canonical_regions = true + use_intersect_regions = true // mutect2 options split_intervals_extra_args = '' diff --git a/main.nf b/main.nf index 1cda90bc..a05c0f0c 100755 --- a/main.nf +++ b/main.nf @@ -30,7 +30,7 @@ log.info """\ reference: ${params.reference} reference_index: ${params.reference_index} reference_dict: ${params.reference_dict} - call_region: ${params.call_region} + intersect_region: ${params.intersect_region} - output: output_dir: ${params.output_dir_base} @@ -132,11 +132,11 @@ workflow { file_to_validate = reference_ch .mix (tumor_input.tumor_bam, tumor_input.tumor_index, normal_input.normal_bam, normal_input.normal_index) } - if (params.use_call_region) { + if (params.use_intersect_regions) { file_to_validate = file_to_validate.mix( Channel.from( - params.call_region, - params.call_region_index + params.intersect_regions, + params.intersect_regions_index ) ) } diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf index 7bf8696d..2b8f39a4 100644 --- a/module/intersect-processes.nf +++ b/module/intersect-processes.nf @@ -25,8 +25,8 @@ process intersect_VCFs_BCFtools { input: path vcfs path indices - path call_region - path call_region_index + path intersect_region + path intersect_region_index output: path "*.vcf.gz", emit: consensus_vcf @@ -40,8 +40,8 @@ process intersect_VCFs_BCFtools { vcf_list = vcfs.join(' ') """ set -euo pipefail - bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${call_region} ${vcf_list} + bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${intersect_region} ${vcf_list} awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done - bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${call_region} ${vcf_list} + bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_region} ${vcf_list} """ } diff --git a/module/intersect.nf b/module/intersect.nf index e1694081..cb4c0b9a 100644 --- a/module/intersect.nf +++ b/module/intersect.nf @@ -10,8 +10,8 @@ workflow intersect { intersect_VCFs_BCFtools( tool_vcfs, tool_indices, - params.canonical_regions, - params.canonical_regions_index + params.intersect_regions, + params.intersect_regions_index ) file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf .flatten() diff --git a/module/mutect2.nf b/module/mutect2.nf index 3b03be62..fc97bea0 100644 --- a/module/mutect2.nf +++ b/module/mutect2.nf @@ -36,7 +36,7 @@ workflow mutect2 { if (params.intervals) { intervals = params.intervals } else { - intervals = params.canonical_regions + intervals = "${projectDir}/config/hg38_chromosomes_canonical.bed" // process non-canonical chromosome regions seperately // as this region requires more memory than the canonical regions call_sSNVInNonAssembledChromosomes_Mutect2( diff --git a/module/strelka2-processes.nf b/module/strelka2-processes.nf index 74bd4d84..1ce1291d 100644 --- a/module/strelka2-processes.nf +++ b/module/strelka2-processes.nf @@ -38,7 +38,7 @@ process call_sIndel_Manta { script: exome = params.exome ? "--exome" : "" - call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : "" + call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : "" """ configManta.py \ --normalBam $normal \ @@ -82,7 +82,7 @@ process call_sSNV_Strelka2 { script: exome = params.exome ? "--exome" : "" - call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : "" + call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : "" """ set -euo pipefail configureStrelkaSomaticWorkflow.py \ diff --git a/module/strelka2.nf b/module/strelka2.nf index c1b3ab9f..429dc997 100644 --- a/module/strelka2.nf +++ b/module/strelka2.nf @@ -26,8 +26,8 @@ workflow strelka2 { normal_index, params.reference, "${params.reference}.fai", - params.canonical_regions, - params.canonical_regions_index + params.intersect_regions, + params.intersect_regions_index ) call_sSNV_Strelka2( tumor_bam, @@ -37,8 +37,8 @@ workflow strelka2 { params.reference, "${params.reference}.fai", call_sIndel_Manta.out[0], - params.canonical_regions, - params.canonical_regions_index + params.intersect_regions, + params.intersect_regions_index ) filter_VCF_BCFtools(call_sSNV_Strelka2.out.snvs_vcf .mix(call_sSNV_Strelka2.out.indels_vcf)) @@ -58,4 +58,3 @@ workflow strelka2 { .map{ it -> ["${it[2]}"] } } - diff --git a/test/config/a_mini-all-tools.config b/test/config/a_mini-all-tools.config index 60d71945..46cad5cb 100644 --- a/test/config/a_mini-all-tools.config +++ b/test/config/a_mini-all-tools.config @@ -13,18 +13,19 @@ includeConfig "${projectDir}/config/methods.config" params { algorithm = ['somaticsniper', 'strelka2', 'mutect2', 'muse'] reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta' + intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz' dataset_id = 'TWGSAMIN' // set params.exome to TRUE will add the '--exome' option when running manta and strelka2 // set params.exome to TRUE will add the '-E' option when running MuSE exome = true - save_intermediate_files = true + save_intermediate_files = false // module options bgzip_extra_args = '' tabix_extra_args = '' // strelka2 options - call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz' + use_intersect_regions = true // mutect2 options split_intervals_extra_args = '' diff --git a/test/config/a_mini-muse.config b/test/config/a_mini-muse.config index 08045db7..a8c12850 100644 --- a/test/config/a_mini-muse.config +++ b/test/config/a_mini-muse.config @@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config" params { algorithm = ['muse'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse' reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta' + intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz' dataset_id = 'TWGSAMIN' // set params.exome to TRUE will add the '--exome' option when running manta and strelka2 // set params.exome to TRUE will add the '-E' option when running MuSE @@ -24,7 +25,7 @@ params { tabix_extra_args = '' // strelka2 options - call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz' + use_intersect_regions = true // mutect2 options split_intervals_extra_args = '' diff --git a/test/config/a_mini-mutect2.config b/test/config/a_mini-mutect2.config index 35713744..cd43d7e6 100644 --- a/test/config/a_mini-mutect2.config +++ b/test/config/a_mini-mutect2.config @@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config" params { algorithm = ['mutect2'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse' reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta' + intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz' dataset_id = 'TWGSAMIN' // set params.exome to TRUE will add the '--exome' option when running manta and strelka2 // set params.exome to TRUE will add the '-E' option when running MuSE @@ -24,7 +25,7 @@ params { tabix_extra_args = '' // strelka2 options - call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz' + use_intersect_regions = true // mutect2 options split_intervals_extra_args = '' diff --git a/test/config/a_mini-somaticsniper.config b/test/config/a_mini-somaticsniper.config index 6a8ccd6f..2bc4ae92 100644 --- a/test/config/a_mini-somaticsniper.config +++ b/test/config/a_mini-somaticsniper.config @@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config" params { algorithm = ['somaticsniper'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse' reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta' + intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz' dataset_id = 'TWGSAMIN' // set params.exome to TRUE will add the '--exome' option when running manta and strelka2 // set params.exome to TRUE will add the '-E' option when running MuSE @@ -24,7 +25,7 @@ params { tabix_extra_args = '' // strelka2 options - call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz' + use_intersect_regions = true // mutect2 options split_intervals_extra_args = '' diff --git a/test/config/a_mini-strelka2.config b/test/config/a_mini-strelka2.config index f106759b..5665c987 100644 --- a/test/config/a_mini-strelka2.config +++ b/test/config/a_mini-strelka2.config @@ -14,6 +14,7 @@ params { algorithm = ['strelka2'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse' reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta' dataset_id = 'TWGSAMIN' + intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz' // set params.exome to TRUE will add the '--exome' option when running manta and strelka2 // set params.exome to TRUE will add the '-E' option when running MuSE exome = true @@ -24,7 +25,7 @@ params { tabix_extra_args = '' // strelka2 options - call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz' + use_intersect_regions = true // mutect2 options split_intervals_extra_args = '' From 44e4f74bed958f4be271f90d7e2d019317d87299 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Fri, 14 Jul 2023 17:15:33 -0700 Subject: [PATCH 11/16] remove call regions from isec-2-or-more --- module/intersect-processes.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf index 2b8f39a4..8a4a3f59 100644 --- a/module/intersect-processes.nf +++ b/module/intersect-processes.nf @@ -40,7 +40,7 @@ process intersect_VCFs_BCFtools { vcf_list = vcfs.join(' ') """ set -euo pipefail - bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${intersect_region} ${vcf_list} + bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list} awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_region} ${vcf_list} """ From 4a0e4e6f6bb5d48b50cf02b6661992d1b41e6805 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Mon, 17 Jul 2023 11:26:45 -0700 Subject: [PATCH 12/16] call_regions to intersect_regions within strelka2-processes --- module/strelka2-processes.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/module/strelka2-processes.nf b/module/strelka2-processes.nf index 1ce1291d..00ec17d3 100644 --- a/module/strelka2-processes.nf +++ b/module/strelka2-processes.nf @@ -27,8 +27,8 @@ process call_sIndel_Manta { path normal_index path reference path reference_index - path call_region - path call_region_index + path intersect_regions + path intersect_regions_index output: tuple path("MantaWorkflow/results/variants/candidateSmallIndels.vcf.gz"), @@ -38,7 +38,7 @@ process call_sIndel_Manta { script: exome = params.exome ? "--exome" : "" - call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : "" + call_region_command = params.use_intersect_regions ? "--callRegions ${intersect_regions}" : "" """ configManta.py \ --normalBam $normal \ @@ -71,8 +71,8 @@ process call_sSNV_Strelka2 { path reference path reference_index tuple path(indel_candidates), path(indel_candidates_index) - path call_region - path call_region_index + path intersect_regions + path intersect_regions_index output: tuple val("SNV"), path("StrelkaSomaticWorkflow/results/variants/somatic.snvs.vcf.gz"), emit: snvs_vcf @@ -82,7 +82,7 @@ process call_sSNV_Strelka2 { script: exome = params.exome ? "--exome" : "" - call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : "" + call_region_command = params.use_intersect_regions ? "--callRegions ${intersect_regions}" : "" """ set -euo pipefail configureStrelkaSomaticWorkflow.py \ From e1775fb471c46e46f0c5173614c41e094abb77d8 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Tue, 18 Jul 2023 09:51:46 -0700 Subject: [PATCH 13/16] consistent plural intersect_regions --- main.nf | 2 +- module/intersect-processes.nf | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index a05c0f0c..5b536791 100755 --- a/main.nf +++ b/main.nf @@ -30,7 +30,7 @@ log.info """\ reference: ${params.reference} reference_index: ${params.reference_index} reference_dict: ${params.reference_dict} - intersect_region: ${params.intersect_region} + intersect_regions: ${params.intersect_regions} - output: output_dir: ${params.output_dir_base} diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf index 8a4a3f59..25a8b10d 100644 --- a/module/intersect-processes.nf +++ b/module/intersect-processes.nf @@ -25,8 +25,8 @@ process intersect_VCFs_BCFtools { input: path vcfs path indices - path intersect_region - path intersect_region_index + path intersect_regions + path intersect_regions_index output: path "*.vcf.gz", emit: consensus_vcf @@ -42,6 +42,6 @@ process intersect_VCFs_BCFtools { set -euo pipefail bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list} awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done - bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_region} ${vcf_list} + bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_regions} ${vcf_list} """ } From 365956cc3310a27d1e286ad5d0119778c42615c4 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Tue, 18 Jul 2023 10:07:08 -0700 Subject: [PATCH 14/16] add comments to bcftools isec script --- module/intersect-processes.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf index 25a8b10d..257d1a2a 100644 --- a/module/intersect-processes.nf +++ b/module/intersect-processes.nf @@ -40,8 +40,11 @@ process intersect_VCFs_BCFtools { vcf_list = vcfs.join(' ') """ set -euo pipefail + # intersect keeping only variants that are present in at least 2 VCFs + # Use README.txt to rename output files to include sample names bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list} awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done + # intersect, keeping all variants, to create presence/absence list of variants in each VCF bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_regions} ${vcf_list} """ } From e2fe7bb049a6e035316c216750352034fb9a4d05 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Tue, 18 Jul 2023 10:17:44 -0700 Subject: [PATCH 15/16] edit interect_regions help --- config/schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/schema.yaml b/config/schema.yaml index e63ebdea..6b61e0fc 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -49,7 +49,7 @@ exome: intersect_regions: type: 'Path' required: true - help: 'call regions used by mutect2, strelka2 and intersect' + help: 'call regions bed file used by mutect2, strelka2 and intersect' use_intersect_regions: type: 'Bool' required: true From a0c532a087ca01e025ec36c76307868477249428 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Thu, 20 Jul 2023 12:50:32 -0700 Subject: [PATCH 16/16] always validate intersect_regions files --- main.nf | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 5b536791..04b22cd5 100755 --- a/main.nf +++ b/main.nf @@ -132,14 +132,12 @@ workflow { file_to_validate = reference_ch .mix (tumor_input.tumor_bam, tumor_input.tumor_index, normal_input.normal_bam, normal_input.normal_index) } - if (params.use_intersect_regions) { - file_to_validate = file_to_validate.mix( - Channel.from( - params.intersect_regions, - params.intersect_regions_index - ) + file_to_validate = file_to_validate.mix( + Channel.from( + params.intersect_regions, + params.intersect_regions_index ) - } + ) run_validate_PipeVal(file_to_validate) run_validate_PipeVal.out.validation_result.collectFile( name: 'input_validation.txt', newLine: true,