From 6fe23ee794dce1fe190b7f579e49149b1694c8f0 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Thu, 22 Jun 2023 12:11:32 -0700
Subject: [PATCH 01/16] @maotian06 original code

---
 compare-vcf/compare_vcf.R | 293 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 293 insertions(+)
 create mode 100644 compare-vcf/compare_vcf.R

diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R
new file mode 100644
index 00000000..d9010acb
--- /dev/null
+++ b/compare-vcf/compare_vcf.R
@@ -0,0 +1,293 @@
+# This is the script to compare the vcf results ################
+# Initial commit: Mao Tian 09-28-2022
+## 1. Setup the environment ########################
+library('argparse');
+library('BoutrosLab.plotting.general');
+library('BoutrosLab.utilities');
+library('VennDiagram');
+
+### 2. Dataset Argument Parser ########################################
+parse.args <- function() {
+    data.parser <- ArgumentParser();
+    data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', default = 'NULL');
+    data.parser$add_argument('-f', '--file', help = 'an input file that lists all the vcf.gz files', default = 'NULL');
+    data.parser$add_argument('-v', '--valid_algorithm', help = 'valid algorithms, default are the four call-sSNV algorithms', default = 'NULL');
+    data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test');
+    data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL');
+    data.parser$add_argument('--pairwise_comparsion', help = 'the option to do pairwise comparsion, deflaut is FALSE.', default = 'FALSE');
+    data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'Y');
+    ### Set the arg_parase
+    args <- data.parser$parse_args();
+    if ( args$output.dir == 'NULL') {
+        args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/');
+    }
+    return(args);
+    };
+### 3. Read Input VCF Lists ##################################
+read.inputs <- function(args) {
+    if (args$path != 'NULL') {
+        vcf.list <- list.files(path = args$path,
+                                    pattern = '*.vcf.gz$',
+                                    all.files = TRUE,
+                                    full.names = TRUE,
+                                    recursive = TRUE);
+        } else if (args$file != 'NULL') {
+            vcf.list <- as.character(read.table(args$file, header = FALSE));
+        } else {
+            print( 'No input file or input directory!');
+        };
+    # set the default valid algorithm, the default are four algorithms used in call-sSNV
+    if (args$valid_algorithm == 'NULL') {
+        algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2');
+        args$valid_algorithm <- algorithm.list;
+        } else {
+            algorithm.list <- args$valid_algorithm
+        };
+    # this function is used to extract the algorithm of the vcf.gz file
+    # need to pre-define the algorithm.list
+    extract.algorithm <- function(x) {
+        for (i in 1:length(algorithm.list)) {
+            if (grepl(algorithm.list[i], x)) {
+                return(algorithm.list[i]);
+            };
+        };
+    };
+    # Strelka2 has an output with indel variants, removed it here
+    vcf.list <- vcf.list[ - grep('indel', vcf.list)];
+    vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm));
+    names(vcf.list) <- vcf.list.name;
+    vcf.count <- c();
+    # count each vcf file variant
+    for (file in vcf.list) {
+        count <- system(paste('zcat', file, '| grep -v "^#" | wc -l', sep = ' '), intern = TRUE);
+        vcf.count <- c(vcf.count, count);
+    }
+    vcf.summary <- data.frame(vcf.list, vcf.list.name, vcf.count);
+    colnames(vcf.summary) <- c('filename', 'algorithm', 'variant.count');
+    # rank vcf by variant.count
+    vcf.summary <- vcf.summary[order(vcf.summary$variant.count, decreasing = TRUE),];
+    vcf.list <- as.character(vcf.summary$filename);
+    names(vcf.list) <- as.character(vcf.summary$algorithm);
+    system(paste('mkdir', args$output.dir));
+    system(paste('mkdir', paste(args$output.dir, 'intermediate', sep = '/')));
+    setwd(args$output.dir);
+    write.table(vcf.summary, 'vcf_summary.tsv', sep = '\t', row.names = FALSE, quote = FALSE);
+    return(vcf.list);
+    };
+
+### 4. Run VCF check with system() ##################################
+run.vcf.check <- function(vcf.list, args) {
+    docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare';
+    if (args$global_comparsion == 'TRUE') {
+        vcf.list.text <- paste(vcf.list, collapse = ' ');
+        vcf.check.command <- paste(docker.command,  vcf.list.text, '> global_comparison.txt', sep = ' ');
+        futile.logger::flog.info('Running the following command to check VCF overlap:');
+        print(vcf.check.command);
+        system(vcf.check.command, intern = TRUE, ignore.stderr = TRUE);
+
+    };
+    if (args$pairwise_comparsion == 'TRUE') {
+        for (i in 1:(length(vcf.list) - 1)) {
+            for (j in (i + 1) : length(vcf.list)) {
+                file1 <- vcf.list[i];
+                file2 <- vcf.list[j];
+                name <- paste(names(vcf.list)[i], names(vcf.list)[j], 'pairwise', sep = '-' );
+                system(paste('bash vcf_check.sh', file1, file2, name, args$output.dir, sep = ' '), intern = TRUE, ignore.stderr = TRUE);
+            }
+        };
+    };
+    system('mv *.txt intermediate/');
+};
+
+### 5. Gather VCF check results ##################################
+gather.vcf.compare.result <- function( args ){
+    setwd(args$output.dir);
+
+    if (args$global_comparsion == 'TRUE') {
+        futile.logger::flog.info('Reading the following file:');
+        file <- list.files(path = getwd(),
+                                    pattern = 'global_comparison.txt$',
+                                    full.names = TRUE,
+                                    recursive = TRUE);
+        system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' '));
+        # start read temp_overlap_count.txt by each line
+        con <- file('temp_overlap_count.txt', "r");
+        overlap.count <- c();
+        category <- c();
+        vcf.list.name <- names(vcf.list);
+        while ( TRUE ) {
+            line = readLines(con, n = 1);
+            if ( length(line) == 0 ) {
+            break
+            }
+            line.content <- unlist(strsplit(line, '\t'));
+            overlap.count <- c(overlap.count, line.content[1]);
+            line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content);
+            if (length(line.content) == 2) {
+                filename <- line.content[2];
+                algorithm <- vcf.list.name[match(filename, vcf.list)];
+            } else {
+                algorithm <- c()
+                for (i in (2 : length(line.content))){
+                    filename <- line.content[i];
+                    algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]);
+                }
+            }
+            category <- c(category, paste(algorithm, collapse = '-'));
+        }
+        close(con)
+        overlap.count <- data.frame(overlap.count, category);
+        colnames(overlap.count) <- c('count', 'category');
+        overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category))));
+        overlap.count <- overlap.count[order(overlap.count$algorithm.count),];
+        vcf.list.name.level <- as.numeric(factor(vcf.list.name));
+        # generate compare group, such as 1, 2, 3, 12, 13, 23
+        for (i in 1:nrow(overlap.count)) {
+            if ( overlap.count$algorithm.count[i] == 1) {
+                overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name);
+            } else {
+                category.list <- unlist(strsplit(overlap.count$category[i], '-'));
+                category.list.number <- as.character(sapply(category.list, function(x) {
+                    return(match(x, vcf.list.name))
+                    }));
+                category.list.number <- category.list.number[order(category.list.number)];
+                overlap.count$compare.group[i] <- paste(category.list.number, collapse = '');
+                rm(category.list.number);
+            }
+        };
+        overlap.count <- overlap.count[order(overlap.count$compare.group),];
+        futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv');
+
+        print(overlap.count);
+        write.table(overlap.count,
+                    'overlap_count.tsv',
+                    sep = '\t',
+                    row.names = FALSE,
+                    quote = FALSE);
+        return(overlap.count);
+        }
+    if (args$pairwise_comparsion == 'Y') {
+        file.list <- list.files(path = getwd(),
+                                pattern = 'intermediate/*_vcf_comparsion.txt$',
+                                full.names = TRUE);
+        for (i in 1:length(file.list)) {
+            futile.logger::flog.info('Reading the following file:');
+            system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' '));
+            overlap.df <- read.table('temp_overlap_count.txt', header = FALSE, allowEscapes = FALSE);
+            system(paste('cat', file.list[i], '| grep "^VN" | cut -f 3 | head -2 >filename.txt', sep = ' '));
+            filename.df <- read.table('filename.txt', header = FALSE, sep = ' ');
+            algo1 <- unlist(strsplit(basename(filename.df$V1[1]), '-'))[1];
+            algo2 <- unlist(strsplit(basename(filename.df$V1[2]), '-'))[1];
+            system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2 | head -3 >temp_number.txt', sep = ' '));
+            number.df <- read.table('temp_number.txt', header = FALSE);
+            number.df$V1 <- as.numeric(number.df$V1);
+            cross.area <- number.df$V1[3];
+            area1 <- number.df$V1[1] + cross.area;
+            area2 <- number.df$V1[2] + cross.area;
+            venn.plot <- draw.pairwise.venn(
+                            area1, area2, cross.area,
+                            category = c(algo1, algo2),
+                            cat.pos = c(0, 180),
+                            euler.d = TRUE,
+                            sep.dist = 0.03,
+                            rotation.degree = 45
+                        );
+            tiff(filename = paste(algo1, algo2, 'Pairwise-Venn-diagram.tiff', sep = '_'),
+                    compression = "lzw");
+            grid.draw(venn.plot);
+            dev.off();
+            };
+        };
+    }
+### 5. Plot Global Venn Plot ######################################
+plot.quad.venn.plot <- function (overlap.count, vcf.list) {
+    device <- pdf(file = NULL);
+    futile.logger::flog.info('Reading vcf_summary.tsv');
+    vcf.summary <- read.table('vcf_summary.tsv', sep = '\t', header = TRUE);
+    if (all(vcf.summary$filename == vcf.list)) {
+        futile.logger::flog.info('algorithm sequence matched, run variant count check');
+    } else {
+        futile.logger::flog.info('algorithm sequence not matched, reordering');
+        vcf.summary <- vcf.summary[match(vcf.list, vcf.summary$filename),];
+        if (all(vcf.summary$algorithm == vcf.list)) {
+        futile.logger::flog.info('reordered, run variant count check');
+        }
+    }
+    # check if the sum of the overlap.count equals to the summary of vcf
+    futile.logger::flog.info('check if the sum of the overlap.count equals to the summary of vcf');
+    overlap.count$count <- as.numeric(overlap.count$count);
+    for (algo in vcf.summary$algorithm) {
+        count.from.summary <- vcf.summary$variant.count[vcf.summary$algorithm == algo];
+        count.from.overlap.sum <- sum(
+            overlap.count$count[grep(algo, overlap.count$category)]
+            );
+    if (count.from.summary == count.from.overlap.sum) {
+        futile.logger::flog.info(paste(algo, 'Match!', sep = ' '));
+    } else {
+        futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' '));
+    }
+    }
+    u1 = vcf.summary$variant.count[1];
+    u2 = vcf.summary$variant.count[2];
+    u3 = vcf.summary$variant.count[3];
+    u4 = vcf.summary$variant.count[4];
+    u12 = overlap.count$count[match('12', overlap.count$compare.group)];
+    u13 = overlap.count$count[match('13', overlap.count$compare.group)];
+    u14 = overlap.count$count[match('14', overlap.count$compare.group)];
+    u23 = overlap.count$count[match('23', overlap.count$compare.group)];
+    u24 = overlap.count$count[match('24', overlap.count$compare.group)];
+    u34 = overlap.count$count[match('34', overlap.count$compare.group)];
+    u123 = overlap.count$count[match('123', overlap.count$compare.group)];
+    u124 = overlap.count$count[match('124', overlap.count$compare.group)];
+    u134 = overlap.count$count[match('134', overlap.count$compare.group)];
+    u234 = overlap.count$count[match('234', overlap.count$compare.group)];
+    u1234 = overlap.count$count[match('1234', overlap.count$compare.group)];
+    area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14,  u124, u123, u23, u2, u12)
+    print(area.vector);
+    # plot qual.venn.plot
+    venn.plot <- draw.quad.venn(
+        area1 = vcf.summary$variant.count[1],
+        area2 = vcf.summary$variant.count[2],
+        area3 = vcf.summary$variant.count[3],
+        area4 = vcf.summary$variant.count[4],
+        n12 = (u12 + u123 + u124 + u1234),
+        n13 = (u13 + u123 + u134 + u1234),
+        n14 = (u14 + u124 + u134 + u1234),
+        n23 = (u23 + u123 + u234 + u1234),
+        n24 = (u24 + u124 + u234 + u1234),
+        n34 = (u34 + u134 + u234 + u1234),
+        n123 = (u123 + u1234),
+        n124 = (u124 + u1234),
+        n134 = (u134 + u1234),
+        n234 = (u234 + u1234),
+        n1234 = u1234,
+        category = vcf.summary$algorithm,
+        fill = c("orange", "red", "green", "blue"),
+        lty = "dashed",
+        cex = 1,
+        cat.cex = 0.8,
+        cat.col = 'black');
+    tiff(
+        filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', '.tiff' ),
+        compression = "lzw" );
+    grid.rect(x = unit(1, "npc"), y = unit(1, "npc"),
+          width = unit(5, "npc"), height = unit(5, "npc"),
+          just = "centre", default.units = "npc", name = NULL,
+          gp=gpar(), draw = TRUE, vp = NULL);
+    grid.draw(venn.plot);
+    dev.off();
+}
+
+
+
+### Main ##########################################################
+
+args <- parse.args();
+vcf.list <- read.inputs(args);
+run.vcf.check(vcf.list, args);
+if (args$global_comparsion == 'Y') {
+    overlap.count <- gather.vcf.compare.result(args);
+} else {
+    gather.vcf.compare.result(args);
+};
+plot.quad.venn.plot(overlap.count, vcf.list);

From b4a7b203d9350db6b4dda6dc043dcc390c08b388 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Thu, 22 Jun 2023 14:25:44 -0700
Subject: [PATCH 02/16] working locally, before paring down

---
 compare-vcf/compare_vcf.R | 127 ++++++++++++++++++++++++--------------
 1 file changed, 79 insertions(+), 48 deletions(-)

diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R
index d9010acb..c56f04b8 100644
--- a/compare-vcf/compare_vcf.R
+++ b/compare-vcf/compare_vcf.R
@@ -6,6 +6,18 @@ library('BoutrosLab.plotting.general');
 library('BoutrosLab.utilities');
 library('VennDiagram');
 
+# tmp: for testing
+#args <- list();
+##args$path <- 'NULL';
+##args$file <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/maotian06-compare-vcf/input-vcfs.txt';
+#args$path <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/call-sSNV-6.0.0/CPCG0000000196-T001-P01'
+#args$file <- 'NULL';
+#args$valid_algorithm <- 'NULL';
+#args$dataset.id <- 'CPCG';
+#args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/');
+#args$pairwise_comparsion <- 'FALSE';
+#args$global_comparsion <- 'TRUE';
+
 ### 2. Dataset Argument Parser ########################################
 parse.args <- function() {
     data.parser <- ArgumentParser();
@@ -15,7 +27,7 @@ parse.args <- function() {
     data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test');
     data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL');
     data.parser$add_argument('--pairwise_comparsion', help = 'the option to do pairwise comparsion, deflaut is FALSE.', default = 'FALSE');
-    data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'Y');
+    data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'TRUE');
     ### Set the arg_parase
     args <- data.parser$parse_args();
     if ( args$output.dir == 'NULL') {
@@ -27,19 +39,19 @@ parse.args <- function() {
 read.inputs <- function(args) {
     if (args$path != 'NULL') {
         vcf.list <- list.files(path = args$path,
-                                    pattern = '*.vcf.gz$',
+                                    pattern = '*snvs.vcf.gz$',
                                     all.files = TRUE,
                                     full.names = TRUE,
                                     recursive = TRUE);
         } else if (args$file != 'NULL') {
-            vcf.list <- as.character(read.table(args$file, header = FALSE));
+            vcf.list <- as.list(t(read.table(args$file, header = FALSE)));
         } else {
             print( 'No input file or input directory!');
         };
     # set the default valid algorithm, the default are four algorithms used in call-sSNV
     if (args$valid_algorithm == 'NULL') {
         algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2');
-        args$valid_algorithm <- algorithm.list;
+#        args$valid_algorithm <- algorithm.list;
         } else {
             algorithm.list <- args$valid_algorithm
         };
@@ -53,14 +65,15 @@ read.inputs <- function(args) {
         };
     };
     # Strelka2 has an output with indel variants, removed it here
-    vcf.list <- vcf.list[ - grep('indel', vcf.list)];
+#    vcf.list <- vcf.list[ - grep('indel', vcf.list)];
+    vcf.list <- vcf.list[ - grep('intermediate', vcf.list)];
     vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm));
     names(vcf.list) <- vcf.list.name;
     vcf.count <- c();
     # count each vcf file variant
     for (file in vcf.list) {
         count <- system(paste('zcat', file, '| grep -v "^#" | wc -l', sep = ' '), intern = TRUE);
-        vcf.count <- c(vcf.count, count);
+        vcf.count <- c(vcf.count, as.numeric(count));
     }
     vcf.summary <- data.frame(vcf.list, vcf.list.name, vcf.count);
     colnames(vcf.summary) <- c('filename', 'algorithm', 'variant.count');
@@ -75,15 +88,15 @@ read.inputs <- function(args) {
     return(vcf.list);
     };
 
-### 4. Run VCF check with system() ##################################
-run.vcf.check <- function(vcf.list, args) {
+### 4. Run VCF compare with system() ##################################
+run.vcf.compare <- function(vcf.list, args) {
     docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare';
     if (args$global_comparsion == 'TRUE') {
         vcf.list.text <- paste(vcf.list, collapse = ' ');
-        vcf.check.command <- paste(docker.command,  vcf.list.text, '> global_comparison.txt', sep = ' ');
+        vcf.compare.command <- paste(docker.command,  vcf.list.text, '> global_comparison.txt', sep = ' ');
         futile.logger::flog.info('Running the following command to check VCF overlap:');
-        print(vcf.check.command);
-        system(vcf.check.command, intern = TRUE, ignore.stderr = TRUE);
+        print(vcf.compare.command);
+        system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE);
 
     };
     if (args$pairwise_comparsion == 'TRUE') {
@@ -99,7 +112,7 @@ run.vcf.check <- function(vcf.list, args) {
     system('mv *.txt intermediate/');
 };
 
-### 5. Gather VCF check results ##################################
+### 5. Gather VCF compare results ##################################
 gather.vcf.compare.result <- function( args ){
     setwd(args$output.dir);
 
@@ -166,7 +179,7 @@ gather.vcf.compare.result <- function( args ){
                     quote = FALSE);
         return(overlap.count);
         }
-    if (args$pairwise_comparsion == 'Y') {
+    if (args$pairwise_comparsion == 'TRUE') {
         file.list <- list.files(path = getwd(),
                                 pattern = 'intermediate/*_vcf_comparsion.txt$',
                                 full.names = TRUE);
@@ -221,46 +234,64 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) {
         count.from.overlap.sum <- sum(
             overlap.count$count[grep(algo, overlap.count$category)]
             );
-    if (count.from.summary == count.from.overlap.sum) {
-        futile.logger::flog.info(paste(algo, 'Match!', sep = ' '));
-    } else {
-        futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' '));
-    }
+        if (count.from.summary == count.from.overlap.sum) {
+            futile.logger::flog.info(paste(algo, 'Match!', sep = ' '));
+        } else {
+            futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' '));
+        }
     }
-    u1 = vcf.summary$variant.count[1];
-    u2 = vcf.summary$variant.count[2];
-    u3 = vcf.summary$variant.count[3];
-    u4 = vcf.summary$variant.count[4];
-    u12 = overlap.count$count[match('12', overlap.count$compare.group)];
-    u13 = overlap.count$count[match('13', overlap.count$compare.group)];
-    u14 = overlap.count$count[match('14', overlap.count$compare.group)];
-    u23 = overlap.count$count[match('23', overlap.count$compare.group)];
-    u24 = overlap.count$count[match('24', overlap.count$compare.group)];
-    u34 = overlap.count$count[match('34', overlap.count$compare.group)];
-    u123 = overlap.count$count[match('123', overlap.count$compare.group)];
-    u124 = overlap.count$count[match('124', overlap.count$compare.group)];
-    u134 = overlap.count$count[match('134', overlap.count$compare.group)];
-    u234 = overlap.count$count[match('234', overlap.count$compare.group)];
-    u1234 = overlap.count$count[match('1234', overlap.count$compare.group)];
-    area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14,  u124, u123, u23, u2, u12)
-    print(area.vector);
+#    u1 = vcf.summary$variant.count[1];
+#    u2 = vcf.summary$variant.count[2];
+#    u3 = vcf.summary$variant.count[3];
+#    u4 = vcf.summary$variant.count[4];
+#
+#    combination2 <- combn(1:4, 2);
+#    combination3 <- combn(1:4, 3);
+#    combination4 <- combn(1:4, 4);
+#    comb.list <- c(
+#        apply(combination2, 2, paste0, collapse=""),
+#        apply(combination3, 2, paste0, collapse=""),
+#        apply(combination4, 2, paste0, collapse="")
+#        );
+
+    count <- function(x) {
+        cnt <- overlap.count$count[match(x, overlap.count$compare.group)];
+        if (is.na(cnt)) {
+            return(0);
+        } else {
+            return(cnt);
+        }
+    };
+#    u12 = overlap.count$count[match('12', overlap.count$compare.group)];
+#    u13 = overlap.count$count[match('13', overlap.count$compare.group)];
+#    u14 = overlap.count$count[match('14', overlap.count$compare.group)];
+#    u23 = overlap.count$count[match('23', overlap.count$compare.group)];
+#    u24 = overlap.count$count[match('24', overlap.count$compare.group)];
+#    u34 = overlap.count$count[match('34', overlap.count$compare.group)];
+#    u123 = overlap.count$count[match('123', overlap.count$compare.group)];
+#    u124 = overlap.count$count[match('124', overlap.count$compare.group)];
+#    u134 = overlap.count$count[match('134', overlap.count$compare.group)];
+#    u234 = overlap.count$count[match('234', overlap.count$compare.group)];
+#    u1234 = overlap.count$count[match('1234', overlap.count$compare.group)];
+#    area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14,  u124, u123, u23, u2, u12);
+#    print(area.vector);
     # plot qual.venn.plot
     venn.plot <- draw.quad.venn(
         area1 = vcf.summary$variant.count[1],
         area2 = vcf.summary$variant.count[2],
         area3 = vcf.summary$variant.count[3],
         area4 = vcf.summary$variant.count[4],
-        n12 = (u12 + u123 + u124 + u1234),
-        n13 = (u13 + u123 + u134 + u1234),
-        n14 = (u14 + u124 + u134 + u1234),
-        n23 = (u23 + u123 + u234 + u1234),
-        n24 = (u24 + u124 + u234 + u1234),
-        n34 = (u34 + u134 + u234 + u1234),
-        n123 = (u123 + u1234),
-        n124 = (u124 + u1234),
-        n134 = (u134 + u1234),
-        n234 = (u234 + u1234),
-        n1234 = u1234,
+        n12 = (count('12') + count('123') + count('124') + count('1234')),
+        n13 = (count('13') + count('123') + count('134') + count('1234')),
+        n14 = (count('14') + count('124') + count('134') + count('1234')),
+        n23 = (count('23') + count('123') + count('234') + count('1234')),
+        n24 = (count('24') + count('124') + count('234') + count('1234')),
+        n34 = (count('34') + count('134') + count('234') + count('1234')),
+        n123 = (count('123') + count('1234')),
+        n124 = (count('124') + count('1234')),
+        n134 = (count('134') + count('1234')),
+        n234 = (count('234') + count('1234')),
+        n1234 = count('1234'),
         category = vcf.summary$algorithm,
         fill = c("orange", "red", "green", "blue"),
         lty = "dashed",
@@ -284,8 +315,8 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) {
 
 args <- parse.args();
 vcf.list <- read.inputs(args);
-run.vcf.check(vcf.list, args);
-if (args$global_comparsion == 'Y') {
+run.vcf.compare(vcf.list, args);
+if (args$global_comparsion == 'TRUE') {
     overlap.count <- gather.vcf.compare.result(args);
 } else {
     gather.vcf.compare.result(args);

From 4baa1fd4926998f04540d31a43bf5674e98ccd0b Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Thu, 22 Jun 2023 15:00:07 -0700
Subject: [PATCH 03/16] only supports directory input and global comparison

---
 compare-vcf/compare_vcf.R | 247 ++++++++++++--------------------------
 1 file changed, 80 insertions(+), 167 deletions(-)

diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R
index c56f04b8..7ab5f46a 100644
--- a/compare-vcf/compare_vcf.R
+++ b/compare-vcf/compare_vcf.R
@@ -8,26 +8,19 @@ library('VennDiagram');
 
 # tmp: for testing
 #args <- list();
-##args$path <- 'NULL';
-##args$file <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/maotian06-compare-vcf/input-vcfs.txt';
 #args$path <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/call-sSNV-6.0.0/CPCG0000000196-T001-P01'
-#args$file <- 'NULL';
 #args$valid_algorithm <- 'NULL';
 #args$dataset.id <- 'CPCG';
 #args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/');
-#args$pairwise_comparsion <- 'FALSE';
-#args$global_comparsion <- 'TRUE';
 
 ### 2. Dataset Argument Parser ########################################
 parse.args <- function() {
     data.parser <- ArgumentParser();
-    data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', default = 'NULL');
+    data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', required = TRUE);
     data.parser$add_argument('-f', '--file', help = 'an input file that lists all the vcf.gz files', default = 'NULL');
     data.parser$add_argument('-v', '--valid_algorithm', help = 'valid algorithms, default are the four call-sSNV algorithms', default = 'NULL');
     data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test');
     data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL');
-    data.parser$add_argument('--pairwise_comparsion', help = 'the option to do pairwise comparsion, deflaut is FALSE.', default = 'FALSE');
-    data.parser$add_argument('--global_comparsion', help = 'the output directory, default is current working directory, deflaut is TRUE', default = 'TRUE');
     ### Set the arg_parase
     args <- data.parser$parse_args();
     if ( args$output.dir == 'NULL') {
@@ -37,17 +30,13 @@ parse.args <- function() {
     };
 ### 3. Read Input VCF Lists ##################################
 read.inputs <- function(args) {
-    if (args$path != 'NULL') {
-        vcf.list <- list.files(path = args$path,
-                                    pattern = '*snvs.vcf.gz$',
-                                    all.files = TRUE,
-                                    full.names = TRUE,
-                                    recursive = TRUE);
-        } else if (args$file != 'NULL') {
-            vcf.list <- as.list(t(read.table(args$file, header = FALSE)));
-        } else {
-            print( 'No input file or input directory!');
-        };
+    vcf.list <- list.files(
+        path = args$path,
+        pattern = '*snvs.vcf.gz$',
+        all.files = TRUE,
+        full.names = TRUE,
+        recursive = TRUE
+        );
     # set the default valid algorithm, the default are four algorithms used in call-sSNV
     if (args$valid_algorithm == 'NULL') {
         algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2');
@@ -64,8 +53,6 @@ read.inputs <- function(args) {
             };
         };
     };
-    # Strelka2 has an output with indel variants, removed it here
-#    vcf.list <- vcf.list[ - grep('indel', vcf.list)];
     vcf.list <- vcf.list[ - grep('intermediate', vcf.list)];
     vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm));
     names(vcf.list) <- vcf.list.name;
@@ -91,24 +78,11 @@ read.inputs <- function(args) {
 ### 4. Run VCF compare with system() ##################################
 run.vcf.compare <- function(vcf.list, args) {
     docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare';
-    if (args$global_comparsion == 'TRUE') {
-        vcf.list.text <- paste(vcf.list, collapse = ' ');
-        vcf.compare.command <- paste(docker.command,  vcf.list.text, '> global_comparison.txt', sep = ' ');
-        futile.logger::flog.info('Running the following command to check VCF overlap:');
-        print(vcf.compare.command);
-        system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE);
-
-    };
-    if (args$pairwise_comparsion == 'TRUE') {
-        for (i in 1:(length(vcf.list) - 1)) {
-            for (j in (i + 1) : length(vcf.list)) {
-                file1 <- vcf.list[i];
-                file2 <- vcf.list[j];
-                name <- paste(names(vcf.list)[i], names(vcf.list)[j], 'pairwise', sep = '-' );
-                system(paste('bash vcf_check.sh', file1, file2, name, args$output.dir, sep = ' '), intern = TRUE, ignore.stderr = TRUE);
-            }
-        };
-    };
+    vcf.list.text <- paste(vcf.list, collapse = ' ');
+    vcf.compare.command <- paste(docker.command,  vcf.list.text, '> global_comparison.txt', sep = ' ');
+    futile.logger::flog.info('Running the following command to check VCF overlap:');
+    print(vcf.compare.command);
+    system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE);
     system('mv *.txt intermediate/');
 };
 
@@ -116,101 +90,67 @@ run.vcf.compare <- function(vcf.list, args) {
 gather.vcf.compare.result <- function( args ){
     setwd(args$output.dir);
 
-    if (args$global_comparsion == 'TRUE') {
-        futile.logger::flog.info('Reading the following file:');
-        file <- list.files(path = getwd(),
-                                    pattern = 'global_comparison.txt$',
-                                    full.names = TRUE,
-                                    recursive = TRUE);
-        system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' '));
-        # start read temp_overlap_count.txt by each line
-        con <- file('temp_overlap_count.txt', "r");
-        overlap.count <- c();
-        category <- c();
-        vcf.list.name <- names(vcf.list);
-        while ( TRUE ) {
-            line = readLines(con, n = 1);
-            if ( length(line) == 0 ) {
-            break
-            }
-            line.content <- unlist(strsplit(line, '\t'));
-            overlap.count <- c(overlap.count, line.content[1]);
-            line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content);
-            if (length(line.content) == 2) {
-                filename <- line.content[2];
-                algorithm <- vcf.list.name[match(filename, vcf.list)];
-            } else {
-                algorithm <- c()
-                for (i in (2 : length(line.content))){
-                    filename <- line.content[i];
-                    algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]);
-                }
-            }
-            category <- c(category, paste(algorithm, collapse = '-'));
+    futile.logger::flog.info('Reading the following file:');
+    file <- list.files(path = getwd(),
+                                pattern = 'global_comparison.txt$',
+                                full.names = TRUE,
+                                recursive = TRUE);
+    system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' '));
+    # start read temp_overlap_count.txt by each line
+    con <- file('temp_overlap_count.txt', "r");
+    overlap.count <- c();
+    category <- c();
+    vcf.list.name <- names(vcf.list);
+    while ( TRUE ) {
+        line = readLines(con, n = 1);
+        if ( length(line) == 0 ) {
+        break
         }
-        close(con)
-        overlap.count <- data.frame(overlap.count, category);
-        colnames(overlap.count) <- c('count', 'category');
-        overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category))));
-        overlap.count <- overlap.count[order(overlap.count$algorithm.count),];
-        vcf.list.name.level <- as.numeric(factor(vcf.list.name));
-        # generate compare group, such as 1, 2, 3, 12, 13, 23
-        for (i in 1:nrow(overlap.count)) {
-            if ( overlap.count$algorithm.count[i] == 1) {
-                overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name);
-            } else {
-                category.list <- unlist(strsplit(overlap.count$category[i], '-'));
-                category.list.number <- as.character(sapply(category.list, function(x) {
-                    return(match(x, vcf.list.name))
-                    }));
-                category.list.number <- category.list.number[order(category.list.number)];
-                overlap.count$compare.group[i] <- paste(category.list.number, collapse = '');
-                rm(category.list.number);
+        line.content <- unlist(strsplit(line, '\t'));
+        overlap.count <- c(overlap.count, line.content[1]);
+        line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content);
+        if (length(line.content) == 2) {
+            filename <- line.content[2];
+            algorithm <- vcf.list.name[match(filename, vcf.list)];
+        } else {
+            algorithm <- c()
+            for (i in (2 : length(line.content))){
+                filename <- line.content[i];
+                algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]);
             }
-        };
-        overlap.count <- overlap.count[order(overlap.count$compare.group),];
-        futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv');
-
-        print(overlap.count);
-        write.table(overlap.count,
-                    'overlap_count.tsv',
-                    sep = '\t',
-                    row.names = FALSE,
-                    quote = FALSE);
-        return(overlap.count);
         }
-    if (args$pairwise_comparsion == 'TRUE') {
-        file.list <- list.files(path = getwd(),
-                                pattern = 'intermediate/*_vcf_comparsion.txt$',
-                                full.names = TRUE);
-        for (i in 1:length(file.list)) {
-            futile.logger::flog.info('Reading the following file:');
-            system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' '));
-            overlap.df <- read.table('temp_overlap_count.txt', header = FALSE, allowEscapes = FALSE);
-            system(paste('cat', file.list[i], '| grep "^VN" | cut -f 3 | head -2 >filename.txt', sep = ' '));
-            filename.df <- read.table('filename.txt', header = FALSE, sep = ' ');
-            algo1 <- unlist(strsplit(basename(filename.df$V1[1]), '-'))[1];
-            algo2 <- unlist(strsplit(basename(filename.df$V1[2]), '-'))[1];
-            system(paste('cat', file.list[i], '| grep "^VN" | cut -f 2 | head -3 >temp_number.txt', sep = ' '));
-            number.df <- read.table('temp_number.txt', header = FALSE);
-            number.df$V1 <- as.numeric(number.df$V1);
-            cross.area <- number.df$V1[3];
-            area1 <- number.df$V1[1] + cross.area;
-            area2 <- number.df$V1[2] + cross.area;
-            venn.plot <- draw.pairwise.venn(
-                            area1, area2, cross.area,
-                            category = c(algo1, algo2),
-                            cat.pos = c(0, 180),
-                            euler.d = TRUE,
-                            sep.dist = 0.03,
-                            rotation.degree = 45
-                        );
-            tiff(filename = paste(algo1, algo2, 'Pairwise-Venn-diagram.tiff', sep = '_'),
-                    compression = "lzw");
-            grid.draw(venn.plot);
-            dev.off();
-            };
-        };
+        category <- c(category, paste(algorithm, collapse = '-'));
+    }
+    close(con)
+    overlap.count <- data.frame(overlap.count, category);
+    colnames(overlap.count) <- c('count', 'category');
+    overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category))));
+    overlap.count <- overlap.count[order(overlap.count$algorithm.count),];
+    vcf.list.name.level <- as.numeric(factor(vcf.list.name));
+    # generate compare group, such as 1, 2, 3, 12, 13, 23
+    for (i in 1:nrow(overlap.count)) {
+        if ( overlap.count$algorithm.count[i] == 1) {
+            overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name);
+        } else {
+            category.list <- unlist(strsplit(overlap.count$category[i], '-'));
+            category.list.number <- as.character(sapply(category.list, function(x) {
+                return(match(x, vcf.list.name))
+                }));
+            category.list.number <- category.list.number[order(category.list.number)];
+            overlap.count$compare.group[i] <- paste(category.list.number, collapse = '');
+            rm(category.list.number);
+        }
+    };
+    overlap.count <- overlap.count[order(overlap.count$compare.group),];
+    futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv');
+
+    print(overlap.count);
+    write.table(overlap.count,
+                'overlap_count.tsv',
+                sep = '\t',
+                row.names = FALSE,
+                quote = FALSE);
+    return(overlap.count);
     }
 ### 5. Plot Global Venn Plot ######################################
 plot.quad.venn.plot <- function (overlap.count, vcf.list) {
@@ -240,19 +180,6 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) {
             futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' '));
         }
     }
-#    u1 = vcf.summary$variant.count[1];
-#    u2 = vcf.summary$variant.count[2];
-#    u3 = vcf.summary$variant.count[3];
-#    u4 = vcf.summary$variant.count[4];
-#
-#    combination2 <- combn(1:4, 2);
-#    combination3 <- combn(1:4, 3);
-#    combination4 <- combn(1:4, 4);
-#    comb.list <- c(
-#        apply(combination2, 2, paste0, collapse=""),
-#        apply(combination3, 2, paste0, collapse=""),
-#        apply(combination4, 2, paste0, collapse="")
-#        );
 
     count <- function(x) {
         cnt <- overlap.count$count[match(x, overlap.count$compare.group)];
@@ -262,19 +189,6 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) {
             return(cnt);
         }
     };
-#    u12 = overlap.count$count[match('12', overlap.count$compare.group)];
-#    u13 = overlap.count$count[match('13', overlap.count$compare.group)];
-#    u14 = overlap.count$count[match('14', overlap.count$compare.group)];
-#    u23 = overlap.count$count[match('23', overlap.count$compare.group)];
-#    u24 = overlap.count$count[match('24', overlap.count$compare.group)];
-#    u34 = overlap.count$count[match('34', overlap.count$compare.group)];
-#    u123 = overlap.count$count[match('123', overlap.count$compare.group)];
-#    u124 = overlap.count$count[match('124', overlap.count$compare.group)];
-#    u134 = overlap.count$count[match('134', overlap.count$compare.group)];
-#    u234 = overlap.count$count[match('234', overlap.count$compare.group)];
-#    u1234 = overlap.count$count[match('1234', overlap.count$compare.group)];
-#    area.vector <- c(u3, u34, u4, u13, u134, u1234, u234, u24, u1, u14,  u124, u123, u23, u2, u12);
-#    print(area.vector);
     # plot qual.venn.plot
     venn.plot <- draw.quad.venn(
         area1 = vcf.summary$variant.count[1],
@@ -297,14 +211,17 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) {
         lty = "dashed",
         cex = 1,
         cat.cex = 0.8,
-        cat.col = 'black');
+        cat.col = 'black'
+        );
     tiff(
         filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', '.tiff' ),
-        compression = "lzw" );
+        compression = "lzw"
+        );
     grid.rect(x = unit(1, "npc"), y = unit(1, "npc"),
-          width = unit(5, "npc"), height = unit(5, "npc"),
-          just = "centre", default.units = "npc", name = NULL,
-          gp=gpar(), draw = TRUE, vp = NULL);
+        width = unit(5, "npc"), height = unit(5, "npc"),
+        just = "centre", default.units = "npc", name = NULL,
+        gp=gpar(), draw = TRUE, vp = NULL
+        );
     grid.draw(venn.plot);
     dev.off();
 }
@@ -316,9 +233,5 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) {
 args <- parse.args();
 vcf.list <- read.inputs(args);
 run.vcf.compare(vcf.list, args);
-if (args$global_comparsion == 'TRUE') {
-    overlap.count <- gather.vcf.compare.result(args);
-} else {
-    gather.vcf.compare.result(args);
-};
+overlap.count <- gather.vcf.compare.result(args);
 plot.quad.venn.plot(overlap.count, vcf.list);

From 4cde0646a6d7dbc476c52a33e5cff3c7c7faed0f Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Thu, 22 Jun 2023 15:02:03 -0700
Subject: [PATCH 04/16] minor edit

---
 compare-vcf/compare_vcf.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R
index 7ab5f46a..9a7b0ac2 100644
--- a/compare-vcf/compare_vcf.R
+++ b/compare-vcf/compare_vcf.R
@@ -214,7 +214,7 @@ plot.quad.venn.plot <- function (overlap.count, vcf.list) {
         cat.col = 'black'
         );
     tiff(
-        filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', '.tiff' ),
+        filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', 'tiff' ),
         compression = "lzw"
         );
     grid.rect(x = unit(1, "npc"), y = unit(1, "npc"),

From b485057d4d219b692350396fac48919c3dd9ec29 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Tue, 27 Jun 2023 16:36:04 -0700
Subject: [PATCH 05/16] with BEDtools, not needed

---
 config/default.config         |  2 ++
 config/methods.config         |  2 +-
 main.nf                       |  4 +--
 metadata.yaml                 |  2 +-
 module/intersect-processes.nf | 54 +++++++++++++++++++++++++++++++++--
 module/intersect.nf           | 36 +++++++++++++++++------
 6 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/config/default.config b/config/default.config
index 3ba1aa6b..ad9e3b7c 100644
--- a/config/default.config
+++ b/config/default.config
@@ -23,6 +23,7 @@ params {
     manta_version = "1.6.0"
     MuSE_version = "2.0"
     BCFtools_version = "1.15.1"
+    BEDtools_version = "2.29.2"
     docker_image_samtools = "${-> params.docker_container_registry}/samtools:${params.samtools_version}"
     docker_image_validate_params = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}"
     docker_image_GATK = "broadinstitute/gatk:${params.GATK_version}"
@@ -32,6 +33,7 @@ params {
     docker_image_manta = "${-> params.docker_container_registry}/manta:${params.manta_version}"
     docker_image_MuSE = "${-> params.docker_container_registry}/muse:${params.MuSE_version}"
     docker_image_BCFtools = "${-> params.docker_container_registry}/bcftools:${params.BCFtools_version}"
+    docker_image_BEDtools = "${-> params.docker_container_registry}/bedtools:${params.BEDtools_version}"
 
 }
 
diff --git a/config/methods.config b/config/methods.config
index dbf24712..d28e089c 100644
--- a/config/methods.config
+++ b/config/methods.config
@@ -129,7 +129,7 @@ methods {
         if (params.containsKey("call_region") && params.call_region) {
             params.use_call_region = true
         } else {
-            params.call_region = "${params.work_dir}/NO_FILE.bed.gz"
+   //         params.call_region = "${params.work_dir}/NO_FILE.bed.gz"
             params.use_call_region = false
         }
         params.call_region_index = "${params.call_region}.tbi"
diff --git a/main.nf b/main.nf
index 1cda90bc..b0c18721 100755
--- a/main.nf
+++ b/main.nf
@@ -214,13 +214,13 @@ workflow {
             .mix(strelka2_vcf_ch)
             .mix(mutect2_vcf_ch)
             .mix(muse_vcf_ch))
-            .collect()
+        //    .collect()
 
         tool_indices = (somaticsniper_idx_ch
             .mix(strelka2_idx_ch)
             .mix(mutect2_idx_ch)
             .mix(muse_idx_ch))
-            .collect()
+         //   .collect()
 
         intersect(
             tool_vcfs,
diff --git a/metadata.yaml b/metadata.yaml
index ab371712..da9b8577 100644
--- a/metadata.yaml
+++ b/metadata.yaml
@@ -5,4 +5,4 @@ Maintainers: ['maotian@mednet.ucla.edu']
 Contributors: ['Mao Tian', 'Bugh Caden', 'Helena Winata', 'Yash Patel', 'Sorel Fitz-Gibbon']
 Languages: ['Docker', 'Nextflow']
 Dependencies: ['Docker', 'Nextflow']
-Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.15.1]
+Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.15.1, BEDtools v2.29.2]
diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf
index 28ff1ee8..841b01e9 100644
--- a/module/intersect-processes.nf
+++ b/module/intersect-processes.nf
@@ -4,7 +4,33 @@ log.info """\
 ====================================
 Docker Images:
 - docker_image_BCFtools: ${params.docker_image_BCFtools}
+- docker_image_BEDtools: ${params.docker_image_BEDtools}
+
 """
+process trim_VCF_BCFtools {
+   container params.docker_image_BCFtools
+    publishDir path: "${params.workflow_output_dir}/intermediate",
+        mode: "copy",
+        pattern: "*_trim-SNV.vcf.gz*"
+    publishDir path: "${params.workflow_log_output_dir}",
+        mode: "copy",
+        pattern: ".command.*",
+        saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" }
+
+    input:
+    path vcf
+    path index
+    path call_region
+
+    output:
+    path "*_trim-SNV.vcf.gz", emit: trimmed_vcf
+
+    script:
+    """
+    set -euo pipefail
+    bcftools view  --regions-file ${params.call_region} --output "${vcf.split('_')[-1]}}_SNV-trim.vcf.gz" ${vcf}
+    """
+}
 
 process intersect_VCFs_BCFtools {
     container params.docker_image_BCFtools
@@ -30,12 +56,36 @@ process intersect_VCFs_BCFtools {
     path "isec-2-or-more"
 
     script:
-
     vcf_list = vcfs.join(' ')
-
     """
     set -euo pipefail
     bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list}
     awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt  | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done
     """
     }
+
+process intersect_VCFs_BEDtools {
+    container params.docker_image_BEDtools
+    publishDir path: "${params.workflow_output_dir}/output",
+        mode: "copy",
+        pattern: "multiinter.out"
+    publishDir path: "${params.workflow_log_output_dir}",
+        mode: "copy",
+        pattern: ".command.*",
+        saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" }
+
+    input:
+    path vcfs
+    path indices
+
+    output:
+    path "multiinter.out"
+
+    script:
+    vcf_list = vcfs.join(' ')
+    name_list = vcfs.collect { it.getName().split('_')[0] }.join(' ')
+    """
+    set -euo pipefail
+    bedtools multiinter -i ${vcf_list} -header -names ${name_list} > multiinter.out
+    """
+}
diff --git a/module/intersect.nf b/module/intersect.nf
index 534fd41f..6059ce36 100644
--- a/module/intersect.nf
+++ b/module/intersect.nf
@@ -1,31 +1,51 @@
 include { generate_sha512sum } from './common'
+include { trim_VCF_BCFtools } from './intersect-processes.nf'
 include { intersect_VCFs_BCFtools } from './intersect-processes.nf'
+include { intersect_VCFs_BEDtools } from './intersect-processes.nf'
 include { compress_index_VCF } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
     options: [
         output_dir: params.workflow_output_dir,
         log_output_dir: params.workflow_log_output_dir,
         bgzip_extra_args: params.bgzip_extra_args,
-        tabix_extra_args: params.tabix_extra_args
+        tabix_extra_args: params.tabix_extra_args,
+        is_output_file: false
         ])
 
-
 workflow intersect {
     take:
     tool_vcfs
     tool_indices
 
     main:
-        intersect_VCFs_BCFtools(
+        trim_VCF_BCFtools(
             tool_vcfs,
-            tool_indices
-        )
+            tool_indices,
+            params.call_region
+            )
+        trim_VCF_BCFtools.out.trimmed_vcf
+            .map { it -> ["${file(it).getName().split('_')[0]}", it] }
+            .set { inputs_to_compress }
+        compress_index_VCF(inputs_to_compress)
+        trimmed_vcfs = compress_index_VCF.out.index_out
+            .map{ it -> ["${it[1]}"] }
+            .collect()
+        trimmed_indices = compress_index_VCF.out.index_out
+            .map{ it -> ["${it[2]}"] }
+            .collect()
+        intersect_VCFs_BCFtools(
+            trimmed_vcfs,
+            trimmed_indices
+            )
+        intersect_VCFs_BEDtools(
+            trimmed_vcfs,
+            trimmed_indices
+            )
         file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf
             .flatten()
             .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-vcf", it]}
             .mix(intersect_VCFs_BCFtools.out.consensus_idx
                 .flatten()
-                .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-idx", it]})
+                .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-idx", it]}
+                )
         generate_sha512sum(file_for_sha512)
-    emit:
-        intersect_VCFs_BCFtools.out.consensus_vcf
     }

From c52d327ca36dc3260eb02e5ce5159bdf3ef38261 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Wed, 28 Jun 2023 12:32:49 -0700
Subject: [PATCH 06/16] switch to one extra bcftools line, no bedtools

---
 config/default.config         |  4 +--
 main.nf                       |  4 +--
 metadata.yaml                 |  2 +-
 module/intersect-processes.nf | 62 +++++------------------------------
 module/intersect.nf           | 33 ++-----------------
 5 files changed, 16 insertions(+), 89 deletions(-)

diff --git a/config/default.config b/config/default.config
index ad9e3b7c..0f732270 100644
--- a/config/default.config
+++ b/config/default.config
@@ -22,8 +22,7 @@ params {
     strelka2_version = "2.9.10"
     manta_version = "1.6.0"
     MuSE_version = "2.0"
-    BCFtools_version = "1.15.1"
-    BEDtools_version = "2.29.2"
+    BCFtools_version = "1.17"
     docker_image_samtools = "${-> params.docker_container_registry}/samtools:${params.samtools_version}"
     docker_image_validate_params = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}"
     docker_image_GATK = "broadinstitute/gatk:${params.GATK_version}"
@@ -33,7 +32,6 @@ params {
     docker_image_manta = "${-> params.docker_container_registry}/manta:${params.manta_version}"
     docker_image_MuSE = "${-> params.docker_container_registry}/muse:${params.MuSE_version}"
     docker_image_BCFtools = "${-> params.docker_container_registry}/bcftools:${params.BCFtools_version}"
-    docker_image_BEDtools = "${-> params.docker_container_registry}/bedtools:${params.BEDtools_version}"
 
 }
 
diff --git a/main.nf b/main.nf
index b0c18721..1cda90bc 100755
--- a/main.nf
+++ b/main.nf
@@ -214,13 +214,13 @@ workflow {
             .mix(strelka2_vcf_ch)
             .mix(mutect2_vcf_ch)
             .mix(muse_vcf_ch))
-        //    .collect()
+            .collect()
 
         tool_indices = (somaticsniper_idx_ch
             .mix(strelka2_idx_ch)
             .mix(mutect2_idx_ch)
             .mix(muse_idx_ch))
-         //   .collect()
+            .collect()
 
         intersect(
             tool_vcfs,
diff --git a/metadata.yaml b/metadata.yaml
index da9b8577..aea10576 100644
--- a/metadata.yaml
+++ b/metadata.yaml
@@ -5,4 +5,4 @@ Maintainers: ['maotian@mednet.ucla.edu']
 Contributors: ['Mao Tian', 'Bugh Caden', 'Helena Winata', 'Yash Patel', 'Sorel Fitz-Gibbon']
 Languages: ['Docker', 'Nextflow']
 Dependencies: ['Docker', 'Nextflow']
-Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.15.1, BEDtools v2.29.2]
+Tools: ['GATK 4.4.0.0', 'SomaticSniper v1.0.5.0', 'SAMtools v1.16.1', 'Strelka2 v2.9.10', 'Manta v1.6.0', 'MuSE v2.0', BCFtools v1.17]
diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf
index 841b01e9..7bf8696d 100644
--- a/module/intersect-processes.nf
+++ b/module/intersect-processes.nf
@@ -4,34 +4,8 @@ log.info """\
 ====================================
 Docker Images:
 - docker_image_BCFtools: ${params.docker_image_BCFtools}
-- docker_image_BEDtools: ${params.docker_image_BEDtools}
 
 """
-process trim_VCF_BCFtools {
-   container params.docker_image_BCFtools
-    publishDir path: "${params.workflow_output_dir}/intermediate",
-        mode: "copy",
-        pattern: "*_trim-SNV.vcf.gz*"
-    publishDir path: "${params.workflow_log_output_dir}",
-        mode: "copy",
-        pattern: ".command.*",
-        saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" }
-
-    input:
-    path vcf
-    path index
-    path call_region
-
-    output:
-    path "*_trim-SNV.vcf.gz", emit: trimmed_vcf
-
-    script:
-    """
-    set -euo pipefail
-    bcftools view  --regions-file ${params.call_region} --output "${vcf.split('_')[-1]}}_SNV-trim.vcf.gz" ${vcf}
-    """
-}
-
 process intersect_VCFs_BCFtools {
     container params.docker_image_BCFtools
     publishDir path: "${params.workflow_output_dir}/output",
@@ -40,6 +14,9 @@ process intersect_VCFs_BCFtools {
     publishDir path: "${params.workflow_output_dir}/output",
         mode: "copy",
         pattern: "isec-2-or-more"
+    publishDir path: "${params.workflow_output_dir}/output",
+        mode: "copy",
+        pattern: "isec-1-or-more/*.txt"
     publishDir path: "${params.workflow_log_output_dir}",
         mode: "copy",
         pattern: ".command.*",
@@ -48,44 +25,23 @@ process intersect_VCFs_BCFtools {
     input:
     path vcfs
     path indices
+    path call_region
+    path call_region_index
 
     output:
     path "*.vcf.gz", emit: consensus_vcf
     path "*.vcf.gz.tbi", emit: consensus_idx
     path ".command.*"
     path "isec-2-or-more"
+    path "isec-1-or-more/sites.txt"
+    path "isec-1-or-more/README.txt"
 
     script:
     vcf_list = vcfs.join(' ')
     """
     set -euo pipefail
-    bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list}
+    bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${call_region} ${vcf_list}
     awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt  | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done
+    bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${call_region} ${vcf_list}
     """
     }
-
-process intersect_VCFs_BEDtools {
-    container params.docker_image_BEDtools
-    publishDir path: "${params.workflow_output_dir}/output",
-        mode: "copy",
-        pattern: "multiinter.out"
-    publishDir path: "${params.workflow_log_output_dir}",
-        mode: "copy",
-        pattern: ".command.*",
-        saveAs: { "${task.process.replace(':', '/')}-${task.index}/log${file(it).getName()}" }
-
-    input:
-    path vcfs
-    path indices
-
-    output:
-    path "multiinter.out"
-
-    script:
-    vcf_list = vcfs.join(' ')
-    name_list = vcfs.collect { it.getName().split('_')[0] }.join(' ')
-    """
-    set -euo pipefail
-    bedtools multiinter -i ${vcf_list} -header -names ${name_list} > multiinter.out
-    """
-}
diff --git a/module/intersect.nf b/module/intersect.nf
index 6059ce36..2e99f7a5 100644
--- a/module/intersect.nf
+++ b/module/intersect.nf
@@ -1,15 +1,5 @@
 include { generate_sha512sum } from './common'
-include { trim_VCF_BCFtools } from './intersect-processes.nf'
 include { intersect_VCFs_BCFtools } from './intersect-processes.nf'
-include { intersect_VCFs_BEDtools } from './intersect-processes.nf'
-include { compress_index_VCF } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
-    options: [
-        output_dir: params.workflow_output_dir,
-        log_output_dir: params.workflow_log_output_dir,
-        bgzip_extra_args: params.bgzip_extra_args,
-        tabix_extra_args: params.tabix_extra_args,
-        is_output_file: false
-        ])
 
 workflow intersect {
     take:
@@ -17,28 +7,11 @@ workflow intersect {
     tool_indices
 
     main:
-        trim_VCF_BCFtools(
+        intersect_VCFs_BCFtools(
             tool_vcfs,
             tool_indices,
-            params.call_region
-            )
-        trim_VCF_BCFtools.out.trimmed_vcf
-            .map { it -> ["${file(it).getName().split('_')[0]}", it] }
-            .set { inputs_to_compress }
-        compress_index_VCF(inputs_to_compress)
-        trimmed_vcfs = compress_index_VCF.out.index_out
-            .map{ it -> ["${it[1]}"] }
-            .collect()
-        trimmed_indices = compress_index_VCF.out.index_out
-            .map{ it -> ["${it[2]}"] }
-            .collect()
-        intersect_VCFs_BCFtools(
-            trimmed_vcfs,
-            trimmed_indices
-            )
-        intersect_VCFs_BEDtools(
-            trimmed_vcfs,
-            trimmed_indices
+            params.call_region,
+            params.call_region_index
             )
         file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf
             .flatten()

From 97336dfea0792699a896f69a2b87e6eed537db0f Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Wed, 28 Jun 2023 12:42:10 -0700
Subject: [PATCH 07/16] remove compare_vcf.R

---
 compare-vcf/compare_vcf.R | 237 --------------------------------------
 1 file changed, 237 deletions(-)
 delete mode 100644 compare-vcf/compare_vcf.R

diff --git a/compare-vcf/compare_vcf.R b/compare-vcf/compare_vcf.R
deleted file mode 100644
index 9a7b0ac2..00000000
--- a/compare-vcf/compare_vcf.R
+++ /dev/null
@@ -1,237 +0,0 @@
-# This is the script to compare the vcf results ################
-# Initial commit: Mao Tian 09-28-2022
-## 1. Setup the environment ########################
-library('argparse');
-library('BoutrosLab.plotting.general');
-library('BoutrosLab.utilities');
-library('VennDiagram');
-
-# tmp: for testing
-#args <- list();
-#args$path <- '/hot/software/pipeline/pipeline-call-sSNV/Nextflow/development/unreleased/sfitz-intersect-vcfs/call-sSNV-6.0.0/CPCG0000000196-T001-P01'
-#args$valid_algorithm <- 'NULL';
-#args$dataset.id <- 'CPCG';
-#args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/');
-
-### 2. Dataset Argument Parser ########################################
-parse.args <- function() {
-    data.parser <- ArgumentParser();
-    data.parser$add_argument('-p', '--path', help = 'directory that stores the vcf.gz files', required = TRUE);
-    data.parser$add_argument('-f', '--file', help = 'an input file that lists all the vcf.gz files', default = 'NULL');
-    data.parser$add_argument('-v', '--valid_algorithm', help = 'valid algorithms, default are the four call-sSNV algorithms', default = 'NULL');
-    data.parser$add_argument('-d', '--dataset.id', help = 'dataset id will be used to generate the output directory that will store both intermediate and output files', default = 'test');
-    data.parser$add_argument('-o', '--output.dir', help = 'the output directory, default is current working directory', default = 'NULL');
-    ### Set the arg_parase
-    args <- data.parser$parse_args();
-    if ( args$output.dir == 'NULL') {
-        args$output.dir <- paste(getwd(), paste(gsub(' |:', '-', Sys.time()), 'VCF-Comparison-Result', sep = '_'), sep = '/');
-    }
-    return(args);
-    };
-### 3. Read Input VCF Lists ##################################
-read.inputs <- function(args) {
-    vcf.list <- list.files(
-        path = args$path,
-        pattern = '*snvs.vcf.gz$',
-        all.files = TRUE,
-        full.names = TRUE,
-        recursive = TRUE
-        );
-    # set the default valid algorithm, the default are four algorithms used in call-sSNV
-    if (args$valid_algorithm == 'NULL') {
-        algorithm.list <- c('MuSE', 'Mutect2', 'SomaticSniper', 'Strelka2');
-#        args$valid_algorithm <- algorithm.list;
-        } else {
-            algorithm.list <- args$valid_algorithm
-        };
-    # this function is used to extract the algorithm of the vcf.gz file
-    # need to pre-define the algorithm.list
-    extract.algorithm <- function(x) {
-        for (i in 1:length(algorithm.list)) {
-            if (grepl(algorithm.list[i], x)) {
-                return(algorithm.list[i]);
-            };
-        };
-    };
-    vcf.list <- vcf.list[ - grep('intermediate', vcf.list)];
-    vcf.list.name <- unlist(lapply(vcf.list, extract.algorithm));
-    names(vcf.list) <- vcf.list.name;
-    vcf.count <- c();
-    # count each vcf file variant
-    for (file in vcf.list) {
-        count <- system(paste('zcat', file, '| grep -v "^#" | wc -l', sep = ' '), intern = TRUE);
-        vcf.count <- c(vcf.count, as.numeric(count));
-    }
-    vcf.summary <- data.frame(vcf.list, vcf.list.name, vcf.count);
-    colnames(vcf.summary) <- c('filename', 'algorithm', 'variant.count');
-    # rank vcf by variant.count
-    vcf.summary <- vcf.summary[order(vcf.summary$variant.count, decreasing = TRUE),];
-    vcf.list <- as.character(vcf.summary$filename);
-    names(vcf.list) <- as.character(vcf.summary$algorithm);
-    system(paste('mkdir', args$output.dir));
-    system(paste('mkdir', paste(args$output.dir, 'intermediate', sep = '/')));
-    setwd(args$output.dir);
-    write.table(vcf.summary, 'vcf_summary.tsv', sep = '\t', row.names = FALSE, quote = FALSE);
-    return(vcf.list);
-    };
-
-### 4. Run VCF compare with system() ##################################
-run.vcf.compare <- function(vcf.list, args) {
-    docker.command <- 'docker run --rm -v /hot/:/hot/ -u `id -u` biocontainers/vcftools:v0.1.16-1-deb_cv1 vcf-compare';
-    vcf.list.text <- paste(vcf.list, collapse = ' ');
-    vcf.compare.command <- paste(docker.command,  vcf.list.text, '> global_comparison.txt', sep = ' ');
-    futile.logger::flog.info('Running the following command to check VCF overlap:');
-    print(vcf.compare.command);
-    system(vcf.compare.command, intern = TRUE, ignore.stderr = TRUE);
-    system('mv *.txt intermediate/');
-};
-
-### 5. Gather VCF compare results ##################################
-gather.vcf.compare.result <- function( args ){
-    setwd(args$output.dir);
-
-    futile.logger::flog.info('Reading the following file:');
-    file <- list.files(path = getwd(),
-                                pattern = 'global_comparison.txt$',
-                                full.names = TRUE,
-                                recursive = TRUE);
-    system(paste('cat', file, '| grep "^VN" | cut -f 2- >temp_overlap_count.txt', sep = ' '));
-    # start read temp_overlap_count.txt by each line
-    con <- file('temp_overlap_count.txt', "r");
-    overlap.count <- c();
-    category <- c();
-    vcf.list.name <- names(vcf.list);
-    while ( TRUE ) {
-        line = readLines(con, n = 1);
-        if ( length(line) == 0 ) {
-        break
-        }
-        line.content <- unlist(strsplit(line, '\t'));
-        overlap.count <- c(overlap.count, line.content[1]);
-        line.content <- gsub('\\s+[(]+[0-9]+.[0-9]%[)]', '', line.content);
-        if (length(line.content) == 2) {
-            filename <- line.content[2];
-            algorithm <- vcf.list.name[match(filename, vcf.list)];
-        } else {
-            algorithm <- c()
-            for (i in (2 : length(line.content))){
-                filename <- line.content[i];
-                algorithm <- c(algorithm, vcf.list.name[match(filename, vcf.list)]);
-            }
-        }
-        category <- c(category, paste(algorithm, collapse = '-'));
-    }
-    close(con)
-    overlap.count <- data.frame(overlap.count, category);
-    colnames(overlap.count) <- c('count', 'category');
-    overlap.count$algorithm.count <- (1 + lengths(regmatches(overlap.count$category, gregexpr("-", overlap.count$category))));
-    overlap.count <- overlap.count[order(overlap.count$algorithm.count),];
-    vcf.list.name.level <- as.numeric(factor(vcf.list.name));
-    # generate compare group, such as 1, 2, 3, 12, 13, 23
-    for (i in 1:nrow(overlap.count)) {
-        if ( overlap.count$algorithm.count[i] == 1) {
-            overlap.count$compare.group[i] <- match(overlap.count$category[i], vcf.list.name);
-        } else {
-            category.list <- unlist(strsplit(overlap.count$category[i], '-'));
-            category.list.number <- as.character(sapply(category.list, function(x) {
-                return(match(x, vcf.list.name))
-                }));
-            category.list.number <- category.list.number[order(category.list.number)];
-            overlap.count$compare.group[i] <- paste(category.list.number, collapse = '');
-            rm(category.list.number);
-        }
-    };
-    overlap.count <- overlap.count[order(overlap.count$compare.group),];
-    futile.logger::flog.info('Here is the result will be stored in overlap_count.tsv');
-
-    print(overlap.count);
-    write.table(overlap.count,
-                'overlap_count.tsv',
-                sep = '\t',
-                row.names = FALSE,
-                quote = FALSE);
-    return(overlap.count);
-    }
-### 5. Plot Global Venn Plot ######################################
-plot.quad.venn.plot <- function (overlap.count, vcf.list) {
-    device <- pdf(file = NULL);
-    futile.logger::flog.info('Reading vcf_summary.tsv');
-    vcf.summary <- read.table('vcf_summary.tsv', sep = '\t', header = TRUE);
-    if (all(vcf.summary$filename == vcf.list)) {
-        futile.logger::flog.info('algorithm sequence matched, run variant count check');
-    } else {
-        futile.logger::flog.info('algorithm sequence not matched, reordering');
-        vcf.summary <- vcf.summary[match(vcf.list, vcf.summary$filename),];
-        if (all(vcf.summary$algorithm == vcf.list)) {
-        futile.logger::flog.info('reordered, run variant count check');
-        }
-    }
-    # check if the sum of the overlap.count equals to the summary of vcf
-    futile.logger::flog.info('check if the sum of the overlap.count equals to the summary of vcf');
-    overlap.count$count <- as.numeric(overlap.count$count);
-    for (algo in vcf.summary$algorithm) {
-        count.from.summary <- vcf.summary$variant.count[vcf.summary$algorithm == algo];
-        count.from.overlap.sum <- sum(
-            overlap.count$count[grep(algo, overlap.count$category)]
-            );
-        if (count.from.summary == count.from.overlap.sum) {
-            futile.logger::flog.info(paste(algo, 'Match!', sep = ' '));
-        } else {
-            futile.logger::flog.info(paste(algo, 'Not Match! The sum of the vcf_compare tool does not equal to the summary of vcf', sep = ' '));
-        }
-    }
-
-    count <- function(x) {
-        cnt <- overlap.count$count[match(x, overlap.count$compare.group)];
-        if (is.na(cnt)) {
-            return(0);
-        } else {
-            return(cnt);
-        }
-    };
-    # plot qual.venn.plot
-    venn.plot <- draw.quad.venn(
-        area1 = vcf.summary$variant.count[1],
-        area2 = vcf.summary$variant.count[2],
-        area3 = vcf.summary$variant.count[3],
-        area4 = vcf.summary$variant.count[4],
-        n12 = (count('12') + count('123') + count('124') + count('1234')),
-        n13 = (count('13') + count('123') + count('134') + count('1234')),
-        n14 = (count('14') + count('124') + count('134') + count('1234')),
-        n23 = (count('23') + count('123') + count('234') + count('1234')),
-        n24 = (count('24') + count('124') + count('234') + count('1234')),
-        n34 = (count('34') + count('134') + count('234') + count('1234')),
-        n123 = (count('123') + count('1234')),
-        n124 = (count('124') + count('1234')),
-        n134 = (count('134') + count('1234')),
-        n234 = (count('234') + count('1234')),
-        n1234 = count('1234'),
-        category = vcf.summary$algorithm,
-        fill = c("orange", "red", "green", "blue"),
-        lty = "dashed",
-        cex = 1,
-        cat.cex = 0.8,
-        cat.col = 'black'
-        );
-    tiff(
-        filename = generate.filename(args$dataset.id, 'Quad-Venn-diagram', 'tiff' ),
-        compression = "lzw"
-        );
-    grid.rect(x = unit(1, "npc"), y = unit(1, "npc"),
-        width = unit(5, "npc"), height = unit(5, "npc"),
-        just = "centre", default.units = "npc", name = NULL,
-        gp=gpar(), draw = TRUE, vp = NULL
-        );
-    grid.draw(venn.plot);
-    dev.off();
-}
-
-
-
-### Main ##########################################################
-
-args <- parse.args();
-vcf.list <- read.inputs(args);
-run.vcf.compare(vcf.list, args);
-overlap.count <- gather.vcf.compare.result(args);
-plot.quad.venn.plot(overlap.count, vcf.list);

From 5887e59f5cc2655a9ade44d7ad97e636c97d289d Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Wed, 28 Jun 2023 13:50:15 -0700
Subject: [PATCH 08/16] update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4457175b..b493dfbf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,11 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- Add regions filter to variant intersections, limiting output to main chromosome variants
+- Add second BCFtools step to create full presence/absence variant table (including private)
 - Add workflow to create a `consensus.vcf` that includes SNVs found by two or more variant callers
 - Add `fix_sample_names_VCF`, tumor and normal sample IDs from input BAMs used in output VCFs
 - Add `split_VCF_bcftools` to `Mutect2` workflow, separating SNVs, MNVs and Indels
 
 ### Changed
+- Update to BCFtools v1.17
 - Update to use sample ID from input BAM files (single tumor/normal BAM input only)
 - Use BCFtools to compress PASS variants instead of bgzip
 - Use BCFtools to extract PASS variants instead of awk

From a9f6f9e857b9fb5875bb8a5b71981476749c87c0 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Mon, 10 Jul 2023 15:19:59 -0700
Subject: [PATCH 09/16] restructuring call_regions code

---
 config/methods.config        | 13 ++++++++-----
 config/schema.yaml           |  7 +++----
 config/template.config       |  3 ++-
 module/intersect.nf          |  4 ++--
 module/mutect2.nf            |  3 +--
 module/strelka2-processes.nf |  4 ++--
 module/strelka2.nf           |  8 ++++----
 7 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/config/methods.config b/config/methods.config
index d28e089c..a32c7183 100644
--- a/config/methods.config
+++ b/config/methods.config
@@ -125,14 +125,16 @@ methods {
         }
     }
 
+    set_canonical_regions_params {
+        params.canonical_regions_index = "${params.canonical_regions}.tbi"
+    }
+
     set_strelka2_params = {
-        if (params.containsKey("call_region") && params.call_region) {
-            params.use_call_region = true
+        if (params.containsKey("use_canonical_regions") && params.use_canonical_regions) {
+            params.use_canonical_regions = true
         } else {
-   //         params.call_region = "${params.work_dir}/NO_FILE.bed.gz"
-            params.use_call_region = false
+            params.use_canonical_regions = false
         }
-        params.call_region_index = "${params.call_region}.tbi"
     }
 
     set_mutect2_params = {
@@ -198,6 +200,7 @@ methods {
         retry.setup_retry()
         methods.set_env()
         methods.set_sample_params()
+        methods.set_canonical_regions_params()
         methods.set_strelka2_params()
         methods.set_mutect2_params()
         methods.set_output_directory()
diff --git a/config/schema.yaml b/config/schema.yaml
index ca2ded4c..7f01d6b9 100644
--- a/config/schema.yaml
+++ b/config/schema.yaml
@@ -46,11 +46,10 @@ exome:
   required: false
   default: false
   help: 'The exome option when running manta and strelka2'
-call_region:
+canonical_regions:
   type: 'Path'
-  required: false
-  default: '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz'
-  help: 'A call region bed file for strelka2 to save runtime or for a targeted region'
+  required: true
+  help: 'canonical regions used by mutect2, strelka2 and intersect'
 split_intervals_extra_args:
   type: 'String'
   required: false
diff --git a/config/template.config b/config/template.config
index 1b2cd833..b6dcd1e6 100644
--- a/config/template.config
+++ b/config/template.config
@@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config"
 params {
     algorithm = [] // 'somaticsniper', 'strelka2', 'mutect2', 'muse'
     reference = ''
+    canonical_regions = '${projectDir}/config/hg38_chromosomes_canonical.bed'
     output_dir = ''
     dataset_id = ''
     // set params.exome to TRUE will add the '--exome' option when running manta and strelka2
@@ -25,7 +26,7 @@ params {
     tabix_extra_args = ''
 
     // strelka2 options
-    call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz'
+    use_canonical_regions = true
 
     // mutect2 options
     split_intervals_extra_args = ''
diff --git a/module/intersect.nf b/module/intersect.nf
index 2e99f7a5..e1694081 100644
--- a/module/intersect.nf
+++ b/module/intersect.nf
@@ -10,8 +10,8 @@ workflow intersect {
         intersect_VCFs_BCFtools(
             tool_vcfs,
             tool_indices,
-            params.call_region,
-            params.call_region_index
+            params.canonical_regions,
+            params.canonical_regions_index
             )
         file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf
             .flatten()
diff --git a/module/mutect2.nf b/module/mutect2.nf
index bf6d6246..3b03be62 100644
--- a/module/mutect2.nf
+++ b/module/mutect2.nf
@@ -36,8 +36,7 @@ workflow mutect2 {
         if (params.intervals) {
             intervals = params.intervals
         } else {
-            intervals = "${projectDir}/config/hg38_chromosomes_canonical.list"
-
+            intervals = params.canonical_regions
             // process non-canonical chromosome regions seperately
             // as this region requires more memory than the canonical regions
             call_sSNVInNonAssembledChromosomes_Mutect2(
diff --git a/module/strelka2-processes.nf b/module/strelka2-processes.nf
index 304b90e2..74bd4d84 100644
--- a/module/strelka2-processes.nf
+++ b/module/strelka2-processes.nf
@@ -38,7 +38,7 @@ process call_sIndel_Manta {
 
     script:
     exome = params.exome ? "--exome" : ""
-    call_region_command = params.use_call_region ? "--callRegions ${call_region}" : ""
+    call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : ""
     """
     configManta.py \
         --normalBam $normal \
@@ -82,7 +82,7 @@ process call_sSNV_Strelka2 {
 
     script:
     exome = params.exome ? "--exome" : ""
-    call_region_command = params.use_call_region ? "--callRegions ${call_region}" : ""
+    call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : ""
     """
     set -euo pipefail
     configureStrelkaSomaticWorkflow.py \
diff --git a/module/strelka2.nf b/module/strelka2.nf
index 92cf2d20..c1b3ab9f 100644
--- a/module/strelka2.nf
+++ b/module/strelka2.nf
@@ -26,8 +26,8 @@ workflow strelka2 {
             normal_index,
             params.reference,
             "${params.reference}.fai",
-            params.call_region,
-            params.call_region_index
+            params.canonical_regions,
+            params.canonical_regions_index
         )
         call_sSNV_Strelka2(
             tumor_bam,
@@ -37,8 +37,8 @@ workflow strelka2 {
             params.reference,
             "${params.reference}.fai",
             call_sIndel_Manta.out[0],
-            params.call_region,
-            params.call_region_index
+            params.canonical_regions,
+            params.canonical_regions_index
         )
         filter_VCF_BCFtools(call_sSNV_Strelka2.out.snvs_vcf
             .mix(call_sSNV_Strelka2.out.indels_vcf))

From efe7030adaf4b22849aaf81c550b12358f6cae5b Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Fri, 14 Jul 2023 15:37:59 -0700
Subject: [PATCH 10/16] reconfigure strelka2 call_regions to general
 intersect_regions

---
 CHANGELOG.md                            |  1 +
 config/methods.config                   | 15 +++------------
 config/schema.yaml                      |  8 ++++++--
 config/template.config                  |  4 ++--
 main.nf                                 |  8 ++++----
 module/intersect-processes.nf           |  8 ++++----
 module/intersect.nf                     |  4 ++--
 module/mutect2.nf                       |  2 +-
 module/strelka2-processes.nf            |  4 ++--
 module/strelka2.nf                      |  9 ++++-----
 test/config/a_mini-all-tools.config     |  5 +++--
 test/config/a_mini-muse.config          |  3 ++-
 test/config/a_mini-mutect2.config       |  3 ++-
 test/config/a_mini-somaticsniper.config |  3 ++-
 test/config/a_mini-strelka2.config      |  3 ++-
 15 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a80fd738..14420e12 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `split_VCF_bcftools` to `Mutect2` workflow, separating SNVs, MNVs and Indels
 
 ### Changed
+- reconfigure call_regions to intersect_regions 
 - Update to BCFtools v1.17
 - Update `MuSE` to `v2.0.2`
 - Update to use sample ID from input BAM files (single tumor/normal BAM input only)
diff --git a/config/methods.config b/config/methods.config
index a32c7183..47646663 100644
--- a/config/methods.config
+++ b/config/methods.config
@@ -125,16 +125,8 @@ methods {
         }
     }
 
-    set_canonical_regions_params {
-        params.canonical_regions_index = "${params.canonical_regions}.tbi"
-    }
-
-    set_strelka2_params = {
-        if (params.containsKey("use_canonical_regions") && params.use_canonical_regions) {
-            params.use_canonical_regions = true
-        } else {
-            params.use_canonical_regions = false
-        }
+    set_intersect_regions_params = {
+        params.intersect_regions_index = "${params.intersect_regions}.tbi"
     }
 
     set_mutect2_params = {
@@ -200,8 +192,7 @@ methods {
         retry.setup_retry()
         methods.set_env()
         methods.set_sample_params()
-        methods.set_canonical_regions_params()
-        methods.set_strelka2_params()
+        methods.set_intersect_regions_params()
         methods.set_mutect2_params()
         methods.set_output_directory()
         methods.set_pipeline_log()
diff --git a/config/schema.yaml b/config/schema.yaml
index 7f01d6b9..e63ebdea 100644
--- a/config/schema.yaml
+++ b/config/schema.yaml
@@ -46,10 +46,14 @@ exome:
   required: false
   default: false
   help: 'The exome option when running manta and strelka2'
-canonical_regions:
+intersect_regions:
   type: 'Path'
   required: true
-  help: 'canonical regions used by mutect2, strelka2 and intersect'
+  help: 'call regions used by mutect2, strelka2 and intersect'
+use_intersect_regions:
+  type: 'Bool'
+  required: true
+  default: true
 split_intervals_extra_args:
   type: 'String'
   required: false
diff --git a/config/template.config b/config/template.config
index b6dcd1e6..3b7606cf 100644
--- a/config/template.config
+++ b/config/template.config
@@ -13,7 +13,7 @@ includeConfig "${projectDir}/config/methods.config"
 params {
     algorithm = [] // 'somaticsniper', 'strelka2', 'mutect2', 'muse'
     reference = ''
-    canonical_regions = '${projectDir}/config/hg38_chromosomes_canonical.bed'
+    intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz'
     output_dir = ''
     dataset_id = ''
     // set params.exome to TRUE will add the '--exome' option when running manta and strelka2
@@ -26,7 +26,7 @@ params {
     tabix_extra_args = ''
 
     // strelka2 options
-    use_canonical_regions = true
+    use_intersect_regions = true
 
     // mutect2 options
     split_intervals_extra_args = ''
diff --git a/main.nf b/main.nf
index 1cda90bc..a05c0f0c 100755
--- a/main.nf
+++ b/main.nf
@@ -30,7 +30,7 @@ log.info """\
         reference: ${params.reference}
         reference_index: ${params.reference_index}
         reference_dict: ${params.reference_dict}
-        call_region: ${params.call_region}
+        intersect_region: ${params.intersect_region}
 
     - output:
         output_dir: ${params.output_dir_base}
@@ -132,11 +132,11 @@ workflow {
         file_to_validate = reference_ch
         .mix (tumor_input.tumor_bam, tumor_input.tumor_index, normal_input.normal_bam, normal_input.normal_index)
         }
-    if (params.use_call_region) {
+    if (params.use_intersect_regions) {
         file_to_validate = file_to_validate.mix(
             Channel.from(
-                params.call_region,
-                params.call_region_index
+                params.intersect_regions,
+                params.intersect_regions_index
                 )
             )
         }
diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf
index 7bf8696d..2b8f39a4 100644
--- a/module/intersect-processes.nf
+++ b/module/intersect-processes.nf
@@ -25,8 +25,8 @@ process intersect_VCFs_BCFtools {
     input:
     path vcfs
     path indices
-    path call_region
-    path call_region_index
+    path intersect_region
+    path intersect_region_index
 
     output:
     path "*.vcf.gz", emit: consensus_vcf
@@ -40,8 +40,8 @@ process intersect_VCFs_BCFtools {
     vcf_list = vcfs.join(' ')
     """
     set -euo pipefail
-    bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${call_region} ${vcf_list}
+    bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${intersect_region} ${vcf_list}
     awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt  | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done
-    bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${call_region} ${vcf_list}
+    bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_region} ${vcf_list}
     """
     }
diff --git a/module/intersect.nf b/module/intersect.nf
index e1694081..cb4c0b9a 100644
--- a/module/intersect.nf
+++ b/module/intersect.nf
@@ -10,8 +10,8 @@ workflow intersect {
         intersect_VCFs_BCFtools(
             tool_vcfs,
             tool_indices,
-            params.canonical_regions,
-            params.canonical_regions_index
+            params.intersect_regions,
+            params.intersect_regions_index
             )
         file_for_sha512 = intersect_VCFs_BCFtools.out.consensus_vcf
             .flatten()
diff --git a/module/mutect2.nf b/module/mutect2.nf
index 3b03be62..fc97bea0 100644
--- a/module/mutect2.nf
+++ b/module/mutect2.nf
@@ -36,7 +36,7 @@ workflow mutect2 {
         if (params.intervals) {
             intervals = params.intervals
         } else {
-            intervals = params.canonical_regions
+            intervals = "${projectDir}/config/hg38_chromosomes_canonical.bed"
             // process non-canonical chromosome regions seperately
             // as this region requires more memory than the canonical regions
             call_sSNVInNonAssembledChromosomes_Mutect2(
diff --git a/module/strelka2-processes.nf b/module/strelka2-processes.nf
index 74bd4d84..1ce1291d 100644
--- a/module/strelka2-processes.nf
+++ b/module/strelka2-processes.nf
@@ -38,7 +38,7 @@ process call_sIndel_Manta {
 
     script:
     exome = params.exome ? "--exome" : ""
-    call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : ""
+    call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : ""
     """
     configManta.py \
         --normalBam $normal \
@@ -82,7 +82,7 @@ process call_sSNV_Strelka2 {
 
     script:
     exome = params.exome ? "--exome" : ""
-    call_region_command = params.use_canonical_regions ? "--callRegions ${call_region}" : ""
+    call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : ""
     """
     set -euo pipefail
     configureStrelkaSomaticWorkflow.py \
diff --git a/module/strelka2.nf b/module/strelka2.nf
index c1b3ab9f..429dc997 100644
--- a/module/strelka2.nf
+++ b/module/strelka2.nf
@@ -26,8 +26,8 @@ workflow strelka2 {
             normal_index,
             params.reference,
             "${params.reference}.fai",
-            params.canonical_regions,
-            params.canonical_regions_index
+            params.intersect_regions,
+            params.intersect_regions_index
         )
         call_sSNV_Strelka2(
             tumor_bam,
@@ -37,8 +37,8 @@ workflow strelka2 {
             params.reference,
             "${params.reference}.fai",
             call_sIndel_Manta.out[0],
-            params.canonical_regions,
-            params.canonical_regions_index
+            params.intersect_regions,
+            params.intersect_regions_index
         )
         filter_VCF_BCFtools(call_sSNV_Strelka2.out.snvs_vcf
             .mix(call_sSNV_Strelka2.out.indels_vcf))
@@ -58,4 +58,3 @@ workflow strelka2 {
             .map{ it -> ["${it[2]}"] }
 
     }
-
diff --git a/test/config/a_mini-all-tools.config b/test/config/a_mini-all-tools.config
index 60d71945..46cad5cb 100644
--- a/test/config/a_mini-all-tools.config
+++ b/test/config/a_mini-all-tools.config
@@ -13,18 +13,19 @@ includeConfig "${projectDir}/config/methods.config"
 params {
     algorithm = ['somaticsniper', 'strelka2', 'mutect2', 'muse']
     reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta'
+    intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz'
     dataset_id = 'TWGSAMIN'
     // set params.exome to TRUE will add the '--exome' option when running manta and strelka2
     // set params.exome to TRUE will add the '-E' option when running MuSE
     exome = true
-    save_intermediate_files = true
+    save_intermediate_files = false
 
     // module options
     bgzip_extra_args = ''
     tabix_extra_args = ''
 
     // strelka2 options
-    call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz'
+    use_intersect_regions = true
 
     // mutect2 options
     split_intervals_extra_args = ''
diff --git a/test/config/a_mini-muse.config b/test/config/a_mini-muse.config
index 08045db7..a8c12850 100644
--- a/test/config/a_mini-muse.config
+++ b/test/config/a_mini-muse.config
@@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config"
 params {
     algorithm = ['muse'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse'
     reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta'
+    intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz'
     dataset_id = 'TWGSAMIN'
     // set params.exome to TRUE will add the '--exome' option when running manta and strelka2
     // set params.exome to TRUE will add the '-E' option when running MuSE
@@ -24,7 +25,7 @@ params {
     tabix_extra_args = ''
 
     // strelka2 options
-    call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz'
+    use_intersect_regions = true
 
     // mutect2 options
     split_intervals_extra_args = ''
diff --git a/test/config/a_mini-mutect2.config b/test/config/a_mini-mutect2.config
index 35713744..cd43d7e6 100644
--- a/test/config/a_mini-mutect2.config
+++ b/test/config/a_mini-mutect2.config
@@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config"
 params {
     algorithm = ['mutect2'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse'
     reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta'
+    intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz'
     dataset_id = 'TWGSAMIN'
     // set params.exome to TRUE will add the '--exome' option when running manta and strelka2
     // set params.exome to TRUE will add the '-E' option when running MuSE
@@ -24,7 +25,7 @@ params {
     tabix_extra_args = ''
 
     // strelka2 options
-    call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz'
+    use_intersect_regions = true
 
     // mutect2 options
     split_intervals_extra_args = ''
diff --git a/test/config/a_mini-somaticsniper.config b/test/config/a_mini-somaticsniper.config
index 6a8ccd6f..2bc4ae92 100644
--- a/test/config/a_mini-somaticsniper.config
+++ b/test/config/a_mini-somaticsniper.config
@@ -13,6 +13,7 @@ includeConfig "${projectDir}/config/methods.config"
 params {
     algorithm = ['somaticsniper'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse'
     reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta'
+    intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz'
     dataset_id = 'TWGSAMIN'
     // set params.exome to TRUE will add the '--exome' option when running manta and strelka2
     // set params.exome to TRUE will add the '-E' option when running MuSE
@@ -24,7 +25,7 @@ params {
     tabix_extra_args = ''
 
     // strelka2 options
-    call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz'
+    use_intersect_regions = true
 
     // mutect2 options
     split_intervals_extra_args = ''
diff --git a/test/config/a_mini-strelka2.config b/test/config/a_mini-strelka2.config
index f106759b..5665c987 100644
--- a/test/config/a_mini-strelka2.config
+++ b/test/config/a_mini-strelka2.config
@@ -14,6 +14,7 @@ params {
     algorithm = ['strelka2'] // 'somaticsniper', 'strelka2', 'mutect2', 'muse'
     reference = '/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta'
     dataset_id = 'TWGSAMIN'
+    intersect_regions = '/hot/ref/tool-specific-input/pipeline-call-sSNV-6.0.0/GRCh38-BI-20160721/hg38_chromosomes_canonical.bed.gz'
     // set params.exome to TRUE will add the '--exome' option when running manta and strelka2
     // set params.exome to TRUE will add the '-E' option when running MuSE
     exome = true
@@ -24,7 +25,7 @@ params {
     tabix_extra_args = ''
 
     // strelka2 options
-    call_region = '/hot/ref/tool-specific-input/Strelka2/GRCh38/strelka2_call_region.bed.gz'
+    use_intersect_regions = true
 
     // mutect2 options
     split_intervals_extra_args = ''

From 44e4f74bed958f4be271f90d7e2d019317d87299 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Fri, 14 Jul 2023 17:15:33 -0700
Subject: [PATCH 11/16] remove call regions from isec-2-or-more

---
 module/intersect-processes.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf
index 2b8f39a4..8a4a3f59 100644
--- a/module/intersect-processes.nf
+++ b/module/intersect-processes.nf
@@ -40,7 +40,7 @@ process intersect_VCFs_BCFtools {
     vcf_list = vcfs.join(' ')
     """
     set -euo pipefail
-    bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more --regions-file ${intersect_region} ${vcf_list}
+    bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list}
     awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt  | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done
     bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_region} ${vcf_list}
     """

From 4a0e4e6f6bb5d48b50cf02b6661992d1b41e6805 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Mon, 17 Jul 2023 11:26:45 -0700
Subject: [PATCH 12/16] call_regions to intersect_regions within
 strelka2-processes

---
 module/strelka2-processes.nf | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/module/strelka2-processes.nf b/module/strelka2-processes.nf
index 1ce1291d..00ec17d3 100644
--- a/module/strelka2-processes.nf
+++ b/module/strelka2-processes.nf
@@ -27,8 +27,8 @@ process call_sIndel_Manta {
     path normal_index
     path reference
     path reference_index
-    path call_region
-    path call_region_index
+    path intersect_regions
+    path intersect_regions_index
 
     output:
     tuple path("MantaWorkflow/results/variants/candidateSmallIndels.vcf.gz"),
@@ -38,7 +38,7 @@ process call_sIndel_Manta {
 
     script:
     exome = params.exome ? "--exome" : ""
-    call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : ""
+    call_region_command = params.use_intersect_regions ? "--callRegions ${intersect_regions}" : ""
     """
     configManta.py \
         --normalBam $normal \
@@ -71,8 +71,8 @@ process call_sSNV_Strelka2 {
     path reference
     path reference_index
     tuple path(indel_candidates), path(indel_candidates_index)
-    path call_region
-    path call_region_index
+    path intersect_regions
+    path intersect_regions_index
 
     output:
     tuple val("SNV"), path("StrelkaSomaticWorkflow/results/variants/somatic.snvs.vcf.gz"), emit: snvs_vcf
@@ -82,7 +82,7 @@ process call_sSNV_Strelka2 {
 
     script:
     exome = params.exome ? "--exome" : ""
-    call_region_command = params.use_intersect_regions ? "--callRegions ${call_region}" : ""
+    call_region_command = params.use_intersect_regions ? "--callRegions ${intersect_regions}" : ""
     """
     set -euo pipefail
     configureStrelkaSomaticWorkflow.py \

From e1775fb471c46e46f0c5173614c41e094abb77d8 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Tue, 18 Jul 2023 09:51:46 -0700
Subject: [PATCH 13/16] consistent plural intersect_regions

---
 main.nf                       | 2 +-
 module/intersect-processes.nf | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index a05c0f0c..5b536791 100755
--- a/main.nf
+++ b/main.nf
@@ -30,7 +30,7 @@ log.info """\
         reference: ${params.reference}
         reference_index: ${params.reference_index}
         reference_dict: ${params.reference_dict}
-        intersect_region: ${params.intersect_region}
+        intersect_regions: ${params.intersect_regions}
 
     - output:
         output_dir: ${params.output_dir_base}
diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf
index 8a4a3f59..25a8b10d 100644
--- a/module/intersect-processes.nf
+++ b/module/intersect-processes.nf
@@ -25,8 +25,8 @@ process intersect_VCFs_BCFtools {
     input:
     path vcfs
     path indices
-    path intersect_region
-    path intersect_region_index
+    path intersect_regions
+    path intersect_regions_index
 
     output:
     path "*.vcf.gz", emit: consensus_vcf
@@ -42,6 +42,6 @@ process intersect_VCFs_BCFtools {
     set -euo pipefail
     bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list}
     awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt  | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done
-    bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_region} ${vcf_list}
+    bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_regions} ${vcf_list}
     """
     }

From 365956cc3310a27d1e286ad5d0119778c42615c4 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Tue, 18 Jul 2023 10:07:08 -0700
Subject: [PATCH 14/16] add comments to bcftools isec script

---
 module/intersect-processes.nf | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/module/intersect-processes.nf b/module/intersect-processes.nf
index 25a8b10d..257d1a2a 100644
--- a/module/intersect-processes.nf
+++ b/module/intersect-processes.nf
@@ -40,8 +40,11 @@ process intersect_VCFs_BCFtools {
     vcf_list = vcfs.join(' ')
     """
     set -euo pipefail
+    # intersect keeping only variants that are present in at least 2 VCFs
+    # Use README.txt to rename output files to include sample names
     bcftools isec --nfiles +2 --output-type z --prefix isec-2-or-more ${vcf_list}
     awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt  | sed 's/.vcf.gz\$/-consensus-variants.vcf.gz/' | while read a b c d; do mv \$a \$d ; mv \$a.tbi \$d.tbi ; done
+    # intersect, keeping all variants, to create presence/absence list of variants in each VCF
     bcftools isec --output-type z --prefix isec-1-or-more --regions-file ${intersect_regions} ${vcf_list}
     """
     }

From e2fe7bb049a6e035316c216750352034fb9a4d05 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Tue, 18 Jul 2023 10:17:44 -0700
Subject: [PATCH 15/16] edit interect_regions help

---
 config/schema.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/schema.yaml b/config/schema.yaml
index e63ebdea..6b61e0fc 100644
--- a/config/schema.yaml
+++ b/config/schema.yaml
@@ -49,7 +49,7 @@ exome:
 intersect_regions:
   type: 'Path'
   required: true
-  help: 'call regions used by mutect2, strelka2 and intersect'
+  help: 'call regions bed file used by mutect2, strelka2 and intersect'
 use_intersect_regions:
   type: 'Bool'
   required: true

From a0c532a087ca01e025ec36c76307868477249428 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Thu, 20 Jul 2023 12:50:32 -0700
Subject: [PATCH 16/16] always validate intersect_regions files

---
 main.nf | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/main.nf b/main.nf
index 5b536791..04b22cd5 100755
--- a/main.nf
+++ b/main.nf
@@ -132,14 +132,12 @@ workflow {
         file_to_validate = reference_ch
         .mix (tumor_input.tumor_bam, tumor_input.tumor_index, normal_input.normal_bam, normal_input.normal_index)
         }
-    if (params.use_intersect_regions) {
-        file_to_validate = file_to_validate.mix(
-            Channel.from(
-                params.intersect_regions,
-                params.intersect_regions_index
-                )
+    file_to_validate = file_to_validate.mix(
+        Channel.from(
+            params.intersect_regions,
+            params.intersect_regions_index
             )
-        }
+        )
     run_validate_PipeVal(file_to_validate)
     run_validate_PipeVal.out.validation_result.collectFile(
         name: 'input_validation.txt', newLine: true,