uclahs-cds · sorelfitzgibbon · Oct 4, 2023 · Sep 26, 2023 · Sep 28, 2023 · Sep 29, 2023
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add check for MuSE or Mutect2 on F2 node
 
 ### Changed
+- Reorder all VCFs before intersection
+- Move `filter_VCF_BCFtools` to `common.nf`
 - Fix blarchive compression log output directory
 - Delay readcount compression until original file is no longer needed
 

@@ -174,10 +174,10 @@ workflow {
         }
 
     // Set empty channels so any unused tools don't cause failure at intersect step
-    Channel.empty().set { somaticsniper_vcf_ch }
-    Channel.empty().set { strelka2_vcf_ch }
-    Channel.empty().set { mutect2_vcf_ch }
-    Channel.empty().set { muse_vcf_ch }
+    Channel.empty().set { somaticsniper_gzvcf_ch }
+    Channel.empty().set { strelka2_gzvcf_ch }
+    Channel.empty().set { mutect2_gzvcf_ch }
+    Channel.empty().set { muse_gzvcf_ch }
 
     Channel.empty().set { somaticsniper_idx_ch }
     Channel.empty().set { strelka2_idx_ch }
@@ -193,7 +193,7 @@ workflow {
             run_GetSampleName_Mutect2_normal.out.name_ch,
             run_GetSampleName_Mutect2_tumor.out.name_ch
             )
-            somaticsniper.out.vcf.set { somaticsniper_vcf_ch }
+            somaticsniper.out.gzvcf.set { somaticsniper_gzvcf_ch }
             somaticsniper.out.idx.set { somaticsniper_idx_ch }
         }
     if ('strelka2' in params.algorithm) {
@@ -205,7 +205,7 @@ workflow {
             run_GetSampleName_Mutect2_normal.out.name_ch,
             run_GetSampleName_Mutect2_tumor.out.name_ch
             )
-            strelka2.out.vcf.set { strelka2_vcf_ch }
+            strelka2.out.gzvcf.set { strelka2_gzvcf_ch }
             strelka2.out.idx.set { strelka2_idx_ch }
         }
     if ('mutect2' in params.algorithm) {
@@ -216,7 +216,7 @@ workflow {
             normal_input.normal_index.collect(),
             tumor_input.contamination_est.collect()
             )
-            mutect2.out.vcf.set { mutect2_vcf_ch }
+            mutect2.out.gzvcf.set { mutect2_gzvcf_ch }
             mutect2.out.idx.set { mutect2_idx_ch }
         }
     if ('muse' in params.algorithm) {
@@ -228,30 +228,28 @@ workflow {
             run_GetSampleName_Mutect2_normal.out.name_ch,
             run_GetSampleName_Mutect2_tumor.out.name_ch
             )
-            muse.out.vcf.set { muse_vcf_ch }
+            muse.out.gzvcf.set { muse_gzvcf_ch }
             muse.out.idx.set { muse_idx_ch }
         }
 
     // Intersect all vcf files
     if (params.algorithm.size() > 1) {
-        tool_vcfs = (somaticsniper_vcf_ch
-            .mix(strelka2_vcf_ch)
-            .mix(mutect2_vcf_ch)
-            .mix(muse_vcf_ch))
+        tool_gzvcfs = (somaticsniper_gzvcf_ch
+            .mix(strelka2_gzvcf_ch)
+            .mix(mutect2_gzvcf_ch)
+            .mix(muse_gzvcf_ch))
             .collect()
-
         tool_indices = (somaticsniper_idx_ch
             .mix(strelka2_idx_ch)
             .mix(mutect2_idx_ch)
             .mix(muse_idx_ch))
             .collect()
-
         intersect(
-            tool_vcfs,
+            tool_gzvcfs,
             tool_indices,
             script_dir_ch,
-            run_GetSampleName_Mutect2_normal.out.name_ch,
-            run_GetSampleName_Mutect2_tumor.out.name_ch
+            run_GetSampleName_Mutect2_normal.out.name_ch.first(),
+            run_GetSampleName_Mutect2_tumor.out.name_ch.first()
             )
         }
     }
@@ -8,6 +8,31 @@ Docker Images:
 - docker_image_blarchive: ${params.docker_image_blarchive}
 """
 
+process filter_VCF_BCFtools {
+    container params.docker_image_BCFtools
+    publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
+        mode: "copy",
+        pattern: "*.vcf.gz",
+        enabled: params.save_intermediate_files
+    publishDir path: "${params.workflow_log_output_dir}",
+        mode: "copy",
+        pattern: ".command.*",
+        saveAs: { "${task.process.split(':')[-1]}-${var_type}/log${file(it).getName()}" }
+
+    input:
+    tuple val(var_type), path(vcf)
+
+    output:
+    tuple val(var_type), path("*.vcf.gz"), emit: gzvcf
+    path ".command.*"
+
+    script:
+    """
+    set -euo pipefail
+    bcftools view -f PASS  --output-type z --output ${params.output_filename}_${var_type}-pass.vcf.gz ${vcf}
+    """
+    }
+
 process generate_sha512sum {
     container params.docker_image_validate_params
     publishDir path: "${params.workflow_output_dir}/output",
@@ -52,7 +77,7 @@ process rename_samples_BCFtools {
     tuple val(var_type), path(vcf)
 
     output:
-    tuple val(var_type), path("*.vcf.gz"), emit: fix_vcf
+    tuple val(var_type), path("*.vcf.gz"), emit: gzvcf
     path ".command.*"
     path "*_samples.txt"
 

@@ -10,6 +10,36 @@ Intersect Options:
 - vcf2maf_extra_args:  ${params.vcf2maf_extra_args}
 ====================================
 """
+process reorder_samples_BCFtools {
+    container params.docker_image_BCFtools
+    publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
+        mode: "copy",
+        pattern: "*.vcf.gz",
+        enabled: params.save_intermediate_files
+    publishDir path: "${params.workflow_log_output_dir}",
+        mode: "copy",
+        pattern: ".command.*",
+        saveAs: { "${task.process.split(':')[-1]}-${algorithm}/log${file(it).getName()}" }
+
+    input:
+    tuple val(algorithm), path(gzvcf)
+    path indices
+    val tumor_id
+    val normal_id
+
+    output:
+    path "*-reorder.vcf.gz", emit: gzvcf
+    path ".command.*"
+
+    script:
+    """
+    set -euo pipefail
+    infile=\$(basename ${gzvcf} .vcf.gz)
+    outfile="\${infile}-reorder.vcf.gz"
+    bcftools view -s ${tumor_id},${normal_id} --output \${outfile} ${gzvcf}
+    """
+    }
+
 process intersect_VCFs_BCFtools {
     container params.docker_image_BCFtools
     publishDir path: "${params.workflow_output_dir}/output",
@@ -29,20 +59,20 @@ process intersect_VCFs_BCFtools {
         saveAs: { "${task.process.split(':')[-1]}/log${file(it).getName()}" }
 
     input:
-    path vcfs
+    path gzvcf
     path indices
     path intersect_regions
     path intersect_regions_index
 
     output:
-    path "*.vcf.gz", emit: intersect_vcf
-    path "*.vcf.gz.tbi", emit: intersect_idx
+    path "*.vcf.gz", emit: gzvcf
+    path "*.vcf.gz.tbi", emit: idx
     path ".command.*"
     path "isec-2-or-more/*.txt"
     path "isec-1-or-more/*.txt", emit: isec
 
     script:
-    vcf_list = vcfs.join(' ')
+    vcf_list = gzvcf.join(' ')
     regions_command = params.use_intersect_regions ? "--regions-file ${intersect_regions}" : ""
     """
     set -euo pipefail
@@ -54,13 +84,13 @@ process intersect_VCFs_BCFtools {
         ${regions_command} \
         ${vcf_list}
     awk '/Using the following file names:/{x=1;next} x' isec-2-or-more/README.txt  \
-        | sed 's/.vcf.gz\$/-intersect.vcf.gz/' \
+        | sed 's/-reorder.vcf.gz\$/-intersect.vcf.gz/' \
         | while read a b c d; do
             mv \$a \$d
             mv \$a.tbi \$d.tbi
             done
     # intersect, keeping all variants, to create presence/absence list of variants in each VCF
-    bcftools isec \
+    bcftools isec --nfiles +1\
         --output-type z \
         --prefix isec-1-or-more \
         ${regions_command} \
@@ -109,7 +139,7 @@ process concat_VCFs_BCFtools {
     path indices
 
     output:
-    path "*concat.vcf", emit: concat_vcf
+    path "*concat.vcf", emit: vcf
     path ".command.*"
 
     script:
@@ -146,7 +176,7 @@ process convert_VCF_vcf2maf {
     val tumor_id
 
     output:
-    path "*.maf", emit: concat_maf
+    path "*.maf", emit: maf
     path ".command.*"
 
     script:

@@ -3,8 +3,16 @@ include { compress_file_blarchive} from './common'  addParams(
     blarchive_publishDir : "${params.workflow_output_dir}/output",
     blarchive_enabled : true
     )
-include { intersect_VCFs_BCFtools; plot_VennDiagram_R; concat_VCFs_BCFtools ; convert_VCF_vcf2maf } from './intersect-processes.nf'
-include { compress_index_VCF } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
+include { reorder_samples_BCFtools; intersect_VCFs_BCFtools; plot_VennDiagram_R; concat_VCFs_BCFtools ; convert_VCF_vcf2maf } from './intersect-processes.nf'
+include { compress_index_VCF as compress_index_VCF_reordered } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
+    options: [
+        output_dir: params.workflow_output_dir,
+        log_output_dir: params.workflow_log_output_dir,
+        bgzip_extra_args: params.bgzip_extra_args,
+        tabix_extra_args: params.tabix_extra_args,
+        is_output_file: false
+        ])
+include { compress_index_VCF as compress_index_VCF_concat } from '../external/pipeline-Nextflow-module/modules/common/index_VCF_tabix/main.nf' addParams(
     options: [
         output_dir: params.workflow_output_dir,
         log_output_dir: params.workflow_log_output_dir,
@@ -21,54 +29,73 @@ def sortVcfs(List paths) {
 
 workflow intersect {
     take:
-    tool_vcfs
+    tool_gzvcfs
     tool_indices
     script_dir_ch
     normal_id
     tumor_id
 
     main:
-        vcfs_ch = tool_vcfs
+        tool_gzvcfs_ch = tool_gzvcfs
+            .flatten()
+            .map{ it -> ["${file(it).getName().split('-')[0]}", it]}
+        tool_indices_ch = tool_indices
+            .flatten()
+        reorder_samples_BCFtools(
+            tool_gzvcfs_ch,
+            tool_indices_ch,
+            normal_id,
+            tumor_id
+            )
+        compress_index_VCF_reordered(reorder_samples_BCFtools.out.gzvcf
+            .map{ it -> ["${file(it).getName().split('-')[0]}-SNV", it]}
+            )
+        gzvcfs = compress_index_VCF_reordered.out.index_out
+            .map{ it -> it[1] }
+            .collect()
             .map { sortVcfs(it)  }
+        indices = compress_index_VCF_reordered.out.index_out
+            .map{ it -> it[2] }
+            .collect()
         intersect_VCFs_BCFtools(
-            vcfs_ch,
-            tool_indices,
+            gzvcfs,
+            indices,
             params.intersect_regions,
             params.intersect_regions_index
             )
         plot_VennDiagram_R(
             script_dir_ch,
             intersect_VCFs_BCFtools.out.isec,
             )
-        intersect_vcfs_ch = intersect_VCFs_BCFtools.out.intersect_vcf
+        intersect_vcfs = intersect_VCFs_BCFtools.out.gzvcf
             .map { sortVcfs(it) }
         concat_VCFs_BCFtools(
-            intersect_vcfs_ch,
-            intersect_VCFs_BCFtools.out.intersect_idx
+            intersect_vcfs,
+            intersect_VCFs_BCFtools.out.idx
             )
         convert_VCF_vcf2maf(
-            concat_VCFs_BCFtools.out.concat_vcf,
+            concat_VCFs_BCFtools.out.vcf,
             params.reference,
             normal_id,
             tumor_id
             )
-        compress_index_VCF(concat_VCFs_BCFtools.out.concat_vcf
+        compress_index_VCF_concat(concat_VCFs_BCFtools.out.vcf
             .map{ it -> ['SNV', it]}
             )
-        compress_file_blarchive(convert_VCF_vcf2maf.out.concat_maf
+        compress_file_blarchive(convert_VCF_vcf2maf.out.maf
             .map{ it -> ['MAF', it]}
             )
-        file_for_sha512 = intersect_VCFs_BCFtools.out.intersect_vcf
+        file_for_sha512 = intersect_VCFs_BCFtools.out.gzvcf
             .flatten()
-            .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-vcf", it]}
-            .mix(intersect_VCFs_BCFtools.out.intersect_idx
+            .map{ it -> ["${file(it).getName().split('-')[0]}-vcf", it]}
+            .mix(intersect_VCFs_BCFtools.out.idx
                 .flatten()
-                .map{ it -> ["${file(it).getName().split('_')[0]}-SNV-idx", it]}
+                .map{ it -> ["${file(it).getName().split('-')[0]}-idx", it]}
                 )
-            .mix(compress_index_VCF.out.index_out
+            .mix(compress_index_VCF_concat.out.index_out
                 .map{ it -> ["concat-${it[0]}-vcf", it[1]] }
                 )
-            .mix(compress_index_VCF.out.index_out
+            .mix(compress_index_VCF_concat.out.index_out
                 .map{ it -> ["concat-${it[0]}-index", it[2]] }
                 )
             .mix(compress_file_blarchive.out.compressed_file

@@ -77,53 +77,3 @@ process run_sump_MuSE {
         -D $dbSNP
     """
     }
-
-process filter_VCF_BCFtools {
-    container params.docker_image_BCFtools
-    publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
-        mode: "copy",
-        pattern: "*.vcf.gz",
-        enabled: params.save_intermediate_files
-    publishDir path: "${params.workflow_log_output_dir}",
-        mode: "copy",
-        pattern: ".command.*",
-        saveAs: { "${task.process.split(':')[-1]}-${var_type}/log${file(it).getName()}" }
-
-    input:
-    tuple val(var_type), path(vcf)
-
-    output:
-    tuple val(var_type), path("*.vcf.gz"), emit: pass_vcf
-    path ".command.*"
-
-    script:
-    """
-    set -euo pipefail
-    bcftools view -f PASS  --output-type z --output ${params.output_filename}_${var_type}-pass.vcf.gz ${vcf}
-    """
-    }
-
-process reorder_samples_BCFtools {
-    container params.docker_image_BCFtools
-    publishDir path: "${params.workflow_output_dir}/intermediate/${task.process.split(':')[-1]}",
-        mode: "copy",
-        pattern: "*.vcf.gz",
-        enabled: params.save_intermediate_files
-    publishDir path: "${params.workflow_log_output_dir}",
-        mode: "copy",
-        pattern: ".command.*",
-        saveAs: { "${task.process.split(':')[-1]}-${var_type}/log${file(it).getName()}" }
-
-    input:
-    tuple val(var_type), path(vcf)
-
-    output:
-    tuple val(var_type), path("*.vcf.gz"), emit: reorder_vcf
-    path ".command.*"
-
-    script:
-    """
-    set -euo pipefail
-    bcftools view -s NORMAL,TUMOR --output ${params.output_filename}_pass-reorder.vcf.gz ${vcf}
-    """
-    }