From 1f88afd65f133bbd427d8b9fbacec007bb21accc Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 3 Feb 2023 15:56:54 -0500 Subject: [PATCH 01/28] Separate the select sample task as a individual wdl --- .dockstore.yml | 7 +- BenchmarkCNV/BenchmarkCNV.inputs.json | 12 ++-- BenchmarkCNV/BenchmarkCNV.wdl | 62 +++--------------- .../SelectSampleFromCallSet.inputs.json | 9 +++ BenchmarkCNV/SelectSampleFromCallSet.wdl | 65 +++++++++++++++++++ 5 files changed, 96 insertions(+), 59 deletions(-) create mode 100644 BenchmarkCNV/SelectSampleFromCallSet.inputs.json create mode 100644 BenchmarkCNV/SelectSampleFromCallSet.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 059ca98..6ff83ce 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -35,8 +35,13 @@ workflows: primaryDescriptorPath: /wdl/CollectSamErrorMetrics.wdl testParameterFiles: - /input_json/CollectSamErrorMetrics_inputs.json + - name: SelectSampleFromCallSet + subclass: WDL + primaryDescriptorPath: /BenchmarkCNV/SelectSampleFromCallSet.wdl + testParameterFiles: + - /BenchmarkCNV/SelectSampleFromCallSet.inputs.json - name: BenchmarkCNV subclass: WDL primaryDescriptorPath: /BenchmarkCNV/BenchmarkCNV.wdl testParameterFiles: - - /BenchmarkCNV/BenchmarkCNV_test.json \ No newline at end of file + - /BenchmarkCNV/BenchmarkCNV.inputs.json diff --git a/BenchmarkCNV/BenchmarkCNV.inputs.json b/BenchmarkCNV/BenchmarkCNV.inputs.json index 2810dbe..3c2b002 100644 --- a/BenchmarkCNV/BenchmarkCNV.inputs.json +++ b/BenchmarkCNV/BenchmarkCNV.inputs.json @@ -1,12 +1,16 @@ { + "Benchmark_CNV_Caller.BenchmarkCNV.mem_size": "Int? (optional)", "Benchmark_CNV_Caller.wittyer_docker": "us.gcr.io/tag-team-160914/wittyer:v2", - "Benchmark_CNV_Caller.bcftools_docker": "us.gcr.io/broad-dsde-methods/liquidbiopsy:0.0.4.3", + "Benchmark_CNV_Caller.BenchmarkCNV.mem": "Int? (optional)", + "Benchmark_CNV_Caller.BenchmarkCNV.disk_size": "Int? 
(optional)", + "Benchmark_CNV_Caller.wittyer4mat_docker": "us.gcr.io/tag-team-160914/wittyer4mat:v9-funnydual-broom", "Benchmark_CNV_Caller.wittyer_sv_config": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/sv-config.json", - "Benchmark_CNV_Caller.wittyer_cnv_evaluation_mode": "CrossTypeAndSimpleCounting", - "Benchmark_CNV_Caller.wittyer_sv_evaluation_mode": "SimpleCounting", + "Benchmark_CNV_Caller.wittyer_cnv_evaluation_mode": "SimpleCounting", + "Benchmark_CNV_Caller.BenchmarkCNV.disk_space": "Int? (optional)", + "Benchmark_CNV_Caller.wittyer_sv_evaluation_mode": "CrossTypeAndSimpleCounting", "Benchmark_CNV_Caller.wittyer_cnv_config": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/cnv-config.json", - "Benchmark_CNV_Caller.variant_callset": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/1KGP_3202.gatksv_svtools_novelins.freeze_V3_fixed.wAF.vcf", "Benchmark_CNV_Caller.truth_sample_name": "HG00513", + "Benchmark_CNV_Caller.truth_vcf": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/example_truth_vcf/HG00513_filtered.vcf", "Benchmark_CNV_Caller.query_sample_name": "SM-GZQKA", "Benchmark_CNV_Caller.eval_sv_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/HG00513_HG00513_1_SM-GZQKA_v1/HG00513_HG00513_1_SM-GZQKA_v1.sv.vcf.gz", "Benchmark_CNV_Caller.eval_cnv_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/HG00513_HG00513_1_SM-GZQKA_v1/HG00513_HG00513_1_SM-GZQKA_v1.cnv.vcf.gz" diff --git a/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV.wdl index f612805..c0fa423 100644 --- a/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV.wdl @@ -2,33 +2,24 @@ version 1.0 workflow Benchmark_CNV_Caller { input { - String bcftools_docker - File variant_callset String truth_sample_name String query_sample_name - String wittyer_docker + File truth_vcf File eval_cnv_vcf - File wittyer_cnv_config - String wittyer_cnv_evaluation_mode File eval_sv_vcf + File wittyer_cnv_config File wittyer_sv_config + String 
wittyer_cnv_evaluation_mode String wittyer_sv_evaluation_mode + String wittyer_docker String wittyer4mat_docker } - # Select vcf for specific sample - call SelectSample { - input: - bcftools_docker = bcftools_docker, - vcf = variant_callset, - truth_sample_name = truth_sample_name - } - # benchmark cnv.vcf and sv.vcf using witty.er tool call BenchmarkCNV { input: wittyer_docker = wittyer_docker, - truth_vcf = SelectSample.output_vcf, + truth_vcf = truth_vcf, truth_sample_name = truth_sample_name, query_sample_name = query_sample_name, eval_cnv_vcf = eval_cnv_vcf, @@ -50,7 +41,6 @@ workflow Benchmark_CNV_Caller { # Outputs that will be retained when execution is complete output { - File truth_vcf = SelectSample.output_vcf File cnv_wittyer_stats = BenchmarkCNV.cnv_wittyer_stats File cnv_wittyer_annotated_vcf = BenchmarkCNV.cnv_wittyer_annotated_vcf File cnv_wittyer_annotated_vcf_index = BenchmarkCNV.cnv_wittyer_annotated_vcf_index @@ -70,46 +60,10 @@ workflow Benchmark_CNV_Caller { } } - # Task 1: Select sample vcf from a large callset (e.g. 1KGP) - task SelectSample { - - input { - String bcftools_docker - File vcf - String truth_sample_name - Int? mem - Int? 
disk_space - # If mem and disk size were not specified, use 4GB and 100 GB as default - Int mem_size = select_first([mem, 4]) - Int disk_size = select_first([disk_space,100]) - - } - command <<< - set -e - # Select sample using bcftools - bcftools view -s ~{truth_sample_name} -O v -o ~{truth_sample_name}.vcf ~{vcf} - - # Remove Complex SV from the sample vcf because wittyer can't process CPX variants - # Remove INV from the sample vcf because wittyer's exception - # Remove reference allele - - bcftools view -e 'SVTYPE="INV" | SVTYPE="CPX" | GT="0/0"' ~{truth_sample_name}.vcf -o ~{truth_sample_name}_filtered.vcf - - >>> - runtime { - docker: bcftools_docker - bootDiskSizeGb: 12 - memory: mem_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: 2 - } - output { - File output_vcf = "~{truth_sample_name}_filtered.vcf" - } - -} - # Task 2: Benchmark the large variant vcf against truth set generated in task 1 + # Task 1: Benchmark the large variant vcf against truth set + # If you are extracting vcf from a large callset vcf + # Checkout /BenchmarkCNV/SelectSampleFromCallSet.wdl task BenchmarkCNV { input { diff --git a/BenchmarkCNV/SelectSampleFromCallSet.inputs.json b/BenchmarkCNV/SelectSampleFromCallSet.inputs.json new file mode 100644 index 0000000..a3fd317 --- /dev/null +++ b/BenchmarkCNV/SelectSampleFromCallSet.inputs.json @@ -0,0 +1,9 @@ +{ + "SelectSampleFromCallSet.SelectSample.mem": "Int? (optional)", + "SelectSampleFromCallSet.sample_name": "HG00513", + "SelectSampleFromCallSet.SelectSample.disk_size": "Int? (optional)", + "SelectSampleFromCallSet.variant_callset": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/1KGP_3202.gatksv_svtools_novelins.freeze_V3_fixed.wAF.vcf", + "SelectSampleFromCallSet.SelectSample.disk_space": "Int? (optional)", + "SelectSampleFromCallSet.SelectSample.mem_size": "Int? 
(optional)", + "SelectSampleFromCallSet.bcftools_docker": "us.gcr.io/broad-dsde-methods/liquidbiopsy:0.0.4.3" +} diff --git a/BenchmarkCNV/SelectSampleFromCallSet.wdl b/BenchmarkCNV/SelectSampleFromCallSet.wdl new file mode 100644 index 0000000..3e5fd5a --- /dev/null +++ b/BenchmarkCNV/SelectSampleFromCallSet.wdl @@ -0,0 +1,65 @@ +version 1.0 + +workflow SelectSampleFromCallSet { + input { + String bcftools_docker + File variant_callset + String sample_name + } + + # Select vcf for specific sample + call SelectSample { + input: + bcftools_docker = bcftools_docker, + vcf = variant_callset, + sample_name = sample_name + } + + + # Outputs that will be retained when execution is complete + output { + File selected_vcf = SelectSample.output_vcf + } + meta { + author: "Yueyao Gao" + email: "tag@broadinstitute.org" + description: "SelectSampleFromCallSet.wdl is design to extract specific sample from a large callset vcf" + } +} +task SelectSample { + + input { + String bcftools_docker + File vcf + String sample_name + Int? mem + Int? 
disk_space + # If mem and disk size were not specified, use 4GB and 100 GB as default + Int mem_size = select_first([mem, 4]) + Int disk_size = select_first([disk_space,100]) + + } + command <<< + set -e + # Select sample using bcftools + bcftools view -s ~{sample_name} -O v -o ~{sample_name}.vcf ~{vcf} + + # Remove Complex SV from the sample vcf because wittyer can't process CPX variants + # Remove INV from the sample vcf because wittyer's exception + # Remove reference allele + + bcftools view -e 'SVTYPE="INV" | SVTYPE="CPX" | GT="0/0"' ~{sample_name}.vcf -o ~{sample_name}_filtered.vcf + + >>> + runtime { + docker: bcftools_docker + bootDiskSizeGb: 12 + memory: mem_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } + output { + File output_vcf = "~{sample_name}_filtered.vcf" + } + +} \ No newline at end of file From 0e757c7e1b6646a1c520507981b89accbd024cf6 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 23 Feb 2023 10:31:49 -0500 Subject: [PATCH 02/28] Added the optional bed file option --- BenchmarkCNV/BenchmarkCNV.wdl | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV.wdl index c0fa423..526b5a4 100644 --- a/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV.wdl @@ -9,6 +9,7 @@ workflow Benchmark_CNV_Caller { File eval_sv_vcf File wittyer_cnv_config File wittyer_sv_config + File? bedfile String wittyer_cnv_evaluation_mode String wittyer_sv_evaluation_mode String wittyer_docker @@ -27,7 +28,8 @@ workflow Benchmark_CNV_Caller { cnv_evaluation_mode = wittyer_cnv_evaluation_mode, eval_sv_vcf = eval_sv_vcf, sv_config_file = wittyer_sv_config, - sv_evaluation_mode = wittyer_sv_evaluation_mode + sv_evaluation_mode = wittyer_sv_evaluation_mode, + bedfile = bedfile } # wittyer4mat to parse the wittyer json output @@ -74,6 +76,7 @@ workflow Benchmark_CNV_Caller { String cnv_evaluation_mode File eval_sv_vcf File sv_config_file + File? 
bedfile String sv_evaluation_mode String truth_sample_name String query_sample_name @@ -85,6 +88,24 @@ workflow Benchmark_CNV_Caller { } command <<< set -e + + if [[ -f "~{bedfile}" ]]; then + # Run Benchmarking tool wittyer on dragen generated cnv.vcf with bed file + /opt/Wittyer/Wittyer -i ~{eval_cnv_vcf} \ + -t ~{truth_vcf} \ + -em ~{cnv_evaluation_mode} \ + --configFile ~{cnv_config_file} \ + --includeBed ~{bedfile} \ + -o ~{truth_sample_name}_cnv_wittyer_output + + # Run Benchmarking tool wittyer on dragen generated sv.vcf with bed file + /opt/Wittyer/Wittyer -i ~{eval_sv_vcf} \ + -t ~{truth_vcf} \ + -em ~{sv_evaluation_mode} \ + --configFile ~{sv_config_file} \ + --includeBed ~{bedfile} \ + -o ~{truth_sample_name}_sv_wittyer_output + else # Run Benchmarking tool wittyer on dragen generated cnv.vcf /opt/Wittyer/Wittyer -i ~{eval_cnv_vcf} \ -t ~{truth_vcf} \ From 8746e226b5022892b953fdc4cd3d18978a258ca5 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 23 Feb 2023 10:53:58 -0500 Subject: [PATCH 03/28] fix broken if-then-else statement --- BenchmarkCNV/BenchmarkCNV.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV.wdl index 526b5a4..965e749 100644 --- a/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV.wdl @@ -119,7 +119,7 @@ workflow Benchmark_CNV_Caller { -em ~{sv_evaluation_mode} \ --configFile ~{sv_config_file} \ -o ~{truth_sample_name}_sv_wittyer_output - + fi >>> runtime { docker: wittyer_docker From 9e8d101a5c750ad4e45f176061ed9212e65c4c00 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 10:00:55 -0500 Subject: [PATCH 04/28] Generalize BenchmarkCNV wdl --- BenchmarkCNV/BenchmarkCNV.inputs.json | 28 ++++--- BenchmarkCNV/BenchmarkCNV.wdl | 104 ++++++++------------------ 2 files changed, 46 insertions(+), 86 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV.inputs.json b/BenchmarkCNV/BenchmarkCNV.inputs.json index 3c2b002..7f0581a 100644 --- 
a/BenchmarkCNV/BenchmarkCNV.inputs.json +++ b/BenchmarkCNV/BenchmarkCNV.inputs.json @@ -1,17 +1,15 @@ { - "Benchmark_CNV_Caller.BenchmarkCNV.mem_size": "Int? (optional)", + "Benchmark_CNV_Caller.bedfile": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/HighConfidenceRegion_BedFile/HG002_hg38_SVs_Tier1_v0.6.bed", + "Benchmark_CNV_Caller.BenchmarkCNV.mem_size": 4, "Benchmark_CNV_Caller.wittyer_docker": "us.gcr.io/tag-team-160914/wittyer:v2", - "Benchmark_CNV_Caller.BenchmarkCNV.mem": "Int? (optional)", - "Benchmark_CNV_Caller.BenchmarkCNV.disk_size": "Int? (optional)", - "Benchmark_CNV_Caller.wittyer4mat_docker": "us.gcr.io/tag-team-160914/wittyer4mat:v9-funnydual-broom", - "Benchmark_CNV_Caller.wittyer_sv_config": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/sv-config.json", - "Benchmark_CNV_Caller.wittyer_cnv_evaluation_mode": "SimpleCounting", - "Benchmark_CNV_Caller.BenchmarkCNV.disk_space": "Int? (optional)", - "Benchmark_CNV_Caller.wittyer_sv_evaluation_mode": "CrossTypeAndSimpleCounting", - "Benchmark_CNV_Caller.wittyer_cnv_config": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/cnv-config.json", - "Benchmark_CNV_Caller.truth_sample_name": "HG00513", - "Benchmark_CNV_Caller.truth_vcf": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/example_truth_vcf/HG00513_filtered.vcf", - "Benchmark_CNV_Caller.query_sample_name": "SM-GZQKA", - "Benchmark_CNV_Caller.eval_sv_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/HG00513_HG00513_1_SM-GZQKA_v1/HG00513_HG00513_1_SM-GZQKA_v1.sv.vcf.gz", - "Benchmark_CNV_Caller.eval_cnv_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/HG00513_HG00513_1_SM-GZQKA_v1/HG00513_HG00513_1_SM-GZQKA_v1.cnv.vcf.gz" -} + "Benchmark_CNV_Caller.BenchmarkCNV.mem": 4, + "Benchmark_CNV_Caller.wittyer_evaluation_mode": "SimpleCounting", + "Benchmark_CNV_Caller.BenchmarkCNV.disk_size": 100, + "Benchmark_CNV_Caller.wittyer4mat_docker": "us.gcr.io/tag-team-160914/wittyer4mat:v10-stat-broom", + 
"Benchmark_CNV_Caller.eval_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/NA24385_hg38/NA24385.cnv.vcf.gz", + "Benchmark_CNV_Caller.BenchmarkCNV.disk_space": 100, + "Benchmark_CNV_Caller.truth_sample_name": "NA24385", + "Benchmark_CNV_Caller.truth_vcf": "gs://fc-cd2e8270-8c64-4f10-bcfb-2f1b5f44aee4/submissions/47e84385-8207-4ad0-a412-b2c113efb470/SelectSampleFromCallSet/17210bc4-e6c8-4ce5-bb23-486fcb18ab89/call-SelectSample/NA24385_filtered.vcf", + "Benchmark_CNV_Caller.query_sample_name": "SM-MVWA8", + "Benchmark_CNV_Caller.wittyer_config": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/cnv-config.json" +} \ No newline at end of file diff --git a/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV.wdl index 965e749..f7be199 100644 --- a/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV.wdl @@ -5,13 +5,10 @@ workflow Benchmark_CNV_Caller { String truth_sample_name String query_sample_name File truth_vcf - File eval_cnv_vcf - File eval_sv_vcf - File wittyer_cnv_config - File wittyer_sv_config + File eval_vcf + File wittyer_config File? 
bedfile - String wittyer_cnv_evaluation_mode - String wittyer_sv_evaluation_mode + String wittyer_evaluation_mode String wittyer_docker String wittyer4mat_docker } @@ -23,12 +20,9 @@ workflow Benchmark_CNV_Caller { truth_vcf = truth_vcf, truth_sample_name = truth_sample_name, query_sample_name = query_sample_name, - eval_cnv_vcf = eval_cnv_vcf, - cnv_config_file = wittyer_cnv_config, - cnv_evaluation_mode = wittyer_cnv_evaluation_mode, - eval_sv_vcf = eval_sv_vcf, - sv_config_file = wittyer_sv_config, - sv_evaluation_mode = wittyer_sv_evaluation_mode, + eval_vcf = eval_vcf, + wittyer_config = wittyer_config, + wittyer_evaluation_mode = wittyer_evaluation_mode, bedfile = bedfile } @@ -36,24 +30,16 @@ workflow Benchmark_CNV_Caller { call Wittyer4Mat { input: wittyer4mat_docker = wittyer4mat_docker, - cnv_wittyer_stats = BenchmarkCNV.cnv_wittyer_stats, - sv_wittyer_stats = BenchmarkCNV.sv_wittyer_stats, + wittyer_stats = BenchmarkCNV.wittyer_stats, truth_sample_name = truth_sample_name } # Outputs that will be retained when execution is complete output { - File cnv_wittyer_stats = BenchmarkCNV.cnv_wittyer_stats - File cnv_wittyer_annotated_vcf = BenchmarkCNV.cnv_wittyer_annotated_vcf - File cnv_wittyer_annotated_vcf_index = BenchmarkCNV.cnv_wittyer_annotated_vcf_index - File sv_wittyer_stats = BenchmarkCNV.sv_wittyer_stats - File sv_wittyer_annotated_vcf = BenchmarkCNV.sv_wittyer_annotated_vcf - File sv_wittyer_annotated_vcf_index = BenchmarkCNV.sv_wittyer_annotated_vcf_index - File cnv_deletion_stat = Wittyer4Mat.cnv_deletion_stat - File cnv_duplication_stat = Wittyer4Mat.cnv_duplication_stat - File sv_deletion_stat = Wittyer4Mat.sv_deletion_stat - File sv_duplication_stat = Wittyer4Mat.sv_duplication_stat - File sv_insertion_stat = Wittyer4Mat.sv_insertion_stat + File wittyer_stats = BenchmarkCNV.wittyer_stats + File wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf + File wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index + Array[File] 
Wittyer4Mat_stat = Wittyer4Mat.formatted_stat } meta { author: "Yueyao Gao" @@ -71,13 +57,10 @@ workflow Benchmark_CNV_Caller { input { String wittyer_docker File truth_vcf - File eval_cnv_vcf - File cnv_config_file - String cnv_evaluation_mode - File eval_sv_vcf - File sv_config_file + File eval_vcf + File wittyer_config + String wittyer_evaluation_mode File? bedfile - String sv_evaluation_mode String truth_sample_name String query_sample_name Int? mem @@ -91,34 +74,21 @@ workflow Benchmark_CNV_Caller { if [[ -f "~{bedfile}" ]]; then # Run Benchmarking tool wittyer on dragen generated cnv.vcf with bed file - /opt/Wittyer/Wittyer -i ~{eval_cnv_vcf} \ + /opt/Wittyer/Wittyer -i ~{eval_vcf} \ -t ~{truth_vcf} \ - -em ~{cnv_evaluation_mode} \ - --configFile ~{cnv_config_file} \ + -em ~{wittyer_evaluation_mode} \ + --configFile ~{wittyer_config} \ --includeBed ~{bedfile} \ - -o ~{truth_sample_name}_cnv_wittyer_output + -o ~{truth_sample_name}_wittyer_output - # Run Benchmarking tool wittyer on dragen generated sv.vcf with bed file - /opt/Wittyer/Wittyer -i ~{eval_sv_vcf} \ - -t ~{truth_vcf} \ - -em ~{sv_evaluation_mode} \ - --configFile ~{sv_config_file} \ - --includeBed ~{bedfile} \ - -o ~{truth_sample_name}_sv_wittyer_output else # Run Benchmarking tool wittyer on dragen generated cnv.vcf - /opt/Wittyer/Wittyer -i ~{eval_cnv_vcf} \ + /opt/Wittyer/Wittyer -i ~{eval_vcf} \ -t ~{truth_vcf} \ - -em ~{cnv_evaluation_mode} \ - --configFile ~{cnv_config_file} \ - -o ~{truth_sample_name}_cnv_wittyer_output + -em ~{wittyer_evaluation_mode} \ + --configFile ~{wittyer_config} \ + -o ~{truth_sample_name}_wittyer_output - # Run Benchmarking tool wittyer on dragen generated sv.vcf - /opt/Wittyer/Wittyer -i ~{eval_sv_vcf} \ - -t ~{truth_vcf} \ - -em ~{sv_evaluation_mode} \ - --configFile ~{sv_config_file} \ - -o ~{truth_sample_name}_sv_wittyer_output fi >>> runtime { @@ -129,12 +99,9 @@ workflow Benchmark_CNV_Caller { preemptible: 2 } output { - File cnv_wittyer_stats = 
"~{truth_sample_name}_cnv_wittyer_output/Wittyer.Stats.json" - File cnv_wittyer_annotated_vcf = "~{truth_sample_name}_cnv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz" - File cnv_wittyer_annotated_vcf_index = "~{truth_sample_name}_cnv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz.tbi" - File sv_wittyer_stats = "~{truth_sample_name}_sv_wittyer_output/Wittyer.Stats.json" - File sv_wittyer_annotated_vcf = "~{truth_sample_name}_sv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz" - File sv_wittyer_annotated_vcf_index = "~{truth_sample_name}_sv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz.tbi" + File wittyer_stats = "~{truth_sample_name}_wittyer_output/Wittyer.Stats.json" + File wittyer_annotated_vcf = "~{truth_sample_name}_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz" + File wittyer_annotated_vcf_index = "~{truth_sample_name}_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz.tbi" } } # Task3: Format the wittyer json output @@ -142,8 +109,7 @@ workflow Benchmark_CNV_Caller { input{ String wittyer4mat_docker - File cnv_wittyer_stats - File sv_wittyer_stats + File wittyer_stats String truth_sample_name } command <<< @@ -153,27 +119,23 @@ workflow Benchmark_CNV_Caller { mkdir ~{truth_sample_name}_cnv_wittyer4mat conda run --no-capture-output \ -n wittyer-parser \ - python3 /wittyer4mat/wittyer_4mat.py -i ~{cnv_wittyer_stats} \ - -t cnv \ - -o ~{truth_sample_name}_cnv_wittyer4mat + python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} \ + -t base \ + -o ~{truth_sample_name}_base_level_wittyer4mat # Run wittyer4mat script on sv wittyer output mkdir ~{truth_sample_name}_sv_wittyer4mat conda run --no-capture-output \ -n wittyer-parser \ - python3 /wittyer4mat/wittyer_4mat.py -i ~{sv_wittyer_stats} \ - -t sv \ - -o ~{truth_sample_name}_sv_wittyer4mat + python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} 
\ + -t event \ + -o ~{truth_sample_name}_event_level_wittyer4mat >>> runtime { docker: wittyer4mat_docker preemptible: 2 } output { - File cnv_deletion_stat = "~{truth_sample_name}_cnv_wittyer4mat/wittyer_cnv_Deletion_output.csv" - File cnv_duplication_stat = "~{truth_sample_name}_cnv_wittyer4mat/wittyer_cnv_Duplication_output.csv" - File sv_deletion_stat = "~{truth_sample_name}_sv_wittyer4mat/wittyer_sv_Deletion_output.csv" - File sv_duplication_stat = "~{truth_sample_name}_sv_wittyer4mat/wittyer_sv_Duplication_output.csv" - File sv_insertion_stat = "~{truth_sample_name}_sv_wittyer4mat/wittyer_sv_Insertion_output.csv" + Array[File] formatted_stat = glob("~{truth_sample_name}_event_level_wittyer4mat/*.csv") } } \ No newline at end of file From c0d96ac7d0e5269909fd2b20049b5b85bfd76bbf Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 10:09:51 -0500 Subject: [PATCH 05/28] Fixed an input error in inputs.json --- BenchmarkCNV/BenchmarkCNV.inputs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV.inputs.json b/BenchmarkCNV/BenchmarkCNV.inputs.json index 7f0581a..9dec994 100644 --- a/BenchmarkCNV/BenchmarkCNV.inputs.json +++ b/BenchmarkCNV/BenchmarkCNV.inputs.json @@ -3,7 +3,7 @@ "Benchmark_CNV_Caller.BenchmarkCNV.mem_size": 4, "Benchmark_CNV_Caller.wittyer_docker": "us.gcr.io/tag-team-160914/wittyer:v2", "Benchmark_CNV_Caller.BenchmarkCNV.mem": 4, - "Benchmark_CNV_Caller.wittyer_evaluation_mode": "SimpleCounting", + "Benchmark_CNV_Caller.wittyer_evaluation_mode": "CrossTypeAndSimpleCounting", "Benchmark_CNV_Caller.BenchmarkCNV.disk_size": 100, "Benchmark_CNV_Caller.wittyer4mat_docker": "us.gcr.io/tag-team-160914/wittyer4mat:v10-stat-broom", "Benchmark_CNV_Caller.eval_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/NA24385_hg38/NA24385.cnv.vcf.gz", From ed38ba4505e7f68a8a3fcc700b16fee96910390e Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 10:23:00 -0500 Subject: [PATCH 06/28] Add both 
base-level and event-level to workflow output --- BenchmarkCNV/BenchmarkCNV.wdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV.wdl index f7be199..d416673 100644 --- a/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV.wdl @@ -39,7 +39,8 @@ workflow Benchmark_CNV_Caller { File wittyer_stats = BenchmarkCNV.wittyer_stats File wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf File wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index - Array[File] Wittyer4Mat_stat = Wittyer4Mat.formatted_stat + Array[File] Wittyer4Mat_event_stats = Wittyer4Mat.event_level_wittyer_stats + Array[File] Wittyer4Mat_base_stats = Wittyer4Mat.base_level_wittyer_stats } meta { author: "Yueyao Gao" @@ -136,6 +137,7 @@ workflow Benchmark_CNV_Caller { preemptible: 2 } output { - Array[File] formatted_stat = glob("~{truth_sample_name}_event_level_wittyer4mat/*.csv") + Array[File] event_level_wittyer_stats = glob("~{truth_sample_name}_event_level_wittyer4mat/*.csv") + Array[File] base_level_wittyer_stats = glob("~{truth_sample_name}_base_level_wittyer4mat/*.csv") } } \ No newline at end of file From c499c6c8e5104f83ca052bf2791bd59ea647c27a Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 10:35:20 -0500 Subject: [PATCH 07/28] Updated the directory to allow more workflows to be utilized --- .dockstore.yml | 8 ++++---- BenchmarkCNV/{ => BenchmarkCNV}/BenchmarkCNV.inputs.json | 0 BenchmarkCNV/{ => BenchmarkCNV}/BenchmarkCNV.wdl | 0 .../SelectSampleFromCallSet.inputs.json | 0 .../SelectSampleFromCallSet.wdl | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename BenchmarkCNV/{ => BenchmarkCNV}/BenchmarkCNV.inputs.json (100%) rename BenchmarkCNV/{ => BenchmarkCNV}/BenchmarkCNV.wdl (100%) rename BenchmarkCNV/{ => SelectSampleFromCallSet}/SelectSampleFromCallSet.inputs.json (100%) rename BenchmarkCNV/{ => SelectSampleFromCallSet}/SelectSampleFromCallSet.wdl (100%) 
diff --git a/.dockstore.yml b/.dockstore.yml index 4478115..760a64f 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -22,14 +22,14 @@ workflows: - /input_json/CollectSamErrorMetrics_inputs.json - name: SelectSampleFromCallSet subclass: WDL - primaryDescriptorPath: /BenchmarkCNV/SelectSampleFromCallSet.wdl + primaryDescriptorPath: /BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl testParameterFiles: - - /BenchmarkCNV/SelectSampleFromCallSet.inputs.json + - /BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json - name: BenchmarkCNV subclass: WDL - primaryDescriptorPath: /BenchmarkCNV/BenchmarkCNV.wdl + primaryDescriptorPath: /BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl testParameterFiles: - - /BenchmarkCNV/BenchmarkCNV.inputs.json + - /BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json - name: GATK4_CNV subclass: WDL primaryDescriptorPath: /GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.wdl diff --git a/BenchmarkCNV/BenchmarkCNV.inputs.json b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json similarity index 100% rename from BenchmarkCNV/BenchmarkCNV.inputs.json rename to BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json diff --git a/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl similarity index 100% rename from BenchmarkCNV/BenchmarkCNV.wdl rename to BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl diff --git a/BenchmarkCNV/SelectSampleFromCallSet.inputs.json b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json similarity index 100% rename from BenchmarkCNV/SelectSampleFromCallSet.inputs.json rename to BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json diff --git a/BenchmarkCNV/SelectSampleFromCallSet.wdl b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl similarity index 100% rename from BenchmarkCNV/SelectSampleFromCallSet.wdl rename to BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl From 0569dc5dc0f5c95ddb101d6a9dc0aac4100a7bf5 Mon Sep 
17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 11:39:50 -0500 Subject: [PATCH 08/28] Updated the Wittyer4mat image to the latest one --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json index 9dec994..e5bf04e 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json @@ -5,7 +5,7 @@ "Benchmark_CNV_Caller.BenchmarkCNV.mem": 4, "Benchmark_CNV_Caller.wittyer_evaluation_mode": "CrossTypeAndSimpleCounting", "Benchmark_CNV_Caller.BenchmarkCNV.disk_size": 100, - "Benchmark_CNV_Caller.wittyer4mat_docker": "us.gcr.io/tag-team-160914/wittyer4mat:v10-stat-broom", + "Benchmark_CNV_Caller.wittyer4mat_docker": "us.gcr.io/tag-team-160914/wittyer4mat:v11-outstat-broom", "Benchmark_CNV_Caller.eval_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/NA24385_hg38/NA24385.cnv.vcf.gz", "Benchmark_CNV_Caller.BenchmarkCNV.disk_space": 100, "Benchmark_CNV_Caller.truth_sample_name": "NA24385", From b73f1cc27432b3bce36804061b5c093ac78209e2 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 13:32:01 -0500 Subject: [PATCH 09/28] Added MergeVCF.wdl --- BenchmarkCNV/MergeVCF/MergeVCF.inputs.json | 10 ++++ BenchmarkCNV/MergeVCF/MergeVCF.wdl | 68 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 BenchmarkCNV/MergeVCF/MergeVCF.inputs.json create mode 100644 BenchmarkCNV/MergeVCF/MergeVCF.wdl diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.inputs.json b/BenchmarkCNV/MergeVCF/MergeVCF.inputs.json new file mode 100644 index 0000000..b0b19e4 --- /dev/null +++ b/BenchmarkCNV/MergeVCF/MergeVCF.inputs.json @@ -0,0 +1,10 @@ +{ + "MergeVCF.mergeVCF.mem_size": "Int? (optional)", + "MergeVCF.mergeVCF.mem": "Int? (optional)", + "MergeVCF.mergeVCF.disk_space": "Int? 
(optional)", + "MergeVCF.vcf1": "File", + "MergeVCF.sample_name": "String", + "MergeVCF.vcf2": "File", + "MergeVCF.bcftools_docker": "us.gcr.io/broad-dsde-methods/liquidbiopsy:0.0.4.3", + "MergeVCF.mergeVCF.disk_size": "Int? (optional)" +} diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl new file mode 100644 index 0000000..0507b08 --- /dev/null +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -0,0 +1,68 @@ +version 1.0 + +workflow MergeVCF { + input { + String bcftools_docker + File vcf1 + File vcf2 + String sample_name + } + + # Merge Two Input VCF Files + call mergeVCF { + input: + bcftools_docker = bcftools_docker, + vcf1 = vcf1, + vcf2 = vcf2, + sample_name = sample_name + } + + + # Outputs that will be retained when execution is complete + output { + File selected_vcf = mergeVCF.output_vcf + } + meta { + author: "Yueyao Gao" + email: "tag@broadinstitute.org" + description: "MergeVCF.wdl is design to merge two input vcfs" + } +} +task mergeVCF { + + input { + String bcftools_docker + File vcf1 + File vcf2 + String sample_name + Int? mem + Int? 
disk_space + # If mem and disk size were not specified, use 4GB and 100 GB as default + Int mem_size = select_first([mem, 4]) + Int disk_size = select_first([disk_space,100]) + + } + command <<< + set -e + + echo "Input VCF files:" + echo "vcf1: ~{vcf1}" + echo "vcf2: ~{vcf2}" + + bcftools concat ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf + + # Print the output file name + echo "Merged VCF file: ~{sample_name}_merged.vcf" + >>> + runtime { + docker: bcftools_docker + bootDiskSizeGb: 12 + memory: mem_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } + output { + File output_vcf = "~{sample_name}_merged.vcf" + } + +} \ No newline at end of file From a5155dc062a8c58a5e79b80266147b2b833e83b1 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 13:34:08 -0500 Subject: [PATCH 10/28] Added MergeVCF to dockerstore.yml --- .dockstore.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.dockstore.yml b/.dockstore.yml index 760a64f..278e019 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -25,6 +25,11 @@ workflows: primaryDescriptorPath: /BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl testParameterFiles: - /BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json + - name: MergeVCF + subclass: WDL + primaryDescriptorPath: /BenchmarkCNV/MergeVCF/MergeVCF.wdl + testParameterFiles: + - /BenchmarkCNV/MergeVCF/MergeVCF.inputs.json - name: BenchmarkCNV subclass: WDL primaryDescriptorPath: /BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl From 96485f3e84cfbee911ee22004acc8e98c3922ac7 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 13:54:03 -0500 Subject: [PATCH 11/28] Changed the output VCF name --- BenchmarkCNV/MergeVCF/MergeVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl index 0507b08..51522e4 100644 --- a/BenchmarkCNV/MergeVCF/MergeVCF.wdl +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -20,7 
+20,7 @@ workflow MergeVCF { # Outputs that will be retained when execution is complete output { - File selected_vcf = mergeVCF.output_vcf + File merged_vcf = mergeVCF.output_vcf } meta { author: "Yueyao Gao" From b4fd6918d543e8fdda1adc89a60ee1c1d3d00809 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Mon, 27 Feb 2023 14:40:07 -0500 Subject: [PATCH 12/28] Continue reporting the output regardless of the warning --- BenchmarkCNV/MergeVCF/MergeVCF.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl index 51522e4..b07626e 100644 --- a/BenchmarkCNV/MergeVCF/MergeVCF.wdl +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -49,7 +49,9 @@ task mergeVCF { echo "vcf1: ~{vcf1}" echo "vcf2: ~{vcf2}" - bcftools concat ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf + # Use bcftools to concat two vcfs + # Ignore the warning message + bcftools concat ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf 2>/dev/null || true # Print the output file name echo "Merged VCF file: ~{sample_name}_merged.vcf" From 6dc34f351e5430291a908c68f5557868ec2edbdf Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Tue, 28 Feb 2023 14:26:57 -0500 Subject: [PATCH 13/28] Added allow-overlapping option in merge --- BenchmarkCNV/MergeVCF/MergeVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl index b07626e..714ee1b 100644 --- a/BenchmarkCNV/MergeVCF/MergeVCF.wdl +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -51,7 +51,7 @@ task mergeVCF { # Use bcftools to concat two vcfs # Ignore the warning message - bcftools concat ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf 2>/dev/null || true + bcftools concat -a ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf # Print the output file name echo "Merged VCF file: ~{sample_name}_merged.vcf" From 449b9012513a6e8e79a78d8b6ba6127fad3023a2 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Tue, 28 Feb 2023 15:11:17 
-0500 Subject: [PATCH 14/28] Updated wdl with index step before merging --- BenchmarkCNV/MergeVCF/MergeVCF.wdl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl index 714ee1b..746979b 100644 --- a/BenchmarkCNV/MergeVCF/MergeVCF.wdl +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -49,12 +49,15 @@ task mergeVCF { echo "vcf1: ~{vcf1}" echo "vcf2: ~{vcf2}" + echo "Index the input VCFs" + bcftools index -t ~{vcf1} + bcftools index -t ~{vcf2} + # Use bcftools to concat two vcfs - # Ignore the warning message - bcftools concat -a ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf + bcftools concat -Oz -a ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf.gz # Print the output file name - echo "Merged VCF file: ~{sample_name}_merged.vcf" + echo "Merged VCF file: ~{sample_name}_merged.vcf.gz" >>> runtime { docker: bcftools_docker @@ -64,7 +67,7 @@ task mergeVCF { preemptible: 2 } output { - File output_vcf = "~{sample_name}_merged.vcf" + File output_vcf = "~{sample_name}_merged.vcf.gz" } } \ No newline at end of file From ce4f252e8039ae029a379cb33197423994e7306a Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Wed, 1 Mar 2023 14:03:24 -0500 Subject: [PATCH 15/28] Only collect event level stats using 4mat script --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index d416673..dbb1beb 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -40,7 +40,6 @@ workflow Benchmark_CNV_Caller { File wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf File wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index Array[File] Wittyer4Mat_event_stats = Wittyer4Mat.event_level_wittyer_stats - Array[File] Wittyer4Mat_base_stats = Wittyer4Mat.base_level_wittyer_stats } meta { 
author: "Yueyao Gao" @@ -116,16 +115,7 @@ workflow Benchmark_CNV_Caller { command <<< set -e - # Run wittyer4mat script on cnv wittyer output - mkdir ~{truth_sample_name}_cnv_wittyer4mat - conda run --no-capture-output \ - -n wittyer-parser \ - python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} \ - -t base \ - -o ~{truth_sample_name}_base_level_wittyer4mat - - # Run wittyer4mat script on sv wittyer output - mkdir ~{truth_sample_name}_sv_wittyer4mat + # Run wittyer4mat script on wittyer output conda run --no-capture-output \ -n wittyer-parser \ python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} \ @@ -138,6 +128,5 @@ workflow Benchmark_CNV_Caller { } output { Array[File] event_level_wittyer_stats = glob("~{truth_sample_name}_event_level_wittyer4mat/*.csv") - Array[File] base_level_wittyer_stats = glob("~{truth_sample_name}_base_level_wittyer4mat/*.csv") } } \ No newline at end of file From 70ab5cca1619dce5c2ddfeb4b5ec98f6597ad7ab Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 2 Mar 2023 13:56:51 -0500 Subject: [PATCH 16/28] Reflect the sample name in the output CSV name --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index dbb1beb..6939a0f 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -121,6 +121,7 @@ workflow Benchmark_CNV_Caller { python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} \ -t event \ -o ~{truth_sample_name}_event_level_wittyer4mat + -p ~{truth_sample_name} >>> runtime { docker: wittyer4mat_docker From 93e30ef8af665dad6d85349c7be76f059cc8ece8 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 2 Mar 2023 14:13:15 -0500 Subject: [PATCH 17/28] Fixed typo in 4mat command --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl 
b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index 6939a0f..a6322f9 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -120,7 +120,7 @@ workflow Benchmark_CNV_Caller { -n wittyer-parser \ python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} \ -t event \ - -o ~{truth_sample_name}_event_level_wittyer4mat + -o ~{truth_sample_name}_event_level_wittyer4mat \ -p ~{truth_sample_name} >>> runtime { From 9151da77156550ff3aa96c9b5d0c4aa4cb062928 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 23 Mar 2023 17:40:16 -0400 Subject: [PATCH 18/28] Added BGZIP command --- BenchmarkCNV/MergeVCF/MergeVCF.wdl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl index 746979b..a6c2adb 100644 --- a/BenchmarkCNV/MergeVCF/MergeVCF.wdl +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -49,12 +49,16 @@ task mergeVCF { echo "vcf1: ~{vcf1}" echo "vcf2: ~{vcf2}" + # bgzf-compress the input vcf files + bcftools view ~{vcf1} -Oz -o ~{vcf1}.gz + bcftools view ~{vcf2} -Oz -o ~{vcf2}.gz + echo "Index the input VCFs" - bcftools index -t ~{vcf1} - bcftools index -t ~{vcf2} + bcftools index -t ~{vcf1}.gz + bcftools index -t ~{vcf2}.gz # Use bcftools to concat two vcfs - bcftools concat -Oz -a ~{vcf1} ~{vcf2} -o ~{sample_name}_merged.vcf.gz + bcftools concat -Oz -a ~{vcf1}.gz ~{vcf2}.gz -o ~{sample_name}_merged.vcf.gz # Print the output file name echo "Merged VCF file: ~{sample_name}_merged.vcf.gz" From 7da8e7451f8f6ed1277e9ed936009baf1fbf53b4 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 23 Mar 2023 18:29:37 -0400 Subject: [PATCH 19/28] Make the wittyer output vcf be optional --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index a6322f9..0879ab7 100644 --- 
a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -37,8 +37,8 @@ workflow Benchmark_CNV_Caller { # Outputs that will be retained when execution is complete output { File wittyer_stats = BenchmarkCNV.wittyer_stats - File wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf - File wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index + File? wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf + File? wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index Array[File] Wittyer4Mat_event_stats = Wittyer4Mat.event_level_wittyer_stats } meta { From 86d39671c3e47247f1f5c394056a51da900f5e03 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 23 Mar 2023 18:58:03 -0400 Subject: [PATCH 20/28] make the vcf output required --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index 0879ab7..a6322f9 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -37,8 +37,8 @@ workflow Benchmark_CNV_Caller { # Outputs that will be retained when execution is complete output { File wittyer_stats = BenchmarkCNV.wittyer_stats - File? wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf - File? 
wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index + File wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf + File wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index Array[File] Wittyer4Mat_event_stats = Wittyer4Mat.event_level_wittyer_stats } meta { From 86d35a0bd040dc9219d299a9662075332ff7c696 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 23 Mar 2023 20:00:33 -0400 Subject: [PATCH 21/28] make 4mat task optional --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index a6322f9..fa6f27f 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -11,6 +11,7 @@ workflow Benchmark_CNV_Caller { String wittyer_evaluation_mode String wittyer_docker String wittyer4mat_docker + Boolean? run_wittyer4mat } # benchmark cnv.vcf and sv.vcf using witty.er tool @@ -27,11 +28,13 @@ workflow Benchmark_CNV_Caller { } # wittyer4mat to parse the wittyer json output - call Wittyer4Mat { - input: - wittyer4mat_docker = wittyer4mat_docker, - wittyer_stats = BenchmarkCNV.wittyer_stats, - truth_sample_name = truth_sample_name + if (run_wittyer4mat) { + call Wittyer4Mat { + input: + wittyer4mat_docker = wittyer4mat_docker, + wittyer_stats = BenchmarkCNV.wittyer_stats, + truth_sample_name = truth_sample_name + } } # Outputs that will be retained when execution is complete @@ -39,7 +42,7 @@ workflow Benchmark_CNV_Caller { File wittyer_stats = BenchmarkCNV.wittyer_stats File wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf File wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index - Array[File] Wittyer4Mat_event_stats = Wittyer4Mat.event_level_wittyer_stats + Array[File]? 
Wittyer4Mat_event_stats = Wittyer4Mat.event_level_wittyer_stats } meta { author: "Yueyao Gao" From d9f5411a0fa727a559edc28e623d27d1095cd6dd Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Thu, 23 Mar 2023 20:04:00 -0400 Subject: [PATCH 22/28] fixed the input typo --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index fa6f27f..abdd707 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -11,7 +11,7 @@ workflow Benchmark_CNV_Caller { String wittyer_evaluation_mode String wittyer_docker String wittyer4mat_docker - Boolean? run_wittyer4mat + Boolean run_wittyer4mat } # benchmark cnv.vcf and sv.vcf using witty.er tool From 369efec3caeb12fbeb890016b9a500f632e3fd76 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 14 Apr 2023 14:01:11 -0400 Subject: [PATCH 23/28] Replace bash if statement with WDL trick --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index abdd707..e1f3178 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -75,24 +75,14 @@ workflow Benchmark_CNV_Caller { command <<< set -e - if [[ -f "~{bedfile}" ]]; then - # Run Benchmarking tool wittyer on dragen generated cnv.vcf with bed file + # Run Benchmarking tool wittyer on dragen generated cnv.vcf and truth set /opt/Wittyer/Wittyer -i ~{eval_vcf} \ -t ~{truth_vcf} \ -em ~{wittyer_evaluation_mode} \ --configFile ~{wittyer_config} \ - --includeBed ~{bedfile} \ + ~{'--includeBed '+ bedfile} \ -o ~{truth_sample_name}_wittyer_output - else - # Run Benchmarking tool wittyer on dragen generated cnv.vcf - /opt/Wittyer/Wittyer -i ~{eval_vcf} \ - -t ~{truth_vcf} \ - -em 
~{wittyer_evaluation_mode} \ - --configFile ~{wittyer_config} \ - -o ~{truth_sample_name}_wittyer_output - - fi >>> runtime { docker: wittyer_docker From 6d5746f191e7fb0f891d86e4b5660e660d7efec2 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 14 Apr 2023 15:10:34 -0400 Subject: [PATCH 24/28] BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index e1f3178..0d323e3 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -113,8 +113,7 @@ workflow Benchmark_CNV_Caller { -n wittyer-parser \ python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} \ -t event \ - -o ~{truth_sample_name}_event_level_wittyer4mat \ - -p ~{truth_sample_name} + -o ~{truth_sample_name}_event_level_wittyer4mat >>> runtime { docker: wittyer4mat_docker From 72a12a6aa27d8fbceb9141b4e113a1e53a67620e Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 14 Apr 2023 15:25:23 -0400 Subject: [PATCH 25/28] Changed the taskname based on PR comment --- .../SelectSampleFromCallSet.wdl | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl index 3e5fd5a..b8b4206 100644 --- a/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl +++ b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl @@ -8,7 +8,7 @@ workflow SelectSampleFromCallSet { } # Select vcf for specific sample - call SelectSample { + call SelectSampleRemoveComplexSV { input: bcftools_docker = bcftools_docker, vcf = variant_callset, @@ -18,7 +18,7 @@ workflow SelectSampleFromCallSet { # Outputs that will be retained when execution is complete output { - File selected_vcf = SelectSample.output_vcf + File selected_vcf = 
SelectSampleRemoveComplexSV.output_vcf } meta { author: "Yueyao Gao" @@ -26,40 +26,41 @@ workflow SelectSampleFromCallSet { description: "SelectSampleFromCallSet.wdl is design to extract specific sample from a large callset vcf" } } -task SelectSample { + # Select sample from a vcf and remove complex SVs and INV + task SelectSampleRemoveComplexSV { - input { - String bcftools_docker - File vcf - String sample_name - Int? mem - Int? disk_space - # If mem and disk size were not specified, use 4GB and 100 GB as default - Int mem_size = select_first([mem, 4]) - Int disk_size = select_first([disk_space,100]) + input { + String bcftools_docker + File vcf + String sample_name + Int? mem + Int? disk_space + # If mem and disk size were not specified, use 4GB and 100 GB as default + Int mem_size = select_first([mem, 4]) + Int disk_size = select_first([disk_space,100]) - } - command <<< - set -e - # Select sample using bcftools - bcftools view -s ~{sample_name} -O v -o ~{sample_name}.vcf ~{vcf} + } + command <<< + set -e + # Select sample using bcftools + bcftools view -s ~{sample_name} -O v -o ~{sample_name}.vcf ~{vcf} - # Remove Complex SV from the sample vcf because wittyer can't process CPX variants - # Remove INV from the sample vcf because wittyer's exception - # Remove reference allele + # Remove Complex SV from the sample vcf because wittyer can't process CPX variants + # Remove INV from the sample vcf because wittyer's exception + # Remove reference allele - bcftools view -e 'SVTYPE="INV" | SVTYPE="CPX" | GT="0/0"' ~{sample_name}.vcf -o ~{sample_name}_filtered.vcf + bcftools view -e 'SVTYPE="INV" | SVTYPE="CPX" | GT="0/0"' ~{sample_name}.vcf -o ~{sample_name}_filtered.vcf - >>> - runtime { - docker: bcftools_docker - bootDiskSizeGb: 12 - memory: mem_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: 2 - } - output { - File output_vcf = "~{sample_name}_filtered.vcf" - } + >>> + runtime { + docker: bcftools_docker + bootDiskSizeGb: 12 + memory: 
mem_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } + output { + File output_vcf = "~{sample_name}_filtered.vcf" + } -} \ No newline at end of file + } \ No newline at end of file From 15093916555fead0fb31cc4332e256556f15d8d0 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 14 Apr 2023 15:30:23 -0400 Subject: [PATCH 26/28] Added vcf sorting task based on PR comment --- BenchmarkCNV/MergeVCF/MergeVCF.wdl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl index a6c2adb..25894a2 100644 --- a/BenchmarkCNV/MergeVCF/MergeVCF.wdl +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -60,8 +60,11 @@ task mergeVCF { # Use bcftools to concat two vcfs bcftools concat -Oz -a ~{vcf1}.gz ~{vcf2}.gz -o ~{sample_name}_merged.vcf.gz + #sort the merged vcf + bcftools sort -Oz -o ~{sample_name}_sorted_merged.vcf.gz ~{sample_name}_merged.vcf.gz + # Print the output file name - echo "Merged VCF file: ~{sample_name}_merged.vcf.gz" + echo "Merged VCF file: ~{sample_name}_sorted_merged.vcf.gz" >>> runtime { docker: bcftools_docker @@ -71,7 +74,7 @@ task mergeVCF { preemptible: 2 } output { - File output_vcf = "~{sample_name}_merged.vcf.gz" + File output_vcf = "~{sample_name}_sorted_merged.vcf.gz" } } \ No newline at end of file From 52be6f0b897f11144dc40410669d778129bf992b Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 14 Apr 2023 15:37:45 -0400 Subject: [PATCH 27/28] Added a default docker image --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index 0d323e3..651d6f7 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -9,7 +9,7 @@ workflow Benchmark_CNV_Caller { File wittyer_config File? 
bedfile String wittyer_evaluation_mode - String wittyer_docker + String wittyer_docker = "yg96/wittyer:v2" String wittyer4mat_docker Boolean run_wittyer4mat } From 019a8d9f15c52c341fafc9fd1de88a4f3d4d6dc7 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Fri, 14 Apr 2023 15:54:30 -0400 Subject: [PATCH 28/28] Fixed a typo in a comment --- BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl index 651d6f7..fe433ae 100644 --- a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -97,7 +97,7 @@ workflow Benchmark_CNV_Caller { File wittyer_annotated_vcf_index = "~{truth_sample_name}_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz.tbi" } } - # Task3: Format the wittyer json output + # Task2: Format the wittyer json output task Wittyer4Mat{ input{