diff --git a/.dockstore.yml b/.dockstore.yml index e059940..4d63fd2 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -17,11 +17,6 @@ workflows: primaryDescriptorPath: /CollectSamError/CollectSamErrorMetrics.wdl testParameterFiles: - /CollectSamError/CollectSamErrorMetrics.inputs.json - - name: BenchmarkCNV - subclass: WDL - primaryDescriptorPath: /BenchmarkCNV/BenchmarkCNV.wdl - testParameterFiles: - - /BenchmarkCNV/BenchmarkCNV_test.json - name: GATK4_CNV subclass: WDL primaryDescriptorPath: /GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.wdl @@ -79,8 +74,24 @@ workflows: primaryDescriptorPath: /checkBaitSetName/checkBaitSetName.wdl testParameterFiles: - /checkBaitSetName/checkBaitSetName.inputs.json + - name: SelectSampleFromCallSet + subclass: WDL + primaryDescriptorPath: /BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl + testParameterFiles: + - /BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json + - name: MergeVCF + subclass: WDL + primaryDescriptorPath: /BenchmarkCNV/MergeVCF/MergeVCF.wdl + testParameterFiles: + - /BenchmarkCNV/MergeVCF/MergeVCF.inputs.json + - name: BenchmarkCNV + subclass: WDL + primaryDescriptorPath: /BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl + testParameterFiles: + - /BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json - name: checkBaitSetName_dev subclass: WDL primaryDescriptorPath: /checkBaitSetName/checkBaitSetName.dev.wdl testParameterFiles: - /checkBaitSetName/checkBaitSetName.dev.inputs.json + diff --git a/BenchmarkCNV/BenchmarkCNV.inputs.json b/BenchmarkCNV/BenchmarkCNV.inputs.json deleted file mode 100644 index 2810dbe..0000000 --- a/BenchmarkCNV/BenchmarkCNV.inputs.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "Benchmark_CNV_Caller.wittyer_docker": "us.gcr.io/tag-team-160914/wittyer:v2", - "Benchmark_CNV_Caller.bcftools_docker": "us.gcr.io/broad-dsde-methods/liquidbiopsy:0.0.4.3", - "Benchmark_CNV_Caller.wittyer_sv_config": 
"gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/sv-config.json", - "Benchmark_CNV_Caller.wittyer_cnv_evaluation_mode": "CrossTypeAndSimpleCounting", - "Benchmark_CNV_Caller.wittyer_sv_evaluation_mode": "SimpleCounting", - "Benchmark_CNV_Caller.wittyer_cnv_config": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/cnv-config.json", - "Benchmark_CNV_Caller.variant_callset": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/1KGP_3202.gatksv_svtools_novelins.freeze_V3_fixed.wAF.vcf", - "Benchmark_CNV_Caller.truth_sample_name": "HG00513", - "Benchmark_CNV_Caller.query_sample_name": "SM-GZQKA", - "Benchmark_CNV_Caller.eval_sv_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/HG00513_HG00513_1_SM-GZQKA_v1/HG00513_HG00513_1_SM-GZQKA_v1.sv.vcf.gz", - "Benchmark_CNV_Caller.eval_cnv_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/HG00513_HG00513_1_SM-GZQKA_v1/HG00513_HG00513_1_SM-GZQKA_v1.cnv.vcf.gz" -} diff --git a/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV.wdl deleted file mode 100644 index f612805..0000000 --- a/BenchmarkCNV/BenchmarkCNV.wdl +++ /dev/null @@ -1,204 +0,0 @@ -version 1.0 - -workflow Benchmark_CNV_Caller { - input { - String bcftools_docker - File variant_callset - String truth_sample_name - String query_sample_name - String wittyer_docker - File eval_cnv_vcf - File wittyer_cnv_config - String wittyer_cnv_evaluation_mode - File eval_sv_vcf - File wittyer_sv_config - String wittyer_sv_evaluation_mode - String wittyer4mat_docker - } - - # Select vcf for specific sample - call SelectSample { - input: - bcftools_docker = bcftools_docker, - vcf = variant_callset, - truth_sample_name = truth_sample_name - } - - # benchmark cnv.vcf and sv.vcf using witty.er tool - call BenchmarkCNV { - input: - wittyer_docker = wittyer_docker, - truth_vcf = SelectSample.output_vcf, - truth_sample_name = truth_sample_name, - query_sample_name = query_sample_name, - eval_cnv_vcf = eval_cnv_vcf, - cnv_config_file = 
wittyer_cnv_config, - cnv_evaluation_mode = wittyer_cnv_evaluation_mode, - eval_sv_vcf = eval_sv_vcf, - sv_config_file = wittyer_sv_config, - sv_evaluation_mode = wittyer_sv_evaluation_mode - } - - # wittyer4mat to parse the wittyer json output - call Wittyer4Mat { - input: - wittyer4mat_docker = wittyer4mat_docker, - cnv_wittyer_stats = BenchmarkCNV.cnv_wittyer_stats, - sv_wittyer_stats = BenchmarkCNV.sv_wittyer_stats, - truth_sample_name = truth_sample_name - } - - # Outputs that will be retained when execution is complete - output { - File truth_vcf = SelectSample.output_vcf - File cnv_wittyer_stats = BenchmarkCNV.cnv_wittyer_stats - File cnv_wittyer_annotated_vcf = BenchmarkCNV.cnv_wittyer_annotated_vcf - File cnv_wittyer_annotated_vcf_index = BenchmarkCNV.cnv_wittyer_annotated_vcf_index - File sv_wittyer_stats = BenchmarkCNV.sv_wittyer_stats - File sv_wittyer_annotated_vcf = BenchmarkCNV.sv_wittyer_annotated_vcf - File sv_wittyer_annotated_vcf_index = BenchmarkCNV.sv_wittyer_annotated_vcf_index - File cnv_deletion_stat = Wittyer4Mat.cnv_deletion_stat - File cnv_duplication_stat = Wittyer4Mat.cnv_duplication_stat - File sv_deletion_stat = Wittyer4Mat.sv_deletion_stat - File sv_duplication_stat = Wittyer4Mat.sv_duplication_stat - File sv_insertion_stat = Wittyer4Mat.sv_insertion_stat - } - meta { - author: "Yueyao Gao" - email: "gaoyueya@broadinstitute.org" - description: "BenchmarkCNV.wdl is designed to evaluate the performance of Dragen CNV (Copy Number Variation) caller against GATK SV (Structural Variation) caller." - } -} - - # Task 1: Select sample vcf from a large callset (e.g. 1KGP) - task SelectSample { - - input { - String bcftools_docker - File vcf - String truth_sample_name - Int? mem - Int? 
disk_space - # If mem and disk size were not specified, use 4GB and 100 GB as default - Int mem_size = select_first([mem, 4]) - Int disk_size = select_first([disk_space,100]) - - } - command <<< - set -e - # Select sample using bcftools - bcftools view -s ~{truth_sample_name} -O v -o ~{truth_sample_name}.vcf ~{vcf} - - # Remove Complex SV from the sample vcf because wittyer can't process CPX variants - # Remove INV from the sample vcf because wittyer's exception - # Remove reference allele - - bcftools view -e 'SVTYPE="INV" | SVTYPE="CPX" | GT="0/0"' ~{truth_sample_name}.vcf -o ~{truth_sample_name}_filtered.vcf - - >>> - runtime { - docker: bcftools_docker - bootDiskSizeGb: 12 - memory: mem_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: 2 - } - output { - File output_vcf = "~{truth_sample_name}_filtered.vcf" - } - -} - - # Task 2: Benchmark the large variant vcf against truth set generated in task 1 - task BenchmarkCNV { - - input { - String wittyer_docker - File truth_vcf - File eval_cnv_vcf - File cnv_config_file - String cnv_evaluation_mode - File eval_sv_vcf - File sv_config_file - String sv_evaluation_mode - String truth_sample_name - String query_sample_name - Int? mem - Int? 
disk_space - # If mem and disk size were not specified, use 4GB and 100 GB as default - Int mem_size = select_first([mem, 4]) - Int disk_size = select_first([disk_space,100]) - } - command <<< - set -e - # Run Benchmarking tool wittyer on dragen generated cnv.vcf - /opt/Wittyer/Wittyer -i ~{eval_cnv_vcf} \ - -t ~{truth_vcf} \ - -em ~{cnv_evaluation_mode} \ - --configFile ~{cnv_config_file} \ - -o ~{truth_sample_name}_cnv_wittyer_output - - # Run Benchmarking tool wittyer on dragen generated sv.vcf - /opt/Wittyer/Wittyer -i ~{eval_sv_vcf} \ - -t ~{truth_vcf} \ - -em ~{sv_evaluation_mode} \ - --configFile ~{sv_config_file} \ - -o ~{truth_sample_name}_sv_wittyer_output - - >>> - runtime { - docker: wittyer_docker - bootDiskSizeGb: 12 - memory: mem_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: 2 - } - output { - File cnv_wittyer_stats = "~{truth_sample_name}_cnv_wittyer_output/Wittyer.Stats.json" - File cnv_wittyer_annotated_vcf = "~{truth_sample_name}_cnv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz" - File cnv_wittyer_annotated_vcf_index = "~{truth_sample_name}_cnv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz.tbi" - File sv_wittyer_stats = "~{truth_sample_name}_sv_wittyer_output/Wittyer.Stats.json" - File sv_wittyer_annotated_vcf = "~{truth_sample_name}_sv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz" - File sv_wittyer_annotated_vcf_index = "~{truth_sample_name}_sv_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz.tbi" - } -} - # Task3: Format the wittyer json output - task Wittyer4Mat{ - - input{ - String wittyer4mat_docker - File cnv_wittyer_stats - File sv_wittyer_stats - String truth_sample_name - } - command <<< - set -e - - # Run wittyer4mat script on cnv wittyer output - mkdir ~{truth_sample_name}_cnv_wittyer4mat - conda run --no-capture-output \ - -n wittyer-parser \ - python3 /wittyer4mat/wittyer_4mat.py -i 
~{cnv_wittyer_stats} \ - -t cnv \ - -o ~{truth_sample_name}_cnv_wittyer4mat - - # Run wittyer4mat script on sv wittyer output - mkdir ~{truth_sample_name}_sv_wittyer4mat - conda run --no-capture-output \ - -n wittyer-parser \ - python3 /wittyer4mat/wittyer_4mat.py -i ~{sv_wittyer_stats} \ - -t sv \ - -o ~{truth_sample_name}_sv_wittyer4mat - >>> - runtime { - docker: wittyer4mat_docker - preemptible: 2 - } - output { - File cnv_deletion_stat = "~{truth_sample_name}_cnv_wittyer4mat/wittyer_cnv_Deletion_output.csv" - File cnv_duplication_stat = "~{truth_sample_name}_cnv_wittyer4mat/wittyer_cnv_Duplication_output.csv" - File sv_deletion_stat = "~{truth_sample_name}_sv_wittyer4mat/wittyer_sv_Deletion_output.csv" - File sv_duplication_stat = "~{truth_sample_name}_sv_wittyer4mat/wittyer_sv_Duplication_output.csv" - File sv_insertion_stat = "~{truth_sample_name}_sv_wittyer4mat/wittyer_sv_Insertion_output.csv" - } - } \ No newline at end of file diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json new file mode 100644 index 0000000..e5bf04e --- /dev/null +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.inputs.json @@ -0,0 +1,15 @@ +{ + "Benchmark_CNV_Caller.bedfile": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/HighConfidenceRegion_BedFile/HG002_hg38_SVs_Tier1_v0.6.bed", + "Benchmark_CNV_Caller.BenchmarkCNV.mem_size": 4, + "Benchmark_CNV_Caller.wittyer_docker": "us.gcr.io/tag-team-160914/wittyer:v2", + "Benchmark_CNV_Caller.BenchmarkCNV.mem": 4, + "Benchmark_CNV_Caller.wittyer_evaluation_mode": "CrossTypeAndSimpleCounting", + "Benchmark_CNV_Caller.BenchmarkCNV.disk_size": 100, + "Benchmark_CNV_Caller.wittyer4mat_docker": "us.gcr.io/tag-team-160914/wittyer4mat:v11-outstat-broom", + "Benchmark_CNV_Caller.eval_vcf": "gs://dragen_v_3_10_4_cnv_validation/hg38_ml/NA24385_hg38/NA24385.cnv.vcf.gz", + "Benchmark_CNV_Caller.BenchmarkCNV.disk_space": 100, + "Benchmark_CNV_Caller.truth_sample_name": "NA24385", + 
"Benchmark_CNV_Caller.truth_vcf": "gs://fc-cd2e8270-8c64-4f10-bcfb-2f1b5f44aee4/submissions/47e84385-8207-4ad0-a412-b2c113efb470/SelectSampleFromCallSet/17210bc4-e6c8-4ce5-bb23-486fcb18ab89/call-SelectSample/NA24385_filtered.vcf", + "Benchmark_CNV_Caller.query_sample_name": "SM-MVWA8", + "Benchmark_CNV_Caller.wittyer_config": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/wittyerConfigFile/cnv-config.json" +} \ No newline at end of file diff --git a/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl new file mode 100644 index 0000000..fe433ae --- /dev/null +++ b/BenchmarkCNV/BenchmarkCNV/BenchmarkCNV.wdl @@ -0,0 +1,126 @@ +version 1.0 + +workflow Benchmark_CNV_Caller { + input { + String truth_sample_name + String query_sample_name + File truth_vcf + File eval_vcf + File wittyer_config + File? bedfile + String wittyer_evaluation_mode + String wittyer_docker = "yg96/wittyer:v2" + String wittyer4mat_docker + # Run the Wittyer4Mat formatting step; defaults to true so inputs that omit this flag still validate + Boolean run_wittyer4mat = true + } + + # Benchmark eval_vcf against truth_vcf using the witty.er tool + call BenchmarkCNV { + input: + wittyer_docker = wittyer_docker, + truth_vcf = truth_vcf, + truth_sample_name = truth_sample_name, + query_sample_name = query_sample_name, + eval_vcf = eval_vcf, + wittyer_config = wittyer_config, + wittyer_evaluation_mode = wittyer_evaluation_mode, + bedfile = bedfile + } + + # wittyer4mat to parse the wittyer json output + if (run_wittyer4mat) { + call Wittyer4Mat { + input: + wittyer4mat_docker = wittyer4mat_docker, + wittyer_stats = BenchmarkCNV.wittyer_stats, + truth_sample_name = truth_sample_name + } + } + + # Outputs that will be retained when execution is complete + output { + File wittyer_stats = BenchmarkCNV.wittyer_stats + File wittyer_annotated_vcf = BenchmarkCNV.wittyer_annotated_vcf + File wittyer_annotated_vcf_index = BenchmarkCNV.wittyer_annotated_vcf_index + Array[File]? 
Wittyer4Mat_event_stats = Wittyer4Mat.event_level_wittyer_stats + } + meta { + author: "Yueyao Gao" + email: "gaoyueya@broadinstitute.org" + description: "BenchmarkCNV.wdl is designed to evaluate the performance of Dragen CNV (Copy Number Variation) caller against GATK SV (Structural Variation) caller." + } +} + + + # Task 1: Benchmark the large variant vcf against truth set + # To extract a single-sample truth vcf from a large callset vcf, + # see /BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl + task BenchmarkCNV { + + input { + String wittyer_docker + File truth_vcf + File eval_vcf + File wittyer_config + String wittyer_evaluation_mode + File? bedfile + String truth_sample_name + String query_sample_name + Int? mem + Int? disk_space + # If mem and disk size were not specified, use 4GB and 100 GB as default + Int mem_size = select_first([mem, 4]) + Int disk_size = select_first([disk_space,100]) + } + command <<< + set -e + + # Run Benchmarking tool wittyer on dragen generated cnv.vcf and truth set + /opt/Wittyer/Wittyer -i ~{eval_vcf} \ + -t ~{truth_vcf} \ + -em ~{wittyer_evaluation_mode} \ + --configFile ~{wittyer_config} \ + ~{'--includeBed '+ bedfile} \ + -o ~{truth_sample_name}_wittyer_output + + >>> + runtime { + docker: wittyer_docker + bootDiskSizeGb: 12 + memory: mem_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } + output { + File wittyer_stats = "~{truth_sample_name}_wittyer_output/Wittyer.Stats.json" + File wittyer_annotated_vcf = "~{truth_sample_name}_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz" + File wittyer_annotated_vcf_index = "~{truth_sample_name}_wittyer_output/Wittyer.~{truth_sample_name}.Vs.~{query_sample_name}.vcf.gz.tbi" + } +} + # Task2: Format the wittyer json output + task Wittyer4Mat{ + + input{ + String wittyer4mat_docker + File wittyer_stats + String truth_sample_name + } + command <<< + set -e + + # Run wittyer4mat script on wittyer output + conda run --no-capture-output \ + -n 
wittyer-parser \ + python3 /wittyer4mat/wittyer_4mat.py -i ~{wittyer_stats} \ + -t event \ + -o ~{truth_sample_name}_event_level_wittyer4mat + >>> + runtime { + docker: wittyer4mat_docker + preemptible: 2 + } + output { + Array[File] event_level_wittyer_stats = glob("~{truth_sample_name}_event_level_wittyer4mat/*.csv") + } + } \ No newline at end of file diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.inputs.json b/BenchmarkCNV/MergeVCF/MergeVCF.inputs.json new file mode 100644 index 0000000..b0b19e4 --- /dev/null +++ b/BenchmarkCNV/MergeVCF/MergeVCF.inputs.json @@ -0,0 +1,10 @@ +{ + "MergeVCF.mergeVCF.mem_size": "Int? (optional)", + "MergeVCF.mergeVCF.mem": "Int? (optional)", + "MergeVCF.mergeVCF.disk_space": "Int? (optional)", + "MergeVCF.vcf1": "File", + "MergeVCF.sample_name": "String", + "MergeVCF.vcf2": "File", + "MergeVCF.bcftools_docker": "us.gcr.io/broad-dsde-methods/liquidbiopsy:0.0.4.3", + "MergeVCF.mergeVCF.disk_size": "Int? (optional)" +} diff --git a/BenchmarkCNV/MergeVCF/MergeVCF.wdl b/BenchmarkCNV/MergeVCF/MergeVCF.wdl new file mode 100644 index 0000000..25894a2 --- /dev/null +++ b/BenchmarkCNV/MergeVCF/MergeVCF.wdl @@ -0,0 +1,80 @@ +version 1.0 + +workflow MergeVCF { + input { + String bcftools_docker + File vcf1 + File vcf2 + String sample_name + } + + # Merge the two input VCF files (bgzip + index, concat, then sort) + call mergeVCF { + input: + bcftools_docker = bcftools_docker, + vcf1 = vcf1, + vcf2 = vcf2, + sample_name = sample_name + } + + + # Outputs that will be retained when execution is complete + output { + File merged_vcf = mergeVCF.output_vcf + } + meta { + author: "Yueyao Gao" + email: "tag@broadinstitute.org" + description: "MergeVCF.wdl is design to merge two input vcfs" + } +} +task mergeVCF { + + input { + String bcftools_docker + File vcf1 + File vcf2 + String sample_name + Int? mem + Int? 
disk_space + # If mem and disk size were not specified, use 4GB and 100 GB as default + Int mem_size = select_first([mem, 4]) + Int disk_size = select_first([disk_space,100]) + + } + command <<< + set -e + + echo "Input VCF files:" + echo "vcf1: ~{vcf1}" + echo "vcf2: ~{vcf2}" + + # bgzf-compress the input vcf files + bcftools view ~{vcf1} -Oz -o ~{vcf1}.gz + bcftools view ~{vcf2} -Oz -o ~{vcf2}.gz + + echo "Index the input VCFs" + bcftools index -t ~{vcf1}.gz + bcftools index -t ~{vcf2}.gz + + # Use bcftools to concat two vcfs + bcftools concat -Oz -a ~{vcf1}.gz ~{vcf2}.gz -o ~{sample_name}_merged.vcf.gz + + # Sort the merged vcf + bcftools sort -Oz -o ~{sample_name}_sorted_merged.vcf.gz ~{sample_name}_merged.vcf.gz + + # Print the output file name + echo "Merged VCF file: ~{sample_name}_sorted_merged.vcf.gz" + >>> + runtime { + docker: bcftools_docker + bootDiskSizeGb: 12 + memory: mem_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } + output { + File output_vcf = "~{sample_name}_sorted_merged.vcf.gz" + } + +} \ No newline at end of file diff --git a/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json new file mode 100644 index 0000000..a3fd317 --- /dev/null +++ b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.inputs.json @@ -0,0 +1,5 @@ +{ + "SelectSampleFromCallSet.sample_name": "HG00513", + "SelectSampleFromCallSet.variant_callset": "gs://gptag/tag_1455_Dragen_CNV_Caller_Validation/1KGP_3202.gatksv_svtools_novelins.freeze_V3_fixed.wAF.vcf",
+ "SelectSampleFromCallSet.bcftools_docker": "us.gcr.io/broad-dsde-methods/liquidbiopsy:0.0.4.3" +} diff --git a/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl new file mode 100644 index 0000000..b8b4206 --- /dev/null +++ b/BenchmarkCNV/SelectSampleFromCallSet/SelectSampleFromCallSet.wdl @@ -0,0 +1,66 @@ +version 1.0 + +workflow SelectSampleFromCallSet { + input { + String bcftools_docker + File variant_callset + String sample_name + } + + # Select vcf for specific sample + call SelectSampleRemoveComplexSV { + input: + bcftools_docker = bcftools_docker, + vcf = variant_callset, + sample_name = sample_name + } + + + # Outputs that will be retained when execution is complete + output { + File selected_vcf = SelectSampleRemoveComplexSV.output_vcf + } + meta { + author: "Yueyao Gao" + email: "tag@broadinstitute.org" + description: "SelectSampleFromCallSet.wdl is design to extract specific sample from a large callset vcf" + } +} + # Select sample from a vcf and remove complex SVs and INV + task SelectSampleRemoveComplexSV { + + input { + String bcftools_docker + File vcf + String sample_name + Int? mem + Int? 
disk_space + # If mem and disk size were not specified, use 4GB and 100 GB as default + Int mem_size = select_first([mem, 4]) + Int disk_size = select_first([disk_space,100]) + + } + command <<< + set -e + # Select sample using bcftools + bcftools view -s ~{sample_name} -O v -o ~{sample_name}.vcf ~{vcf} + + # Remove Complex SV from the sample vcf because wittyer can't process CPX variants + # Remove INV from the sample vcf because they trigger a wittyer exception + # Remove records where the sample genotype is 0/0 (homozygous reference) + + bcftools view -e 'SVTYPE="INV" | SVTYPE="CPX" | GT="0/0"' ~{sample_name}.vcf -o ~{sample_name}_filtered.vcf + + >>> + runtime { + docker: bcftools_docker + bootDiskSizeGb: 12 + memory: mem_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } + output { + File output_vcf = "~{sample_name}_filtered.vcf" + } + + } \ No newline at end of file