-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_vcf_samples.wdl
122 lines (103 loc) · 3.45 KB
/
check_vcf_samples.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
version 1.0
# Workflow: check_vcf_samples
#
# Runs two tasks in sequence:
#   1. vcf_samples         - writes the sample names found in vcf_file to a file
#   2. compare_sample_sets - compares that file against the samples looked up
#                            from dataset_id / dataset_type in the given workspace
#
# Inputs:
#   vcf_file            - VCF whose sample names are checked
#   dataset_id          - dataset identifier forwarded to compare_sample_sets
#   dataset_type        - dataset type forwarded to compare_sample_sets
#   workspace_name      - workspace name forwarded to compare_sample_sets
#   workspace_namespace - workspace namespace forwarded to compare_sample_sets
#
# Output:
#   vcf_sample_check - the check_status string produced by compare_sample_sets
workflow check_vcf_samples {
    input {
        File vcf_file
        String dataset_id
        String dataset_type
        String workspace_name
        String workspace_namespace
    }

    # Disk request for the VCF task: 1.5x the VCF size in GB (rounded up) plus 5 GB.
    Int disk_gb = ceil(size(vcf_file, "GB")*1.5) + 5

    call vcf_samples {
        input:
            disk_gb = disk_gb,
            vcf_file = vcf_file
    }

    call compare_sample_sets {
        input:
            workspace_namespace = workspace_namespace,
            workspace_name = workspace_name,
            dataset_type = dataset_type,
            dataset_id = dataset_id,
            sample_file = vcf_samples.sample_file
    }

    output {
        String vcf_sample_check = compare_sample_sets.check_status
    }

    meta {
        author: "Stephanie Gogarten"
        # NOTE(review): this value looks like an email-obfuscation placeholder
        # rather than a real address - confirm and restore the contact address.
        email: "[email protected]"
    }
}
# Task: vcf_samples
#
# Runs `bcftools query --list-samples` on a VCF and captures the output.
#
# Inputs:
#   vcf_file - VCF passed to bcftools
#   disk_gb  - size in GB of the local SSD requested in the runtime block
#              (defaults to 10)
#
# Output:
#   sample_file - vcf_samples.txt, the stdout of the bcftools command
task vcf_samples {
    input {
        File vcf_file
        Int disk_gb = 10
    }

    # Heredoc command with ~{} placeholders, matching the other tasks in this
    # file. The path is quoted so that spaces or glob characters in the
    # localized file name are not split or expanded by the shell.
    command <<<
        bcftools query --list-samples "~{vcf_file}" > vcf_samples.txt
    >>>

    output {
        File sample_file = "vcf_samples.txt"
    }

    runtime {
        docker: "xbrianh/xsamtools:v0.5.2"
        disks: "local-disk ~{disk_gb} SSD"
    }
}
# Task: compare_sample_sets
#
# Compares the sample names in sample_file against the sample set that a
# workspace data table associates with a dataset.
#
# Steps performed by the embedded R script:
#   1. Require dataset_type to be one of 'array', 'imputation', 'sequencing'.
#   2. Read the '<dataset_type>_dataset' table with AnVIL::avtable and take the
#      sample_set_id of the row whose '<dataset_type>_dataset_id' equals
#      dataset_id. Exactly one match is required.
#   3. Read the 'sample_set' table and take the entityName values from the
#      'samples.items' entry of the row with that sample_set_id. Exactly one
#      matching row is required.
#   4. Write those names to workspace_samples.txt.
#   5. Write 'PASS' to status.txt when the two name sets are equal (setequal,
#      so order and duplicates are ignored), otherwise write 'FAIL' and then
#      call stop(), which makes Rscript exit with an error.
#
# Inputs:
#   sample_file         - text file with one sample name per line
#   dataset_id          - id looked up in the '<dataset_type>_dataset' table
#   dataset_type        - 'array', 'imputation' or 'sequencing'
#   workspace_name      - passed to AnVIL::avtable as `name`
#   workspace_namespace - passed to AnVIL::avtable as `namespace`
#
# Outputs:
#   check_status      - contents of status.txt
#   workspace_samples - workspace_samples.txt
#
# NOTE(review): the String inputs are pasted into single-quoted R literals, so
# a value containing a single quote would break (or alter) the script. Confirm
# that callers only supply plain identifiers.
#
# The R code lives inside a double-quoted shell string joined by backslash
# continuations; do not add R comments, double quotes, dollar signs or
# backticks inside it.
task compare_sample_sets {
    input {
        File sample_file
        String dataset_id
        String dataset_type
        String workspace_name
        String workspace_namespace
    }

    command <<<
        Rscript -e "\
        dataset_id <- '~{dataset_id}'; \
        dataset_type <- '~{dataset_type}'; \
        workspace_name <- '~{workspace_name}'; \
        workspace_namespace <- '~{workspace_namespace}'; \
        stopifnot(dataset_type %in% c('array', 'imputation', 'sequencing')); \
        dataset_table <- AnVIL::avtable(paste0(dataset_type, '_dataset'), name=workspace_name, namespace=workspace_namespace); \
        sample_set_id <- dataset_table[['sample_set_id']][dataset_table[[paste0(dataset_type, '_dataset_id')]] %in% dataset_id]; \
        stopifnot('dataset_id must match exactly one row of the dataset table' = length(sample_set_id) == 1); \
        sample_set <- AnVIL::avtable('sample_set', name=workspace_name, namespace=workspace_namespace); \
        set_rows <- which(sample_set[['sample_set_id']] %in% sample_set_id); \
        stopifnot('sample_set_id must match exactly one row of the sample_set table' = length(set_rows) == 1); \
        samples <- sample_set[['samples.items']][[set_rows]][['entityName']]; \
        writeLines(samples, 'workspace_samples.txt'); \
        vcf_samples <- readLines('~{sample_file}'); \
        if (setequal(samples, vcf_samples)) status <- 'PASS' else status <- 'FAIL'; \
        cat(status, file='status.txt'); \
        if (status == 'FAIL') stop('Samples do not match; compare vcf_samples.txt and workspace_samples.txt') \
        "
    >>>

    output {
        String check_status = read_string("status.txt")
        File workspace_samples = "workspace_samples.txt"
    }

    runtime {
        docker: "us.gcr.io/broad-dsp-gcr-public/anvil-rstudio-bioconductor:3.17.0"
    }
}
# Task: summarize_vcf_check
#
# Tabulates per-file check results.
#
# Inputs:
#   file      - array of file identifiers, one per checked file
#   vcf_check - array of check result strings, paired by position with `file`
#
# Outputs:
#   details - details.txt, a TSV with columns file_path and vcf_check
#   summary - contents of summary.txt: for each distinct vcf_check value,
#             '<count> <value>', joined with ', '
#
# The R code lives inside a double-quoted shell string joined by backslash
# continuations; do not add R comments, double quotes, dollar signs or
# backticks inside it.
task summarize_vcf_check {
    input {
        Array[String] file
        Array[String] vcf_check
    }

    command <<<
        Rscript -e "\
        library(dplyr); \
        paths <- readLines('~{write_lines(file)}'); \
        statuses <- readLines('~{write_lines(vcf_check)}'); \
        results <- tibble(file_path=paths, vcf_check=statuses); \
        readr::write_tsv(results, 'details.txt'); \
        tally <- count(results, vcf_check); \
        tally <- mutate(tally, x=paste(n, vcf_check)); \
        writeLines(paste(tally[['x']], collapse=', '), 'summary.txt'); \
        "
    >>>

    output {
        String summary = read_string("summary.txt")
        File details = "details.txt"
    }

    runtime {
        docker: "us.gcr.io/broad-dsp-gcr-public/anvil-rstudio-bioconductor:3.17.0"
    }
}