-
Notifications
You must be signed in to change notification settings - Fork 2
/
bclqc.py
153 lines (135 loc) · 5.46 KB
/
bclqc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import sys
from helpers import *
from subprocess import call
HELP_MSG = """BCLQC
Usage: python3 bclqc.py [options] fastqs_dir bams_dir run_dir
fastqs_dir is the parent directory where FASTQs will be output
bams_dir is the parent directory where BAMs will be output
run_dir is the directory containing the run data to be processed
For example:
`python3 bclqc.py [options] /fastqs /bams /runs/210930`
In this case /fastqs/210930 and /bams/210930 will be created if they don't exist,
and FASTQs and BAMs will be written to those directories respectively.
Options:
-P [pass1,pass2,...] Run only the specified passes
-h, --help Print this help message
-d, --dry-run Dry run: print commands without executing them"""
DEFAULT_BED_FILE = "/mnt/pns/tracks/ucla_mdl_cancer_ngs_v1_exon_targets.hg38.bed"
DEFAULT_HUMAN_REF = "/staging/human/reference/hg38_alt_masked_graph_v2"
MAIN_PASSES = [
"demux",
"align",
"multiqc",
]
def demux(run_dir, samplesheet, fastq_output,
exec_cmd=call):
"""
Perform demultiplexing on a NovaSeq run using Dragen BCLConvert
Inputs:
run_dir: path to the folder containing the NovaSeq run data
samplesheet: path to the SampleSheet.csv to be used for this demultiplexing
fastq_output: path to the folder where the demultiplexed FASTQs will be written
"""
exec_cmd(["dragen",
"--bcl-conversion-only", "true",
"--bcl-use-hw", "false",
"--bcl-only-matched-reads", "true",
"--bcl-input-directory", run_dir,
"--sample-sheet", samplesheet,
"--output-directory", fastq_output])
def align(fastq_list, bam_output, bed_file, sample_id, exec_cmd=call):
exec_cmd(["mkdir", "-p", bam_output])
exec_cmd(["dragen",
"--enable-map-align", "true",
"--enable-map-align-output", "true",
"--output-format", "BAM",
"--enable-duplicate-marking", "true",
"--generate-sa-tags", "true",
"--enable-sort", "true",
"--soft-read-trimmers", "polyg,quality",
"--trim-min-quality", "2",
"--ref-dir", DEFAULT_HUMAN_REF,
"--intermediate-results-dir", "/staging/tmp",
"--qc-coverage-tag-1", "target_bed",
"--qc-coverage-region-1", bed_file,
"--qc-coverage-reports-1", "cov_report",
"--qc-coverage-ignore-overlaps", "true",
"--enable-variant-caller", "true",
"--vc-combine-phased-variants-distance", "6",
"--vc-emit-ref-confidence", "GVCF",
"--enable-hla", "true",
"--fastq-list", fastq_list,
"--fastq-list-sample-id", sample_id,
"--output-directory", bam_output,
"--output-file-prefix", sample_id,
"--vc-output-evidence-bam", "true",
"--vc-evidence-bam-output-haplotypes", "true",
])
def multiqc_cmd(fastqs_dir, bams_dir, exec_cmd=call):
exec_cmd(["rm", "-f", f"{bams_dir}/*/*.wgs_*.csv"])
exec_cmd(["multiqc",
"--force",
"--config", "config/multiqc_config.yaml",
"--outdir", bams_dir,
# dirs to scan
bams_dir,
fastqs_dir,
])
def demux_pass(run_info):
run_dir = run_info.run_dir
fastqs_dir = run_info.fastqs_dir
exec_cmd = run_info.exec_cmd
exec_cmd(["mkdir", "-p", fastqs_dir])
for idx in run_info.indices:
samplesheet = f"{run_dir}/SampleSheet_{idx}.csv"
fastq_output = f"{fastqs_dir}/{idx}"
demux(run_dir, samplesheet, fastq_output, exec_cmd)
def align_pass(run_info):
fastqs_dir = run_info.fastqs_dir
bams_dir = run_info.bams_dir
for idx in run_info.indices:
fastq_list = f"{fastqs_dir}/{idx}/Reports/fastq_list.csv"
if not os.path.exists(fastq_list):
raise Exception(f"Alignment Error: {fastq_list} does not exist")
for sample_id in get_sample_ids(fastq_list):
bam_output = f"{bams_dir}/{sample_id}"
align(fastq_list, bam_output, run_info.bed_file, sample_id, run_info.exec_cmd)
def multiqc_pass(run_info):
save_occ_pf_plot(run_info.run_dir, run_info.bams_dir, run_info.exec_cmd)
multiqc_cmd(run_info.fastqs_dir, run_info.bams_dir, run_info.exec_cmd)
def get_pass_f(pass_name):
pass_f_name = pass_name + "_pass"
# look into global namespace for pass function
for name, f in globals().items():
if name == pass_f_name:
if callable(f) and f.__module__ == __name__:
return f
return None
def execute_pass(pass_name, run_info):
pass_function = get_pass_f(pass_name)
if pass_function: # if a custom pass function is defined, call it first
print(f"Running {pass_name}...")
pass_function(run_info)
else:
print(f"Pass not found: {pass_name}\n"
f"Define a function called {pass_name}_pass in bcl-qc.py")
class RunInfo:
def __init__(self):
[fastqs_dir, bams_dir, run_dir] = get_arg_dirs()
self.run_dir = run_dir
self.bams_dir = bams_dir
self.fastqs_dir = fastqs_dir
# TODO allow BED paths to be specified per sample in samplesheet
self.bed_file = DEFAULT_BED_FILE
self.indices = get_indices(run_dir)
self.passes = get_passes()
self.exec_cmd = get_exec_cmd()
def bclqc_run():
if "-h" in sys.argv or "--help" in sys.argv:
print(HELP_MSG)
return
run_info = RunInfo()
for pass_name in run_info.passes:
execute_pass(pass_name, run_info)
if __name__ == "__main__":
bclqc_run()