'''
This tool is designed to work only with single-cell RNA-seq fastq reads.
'''
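# Example invocation of the all-in-one "count" command (file names and paths
# below are placeholders; run `python PathogenTrack.py count --help` for the
# full option list):
#
#   python PathogenTrack.py count \
#       --project_id sample1 \
#       --star_index /path/to/STAR_index \
#       --kraken_db /path/to/kraken2_db \
#       --barcode barcodes.tsv \
#       --read1 sample1_R1.fastq.gz \
#       --read2 sample1_R2.fastq.gz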
import sys
import re
import os
import gzip
import os.path
from Bio import SeqIO
from Bio.Seq import Seq
import argparse
import subprocess
VERSION = '0.2.3'
USAGE = '''%(prog)s [options]'''
def check_status(cmd):
    ''' run a shell command and return (exit status, output) '''
    status = 0
    try:
        output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        status = e.returncode
        output = e.output
    output = output.decode()
    if output.endswith('\n'):  # strip a single trailing newline; safe for empty output
        output = output[:-1]
    return (status, output)
def check_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)
    return(0)
def check_deps():
    status, _ = check_status('fastp --help')
    if (status != 0):
        sys.stderr.write('fastp is required for PathogenTrack!\n')
    status, _ = check_status('STAR --help')
    if (status != 0):
        sys.stderr.write('STAR is required for PathogenTrack!\n')
    status, _ = check_status('umi_tools --help')
    if (status != 0):
        sys.stderr.write('umi_tools is required for PathogenTrack!\n')
    status, _ = check_status('kraken2 --help')
    if (status != 0):
        sys.stderr.write('kraken2 is required for PathogenTrack!\n')
def trim(project_id, barcode, output): ## trim the -1 suffix at the end of each barcode
    check_deps()
    check_dir(project_id)
    output = project_id + '/' + output
    fout = open(output, "w")
    with open(barcode, 'r') as fin:
        for line in fin:
            cb = line.strip().split("-")[0]
            fout.write(cb + '\n')
    fin.close()
    fout.close()
def extract(project_id, read1, read2, pattern, barcode, output, ignore_suffix): ## add cell barcode
    check_deps()
    check_dir(project_id)
    pattern = str(pattern)
    barcode = project_id + '/' + barcode
    output = project_id + '/' + output
    cmd = 'umi_tools extract --bc-pattern ' + str(pattern)
    cmd += ' --stdin ' + read1 + ' --stdout /dev/null' + ' --read2-in ' + read2
    cmd += ' --read2-out ' + output + ' --filter-cell-barcode' + ' --whitelist ' + barcode
    if (ignore_suffix):
        cmd += ' --ignore-read-pair-suffixes'
    status, stdout = check_status(cmd)
    print("status is %s and output is %s" % (status, stdout))
    return(0)
def filter(project_id, input_reads, thread, output, trim_poly_x, filter_low_complexity, complexity_threshold): ## filter low-quality / low-complexity reads
    check_deps()
    check_dir(project_id)
    input_reads = project_id + '/' + input_reads
    output = project_id + '/' + output
    cmd = 'fastp --thread ' + str(thread) + ' -i ' + input_reads + ' -o ' + output
    if (trim_poly_x):
        cmd += ' --trim_poly_x'
    if (filter_low_complexity):
        cmd += ' --low_complexity_filter'
    if (complexity_threshold):
        cmd += ' --complexity_threshold ' + str(complexity_threshold)
    cmd += ' && rm fastp.json && rm fastp.html'
    status, stdout = check_status(cmd)
    print("status is %s and output is %s" % (status, stdout))
    return(0)
def align(project_id, star_index, input_reads, thread, output): ## run STAR alignment
    check_deps()
    check_dir(project_id)
    input_reads = project_id + '/' + input_reads
    output = project_id + '/' + output
    cmd = 'STAR --genomeDir ' + star_index + ' --readFilesIn ' + input_reads + ' --readFilesCommand gunzip -c '
    cmd += '--runThreadN ' + str(thread) + ' --outFilterMismatchNmax 6 ' + ' --outSAMtype None ' + ' --outFilterMultimapNmax 20 '
    cmd += '--outFilterIntronMotifs RemoveNoncanonical ' + ' --quantMode - ' + ' --outFileNamePrefix ' + project_id + '/ --outReadsUnmapped Fastx'
    cmd += ' && mv ' + project_id + '/Unmapped.out.mate1 ' + output + ' && rm ' + project_id + '/Log.final.out && rm ' + project_id + '/Log.out && rm ' + project_id + '/Log.progress.out && rm ' + project_id + '/SJ.out.tab'
    status, stdout = check_status(cmd)
    print("status is %s and output is %s" % (status, stdout))
    return(0)
def classify(project_id, kraken_db, input_reads, thread, confidence, out_reads, out_table, out_report): ## run kraken2
    check_deps()
    check_dir(project_id)
    input_reads = project_id + '/' + input_reads
    out_reads = project_id + '/' + out_reads
    out_table = project_id + '/' + out_table
    out_report = project_id + '/' + out_report
    cmd = 'kraken2 --db ' + kraken_db + ' --confidence ' + str(confidence) + ' --threads ' + str(thread) + ' --use-names --classified-out ' + out_reads + ' --report ' + out_report + ' '
    cmd += input_reads + ' | grep -w ^C > ' + out_table
    status, stdout = check_status(cmd)
    print("status is %s and output is %s" % (status, stdout))
    return(0)
def check_rank(rank):
    ranks = ['U', 'R', 'D', 'P', 'C', 'O', 'F', 'G', 'S']
    level = 0
    if rank in ranks:
        level = ranks.index(rank)  # main rank code, e.g. 'F', 'G', 'S'
    elif rank[0] in ranks:  # sub-rank code, e.g. 'F1', 'G1', 'S1'
        level = ranks.index(rank[0]) + 0.5
    return(level)
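# For reference (values derived from the ranks list above): check_rank maps a
# kraken2 rank code to a numeric level, e.g. check_rank('G') -> 7,
# check_rank('S') -> 8, and sub-ranks such as 'S1' -> 8.5, so the report parser
# in quant() can detect when it has moved back up the taxonomy.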
def quant(project_id, barcode, input_reads, input_table, input_report, min_reads, output_reads, output_matrix): ## deduplicate and quantify microbes at the single-cell level
    check_deps()
    check_dir(project_id)
    min_reads = int(min_reads)
    barcode = project_id + '/' + barcode
    input_reads = project_id + '/' + input_reads
    input_table = project_id + '/' + input_table
    input_report = project_id + '/' + input_report
    output_reads = project_id + '/' + output_reads
    output_matrix = project_id + '/' + output_matrix
    cells = set()
    with open(barcode) as fcb:
        for line in fcb:
            cb = line.strip().split("-")[0]
            cells.add(cb)
    fcb.close()
    ranks = ['U', 'R', 'D', 'P', 'C', 'O', 'F', 'G', 'S']
    Taxons = dict()
    level_taxid = '1' # initialize
    level_taxname = 'root'
    last_rank = 'R'
    current_rank = 'R'
    selected = False
    selected_taxid = ''
    selected_taxname = ''
    tmp_dict = dict() # store rows containing species/strain
    # Parse the kraken2 report (columns: percentage, clade reads, direct reads,
    # rank code, taxid, name). For each organism, species/strain rows are collected
    # in tmp_dict, and every taxid supported by at least min_reads direct reads is
    # mapped in Taxons to the representative (highest direct-count) taxon.
    with open(input_report) as freport:
        for line in freport:
            total_count = int(line.strip().split("\t")[1])
            rank_count = int(line.strip().split("\t")[2])
            current_rank = line.strip().split("\t")[3]
            taxid = line.strip().split("\t")[4]
            taxname = line.strip().split("\t")[5].strip()
            if current_rank == 'U': # skip the unclassified line at the top of the report
                continue
            if check_rank(current_rank) < check_rank(last_rank) or check_rank(current_rank) == check_rank(last_rank) == 8: # changed organism
                last_rank = current_rank
                if selected == True:
                    for row in dict(sorted(tmp_dict.items(), key=lambda item: item[1], reverse=True)).keys():
                        if not (selected_taxid and selected_taxname):
                            # the first (highest direct-count) row becomes the representative taxon
                            selected_taxid = row.strip().split("\t")[4]
                            selected_taxname = row.strip().split("\t")[5].strip()
                        selected_reads = int(row.strip().split("\t")[2])
                        tmp_taxid = row.strip().split("\t")[4]
                        tmp_taxname = row.strip().split("\t")[5].strip()
                        if selected_reads >= min_reads:
                            Taxons[tmp_taxid] = selected_taxid + '|' + selected_taxname
                selected = False
                selected_taxid = ''
                selected_taxname = ''
                tmp_dict = dict()
                if check_rank(current_rank) >= 8:
                    if total_count == rank_count:
                        selected = True
                    tmp_dict[line] = rank_count
            else:
                last_rank = current_rank
                if check_rank(current_rank) >= 8:
                    if total_count == rank_count:
                        selected = True
                    tmp_dict[line] = rank_count
        # flush the last organism in the report
        if selected == True:
            for row in dict(sorted(tmp_dict.items(), key=lambda item: item[1], reverse=True)).keys():
                if not (selected_taxid and selected_taxname):
                    selected_taxid = row.strip().split("\t")[4]
                    selected_taxname = row.strip().split("\t")[5].strip()
                selected_reads = int(row.strip().split("\t")[2])
                tmp_taxid = row.strip().split("\t")[4]
                tmp_taxname = row.strip().split("\t")[5].strip()
                if selected_reads >= min_reads:
                    Taxons[tmp_taxid] = selected_taxid + '|' + selected_taxname
    freport.close()
    Dict = {}
    Pool = {}
    fout = open(output_reads, "w")
    with open(input_reads) as ffastq, open(input_table) as fkraken:
        for record, row in zip(SeqIO.parse(ffastq, "fastq"), fkraken):
            # cell barcode and UMI were appended to the read name by umi_tools extract
            cb, umi = record.id.split(" ")[0].split("_")[-2:]
            if cb not in cells:
                print("CB is: " + cb + " and UMI is: " + umi)
                continue
            taxonomy = row.split("\t")[2]
            taxname, taxid = taxonomy.split(" (taxid ")
            taxid = taxid.split(")")[0]
            if taxid == "9606" or taxid == "10090" or taxid == "28384": # skip human, mouse and NCBI 'other sequences'
                continue
            if taxid not in Taxons.keys():
                continue
            taxname = Taxons[taxid].split('|')[1]
            fout.write('>' + record.id + ' ' + taxname + '\n' + str(record.seq) + '\n')
            ids = cb + '_' + umi
            if ids not in Pool:
                Pool[ids] = {}
            if taxname not in Pool[ids]: # Pool is keyed by taxon name, not taxid
                Pool[ids][taxname] = 1
            else:
                Pool[ids][taxname] = Pool[ids][taxname] + 1
            if taxname in Dict:
                Dict[taxname].add(ids)
            else:
                Dict[taxname] = set()
                Dict[taxname].add(ids)
    fout.close()
    # find the optimal organism for each barcode + UMI (the one supported by the most reads)
    organisms = {}
    for ids in Pool:
        organisms[ids] = sorted(Pool[ids].items(), key=lambda item: item[1], reverse=True)[0][0]
    # store organisms in a set
    organisms_set = {organisms[k] for k in organisms}
    genes = set()
    gene_counts = {}
    # 'gene' here is the taxon name; counts are deduplicated by barcode + UMI
    for gene in Dict:
        if gene not in organisms_set:
            continue
        for ids in Dict[gene]:
            cell = ids.split("_")[0]
            count = 1
            genes.add(gene)
            if gene not in gene_counts:
                gene_counts[gene] = {}
            if cell not in gene_counts[gene]:
                gene_counts[gene][cell] = 1
            else:
                gene_counts[gene][cell] = gene_counts[gene][cell] + 1
    with open(output_matrix, "w") as fmatrix:
        fmatrix.write("gene" + '\t' + '\t'.join(sorted(cells)) + '\n')
        for gene in sorted(genes):
            counts = []
            for cell in sorted(cells):
                if cell in gene_counts[gene]:
                    counts.append(gene_counts[gene][cell])
                else:
                    counts.append(0)
            if sorted(counts, reverse=True)[0] >= 2: # remove species/strains with maximum UMIs < 2
                positive_counts = str(sum(i > 0 for i in counts))
                fmatrix.write(gene + '\t' + '\t'.join(map(str, counts)) + '\n')
                print(gene + '\t' + positive_counts)
    fmatrix.close()
    return
def count(project_id, star_index, kraken_db, barcode, read1, read2, pattern, thread, confidence, min_reads, ignore_suffix, trim_poly_x, filter_low_complexity, complexity_threshold):
    check_deps()
    check_dir(project_id)
    trim(project_id, barcode, output='barcodes.tsv')
    extract(project_id, read1, read2, pattern, barcode='barcodes.tsv', output='reads_barcoded.fq.gz', ignore_suffix=ignore_suffix)
    filter(project_id, input_reads='reads_barcoded.fq.gz', thread=thread, output='reads_barcoded_clean.fq.gz', trim_poly_x=trim_poly_x, filter_low_complexity=filter_low_complexity, complexity_threshold=complexity_threshold)
    align(project_id, star_index, input_reads='reads_barcoded_clean.fq.gz', thread=thread, output='unmapped_reads.fq')
    classify(project_id, kraken_db, input_reads='unmapped_reads.fq', thread=thread, confidence=confidence, out_reads='classified_reads.fq', out_table='classified_table.txt', out_report='classified_report.txt')
    quant(project_id, barcode='barcodes.tsv', input_reads='classified_reads.fq', input_table='classified_table.txt', input_report='classified_report.txt', min_reads=min_reads, output_reads='microbes.fa', output_matrix='microbes.tsv')
    return(0)
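# The "count" command above chains the individual steps; all intermediate and
# final files are written under <project_id>/:
#   trim     -> barcodes.tsv
#   extract  -> reads_barcoded.fq.gz
#   filter   -> reads_barcoded_clean.fq.gz
#   align    -> unmapped_reads.fq
#   classify -> classified_reads.fq, classified_table.txt, classified_report.txt
#   quant    -> microbes.fa, microbes.tsv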
def get_args():
    class Parser(argparse.ArgumentParser):
        def print_message(self, message, color=None):
            if message:
                sys.stderr.write('\033[91m' + message + '\033[0m\n')
        def error(self, message):
            #sys.stderr.write('\nerror: %s\n\n' % message)
            #sys.stderr.write('\033[31m', '\nerror: %s\n\n' % message)
            self.print_message('\nerror: %s\n' % message)
            self.print_help(sys.stderr)
            sys.exit(2)
    #parser = argparse.ArgumentParser(usage=USAGE)
    parser = Parser()
    # Global options
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    subparsers = parser.add_subparsers(dest='command')
    # Command: count
    parser_count = subparsers.add_parser('count',
                        help='one command to count microbes at the single-cell level')
    parser_count.add_argument('--project_id', default='./', action='store',
                        help='project id to store output files under', required=True)
    parser_count.add_argument('--star_index', action='store', default='',
                        help='STAR index of the reference genome', required=True)
    parser_count.add_argument('--kraken_db', action='store', default='',
                        help='kraken2 database name', required=True)
    parser_count.add_argument('--barcode', action='store', default='barcodes.tsv',
                        help='file name of the raw barcodes from cellranger; decompress it before use', required=True)
    parser_count.add_argument('--read1', default='',
                        help='file name for Read1', action='store', required=True)
    parser_count.add_argument('--read2', default='',
                        help='file name for Read2', action='store', required=True)
    parser_count.add_argument('--pattern', action='store', default='CCCCCCCCCCCCCCCCNNNNNNNNNN',
                        help='barcode pattern, default="CCCCCCCCCCCCCCCCNNNNNNNNNN"; please refer to the umi_tools manual', required=False)
    parser_count.add_argument('--thread', action='store', default=8, type=int,
                        help='number of threads to use, default is 8', required=False)
    parser_count.add_argument('--confidence', action='store', default=0.11, type=float,
                        help='confidence threshold to use [0-1], default is 0.11', required=False)
    parser_count.add_argument('--min_reads', action='store', default=10,
                        help='minimum number of reads supporting a taxonomy, default is 10', required=False)
    parser_count.add_argument('--trim_poly_x', action='store_true',
                        help='enable polyA/T/C/G/N trimming at 3\' ends', required=False)
    parser_count.add_argument('--filter_low_complexity', action='store_true',
                        help='filter out low-complexity reads, default is False', required=False)
    parser_count.add_argument('--complexity_threshold', action='store', default=30,
                        help='threshold for the low-complexity filter [0-100], default is 30', required=False)
    parser_count.add_argument('--ignore_suffix', action='store_true',
                        help='ignore read name suffixes, e.g. /1 and /2; default is False', required=False)
    # Command: trim
    parser_trim = subparsers.add_parser('trim',
                        help='trim the -1 suffix at the end of each barcode')
    parser_trim.add_argument('--project_id', default='./', action='store',
                        help='project id to store output files under', required=True)
    parser_trim.add_argument('--barcode', default='', action='store',
                        help='file name of the raw barcodes from cellranger; decompress it before use', required=True)
    parser_trim.add_argument('--output', default='barcodes.tsv', action='store',
                        help='file to output barcodes to, default is barcodes.tsv', required=False)
    # Command: extract
    parser_extract = subparsers.add_parser('extract',
                        help='extract the cell barcode and UMI and append them to the Read2 name')
    parser_extract.add_argument('--project_id', default='./', action='store',
                        help='project id to store output files under', required=True)
    parser_extract.add_argument('--read1', default='',
                        help='file name for Read1', action='store', required=True)
    parser_extract.add_argument('--read2', default='',
                        help='file name for Read2', action='store', required=True)
    parser_extract.add_argument('--pattern', action='store', default='CCCCCCCCCCCCCCCCNNNNNNNNNN',
                        help='barcode pattern, default="CCCCCCCCCCCCCCCCNNNNNNNNNN"; please refer to the umi_tools manual', required=False)
    parser_extract.add_argument('--barcode', default='barcodes.tsv',
                        help='file name for valid barcodes', action='store', required=False)
    parser_extract.add_argument('--output', action='store', default='reads_barcoded.fq.gz',
                        help='file to output processed reads to', required=False)
    parser_extract.add_argument('--ignore_suffix', action='store_true',
                        help='ignore read name suffixes, e.g. /1 and /2; default is False', required=False)
    # Command: filter
    parser_filter = subparsers.add_parser('filter',
                        help='filter out low-quality / low-complexity reads')
    parser_filter.add_argument('--project_id', default='./', action='store',
                        help='project id to store output files under', required=True)
    parser_filter.add_argument('--input_reads', action='store', default='reads_barcoded.fq.gz',
                        help='input single-end fastq file produced by the "extract" command', required=False)
    parser_filter.add_argument('--thread', action='store', default=8, type=int,
                        help='number of threads to use, default is 8', required=False)
    parser_filter.add_argument('--output', action='store', default='reads_barcoded_clean.fq.gz',
                        help='file to output processed reads to, default is reads_barcoded_clean.fq.gz', required=False)
    parser_filter.add_argument('--trim_poly_x', action='store_true',
                        help='enable polyA/T/C/G/N trimming at 3\' ends', required=False)
    parser_filter.add_argument('--filter_low_complexity', action='store_true',
                        help='filter out low-complexity reads, default is False', required=False)
    parser_filter.add_argument('--complexity_threshold', action='store', default=30,
                        help='threshold for the low-complexity filter [0-100], default is 30', required=False)
    # Command: align
    parser_align = subparsers.add_parser('align',
                        help='align clean reads to the reference genome')
    parser_align.add_argument('--project_id', default='./', action='store',
                        help='project id to store output files under', required=True)
    parser_align.add_argument('--star_index', action='store', default='',
                        help='STAR index of the reference genome', required=True)
    parser_align.add_argument('--input_reads', action='store', default='reads_barcoded_clean.fq.gz',
                        help='input single-end fastq file', required=False)
    parser_align.add_argument('--thread', action='store', default=8, type=int,
                        help='number of threads to use, default is 8', required=False)
    parser_align.add_argument('--output', action='store', default='unmapped_reads.fq',
                        help='file to output unmapped reads to', required=False)
    # Command: classify
    parser_classify = subparsers.add_parser('classify',
                        help='classify unmapped reads into taxa')
    parser_classify.add_argument('--project_id', default='./', action='store',
                        help='project id to store output files under', required=True)
    parser_classify.add_argument('--kraken_db', action='store', default='',
                        help='kraken2 database name', required=True)
    parser_classify.add_argument('--input_reads', action='store', default='unmapped_reads.fq',
                        help='input single-end fastq file produced by the "align" command', required=False)
    parser_classify.add_argument('--thread', action='store', default=8, type=int,
                        help='number of threads to use, default is 8', required=False)
    parser_classify.add_argument('--confidence', action='store', default=0.11, type=float,
                        help='confidence threshold to use [0-1], default is 0.11', required=False)
    parser_classify.add_argument('--out_reads', action='store', default='classified_reads.fq',
                        help='file to output classified reads to', required=False)
    parser_classify.add_argument('--out_table', action='store', default='classified_table.txt',
                        help='file to output kraken2 per-read results to', required=False)
    parser_classify.add_argument('--out_report', action='store', default='classified_report.txt',
                        help='file to output the kraken2 report to', required=False)
    # Command: quant
    parser_quant = subparsers.add_parser('quant',
                        help='deduplicate and quantify microbes at the single-cell level')
    parser_quant.add_argument('--project_id', default='./', action='store',
                        help='project id to store output files under', required=True)
    parser_quant.add_argument('--barcode', action='store', default='barcodes.tsv',
                        help='file name for valid barcodes', required=False)
    parser_quant.add_argument('--input_reads', action='store', default='classified_reads.fq',
                        help='input fastq file produced by the "classify" command', required=False)
    parser_quant.add_argument('--input_table', action='store', default='classified_table.txt',
                        help='input table file produced by the "classify" command', required=False)
    parser_quant.add_argument('--input_report', action='store', default='classified_report.txt',
                        help='input report file produced by the "classify" command', required=False)
    parser_quant.add_argument('--min_reads', action='store', default=10,
                        help='minimum number of reads supporting a taxonomy, default is 10', required=False)
    parser_quant.add_argument('--output_reads', action='store', default='microbes.fa',
                        help='file to output classified reads to, default is microbes.fa', required=False)
    parser_quant.add_argument('--output_matrix', action='store', default='microbes.tsv',
                        help='file to output the microbe quantification matrix to, default is microbes.tsv', required=False)
    return parser
def main():
    parser = get_args()
    args = parser.parse_args()
    if not args.command:
        parser.parse_args(["--help"])
        sys.exit(0)
    if args.command == "count":
        count(args.project_id, args.star_index, args.kraken_db, args.barcode, args.read1, args.read2, args.pattern, args.thread, args.confidence, args.min_reads, args.ignore_suffix, args.trim_poly_x, args.filter_low_complexity, args.complexity_threshold)
    elif args.command == "trim":
        trim(args.project_id, args.barcode, args.output)
    elif args.command == "extract":
        extract(args.project_id, args.read1, args.read2, args.pattern, args.barcode, args.output, args.ignore_suffix)
    elif args.command == "filter":
        filter(args.project_id, args.input_reads, args.thread, args.output, args.trim_poly_x, args.filter_low_complexity, args.complexity_threshold)
    elif args.command == "align":
        align(args.project_id, args.star_index, args.input_reads, args.thread, args.output)
    elif args.command == "classify":
        classify(args.project_id, args.kraken_db, args.input_reads, args.thread, args.confidence, args.out_reads, args.out_table, args.out_report)
    elif args.command == "quant":
        quant(args.project_id, args.barcode, args.input_reads, args.input_table, args.input_report, args.min_reads, args.output_reads, args.output_matrix)
    return

if __name__ == '__main__':
    main()