forked from WrightonLabCSU/columbia_river
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2.2_map_reads_fasta_abundance.py
467 lines (347 loc) · 12.9 KB
/
2.2_map_reads_fasta_abundance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
#!/usr/bin/python
##This is in-house script that was used for read mapping of vMAGs and MAGs using the parameters mentioned in steps 2.0 and 3.0.
#PYTHON SCRIPT
#written by: Richard Wolfe
#
#to run type: python sam_file_analize_reads.py -i <inputfile>
# or: ./ongest_sequence.py -i <inputfile>
#
# if error: /usr/bin/python^M: bad interpreter: No such file or directory
# -there is a windows endl after shebang
# -open in vi
# once in vi type:
# :set ff=unix<return>
# :x<return>
#
#
#
# goes through the sam file and analyze how the reads mapped
#
# Note:
#
import sys #for exit command and maxint
import argparse #to get command line args
#needed to install argparse module because using python 2.6
#and argparse comes with python 2.7
# sudo easy_install argparse
import os.path
import glob #for * in file name
import toolbox
import count_multi_map_reads #to make sam_stats table
def check_fastq_lines(R1,R2):
#R1 and R2 are file names
"""
f1 = open(R1, "r")
f2 = open(R2, "r")
f1_lines = 0
f2_lines = 0
line = f1.readline()
while line:
f1_lines += 1
line = f1.readline()
line = f2.readline()
while line:
f2_lines += 1
line = f2.readline()
"""
f1_lines = 0
f2_lines = 0
with open(R1) as f:
for f1_lines, l in enumerate(f):
pass
with open(R2) as f:
for f2_lines, l in enumerate(f):
pass
#need to add 1 line?
f1_lines += 1
f2_lines += 1
if f1_lines != f2_lines:
print "Error....Files are not same length"
print R1 + " lines = ", f1_lines
print R2 + " lines = ", f2_lines
sys.exit(1)
#f1.close()
#f2.close()
#print "Files are same length"
#print R1 + " lines = ", f1_lines
#print R2 + " lines = ", f2_lines
#sys.exit(1)
return
def get_from_file(f):
#
# f = file handle
# returns a list of the lines in the file
l = []
line = f.readline()
while line:
l.append(line.rstrip()) #removes the endline
line = f.readline()
return l
#create an argument parser object
#description will be printed when help is used
parser = argparse.ArgumentParser(description='A script to analyze how the reads mapped')
#add the available arguments -h and --help are aqdded by default
#if the input file does not exist then program will exit
#if output file does not exit it will be created
# args.input is the input file Note: cant write to this file because read only
# args.output is the output file
# args.m is the minimum seq length
parser.add_argument('-f', '--fasta_file',type=argparse.FileType('rU'), help='fasta file to map to ', required=True)
#parser.add_argument('-r', '--reads_files', help='comma seperated list of reads files to map(must be 2 and no spaces)', required=True)
#parser.add_argument('-n', '--names', help='comma seperated list of sample names for the reads(no spaces)', required=True)
parser.add_argument('-r', '--reads_files', type=argparse.FileType('rU'), help='text file of R1 and R2 reads files to map(must be 1 file per line)', required=True)
parser.add_argument('-n', '--names', type=argparse.FileType('rU'), help='text file of sample names for the reads(mut be 1 per line)', required=True)
parser.add_argument('-t', '--output', help='Output file table name. Also, sam_stats_<filename>', required=True)
parser.add_argument('-m', '--max_mismatch', type=int, help='max number of mismatches') #required=True)
parser.add_argument('--percent_read', type=int, help='percent of (read_length - mismatches)/read_length to keep EX: 90') #optional
parser.add_argument('-p', '--processors', type=int, help='number of processors', required=True)
parser.add_argument('--min_coverage', type=float, help='Minimum coverage cutoff Ex:250 or 250.00', required=True)
parser.add_argument('--min_percent_contig_coverage', type=float, help='Minimum percent of contig coverage cutoff Ex:80 or 80.00', required=True)
parser.add_argument('-u', '--multi_map', help='multi map T or F', default="F")
parser.add_argument('-d', '--delete_files', help='delete files made EX: bowtie and sam files T or F', default="T")
#get the args
args = parser.parse_args()
#can only have -m or --percent_read not both
if args.max_mismatch == None: #because int
if args.percent_read == None: #because int
print "Error ... must have -m or --percent_read"
sys.exit(1)
if not args.max_mismatch == None: #because int
if not args.percent_read == None: #because int
print "Error ... can not have both -m and --percent_read"
sys.exit(1)
#if using --percent_read set max_mismatch very high
if args.max_mismatch == None: #because int
args.max_mismatch = 5000
#print args.max_mismatch
#sys.exit(0)
if args.max_mismatch < 0:
print "Error ... -m must be greater than or equal to 0"
sys.exit(1)
if args.processors < 1:
print "Error ... -p must be greater than 0"
sys.exit(1)
#see if output file exists so it is not overwritten
if os.path.exists(args.output):
print "Error .... Output file already exists"
sys.exit(1)
if args.multi_map != "T":
if args.multi_map != "F":
print "Error ... -u must be T or F"
sys.exit(1)
if args.delete_files != "T":
if args.delete_files != "F":
print "Error ... -d must be T or F"
sys.exit(1)
print "Script Started ... "
#reads = args.reads_files.split(",")
#names = args.names.split(",")
reads = get_from_file(args.reads_files)
names = get_from_file(args.names)
reads_str = ",".join(reads)
names_str = ",".join(names)
#print reads_str
#print names_str
#sys.exit()
#check if reads file is a pair
if len(reads) % 2 != 0:
print "Error .... reads files not in sets of 2"
sys.exit(1)
#make sure all files exist
for item in reads:
if not os.path.exists(item):
print "Error .... reads file does not exist...." + item
sys.exit(1)
#check if each pair of reads has a name and not 2 names the same
if len(reads) / 2 != len(names):
print "Error .... number of reads files divided by 2 not equal to number of names"
sys.exit(1)
temp = []
for item in names:
if item in temp:
print "Error ... Same name occurs more than once"
sys.exit(1)
else:
temp.append(item)
temp = []
#check to make sure reads files have same number of lines
i = 0
while i < len(reads):
check_fastq_lines(reads[i],reads[i+1])
i = i + 2
#map each pair of reads to the fasta file
# 0. This script checks to make sure no sam files are in the folder before proceeding
# 1. This runs bowtie2 with -N 1 -- alows mismatches in seed
# bowtie2 is run on each set of reads mapping to the fasta file
# 2. Then runs script that makes a new sam files by marking reads that map with mismatches > specified to not map
# 3. Also runs a script that reads all the new sam files and makes a report table named map_reads_fasta_output.txt
# This table is not used
cmd = "python /ORG-Data/scripts/map_reads_fasta.py -f " + args.fasta_file.name + " -r " + reads_str
cmd = cmd + " -n " + names_str + " -t map_reads_fasta_output.txt " + " -m " + str(args.max_mismatch)
cmd = cmd + " -p " + str(args.processors) + " -d F -u " + args.multi_map
if not args.percent_read == None: #because int
cmd = cmd + " --percent_read " + str(args.percent_read)
print "Running command: " + cmd
toolbox.run_system(cmd)
for item in names:
f_name = args.fasta_file.name.split("/")[-1] #get the file name in case it is a path
sam_file = "mismatches_" + str(args.max_mismatch) + "_" + item + "_to_" + f_name + ".sam"
cmd = "python /ORG-Data/scripts/make_contig_cov_file_abundance.py -s " + args.fasta_file.name
cmd = cmd + " -o " + item + "_make_contig_cov_abundance.txt" + " -b " + sam_file
print "Running command: " + cmd
toolbox.run_system(cmd)
for item in names:
f_name = args.fasta_file.name.split("/")[-1] #get the file name in case it is a path
sam_file = "mismatches_" + str(args.max_mismatch) + "_" + item + "_to_" + f_name + ".sam"
cmd = "python /ORG-Data/scripts/make_contig_cov_file.py -s " + args.fasta_file.name
cmd = cmd + " -o " + item + "_make_contig_cov_output.txt" + " -b " + sam_file
print "Running command: " + cmd
toolbox.run_system(cmd)
for item in names:
f_name = args.fasta_file.name.split("/")[-1] #get the file name in case it is a path
sam_file = "mismatches_" + str(args.max_mismatch) + "_" + item + "_to_" + f_name + ".sam"
cmd = "python /ORG-Data/scripts/calc_percent_contig_cov.py -s " + sam_file
cmd = cmd + " -o " + item + "_cal_percent_contig_cov.txt" + " -v " + str(args.max_mismatch)
cmd = cmd + " -m 1 -p 0"
print "Running command: " + cmd
toolbox.run_system(cmd)
#make a list of genes from the fasta file
gene_ids = []
results = [] #will be a list of lists .. the percent_reads_mapped for each sample
line = args.fasta_file.readline()
while line:
if line.startswith(">"):
id = line.split()[0]
id = id[1:] #remove >
if id not in gene_ids:
gene_ids.append(id)
else:
print "Error... duplicate id in fasta file = " + id
sys.exit(1)
line = args.fasta_file.readline()
#extract the info from the ouptut files
for item in names:
percent_reads_mapped = []
contig_cov = []
percent_contig_cov = []
for i in gene_ids:
percent_reads_mapped.append("NA")
contig_cov.append("NA")
percent_contig_cov.append("NA")
#get the % reads mapped for this sample
f = open(item + "_make_contig_cov_abundance.txt", "r")
line = f.readline() #header line
line = f.readline()
while line:
line = line.rstrip()
p = line.split("\t")[3]
id = line.split("\t")[0]
index = gene_ids.index(id)
percent_reads_mapped[index] = p
line = f.readline()
#make sure everything found
for i in percent_reads_mapped:
if i == "NA":
print "ERROR in " + item + "_make_contig_cov_abundance.txt"
sys.exit(1)
#get the contig cov for this sample
f = open(item + "_make_contig_cov_output.txt", "r")
line = f.readline()
while line:
line = line.rstrip()
c = line.split("\t")[4]
id = line.split("\t")[0]
index = gene_ids.index(id)
contig_cov[index] = c
line = f.readline()
#make sure everything found
for i in contig_cov:
if i == "NA":
print "ERROR in " + item + "_make_contig_cov_output.txt"
sys.exit(1)
#get the % contig cov for this sample
f = open(item + "_cal_percent_contig_cov.txt", "r")
line = f.readline()
while line:
line = line.rstrip()
pc = line.split("\t")[3]
id = line.split("\t")[0]
index = gene_ids.index(id)
percent_contig_cov[index] = pc
line = f.readline()
#make sure everything found
for i in percent_contig_cov:
if i == "NA":
print "ERROR in " + item + "_cal_percent_contig_cov.txt"
sys.exit(1)
#see if this % reads mapped meets thresholds and save list
#parser.add_argument('--min_coverage', type=float, help='Minimum coverage cutoff Ex:250 or 250.00', required=True)
#parser.add_argument('--min_percent_contig_coverage', type=float, help='Minimum percent of contig coverage cutoff Ex:80 or 80.00', required=True)
i = 0
while i < len(gene_ids):
c = float(contig_cov[i]) * 1.0
p = float(percent_contig_cov[i]) * 100.0
if c < args.min_coverage or p < args.min_percent_contig_coverage:
percent_reads_mapped[i] = "0.0"
i += 1
results.append(percent_reads_mapped)
#print the output table
output = open(args.output, "w")
output.write("Gene_id\t")
i = 0
while i < len(names):
output.write(names[i] + "\t")
i += 1
output.write("\n")
i = 0
while i < len(gene_ids):
output.write(gene_ids[i] + "\t")
for item in results:
output.write(item[i] + "\t")
output.write("\n")
i += 1
##################################################
###make sam_stats table for each of the samples
sam_files = [] #the sam files for each sample
sample_names = [] #the sample names (could have used names [])
sam_data = [] #will be a list of tuples with the data from 1 or more sam files
for item in names:
sam_file_name = "mismatches_" + str(args.max_mismatch) + "_" + item + "_to_" + f_name + ".sam"
sam_files.append(sam_file_name)
sample_names.append(item)
#read each sam file and put the data tuple into sam_data
i = 0
while i < len(sam_files):
data_tuple = count_multi_map_reads.read_sam_file(sam_files[i])
sam_data.append(data_tuple)
i += 1
#print the data to the output file
sam_data_table = open("sam_stats_" + args.output, "w")
count_multi_map_reads.write_table(sam_data_table, sam_data, sample_names)
############################
if args.delete_files == "T":
for item in names:
try:
os.remove(item + "_make_contig_cov_abundance.txt")
os.remove(item + "_make_contig_cov_output.txt")
os.remove(item + "_cal_percent_contig_cov.txt")
except OSError:
print "Some files for " + item + " could not be removed"
try:
#os.remove(glob('*.sam'))
#os.remove(glob('*.bt2'))
cmd = "rm *.sam"
print "Running command: " + cmd
toolbox.run_system(cmd)
cmd = "rm *.bt2"
print "Running command: " + cmd
toolbox.run_system(cmd)
except OSError:
print "Some sam or bowtie files could not be removed"
print ""
print "Number of reads files = ", len(reads)
print "Number of names = ", len(names)
print ""
print "Number of ids in fasta file = ",len(gene_ids)
print "Script finished..."
sys.exit(0)