-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_relevant_genomes.py
executable file
·560 lines (430 loc) · 19.1 KB
/
retrieve_relevant_genomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
# -*- coding: utf-8 -*-
"""
RETRIEVE RELEVANT GENOMES (RRG)
[Description]
"""
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import Entrez
import json
import time
import pandas as pd
import os
config_json_file_name = "retrieve_relevant_genomes_config.json"
def set_up():
global ANALYSIS_NAME
global PROTEINS_FILENAME
global DATABASE
global TAXID
global EVAL_THRESHOLD
global QCOV_THRESHOLD
global ATTEMPTS_LIMIT
global EMAIL
global API_KEY
global SLEEP_TIME
global MINIMUM_GENOME_LENGTH
global ALPHA_PHAGES_FILENAME
global PILITROPIC_PHAGES_FILENAME
global FLAGELLOTROPIC_PHAGES_FILENAME
global USE_TBLASTN_HITS_FROM_PREVIOUS_RUN
file_reader = json.load(open(config_json_file_name))
# Name of the folder for the results from this analysis
ANALYSIS_NAME = file_reader['analysis_name']
# Set file names
PROTEINS_FILENAME = file_reader['proteins_filename']
# Set BLAST parameters
DATABASE = file_reader['blast_parameters']['database']
TAXID = file_reader['blast_parameters']['taxid']
EVAL_THRESHOLD = file_reader['blast_parameters']['eval_threshold']
QCOV_THRESHOLD = file_reader['blast_parameters']['qcov_threshold']
MINIMUM_GENOME_LENGTH = file_reader['blast_parameters']['minimum_genome_length']
ATTEMPTS_LIMIT = file_reader['blast_parameters']['attempts_limit']
# Set Entrez parameters
EMAIL = file_reader['entrez_parameters']['email']
API_KEY = file_reader['entrez_parameters']['api_key']
SLEEP_TIME = file_reader['entrez_parameters']['sleep_time']
# Files for alpha-infecting phages and pilitropic phages
ALPHA_PHAGES_FILENAME = file_reader["alpha_phages_file_name"]
PILITROPIC_PHAGES_FILENAME = file_reader["plitropic_phages_file_name"]
FLAGELLOTROPIC_PHAGES_FILENAME = file_reader["flagellotropic_phages_file_name"]
# Use the tBLASTn hits obtained in a previous run, instead of searching again
USE_TBLASTN_HITS_FROM_PREVIOUS_RUN = file_reader["use_tblastn_hits_from_previous_run"]
def tblastn_search(proteins, tag='0'):
# If download fails after as many attempts as ATTEMPTS_LIMIT, the protein will
# be appended to this list
failed_list = []
info_table = None
# taxon parameter is defined. It will be used to restrict the BLAST search
if TAXID != None:
taxon = "txid" + str(TAXID) + "[orgn]"
else:
taxon = None
list_1_query_prot_acc = []
list_2_hit_genome_acc = []
list_3_alignment_start_pos = []
list_4_alignment_end_pos = []
list_5_hit_name = []
list_6_hit_E_value = []
list_7_hit_query_cover = []
for protein in proteins:
print('Blast search for protein', protein.id)
failed = False
for i in range(ATTEMPTS_LIMIT):
try:
# Blast search
result_handle = NCBIWWW.qblast("tblastn", DATABASE, protein.seq, entrez_query=taxon, expect=EVAL_THRESHOLD)
time.sleep(SLEEP_TIME)
break
except:
print('Attempt', i + 1, 'failed')
if i == ATTEMPTS_LIMIT - 1:
failed = True
if failed == True:
print('Search failed for', protein, '\nAppending it to failed_list')
failed_list.append(protein)
continue
for n in range(ATTEMPTS_LIMIT):
try:
blast_record = NCBIXML.read(result_handle)
time.sleep(SLEEP_TIME)
break
except:
print('Attempt', n + 1, 'failed')
if n == ATTEMPTS_LIMIT - 1:
failed = True
if failed == True:
print('Reading failed for', protein, '\nAppending it to failed_list')
failed_list.append(protein)
continue
# Get Hits
for alignment in blast_record.alignments:
if iscompletegenome(alignment.hit_def):
for hsp in alignment.hsps:
# Compute query cover:
# Length of the query sequence
query_length = len(protein.seq)
# Length of the part of the query sequence involved in the alignment
query_span = hsp.query_end - hsp.query_start + 1
# query cover
query_cover = query_span / query_length
# Check whether query cover is acceptable
if query_cover > QCOV_THRESHOLD:
# For info Table
list_1_query_prot_acc.append(protein.id)
list_2_hit_genome_acc.append(alignment.accession)
list_3_alignment_start_pos.append(hsp.sbjct_start)
list_4_alignment_end_pos.append(hsp.sbjct_end)
list_5_hit_name.append(alignment.hit_def)
list_6_hit_E_value.append(hsp.expect)
list_7_hit_query_cover.append(query_cover)
# NEW TMP INFO TABLE
# Better names
list_5_hit_name_short = [s.split(', complete')[0] for s in list_5_hit_name]
# Info Table with E-values and query cover values
info_table = pd.DataFrame({'Delta phage protein': list_1_query_prot_acc,
'Hit genome accession': list_2_hit_genome_acc,
'Start position of the alignment': list_3_alignment_start_pos,
'End position of the alignment': list_4_alignment_end_pos,
'Phage name': list_5_hit_name_short,
'E_value': list_6_hit_E_value,
'Q_cov': list_7_hit_query_cover})
# Save temporary table
tmp_table_filename = 'tmp_' + tag + '_tBLASTn_hits_Info_Table.tsv'
RRG_out_dir = os.path.join(ANALYSIS_NAME, 'RRG_output')
tmp_table_filepath = os.path.join(RRG_out_dir, tmp_table_filename)
info_table.to_csv(tmp_table_filepath, sep='\t', index=False)
# Return info_table and list of failed searches
return info_table, failed_list
def isrefseq(genome_accession):
''' Returns True if the input is a RefSeq genome accession '''
return genome_accession[2] == '_'
def iscompletegenome(description):
return 'complete genome' in description and 'partial' not in description
def check_dir(dirname):
if not os.path.exists(dirname):
os.mkdir(dirname)
def organize_genomes_by_taxid(genomes):
# Genome accessions found for each species
taxid_to_accessions = {} # key: species taxid -> value: list of accessions
taxid_to_title = {} # key: species taxid -> value: species title
failed_downloads = []
failed_reads = []
short_genomes = [] # Genomes that are descarted because too short
incomplete_genomes = [] # Genomes that are descarted because incomplete
# Keep track of which species have a refseq
#species_with_refseq = []
for acc in genomes:
failed = False # Becomes True when either downloading or reading fails
# Try to download genome info
for n in range(ATTEMPTS_LIMIT):
try:
handle = Entrez.esummary(db="nucleotide", id=acc)
time.sleep(SLEEP_TIME)
break
except:
print('Attempt', n + 1, 'failed')
if n == ATTEMPTS_LIMIT - 1:
failed = True
if failed:
print('Download failed for', acc)
print('Appending it to failed_downloads')
failed_downloads.append(acc)
continue
# If download didn't fail, try to read handle:
for n in range(ATTEMPTS_LIMIT):
try:
record = Entrez.read(handle)
handle.close()
break
except:
print('Reading attempt', n + 1, 'failed')
time.sleep(SLEEP_TIME)
if n == ATTEMPTS_LIMIT - 1:
handle.close()
failed = True
if failed:
print('Reading failed for', acc)
print('Appending it to failed_reads')
failed_reads.append(acc)
continue
# Process record
genome_title = record[0]['Title']
genome_taxid = int(record[0]['TaxId'])
genome_length = int(record[0]['Length'])
print(genome_title)
# Check genome length
if genome_length > MINIMUM_GENOME_LENGTH:
# Check genome completeness
if iscompletegenome(genome_title):
# Compile species-specific list of genome accessions
if genome_taxid in taxid_to_accessions.keys():
taxid_to_accessions[genome_taxid].append(acc)
else:
taxid_to_accessions[genome_taxid] = [acc]
taxid_to_title[genome_taxid] = genome_title
# Discard if incomplete
else:
incomplete_genomes.append(acc)
# Discard if too short
else:
short_genomes.append(acc)
# Check failed downloads
if len(failed_downloads) > 0:
print('\nCheck failed_downloads list: {} failed downloads'.format(
len(failed_downloads))
)
# Check failed reads
if len(failed_reads) > 0:
print('\nCheck failed_reads list: {} failed reads'.format(
len(failed_reads))
)
# Check short genomes
if len(short_genomes) > 0:
print('\n' + str(len(short_genomes)) + ' genomes were discarted ' +
'because shorter than the reqired minimum length of ' +
str(MINIMUM_GENOME_LENGTH) + ' base-pairs')
# Check incomplete genomes
if len(incomplete_genomes) > 0:
print('\n{} genomes were discarted because incomplete'.format(
len(incomplete_genomes))
)
return [taxid_to_accessions, taxid_to_title, failed_downloads,
failed_reads, short_genomes, incomplete_genomes]
def loop_over_failed_searches(info_table, failed_list):
''' Re-runs the tblastn_search function until failed_list stops shrinking. '''
# While loop to try again failed searches
failed_list_old = []
c = 0
while failed_list != failed_list_old:
print('tblastn_search iteration {}:\t{} failed tBLASTn queries'.format(
c, len(failed_list))
)
failed_list_old = failed_list
c += 1
info_table_to_add, failed_list = tblastn_search(failed_list_old, str(c))
# Add new hits to info_table
if isinstance(info_table_to_add, pd.DataFrame):
if len(info_table_to_add) != 0:
info_table = pd.concat([info_table, info_table_to_add])
# Clean up temporary tBLASTn tables
RRG_out_dir = os.path.join(ANALYSIS_NAME, 'RRG_output')
analysis_dir_content = os.listdir(RRG_out_dir)
for f in analysis_dir_content:
if f[:4] == 'tmp_' and f[-15:] == '_Info_Table.tsv':
print('\nRemoving file ' + f)
filepath = os.path.join(RRG_out_dir, f)
os.remove(filepath)
return info_table, failed_list
def select_one_genome_per_taxid(taxid_to_accessions):
# Select accessions for the analysis
selected_acc_dict = {}
for taxid in taxid_to_accessions.keys():
accessions = taxid_to_accessions[taxid]
for acc in accessions:
if isrefseq(acc):
# if a refseq is present, it becomes the only element of the
# list. Otherwise, 'accession' is going to be a list of one or
# more genbank accesssions.
accessions = [acc]
# The first element (alphabetically) is going to be selected. (It's
# going to be the refseq if 'accessions' only contained the refseq).
accessions.sort()
selected_acc = accessions[0]
selected_acc_dict[taxid] = selected_acc
return selected_acc_dict
def main():
print('\nRUN NAME:', ANALYSIS_NAME, '\n')
# Make directory for this run if it doesn't exist yet
check_dir(ANALYSIS_NAME)
# Output subfolder
RRG_out_dir = os.path.join(ANALYSIS_NAME, 'RRG_output')
check_dir(RRG_out_dir)
# Set email and API key
Entrez.email = EMAIL
Entrez.apikey = API_KEY
# tBLASTn
if USE_TBLASTN_HITS_FROM_PREVIOUS_RUN:
# Previous run RRG output directory
prev_outdir = os.path.join(USE_TBLASTN_HITS_FROM_PREVIOUS_RUN, 'RRG_output')
# Load tBLASTn hits from previous run
filepath = os.path.join(prev_outdir, 'tBLASTn_hits.json')
tblastn_accessions = json.load(open(filepath))
# Load tBLASTn hits Info Table
filepath = os.path.join(prev_outdir, 'tBLASTn_hits_Info_Table.tsv')
info_table = pd.read_csv(filepath, sep='\t')
else:
# Perform tBLASTn search
# Query proteins
proteins_filepath = os.path.join('input_datasets', PROTEINS_FILENAME)
proteins = SeqIO.parse(proteins_filepath, format="fasta")
# Run tBLASTn
info_table, failed_list = tblastn_search(proteins)
# Try again with the failed searches until failed_list stops shrinking
info_table, failed_list = loop_over_failed_searches(info_table, failed_list)
# If there are still failed searches after the while loop, report them
if len(failed_list) > 0:
failed_filepath = os.path.join(RRG_out_dir, 'failed_tBLASTn_searches.json')
print("{} genomes couldn't be retrieved. Saving them into: {}".format(
len(failed_list), failed_filepath))
with open(failed_filepath, 'w') as outfile:
json.dump(failed_list, outfile)
# Hit genomes
tblastn_accessions = list(info_table['Hit genome accession'])
tblastn_accessions = list(set(tblastn_accessions)) # Remove duplicates
# Save tBLASTn hit genomes
filepath = os.path.join(RRG_out_dir, 'tBLASTn_hits.json')
with open(filepath, 'w') as outfile:
json.dump(tblastn_accessions, outfile)
# Save a table about tBLASTn output with complete info
filepath = os.path.join(RRG_out_dir, 'tBLASTn_hits_Info_Table.tsv')
info_table.to_csv(filepath, sep='\t', index=False)
# Load the other datasets
# Alpha-infecting phage genomes
alpha_filepath = os.path.join('input_datasets', ALPHA_PHAGES_FILENAME)
alpha_accessions = json.load(open(alpha_filepath))
# Pilitropic phage genomes
pili_filepath = os.path.join('input_datasets', PILITROPIC_PHAGES_FILENAME)
pilitropic_accessions = json.load(open(pili_filepath))
# Flagellotropic phage genomes
flagel_filepath = os.path.join('input_datasets', FLAGELLOTROPIC_PHAGES_FILENAME)
flagellotropic_accessions = json.load(open(flagel_filepath))
# All the genomes
genomes = (tblastn_accessions +
alpha_accessions +
pilitropic_accessions +
flagellotropic_accessions)
genomes = list(set(genomes)) # Remove duplicates
print('\n\tOrganizing genomes by taxid:')
(taxid_to_accessions, taxid_to_title, failed_download, failed_reading,
short_genomes, incomplete_genomes) = organize_genomes_by_taxid(genomes)
# Select one representative genome per taxid
selected_acc_dict = select_one_genome_per_taxid(taxid_to_accessions)
# accession_to_taxid dictionary
accession_to_taxid = {}
for taxid in taxid_to_accessions.keys():
accessions = taxid_to_accessions[taxid]
for acc in accessions:
accession_to_taxid[acc] = taxid
# All the accessions collected
collected_accessions = list(accession_to_taxid.keys())
acc_list = []
taxid_list = []
title_list = []
tblastn_list = []
alpha_list = []
pili_list = []
flagel_list = []
used_list = []
for acc in genomes:
# Get taxid and title
if acc in collected_accessions:
taxid = accession_to_taxid[acc]
title = taxid_to_title[taxid]
else:
taxid = 'ignored accession'
title = 'ignored accession'
# Define source(s)
# Is from tblastn set
if acc in tblastn_accessions:
tblastn_tag = 'yes'
else:
tblastn_tag = 'no'
# Is from alpha-infecting set
if acc in alpha_accessions:
alpha_tag = 'yes'
else:
alpha_tag = 'no'
# Is from pilitropic set
if acc in pilitropic_accessions:
pili_tag = 'yes'
else:
pili_tag = 'no'
# Is from flagellotropic set
if acc in flagellotropic_accessions:
flagel_tag = 'yes'
else:
flagel_tag = 'no'
# Define used_tag
if acc in failed_download or acc in failed_reading:
used_tag = 'No: failed request to NCBI'
elif acc in short_genomes:
used_tag = 'No: short genome'
elif acc in incomplete_genomes:
used_tag = 'No: incomplete genome'
elif acc not in selected_acc_dict.values():
used_tag = 'No: replaced with ' + selected_acc_dict[taxid]
else:
used_tag = 'Yes'
acc_list.append(acc)
taxid_list.append(taxid)
title_list.append(title)
tblastn_list.append(tblastn_tag)
alpha_list.append(alpha_tag)
pili_list.append(pili_tag)
flagel_list.append(flagel_tag)
used_list.append(used_tag)
report_table = pd.DataFrame({'accession': acc_list,
'taxid': taxid_list,
'title': title_list,
'from_tblastn_search': tblastn_list,
'from_alpha-infecting_set': alpha_list,
'from_pilitropic_set': pili_list,
'from_flagellotropic_set': flagel_list,
'used': used_list})
# Save Report Table
filepath = os.path.join(RRG_out_dir, 'relevant_genomes_Report_Table.tsv')
report_table.to_csv(filepath, sep='\t', index=False)
# All selected accessions
selected_df = report_table.loc[report_table['used'] == 'Yes']
all_selected_accessions = list(selected_df['accession'])
# Save list of selected accessions
filepath = os.path.join(RRG_out_dir, 'relevant_genomes.json')
with open(filepath, 'w') as outfile:
json.dump(all_selected_accessions, outfile)
print('\nRRG: done.\n')
if __name__ == "__main__":
# Read config file and set all the variables
set_up()
# Main program
main()