-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_reviewed.py
105 lines (92 loc) · 4.09 KB
/
find_reviewed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Warning: This script requires a ranked list text file to be generated from
# rank_and_plot_distmtx.py. It also requires that a folder with fasta files
# for each cluster in the distance matrix is available.
#
# This script takes a ranked list of centroid ids and finds if any reviewed
# sequences exist within the same cluster.
#
# Example:
# python .\find_reviewed.py ranked_list.txt ranked_list_with_reviewed.txt
# 'C:\\Users\\Name\\Project\\Cluster Fasta Files\\'
import os
import argparse
# Get command line arguments
parser = argparse.ArgumentParser()
parser.add_argument('infile', nargs=1, type=str,
help='Ranked list (centroids) text file generated from '
'rank_and_plot_distmtx.py.')
parser.add_argument('outfile', nargs=1, type=str,
help='Ranked list of centroids with a reviewed sequence id '
'(if available) inserted after the centroid sequence '
'id (separated by a comma). File format is txt.')
parser.add_argument('directory', nargs=1, type=str,
help='File path to a folder with cluster fasta '
'files (filenames should be cluster numbers with '
'.fasta extension). These files can be generated '
'using the -clusters option during usearch '
'clustering.')
args = parser.parse_args()
directory = vars(args)['directory'][0]
# Get a dictionary relating cluster filenames (e.g. 123.fasta) with one
# reviewed sequence in that cluster, if available.
reviewed_dict = {}
for filename in os.listdir(directory):
with open(directory + filename) as infile:
cluster_num = filename[0:-6]
for line in infile:
# sp stands for swiss-prot which means the entry has been
# reviewed on uniprot
if line[1:3] == 'sp':
end = 6
char = line[6]
while char != '|':
char = line[end]
end += 1
current_id = line[4:end - 1]
reviewed_dict[filename[0:-6]] = current_id
# Get list of ranked clusters from rank_and_plot_distmtx outfile
ranked_list = []
with open(vars(args)['infile'][0]) as infile:
for line in infile:
end = 6
char = line[6]
while char != '|':
char = line[end]
end += 1
current_id = line[3:end - 1]
ranked_list.append(current_id)
# Find the cluster that each centroid_id is found in and relate the cluster
# number to that id in ranked_dict.
ranked_dict = {}
for centroid_id in ranked_list:
for filename in os.listdir(directory):
# FASTA files end in cluster_num.fasta
cluster_num = filename[0:-6]
with open(directory + filename) as infastafile:
for line in infastafile:
if line[0] == '>':
# Shortest uniprot ids are 6 characters.
end = 6
char = line[6]
while char != '|':
char = line[end]
end += 1
current_id = line[4:end - 1]
if centroid_id == current_id:
ranked_dict[centroid_id] = cluster_num
# Create output file by writing centroids and, if the centroids cluster
# number is found in the reviewed_dict (which contains cluster numbers for
# reviewed sequences only), then write that reviewed sequence id next to the
# centroid id.
with open(vars(args)['outfile'][0], 'w') as outfile:
outfile.write('Uniprot ID,Reviewed ID in same cluster (if '
'available)\n')
for centroid in ranked_list:
outfile.write(centroid)
# Second condition to make sure that the reviewed sequence id does
# not match the centroid id.
if centroid in ranked_dict and ranked_dict[centroid] in reviewed_dict \
and reviewed_dict[ranked_dict[centroid]] != ranked_dict[centroid]:
outfile.write(',' + reviewed_dict[ranked_dict[centroid]] + '\n')
else:
outfile.write('\n')