-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrank_and_plot_distmtx.py
380 lines (337 loc) · 17.1 KB
/
rank_and_plot_distmtx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
# Takes a distance matrix (calculated from Calc_distmtx.py) and converts it
# to a numpy array of positions. This is then converted into a ranked list of
# clusters as a text file with one cluster per line. First argument is
# infile and second is outfile. Third is FASTA file that was used to generate
# the distance matrix. It must have cluster size annotations for each
# cluster centroid (can get this by using -sizeout option with usearch
# -cluster_fast algorithm).
#
# Example implementation:
# python ./rank_and_plot_distmtx.py distmtx.txt ranked_centroids.txt
# centroids.fasta -title 'My Title Here' -rank 20 -show 10 -show_pdb True
# -cluster_file_dir "./cluster_files/" -circle_size_multiplier 10
import numpy as np
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from Find_PDB_Clusters_SIFTS import *
from get_size import get_size
# Get cluster sizes and generate distance matrix from distance matrix text file
def convert_distance_matrix(filename, num_seqs, out_matrix, in_size_dict):
line_num = -1
number = ''
print(in_size_dict)
with open(filename, 'r') as inputfile:
for line in inputfile:
# Distance matrix text file generated via biopython only includes
# first 10 characters of the full sequence id. If two sequences
# have the same first 10 characters for their id (extremely
# unlikely), then just manually change one of them temporarily.
current_id = ''
char_index = 4
while line[char_index] != '|':
char_index += 1
current_id_frag = line[:char_index]
for key in in_size_dict:
if current_id_frag in key:
current_id = key
# Loop until all lines in distance matrix have been processed
print('Line Num: ', line_num, '\tnum_seqs: ', num_seqs)
if line_num < num_seqs - 1:
line_num += 1
else:
return out_matrix, size_list, size_sqrt_list, labels
# Generate cluster labels and sizes for graph
labels[line_num] = current_id
size_list[line_num] = in_size_dict[current_id]
# Arbitrarily multiply by 300 so that circles are larger on graph
size_sqrt_list[line_num] = 300 * np.sqrt(in_size_dict[current_id])
# Extract each number from distance matrix text file by finding its
# decimal point. Then add to our matrix.
column_num = 0
for char_num in range(len(line)):
if line[char_num] == '.':
number += line[char_num - 1]
while line[char_num] != '\t' and line[char_num] != '\n':
number += line[char_num]
char_num += 1
number_to_add = float(number)
number = ''
out_matrix[column_num][line_num] = number_to_add
out_matrix[line_num][column_num] = number_to_add
column_num += 1
# Get positions (pos) of all clusters based on distance matrix using
# multi-dimensional scaling (mds). Must first convert our list of lists
# (matrix) to a numpy array (np_matrix).
def transform_matrix(in_matrix):
np_matrix = np.array(in_matrix)
mds = MDS()
positions = mds.fit_transform(np_matrix)
return positions
def get_max_cluster(in_size_dict):
# Initialize variables
max_size = 0
max_cluster = ''
# Find biggest cluster and its size to calculate weighted size later
for cluster_id in in_size_dict.keys():
if in_size_dict[cluster_id] > max_size:
max_size = in_size_dict[cluster_id]
max_cluster = cluster_id
return max_cluster
def get_ranked_list(in_table, number_picked_clusters, number_shown_clusters,
starting_cluster, size_weight):
# Generate ranked list of clusters
current_cluster = starting_cluster
ranked_list = []
i = 0
# Pairwise maximum distance
d_pair_max = calc_pairwise_max_distance(in_table)
# Choose weight for size and distance (must add to 1)
distance_weight = 1 - size_weight
while i < number_picked_clusters:
score_max = 0
# Maximum sum of distances to all other points from current point
d_total_max = d_pair_max * (i + 1)
# Go through each cluster that hasn't yet been picked and calculate its
# distance to every other cluster that has been picked. Then,
ranked_list.append(current_cluster)
in_table.set_value(current_cluster, 'picked', True)
if i < number_shown_clusters:
in_table.set_value(current_cluster, 'show', True)
for cluster in in_table.loc[~in_table['picked']].loc[:, 'id']:
d = 0
x1 = in_table.at[cluster, 'x']
y1 = in_table.at[cluster, 'y']
# Calculate sum of distances from cluster to all picked clusters
for picked_cluster in in_table.loc[in_table['picked']].loc[:, 'id']:
x2 = in_table.at[picked_cluster, 'x']
y2 = in_table.at[picked_cluster, 'y']
d += np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2) / d_pair_max
cluster_score = distance_weight / d_total_max * d + \
size_weight * size_dict[cluster] / size_dict[starting_cluster]
if cluster_score > score_max:
score_max = cluster_score
current_cluster = cluster
i += 1
print(i, 'out of', num_picked_clusters, 'read.')
return ranked_list
def calc_pairwise_max_distance(in_table):
max_distance = 0
x_list = in_table['x']
y_list = in_table['y']
# Calculate distance from every point to every other point and record max
# distance
for x1, y1 in zip(x_list, y_list):
for x2, y2 in zip(x_list, y_list):
distance = np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
if distance > max_distance:
max_distance = distance
return max_distance
def get_pdb_id_dict(ranked_list, directory):
# Find cluster number for each id
cluster_file_dict = find_cluster_files(ranked_list,
directory)
mapping_df = pd.read_csv('uniprot_pdb.csv')
mapping_df = mapping_df.set_index('SP_PRIMARY')
PDB_id_dict = map_all(ranked_list, cluster_file_dict,
mapping_df)
return PDB_id_dict
def fill_in_pdb_info(in_table, PDB_id_dict):
pdb_list = []
for centroid_id in in_table['id']:
if centroid_id in PDB_id_dict:
pdb_list.append(PDB_id_dict[centroid_id])
else:
pdb_list.append(None)
in_table['PDB'] = pdb_list
if __name__ == '__main__':
# Parse command line arguments from stdin
parser = argparse.ArgumentParser()
parser.add_argument('infile', nargs=1, type=str,
help='Must be a distance matrix text file (i.e. '
'calculated with calc_distmtx.py).')
parser.add_argument('outfile', nargs=1, type=str,
help='Must be a text file.')
parser.add_argument('fig_filepath', nargs=1, type=str,
help='Filepath to output figure (.svg)')
parser.add_argument('size_fasta', nargs=1, type=str,
help='Centroids FASTA file used to generate '
'alignment file is needed to get size of each '
'cluster. Must use -sizeout option with usearch '
'to generate such a FASTA file.')
parser.add_argument('-title', nargs=1, type=str,
help='Title for plot - default is an empty string.')
parser.add_argument('-rank', nargs=1, type=int,
help='Number of clusters to rank - default is all.')
parser.add_argument('-size_weight', nargs=1, type=float,
help='Must be a number between 0 and 1. Indicates how '
'much size will be weighted in ranking. Values '
'that seem to \"look\" the best tend to fall '
'between 0.2 and 0.5. If zero, only distance '
'will be used to determine ranking.')
parser.add_argument('-show', nargs=1, type=int,
help='Number of clusters to label - default is all.')
parser.add_argument('-show_pdb', nargs=1, type=bool, default=False,
help='If true, highlight clusters for which PDB '
'structural information known for at least one '
'sequence within that cluster (May take a '
'while). Make sure to combine this with the '
'-cluster_file_dir option.')
parser.add_argument('-cluster_file_dir', nargs=1, type=str,
help='If you want to show PDB information on plot, '
'make sure show_pdb is checked and provide the '
'directory of all the cluster fasta files ('
'generated in usearch via the -cluster option.)')
parser.add_argument('-circle_size_multiplier', nargs=1, type=float,
default=1,
help='Can increase or decrease circle size by putting '
'in a multiplier - less than 1 decreases circle '
'size and greater than 1 increases circle size '
'proportionally.')
args = parser.parse_args()
# Initialize all lists, matrices, and dicts to be used in the script
with open(vars(args)['infile'][0], 'r') as infile:
# Get number of clusters by counting lines in script except last line
# which starts with a tab
num_clusters = 0
print('reading infile')
for line in infile:
if line[0] != '\t':
num_clusters += 1
empty_matrix = [[0 for j in range(num_clusters)] for i in
range(num_clusters)]
size_list = [0 for k in range(num_clusters)]
size_sqrt_list = [0 for l in range(num_clusters)]
labels = ['' for m in range(num_clusters)]
# Get dict of cluster ids (keys) and their respective sizes (values)
size_dict = get_size(vars(args)['size_fasta'][0])
# Fill our empty matrix with values from the distance matrix text file.
# Also generates 3 ordered lists: cluster size (the absolute area of the
# cluster); the square root of that size (to account for the fact that r
# is proportional to the square root of size for a circle, and r is used
# to create the circles on the graph, not area); and labels, which are
# just the cluster ids to be used in the graph labels.
matrix, size_list, size_sqrt_list, labels = convert_distance_matrix(
vars(args)['infile'][0], num_clusters, empty_matrix, size_dict)
# Get a set of x, y coordinates from our set of distances in our distance
# matrix using mds via scikit learn.
pos = transform_matrix(matrix)
# Create pandas dataframe to easily view and manipulate the data as a table
data_dict = {'id': labels, 'size': size_list, 'show': [False for n in
range(len(labels))],
'x': pos[:, 0], 'y': pos[:, 1], 'PDB': [None for n in range(
len(labels))],
'picked': [False for n in range(len(labels))]}
table = pd.DataFrame(data_dict, index=labels)
# Start our ranking at the largest cluster.
largest_cluster = get_max_cluster(size_dict)
table.set_value(largest_cluster, 'picked', True)
pd.set_option('max_rows', 100)
# Set how many clusters we want to choose (specified by '-rank' option).
# Default to all clusters.
if vars(args)['rank']:
num_picked_clusters = vars(args)['rank'][0]
else:
num_picked_clusters = num_clusters
if vars(args)['size_weight']:
cluster_size_weight = vars(args)['size_weight'][0]
else:
cluster_size_weight = 0
if vars(args)['show']:
num_shown_clusters = vars(args)['show'][0]
else:
num_shown_clusters = 10
if vars(args)['show_pdb']:
show_pdb_info = True
try:
cluster_file_dir = vars(args)['cluster_file_dir'][0]
except TypeError:
raise Exception('If you want to show PDB information, you must '
'also use -cluster_file_dir option.')
else:
show_pdb_info = False
cluster_file_dir = None
# Generate ranked list from our table of cluster positions, the amount of
# clusters we would like to rank, and the starting cluster.
ranked_list_centroids = get_ranked_list(table, num_picked_clusters,
num_shown_clusters, largest_cluster,
cluster_size_weight)
# If desired, get PDB information (if any) and add to table
if show_pdb_info:
print('Extracting PDB information...')
pdb_id_dict = get_pdb_id_dict(table['id'], cluster_file_dir)
fill_in_pdb_info(table, pdb_id_dict)
# Generate a list of colors based on which clusters were picked.
color_list = ['' for i in range(num_clusters)]
for i in range(num_clusters):
if table.iloc[i].loc['PDB'] and table.iloc[i].loc['show']:
color_list[i] = 'magenta'
elif table.iloc[i].loc['PDB']:
color_list[i] = 'green'
elif table.iloc[i].loc['show']:
color_list[i] = 'red'
else:
color_list[i] = 'blue'
# Functionality to increase/decrease circle size
if vars(args)['circle_size_multiplier']:
circle_size_multiplier = float(vars(args)['circle_size_multiplier'][0])
for index, size in enumerate(size_list):
size_list[index] = circle_size_multiplier * size
# Plot the clusters using x and y positions (pos) with circle sizes
# taken from size_list and colors taken from color_list.
plt.figure(figsize=(15, 15))
plt.scatter(pos[:, 0], pos[:, 1], size_list, alpha=0.7, c=color_list)
try:
plt.title(vars(args)['title'][0], fontsize=60)
except TypeError:
print('No plot title chosen. Add a title with the -title option.\n')
plt.grid(True)
# Add ranking to list of cluster labels (form is 'cluster_id ranking_num')
for label_num in range(len(labels)):
for seq_id in ranked_list_centroids:
if labels[label_num] == seq_id:
labels[label_num] += (
' ' + str(ranked_list_centroids.index(seq_id) + 1))
# Add labels to plot
for label, x, y in zip(labels, pos[:, 0], pos[:, 1]):
# Only want labels for picked clusters
if table.iloc[labels.index(label)].loc['show']:
if table.iloc[labels.index(label)].loc['PDB']:
plt.annotate(label, xy=(x, y), xytext=(10, 10), fontsize=22,
textcoords='offset points', ha='left', va='top',
bbox=dict(boxstyle='round', pad=0.5, fc='magenta',
alpha=0.3))
else:
plt.annotate(label, xy=(x, y), xytext=(-10, 10), fontsize=22,
textcoords='offset points', ha='right',
va='bottom', bbox=dict(boxstyle='round,pad=0.5',
fc='red', alpha=0.3))
elif table.iloc[labels.index(label)].loc['PDB']:
plt.annotate(label, xy=(x, y),
xytext=(25, 25), textcoords='offset points',
ha='left', va='top', fontsize=22,
bbox=dict(boxstyle='round', pad=0.5, fc='green',
alpha=0.3))
# Add axis labels
plt.xlabel('Relative Distance (arbitrary units)', size=50)
plt.ylabel('Relative Distance (arbitrary units)', size=50)
# Add legend to plot
blue_patch = mpatches.Patch(color='blue', label='No PDB, not ranked in '
'top ' +
str(num_shown_clusters))
red_patch = mpatches.Patch(color='red', label='No PDB, ranked in top ' +
str(num_shown_clusters))
green_patch = mpatches.Patch(color='green', label='PDB structure exists, '
'not ranked in top ' +
str(num_shown_clusters))
magenta_patch = mpatches.Patch(color='magenta',
label='PDB structure exists, and ranked '
'in top ' + str(num_shown_clusters))
plt.legend(handles=[blue_patch, red_patch, green_patch, magenta_patch],
fontsize=30)
# Generate ranked list text output file (to be used by find_reviewed.py)
with open(vars(args)['outfile'][0], 'w') as outfile:
for item in ranked_list_centroids:
outfile.write(item + '\n')
print(table.head())
plt.savefig(vars(args)['fig_filepath'][0])