import pickle
import os
import numpy as np
import cupy as cp
import tensorflow as tf
import cv2 as cv
from generate_datasets import try_draw_single_font
from unicode_info.database import generate_data_for_experiment
from cluster_metrics import calculate_mean_iou, calculate_mean_precision
import time
import datetime
import tqdm
def generate_features_dict_file_path(save_dir: str, features_dict_file="features_dict_file.pkl"):
    return os.path.join(save_dir, features_dict_file)


def generate_codepoints_cluster_map_file_path(save_dir: str):
    return os.path.join(save_dir, "codepoints_cluster_map.pkl")


def generate_cluster_codepoints_map_file_path(save_dir: str):
    return os.path.join(save_dir, "cluster_codepoints_map.pkl")
class EfficientNetFeatureExtractor:
    def __init__(self, model_path: str, batch_size: int, save_dir: str, multifont_mapping_path: str):
        self.mp = model_path  # directory containing model.json, model.h5 and model_info.pkl
        self.bs = batch_size  # number of glyph images per forward pass
        self.sd = save_dir  # directory where the features dict pickle is written
        self.mmp = multifont_mapping_path  # pickle mapping codepoints to lists of supporting font paths
    def generate_minimum_used_fonts_dict(self, unicode_supported_fonts_drawn_dict) -> dict:
        """
        Greedy set cover: repeatedly pick the font that supports the most not-yet-covered codepoints.
        :param unicode_supported_fonts_drawn_dict: keys are codepoints, values are {font_path: drawn image} dicts
        :return: a dictionary whose keys are all supported unicode codepoints and whose values are the
                 font the greedy cover assigned to that codepoint
        """
        num_unicodes = len(unicode_supported_fonts_drawn_dict)
        # Invert the mapping: font -> list of codepoints it supports.
        minimum_font_dict = {}
        for unicode, font_feature_dict in unicode_supported_fonts_drawn_dict.items():
            supported_fonts = font_feature_dict.keys()
            for font in supported_fonts:
                if font not in minimum_font_dict:
                    minimum_font_dict[font] = [unicode]
                else:
                    minimum_font_dict[font].append(unicode)
        font_unicodes_list = list(minimum_font_dict.items())
        minimum_used_fonts_dict = {}
        last_update = []
        copied_font_unicodes_list = font_unicodes_list.copy()
        while len(minimum_used_fonts_dict) < num_unicodes:
            max_len_index = -1
            max_len = 0
            for i in range(len(copied_font_unicodes_list)):
                # Drop the codepoints covered in the previous round, then track the largest remainder.
                copied_font_unicodes_list[i] = (copied_font_unicodes_list[i][0],
                                                [u for u in copied_font_unicodes_list[i][1]
                                                 if u not in last_update])
                font_len = len(copied_font_unicodes_list[i][1])
                if font_len > max_len:
                    max_len = font_len
                    max_len_index = i
            # If no remaining font covers any uncovered codepoint, something is inconsistent upstream.
            assert max_len_index != -1
            most_supported_font = copied_font_unicodes_list[max_len_index]
            for unicode in most_supported_font[1]:
                assert unicode not in minimum_used_fonts_dict
                minimum_used_fonts_dict[unicode] = most_supported_font[0]
            last_update = most_supported_font[1]
            del copied_font_unicodes_list[max_len_index]
        return minimum_used_fonts_dict
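    # Illustrative sketch on hypothetical data (not the real font set): given
    #   font_a -> [0x41, 0x42, 0x43], font_b -> [0x42, 0x43], font_c -> [0x44],
    # the first pass picks font_a (3 uncovered codepoints), which empties font_b's list,
    # and the second pass picks font_c, yielding
    #   {0x41: 'font_a', 0x42: 'font_a', 0x43: 'font_a', 0x44: 'font_c'}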
    def _load_model_load_data_and_extract_features(self, model_path: str, batch_size: int,
                                                   multifont_mapping_path: str) -> dict:
        # Load model architecture + weights
        with open(os.path.join(model_path, 'model.json'), 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = tf.keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights(os.path.join(model_path, 'model.h5'))
        # Load model input info
        with open(os.path.join(model_path, 'model_info.pkl'), 'rb') as model_info_file:
            model_info_dict = pickle.load(model_info_file)
        img_size, font_size = model_info_dict['img_size'], model_info_dict['font_size']
        empty_image = np.full((img_size, img_size), 255)
        # Load multifont mapping dict (codepoint -> list of candidate font paths)
        with open(multifont_mapping_path, 'rb') as multifont_mapping_file:
            unicode_font_mapping_dict = pickle.load(multifont_mapping_file)
        # Draw every codepoint with every font that supports it, caching the result on disk.
        if os.path.exists('unicode_supported_fonts_drawn_dict.pkl'):
            with open('unicode_supported_fonts_drawn_dict.pkl', 'rb') as f:
                unicode_supported_fonts_drawn_dict = pickle.load(f)
        else:
            unicode_supported_fonts_drawn_dict = {}
            for unicode_point in tqdm.tqdm(unicode_font_mapping_dict.keys()):
                for font_path in unicode_font_mapping_dict[unicode_point]:
                    unicode_drawn = try_draw_single_font(unicode_point, font_path, empty_image, img_size,
                                                         font_size, "./fonts", transform_img=False)
                    # A draw identical to the empty canvas means the font does not actually
                    # support this codepoint.
                    if not (unicode_drawn == empty_image).all():
                        # Grayscale -> RGB, then rescale pixel values from [0, 255] to [-1, 1].
                        unicode_drawn_preprocessed = (
                            (cv.cvtColor(unicode_drawn, cv.COLOR_GRAY2RGB) - (255 / 2)) / (255 / 2)
                        ).astype(np.float32)
                        if unicode_point not in unicode_supported_fonts_drawn_dict:
                            unicode_supported_fonts_drawn_dict[unicode_point] = {font_path: unicode_drawn_preprocessed}
                        else:
                            unicode_supported_fonts_drawn_dict[unicode_point][font_path] = unicode_drawn_preprocessed
            with open('unicode_supported_fonts_drawn_dict.pkl', 'wb+') as f:
                pickle.dump(unicode_supported_fonts_drawn_dict, f)
        print(len(unicode_supported_fonts_drawn_dict))
        # Pick one font per codepoint via the greedy cover, also cached on disk.
        if not os.path.exists('unicode_minimum_font_dict.pkl'):
            minimum_used_fonts_dict = self.generate_minimum_used_fonts_dict(unicode_supported_fonts_drawn_dict)
            with open('unicode_minimum_font_dict.pkl', 'wb+') as f:
                pickle.dump(minimum_used_fonts_dict, f)
        else:
            with open('unicode_minimum_font_dict.pkl', 'rb') as f:
                minimum_used_fonts_dict = pickle.load(f)
        print(len(minimum_used_fonts_dict))
        # Run each codepoint's chosen glyph image through the network in batches.
        unicode_batch = {}
        unicode_feature_vectors_dict = {}
        for unicode_point in unicode_supported_fonts_drawn_dict.keys():
            unicode_batch[unicode_point] = unicode_supported_fonts_drawn_dict[unicode_point][
                minimum_used_fonts_dict[unicode_point]]
            if len(unicode_batch) == batch_size:
                unicode_batch_forward = loaded_model.predict(tf.convert_to_tensor(list(unicode_batch.values())))
                unicode_feature_vectors_dict.update(zip(unicode_batch.keys(), unicode_batch_forward))
                unicode_batch = {}
        if len(unicode_batch) > 0:
            # Flush the final, possibly smaller batch.
            unicode_batch_forward = loaded_model.predict(tf.convert_to_tensor(list(unicode_batch.values())))
            unicode_feature_vectors_dict.update(zip(unicode_batch.keys(), unicode_batch_forward))
        print(len(unicode_feature_vectors_dict))
        return unicode_feature_vectors_dict
    def extract_and_save_features(self):
        features_dict = self._load_model_load_data_and_extract_features(self.mp, self.bs, self.mmp)
        with open(generate_features_dict_file_path(self.sd), 'wb+') as f:
            pickle.dump(features_dict, f)
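# Minimal usage sketch (the paths here are hypothetical placeholders): point the extractor at a
# directory containing model.json, model.h5 and model_info.pkl, then write the features pickle
# that the clusterer classes below read back via generate_features_dict_file_path:
#   extractor = EfficientNetFeatureExtractor(model_path="./model_dir", batch_size=64,
#                                            save_dir="./", multifont_mapping_path="./multifont_mapping.pkl")
#   extractor.extract_and_save_features()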
def cosine_similarity_matrix_cpu(features: np.ndarray) -> np.ndarray:
    start_time = time.time()
    n, k = features.shape
    a_ = features.reshape((n, 1, 1, k))
    b_ = features.reshape((1, n, k, 1))
    # [n, n] pairwise dot products via broadcasting
    dot_products = np.matmul(a_, b_).reshape((n, n))
    # [n]
    norms = np.linalg.norm(features, axis=1)
    norms_a = norms.reshape((n, 1))
    norms_b = norms.reshape((1, n))
    # [n, n]
    norms_prod = np.multiply(norms_a, norms_b)
    cosine_similarity = dot_products / norms_prod
    elapsed_seconds = time.time() - start_time
    print("Time spent on similarity matrix: " + str(datetime.timedelta(seconds=elapsed_seconds)))
    return cosine_similarity
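# A minimal alternative sketch (an addition, not part of the original pipeline): the same
# [n, n] matrix can be computed by normalizing rows once and taking a single BLAS-backed
# matmul, which avoids materializing the broadcast (n, n, 1, 1) product above.
def _cosine_similarity_matrix_cpu_matmul(features: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(features, axis=1, keepdims=True)  # [n, 1] row norms
    normalized = features / norms  # rows rescaled to unit length
    return normalized @ normalized.T  # [n, n] pairwise cosine similarities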
def cosine_similarity_matrix_gpu_stream(features_cpu: np.ndarray, batch_size: int) -> np.ndarray:
    start_time = time.time()
    n, k = features_cpu.shape
    # [n, k] cpu -> gpu
    features = cp.asarray(features_cpu)
    # [n] gpu
    norms_gpu = cp.linalg.norm(features, axis=1)
    norms_a = norms_gpu.reshape((n, 1))
    norms_b = norms_gpu.reshape((1, n))
    # [n, k] gpu, shaped for broadcasting batched matmuls
    a_ = features.reshape((n, 1, 1, k))
    b_ = features.reshape((1, n, k, 1))
    cosine_similarities = []
    # Ceiling division so a trailing partial batch is not dropped.
    num_batches = (n + batch_size - 1) // batch_size
    for batch_i in range(num_batches):
        start = batch_i * batch_size
        end = min((batch_i + 1) * batch_size, n)
        norm_a_batch = norms_a[start:end]
        a_batch = a_[start:end]
        norms_prod_batch = cp.multiply(norm_a_batch, norms_b)
        dot_products_batch = cp.matmul(a_batch, b_).reshape((end - start, n))
        # Move each finished slab back to host memory as float16 to keep the result small.
        cosine_similarities.append(cp.asnumpy(dot_products_batch / norms_prod_batch).astype(np.float16))
    cosine_similarities = np.concatenate(cosine_similarities)
    elapsed_seconds = time.time() - start_time
    print("Time spent on similarity matrix: " + str(datetime.timedelta(seconds=elapsed_seconds)))
    return cosine_similarities
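# Usage sketch (assumes a CUDA device visible to cupy): generate_sim_mat_float16_gpu below
# calls this with batch_size=1000. Larger batches cut kernel-launch overhead at the cost of
# GPU memory, and the float16 host-side result halves memory relative to float32:
#   sim = cosine_similarity_matrix_gpu_stream(ordered_features, batch_size=1000)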
class _AbstractFeatureClusterer:
    def __init__(self, save_dir: str):
        self.sd = save_dir
        with open(generate_features_dict_file_path(self.sd), 'rb') as f:
            self.features_dict = pickle.load(f)

    def cluster_features_into_equivalence_classes(self, features_dict: dict) -> (dict, dict):
        """
        Partition codepoints into equivalence classes based on their feature vectors.
        :param features_dict: keys are codepoint integers, values are numpy arrays of identical dimension
        :return: mapping of codepoints to cluster id, mapping of cluster ids to lists of codepoints
        """
        raise NotImplementedError

    def find_and_save_equivalence_classes(self):
        codepoints_cluster_map, cluster_codepoints_map = self.cluster_features_into_equivalence_classes(
            self.features_dict)
        with open(generate_codepoints_cluster_map_file_path(self.sd), 'wb+') as f:
            pickle.dump(codepoints_cluster_map, f)
        with open(generate_cluster_codepoints_map_file_path(self.sd), 'wb+') as f:
            pickle.dump(cluster_codepoints_map, f)
def _dfs_traverse(adj_mat: np.ndarray, visited: np.ndarray, node: int, trace: list):
    # Recursive DFS; note that very large components can hit Python's default recursion limit.
    visited[node] = True
    trace.append(node)
    neighbors_indices = np.nonzero(adj_mat[node])[0]
    for i in neighbors_indices:
        if not visited[i]:
            _dfs_traverse(adj_mat, visited, i, trace)
def _find_nontrivial_components_from_adjacency_matrix(adj_mat: np.ndarray) -> list:
    """
    a simple algorithm to iterate through all unvisited nodes and DFS through their neighbors
    to find components; ignores components with only one node
    :param adj_mat: [n, n] boolean adjacency matrix
    :return: list of lists of node ids, corresponding to connected components
    """
    n, _ = adj_mat.shape
    visited = np.zeros(n, dtype=bool)
    connected_components = []
    for node in range(n):
        if not visited[node]:
            trace = []
            _dfs_traverse(adj_mat, visited, node, trace)
            # Keep only nontrivial components, as documented above.
            if len(trace) > 1:
                connected_components.append(trace)
    return connected_components
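# Worked example: with self-loops on the diagonal and a single edge between nodes 0 and 1,
#   _find_nontrivial_components_from_adjacency_matrix(
#       np.array([[1, 1, 0], [1, 1, 0], [0, 0, 1]], dtype=bool))
# returns [[0, 1]]; the isolated node 2 forms a trivial component and is skipped.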
class _AbstractGraphClusterer(_AbstractFeatureClusterer):
    """
    cluster unicodes by
    1. computing an adjacency matrix from pairwise comparison
    2. finding connected components with more than one node
    """

    def __init__(self, save_dir: str):
        super().__init__(save_dir)

    def _generate_adjacency_matrix(self, features: np.ndarray):
        raise NotImplementedError

    def cluster_features_into_equivalence_classes(self, features_dict: dict) -> (dict, dict):
        """
        Cluster codepoints as the connected components of the similarity graph.
        :param features_dict: keys are codepoint integers, values are numpy arrays of identical dimension
        :return: mapping of codepoints to cluster id, a mapping of cluster ids to lists of codepoints
        """
        # cpu [n]
        ordered_codepoints = list(features_dict.keys())
        ordered_features = np.stack(list(features_dict.values()))
        adj_mat = self._generate_adjacency_matrix(ordered_features)
        connected_components = _find_nontrivial_components_from_adjacency_matrix(adj_mat)
        codepoints_cluster_map = {}
        cluster_codepoints_map = {}
        for cluster_id, component in enumerate(connected_components):
            cluster_codepoints_map[cluster_id] = []
            for node in component:
                codepoint = ordered_codepoints[node]
                codepoints_cluster_map[codepoint] = cluster_id
                cluster_codepoints_map[cluster_id].append(codepoint)
        # Codepoints outside any nontrivial component become singleton clusters with negative
        # ids; the +1 offset keeps them from colliding with nonnegative cluster id 0.
        for index, codepoint in enumerate(ordered_codepoints):
            if codepoint not in codepoints_cluster_map:
                cluster_id = -(index + 1)
                codepoints_cluster_map[codepoint] = cluster_id
                cluster_codepoints_map[cluster_id] = [codepoint]
        return codepoints_cluster_map, cluster_codepoints_map
class CosineSimGraphClustererCPU(_AbstractGraphClusterer):
    def __init__(self, save_dir: str, threshold: float, epsilon: float):
        super().__init__(save_dir)
        self.threshold = threshold
        self.epsilon = epsilon

    def _generate_adjacency_matrix(self, features: np.ndarray):
        # Two codepoints are adjacent when their cosine similarity clears the threshold.
        self.cosine_similarity = cosine_similarity_matrix_cpu(features)
        adjacency_matrix = (self.cosine_similarity > self.threshold)
        return adjacency_matrix
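# Minimal usage sketch: __init__ loads the features pickle that the extractor wrote into
# save_dir, after which clustering is a single call (run_dfs_on_consortium below instead
# passes its own features_dict into the same method):
#   clusterer = CosineSimGraphClustererCPU(save_dir="./", threshold=.92, epsilon=1e-5)
#   codepoints_cluster_map, cluster_codepoints_map = \
#       clusterer.cluster_features_into_equivalence_classes(clusterer.features_dict)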
def greedy_clique_cluster_heuristic(features_dict: dict, target_mean: float, target_std: float,
                                    target_threshold: float):
    """
    Ground Truth Mean: 0.767601996660232
    Ground Truth Std: 0.0935437240793059
    Optimized parameters: .72, .01, .94
    """
    ordered_codepoints = list(features_dict.keys())
    ordered_features = np.stack(list(features_dict.values()))
    similarity_matrix = cosine_similarity_matrix_cpu(ordered_features)
    start_time = time.time()
    # Phase 1: greedily grow cliques; a node joins the first cluster where it is
    # similar enough to every existing member.
    cluster_id_indices_map = {}
    for node_index in range(len(ordered_codepoints)):
        added_to_existing_cluster = False
        for cluster_id, cluster_indices in cluster_id_indices_map.items():
            edges = similarity_matrix[node_index, cluster_indices]
            if (edges > target_threshold).all():
                cluster_id_indices_map[cluster_id].append(node_index)
                added_to_existing_cluster = True
                break
        if not added_to_existing_cluster:
            cluster_id_indices_map[len(cluster_id_indices_map)] = [node_index]
    print("Time spent on finding cliques: " + str(datetime.timedelta(seconds=time.time() - start_time)))
    start_time = time.time()
    # Phase 2: merge pairs of clusters whose cross-cluster edges look like one tight cluster.
    id_upperbound = len(cluster_id_indices_map)
    for cluster_id in range(id_upperbound):
        if cluster_id not in cluster_id_indices_map:
            continue
        cluster_indices = cluster_id_indices_map[cluster_id]
        for merge_candidate_id in range(cluster_id + 1, id_upperbound):
            if merge_candidate_id not in cluster_id_indices_map:
                continue
            candidate_indices = cluster_id_indices_map[merge_candidate_id]
            # https://stackoverflow.com/questions/22927181/selecting-specific-rows-and-columns-from-numpy-array
            edges = similarity_matrix[np.ix_(cluster_indices, candidate_indices)]
            edges_mean = np.mean(edges)
            edges_std = np.std(edges)
            if edges_mean > target_mean and edges_std < target_std:
                cluster_id_indices_map[cluster_id].extend(candidate_indices)
                del cluster_id_indices_map[merge_candidate_id]
    print("Time spent on merging clusters: " + str(datetime.timedelta(seconds=time.time() - start_time)))
    predicted_cluster_codepoints_map = {cluster_id: [ordered_codepoints[index] for index in indices]
                                        for cluster_id, indices in cluster_id_indices_map.items()}
    predicted_codepoints_cluster_map = {ordered_codepoints[index]: cluster_id
                                        for cluster_id, indices in cluster_id_indices_map.items()
                                        for index in indices}
    return predicted_codepoints_cluster_map, predicted_cluster_codepoints_map
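# Usage sketch with the tuned parameters from the docstring above:
#   cp_map, cl_map = greedy_clique_cluster_heuristic(features_dict, target_mean=.72,
#                                                    target_std=.01, target_threshold=.94)
# run_clique_on_consortium below runs exactly this against the consortium ground truth.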
def component_into_cliques_heuristic(features_dict: dict, target_mean: float, target_std: float,
                                     target_threshold: float):
    """Unfinished heuristic: intended to split each connected component into cliques."""
    ordered_codepoints = list(features_dict.keys())
    ordered_features = np.stack(list(features_dict.values()))
    # The adjacency matrix construction was never implemented, so fail loudly instead of
    # passing adj_mat=None into the component finder.
    raise NotImplementedError
def baseline_heuristic(features_dict: dict):
    # Trivial baseline: every codepoint is its own singleton cluster.
    predicted_codepoints_cluster_map = {}
    predicted_cluster_codepoints_map = {}
    for cluster_id, codepoint in enumerate(features_dict.keys()):
        predicted_codepoints_cluster_map[codepoint] = cluster_id
        predicted_cluster_codepoints_map[cluster_id] = [codepoint]
    return predicted_codepoints_cluster_map, predicted_cluster_codepoints_map
def _test_dfs_components_finder():
    import matplotlib.pyplot as plt
    import networkx as nx

    def show_graph_with_labels(adjacency_matrix):
        n, _ = adjacency_matrix.shape
        rows, cols = np.where(adjacency_matrix == 1)
        edges = zip(rows.tolist(), cols.tolist())
        gr = nx.Graph()
        gr.add_edges_from(edges)
        nx.draw_networkx(gr, node_size=250, labels={i: str(i) for i in range(n)}, with_labels=True)
        plt.show()

    adj_mat = np.zeros(shape=[10, 10], dtype=bool)
    for i in range(10):
        adj_mat[i, i] = True
    # component 1: 1, 2, 3
    adj_mat[1, 3] = True
    adj_mat[3, 1] = True
    adj_mat[1, 2] = True
    adj_mat[2, 1] = True
    adj_mat[2, 3] = True
    adj_mat[3, 2] = True
    # component 2: 4, 5
    adj_mat[4, 5] = True
    adj_mat[5, 4] = True
    # component 3: 6, 7, 8, 9
    adj_mat[6, 7] = True
    adj_mat[7, 6] = True
    adj_mat[7, 8] = True
    adj_mat[8, 7] = True
    adj_mat[8, 9] = True
    adj_mat[9, 8] = True
    print(_find_nontrivial_components_from_adjacency_matrix(adj_mat))
    show_graph_with_labels(adj_mat)
def run_dfs_on_consortium(num_random_additions: int = 0):
    supported_consortium_feature_vectors, ground_truth_consortium_codepoints_map = generate_data_for_experiment(
        num_random_additions)
    clusterer = CosineSimGraphClustererCPU(save_dir="./", threshold=.92, epsilon=1e-5)
    predicted_codepoints_cluster_map, predicted_cluster_codepoints_map = clusterer.cluster_features_into_equivalence_classes(
        features_dict=supported_consortium_feature_vectors)
    mean_iou = calculate_mean_iou(predicted_codepoints_cluster_map, predicted_cluster_codepoints_map,
                                  ground_truth_consortium_codepoints_map)
    mean_precision = calculate_mean_precision(predicted_codepoints_cluster_map, predicted_cluster_codepoints_map,
                                              ground_truth_consortium_codepoints_map)
    print(f"Mean IOU: {mean_iou}")
    print(f"Mean precision: {mean_precision}")
def run_clique_on_consortium(num_random_additions: int = 0):
    supported_consortium_feature_vectors, ground_truth_consortium_codepoints_map = generate_data_for_experiment(
        num_random_additions)
    predicted_codepoints_cluster_map, predicted_cluster_codepoints_map = \
        greedy_clique_cluster_heuristic(supported_consortium_feature_vectors, 0.72, 0.01, 0.94)
    mean_iou = calculate_mean_iou(predicted_codepoints_cluster_map, predicted_cluster_codepoints_map,
                                  ground_truth_consortium_codepoints_map)
    mean_precision = calculate_mean_precision(predicted_codepoints_cluster_map, predicted_cluster_codepoints_map,
                                              ground_truth_consortium_codepoints_map)
    print(f"Mean IOU: {mean_iou}")
    print(f"Mean precision: {mean_precision}")
def run_baseline_on_consortium(num_random_additions: int = 0):
    supported_consortium_feature_vectors, ground_truth_consortium_codepoints_map = generate_data_for_experiment(
        num_random_additions)
    predicted_codepoints_cluster_map, predicted_cluster_codepoints_map = baseline_heuristic(
        supported_consortium_feature_vectors)
    mean_iou = calculate_mean_iou(predicted_codepoints_cluster_map, predicted_cluster_codepoints_map,
                                  ground_truth_consortium_codepoints_map)
    mean_precision = calculate_mean_precision(predicted_codepoints_cluster_map, predicted_cluster_codepoints_map,
                                              ground_truth_consortium_codepoints_map)
    print(f"Mean IOU: {mean_iou}")
    print(f"Mean precision: {mean_precision}")
def generate_sim_mat_float16_gpu():
    with open('features_dict_file.pkl', 'rb') as f:
        supported_unicode_feature_vectors = pickle.load(f)
    ordered_features = np.stack(list(supported_unicode_feature_vectors.values()))
    similarity_matrix = cosine_similarity_matrix_gpu_stream(ordered_features, batch_size=1000)
    np.save('sim_mat_float16.npy', similarity_matrix)


def load_sim_mat_float16():
    return np.load('sim_mat_float16.npy')
if __name__ == "__main__":
    # _test_dfs_components_finder()
    generate_sim_mat_float16_gpu()
    # import argparse
    # parser = argparse.ArgumentParser()
    # parser.add_argument('-hc', '--heuristic_choice', action='store', type=str, default='dfs')
    # parser.add_argument('-nra', '--num_rand_add', action='store', type=int, default=0)
    # args = parser.parse_args()
    # num_rand_add = args.num_rand_add
    # hc = args.heuristic_choice
    #
    # t0 = time.time()
    # print("Heuristic: " + hc)
    # print("Random Additions: " + str(num_rand_add))
    #
    # if hc == "clique":
    #     run_clique_on_consortium(num_rand_add)
    # elif hc == "dfs":
    #     run_dfs_on_consortium(num_rand_add)
    # elif hc == "baseline":
    #     run_baseline_on_consortium(num_rand_add)
    # print("Total Elapsed Time: " + str(datetime.timedelta(seconds=time.time() - t0)))