import pandas as pd
from tqdm import tqdm
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations
import os
from TELF.pre_processing.Vulture import Vulture
from TELF.pre_processing.Vulture.modules import SubstitutionOperator


class VocabularyConsolidator:
    """
    A class for processing and replacing similar keys in dictionaries based on
    Levenshtein distance and suffix processing.
    """

    def __init__(self):
        self.suffixes = ['ingly', 'edly', 'fully', 'ness', 'less', 'ment', 'tion', 'sion',
                         'ship', 'able', 'ible', 'al', 'ial', 'ed', 'ing', 'ly', 'es', 's',
                         'er', 'tor']
        self.suffixes.sort(key=len, reverse=True)

    @staticmethod
    def levenshtein_distance(s1, s2, length_1, length_2):
        """
        Calculate the Levenshtein distance between two strings s1 and s2.

        Parameters
        ----------
        s1 : str
            The first string.
        s2 : str
            The second string.
        length_1 : int
            The length of the first string.
        length_2 : int
            The length of the second string.

        Returns
        -------
        int
            The Levenshtein distance between s1 and s2.
        """
        if length_1 < length_2:
            return VocabularyConsolidator.levenshtein_distance(s2, s1, length_2, length_1)
        if length_2 == 0:
            return length_1
        previous_row = range(length_2 + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

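    # Quick sanity check for levenshtein_distance above (illustrative only, not part of the
    # original file): VocabularyConsolidator.levenshtein_distance("kitten", "sitting", 6, 7)
    # returns 3 (substitute 'k'->'s', substitute 'e'->'i', insert 'g').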
    def prefix_process_key(self, key):
        """
        Preprocess a key by removing the first matching suffix from it, if any.

        Parameters
        ----------
        key : str
            The key to preprocess.

        Returns
        -------
        str
            The preprocessed key with the first matched suffix removed.
        """
        for suffix in self.suffixes:
            if key.endswith(suffix):
                key = key[:-len(suffix)]
                break
        return key

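    # For example (illustrative only): with the default suffix list,
    # prefix_process_key("networks") returns "network" ('s' is stripped) and
    # prefix_process_key("consolidation") returns "consolida" ('tion' is stripped).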
    def compare_keys(self,
                     key1,
                     key2,
                     threshold=0.80,
                     edge_range=0.1):
        """
        Compare two keys to determine if they are similar based on their Levenshtein distance and a similarity threshold.

        Parameters
        ----------
        key1 : str
            The first key.
        key2 : str
            The second key.
        threshold : float
            The similarity threshold.
        edge_range : float
            The range around the threshold to consider for preprocessing.

        Returns
        -------
        tuple
            A tuple containing a boolean indicating similarity and the similarity score.
        """
        length_1, length_2 = len(key1), len(key2)
        max_len = max(length_1, length_2)
        dist = self.levenshtein_distance(key1, key2, length_1, length_2)
        similarity = (max_len - dist) / max_len

        if threshold <= similarity <= (threshold + edge_range):
            key1_processed = self.prefix_process_key(key1)
            key2_processed = self.prefix_process_key(key2)
            return (key1_processed == key2_processed, similarity)
        return (similarity >= (threshold + edge_range), similarity)

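    # Worked example for compare_keys above (illustrative only, with the defaults
    # threshold=0.80 and edge_range=0.1): 'network' vs. 'networks' has Levenshtein
    # distance 1 and similarity (8 - 1) / 8 = 0.875, which falls inside the 0.80-0.90
    # edge range, so both keys are suffix-stripped to 'network' and the pair is reported
    # as similar. 'clustering' vs. 'clusterings' has similarity 10/11 ~= 0.909 >= 0.90
    # and is reported as similar without any suffix stripping.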
    def process_chunk(self,
                      pairs,
                      key_frequency,
                      threshold=0.90):
        """
        Process a chunk of key pairs to find similar keys.

        Parameters
        ----------
        pairs : list of tuple
            List of key pairs to compare.
        key_frequency : Counter
            Frequency count of all keys.
        threshold : float
            The similarity threshold.

        Returns
        -------
        list
            List of tuples containing less preferred key, preferred key, and similarity score.
        """
        results = []
        for key1, key2 in pairs:
            similar_bool, similar_score = self.compare_keys(key1, key2, threshold)
            if similar_bool:
                preferred_key = key1 if key_frequency[key1] > key_frequency[key2] else key2
                less_preferred_key = key2 if preferred_key == key1 else key1
                results.append((less_preferred_key, preferred_key, similar_score))
        return results

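    # For example (illustrative only): with key_frequency = Counter({'network': 5, 'networks': 2}),
    # process_chunk([('network', 'networks')], key_frequency, threshold=0.80) returns
    # [('networks', 'network', 0.875)], i.e. the rarer spelling maps onto the more frequent one.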
    def replace_similar_keys_levenshtein(self,
                                         dict_list,
                                         group_by_first_letter=True,
                                         group_by_length_difference=True,
                                         max_length_difference=2,
                                         min_chars=4,
                                         changes_made_save_path=None,
                                         similarity_threshold=0.80,
                                         n_jobs=-1):
        """
        Replace similar keys in a list of dictionaries based on their similarity, optionally grouping them by first letter or length difference.

        Parameters
        ----------
        dict_list : list of dict
            List of dictionaries to process.
        group_by_first_letter : bool
            Whether to group keys by their first letter.
        group_by_length_difference : bool
            Whether to group keys by length difference.
        max_length_difference : int
            The maximum allowable length difference for grouping.
        min_chars : int
            Minimum character count to consider a key.
        changes_made_save_path : str
            Path to save the changes made.
        similarity_threshold : float
            The threshold for considering keys as similar.
        n_jobs : int
            Number of concurrent jobs to run. Uses all available CPUs if set to -1.

        Returns
        -------
        tuple
            A tuple containing the modified list of dictionaries and a DataFrame with the changes made.
        """
        all_keys = [key for d in dict_list for key in d.keys()]
        key_frequency = Counter(all_keys)
        similar_keys = {}
        changes = []

        sorted_keys = sorted(set(all_keys))
        grouped_keys = {}

        # Group keys by the first character
        if group_by_first_letter:
            for key in sorted_keys:
                first_char = key[0]
                if first_char not in grouped_keys:
                    grouped_keys[first_char] = []
                grouped_keys[first_char].append(key)
        else:
            # If first-letter grouping is disabled, compare all keys within a single group
            grouped_keys['all'] = list(sorted_keys)

        # Further grouping by length difference within groups formed by the first letter
        if group_by_length_difference:
            final_grouped_keys = {}
            for key_group, keys in grouped_keys.items():
                temp_grouped_keys = {}
                keys_sorted_by_length = sorted(keys, key=len)

                # Don't pair keys shorter than min_chars; skip ahead to the first key that is long enough
                for index, key in enumerate(keys_sorted_by_length):
                    if len(key) >= min_chars:
                        break
                else:
                    index = -1
                if index != -1:
                    keys_sorted_by_length = keys_sorted_by_length[index:]

                # Only pair terms whose lengths differ by no more than max_length_difference
                for key in keys_sorted_by_length:
                    added = False
                    for group_key in list(temp_grouped_keys.keys()):
                        if abs(len(group_key) - len(key)) <= max_length_difference:
                            temp_grouped_keys[group_key].append(key)
                            added = True
                            break
                    if not added:
                        temp_grouped_keys[key] = [key]
                final_grouped_keys[key_group] = temp_grouped_keys

            # Flatten the groups correctly
            grouped_keys = {group_key: vals for subdict in final_grouped_keys.values() for group_key, vals in subdict.items()}

        # Generate all pairs for comparison
        all_pairs = [pair for key_list in grouped_keys.values() for pair in combinations(key_list, 2)]

        num_cpus = os.cpu_count() if n_jobs == -1 else min(n_jobs, os.cpu_count())
        chunk_size = int(len(all_pairs) / num_cpus) + 1
        chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]
        progress = tqdm(total=len(chunks), desc="Processing Chunks")

        with ThreadPoolExecutor(max_workers=min(num_cpus, len(chunks))) as executor:
            results = list(executor.map(self.process_chunk, chunks, [key_frequency] * len(chunks), [similarity_threshold] * len(chunks)))
            for chunk_result in results:
                for less_preferred_key, preferred_key, similar_score in chunk_result:
                    similar_keys[less_preferred_key] = (preferred_key, similar_score)
                progress.update(1)

        progress.close()

        for dict_ in dict_list:
            for less_preferred_key, (preferred_key, score) in similar_keys.items():
                if less_preferred_key in dict_:
                    if isinstance(dict_[less_preferred_key], int):
                        dict_[preferred_key] = dict_.get(preferred_key, 0) + dict_.pop(less_preferred_key)
                    elif isinstance(dict_[less_preferred_key], str):
                        dict_[preferred_key] = dict_.get(preferred_key, '') + dict_.pop(less_preferred_key)
                    changes.append({
                        'Previous Key': less_preferred_key,
                        'New Key': preferred_key,
                        'Similarity Score': score
                    })

        changes_df = pd.DataFrame(changes)

        if changes_made_save_path:
            changes_df.to_csv(changes_made_save_path, index=False)

        return dict_list, changes_df

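    # Note (added for clarity): the dictionaries in dict_list are modified in place;
    # the same list is also returned alongside a DataFrame describing every substitution.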
    def unique_words_by_id(self, input_dict):
        """
        Create a list of dictionaries with unique words from the input dictionary.

        Parameters
        ----------
        input_dict : dict of {int: str}
            A dictionary where each key is an integer ID and each value is a string of words.

        Returns
        -------
        list of dict
            A list where each dictionary contains unique words from the input, preserving order.
        """
        output_list = []
        for key, word_string in input_dict.items():
            unique_words_dict = dict.fromkeys(word_string.split(), "")
            output_list.append(unique_words_dict)
        return output_list


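    # For example (illustrative only):
    # unique_words_by_id({0: "graph graphs graph"}) returns [{'graph': '', 'graphs': ''}]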
    def consolidate_terms(self,
                          vocabulary=None,
                          texts=None,
                          vulture=None,
                          changes_made_save_path=None,
                          operated_text_save_path=None):
        """
        Consolidate terms in a vocabulary or a collection of texts using a Vulture pre-processing engine.

        Parameters
        ----------
        vocabulary : list of str, optional
            A list of vocabulary terms to process.
        texts : dict of {int: str}, optional
            A dictionary mapping document IDs to the texts to process.
        vulture : Vulture, optional
            An instance of the Vulture pre-processing engine.
        changes_made_save_path : str, optional
            Path to save the changes made.
        operated_text_save_path : str, optional
            Path to save the substituted text after word changes.

        Returns
        -------
        list
            Processed texts with consolidated terms.
        """
        if vocabulary and texts:
            raise ValueError("Specify either vocabulary or texts, not both.")

        if vocabulary:
            raise NotImplementedError("Not implemented yet")

        if texts:
            output_list = self.unique_words_by_id(texts)
            consolidated_vocab, df_changes = self.replace_similar_keys_levenshtein(output_list, changes_made_save_path=changes_made_save_path)
            corpus_substitutions = {}
            for p, n in zip(df_changes['Previous Key'], df_changes['New Key']):
                corpus_substitutions[p] = n

            if not vulture:
                vulture = Vulture(n_jobs=-1, verbose=True)

            if operated_text_save_path:
                split_path = operated_text_save_path.split(os.path.sep)
                save_path = (os.path.sep).join(split_path[:-1])
                save_file = split_path[-1]
            else:
                save_path = None
                save_file = None
            output = vulture.operate(texts,
                                     steps=[SubstitutionOperator(document_substitutions=None,
                                                                 corpus_substitutions=corpus_substitutions,
                                                                 document_priority=False)],
                                     save_path=save_path,
                                     file_name=save_file)
            return output
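

# Minimal usage sketch (added for illustration; not part of the original file). It exercises
# only the Levenshtein-based key consolidation, which has no Vulture dependency; the example
# dictionaries below are made up.
if __name__ == "__main__":
    consolidator = VocabularyConsolidator()
    vocab_counts = [
        {'network': 3, 'networks': 1},
        {'network': 2, 'matrix': 5},
    ]
    merged, changes = consolidator.replace_similar_keys_levenshtein(vocab_counts)
    # 'networks' is folded into the more frequent 'network', so the first dictionary
    # becomes {'network': 4}; `changes` records each substitution and its similarity score.
    print(merged)
    print(changes)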