Commit ae59b3f

committed
initial vocabulary consolidator
1 parent 7f3643c commit ae59b3f

File tree

3 files changed: +975 -2 lines changed


Diff for: TELF/pre_processing/Vulture/modules/doc_op_substitution.py

+1 -2
@@ -69,7 +69,6 @@ def _document_substitution(self, text, substitution_map):
         """
         replaced_text = text
         for search, replace in substitution_map.items():
-            replaced_text = re.sub(r'\b{}\b'.format(search), replace, replaced_text)
-
+            replaced_text = re.sub(r'\b{}\b'.format(re.escape(search)), replace, replaced_text)
         return {"replaced_text":replaced_text}
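
The one-line change above wraps each substitution key in re.escape(), so keys containing regex metacharacters are matched literally instead of being interpreted as pattern syntax. Below is a small standalone sketch (not part of the commit; the key "3.14" and the sample text are invented) showing the difference:

import re

text = "pi is 3.14 and the build id is 3514"
term = "3.14"  # hypothetical substitution key containing a regex metacharacter ('.')

# Without re.escape (the behaviour before this commit): '.' matches any
# character, so the unrelated token "3514" is rewritten as well.
print(re.sub(r'\b{}\b'.format(term), "PI", text))
# -> pi is PI and the build id is PI

# With re.escape (the behaviour after this commit): only the literal "3.14" matches.
print(re.sub(r'\b{}\b'.format(re.escape(term)), "PI", text))
# -> pi is PI and the build id is 3514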

@@ -0,0 +1,344 @@
import pandas as pd
from tqdm import tqdm
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations
import os
from TELF.pre_processing.Vulture import Vulture
from TELF.pre_processing.Vulture.modules import SubstitutionOperator

class VocabularyConsolidator:
    """
    A class for processing and replacing similar keys in dictionaries based on Levenshtein distance and suffix processing.
    """

    def __init__(self):
        self.suffixes = ['ingly', 'edly', 'fully', 'ness', 'less', 'ment', 'tion', 'sion',
                         'ship', 'able', 'ible', 'al', 'ial', 'ed', 'ing', 'ly', 'es', 's',
                         'er', 'tor']
        self.suffixes.sort(key=len, reverse=True)

    @staticmethod
    def levenshtein_distance(s1, s2, length_1, length_2):
        """
        Calculate the Levenshtein distance between two strings s1 and s2.

        Parameters
        ----------
        s1 : str
            The first string.
        s2 : str
            The second string.
        length_1 : int
            The length of the first string.
        length_2 : int
            The length of the second string.

        Returns
        -------
        int
            The Levenshtein distance between s1 and s2.
        """
        if length_1 < length_2:
            return VocabularyConsolidator.levenshtein_distance(s2, s1, length_2, length_1)
        if length_2 == 0:
            return length_1
        previous_row = range(length_2 + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def prefix_process_key(self,
                           key):
        """
        Preprocess a key by removing the first matching suffix, if any.

        Parameters
        ----------
        key : str
            The key to preprocess.

        Returns
        -------
        str
            The preprocessed key with the first matched suffix removed.
        """
        original_key = key
        for suffix in self.suffixes:
            if key.endswith(suffix):
                key = key[:-len(suffix)]
                break
        return key

    def compare_keys(self,
                     key1,
                     key2,
                     threshold=0.80,
                     edge_range=0.1):
        """
        Compare two keys to determine if they are similar based on their Levenshtein distance and a similarity threshold.

        Parameters
        ----------
        key1 : str
            The first key.
        key2 : str
            The second key.
        threshold : float
            The similarity threshold.
        edge_range : float
            The range around the threshold to consider for preprocessing.

        Returns
        -------
        tuple
            A tuple containing a boolean indicating similarity and the similarity score.
        """
        length_1, length_2 = len(key1), len(key2)
        max_len = max(length_1, length_2)
        dist = self.levenshtein_distance(key1, key2, length_1, length_2)
        similarity = (max_len - dist) / max_len

        if threshold <= similarity <= (threshold + edge_range):
            key1_processed = self.prefix_process_key(key1)
            key2_processed = self.prefix_process_key(key2)
            return (key1_processed == key2_processed, similarity)
        return (similarity >= (threshold + edge_range), similarity)

    def process_chunk(self,
                      pairs,
                      key_frequency,
                      threshold=0.90):
        """
        Process a chunk of key pairs to find similar keys.

        Parameters
        ----------
        pairs : list of tuple
            List of key pairs to compare.
        key_frequency : Counter
            Frequency count of all keys.
        threshold : float
            The similarity threshold.

        Returns
        -------
        list
            List of tuples containing less preferred key, preferred key, and similarity score.
        """
        results = []
        for key1, key2 in pairs:
            similar_bool, similar_score = self.compare_keys(key1, key2, threshold)
            if similar_bool:
                preferred_key = key1 if key_frequency[key1] > key_frequency[key2] else key2
                less_preferred_key = key2 if preferred_key == key1 else key1
                results.append((less_preferred_key, preferred_key, similar_score))
        return results

    def replace_similar_keys_levenshtein(self,
                                         dict_list,
                                         group_by_first_letter=True,
                                         group_by_length_difference=True,
                                         max_length_difference=2,
                                         min_chars=4,
                                         changes_made_save_path=None,
                                         similarity_threshold=0.80,
                                         n_jobs=-1):
        """
        Replace similar keys in a list of dictionaries based on their similarity, optionally grouping them by first letter or length difference.

        Parameters
        ----------
        dict_list : list of dict
            List of dictionaries to process.
        group_by_first_letter : bool
            Whether to group keys by their first letter.
        group_by_length_difference : bool
            Whether to group keys by length difference.
        max_length_difference : int
            The maximum allowable length difference for grouping.
        min_chars : int
            Minimum character count to consider a key.
        changes_made_save_path : str
            Path to save the changes made.
        similarity_threshold : float
            The threshold for considering keys as similar.
        n_jobs : int
            Number of concurrent jobs to run. Uses all available CPUs if set to -1.

        Returns
        -------
        tuple
            A tuple containing the modified list of dictionaries and a DataFrame with the changes made.
        """
        all_keys = [key for d in dict_list for key in d.keys()]
        key_frequency = Counter(all_keys)
        similar_keys = {}
        changes = []

        sorted_keys = sorted(set(all_keys))
        grouped_keys = {}

        # Group keys by the first character
        if group_by_first_letter:
            for key in sorted_keys:
                first_char = key[0]
                if first_char not in grouped_keys:
                    grouped_keys[first_char] = []
                grouped_keys[first_char].append(key)

        # Further grouping by length difference within groups formed by the first letter
        if group_by_length_difference:
            final_grouped_keys = {}
            for key_group, keys in grouped_keys.items():
                temp_grouped_keys = {}
                keys_sorted_by_length = sorted(keys, key=len)

                # Don't pair-check keys shorter than min_chars; skip ahead to the first key that is long enough
                for index, key in enumerate(keys_sorted_by_length):
                    if len(key) >= min_chars:
                        break
                else:
                    index = -1
                if index != -1:
                    keys_sorted_by_length = keys_sorted_by_length[index:]

                # Only pair the terms that are not more different than max_length_difference
                for key in keys_sorted_by_length:
                    added = False
                    for group_key in list(temp_grouped_keys.keys()):
                        if abs(len(group_key) - len(key)) <= max_length_difference:
                            temp_grouped_keys[group_key].append(key)
                            added = True
                            break
                    if not added:
                        temp_grouped_keys[key] = [key]
                final_grouped_keys[key_group] = temp_grouped_keys

            # Flatten the groups correctly
            grouped_keys = {group_key: vals for subdict in final_grouped_keys.values() for group_key, vals in subdict.items()}

        # Generate all pairs for comparison
        all_pairs = [pair for key_list in grouped_keys.values() for pair in combinations(key_list, 2)]

        num_cpus = os.cpu_count() if n_jobs == -1 else min(n_jobs, os.cpu_count())
        chunk_size = int(len(all_pairs) / num_cpus) + 1
        chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]
        progress = tqdm(total=len(chunks), desc="Processing Chunks")

        with ThreadPoolExecutor(max_workers=min(num_cpus, len(chunks))) as executor:
            results = list(executor.map(self.process_chunk, chunks, [key_frequency]*len(chunks), [similarity_threshold]*len(chunks)))
            for chunk_result in results:
                for less_preferred_key, preferred_key, similar_score in chunk_result:
                    similar_keys[less_preferred_key] = (preferred_key, similar_score)
                progress.update(1)

        progress.close()

        for dict_ in dict_list:
            for less_preferred_key, (preferred_key, score) in similar_keys.items():
                if less_preferred_key in dict_:
                    if isinstance(dict_[less_preferred_key], int):
                        dict_[preferred_key] = dict_.get(preferred_key, 0) + dict_.pop(less_preferred_key)
                    elif isinstance(dict_[less_preferred_key], str):
                        dict_[preferred_key] = dict_.get(preferred_key, '') + dict_.pop(less_preferred_key)
                    changes.append({
                        'Previous Key': less_preferred_key,
                        'New Key': preferred_key,
                        'Similarity Score': score
                    })

        changes_df = pd.DataFrame(changes)

        if changes_made_save_path:
            changes_df.to_csv(changes_made_save_path, index=False)

        return dict_list, changes_df

    def unique_words_by_id(self,
                           input_dict):
        """
        Create a list of dictionaries with unique words from the input dictionary.

        Parameters
        ----------
        input_dict : dict of {int: str}
            A dictionary where each key is an integer ID and each value is a string of words.

        Returns
        -------
        list of dict
            A list where each dictionary contains unique words from the input, preserving order.
        """
        output_list = []
        for key, word_string in input_dict.items():
            unique_words_dict = dict.fromkeys(word_string.split(), "")
            output_list.append(unique_words_dict)
        return output_list

    def consolidate_terms(self,
                          vocabulary=None,
                          texts=None,
                          vulture=None,
                          changes_made_save_path=None,
                          operated_text_save_path=None):
        """
        Consolidate terms in a vocabulary or a list of texts using a Vulture pre-processing engine.

        Parameters
        ----------
        vocabulary : list of str, optional
            A list of vocabulary terms to process.
        texts : list of str, optional
            A list of texts to process.
        vulture : Vulture, optional
            An instance of the Vulture pre-processing engine.
        changes_made_save_path : str, optional
            Path to save the changes made.
        operated_text_save_path : str, optional
            Path to save the substituted text after word changes.

        Returns
        -------
        list
            Processed texts with consolidated terms.
        """
        if vocabulary and texts:
            raise ValueError("Specify either vocabulary or texts, not both.")

        if vocabulary:
            raise ValueError("Not implemented yet")

        if texts:
            output_list = self.unique_words_by_id(texts)
            consolidated_vocab, df_changes = self.replace_similar_keys_levenshtein(output_list, changes_made_save_path=changes_made_save_path)
            corpus_substitutions = {}
            for p, n in zip(df_changes['Previous Key'], df_changes['New Key']):
                corpus_substitutions[p] = n

            if not vulture:
                vulture = Vulture(n_jobs=-1, verbose=True)

            if operated_text_save_path:
                split_path = operated_text_save_path.split(os.path.sep)
                save_path = (os.path.sep).join(split_path[:-1])
                save_file = split_path[-1]
            else:
                save_path = None
                save_file = None
            output = vulture.operate(texts,
                                     steps=[SubstitutionOperator(document_substitutions=None,
                                                                 corpus_substitutions=corpus_substitutions,
                                                                 document_priority=False)],
                                     save_path=save_path,
                                     file_name=save_file)
            return output
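
Below is a minimal usage sketch for the new class (not part of the commit). The import path is an assumption, since the new file's location is not shown in this view, and the toy per-document vocabularies are invented:

from TELF.pre_processing.Vulture.modules import VocabularyConsolidator  # assumed import path

consolidator = VocabularyConsolidator()

# One dict per document, mapping token -> count. Keys whose similarity clears
# the threshold are merged onto the more frequent spelling, and integer values
# are summed during the merge.
vocab_per_doc = [
    {"consolidator": 3, "consolidators": 1, "graph": 2},
    {"consolidators": 2, "graph": 1},
]

merged_dicts, changes_df = consolidator.replace_similar_keys_levenshtein(
    vocab_per_doc,
    similarity_threshold=0.80,
    changes_made_save_path=None,  # set to a CSV path to log the merges
    n_jobs=1,
)

print(changes_df)    # columns: 'Previous Key', 'New Key', 'Similarity Score'
print(merged_dicts)  # "consolidator" folded into "consolidators" with counts summed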

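A closing note on the new compare_keys(): when the similarity score lands inside the edge window [threshold, threshold + edge_range], the decision falls back to suffix stripping via prefix_process_key(). A small illustration with invented word pairs, using the class added in this commit (import path assumed as above):

consolidator = VocabularyConsolidator()

# "computes" vs "computed": distance 1, max length 8, similarity 7/8 = 0.875,
# which falls inside the edge window [0.80, 0.90]. Both strip to "comput"
# ('es' and 'ed' are in self.suffixes), so the pair is reported as similar.
print(consolidator.compare_keys("computes", "computed", threshold=0.80, edge_range=0.1))
# -> (True, 0.875)

# "cluster" vs "clusters": similarity is also 0.875, but the stripped stems
# differ ("clust" after removing 'er' vs "cluster" after removing 's'),
# so the pair is not merged.
print(consolidator.compare_keys("cluster", "clusters", threshold=0.80, edge_range=0.1))
# -> (False, 0.875)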