Work Area: Add Collocation/Colligation Extractor - Filter results - Node/Collocation length
BLKSerene committed May 19, 2024
1 parent 0d95bf9 commit 6f8aa74
Showing 21 changed files with 574 additions and 367 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -26,6 +26,7 @@
- Utils: Add PyThaiNLP's Han-solo
- Utils: Add Stanza's Sindhi part-of-speech tagger
- Utils: Add VADER's sentiment analyzers
- Work Area: Add Collocation/Colligation Extractor - Filter results - Node/Collocation length
- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic

### ✨ Improvements
@@ -40,7 +41,7 @@

### ❌ Removals
- Menu: Remove Settings - Measures - Statistical Significance - Welch's t-test
- Work Area: Remove Collocation Extractor / Colligation Extractor / Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test
- Work Area: Remove Collocation/Colligation/Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test
- Utils: Remove Dostoevsky's Russian sentiment analyzer

### ⏫ Dependency Changes
7 changes: 4 additions & 3 deletions tests/test_dependency_parser.py
@@ -62,7 +62,7 @@ def update_gui(err_msg, results):

for (
head, dependent, dependency_relation, dependency_len,
sentence_display, sentence_search,
sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search,
no_sentence, len_sentences, file
) in results:
# Head
@@ -75,8 +75,9 @@ def update_gui(err_msg, results):
assert isinstance(dependency_len, int)

# Sentence
assert all(sentence_display)
assert all(sentence_search)
assert all(sentence_tokens_raw)
assert all(sentence_tokens_fig)
assert all(sentence_tokens_search)

# Sentence No.
assert no_sentence >= 1
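The updated test unpacks each dependency-parsing result into raw, figure, and search variants of the sentence tokens and asserts that none of them contain empty elements. Below is a minimal sketch of that assertion pattern; the result tuple is made up for illustration and does not come from Wordless's actual worker output.

```python
# Sketch of the assertion pattern in the updated test.
# The sample tuple is illustrative, not real Wordless output.

def check_result(result):
    (
        head, dependent, dependency_relation, dependency_len,
        sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search,
        no_sentence, len_sentences, file
    ) = result

    # Head, dependent, and relation must be non-empty
    assert head and dependent and dependency_relation
    # Dependency length is a (possibly negative) integer offset
    assert isinstance(dependency_len, int)
    # All three sentence token variants must contain no empty tokens
    assert all(sentence_tokens_raw)
    assert all(sentence_tokens_fig)
    assert all(sentence_tokens_search)
    # Sentence numbering is 1-based
    assert 1 <= no_sentence <= len_sentences

check_result((
    'likes', 'She', 'nsubj', -1,
    ['She', 'likes', 'apples', '.'],
    ['She', 'likes', 'apples', '.'],
    ['She', 'likes', 'apples', '.'],
    1, 1, 'example.txt'
))
```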
8 changes: 4 additions & 4 deletions tests/tests_file_area/test_file_area_file_types.py
@@ -153,7 +153,7 @@ def update_gui_file_types(err_msg, new_files):
file_text = new_files[0]['text']

tokens = file_text.to_token_texts()
tags = file_text.get_token_properties('tag')
tags = file_text.get_token_properties('tag', flat = True)

print(tokens)

@@ -192,7 +192,7 @@ def update_gui_file_types(err_msg, new_files):
file_text_tgt = new_files[1]['text']

tokens_src = file_text_src.to_token_texts()
tags_src = file_text_src.get_token_properties('tag')
tags_src = file_text_src.get_token_properties('tag', flat = True)

# Source files
print(file_text_src.lang)
@@ -204,7 +204,7 @@

# Target files
tokens_tgt = file_text_tgt.to_token_texts()
tags_tgt = file_text_tgt.get_token_properties('tag')
tags_tgt = file_text_tgt.get_token_properties('tag', flat = True)

print(file_text_tgt.lang)
print(tokens_tgt)
@@ -226,7 +226,7 @@ def update_gui_tags(err_msg, new_files):
file_text = new_files[0]['text']

tokens = file_text.to_token_texts()
tags = file_text.get_token_properties('tag')
tags = file_text.get_token_properties('tag', flat = True)

print(tokens)
print(tags)
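These tests now call `get_token_properties('tag', flat = True)`. The exact behaviour of the flag lives in Wordless's `wl_texts` module; as an assumption, a flat lookup would collapse the paragraph/sentence nesting into a single list of per-token property values. The sketch below uses a hypothetical `TextStub` class, not the real `Wl_Text` API.

```python
# Hypothetical sketch of what a flat token-property lookup might do.
# TextStub mimics a text whose tokens are nested as paragraphs > sentences,
# which is an assumption about Wordless's internal structure.

class TextStub:
    def __init__(self, tokens_multilevel):
        # tokens_multilevel: list of paragraphs, each a list of sentences,
        # each a list of (token, properties) pairs
        self.tokens_multilevel = tokens_multilevel

    def get_token_properties(self, name, flat = False):
        nested = [
            [[props.get(name) for _, props in sentence] for sentence in para]
            for para in self.tokens_multilevel
        ]

        if not flat:
            return nested

        # Flatten paragraphs and sentences into one list of property values
        return [prop for para in nested for sentence in para for prop in sentence]

text = TextStub([[[('I', {'tag': '_PRP'}), ('run', {'tag': '_VBP'})]]])
print(text.get_token_properties('tag', flat = True))  # ['_PRP', '_VBP']
```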
4 changes: 2 additions & 2 deletions tests/tests_utils/test_detection.py
@@ -290,11 +290,11 @@ def test_lingua():
langs_exceptions = {'bokmal', 'ganda', 'nynorsk', 'slovene'}
langs_extra = set()

for lang in lingua.Language.all():
for lang in lingua.Language.all(): # pylint: disable=no-member
if lang.name.lower() not in langs | langs_exceptions:
langs_extra.add(lang.name)

print(f"Extra languages: {', '.join(langs_extra)}\n")
print(f"\nExtra languages: {', '.join(langs_extra)}\n")

assert langs_extra == {'BOSNIAN', 'MAORI', 'SHONA', 'SOMALI', 'SOTHO', 'TSONGA', 'XHOSA'}

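The lingua test iterates over `lingua.Language.all()` (with a pylint `no-member` suppression, since the enum members are generated at runtime) and records any languages the detector supports that Wordless does not map. Here is a hedged sketch of that set-difference check using made-up language inventories in place of lingua's and Wordless's real ones.

```python
# Sketch of the set-difference check in test_lingua, with made-up
# language inventories standing in for the real ones.

langs_supported = {'english', 'french', 'german'}          # languages the app maps
langs_exceptions = {'bokmal', 'nynorsk'}                    # known aliases to ignore
langs_detector = ['ENGLISH', 'FRENCH', 'GERMAN', 'MAORI']   # what the detector reports

langs_extra = set()

for lang in langs_detector:
    if lang.lower() not in langs_supported | langs_exceptions:
        langs_extra.add(lang)

print(f"\nExtra languages: {', '.join(sorted(langs_extra))}\n")
assert langs_extra == {'MAORI'}
```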
20 changes: 9 additions & 11 deletions wordless/wl_colligation_extractor.py
@@ -909,7 +909,7 @@ def run(self):
colligations_freqs_file = {}
colligations_freqs_file_all = {}

text = wl_token_processing.wl_process_tokens(
text = wl_token_processing.wl_process_tokens_colligation_extractor(
self.main, file['text'],
token_settings = settings['token_settings']
)
@@ -976,23 +976,21 @@ def run(self):
tags_left = []
tags_right = []

tags = wl_texts.to_tokens(wl_texts.get_token_properties(tokens, 'tag'), lang = file['lang'])

if window_left < 0 < window_right:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
tags_left = tags[max(0, i + window_left) : i]
tags_right = tags[i + ngram_size : i + ngram_size + window_right]
tags_left = text.tags[max(0, i + window_left) : i]
tags_right = text.tags[i + ngram_size : i + ngram_size + window_right]
else:
# Span positions (Left)
for position in range(max(0, i + window_left), i):
if i_unit_start <= position <= i_unit_end:
tags_left.append(tags[position])
tags_left.append(text.tags[position])

# Span positions (Right)
for position in range(i + ngram_size, i + ngram_size + window_right):
if i_unit_start <= position <= i_unit_end:
tags_right.append(tags[position])
tags_right.append(text.tags[position])

for j, collocate in enumerate(reversed(tags_left)):
if wl_matching.check_context(
@@ -1024,12 +1022,12 @@ def run(self):
elif window_left < 0 and window_right < 0:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
tags_left = tags[max(0, i + window_left) : max(0, i + window_right + 1)]
tags_left = text.tags[max(0, i + window_left) : max(0, i + window_right + 1)]
else:
# Span positions (Left)
for position in range(max(0, i + window_left), max(0, i + window_right + 1)):
if i_unit_start <= position <= i_unit_end:
tags_left.append(tags[position])
tags_left.append(text.tags[position])

for j, collocate in enumerate(reversed(tags_left)):
if wl_matching.check_context(
@@ -1047,12 +1045,12 @@
elif window_left > 0 and window_right > 0:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
tags_right = tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right]
tags_right = text.tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right]
else:
# Span positions (Right)
for position in range(i + ngram_size + window_left - 1, i + ngram_size + window_right):
if i_unit_start <= position <= i_unit_end:
tags_right.append(tags[position])
tags_right.append(text.tags[position])

for j, collocate in enumerate(tags_right):
if wl_matching.check_context(
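The colligation extractor now reads part-of-speech tags from a precomputed `text.tags` sequence instead of rebuilding them per file, and gathers collocate tags from windows to the left and right of each node. The sketch below illustrates only the first branch in the diff (a window that straddles the node, with no limit on searching); the variable names are plain Python stand-ins, not Wordless's objects.

```python
# Sketch of collecting left/right collocate tags around a node, mirroring
# the straddling-window slice in the diff. All values are illustrative.

def window_tags(tags, i, ngram_size, window_left, window_right):
    """Return tags in the spans [i + window_left, i) and
    [i + ngram_size, i + ngram_size + window_right)."""
    # Window positions are negative to the left of the node and positive
    # to the right, e.g. window_left = -2, window_right = 2.
    tags_left = tags[max(0, i + window_left) : i]
    tags_right = tags[i + ngram_size : i + ngram_size + window_right]

    return tags_left, tags_right

tags = ['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT']
# Node is the single token at index 3 ('VERB')
print(window_tags(tags, i = 3, ngram_size = 1, window_left = -2, window_right = 2))
# (['ADJ', 'NOUN'], ['DET', 'NOUN'])
```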
2 changes: 1 addition & 1 deletion wordless/wl_collocation_extractor.py
@@ -906,7 +906,7 @@ def run(self):
collocations_freqs_file = {}
collocations_freqs_file_all = {}

text = wl_token_processing.wl_process_tokens(
text = wl_token_processing.wl_process_tokens_ngram_generator(
self.main, file['text'],
token_settings = settings['token_settings']
)
53 changes: 42 additions & 11 deletions wordless/wl_concordancer.py
@@ -727,9 +727,10 @@ def run(self):
no_sentence = bisect.bisect(offsets_sentences, i)
no_para = bisect.bisect(offsets_paras, i)

# Search in Results (Node)
node_tokens_search = list(ngram)
node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(ngram))
node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
ngram,
punc_mark = True
))

# Width Unit
if settings['generation_settings']['width_unit'] == self.tr('Character'):
@@ -811,16 +812,46 @@ def run(self):
left_tokens_raw = tokens[offset_start:i]
right_tokens_raw = tokens[i + len_search_term : offset_end]

# Search in results (Left & Right)
left_tokens_search = copy.deepcopy(left_tokens_raw)
right_tokens_search = copy.deepcopy(right_tokens_raw)
if settings['token_settings']['punc_marks']:
node_tokens_search = list(ngram)

# Remove empty tokens for searching in results
left_tokens_search = [token for token in copy.deepcopy(left_tokens_raw) if token]
right_tokens_search = [token for token in copy.deepcopy(right_tokens_raw) if token]
# Convert trailing punctuation marks, if any, to separate tokens for searching
else:
node_tokens_search = []
left_tokens_search = []
right_tokens_search = []

for token in list(ngram):
node_tokens_search.append(token)

if token.punc_mark:
node_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

# Remove empty tokens for searching in results
left_tokens_search = [token for token in left_tokens_search if token]
right_tokens_search = [token for token in right_tokens_search if token]
for token in copy.deepcopy(left_tokens_raw):
if token:
left_tokens_search.append(token)

left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(left_tokens_raw))
right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(right_tokens_raw))
if token.punc_mark:
left_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

for token in copy.deepcopy(right_tokens_raw):
if token:
right_tokens_search.append(token)

if token.punc_mark:
right_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
left_tokens_raw,
punc_mark = True
))
right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
right_tokens_raw,
punc_mark = True
))

# Left
concordance_line.append([left_tokens_raw, left_tokens_search])
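When punctuation marks are not kept as separate tokens, the concordancer now splits a token's trailing punctuation back out into its own search token so that queries can still match it, while the display text keeps the punctuation attached. Below is a minimal sketch of that conversion; the `Token` class is a hypothetical stand-in for `wl_texts.Wl_Token`, and its `punc_mark` attribute is only assumed here from the diff.

```python
# Sketch of converting trailing punctuation marks into separate tokens
# for searching, as the diff does when punctuation marks are hidden.
# Token is a hypothetical stand-in for wl_texts.Wl_Token.

from dataclasses import dataclass

@dataclass
class Token:
    text: str
    lang: str = 'eng_us'
    punc_mark: str = ''   # trailing punctuation folded into this token, if any

def to_search_tokens(tokens):
    tokens_search = []

    for token in tokens:
        if not token.text:
            continue  # drop empty tokens so they cannot match searches

        tokens_search.append(token)

        # Re-emit the folded punctuation as its own token
        if token.punc_mark:
            tokens_search.append(Token(token.punc_mark, lang = token.lang))

    return tokens_search

tokens = [Token('Hello', punc_mark = ','), Token('world', punc_mark = '!')]
print([token.text for token in to_search_tokens(tokens)])
# ['Hello', ',', 'world', '!']
```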
46 changes: 31 additions & 15 deletions wordless/wl_concordancer_parallel.py
@@ -408,9 +408,10 @@ def run(self):
parallel_unit_no = bisect.bisect(offsets_paras, j)

if parallel_unit_no not in parallel_units:
# Save all nodes if multiple nodes are found in the same parallel unit
parallel_units[parallel_unit_no] = [[] for _ in range(len_files)]

parallel_units[parallel_unit_no][i] = ngram
parallel_units[parallel_unit_no][i].append(ngram)
# Search for additions & deletions
else:
for j, para in enumerate(text.tokens_multilevel):
@@ -428,22 +429,37 @@
len_parallel_units = len(offsets_paras)

for parallel_unit_no, parallel_unit_nodes in parallel_units.items():
node = parallel_unit_nodes[i]
nodes = parallel_unit_nodes[i]

if parallel_unit_no <= len_parallel_units:
parallel_unit_tokens_raw = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))
parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(parallel_unit_tokens_raw))
# Search in Results
parallel_unit_tokens_search = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))

# Highlight node if found
if node:
len_node = len(node)

for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit_tokens_search, len_node)):
if ngram == tuple(node):
parallel_unit_tokens_raw[j] = f'<span style="color: {node_color}; font-weight: bold;">{parallel_unit_tokens_raw[j]}'
parallel_unit_tokens_raw[j + len_node - 1] += '</span>'
parallel_unit = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))

if settings['token_settings']['punc_marks']:
parallel_unit_tokens_search = copy.deepcopy(parallel_unit)
# Convert trailing punctuation marks, if any, to separate tokens for searching
else:
parallel_unit_tokens_search = []

for token in copy.deepcopy(parallel_unit):
parallel_unit_tokens_search.append(token)

if token.punc_mark:
parallel_unit_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
parallel_unit,
punc_mark = True
))

# Highlight nodes if found
if nodes:
for node in nodes:
len_node = len(node)

for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit, len_node)):
if ngram == tuple(node):
parallel_unit_tokens_raw[j] = f'<span style="color: {node_color}; font-weight: bold;">{parallel_unit_tokens_raw[j]}'
parallel_unit_tokens_raw[j + len_node - 1] += '</span>'
else:
parallel_unit_tokens_raw = []
parallel_unit_tokens_search = []
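The parallel concordancer now stores every node found in a parallel unit (a list per file) rather than only one, and highlights each occurrence by wrapping the matching n-gram in an HTML span. The sketch below runs that highlighting pass over plain string tokens; the span markup mirrors the diff, while the small `ngrams` helper and the default colour stand in for `wl_nlp_utils.ngrams` and the configured node colour.

```python
# Sketch of highlighting every node n-gram inside a parallel unit, as the
# updated parallel concordancer does. Tokens are plain strings here;
# Wordless uses its own token objects and wl_nlp_utils.ngrams.

def ngrams(tokens, n):
    return zip(*(tokens[i:] for i in range(n)))

def highlight_nodes(tokens, nodes, node_color = '#FF0000'):
    tokens_html = list(tokens)

    for node in nodes:
        len_node = len(node)

        # Match against the original tokens, modify the HTML copy
        for j, ngram in enumerate(ngrams(tokens, len_node)):
            if ngram == tuple(node):
                tokens_html[j] = (
                    f'<span style="color: {node_color}; font-weight: bold;">'
                    f'{tokens_html[j]}'
                )
                tokens_html[j + len_node - 1] += '</span>'

    return tokens_html

unit = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(highlight_nodes(unit, nodes = [['the', 'cat'], ['the', 'mat']]))
```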