
Commit

Work Area: Fix Dependency Parser - analysis of files whose first token is a punctuation mark
BLKSerene committed Dec 10, 2023
1 parent 347b5b6 commit 011847d
Showing 15 changed files with 63 additions and 48 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -21,6 +21,7 @@
## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - ??/??/2023
### 📌 Bugfixes
- Utils: Fix downloading of Stanza models
+- Work Area: Fix Dependency Parser - analysis of files whose first token is a punctuation mark

### ⏫ Dependency Changes
- Dependencies: Remove jieba
@@ -128,7 +129,7 @@

### 📌 Bugfixes
- Work Area: Remove all invalid XML characters when exporting tables to Excel workbooks
-- Work Area: Fix Parallel Concordancer - Search Settings - Search for additions and deletions
+- Work Area: Fix Parallel Concordancer - searching for additions and deletions

### ⏫ Dependency Changes
- Dependencies: Add python-mecab-ko
@@ -0,0 +1 @@
"The first token is a punctuation mark.
10 changes: 5 additions & 5 deletions tests/test_colligation_extractor.py
@@ -47,14 +47,14 @@ def test_colligation_extractor():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

main.settings_custom['colligation_extractor']['generation_settings']['test_statistical_significance'] = test_statistical_significance
main.settings_custom['colligation_extractor']['generation_settings']['measure_bayes_factor'] = measure_bayes_factor
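The switch from a three-way to a four-way rotation is what pulls an extra test file into coverage: when i % 4 is 2 or 3, the new else branch selects no_files = [3] or [4], so the rotation now reaches a fourth file alongside the original single-file and multiple-file cases. A small illustrative sketch of the selection cycle (not part of the test suite; only the arithmetic from the hunk above is reproduced):

    # Illustrative only: reproduce the file-selection cycle used by the updated tests
    for i in range(8):
        # Single file
        if i % 4 == 0:
            no_files = [0]
        # Multiple files
        elif i % 4 == 1:
            no_files = [1, 2]
        # Miscellaneous: i % 4 + 1 evaluates to 3 or 4
        else:
            no_files = [i % 4 + 1]

        print(i, no_files)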
10 changes: 5 additions & 5 deletions tests/test_collocation_extractor.py
@@ -47,14 +47,14 @@ def test_collocation_extractor():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

main.settings_custom['collocation_extractor']['generation_settings']['test_statistical_significance'] = test_statistical_significance
main.settings_custom['collocation_extractor']['generation_settings']['measure_bayes_factor'] = measure_bayes_factor
14 changes: 10 additions & 4 deletions tests/test_concordancer.py
@@ -28,16 +28,16 @@ def test_concordancer():
main.settings_custom['concordancer']['search_settings']['multi_search_mode'] = True
main.settings_custom['concordancer']['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS

-for i in range(3):
+for i in range(4):
# Single file
if i == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
elif i == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i + 1])

global main_global # pylint: disable=global-statement
main_global = main
@@ -80,8 +80,14 @@ def update_gui_table(err_msg, concordance_lines):
assert node_text_search
# Left & Right
assert left_text or right_text
+assert left_text == [] or all(left_text)
+assert right_text == [] or all(right_text)
assert left_text_raw or right_text_raw
+assert left_text_raw == [] or all(left_text_raw)
+assert right_text_raw == [] or all(right_text_raw)
assert left_text_search or right_text_search
+assert left_text_search == [] or all(left_text_search)
+assert right_text_search == [] or all(right_text_search)

# Sentiment
assert sentiment == 'No language support' or -1 <= sentiment <= 1
2 changes: 1 addition & 1 deletion tests/test_concordancer_parallel.py
@@ -30,7 +30,7 @@ def test_concordancer_parallel():
if i == 0:
wl_test_init.select_test_files(main, no_files = [0, 1, 2])
elif i == 1:
-wl_test_init.select_test_files(main, no_files = [1, 2, 3])
+wl_test_init.select_test_files(main, no_files = [1, 2, 3, 4])

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")

8 changes: 4 additions & 4 deletions tests/test_dependency_parser.py
@@ -28,16 +28,16 @@ def test_dependency_parser():
main.settings_custom['dependency_parser']['search_settings']['multi_search_mode'] = True
main.settings_custom['dependency_parser']['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS

-for i in range(3):
+for i in range(4):
# Single file
if i == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
elif i == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i + 1])

global main_global # pylint: disable=global-statement
main_global = main
14 changes: 7 additions & 7 deletions tests/test_keyword_extractor.py
@@ -46,24 +46,24 @@ def test_keyword_extractor():
fillvalue = 'none'
)):
# Single observed file & single reference file
-if i % 5 == 0:
+if i % 6 == 0:
wl_test_init.select_test_files(main, no_files = [0])
wl_test_init.select_test_files(main, no_files = [0], ref = True)
# Single observed file & multiple reference files
-elif i % 5 == 1:
+elif i % 6 == 1:
wl_test_init.select_test_files(main, no_files = [0])
wl_test_init.select_test_files(main, no_files = [1, 2], ref = True)
# Multiple observed files & single reference file
-elif i % 5 == 2:
+elif i % 6 == 2:
wl_test_init.select_test_files(main, no_files = [1, 2])
wl_test_init.select_test_files(main, no_files = [0], ref = True)
# Multiple observed files & multiple reference files
-elif i % 5 == 3:
+elif i % 6 == 3:
wl_test_init.select_test_files(main, no_files = [1, 2])
wl_test_init.select_test_files(main, no_files = [1, 2], ref = True)
-# TTR = 1
-elif i % 5 == 4:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 6 - 1])
wl_test_init.select_test_files(main, no_files = [0], ref = True)

main.settings_custom['keyword_extractor']['generation_settings']['test_statistical_significance'] = test_statistical_significance
10 changes: 5 additions & 5 deletions tests/test_ngram_generator.py
@@ -39,14 +39,14 @@ def test_ngram_generator():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

global main_global # pylint: disable=global-statement
main_global = main
8 changes: 4 additions & 4 deletions tests/test_profiler.py
@@ -32,16 +32,16 @@
def test_profiler():
main = wl_test_init.Wl_Test_Main()

-for i in range(3):
+for i in range(4):
# Single file
if i == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
elif i == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i + 1])

global main_global # pylint: disable=global-statement
main_global = main
10 changes: 5 additions & 5 deletions tests/test_wordlist_generator.py
@@ -36,14 +36,14 @@ def test_wordlist_generator():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

global main_global # pylint: disable=global-statement
main_global = main
10 changes: 7 additions & 3 deletions wordless/wl_concordancer.py
@@ -777,7 +777,7 @@ def run(self):

len_context_right += len_token_next

-# Search in Results (Left & Right)
+# Search in results (Left & Right)
text_search_left = copy.deepcopy(context_left)
text_search_right = copy.deepcopy(context_right)

@@ -821,7 +821,7 @@ def run(self):
context_left = text.tokens_flat_punc_marks_merged[max(0, i - width_left_token) : i]
context_right = text.tokens_flat_punc_marks_merged[i + len_search_term : i + len_search_term + width_right_token]

-# Search in Results (Left & Right)
+# Search in results (Left & Right)
if settings['token_settings']['punc_marks']:
text_search_left = copy.deepcopy(context_left)
text_search_right = copy.deepcopy(context_right)
@@ -858,14 +858,18 @@ def run(self):
context_left = text.tokens_flat_punc_marks_merged[offset_start:i]
context_right = text.tokens_flat_punc_marks_merged[i + len_search_term : offset_end]

-# Search in Results (Left & Right)
+# Search in results (Left & Right)
if settings['token_settings']['punc_marks']:
text_search_left = copy.deepcopy(context_left)
text_search_right = copy.deepcopy(context_right)
else:
text_search_left = tokens[offset_start:i]
text_search_right = tokens[i + len_search_term : offset_end]

+# Remove empty tokens for searching in results
+text_search_left = [token for token in text_search_left if token]
+text_search_right = [token for token in text_search_right if token]

context_left = wl_nlp_utils.escape_tokens(context_left)
context_right = wl_nlp_utils.escape_tokens(context_right)

3 changes: 2 additions & 1 deletion wordless/wl_dependency_parser.py
@@ -535,7 +535,8 @@ def run(self):
offset_end = offsets_sentences[no_sentence]

sentence_display = text.tokens_flat_punc_marks_merged[offsets_sentences[no_sentence - 1]:offset_end]
-sentence_search = sentence
+# Remove empty tokens for searching in results
+sentence_search = [token for token in sentence if token]

# Head
results[-1].append(head)
4 changes: 3 additions & 1 deletion wordless/wl_nlp/wl_dependency_parsing.py
@@ -96,6 +96,9 @@ def wl_dependency_parse_text(main, inputs, lang, dependency_parser):
def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged):
dependencies = []

+# Discard empty tokens since they are useless for dependency parsing and spacy.tokens.Doc does not accept empty strings
+inputs = [token for token in inputs if token]
+
if tagged:
inputs, tags = wl_matching.split_tokens_tags(main, inputs)

@@ -142,7 +145,6 @@ def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged):
token.head - token.id if token.head > 0 else 0
))

-# Put back tokens and tags
if tagged:
for i, dependency in enumerate(dependencies):
token, head, dependency_relation, dependency_dist = dependency
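The comment added above states the constraint directly: spacy.tokens.Doc does not accept empty strings, so empty tokens are discarded before parsing. A minimal sketch of that filtering step, assuming spaCy is installed, using a blank English pipeline and made-up tokens (this is not Wordless code):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank('en')

    # Made-up tokens; the empty string stands in for a token stripped out upstream
    tokens = ['"', 'The', 'first', 'token', '', 'is', 'a', 'punctuation', 'mark', '.']

    # Discard empty tokens before constructing the Doc, mirroring the fix above
    tokens = [token for token in tokens if token]

    doc = Doc(nlp.vocab, words = tokens)
    print([token.text for token in doc])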
4 changes: 2 additions & 2 deletions wordless/wl_nlp/wl_lemmatization.py
@@ -157,16 +157,16 @@ def wl_lemmatize_text(main, inputs, lang, lemmatizer):
return lemmas

def wl_lemmatize_tokens(main, inputs, lang, lemmatizer, tagged):
-empty_offsets = []
lemma_tokens = []
lemmas = []
+empty_offsets = []

if tagged:
inputs, tags = wl_matching.split_tokens_tags(main, inputs)
else:
tags = [''] * len(inputs)

-# Record positions of empty tokens and tags
+# Record positions of empty tokens and tags since spacy.tokens.Doc does not accept empty strings
for i, token in reversed(list(enumerate(inputs))):
if not token.strip():
empty_offsets.append(i)
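The lemmatizer handles the same constraint differently: rather than dropping empty tokens for good, it records their offsets so they can be accounted for after spaCy has processed the non-empty ones. The diff only shows the offsets being recorded, so the removal and reinsertion in the following sketch are illustrative assumptions, with a lower-casing step standing in for the real lemmatizer:

    tokens = ['The', '', 'first', '', 'token']
    tags = ['_DT', '', '_JJ', '', '_NN']

    empty_offsets = []

    # Record positions of empty tokens and tags, iterating in reverse so that
    # deleting an item does not shift the offsets yet to be visited
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)
            del tokens[i]
            del tags[i]

    # Stand-in for lemmatizing the remaining non-empty tokens
    lemmas = [token.lower() for token in tokens]

    # Assumed restore step: put empty placeholders back at their original positions
    for i in sorted(empty_offsets):
        lemmas.insert(i, '')

    print(lemmas)  # ['the', '', 'first', '', 'token']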
