Utils: Update sentence tokenizers, word tokenizers, syllable tokenizers, part-of-speech taggers, lemmatizers, dependency parsers, and sentiment analyzers
BLKSerene committed May 16, 2024
1 parent f5693ac commit 93f9cb0
Showing 61 changed files with 2,809 additions and 2,877 deletions.
29 changes: 15 additions & 14 deletions pylintrc
@@ -24,23 +24,23 @@ extension-pkg-whitelist=
[MESSAGES CONTROL]

disable=
-# C103, C114, C115, C116
+# C0103, C0114, C0115, C0116
invalid-name,
missing-module-docstring,
missing-class-docstring,
missing-function-docstring,
-# C301, C302
+# C0301, C0302
line-too-long,
too-many-lines,
-# C413, C415
+# C0413, C0415
wrong-import-position,
import-outside-toplevel,

-# R401
+# R0401
cyclic-import,
-# R801
+# R0801
duplicate-code,
-# R901, R902, R903, R904, R912, R913, R914, R915, R916
+# R0901, R0902, R0903, R0904, R0912, R0913, R0914, R0915, R0916, R0917
too-many-ancestors,
too-many-instance-attributes,
too-few-public-methods,
@@ -50,21 +50,22 @@ disable=
too-many-locals,
too-many-statements,
too-many-branches,
-# R1702, R1705, R1723, R1724
+too-many-positional,
+# R1702, R1705, R1720, R1723, R1724
too-many-nested-blocks,
no-else-return,
+no-else-raise,
no-else-break,
no-else-continue,

-# W201, W212
+# W0201, W0212
attribute-defined-outside-init,
protected-access,
# W511
fixme,
-# W621
+# W0603, W0621
+global-statement,
redefined-outer-name,

-# E401
+# E0401
import-error,
-# E1121
-too-many-function-args
+# E0606
+possibly-used-before-assignment,
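
A note on the comment fixes above: pylint message IDs are a letter followed by four digits, so C103, R401, W621, and E401 were typos for C0103, R0401, W0621, and E0401; only the comments change, since disable= uses the symbolic names. Newly disabling global-statement (W0603) repo-wide also explains why the per-line # pylint: disable=global-statement pragmas disappear from the test files below. A small verification sketch, my addition, assuming only that pylint is installed (--help-msg is pylint's built-in message lookup):

    import subprocess

    # Symbols newly added to disable= in this commit
    symbols = ['too-many-positional', 'global-statement', 'possibly-used-before-assignment']

    for symbol in symbols:
        result = subprocess.run(
            ['python', '-m', 'pylint', f'--help-msg={symbol}'],
            capture_output=True, text=True, check=False,
        )
        # On success pylint prints ':symbol (ID): ...'; otherwise the symbol
        # is unknown to the installed pylint version
        known = f':{symbol} (' in result.stdout
        print(f'{symbol}: {"known" if known else "unknown"}')
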
87 changes: 87 additions & 0 deletions tests/files/file_area/misc/[eng_gb] Tagged.txt

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions tests/test_colligation_extractor.py
@@ -86,14 +86,18 @@ def update_gui(err_msg, colligations_freqs_files, colligations_stats_files):
assert node
# Collocate
assert collocate

# Frequency (span positions)
for freqs_file in freqs_files:
assert len(freqs_file) == 10

# Frequency (total)
assert sum((sum(freqs_file) for freqs_file in freqs_files)) >= 0

# p-value
for _, p_value, _, _ in stats_files:
assert p_value is None or 0 <= p_value <= 1

# Number of Files Found
assert len([freqs_file for freqs_file in freqs_files[:-1] if sum(freqs_file)]) >= 1

…
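
A note on the p-value check above: each stats entry is a 4-tuple whose second slot holds the p-value, and measures that provide no significance test leave it as None. A minimal, self-contained illustration (the other three slots are placeholders; this diff does not identify them):

    stats_files = [
        (6.63, 0.0099, 0.0, 0.35),  # a measure that yields a p-value
        (0.42, None, 0.0, 0.10),    # a measure with no significance test
    ]

    for _, p_value, _, _ in stats_files:
        assert p_value is None or 0 <= p_value <= 1
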
4 changes: 4 additions & 0 deletions tests/test_collocation_extractor.py
@@ -86,14 +86,18 @@ def update_gui(err_msg, collocations_freqs_files, collocations_stats_files):
assert node
# Collocate
assert collocate

# Frequency (span positions)
for freqs_file in freqs_files:
assert len(freqs_file) == 10

# Frequency (total)
assert sum((sum(freqs_file) for freqs_file in freqs_files)) >= 0

# p-value
for _, p_value, _, _ in stats_files:
assert p_value is None or 0 <= p_value <= 1

# Number of Files Found
assert len([freqs_file for freqs_file in freqs_files[:-1] if sum(freqs_file)]) >= 1

…
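
The len(freqs_file) == 10 assertion in both extractor tests reflects a window of five span positions on each side of the node (L5..L1, R1..R5). A self-contained sketch of per-position counting under that assumption (illustrative only, not Wordless's implementation):

    from collections import Counter

    def count_by_span_position(tokens, node, window=5):
        # Positions 0-4 hold L5-L1, positions 5-9 hold R1-R5
        freqs = [Counter() for _ in range(window * 2)]

        for i, token in enumerate(tokens):
            if token != node:
                continue

            for offset in range(-window, window + 1):
                if offset == 0 or not 0 <= i + offset < len(tokens):
                    continue

                pos = offset + window if offset < 0 else offset + window - 1
                freqs[pos][tokens[i + offset]] += 1

        return freqs

    freqs_file = count_by_span_position('the cat sat on the mat near the cat'.split(), 'cat')
    assert len(freqs_file) == 10
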
29 changes: 13 additions & 16 deletions tests/test_concordancer.py
@@ -42,7 +42,7 @@ def test_concordancer():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
@@ -66,9 +66,9 @@ def update_gui_table(err_msg, concordance_lines):
file_names_selected = list(main_global.wl_file_area.get_selected_file_names())

for concordance_line in concordance_lines:
-left_text, left_text_raw, left_text_search = concordance_line[0]
-node_text, node_text_raw, node_text_search = concordance_line[1]
-right_text, right_text_raw, right_text_search = concordance_line[2]
+left_tokens_raw, left_tokens_search = concordance_line[0]
+node_tokens_raw, node_tokens_search = concordance_line[1]
+right_tokens_raw, right_tokens_search = concordance_line[2]

sentiment = concordance_line[3]
no_token, len_tokens = concordance_line[4]
@@ -78,19 +78,16 @@ def update_gui_table(err_msg, concordance_lines):
file_name = concordance_line[8]

# Node
-assert node_text
-assert node_text_raw
-assert node_text_search
+assert node_tokens_raw
+assert node_tokens_search

# Left & Right
-assert left_text or right_text
-assert left_text == [] or all(left_text)
-assert right_text == [] or all(right_text)
-assert left_text_raw or right_text_raw
-assert left_text_raw == [] or all(left_text_raw)
-assert right_text_raw == [] or all(right_text_raw)
-assert left_text_search or right_text_search
-assert left_text_search == [] or all(left_text_search)
-assert right_text_search == [] or all(right_text_search)
+assert left_tokens_raw or right_tokens_raw
+assert left_tokens_raw == [] or all(left_tokens_raw)
+assert right_tokens_raw == [] or all(right_tokens_raw)
+assert left_tokens_search or right_tokens_search
+assert left_tokens_search == [] or all(left_tokens_search)
+assert right_tokens_search == [] or all(right_tokens_search)

# Sentiment
assert sentiment == 'No language support' or -1 <= sentiment <= 1
…
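
Two reading notes on the concordancer changes. First, the rename from *_text/*_text_raw/*_text_search triples to *_tokens_raw/*_tokens_search pairs suggests each context now carries just a raw form for display and a normalized form for matching. A sketch of that split, assuming case-folding as the normalization (both the names and the rule are my guesses, not Wordless's code):

    def to_search_forms(tokens_raw):
        return [token.casefold() for token in tokens_raw]

    left_tokens_raw = ['The', 'Quick', 'Brown']
    left_tokens_search = to_search_forms(left_tokens_raw)

    assert left_tokens_raw == [] or all(left_tokens_raw)
    assert left_tokens_search == ['the', 'quick', 'brown']

Second, the sentiment assertion expects scores in [-1, 1] unless a language is unsupported. That range matches VADER-style compound scores; a sketch with NLTK's VADER as a representative analyzer (my choice of example, since the commit does not name the updated analyzers; requires nltk.download('vader_lexicon') once):

    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    # The 'compound' score is normalized to [-1, 1]
    sentiment = analyzer.polarity_scores('The results are surprisingly good!')['compound']

    assert -1 <= sentiment <= 1
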
2 changes: 1 addition & 1 deletion tests/test_dependency_parser.py
@@ -42,7 +42,7 @@ def test_dependency_parser():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
…
2 changes: 0 additions & 2 deletions tests/test_keyword_extractor.py
@@ -16,8 +16,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

-# pylint: disable=unsupported-assignment-operation
-
import random

from tests import wl_test_init
…
2 changes: 1 addition & 1 deletion tests/test_ngram_generator.py
@@ -47,7 +47,7 @@ def test_ngram_generator():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

settings['generation_settings']['measure_dispersion'] = random.choice(measures_dispersion)
…
2 changes: 1 addition & 1 deletion tests/test_profiler.py
@@ -44,7 +44,7 @@ def test_profiler():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
…
2 changes: 1 addition & 1 deletion tests/test_wordlist_generator.py
@@ -44,7 +44,7 @@ def test_wordlist_generator():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

settings['generation_settings']['measure_dispersion'] = random.choice(measures_dispersion)
…
4 changes: 2 additions & 2 deletions tests/tests_figs/test_figs.py
@@ -38,10 +38,10 @@ def test_get_data_ranks():
assert wl_figs.get_data_ranks(data_files_items, fig_settings) == [(str(i), i) for i in range(50)]

def test_generate_line_chart():
-main = wl_test_init.Wl_Test_Main()
+main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')
wl_test_init.select_test_files(main, no_files = [0, 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

wl_figs.generate_line_chart(
…
14 changes: 10 additions & 4 deletions tests/tests_figs/test_figs_freqs.py
@@ -20,9 +20,10 @@

from tests import wl_test_init
from wordless.wl_figs import wl_figs_freqs
+from wordless.wl_nlp import wl_texts

def test_wl_fig_freqs():
-main = wl_test_init.Wl_Test_Main()
+main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')

for tab in [
'wordlist_generator',
@@ -45,30 +46,35 @@ def test_wl_fig_freqs():

if graph_type == 'Network Graph':
for node in range(10):
+node = wl_texts.Wl_Token(str(node))
+
for collocate in range(10):
+collocate = wl_texts.Wl_Token(str(collocate))
freq_1, freq_2 = random.sample(range(10000), 2)

-freq_files_items[(str(node), str(collocate))] = [
+freq_files_items[(node, collocate)] = [
max(freq_1, freq_2) - min(freq_1, freq_2),
min(freq_1, freq_2),
max(freq_1, freq_2)
]
else:
if tab == 'keyword_extractor':
for item in range(100):
+item = wl_texts.Wl_Token(str(item))
freq_1, freq_2 = random.sample(range(100), 2)

-freq_files_items[str(item)] = [
+freq_files_items[item] = [
random.randint(0, 100),
max(freq_1, freq_2) - min(freq_1, freq_2),
min(freq_1, freq_2),
max(freq_1, freq_2)
]
else:
for item in range(100):
+item = wl_texts.Wl_Token(str(item))
freq_1, freq_2 = random.sample(range(100), 2)

-freq_files_items[str(item)] = [
+freq_files_items[item] = [
max(freq_1, freq_2) - min(freq_1, freq_2),
min(freq_1, freq_2),
max(freq_1, freq_2)
…
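
The [max - min, min, max] values built above look arbitrary at first; with two test files selected, the three slots read naturally as (frequency in file 1, frequency in file 2, total), generated so the total always equals the sum of the parts (the keyword-extractor branch prepends one extra slot). That reading is my inference from the arithmetic, not from Wordless's docs:

    import random

    freq_1, freq_2 = random.sample(range(10000), 2)

    freq_file_1 = max(freq_1, freq_2) - min(freq_1, freq_2)
    freq_file_2 = min(freq_1, freq_2)
    freq_total = max(freq_1, freq_2)

    # The generated total is consistent with the per-file parts by construction
    assert freq_file_1 + freq_file_2 == freq_total
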
11 changes: 8 additions & 3 deletions tests/tests_figs/test_figs_stats.py
@@ -20,9 +20,10 @@

from tests import wl_test_init
from wordless.wl_figs import wl_figs_stats
+from wordless.wl_nlp import wl_texts

def test_wl_fig_stats():
-main = wl_test_init.Wl_Test_Main()
+main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')

for tab in [
'wordlist_generator',
@@ -55,15 +56,19 @@ def test_wl_fig_stats():

if graph_type == 'Network Graph':
for node in range(10):
+node = wl_texts.Wl_Token(str(node))
+
for collocate in range(10):
-stat_files_items[(str(node), str(collocate))] = [
+collocate = wl_texts.Wl_Token(str(collocate))
+stat_files_items[(node, collocate)] = [
random.uniform(0, val_max),
random.uniform(0, val_max),
random.uniform(0, val_max)
]
else:
for item in range(100):
-stat_files_items[str(item)] = [
+item = wl_texts.Wl_Token(str(item))
+stat_files_items[item] = [
random.uniform(0, val_max),
random.uniform(0, val_max),
random.uniform(0, val_max)
…
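
In both figure tests, plain string keys become wl_texts.Wl_Token objects without any change to the surrounding dict logic. That works cleanly if Wl_Token behaves like the string it wraps; a stand-in class illustrating the idea (an assumption about Wordless's design, not its actual code):

    # Stand-in only: a str subclass hashes and compares like the plain string
    # it replaces, so dict keys such as (node, collocate) keep working
    class Wl_Token(str):
        def __new__(cls, text, tag=None):
            token = super().__new__(cls, text)
            token.tag = tag  # room for token-level metadata, e.g. a POS tag
            return token

    node, collocate = Wl_Token('1'), Wl_Token('2')
    stat_files_items = {(node, collocate): [0.1, 0.2, 0.3]}

    assert ('1', '2') in stat_files_items
    assert node.tag is None
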