Commit

Utils: Add encoding detection - UTF-8 with BOM

BLKSerene committed May 19, 2024
1 parent 6f8aa74 commit b0d540d
Showing 25 changed files with 555 additions and 402 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -61,7 +61,7 @@ jobs:
# Upload coverage to Codecov
- name: "Upload coverage to Codecov"
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4

# macOS
build-macos:
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -25,6 +25,7 @@
- Utils: Add Pyphen's Basque syllable tokenizer
- Utils: Add PyThaiNLP's Han-solo
- Utils: Add Stanza's Sindhi part-of-speech tagger
- Utils: Add encoding detection - UTF-8 with BOM
- Utils: Add VADER's sentiment analyzers
- Work Area: Add Collocation/Colligation Extractor - Filter results /
- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic
2 changes: 1 addition & 1 deletion doc/doc.md
@@ -793,7 +793,7 @@ XML files¹ |\*.xml
Language |File Encoding |Auto-detection
-----------------------|-----------------------|:------------:
All languages |UTF-8 without BOM |✔
All languages |UTF-8 with BOM |✖️
All languages |UTF-8 with BOM |✔
All languages |UTF-16 with BOM |✔
All languages |UTF-16BE without BOM |✔
All languages |UTF-16LE without BOM |✔
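For reference, the newly supported case can be told apart from plain UTF-8 by the three-byte signature EF BB BF at the start of the file. A minimal sketch of such a check (my own illustration, not Wordless's actual detection code; opening the file with encoding='utf-8-sig' afterwards strips the BOM transparently):

import codecs

# Minimal sketch: report 'utf-8-sig' when the file starts with the UTF-8 BOM
# (EF BB BF); otherwise fall back to plain 'utf-8'.
def detect_utf8_bom(file_path):
    with open(file_path, 'rb') as f:
        return 'utf-8-sig' if f.read(3) == codecs.BOM_UTF8 else 'utf-8'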

This file was deleted.

7 changes: 4 additions & 3 deletions tests/tests_checks/test_checks_files.py
@@ -31,7 +31,7 @@ def get_normalized_file_path(file_name):
]

FILE_PATHS_UNSUPPORTED = [
get_normalized_file_path('unsupported_file_type.unsupported')
get_normalized_file_path('unsupported.unsupported')
]
FILE_PATHS_EMPTY = [
get_normalized_file_path('empty_txt.txt'),
@@ -44,12 +44,12 @@ def get_normalized_file_path(file_name):
]

def test_check_file_paths_unsupported():
_, files_unsupported = wl_checks_files.check_file_paths_unsupported(main, FILE_PATHS_UNSUPPORTED)
_, files_unsupported = wl_checks_files.check_file_paths_unsupported(main, ['supported.txt'] + FILE_PATHS_UNSUPPORTED)

assert files_unsupported == FILE_PATHS_UNSUPPORTED

def test_check_file_paths_empty():
_, files_empty = wl_checks_files.check_file_paths_empty(main, FILE_PATHS_EMPTY)
_, files_empty = wl_checks_files.check_file_paths_empty(main, [FILE_PATHS_DUP[0]] + FILE_PATHS_EMPTY)

assert files_empty == FILE_PATHS_EMPTY

@@ -60,6 +60,7 @@ def test_check_file_paths_duplicate():

def test_check_err_file_area():
assert wl_checks_files.check_err_file_area(main, '')
assert not wl_checks_files.check_err_file_area(main, 'test')

if __name__ == '__main__':
test_check_file_paths_unsupported()
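Note on the updated test above: test_check_file_paths_unsupported now passes a supported path alongside the unsupported one and only asserts the second return value. A sketch of the contract it exercises (assumed from the assertions, not the actual wl_checks_files code; the extension set and the first return value are placeholders):

import os

SUPPORTED_EXTS = {'.txt', '.csv', '.xml', '.html'}  # placeholder, not Wordless's real list

def check_file_paths_unsupported(file_paths):
    # Partition paths by extension; the test only checks the unsupported list.
    files_supported, files_unsupported = [], []

    for path in file_paths:
        ext = os.path.splitext(path)[1].lower()
        (files_supported if ext in SUPPORTED_EXTS else files_unsupported).append(path)

    return files_supported, files_unsupported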
4 changes: 4 additions & 0 deletions tests/tests_checks/test_checks_misc.py
@@ -45,6 +45,10 @@ def test_check_dir():

def test_check_new_name():
assert wl_checks_misc.check_new_name('new_name', ['new_name', 'new_name (2)', 'new_name (4)']) == 'new_name (3)'
assert wl_checks_misc.check_new_name(
'new_name', ['new_name', 'new_name (2)', 'new_name (4)'],
separator = '/'
) == 'new_name/2'

def test_check_new_path():
if os.path.exists('temp'):
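The new assertion above exercises check_new_name with an explicit separator. A sketch consistent with both asserted results — 'new_name (3)' in the default style and 'new_name/2' with separator = '/' — under my assumption (not confirmed by the diff) that the default style wraps the counter in parentheses while an explicit separator joins it directly:

def check_new_name(new_name, names, separator = None):
    # Hypothetical: find the lowest-numbered variant of new_name not already taken.
    if new_name not in names:
        return new_name

    i = 2

    while True:
        candidate = f'{new_name}{separator}{i}' if separator else f'{new_name} ({i})'

        if candidate not in names:
            return candidate

        i += 1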
22 changes: 17 additions & 5 deletions tests/tests_checks/test_checks_work_area.py
@@ -82,32 +82,44 @@ def test_check_search_terms():

def test_check_nlp_support():
file_eng_us = {'selected': True, 'name': 'test', 'lang': 'eng_us', 'tagged': False}
file_xxx = {'selected': True, 'name': 'test', 'lang': 'xxx', 'tagged': False}
file_other = {'selected': True, 'name': 'test', 'lang': 'other', 'tagged': False}

assert wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['pos_taggers'],
files = [file_eng_us]
)
assert wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['lemmatizers'],
files = [file_eng_us]
)
assert not wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['pos_taggers'],
files = [file_xxx]
files = [file_other]
)
assert not wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['lemmatizers'],
files = [file_other]
)

main.settings_custom['file_area']['files_open'] = [file_eng_us]
main.settings_custom['file_area']['files_open_ref'] = [file_xxx]
main.settings_custom['file_area']['files_open_ref'] = [file_other]

assert wl_checks_work_area.check_nlp_support(main, nlp_utils = ['pos_taggers'])
assert not wl_checks_work_area.check_nlp_support(main, nlp_utils = ['pos_taggers'], ref = True)

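The renamed file_other fixture above uses the language code 'other', which presumably no POS tagger or lemmatizer covers. A rough sketch of the check these assertions exercise (assumed; the real function also reports an error to the user and falls back to the files in main.settings_custom when files is not passed):

def check_nlp_support(settings_global, nlp_utils, files):
    # Hypothetical: pass only if every file's language is covered by every requested
    # NLP utility (e.g. 'pos_taggers', 'lemmatizers').
    langs_unsupported = {
        file['lang']
        for file in files
        for nlp_util in nlp_utils
        if file['lang'] not in settings_global[nlp_util]
    }

    return not langs_unsupported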
def test_check_results():
assert wl_checks_work_area.check_results(main, '', 'test')
assert not wl_checks_work_area.check_results(main, 'test', '')
assert not wl_checks_work_area.check_results(main, '', '')

def test_check_results_download_model():
wl_checks_work_area.check_results_download_model(main, '', 'test')
wl_checks_work_area.check_results_download_model(main, 'test', '')
assert wl_checks_work_area.check_results_download_model(main, '', 'test')
assert not wl_checks_work_area.check_results_download_model(main, 'test', '')
assert not wl_checks_work_area.check_results_download_model(main, '', 'module_not_found')

def test_check_err_table():
wl_checks_work_area.check_err_table(main, '')
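The strengthened assertions for test_check_results and test_check_results_download_model pin down a simple contract: the check passes only when there is no error message and the results are non-empty, and the download-model variant additionally treats a 'module_not_found' result as a failure. A sketch of the plain check_results contract (the dialog/reporting side is omitted and the real signature differs — it takes main as its first argument):

def check_results(err_msg, results):
    # Hypothetical: True only when no error occurred and results are non-empty;
    # Wordless presumably reports the problem to the user otherwise.
    return not err_msg and bool(results)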
5 changes: 5 additions & 0 deletions tests/tests_dialogs/test_dialogs.py
@@ -27,6 +27,8 @@ def test_wl_dialog():
wl_dialog.set_fixed_height()
wl_dialog.move_to_center()

wl_dialog = wl_dialogs.Wl_Dialog(main, title = 'test', resizable = True)

def test_wl_dialog_frameless():
wl_dialogs.Wl_Dialog_Frameless(main).open()

@@ -40,6 +42,9 @@ def test_wl_dialog_info_copy():
wl_dialog_info_copy.get_info()
wl_dialog_info_copy.set_info('test')

wl_dialog_info_copy = wl_dialogs.Wl_Dialog_Info_Copy(main, title = 'test', is_plain_text = True)
wl_dialog_info_copy.set_info('test')

def test_wl_dialog_settings():
wl_dialog_settings = wl_dialogs.Wl_Dialog_Settings(main, title = 'test')
wl_dialog_settings.open()
2 changes: 2 additions & 0 deletions tests/tests_measures/test_measure_utils.py
@@ -52,6 +52,8 @@ def test_to_measure_text():
for measure_text, measure_code in measures.items():
assert wl_measure_utils.to_measure_text(main, measure_type, measure_code) == measure_text

assert wl_measure_utils.to_measure_text(main, list(main.settings_global['mapping_measures'])[0], 'test') is None

def test_to_freqs_sections_1_sample():
assert wl_measure_utils.to_freqs_sections_1_sample(
ITEMS_TO_SEARCH, ITEMS,
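The added assertion covers the fallback path of to_measure_text: an unknown measure code now yields None. A sketch of the lookup as the test exercises it (assumed; the real function reads the mapping from main.settings_global['mapping_measures'] and takes main as its first argument):

def to_measure_text(mapping_measures, measure_type, measure_code):
    # Hypothetical reverse lookup: map a measure code back to its display text
    # within one measure type; return None when the code is unknown.
    for measure_text, code in mapping_measures[measure_type].items():
        if code == measure_code:
            return measure_text

    return None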
14 changes: 13 additions & 1 deletion tests/tests_measures/test_measures_readability.py
@@ -58,6 +58,7 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'):

test_text_ara_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ara')
test_text_ara_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ara')
test_text_ara_faseeh = Wl_Test_Text([[[['\u064B\u064B\u0621']]]], lang = 'ara')

test_text_deu_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de')
test_text_deu_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de')
@@ -685,18 +686,28 @@ def test_nws():
assert nws_deu_12_3 == 0.2963 * ms + 0.1905 * sl - 1.1144
assert nws_eng_12 == 'no_support'

def test__get_num_syls_ara():
assert wl_measures_readability._get_num_syls_ara('') == 0
assert wl_measures_readability._get_num_syls_ara('\u064E\u0627') == 2
assert wl_measures_readability._get_num_syls_ara('\u064Ea') == 1
assert wl_measures_readability._get_num_syls_ara('\u064E') == 1
assert wl_measures_readability._get_num_syls_ara('\u064B') == 2

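A counting rule consistent with the five values asserted above (my reconstruction from the test data only, not the actual wl_measures_readability code): each short-vowel diacritic counts as one syllable, each tanwin mark as two, and a long-vowel letter directly after a short vowel adds one.

SHORT_VOWELS = {'\u064E', '\u064F', '\u0650'}  # fatha, damma, kasra
TANWIN = {'\u064B', '\u064C', '\u064D'}        # fathatan, dammatan, kasratan
LONG_VOWELS = {'\u0627', '\u0648', '\u064A'}   # alef, waw, yeh

def get_num_syls_ara(word):
    # Hypothetical syllable counter matching the asserted cases:
    # '' -> 0, fatha + alef -> 2, fatha + 'a' -> 1, fatha -> 1, fathatan -> 2
    num_syls = 0

    for i, char in enumerate(word):
        if char in SHORT_VOWELS:
            num_syls += 1
        elif char in TANWIN:
            num_syls += 2
        elif char in LONG_VOWELS and i > 0 and word[i - 1] in SHORT_VOWELS:
            num_syls += 1

    return num_syls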
def test_osman():
osman_ara_0 = wl_measures_readability.osman(main, test_text_ara_0)
osman_ara_12 = wl_measures_readability.osman(main, test_text_ara_12)
osman_ara_faseeh = wl_measures_readability.osman(main, test_text_ara_faseeh)
osman_eng_12 = wl_measures_readability.osman(main, test_text_eng_12)

print('OSMAN:')
print(f'\tara/0: {osman_ara_0}')
print(f'\tara/12: {osman_ara_12}')
print(f'\tara/faseeh: {osman_ara_faseeh}')
print(f'\teng/12: {osman_eng_12}')

assert osman_ara_0 == 'text_too_short'
assert osman_ara_12 == 200.791 - 1.015 * (12 / 3) - 24.181 * ((3 + 23 + 3 + 0) / 12)
assert osman_ara_12 == 200.791 - 1.015 * (12 / 3) - 24.181 * ((3 + 26 + 3 + 0) / 12)
assert osman_ara_faseeh == 200.791 - 1.015 * (1 / 1) - 24.181 * ((0 + 5 + 1 + 1) / 1)
assert osman_eng_12 == 'no_support'

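For readability of the expected values in test_osman above: the assertions follow the shape of the OSMAN readability formula, which I would gloss as below — with A words and B sentences (12 and 3 in the test text) and four per-text counts summed in the second ratio, the second of which rises from 23 to 26 with the updated syllable counting. The variable labels are my reading, not stated in the diff.

\[
  \mathrm{OSMAN} = 200.791 - 1.015\,\frac{A}{B} - 24.181\,\frac{C + D + G + H}{A}
\]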
def test_rix():
@@ -857,6 +868,7 @@ def test_wheeler_smiths_readability_formula():
test_eflaw()
test_nwl()
test_nws()
test__get_num_syls_ara()
test_osman()
test_rix()
test_smog_grade()
75 changes: 69 additions & 6 deletions tests/tests_measures/test_measures_statistical_significance.py
@@ -50,6 +50,11 @@ def test_get_freqs_expected():
numpy.array([4.2, 2, 0])
))

def test_get_alt():
assert wl_measures_statistical_significance.get_alt('Two-tailed') == 'two-sided'
assert wl_measures_statistical_significance.get_alt('Left-tailed') == 'less'
assert wl_measures_statistical_significance.get_alt('Right-tailed') == 'greater'

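The new test_get_alt above pins down a one-to-one mapping from the UI's direction labels to the alternative strings used by scipy.stats. A sketch of that mapping (the real function may be written differently):

def get_alt(direction):
    # Hypothetical: translate the direction label into scipy's 'alternative' value.
    return {
        'Two-tailed': 'two-sided',
        'Left-tailed': 'less',
        'Right-tailed': 'greater',
    }[direction]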
# References: Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference (pp. 188-200). The South–Central Regional SAS Users' Group. (p. 10)
def test_fishers_exact_test():
settings['fishers_exact_test']['direction'] = 'Two-tailed'
@@ -107,6 +112,18 @@ def test_log_likelihood_ratio_test():
)
numpy.testing.assert_array_equal(numpy.round(gs, 2), numpy.array([167.23] * 2))

main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']['apply_correction'] = False
gs, p_vals = wl_measures_statistical_significance.log_likelihood_ratio_test(
main,
numpy.array([1, 0]),
numpy.array([1, 0]),
numpy.array([1, 0]),
numpy.array([1, 0])
)
numpy.testing.assert_array_equal(gs, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([1, 1]))

main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']['apply_correction'] = True
gs, p_vals = wl_measures_statistical_significance.log_likelihood_ratio_test(
main,
numpy.array([1, 0]),
@@ -127,6 +144,27 @@ def test_mann_whitney_u_test():

numpy.testing.assert_array_equal(5 * (5 + 1) / 2 + u1s, numpy.array([31] * 2))

main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Two-tailed'
numpy.testing.assert_array_equal(
wl_measures_statistical_significance.mann_whitney_u_test(
main,
numpy.array([[0] * 5] * 2),
numpy.array([[0] * 5] * 2)
),
(numpy.array([12.5] * 2), numpy.array([1] * 2))
)

main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Left-tailed'
numpy.testing.assert_array_equal(
wl_measures_statistical_significance.mann_whitney_u_test(
main,
numpy.array([[0] * 5] * 2),
numpy.array([[0] * 5] * 2)
),
(numpy.array([12.5] * 2), numpy.array([1] * 2))
)

main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Right-tailed'
numpy.testing.assert_array_equal(
wl_measures_statistical_significance.mann_whitney_u_test(
main,
@@ -182,6 +220,7 @@ def test_students_t_test_1_sample():
)
numpy.testing.assert_array_equal(numpy.round(t_stats, 6), numpy.array([0.999932] * 2))

main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Two-tailed'
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample(
main,
numpy.array([0, 0]),
Expand All @@ -192,10 +231,27 @@ def test_students_t_test_1_sample():
numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([1, 1]))

def test__students_t_test_2_sample_alt():
assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Two-tailed') == 'two-sided'
assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Left-tailed') == 'less'
assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Right-tailed') == 'greater'
main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Left-tailed'
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample(
main,
numpy.array([0, 0]),
numpy.array([1, 1]),
numpy.array([1, 1]),
numpy.array([2, 1])
)
numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([0.5, 0.5]))

main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Right-tailed'
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample(
main,
numpy.array([0, 0]),
numpy.array([1, 1]),
numpy.array([1, 1]),
numpy.array([2, 1])
)
numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([0.5, 0.5]))

def test_students_t_test_2_sample():
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_2_sample(
@@ -212,6 +268,14 @@ def test__z_score_p_val():
wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Two-tailed'),
numpy.array([1] * 2)
)
numpy.testing.assert_array_equal(
wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Left-tailed'),
numpy.array([0] * 2)
)
numpy.testing.assert_array_equal(
wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Right-tailed'),
numpy.array([0] * 2)
)

def test_z_score():
z_scores, p_vals = wl_measures_statistical_significance.z_score(
@@ -241,14 +305,13 @@ def test_z_score_berry_rogghe():
if __name__ == '__main__':
test_get_freqs_marginal()
test_get_freqs_expected()
test_get_alt()

test_fishers_exact_test()
test_log_likelihood_ratio_test()
test_mann_whitney_u_test()
test_pearsons_chi_squared_test()
test_students_t_test_1_sample()

test__students_t_test_2_sample_alt()
test_students_t_test_2_sample()

test__z_score_p_val()
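A quick sanity gloss on the new degenerate-input assertions in this file (my arithmetic, not part of the diff): when every observed frequency equals its expected frequency the log-likelihood ratio vanishes, when two samples are completely tied the Mann-Whitney U statistic sits at its midpoint, and a t statistic of zero gives a one-tailed p-value of 0.5 (two-tailed: 1).

\[
  G = 2\sum_i O_i \ln\frac{O_i}{E_i} = 0 \;\; (O_i = E_i), \qquad
  U = \frac{n_1 n_2}{2} = \frac{5 \times 5}{2} = 12.5 \;\; \text{(all ties)}, \qquad
  p_{\text{one-tailed}}(t = 0) = 0.5
\]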