Commit

Utils: Add encoding detection - UTF-8 with BOM

BLKSerene committed May 19, 2024
1 parent 6f8aa74 commit b0d540d
Showing 25 changed files with 555 additions and 402 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -61,7 +61,7 @@ jobs:
# Upload coverage to Codecov
- name: "Upload coverage to Codecov"
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4

# macOS
build-macos:
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -25,6 +25,7 @@
- Utils: Add Pyphen's Basque syllable tokenizer
- Utils: Add PyThaiNLP's Han-solo
- Utils: Add Stanza's Sindhi part-of-speech tagger
- Utils: Add encoding detection - UTF-8 with BOM
- Utils: Add VADER's sentiment analyzers
- Work Area: Add Collocation/Colligation Extractor - Filter results /
- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic
2 changes: 1 addition & 1 deletion doc/doc.md
@@ -793,7 +793,7 @@ XML files¹ |\*.xml
Language |File Encoding |Auto-detection
-----------------------|-----------------------|:------------:
All languages |UTF-8 without BOM |✔
All languages |UTF-8 with BOM |✖️
All languages |UTF-8 with BOM |✔
All languages |UTF-16 with BOM |✔
All languages |UTF-16BE without BOM |✔
All languages |UTF-16LE without BOM |✔
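For reference, the newly supported case can be told apart from plain UTF-8 by the three-byte signature EF BB BF at the start of the file. A minimal sketch of such a check (my own illustration, not Wordless's actual detection code; opening the file with encoding='utf-8-sig' afterwards strips the BOM transparently):

import codecs

# Minimal sketch: report 'utf-8-sig' when the file starts with the UTF-8 BOM
# (EF BB BF); otherwise fall back to plain 'utf-8'.
def detect_utf8_bom(file_path):
    with open(file_path, 'rb') as f:
        return 'utf-8-sig' if f.read(3) == codecs.BOM_UTF8 else 'utf-8'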

This file was deleted.

7 changes: 4 additions & 3 deletions tests/tests_checks/test_checks_files.py
@@ -31,7 +31,7 @@ def get_normalized_file_path(file_name):
]

FILE_PATHS_UNSUPPORTED = [
get_normalized_file_path('unsupported_file_type.unsupported')
get_normalized_file_path('unsupported.unsupported')
]
FILE_PATHS_EMPTY = [
get_normalized_file_path('empty_txt.txt'),
@@ -44,12 +44,12 @@ def get_normalized_file_path(file_name):
]

def test_check_file_paths_unsupported():
_, files_unsupported = wl_checks_files.check_file_paths_unsupported(main, FILE_PATHS_UNSUPPORTED)
_, files_unsupported = wl_checks_files.check_file_paths_unsupported(main, ['supported.txt'] + FILE_PATHS_UNSUPPORTED)

assert files_unsupported == FILE_PATHS_UNSUPPORTED

def test_check_file_paths_empty():
_, files_empty = wl_checks_files.check_file_paths_empty(main, FILE_PATHS_EMPTY)
_, files_empty = wl_checks_files.check_file_paths_empty(main, [FILE_PATHS_DUP[0]] + FILE_PATHS_EMPTY)

assert files_empty == FILE_PATHS_EMPTY

@@ -60,6 +60,7 @@ def test_check_file_paths_duplicate():

def test_check_err_file_area():
assert wl_checks_files.check_err_file_area(main, '')
assert not wl_checks_files.check_err_file_area(main, 'test')

if __name__ == '__main__':
test_check_file_paths_unsupported()
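Note on the updated test above: test_check_file_paths_unsupported now passes a supported path alongside the unsupported one and only asserts the second return value. A sketch of the contract it exercises (assumed from the assertions, not the actual wl_checks_files code; the extension set and the first return value are placeholders):

import os

SUPPORTED_EXTS = {'.txt', '.csv', '.xml', '.html'}  # placeholder, not Wordless's real list

def check_file_paths_unsupported(file_paths):
    # Partition paths by extension; the test only checks the unsupported list.
    files_supported, files_unsupported = [], []

    for path in file_paths:
        ext = os.path.splitext(path)[1].lower()
        (files_supported if ext in SUPPORTED_EXTS else files_unsupported).append(path)

    return files_supported, files_unsupported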
4 changes: 4 additions & 0 deletions tests/tests_checks/test_checks_misc.py
@@ -45,6 +45,10 @@ def test_check_dir():

def test_check_new_name():
assert wl_checks_misc.check_new_name('new_name', ['new_name', 'new_name (2)', 'new_name (4)']) == 'new_name (3)'
assert wl_checks_misc.check_new_name(
'new_name', ['new_name', 'new_name (2)', 'new_name (4)'],
separator = '/'
) == 'new_name/2'

def test_check_new_path():
if os.path.exists('temp'):
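The new assertion above exercises check_new_name with an explicit separator. A sketch consistent with both asserted results — 'new_name (3)' in the default style and 'new_name/2' with separator = '/' — under my assumption (not confirmed by the diff) that the default style wraps the counter in parentheses while an explicit separator joins it directly:

def check_new_name(new_name, names, separator = None):
    # Hypothetical: find the lowest-numbered variant of new_name not already taken.
    if new_name not in names:
        return new_name

    i = 2

    while True:
        candidate = f'{new_name}{separator}{i}' if separator else f'{new_name} ({i})'

        if candidate not in names:
            return candidate

        i += 1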
22 changes: 17 additions & 5 deletions tests/tests_checks/test_checks_work_area.py
@@ -82,32 +82,44 @@ def test_check_search_terms():

def test_check_nlp_support():
file_eng_us = {'selected': True, 'name': 'test', 'lang': 'eng_us', 'tagged': False}
file_xxx = {'selected': True, 'name': 'test', 'lang': 'xxx', 'tagged': False}
file_other = {'selected': True, 'name': 'test', 'lang': 'other', 'tagged': False}

assert wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['pos_taggers'],
files = [file_eng_us]
)
assert wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['lemmatizers'],
files = [file_eng_us]
)
assert not wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['pos_taggers'],
files = [file_xxx]
files = [file_other]
)
assert not wl_checks_work_area.check_nlp_support(
main,
nlp_utils = ['lemmatizers'],
files = [file_other]
)

main.settings_custom['file_area']['files_open'] = [file_eng_us]
main.settings_custom['file_area']['files_open_ref'] = [file_xxx]
main.settings_custom['file_area']['files_open_ref'] = [file_other]

assert wl_checks_work_area.check_nlp_support(main, nlp_utils = ['pos_taggers'])
assert not wl_checks_work_area.check_nlp_support(main, nlp_utils = ['pos_taggers'], ref = True)

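The renamed file_other fixture above uses the language code 'other', which presumably no POS tagger or lemmatizer covers. A rough sketch of the check these assertions exercise (assumed; the real function also reports an error to the user and falls back to the files in main.settings_custom when files is not passed):

def check_nlp_support(settings_global, nlp_utils, files):
    # Hypothetical: pass only if every file's language is covered by every requested
    # NLP utility (e.g. 'pos_taggers', 'lemmatizers').
    langs_unsupported = {
        file['lang']
        for file in files
        for nlp_util in nlp_utils
        if file['lang'] not in settings_global[nlp_util]
    }

    return not langs_unsupported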
def test_check_results():
assert wl_checks_work_area.check_results(main, '', 'test')
assert not wl_checks_work_area.check_results(main, 'test', '')
assert not wl_checks_work_area.check_results(main, '', '')

def test_check_results_download_model():
wl_checks_work_area.check_results_download_model(main, '', 'test')
wl_checks_work_area.check_results_download_model(main, 'test', '')
assert wl_checks_work_area.check_results_download_model(main, '', 'test')
assert not wl_checks_work_area.check_results_download_model(main, 'test', '')
assert not wl_checks_work_area.check_results_download_model(main, '', 'module_not_found')

def test_check_err_table():
wl_checks_work_area.check_err_table(main, '')
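The strengthened assertions for test_check_results and test_check_results_download_model pin down a simple contract: the check passes only when there is no error message and the results are non-empty, and the download-model variant additionally treats a 'module_not_found' result as a failure. A sketch of the plain check_results contract (the dialog/reporting side is omitted and the real signature differs — it takes main as its first argument):

def check_results(err_msg, results):
    # Hypothetical: True only when no error occurred and results are non-empty;
    # Wordless presumably reports the problem to the user otherwise.
    return not err_msg and bool(results)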
5 changes: 5 additions & 0 deletions tests/tests_dialogs/test_dialogs.py
@@ -27,6 +27,8 @@ def test_wl_dialog():
wl_dialog.set_fixed_height()
wl_dialog.move_to_center()

wl_dialog = wl_dialogs.Wl_Dialog(main, title = 'test', resizable = True)

def test_wl_dialog_frameless():
wl_dialogs.Wl_Dialog_Frameless(main).open()

@@ -40,6 +42,9 @@ def test_wl_dialog_info_copy():
wl_dialog_info_copy.get_info()
wl_dialog_info_copy.set_info('test')

wl_dialog_info_copy = wl_dialogs.Wl_Dialog_Info_Copy(main, title = 'test', is_plain_text = True)
wl_dialog_info_copy.set_info('test')

def test_wl_dialog_settings():
wl_dialog_settings = wl_dialogs.Wl_Dialog_Settings(main, title = 'test')
wl_dialog_settings.open()
2 changes: 2 additions & 0 deletions tests/tests_measures/test_measure_utils.py
@@ -52,6 +52,8 @@ def test_to_measure_text():
for measure_text, measure_code in measures.items():
assert wl_measure_utils.to_measure_text(main, measure_type, measure_code) == measure_text

assert wl_measure_utils.to_measure_text(main, list(main.settings_global['mapping_measures'])[0], 'test') is None

def test_to_freqs_sections_1_sample():
assert wl_measure_utils.to_freqs_sections_1_sample(
ITEMS_TO_SEARCH, ITEMS,
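The added assertion covers the fallback path of to_measure_text: an unknown measure code now yields None. A sketch of the lookup as the test exercises it (assumed; the real function reads the mapping from main.settings_global['mapping_measures'] and takes main as its first argument):

def to_measure_text(mapping_measures, measure_type, measure_code):
    # Hypothetical reverse lookup: map a measure code back to its display text
    # within one measure type; return None when the code is unknown.
    for measure_text, code in mapping_measures[measure_type].items():
        if code == measure_code:
            return measure_text

    return None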
14 changes: 13 additions & 1 deletion tests/tests_measures/test_measures_readability.py
@@ -58,6 +58,7 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'):

test_text_ara_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ara')
test_text_ara_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ara')
test_text_ara_faseeh = Wl_Test_Text([[[['\u064B\u064B\u0621']]]], lang = 'ara')

test_text_deu_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de')
test_text_deu_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de')
@@ -685,18 +686,28 @@ def test_nws():
assert nws_deu_12_3 == 0.2963 * ms + 0.1905 * sl - 1.1144
assert nws_eng_12 == 'no_support'

def test__get_num_syls_ara():
assert wl_measures_readability._get_num_syls_ara('') == 0
assert wl_measures_readability._get_num_syls_ara('\u064E\u0627') == 2
assert wl_measures_readability._get_num_syls_ara('\u064Ea') == 1
assert wl_measures_readability._get_num_syls_ara('\u064E') == 1
assert wl_measures_readability._get_num_syls_ara('\u064B') == 2

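A counting rule consistent with the five values asserted above (my reconstruction from the test data only, not the actual wl_measures_readability code): each short-vowel diacritic counts as one syllable, each tanwin mark as two, and a long-vowel letter directly after a short vowel adds one.

SHORT_VOWELS = {'\u064E', '\u064F', '\u0650'}  # fatha, damma, kasra
TANWIN = {'\u064B', '\u064C', '\u064D'}        # fathatan, dammatan, kasratan
LONG_VOWELS = {'\u0627', '\u0648', '\u064A'}   # alef, waw, yeh

def get_num_syls_ara(word):
    # Hypothetical syllable counter matching the asserted cases:
    # '' -> 0, fatha + alef -> 2, fatha + 'a' -> 1, fatha -> 1, fathatan -> 2
    num_syls = 0

    for i, char in enumerate(word):
        if char in SHORT_VOWELS:
            num_syls += 1
        elif char in TANWIN:
            num_syls += 2
        elif char in LONG_VOWELS and i > 0 and word[i - 1] in SHORT_VOWELS:
            num_syls += 1

    return num_syls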
def test_osman():
osman_ara_0 = wl_measures_readability.osman(main, test_text_ara_0)
osman_ara_12 = wl_measures_readability.osman(main, test_text_ara_12)
osman_ara_faseeh = wl_measures_readability.osman(main, test_text_ara_faseeh)
osman_eng_12 = wl_measures_readability.osman(main, test_text_eng_12)

print('OSMAN:')
print(f'\tara/0: {osman_ara_0}')
print(f'\tara/12: {osman_ara_12}')
print(f'\tara/faseeh: {osman_ara_faseeh}')
print(f'\teng/12: {osman_eng_12}')

assert osman_ara_0 == 'text_too_short'
assert osman_ara_12 == 200.791 - 1.015 * (12 / 3) - 24.181 * ((3 + 23 + 3 + 0) / 12)
assert osman_ara_12 == 200.791 - 1.015 * (12 / 3) - 24.181 * ((3 + 26 + 3 + 0) / 12)
assert osman_ara_faseeh == 200.791 - 1.015 * (1 / 1) - 24.181 * ((0 + 5 + 1 + 1) / 1)
assert osman_eng_12 == 'no_support'

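For readability of the expected values in test_osman above: the assertions follow the shape of the OSMAN readability formula, which I would gloss as below — with A words and B sentences (12 and 3 in the test text) and four per-text counts summed in the second ratio, the second of which rises from 23 to 26 with the updated syllable counting. The variable labels are my reading, not stated in the diff.

\[
  \mathrm{OSMAN} = 200.791 - 1.015\,\frac{A}{B} - 24.181\,\frac{C + D + G + H}{A}
\]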
def test_rix():
@@ -857,6 +868,7 @@ def test_wheeler_smiths_readability_formula():
test_eflaw()
test_nwl()
test_nws()
test__get_num_syls_ara()
test_osman()
test_rix()
test_smog_grade()
75 changes: 69 additions & 6 deletions tests/tests_measures/test_measures_statistical_significance.py
@@ -50,6 +50,11 @@ def test_get_freqs_expected():
numpy.array([4.2, 2, 0])
))

def test_get_alt():
assert wl_measures_statistical_significance.get_alt('Two-tailed') == 'two-sided'
assert wl_measures_statistical_significance.get_alt('Left-tailed') == 'less'
assert wl_measures_statistical_significance.get_alt('Right-tailed') == 'greater'

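The new test_get_alt above pins down a one-to-one mapping from the UI's direction labels to the alternative strings used by scipy.stats. A sketch of that mapping (the real function may be written differently):

def get_alt(direction):
    # Hypothetical: translate the direction label into scipy's 'alternative' value.
    return {
        'Two-tailed': 'two-sided',
        'Left-tailed': 'less',
        'Right-tailed': 'greater',
    }[direction]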
# References: Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference (pp. 188-200). The South–Central Regional SAS Users' Group. (p. 10)
def test_fishers_exact_test():
settings['fishers_exact_test']['direction'] = 'Two-tailed'
@@ -107,6 +112,18 @@ def test_log_likelihood_ratio_test():
)
numpy.testing.assert_array_equal(numpy.round(gs, 2), numpy.array([167.23] * 2))

main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']['apply_correction'] = False
gs, p_vals = wl_measures_statistical_significance.log_likelihood_ratio_test(
main,
numpy.array([1, 0]),
numpy.array([1, 0]),
numpy.array([1, 0]),
numpy.array([1, 0])
)
numpy.testing.assert_array_equal(gs, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([1, 1]))

main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']['apply_correction'] = True
gs, p_vals = wl_measures_statistical_significance.log_likelihood_ratio_test(
main,
numpy.array([1, 0]),
@@ -127,6 +144,27 @@ def test_mann_whitney_u_test():

numpy.testing.assert_array_equal(5 * (5 + 1) / 2 + u1s, numpy.array([31] * 2))

main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Two-tailed'
numpy.testing.assert_array_equal(
wl_measures_statistical_significance.mann_whitney_u_test(
main,
numpy.array([[0] * 5] * 2),
numpy.array([[0] * 5] * 2)
),
(numpy.array([12.5] * 2), numpy.array([1] * 2))
)

main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Left-tailed'
numpy.testing.assert_array_equal(
wl_measures_statistical_significance.mann_whitney_u_test(
main,
numpy.array([[0] * 5] * 2),
numpy.array([[0] * 5] * 2)
),
(numpy.array([12.5] * 2), numpy.array([1] * 2))
)

main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Right-tailed'
numpy.testing.assert_array_equal(
wl_measures_statistical_significance.mann_whitney_u_test(
main,
@@ -182,6 +220,7 @@ def test_students_t_test_1_sample():
)
numpy.testing.assert_array_equal(numpy.round(t_stats, 6), numpy.array([0.999932] * 2))

main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Two-tailed'
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample(
main,
numpy.array([0, 0]),
Expand All @@ -192,10 +231,27 @@ def test_students_t_test_1_sample():
numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([1, 1]))

def test__students_t_test_2_sample_alt():
assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Two-tailed') == 'two-sided'
assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Left-tailed') == 'less'
assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Right-tailed') == 'greater'
main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Left-tailed'
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample(
main,
numpy.array([0, 0]),
numpy.array([1, 1]),
numpy.array([1, 1]),
numpy.array([2, 1])
)
numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([0.5, 0.5]))

main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Right-tailed'
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample(
main,
numpy.array([0, 0]),
numpy.array([1, 1]),
numpy.array([1, 1]),
numpy.array([2, 1])
)
numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0]))
numpy.testing.assert_array_equal(p_vals, numpy.array([0.5, 0.5]))

def test_students_t_test_2_sample():
t_stats, p_vals = wl_measures_statistical_significance.students_t_test_2_sample(
@@ -212,6 +268,14 @@ def test__z_score_p_val():
wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Two-tailed'),
numpy.array([1] * 2)
)
numpy.testing.assert_array_equal(
wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Left-tailed'),
numpy.array([0] * 2)
)
numpy.testing.assert_array_equal(
wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Right-tailed'),
numpy.array([0] * 2)
)

def test_z_score():
z_scores, p_vals = wl_measures_statistical_significance.z_score(
@@ -241,14 +305,13 @@ def test_z_score_berry_rogghe():
if __name__ == '__main__':
test_get_freqs_marginal()
test_get_freqs_expected()
test_get_alt()

test_fishers_exact_test()
test_log_likelihood_ratio_test()
test_mann_whitney_u_test()
test_pearsons_chi_squared_test()
test_students_t_test_1_sample()

test__students_t_test_2_sample_alt()
test_students_t_test_2_sample()

test__z_score_p_val()
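A quick sanity gloss on the new degenerate-input assertions in this file (my arithmetic, not part of the diff): when every observed frequency equals its expected frequency the log-likelihood ratio vanishes, when two samples are completely tied the Mann-Whitney U statistic sits at its midpoint, and a t statistic of zero gives a one-tailed p-value of 0.5 (two-tailed: 1).

\[
  G = 2\sum_i O_i \ln\frac{O_i}{E_i} = 0 \;\; (O_i = E_i), \qquad
  U = \frac{n_1 n_2}{2} = \frac{5 \times 5}{2} = 12.5 \;\; \text{(all ties)}, \qquad
  p_{\text{one-tailed}}(t = 0) = 0.5
\]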