diff --git a/CHANGELOG.md b/CHANGELOG.md index b707bd15c..2d5983c70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - Utils: Add Pyphen's Basque syllable tokenizer - Utils: Add PyThaiNLP's Han-solo - Utils: Add Stanza's Sindhi part-of-speech tagger +- Utils: Add encoding detection - UTF-8 with BOM - Utils: Add VADER's sentiment analyzers - Work Area: Add Collocation/Colligation Extractor - Filter results / - Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic diff --git a/doc/doc.md b/doc/doc.md index 3d3752e73..2ea2f257d 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -793,7 +793,7 @@ XML files¹ |\*.xml Language |File Encoding |Auto-detection -----------------------|-----------------------|:------------: All languages |UTF-8 without BOM |✔ -All languages |UTF-8 with BOM |✖️ +All languages |UTF-8 with BOM |✔ All languages |UTF-16 with BOM |✔ All languages |UTF-16BE without BOM |✔ All languages |UTF-16LE without BOM |✔ diff --git a/tests/files/wl_checks/wl_checks_files/unsupported_file_type.unsupported b/tests/files/wl_checks/wl_checks_files/unsupported_file_type.unsupported deleted file mode 100644 index 0334e47e4..000000000 --- a/tests/files/wl_checks/wl_checks_files/unsupported_file_type.unsupported +++ /dev/null @@ -1 +0,0 @@ -Unsupported file type \ No newline at end of file diff --git a/tests/tests_checks/test_checks_files.py b/tests/tests_checks/test_checks_files.py index 346298729..1f938c3da 100644 --- a/tests/tests_checks/test_checks_files.py +++ b/tests/tests_checks/test_checks_files.py @@ -31,7 +31,7 @@ def get_normalized_file_path(file_name): ] FILE_PATHS_UNSUPPORTED = [ - get_normalized_file_path('unsupported_file_type.unsupported') + get_normalized_file_path('unsupported.unsupported') ] FILE_PATHS_EMPTY = [ get_normalized_file_path('empty_txt.txt'), @@ -44,12 +44,12 @@ def get_normalized_file_path(file_name): ] def test_check_file_paths_unsupported(): - _, files_unsupported = wl_checks_files.check_file_paths_unsupported(main, FILE_PATHS_UNSUPPORTED) + _, files_unsupported = wl_checks_files.check_file_paths_unsupported(main, ['supported.txt'] + FILE_PATHS_UNSUPPORTED) assert files_unsupported == FILE_PATHS_UNSUPPORTED def test_check_file_paths_empty(): - _, files_empty = wl_checks_files.check_file_paths_empty(main, FILE_PATHS_EMPTY) + _, files_empty = wl_checks_files.check_file_paths_empty(main, [FILE_PATHS_DUP[0]] + FILE_PATHS_EMPTY) assert files_empty == FILE_PATHS_EMPTY diff --git a/tests/tests_checks/test_checks_misc.py b/tests/tests_checks/test_checks_misc.py index 6e9a51d47..a9f83ae37 100644 --- a/tests/tests_checks/test_checks_misc.py +++ b/tests/tests_checks/test_checks_misc.py @@ -45,6 +45,10 @@ def test_check_dir(): def test_check_new_name(): assert wl_checks_misc.check_new_name('new_name', ['new_name', 'new_name (2)', 'new_name (4)']) == 'new_name (3)' + assert wl_checks_misc.check_new_name( + 'new_name', ['new_name', 'new_name (2)', 'new_name (4)'], + separator = '/' + ) == 'new_name/2' def test_check_new_path(): if os.path.exists('temp'): diff --git a/tests/tests_checks/test_checks_work_area.py b/tests/tests_checks/test_checks_work_area.py index 7bcf41d08..ec997968e 100644 --- a/tests/tests_checks/test_checks_work_area.py +++ b/tests/tests_checks/test_checks_work_area.py @@ -82,21 +82,31 @@ def test_check_search_terms(): def test_check_nlp_support(): file_eng_us = {'selected': True, 'name': 'test', 'lang': 'eng_us', 'tagged': False} - file_xxx = {'selected': True, 'name': 'test', 'lang': 'xxx', 'tagged': False} + file_other 
= {'selected': True, 'name': 'test', 'lang': 'other', 'tagged': False} assert wl_checks_work_area.check_nlp_support( main, nlp_utils = ['pos_taggers'], files = [file_eng_us] ) + assert wl_checks_work_area.check_nlp_support( + main, + nlp_utils = ['lemmatizers'], + files = [file_eng_us] + ) assert not wl_checks_work_area.check_nlp_support( main, nlp_utils = ['pos_taggers'], - files = [file_xxx] + files = [file_other] + ) + assert not wl_checks_work_area.check_nlp_support( + main, + nlp_utils = ['lemmatizers'], + files = [file_other] ) main.settings_custom['file_area']['files_open'] = [file_eng_us] - main.settings_custom['file_area']['files_open_ref'] = [file_xxx] + main.settings_custom['file_area']['files_open_ref'] = [file_other] assert wl_checks_work_area.check_nlp_support(main, nlp_utils = ['pos_taggers']) assert not wl_checks_work_area.check_nlp_support(main, nlp_utils = ['pos_taggers'], ref = True) @@ -104,10 +114,12 @@ def test_check_nlp_support(): def test_check_results(): assert wl_checks_work_area.check_results(main, '', 'test') assert not wl_checks_work_area.check_results(main, 'test', '') + assert not wl_checks_work_area.check_results(main, '', '') def test_check_results_download_model(): - wl_checks_work_area.check_results_download_model(main, '', 'test') - wl_checks_work_area.check_results_download_model(main, 'test', '') + assert wl_checks_work_area.check_results_download_model(main, '', 'test') + assert not wl_checks_work_area.check_results_download_model(main, 'test', '') + assert not wl_checks_work_area.check_results_download_model(main, '', 'module_not_found') def test_check_err_table(): wl_checks_work_area.check_err_table(main, '') diff --git a/tests/tests_dialogs/test_dialogs.py b/tests/tests_dialogs/test_dialogs.py index a1879b3e1..8fbf465ac 100644 --- a/tests/tests_dialogs/test_dialogs.py +++ b/tests/tests_dialogs/test_dialogs.py @@ -27,6 +27,8 @@ def test_wl_dialog(): wl_dialog.set_fixed_height() wl_dialog.move_to_center() + wl_dialog = wl_dialogs.Wl_Dialog(main, title = 'test', resizable = True) + def test_wl_dialog_frameless(): wl_dialogs.Wl_Dialog_Frameless(main).open() @@ -40,6 +42,8 @@ def test_wl_dialog_info_copy(): wl_dialog_info_copy.get_info() wl_dialog_info_copy.set_info('test') + wl_dialog_info_copy = wl_dialogs.Wl_Dialog_Info_Copy(main, title = 'test', is_plain_text = True) + def test_wl_dialog_settings(): wl_dialog_settings = wl_dialogs.Wl_Dialog_Settings(main, title = 'test') wl_dialog_settings.open() diff --git a/tests/tests_measures/test_measure_utils.py b/tests/tests_measures/test_measure_utils.py index 265d04a0c..a7d119467 100644 --- a/tests/tests_measures/test_measure_utils.py +++ b/tests/tests_measures/test_measure_utils.py @@ -52,6 +52,8 @@ def test_to_measure_text(): for measure_text, measure_code in measures.items(): assert wl_measure_utils.to_measure_text(main, measure_type, measure_code) == measure_text + assert wl_measure_utils.to_measure_text(main, list(main.settings_global['mapping_measures'])[0], 'test') is None + def test_to_freqs_sections_1_sample(): assert wl_measure_utils.to_freqs_sections_1_sample( ITEMS_TO_SEARCH, ITEMS, diff --git a/tests/tests_measures/test_measures_readability.py b/tests/tests_measures/test_measures_readability.py index 4eaa8c6e6..557bc8340 100644 --- a/tests/tests_measures/test_measures_readability.py +++ b/tests/tests_measures/test_measures_readability.py @@ -58,6 +58,7 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'): test_text_ara_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ara') test_text_ara_12 = 
Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ara') +test_text_ara_faseeh = Wl_Test_Text([[[['\u064B\u064B\u0621']]]], lang = 'ara') test_text_deu_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de') test_text_deu_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de') @@ -685,18 +686,28 @@ def test_nws(): assert nws_deu_12_3 == 0.2963 * ms + 0.1905 * sl - 1.1144 assert nws_eng_12 == 'no_support' +def test__get_num_syls_ara(): + assert wl_measures_readability._get_num_syls_ara('') == 0 + assert wl_measures_readability._get_num_syls_ara('\u064E\u0627') == 2 + assert wl_measures_readability._get_num_syls_ara('\u064Ea') == 1 + assert wl_measures_readability._get_num_syls_ara('\u064E') == 1 + assert wl_measures_readability._get_num_syls_ara('\u064B') == 2 + def test_osman(): osman_ara_0 = wl_measures_readability.osman(main, test_text_ara_0) osman_ara_12 = wl_measures_readability.osman(main, test_text_ara_12) + osman_ara_faseeh = wl_measures_readability.osman(main, test_text_ara_faseeh) osman_eng_12 = wl_measures_readability.osman(main, test_text_eng_12) print('OSMAN:') print(f'\tara/0: {osman_ara_0}') print(f'\tara/12: {osman_ara_12}') + print(f'\tara/faseeh: {osman_ara_faseeh}') print(f'\teng/12: {osman_eng_12}') assert osman_ara_0 == 'text_too_short' - assert osman_ara_12 == 200.791 - 1.015 * (12 / 3) - 24.181 * ((3 + 23 + 3 + 0) / 12) + assert osman_ara_12 == 200.791 - 1.015 * (12 / 3) - 24.181 * ((3 + 26 + 3 + 0) / 12) + assert osman_ara_faseeh == 200.791 - 1.015 * (1 / 1) - 24.181 * ((0 + 5 + 1 + 1) / 1) assert osman_eng_12 == 'no_support' def test_rix(): @@ -857,6 +868,7 @@ def test_wheeler_smiths_readability_formula(): test_eflaw() test_nwl() test_nws() + test__get_num_syls_ara() test_osman() test_rix() test_smog_grade() diff --git a/tests/tests_measures/test_measures_statistical_significance.py b/tests/tests_measures/test_measures_statistical_significance.py index 52462dfc3..77bf7878f 100644 --- a/tests/tests_measures/test_measures_statistical_significance.py +++ b/tests/tests_measures/test_measures_statistical_significance.py @@ -50,6 +50,11 @@ def test_get_freqs_expected(): numpy.array([4.2, 2, 0]) )) +def test_get_alt(): + assert wl_measures_statistical_significance.get_alt('Two-tailed') == 'two-sided' + assert wl_measures_statistical_significance.get_alt('Left-tailed') == 'less' + assert wl_measures_statistical_significance.get_alt('Right-tailed') == 'greater' + # References: Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference (pp. 188-200). The South–Central Regional SAS Users' Group. (p. 
10) def test_fishers_exact_test(): settings['fishers_exact_test']['direction'] = 'Two-tailed' @@ -107,6 +112,18 @@ def test_log_likelihood_ratio_test(): ) numpy.testing.assert_array_equal(numpy.round(gs, 2), numpy.array([167.23] * 2)) + main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']['apply_correction'] = False + gs, p_vals = wl_measures_statistical_significance.log_likelihood_ratio_test( + main, + numpy.array([1, 0]), + numpy.array([1, 0]), + numpy.array([1, 0]), + numpy.array([1, 0]) + ) + numpy.testing.assert_array_equal(gs, numpy.array([0, 0])) + numpy.testing.assert_array_equal(p_vals, numpy.array([1, 1])) + + main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']['apply_correction'] = True gs, p_vals = wl_measures_statistical_significance.log_likelihood_ratio_test( main, numpy.array([1, 0]), @@ -127,6 +144,27 @@ def test_mann_whitney_u_test(): numpy.testing.assert_array_equal(5 * (5 + 1) / 2 + u1s, numpy.array([31] * 2)) + main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Two-tailed' + numpy.testing.assert_array_equal( + wl_measures_statistical_significance.mann_whitney_u_test( + main, + numpy.array([[0] * 5] * 2), + numpy.array([[0] * 5] * 2) + ), + (numpy.array([12.5] * 2), numpy.array([1] * 2)) + ) + + main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Left-tailed' + numpy.testing.assert_array_equal( + wl_measures_statistical_significance.mann_whitney_u_test( + main, + numpy.array([[0] * 5] * 2), + numpy.array([[0] * 5] * 2) + ), + (numpy.array([12.5] * 2), numpy.array([1] * 2)) + ) + + main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] = 'Right-tailed' numpy.testing.assert_array_equal( wl_measures_statistical_significance.mann_whitney_u_test( main, @@ -182,6 +220,7 @@ def test_students_t_test_1_sample(): ) numpy.testing.assert_array_equal(numpy.round(t_stats, 6), numpy.array([0.999932] * 2)) + main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Two-tailed' t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample( main, numpy.array([0, 0]), @@ -192,10 +231,27 @@ def test_students_t_test_1_sample(): numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0])) numpy.testing.assert_array_equal(p_vals, numpy.array([1, 1])) -def test__students_t_test_2_sample_alt(): - assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Two-tailed') == 'two-sided' - assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Left-tailed') == 'less' - assert wl_measures_statistical_significance._students_t_test_2_sample_alt('Right-tailed') == 'greater' + main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Left-tailed' + t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample( + main, + numpy.array([0, 0]), + numpy.array([1, 1]), + numpy.array([1, 1]), + numpy.array([2, 1]) + ) + numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0])) + numpy.testing.assert_array_equal(p_vals, numpy.array([0.5, 0.5])) + + main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] = 'Right-tailed' + t_stats, p_vals = wl_measures_statistical_significance.students_t_test_1_sample( + main, + numpy.array([0, 0]), + numpy.array([1, 1]), + numpy.array([1, 1]), + 
numpy.array([2, 1]) + ) + numpy.testing.assert_array_equal(t_stats, numpy.array([0, 0])) + numpy.testing.assert_array_equal(p_vals, numpy.array([0.5, 0.5])) def test_students_t_test_2_sample(): t_stats, p_vals = wl_measures_statistical_significance.students_t_test_2_sample( @@ -212,6 +268,14 @@ def test__z_score_p_val(): wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Two-tailed'), numpy.array([1] * 2) ) + numpy.testing.assert_array_equal( + wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Left-tailed'), + numpy.array([0] * 2) + ) + numpy.testing.assert_array_equal( + wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Right-tailed'), + numpy.array([0] * 2) + ) def test_z_score(): z_scores, p_vals = wl_measures_statistical_significance.z_score( @@ -241,14 +305,13 @@ def test_z_score_berry_rogghe(): if __name__ == '__main__': test_get_freqs_marginal() test_get_freqs_expected() + test_get_alt() test_fishers_exact_test() test_log_likelihood_ratio_test() test_mann_whitney_u_test() test_pearsons_chi_squared_test() test_students_t_test_1_sample() - - test__students_t_test_2_sample_alt() test_students_t_test_2_sample() test__z_score_p_val() diff --git a/tests/tests_utils/test_conversion.py b/tests/tests_utils/test_conversion.py index 004122c43..bc0f2db86 100644 --- a/tests/tests_utils/test_conversion.py +++ b/tests/tests_utils/test_conversion.py @@ -16,6 +16,8 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import pytest + from tests import wl_test_init from wordless.wl_utils import wl_conversion @@ -26,19 +28,12 @@ TO_LANG_TEXT = { lang_code_639_3: lang_text - for lang_text, (lang_code_639_3, _, _) in settings_langs.items() -} -TO_ISO_639_1 = { - lang_code_639_3: lang_code_639_1 - for lang_code_639_3, lang_code_639_1, _ in settings_langs.values() + for lang_text, (lang_code_639_3, _) in settings_langs.items() } +TO_ISO_639_1 = dict(settings_langs.values()) TO_ISO_639_3 = { lang_code_639_1: lang_code_639_3 - for lang_code_639_3, lang_code_639_1, _ in settings_langs.values() -} -GET_LANG_FAMILY = { - lang_code_639_3: lang_family - for lang_code_639_3, _, lang_family in settings_langs.values() + for lang_code_639_3, lang_code_639_1 in settings_langs.values() } def test_normalize_lang_code(): @@ -71,11 +66,12 @@ def test_to_iso_639_3(): assert lang_code_639_3 == TO_ISO_639_3[lang_code] -def test_to_iso_639_1(): - for lang_code in TO_ISO_639_1.keys(): - lang_code_639_1 = wl_conversion.to_iso_639_1(main, lang_code) + with pytest.raises(Exception): + wl_conversion.to_iso_639_3(main, 'test') - assert lang_code_639_1 == TO_ISO_639_1[lang_code] +def test_to_iso_639_1(): + for lang_code_639_3, lang_code_639_1 in TO_ISO_639_1.items(): + assert wl_conversion.to_iso_639_1(main, lang_code_639_3) == lang_code_639_1 def test_remove_lang_code_suffixes(): for lang_code_639_3, lang_code_639_1 in TO_ISO_639_1.items(): @@ -89,12 +85,6 @@ def test_remove_lang_code_suffixes(): assert lang_code_639_1.find('_') == -1 -def test_get_lang_family(): - for lang_code in TO_ISO_639_1.keys(): - lang_family = wl_conversion.get_lang_family(main, lang_code) - - assert lang_family == GET_LANG_FAMILY[lang_code] - def test_to_encoding_code(): for encoding_text, encoding_code in settings_file_encodings.items(): assert wl_conversion.to_encoding_code(main, encoding_text) == encoding_code @@ -108,14 +98,23 @@ def test_to_encoding_text(): for encoding_text, encoding_code in 
settings_file_encodings.items() }[encoding_code] + with pytest.raises(Exception): + wl_conversion.to_encoding_text(main, 'test') + def test_to_yes_no_code(): assert wl_conversion.to_yes_no_code('Yes') is True assert wl_conversion.to_yes_no_code('No') is False + with pytest.raises(Exception): + wl_conversion.to_yes_no_code('test') + def test_to_yes_no_text(): assert wl_conversion.to_yes_no_text(True) == 'Yes' assert wl_conversion.to_yes_no_text(False) == 'No' + with pytest.raises(Exception): + wl_conversion.to_yes_no_text('test') + if __name__ == '__main__': test_normalize_lang_code() test_to_lang_code() @@ -127,7 +126,6 @@ def test_to_yes_no_text(): test_to_iso_639_1() test_remove_lang_code_suffixes() - test_get_lang_family() test_to_encoding_code() test_to_encoding_text() diff --git a/tests/tests_utils/test_detection.py b/tests/tests_utils/test_detection.py index c26fcbf17..e7f8e8ea6 100644 --- a/tests/tests_utils/test_detection.py +++ b/tests/tests_utils/test_detection.py @@ -36,12 +36,18 @@ def check_encodings_detected(test_file_dir, encodings, text): with open(file_path, 'w', encoding = encoding, errors = 'replace') as f: f.write(text) + main.settings_custom['files']['auto_detection_settings']['num_lines_no_limit'] = True + encoding_detected_no_limit = wl_detection.detect_encoding(main, file_path) + + main.settings_custom['files']['auto_detection_settings']['num_lines_no_limit'] = False encoding_detected = wl_detection.detect_encoding(main, file_path) + # Check whether the detected code could be successfully converted to text encoding_detected_text = wl_conversion.to_encoding_text(main, encoding_detected) print(f'{encoding} detected as {encoding_detected} / {encoding_detected_text}') + assert encoding_detected == encoding_detected_no_limit assert encoding_detected_text encodings_detected.append(encoding_detected) @@ -53,13 +59,18 @@ def test_detection_encoding(): os.makedirs(test_file_dir, exist_ok = True) + with open(f'{test_file_dir}/test.exe', 'wb') as f: + f.write(b'\xFF\x00\x00') + + assert wl_detection.detect_encoding(main, f'{test_file_dir}/test.exe') == 'utf_8' + try: # All languages # Charset Normalizer does not return "utf_8_sig" # Reference: https://github.com/Ousret/charset_normalizer/pull/38 check_encodings_detected( test_file_dir = test_file_dir, - encodings = ['utf_8', 'utf_16', 'utf_16_be', 'utf_16_le', 'utf_32', 'utf_32_be', 'utf_32_le'], # 'utf_8_sig', 'utf_7' + encodings = ['utf_8', 'utf_8_sig', 'utf_16', 'utf_16_be', 'utf_16_le', 'utf_32', 'utf_32_be', 'utf_32_le'], # 'utf_7' text = wl_test_lang_examples.ENCODING_FRA ) # Arabic @@ -303,6 +314,14 @@ def test_detection_lang(): os.makedirs(test_file_dir, exist_ok = True) + file = {'path': f'{test_file_dir}/detect_lang_file_fallback.txt', 'encoding': 'ascii'} + + with open(file['path'], 'w', encoding = 'gb2312') as f: + f.write('测试') + + assert wl_detection.detect_lang_file(main, file) == main.settings_custom['files']['default_settings']['lang'] + assert wl_detection.detect_lang_text(main, '\x00') == 'other' + try: for lang in [ 'afr', 'sqi', 'ara', 'hye', 'aze', diff --git a/tests/tests_utils/test_misc.py b/tests/tests_utils/test_misc.py index 9ef1b4951..6c6a84498 100644 --- a/tests/tests_utils/test_misc.py +++ b/tests/tests_utils/test_misc.py @@ -25,14 +25,6 @@ main = wl_test_init.Wl_Test_Main() -def test_change_file_owner_to_user(): - with open('test', 'wb'): - pass - - wl_misc.change_file_owner_to_user('test') - - os.remove('test') - def test_check_os(): is_windows, is_macos, is_linux = wl_misc.check_os() @@ -43,22 
+35,39 @@ def test_check_os(): elif platform.system() == 'Linux': assert not is_windows and not is_macos and is_linux -def test_flatten_list(): - assert list(wl_misc.flatten_list([1, 2, [3, 4, [5, 6]]])) == [1, 2, 3, 4, 5, 6] - def test_get_linux_distro(): assert wl_misc.get_linux_distro() == 'ubuntu' +def test_change_file_owner_to_user(): + with open('test', 'wb'): + pass + + wl_misc.change_file_owner_to_user('test') + + os.remove('test') + def test_get_wl_ver(): assert re.search(r'^[0-9]+\.[0-9]+\.[0-9]$', str(wl_misc.get_wl_ver())) -def test_merge_dicts(): - assert wl_misc.merge_dicts([{1: 10}, {1: 20, 2: 30}]) == {1: [10, 20], 2: [0, 30]} - assert wl_misc.merge_dicts([{1: [10, 20]}, {1: [30, 40], 2: [50, 60]}]) == {1: [[10, 20], [30, 40]], 2: [[0, 0], [50, 60]]} +def test_wl_get_proxies(): + proxy_settings = main.settings_custom['general']['proxy_settings'] -def test_normalize_nums(): - assert wl_misc.normalize_nums([1, 2, 3, 4, 5], 0, 100) == [0, 25, 50, 75, 100] - assert wl_misc.normalize_nums([1, 2, 3, 4, 5], 0, 100, reverse = True) == [100, 75, 50, 25, 0] + proxy_settings['use_proxy'] = False + assert wl_misc.wl_get_proxies(main) is None + + proxy_settings['use_proxy'] = True + proxy_settings['username'] = 'username' + proxy_settings['password'] = 'password' + proxy_settings['address'] = 'address' + proxy_settings['port'] = 'port' + + assert wl_misc.wl_get_proxies(main) == {'http': 'http://username:password@address:port', 'https': 'http://username:password@address:port'} + + proxy_settings['username'] = '' + assert wl_misc.wl_get_proxies(main) == {'http': 'http://address:port', 'https': 'http://address:port'} + + # Clear proxies settings + proxy_settings['use_proxy'] = False URL_VER = 'https://raw.githubusercontent.com/BLKSerene/Wordless/main/VERSION' @@ -68,18 +77,42 @@ def test_wl_download(): assert r assert not err_msg + r, err_msg = wl_misc.wl_download(main, 'https://httpstat.us/404') + + assert r.status_code == 404 + assert err_msg + + r, err_msg = wl_misc.wl_download(main, 'test') + + assert r is None + assert err_msg + def test_wl_download_file_size(): - file_size = wl_misc.wl_download_file_size(main, URL_VER) + assert wl_misc.wl_download_file_size(main, URL_VER) + assert wl_misc.wl_download_file_size(main, 'test') == 0 + +def test_flatten_list(): + assert list(wl_misc.flatten_list([1, 2, [3, 4, [5, 6]]])) == [1, 2, 3, 4, 5, 6] + +def test_merge_dicts(): + assert wl_misc.merge_dicts([{1: 10}, {1: 20, 2: 30}]) == {1: [10, 20], 2: [0, 30]} + assert wl_misc.merge_dicts([{1: [10, 20]}, {1: [30, 40], 2: [50, 60]}]) == {1: [[10, 20], [30, 40]], 2: [[0, 0], [50, 60]]} - assert file_size +def test_normalize_nums(): + assert wl_misc.normalize_nums([1, 2, 3, 4, 5], 0, 100) == [0, 25, 50, 75, 100] + assert wl_misc.normalize_nums([1, 2, 3, 4, 5], 0, 100, reverse = True) == [100, 75, 50, 25, 0] + assert wl_misc.normalize_nums([1, 2, 3, 4, 5], 0, 0) == [0] * 5 if __name__ == '__main__': - test_change_file_owner_to_user() test_check_os() - test_flatten_list() test_get_linux_distro() + test_change_file_owner_to_user() + test_get_wl_ver() - test_merge_dicts() - test_normalize_nums() + test_wl_get_proxies() test_wl_download() test_wl_download_file_size() + + test_flatten_list() + test_merge_dicts() + test_normalize_nums() diff --git a/tests/tests_utils/test_paths.py b/tests/tests_utils/test_paths.py index 107ce61fc..1214e728e 100644 --- a/tests/tests_utils/test_paths.py +++ b/tests/tests_utils/test_paths.py @@ -17,8 +17,9 @@ # 
---------------------------------------------------------------------- import os +import sys -from wordless.wl_utils import wl_paths +from wordless.wl_utils import wl_paths, wl_misc def test_get_normalized_path(): assert wl_paths.get_normalized_path('.') != '.' @@ -37,6 +38,19 @@ def test_get_path_file(): assert wl_paths.get_path_file('a', 'b', 'c').endswith(os.path.sep.join(['a', 'b', 'c'])) assert wl_paths.get_path_file('a', '..', 'b').endswith('b') + sys._MEIPASS = 'test' + + assert wl_paths.get_path_file('a', internal = True).endswith(os.path.sep.join(['test', 'a'])) + assert wl_paths.get_path_file('a', internal = False).endswith('a') + + check_os_orig = wl_misc.check_os + wl_misc.check_os = lambda: (False, True, False) + + assert wl_paths.get_path_file('a', internal = False).endswith(os.path.sep.join(['MacOS', 'a'])) + + wl_misc.check_os = check_os_orig + del sys._MEIPASS + def test_get_path_data(): assert wl_paths.get_path_data('a').endswith(os.path.sep.join(['data', 'a'])) diff --git a/tests/wl_test_doc.py b/tests/wl_test_doc.py index 808374c38..5c8ece4b0 100644 --- a/tests/wl_test_doc.py +++ b/tests/wl_test_doc.py @@ -24,7 +24,7 @@ def wl_test_supported_langs(main): langs_supported = [ (lang_name, lang_code_639_3) - for lang_name, (lang_code_639_3, _, _) in main.settings_global['langs'].items() + for lang_name, (lang_code_639_3, _) in main.settings_global['langs'].items() ] # Translations @@ -126,10 +126,7 @@ def wl_test_supported_encodings(main): len_max_encodings = max((len(encoding) for encoding in encodings)) for lang, encoding in zip(langs, encodings): - if encoding in ['UTF-8 with BOM', 'UTF-8 带签名']: - print(f'{lang:{len_max_langs}}|{encoding:{len_max_encodings}}|✖️') - else: - print(f'{lang:{len_max_langs}}|{encoding:{len_max_encodings}}|✔') + print(f'{lang:{len_max_langs}}|{encoding:{len_max_encodings}}|✔') print() diff --git a/tests/wl_test_lang_examples.py b/tests/wl_test_lang_examples.py index 4f9f4dcd3..3ffd77a5d 100644 --- a/tests/wl_test_lang_examples.py +++ b/tests/wl_test_lang_examples.py @@ -389,7 +389,6 @@ SENTENCE_POR_BR = SENTENCE_POR_PT = TEXT_POR_BR[0] SENTENCE_RON = TEXT_RON[0] SENTENCE_PAN_GURU = 'ਪੰਜਾਬੀ ਭਾਸ਼ਾ (ਸ਼ਾਹਮੁਖੀ: ‎پنجابی, ਪੰਜਾਬੀ) ਪੰਜਾਬ ਦੀ ਭਾਸ਼ਾ ਹੈ, ਜਿਸ ਨੂੰ ਪੰਜਾਬ ਖੇਤਰ ਦੇ ਵਸਨੀਕ ਜਾਂ ਸੰਬੰਧਿਤ ਲੋਕ ਬੋਲਦੇ ਹਨ।[18]' -SENTENCE_QUE = 'Qhichwa simi icha Runasimi ñisqaqa Urin Apya Yalapi rimasqan rimaymi.' 
SENTENCE_RUS = TEXT_RUS[0] SENTENCE_ORV = TEXT_ORV[0] SENTENCE_SME = TEXT_SME[0] diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py index c0d026cd8..7a7be94a8 100644 --- a/wordless/wl_measures/wl_measures_readability.py +++ b/wordless/wl_measures/wl_measures_readability.py @@ -90,13 +90,13 @@ def get_num_words_ltrs(words, len_min = 1, len_max = None): return len([ True for word in words - if len_min <= len([char for char in word if char.isalpha()]) <= len_max + if len_min <= len([char for char in word if char.isalpha()]) <= len_max ]) else: return len([ True for word in words - if len([char for char in word if char.isalpha()]) >= len_min + if len([char for char in word if char.isalpha()]) >= len_min ]) def get_num_words_syls(syls_words, len_min = 1, len_max = None): @@ -1068,38 +1068,38 @@ def nws(main, text): return nws # Estimate number of syllables in Arabic texts by counting short, long, and stress syllables -# Reference: https://github.com/textstat/textstat/blob/9bf37414407bcaaa45c498478ee383c8738e5d0c/textstat/textstat.py#L569 -def _get_num_syls_ara(text): - short_count = 0 - long_count = 0 - - # tashkeel: fatha | damma | kasra - tashkeel = [r'\u064E', r'\u064F', r'\u0650'] - char_list = list(re.sub(r"[^\w\s\']", '', text)) - - for t in tashkeel: - for i, c in enumerate(char_list): - if c != t: - continue - - # Only if a character is a tashkeel, has a successor and is followed by an alef, waw or yaaA - if ( - i + 1 < len(char_list) - and char_list[i + 1] in ['\u0627', '\u0648', '\u064a'] - ): - long_count += 1 +# References: +# https://github.com/drelhaj/OsmanReadability/blob/master/src/org/project/osman/process/Syllables.java +# https://github.com/textstat/textstat/blob/9bf37414407bcaaa45c498478ee383c8738e5d0c/textstat/textstat.py#L569 +def _get_num_syls_ara(word): + count_short = 0 + count_long = 0 + + # Tashkeel: fatha, damma, kasra + tashkeel = ['\u064E', '\u064F', '\u0650'] + + for i, char in enumerate(word): + if char not in tashkeel: + continue + + # Only if a character is a tashkeel, has a successor, and is followed by an alef, waw, or yeh + if i + 1 < len(word): + if word[i + 1] in ['\u0627', '\u0648', '\u064A']: + count_long += 1 else: - short_count += 1 + count_short += 1 + else: + count_short += 1 - # stress syllables: tanween fatih | tanween damm | tanween kasr | shadda - stress_pattern = re.compile(r'[\u064B\u064C\u064D\u0651]') - stress_count = len(stress_pattern.findall(text)) + # Stress syllables: tanween fatha, tanween damma, tanween kasra, shadda + count_stress = len(re.findall(r'[\u064B\u064C\u064D\u0651]', word)) - if short_count == 0: - text = re.sub(r'[\u0627\u0649\?\.\!\,\s*]', '', text) - short_count = len(text) - 2 + if count_short == 0: + word = re.sub(r'[\u0627\u0649\?\.\!\,\s]', '', word) + count_short = max(0, len(word) - 2) - return short_count + 2 * (long_count + stress_count) + # Reference: https://github.com/drelhaj/OsmanReadability/blob/405b927ef3fde200fa08efe12ec2f39b8716e4be/src/org/project/osman/process/OsmanReadability.java#L259 + return count_short + 2 * (count_long + count_stress) # OSMAN # Reference: El-Haj, M., & Rayson, P. (2016). OSMAN: A novel Arabic readability metric. In N. Calzolari, K. Choukri, T. Declerck, S. Goggi, M. Grobelnik, B. Maegaard, J. Mariani, H. Mazo, A. Moreno, J. Odijk, & S. Piperidis (Eds.), Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016) (pp. 250–255). European Language Resources Association. 
http://www.lrec-conf.org/proceedings/lrec2016/index.html @@ -1120,9 +1120,13 @@ def osman(main, text): for word, num_syls in zip(text.words_flat, nums_syls_tokens): if ( num_syls > 4 + # Faseeh letters + # Reference: https://github.com/drelhaj/OsmanReadability/blob/405b927ef3fde200fa08efe12ec2f39b8716e4be/src/org/project/osman/process/OsmanReadability.java#L264 and ( - any((letter in word for letter in ['ء', 'ئ', 'ؤ', 'ذ', 'ظ'])) - or any((word.endswith(letters) for letters in ['وا', 'ون'])) + # Hamza (ء), yeh with hamza above (ئ), waw with hamza above (ؤ), zah (ظ), thal (ذ) + any((char in word for char in ['\u0621', '\u0626', '\u0624', '\u0638', '\u0630'])) + # Waw noon (ون), waw alef (وا) + or word.endswith(('\u0648\u0646', '\u0648\u0627')) ) ): h += 1 diff --git a/wordless/wl_measures/wl_measures_statistical_significance.py b/wordless/wl_measures/wl_measures_statistical_significance.py index 29836f010..512414a4b 100644 --- a/wordless/wl_measures/wl_measures_statistical_significance.py +++ b/wordless/wl_measures/wl_measures_statistical_significance.py @@ -60,20 +60,26 @@ def yatess_correction(o11s, o12s, o21s, o22s, e11s, e12s, e21s, e22s): return o11s, o12s, o21s, o22s -# Fisher's Exact Test -# References: Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference (pp. 188–200). The South–Central Regional SAS Users' Group. -def fishers_exact_test(main, o11s, o12s, o21s, o22s): - direction = main.settings_custom['measures']['statistical_significance']['fishers_exact_test']['direction'] - +def get_alt(direction): if direction == _tr('wl_measures_statistical_significance', 'Two-tailed'): - alternative = 'two-sided' + alt = 'two-sided' elif direction == _tr('wl_measures_statistical_significance', 'Left-tailed'): - alternative = 'less' + alt = 'less' elif direction == _tr('wl_measures_statistical_significance', 'Right-tailed'): - alternative = 'greater' + alt = 'greater' + + return alt + +# Fisher's Exact Test +# References: Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference (pp. 188–200). The South–Central Regional SAS Users' Group. +def fishers_exact_test(main, o11s, o12s, o21s, o22s): + settings = main.settings_custom['measures']['statistical_significance']['fishers_exact_test'] p_vals = numpy.array([ - scipy.stats.fisher_exact([[o11, o12], [o21, o22]], alternative = alternative)[1] + scipy.stats.fisher_exact( + [[o11, o12], [o21, o22]], + alternative = get_alt(settings['direction']) + )[1] for o11, o12, o21, o22 in zip(o11s, o12s, o21s, o22s) ]) @@ -82,11 +88,11 @@ def fishers_exact_test(main, o11s, o12s, o21s, o22s): # Log-likelihood Ratio # References: Dunning, T. E. (1993). Accurate methods for the statistics of surprise and coincidence. Computational Linguistics, 19(1), 61–74. 
def log_likelihood_ratio_test(main, o11s, o12s, o21s, o22s): - apply_correction = main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']['apply_correction'] + settings = main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test'] e11s, e12s, e21s, e22s = get_freqs_expected(o11s, o12s, o21s, o22s) - if apply_correction: + if settings['apply_correction']: o11s, o12s, o21s, o22s = yatess_correction(o11s, o12s, o21s, o22s, e11s, e12s, e21s, e22s) gs_11 = o11s * wl_measure_utils.numpy_log(wl_measure_utils.numpy_divide(o11s, e11s)) @@ -105,15 +111,7 @@ def log_likelihood_ratio_test(main, o11s, o12s, o21s, o22s): # Mann-Whitney U Test # References: Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics, 6(1), 232–263. https://doi.org/10.1075/ijcl.6.1.05kil def mann_whitney_u_test(main, freqs_x1s, freqs_x2s): - direction = main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['direction'] - apply_correction = main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['apply_correction'] - - if direction == _tr('wl_measures_statistical_significance', 'Two-tailed'): - alternative = 'two-sided' - elif direction == _tr('wl_measures_statistical_significance', 'Left-tailed'): - alternative = 'less' - elif direction == _tr('wl_measures_statistical_significance', 'Right-tailed'): - alternative = 'greater' + settings = main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test'] num_types = len(freqs_x1s) u1s = numpy.empty(shape = num_types, dtype = numpy.float64) @@ -122,8 +120,8 @@ def mann_whitney_u_test(main, freqs_x1s, freqs_x2s): for i, (freqs_x1, freqs_x2) in enumerate(zip(freqs_x1s, freqs_x2s)): u1, p_val = scipy.stats.mannwhitneyu( freqs_x1, freqs_x2, - use_continuity = apply_correction, - alternative = alternative + use_continuity = settings['apply_correction'], + alternative = get_alt(settings['direction']) ) u1s[i] = u1 @@ -136,11 +134,11 @@ def mann_whitney_u_test(main, freqs_x1s, freqs_x2s): # Hofland, K., & Johanson, S. (1982). Word frequencies in British and American English. Norwegian Computing Centre for the Humanities. # Oakes, M. P. (1998). Statistics for Corpus Linguistics. Edinburgh University Press. def pearsons_chi_squared_test(main, o11s, o12s, o21s, o22s): - apply_correction = main.settings_custom['measures']['statistical_significance']['pearsons_chi_squared_test']['apply_correction'] + settings = main.settings_custom['measures']['statistical_significance']['pearsons_chi_squared_test'] e11s, e12s, e21s, e22s = get_freqs_expected(o11s, o12s, o21s, o22s) - if apply_correction: + if settings['apply_correction']: o11s, o12s, o21s, o22s = yatess_correction(o11s, o12s, o21s, o22s, e11s, e12s, e21s, e22s) chi2s_11 = wl_measure_utils.numpy_divide((o11s - e11s) ** 2, e11s) @@ -159,7 +157,7 @@ def pearsons_chi_squared_test(main, o11s, o12s, o21s, o22s): # Student's t-test (1-sample) # References: Church, K., Gale, W., Hanks, P., & Hindle, D. (1991). Using statistics in lexical analysis. In U. Zernik (Ed.), Lexical acquisition: Exploiting on-line resources to build a lexicon (pp. 115–164). Psychology Press. 
def students_t_test_1_sample(main, o11s, o12s, o21s, o22s): - direction = main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']['direction'] + settings = main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample'] oxxs = o11s + o12s + o21s + o22s e11s, _, _, _ = get_freqs_expected(o11s, o12s, o21s, o22s) @@ -167,33 +165,22 @@ def students_t_test_1_sample(main, o11s, o12s, o21s, o22s): t_stats = wl_measure_utils.numpy_divide(o11s - e11s, numpy.sqrt(o11s * (1 - wl_measure_utils.numpy_divide(o11s, oxxs)))) p_vals = numpy.empty_like(t_stats) - if direction == _tr('wl_measures_statistical_significance', 'Two-tailed'): + if settings['direction'] == _tr('wl_measures_statistical_significance', 'Two-tailed'): for i, (oxx, t_stat) in enumerate(zip(oxxs, t_stats)): p_vals[i] = scipy.stats.distributions.t.sf(numpy.abs(t_stat), oxx - 1) * 2 if oxx > 1 else 1 - elif direction == _tr('wl_measures_statistical_significance', 'Left-tailed'): + elif settings['direction'] == _tr('wl_measures_statistical_significance', 'Left-tailed'): for i, (oxx, t_stat) in enumerate(zip(oxxs, t_stats)): p_vals[i] = scipy.stats.distributions.t.cdf(t_stat, oxx - 1) if oxx > 1 else 1 - elif direction == _tr('wl_measures_statistical_significance', 'Right-tailed'): + elif settings['direction'] == _tr('wl_measures_statistical_significance', 'Right-tailed'): for i, (oxx, t_stat) in enumerate(zip(oxxs, t_stats)): p_vals[i] = scipy.stats.distributions.t.sf(t_stat, oxx - 1) if oxx > 1 else 1 return t_stats, p_vals -def _students_t_test_2_sample_alt(direction): - if direction == _tr('wl_measures_statistical_significance', 'Two-tailed'): - alt = 'two-sided' - elif direction == _tr('wl_measures_statistical_significance', 'Left-tailed'): - alt = 'less' - elif direction == _tr('wl_measures_statistical_significance', 'Right-tailed'): - alt = 'greater' - - return alt - # Student's t-test (2-sample) # References: Paquot, M., & Bestgen, Y. (2009). Distinctive words in academic writing: A comparison of three statistical tests for keyword extraction. Language and Computers, 68, 247–269. def students_t_test_2_sample(main, freqs_x1s, freqs_x2s): - direction = main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']['direction'] - alt = _students_t_test_2_sample_alt(direction) + settings = main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample'] num_types = len(freqs_x1s) t_stats = numpy.empty(shape = num_types, dtype = numpy.float64) @@ -201,7 +188,11 @@ def students_t_test_2_sample(main, freqs_x1s, freqs_x2s): for i, (freqs_x1, freqs_x2) in enumerate(zip(freqs_x1s, freqs_x2s)): if any(freqs_x1) or any(freqs_x2): - t_stat, p_val = scipy.stats.ttest_ind(freqs_x1, freqs_x2, equal_var = True, alternative = alt) + t_stat, p_val = scipy.stats.ttest_ind( + freqs_x1, freqs_x2, + equal_var = True, + alternative = get_alt(settings['direction']) + ) else: t_stat = 0 p_val = 1 @@ -229,20 +220,20 @@ def _z_score_p_val(z_scores, direction): # z-score # References: Dennis, S. F. (1964). The construction of a thesaurus automatically from a sample of text. In M. E. Stevens, V. E. Giuliano, & L. B. Heilprin (Eds.), Proceedings of the symposium on statistical association methods for mechanized documentation (pp. 61–148). National Bureau of Standards. 
def z_score(main, o11s, o12s, o21s, o22s): - direction = main.settings_custom['measures']['statistical_significance']['z_score']['direction'] + settings = main.settings_custom['measures']['statistical_significance']['z_score'] oxxs = o11s + o12s + o21s + o22s e11s, _, _, _ = get_freqs_expected(o11s, o12s, o21s, o22s) z_scores = wl_measure_utils.numpy_divide(o11s - e11s, numpy.sqrt(e11s * (1 - wl_measure_utils.numpy_divide(e11s, oxxs)))) - p_vals = _z_score_p_val(z_scores, direction) + p_vals = _z_score_p_val(z_scores, settings['direction']) return z_scores, p_vals # z-score (Berry-Rogghe) # References: Berry-Rogghe, G. L. M. (1973). The computation of collocations and their relevance in lexical studies. In A. J. Aiken, R. W. Bailey, & N. Hamilton-Smith (Eds.), The computer and literary studies (pp. 103–112). Edinburgh University Press. def z_score_berry_rogghe(main, o11s, o12s, o21s, o22s, span): - direction = main.settings_custom['measures']['statistical_significance']['z_score_berry_rogghe']['direction'] + settings = main.settings_custom['measures']['statistical_significance']['z_score_berry_rogghe'] o1xs, o2xs, ox1s, _ = get_freqs_marginal(o11s, o12s, o21s, o22s) @@ -251,6 +242,6 @@ def z_score_berry_rogghe(main, o11s, o12s, o21s, o22s, span): es = ps * o1xs * span z_scores = wl_measure_utils.numpy_divide(o11s - es, numpy.sqrt(es * (1 - ps))) - p_vals = _z_score_p_val(z_scores, direction) + p_vals = _z_score_p_val(z_scores, settings['direction']) return z_scores, p_vals diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index a50db6ff0..769fa2a4c 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -31,129 +31,129 @@ SETTINGS_GLOBAL = { # Language names should be always capitalized 'langs': { - _tr('wl_settings_global', 'Afrikaans'): ['afr', 'af', 'Indo-European'], - _tr('wl_settings_global', 'Albanian'): ['sqi', 'sq', 'Indo-European'], - _tr('wl_settings_global', 'Amharic'): ['amh', 'am', 'Afro-Asiatic'], - _tr('wl_settings_global', 'Arabic'): ['ara', 'ar', 'Afro-Asiatic'], - _tr('wl_settings_global', 'Armenian (Eastern)'): ['hye', 'hy', 'Indo-European'], - _tr('wl_settings_global', 'Armenian (Western)'): ['hyw', 'hyw', 'Indo-European'], - _tr('wl_settings_global', 'Assamese'): ['asm', 'as', 'Indo-European'], - _tr('wl_settings_global', 'Asturian'): ['ast', 'ast', 'Indo-European'], - _tr('wl_settings_global', 'Azerbaijani'): ['aze', 'az', 'Turkic'], - _tr('wl_settings_global', 'Basque'): ['eus', 'eu', 'Language isolate'], - _tr('wl_settings_global', 'Belarusian'): ['bel', 'be', 'Indo-European'], - _tr('wl_settings_global', 'Bengali'): ['ben', 'bn', 'Indo-European'], - _tr('wl_settings_global', 'Bulgarian'): ['bul', 'bg', 'Indo-European'], - _tr('wl_settings_global', 'Burmese'): ['mya', 'my', 'Sino-Tibetan'], - _tr('wl_settings_global', 'Buryat (Russia)'): ['bxr', 'bxr', 'Mongolic'], - _tr('wl_settings_global', 'Catalan'): ['cat', 'ca', 'Indo-European'], - _tr('wl_settings_global', 'Chinese (Classical)'): ['lzh', 'lzh', 'Sino-Tibetan'], - _tr('wl_settings_global', 'Chinese (Simplified)'): ['zho_cn', 'zh_cn', 'Sino-Tibetan'], - _tr('wl_settings_global', 'Chinese (Traditional)'): ['zho_tw', 'zh_tw', 'Sino-Tibetan'], - _tr('wl_settings_global', 'Church Slavonic (Old)'): ['chu', 'cu', 'Indo-European'], - _tr('wl_settings_global', 'Coptic'): ['cop', 'cop', 'Afro-Asiatic'], - _tr('wl_settings_global', 'Croatian'): ['hrv', 'hr', 'Indo-European'], - _tr('wl_settings_global', 'Czech'): 
['ces', 'cs', 'Indo-European'], - _tr('wl_settings_global', 'Danish'): ['dan', 'da', 'Indo-European'], - _tr('wl_settings_global', 'Dutch'): ['nld', 'nl', 'Indo-European'], - _tr('wl_settings_global', 'English (Middle)'): ['enm', 'enm', 'Indo-European'], - _tr('wl_settings_global', 'English (United Kingdom)'): ['eng_gb', 'en_gb', 'Indo-European'], - _tr('wl_settings_global', 'English (United States)'): ['eng_us', 'en_us', 'Indo-European'], - _tr('wl_settings_global', 'Erzya'): ['myv', 'myv', 'Uralic'], - _tr('wl_settings_global', 'Esperanto'): ['epo', 'eo', 'Constructed'], - _tr('wl_settings_global', 'Estonian'): ['est', 'et', 'Uralic'], - _tr('wl_settings_global', 'Faroese'): ['fao', 'fo', 'Indo-European'], - _tr('wl_settings_global', 'Finnish'): ['fin', 'fi', 'Uralic'], - _tr('wl_settings_global', 'French'): ['fra', 'fr', 'Indo-European'], - _tr('wl_settings_global', 'French (Old)'): ['fro', 'fro', 'Indo-European'], - _tr('wl_settings_global', 'Galician'): ['glg', 'gl', 'Indo-European'], - _tr('wl_settings_global', 'Georgian'): ['kat', 'ka', 'Kartvelian'], - _tr('wl_settings_global', 'German (Austria)'): ['deu_at', 'de_at', 'Indo-European'], - _tr('wl_settings_global', 'German (Germany)'): ['deu_de', 'de_de', 'Indo-European'], - _tr('wl_settings_global', 'German (Switzerland)'): ['deu_ch', 'de_ch', 'Indo-European'], - _tr('wl_settings_global', 'Gothic'): ['got', 'got', 'Indo-European'], - _tr('wl_settings_global', 'Greek (Ancient)'): ['grc', 'grc', 'Unclassified'], - _tr('wl_settings_global', 'Greek (Modern)'): ['ell', 'el', 'Indo-European'], - _tr('wl_settings_global', 'Gujarati'): ['guj', 'gu', 'Indo-European'], - _tr('wl_settings_global', 'Hebrew (Ancient)'): ['hbo', 'hbo', 'Afro-Asiatic'], - _tr('wl_settings_global', 'Hebrew (Modern)'): ['heb', 'he', 'Afro-Asiatic'], - _tr('wl_settings_global', 'Hindi'): ['hin', 'hi', 'Indo-European'], - _tr('wl_settings_global', 'Hungarian'): ['hun', 'hu', 'Uralic'], - _tr('wl_settings_global', 'Icelandic'): ['isl', 'is', 'Indo-European'], - _tr('wl_settings_global', 'Indonesian'): ['ind', 'id', 'Austronesian'], - _tr('wl_settings_global', 'Irish'): ['gle', 'ga', 'Indo-European'], - _tr('wl_settings_global', 'Italian'): ['ita', 'it', 'Indo-European'], - _tr('wl_settings_global', 'Japanese'): ['jpn', 'ja', 'Japonic'], - _tr('wl_settings_global', 'Kannada'): ['kan', 'kn', 'Dravidian'], - _tr('wl_settings_global', 'Kazakh'): ['kaz', 'kk', 'Turkic'], - _tr('wl_settings_global', 'Khmer'): ['khm', 'km', 'Austroasiatic'], - _tr('wl_settings_global', 'Korean'): ['kor', 'ko', 'Koreanic'], - _tr('wl_settings_global', 'Kurdish (Kurmanji)'): ['kmr', 'kmr', 'Indo-European'], - _tr('wl_settings_global', 'Kyrgyz'): ['kir', 'ky', 'Turkic'], - _tr('wl_settings_global', 'Lao'): ['lao', 'lo', 'Kra-Dai'], - _tr('wl_settings_global', 'Latin'): ['lat', 'la', 'Indo-European'], - _tr('wl_settings_global', 'Latvian'): ['lav', 'lv', 'Indo-European'], - _tr('wl_settings_global', 'Ligurian'): ['lij', 'lij', 'Unclassified'], - _tr('wl_settings_global', 'Lithuanian'): ['lit', 'lt', 'Indo-European'], - _tr('wl_settings_global', 'Luganda'): ['lug', 'lg', 'Niger-Congo'], - _tr('wl_settings_global', 'Luxembourgish'): ['ltz', 'lb', 'Indo-European'], - _tr('wl_settings_global', 'Macedonian'): ['mkd', 'mk', 'Indo-European'], - _tr('wl_settings_global', 'Malay'): ['msa', 'ms', 'Austronesian'], - _tr('wl_settings_global', 'Malayalam'): ['mal', 'ml', 'Dravidian'], - _tr('wl_settings_global', 'Maltese'): ['mlt', 'mt', 'Afro-Asiatic'], - _tr('wl_settings_global', 'Manx'): ['glv', 'gv', 
'Indo-European'], - _tr('wl_settings_global', 'Marathi'): ['mar', 'mr', 'Indo-European'], - _tr('wl_settings_global', 'Meitei (Meitei script)'): ['mni_mtei', 'mni_mtei', 'Sino-Tibetan'], - _tr('wl_settings_global', 'Mongolian'): ['mon', 'mn', 'Mongolic'], - _tr('wl_settings_global', 'Nepali'): ['nep', 'ne', 'Indo-European'], - _tr('wl_settings_global', 'Nigerian Pidgin'): ['pcm', 'pcm', 'English Creole'], - _tr('wl_settings_global', 'Norwegian (Bokmål)'): ['nob', 'nb', 'Indo-European'], - _tr('wl_settings_global', 'Norwegian (Nynorsk)'): ['nno', 'nn', 'Indo-European'], - _tr('wl_settings_global', 'Odia'): ['ori', 'or', 'Indo-European'], - _tr('wl_settings_global', 'Persian'): ['fas', 'fa', 'Indo-European'], - _tr('wl_settings_global', 'Polish'): ['pol', 'pl', 'Indo-European'], - _tr('wl_settings_global', 'Pomak'): ['qpm', 'qpm', 'Unclassified'], - _tr('wl_settings_global', 'Portuguese (Brazil)'): ['por_br', 'pt_br', 'Indo-European'], - _tr('wl_settings_global', 'Portuguese (Portugal)'): ['por_pt', 'pt_pt', 'Indo-European'], - _tr('wl_settings_global', 'Punjabi (Gurmukhi script)'): ['pan_guru', 'pa_guru', 'Indo-European'], - _tr('wl_settings_global', 'Romanian'): ['ron', 'ro', 'Indo-European'], - _tr('wl_settings_global', 'Russian'): ['rus', 'ru', 'Indo-European'], - _tr('wl_settings_global', 'Russian (Old)'): ['orv', 'orv', 'Indo-European'], - _tr('wl_settings_global', 'Sámi (Northern)'): ['sme', 'se', 'Uralic'], - _tr('wl_settings_global', 'Sanskrit'): ['san', 'sa', 'Indo-European'], - _tr('wl_settings_global', 'Scottish Gaelic'): ['gla', 'gd', 'Indo-European'], - _tr('wl_settings_global', 'Serbian (Cyrillic script)'): ['srp_cyrl', 'sr_cyrl', 'Indo-European'], - _tr('wl_settings_global', 'Serbian (Latin script)'): ['srp_latn', 'sr_latn', 'Indo-European'], - _tr('wl_settings_global', 'Sindhi'): ['snd', 'sd', 'Indo-European'], - _tr('wl_settings_global', 'Sinhala'): ['sin', 'si', 'Indo-European'], - _tr('wl_settings_global', 'Slovak'): ['slk', 'sk', 'Indo-European'], - _tr('wl_settings_global', 'Slovene'): ['slv', 'sl', 'Indo-European'], - _tr('wl_settings_global', 'Sorbian (Lower)'): ['dsb', 'dsb', 'Indo-European'], - _tr('wl_settings_global', 'Sorbian (Upper)'): ['hsb', 'hsb', 'Indo-European'], - _tr('wl_settings_global', 'Spanish'): ['spa', 'es', 'Indo-European'], - _tr('wl_settings_global', 'Swahili'): ['swa', 'sw', 'Niger-Congo'], - _tr('wl_settings_global', 'Swedish'): ['swe', 'sv', 'Indo-European'], - _tr('wl_settings_global', 'Tagalog'): ['tgl', 'tl', 'Austronesian'], - _tr('wl_settings_global', 'Tajik'): ['tgk', 'tg', 'Indo-European'], - _tr('wl_settings_global', 'Tamil'): ['tam', 'ta', 'Dravidian'], - _tr('wl_settings_global', 'Tatar'): ['tat', 'tt', 'Turkic'], - _tr('wl_settings_global', 'Telugu'): ['tel', 'te', 'Dravidian'], - _tr('wl_settings_global', 'Tetun (Dili)'): ['tdt', 'tdt', 'Unclassified'], - _tr('wl_settings_global', 'Thai'): ['tha', 'th', 'Tai-Kadai'], - _tr('wl_settings_global', 'Tibetan'): ['bod', 'bo', 'Sino-Tibetan'], - _tr('wl_settings_global', 'Tigrinya'): ['tir', 'ti', 'Afro-Asiatic'], - _tr('wl_settings_global', 'Tswana'): ['tsn', 'tn', 'Niger-Congo'], - _tr('wl_settings_global', 'Turkish'): ['tur', 'tr', 'Turkic'], - _tr('wl_settings_global', 'Ukrainian'): ['ukr', 'uk', 'Indo-European'], - _tr('wl_settings_global', 'Urdu'): ['urd', 'ur', 'Indo-European'], - _tr('wl_settings_global', 'Uyghur'): ['uig', 'ug', 'Turkic'], - _tr('wl_settings_global', 'Vietnamese'): ['vie', 'vi', 'Austroasiatic'], - _tr('wl_settings_global', 'Welsh'): ['cym', 'cy', 
'Indo-European'], - _tr('wl_settings_global', 'Wolof'): ['wol', 'wo', 'Niger-Congo'], - _tr('wl_settings_global', 'Yoruba'): ['yor', 'yo', 'Niger-Congo'], - _tr('wl_settings_global', 'Zulu'): ['zul', 'zu', 'Niger-Congo'], - - _tr('wl_settings_global', 'Other languages'): ['other', 'other', 'Unclassified'] + _tr('wl_settings_global', 'Afrikaans'): ['afr', 'af'], + _tr('wl_settings_global', 'Albanian'): ['sqi', 'sq'], + _tr('wl_settings_global', 'Amharic'): ['amh', 'am'], + _tr('wl_settings_global', 'Arabic'): ['ara', 'ar'], + _tr('wl_settings_global', 'Armenian (Eastern)'): ['hye', 'hy'], + _tr('wl_settings_global', 'Armenian (Western)'): ['hyw', 'hyw'], + _tr('wl_settings_global', 'Assamese'): ['asm', 'as'], + _tr('wl_settings_global', 'Asturian'): ['ast', 'ast'], + _tr('wl_settings_global', 'Azerbaijani'): ['aze', 'az'], + _tr('wl_settings_global', 'Basque'): ['eus', 'eu'], + _tr('wl_settings_global', 'Belarusian'): ['bel', 'be'], + _tr('wl_settings_global', 'Bengali'): ['ben', 'bn'], + _tr('wl_settings_global', 'Bulgarian'): ['bul', 'bg'], + _tr('wl_settings_global', 'Burmese'): ['mya', 'my'], + _tr('wl_settings_global', 'Buryat (Russia)'): ['bxr', 'bxr'], + _tr('wl_settings_global', 'Catalan'): ['cat', 'ca'], + _tr('wl_settings_global', 'Chinese (Classical)'): ['lzh', 'lzh'], + _tr('wl_settings_global', 'Chinese (Simplified)'): ['zho_cn', 'zh_cn'], + _tr('wl_settings_global', 'Chinese (Traditional)'): ['zho_tw', 'zh_tw'], + _tr('wl_settings_global', 'Church Slavonic (Old)'): ['chu', 'cu'], + _tr('wl_settings_global', 'Coptic'): ['cop', 'cop'], + _tr('wl_settings_global', 'Croatian'): ['hrv', 'hr'], + _tr('wl_settings_global', 'Czech'): ['ces', 'cs'], + _tr('wl_settings_global', 'Danish'): ['dan', 'da'], + _tr('wl_settings_global', 'Dutch'): ['nld', 'nl'], + _tr('wl_settings_global', 'English (Middle)'): ['enm', 'enm'], + _tr('wl_settings_global', 'English (United Kingdom)'): ['eng_gb', 'en_gb'], + _tr('wl_settings_global', 'English (United States)'): ['eng_us', 'en_us'], + _tr('wl_settings_global', 'Erzya'): ['myv', 'myv'], + _tr('wl_settings_global', 'Esperanto'): ['epo', 'eo'], + _tr('wl_settings_global', 'Estonian'): ['est', 'et'], + _tr('wl_settings_global', 'Faroese'): ['fao', 'fo'], + _tr('wl_settings_global', 'Finnish'): ['fin', 'fi'], + _tr('wl_settings_global', 'French'): ['fra', 'fr'], + _tr('wl_settings_global', 'French (Old)'): ['fro', 'fro'], + _tr('wl_settings_global', 'Galician'): ['glg', 'gl'], + _tr('wl_settings_global', 'Georgian'): ['kat', 'ka'], + _tr('wl_settings_global', 'German (Austria)'): ['deu_at', 'de_at'], + _tr('wl_settings_global', 'German (Germany)'): ['deu_de', 'de_de'], + _tr('wl_settings_global', 'German (Switzerland)'): ['deu_ch', 'de_ch'], + _tr('wl_settings_global', 'Gothic'): ['got', 'got'], + _tr('wl_settings_global', 'Greek (Ancient)'): ['grc', 'grc'], + _tr('wl_settings_global', 'Greek (Modern)'): ['ell', 'el'], + _tr('wl_settings_global', 'Gujarati'): ['guj', 'gu'], + _tr('wl_settings_global', 'Hebrew (Ancient)'): ['hbo', 'hbo'], + _tr('wl_settings_global', 'Hebrew (Modern)'): ['heb', 'he'], + _tr('wl_settings_global', 'Hindi'): ['hin', 'hi'], + _tr('wl_settings_global', 'Hungarian'): ['hun', 'hu'], + _tr('wl_settings_global', 'Icelandic'): ['isl', 'is'], + _tr('wl_settings_global', 'Indonesian'): ['ind', 'id'], + _tr('wl_settings_global', 'Irish'): ['gle', 'ga'], + _tr('wl_settings_global', 'Italian'): ['ita', 'it'], + _tr('wl_settings_global', 'Japanese'): ['jpn', 'ja'], + _tr('wl_settings_global', 'Kannada'): ['kan', 'kn'], + 
_tr('wl_settings_global', 'Kazakh'): ['kaz', 'kk'], + _tr('wl_settings_global', 'Khmer'): ['khm', 'km'], + _tr('wl_settings_global', 'Korean'): ['kor', 'ko'], + _tr('wl_settings_global', 'Kurdish (Kurmanji)'): ['kmr', 'kmr'], + _tr('wl_settings_global', 'Kyrgyz'): ['kir', 'ky'], + _tr('wl_settings_global', 'Lao'): ['lao', 'lo'], + _tr('wl_settings_global', 'Latin'): ['lat', 'la'], + _tr('wl_settings_global', 'Latvian'): ['lav', 'lv'], + _tr('wl_settings_global', 'Ligurian'): ['lij', 'lij'], + _tr('wl_settings_global', 'Lithuanian'): ['lit', 'lt'], + _tr('wl_settings_global', 'Luganda'): ['lug', 'lg'], + _tr('wl_settings_global', 'Luxembourgish'): ['ltz', 'lb'], + _tr('wl_settings_global', 'Macedonian'): ['mkd', 'mk'], + _tr('wl_settings_global', 'Malay'): ['msa', 'ms'], + _tr('wl_settings_global', 'Malayalam'): ['mal', 'ml'], + _tr('wl_settings_global', 'Maltese'): ['mlt', 'mt'], + _tr('wl_settings_global', 'Manx'): ['glv', 'gv'], + _tr('wl_settings_global', 'Marathi'): ['mar', 'mr'], + _tr('wl_settings_global', 'Meitei (Meitei script)'): ['mni_mtei', 'mni_mtei'], + _tr('wl_settings_global', 'Mongolian'): ['mon', 'mn'], + _tr('wl_settings_global', 'Nepali'): ['nep', 'ne'], + _tr('wl_settings_global', 'Nigerian Pidgin'): ['pcm', 'pcm'], + _tr('wl_settings_global', 'Norwegian (Bokmål)'): ['nob', 'nb'], + _tr('wl_settings_global', 'Norwegian (Nynorsk)'): ['nno', 'nn'], + _tr('wl_settings_global', 'Odia'): ['ori', 'or'], + _tr('wl_settings_global', 'Persian'): ['fas', 'fa'], + _tr('wl_settings_global', 'Polish'): ['pol', 'pl'], + _tr('wl_settings_global', 'Pomak'): ['qpm', 'qpm'], + _tr('wl_settings_global', 'Portuguese (Brazil)'): ['por_br', 'pt_br'], + _tr('wl_settings_global', 'Portuguese (Portugal)'): ['por_pt', 'pt_pt'], + _tr('wl_settings_global', 'Punjabi (Gurmukhi script)'): ['pan_guru', 'pa_guru'], + _tr('wl_settings_global', 'Romanian'): ['ron', 'ro'], + _tr('wl_settings_global', 'Russian'): ['rus', 'ru'], + _tr('wl_settings_global', 'Russian (Old)'): ['orv', 'orv'], + _tr('wl_settings_global', 'Sámi (Northern)'): ['sme', 'se'], + _tr('wl_settings_global', 'Sanskrit'): ['san', 'sa'], + _tr('wl_settings_global', 'Scottish Gaelic'): ['gla', 'gd'], + _tr('wl_settings_global', 'Serbian (Cyrillic script)'): ['srp_cyrl', 'sr_cyrl'], + _tr('wl_settings_global', 'Serbian (Latin script)'): ['srp_latn', 'sr_latn'], + _tr('wl_settings_global', 'Sindhi'): ['snd', 'sd'], + _tr('wl_settings_global', 'Sinhala'): ['sin', 'si'], + _tr('wl_settings_global', 'Slovak'): ['slk', 'sk'], + _tr('wl_settings_global', 'Slovene'): ['slv', 'sl'], + _tr('wl_settings_global', 'Sorbian (Lower)'): ['dsb', 'dsb'], + _tr('wl_settings_global', 'Sorbian (Upper)'): ['hsb', 'hsb'], + _tr('wl_settings_global', 'Spanish'): ['spa', 'es'], + _tr('wl_settings_global', 'Swahili'): ['swa', 'sw'], + _tr('wl_settings_global', 'Swedish'): ['swe', 'sv'], + _tr('wl_settings_global', 'Tagalog'): ['tgl', 'tl'], + _tr('wl_settings_global', 'Tajik'): ['tgk', 'tg'], + _tr('wl_settings_global', 'Tamil'): ['tam', 'ta'], + _tr('wl_settings_global', 'Tatar'): ['tat', 'tt'], + _tr('wl_settings_global', 'Telugu'): ['tel', 'te'], + _tr('wl_settings_global', 'Tetun (Dili)'): ['tdt', 'tdt'], + _tr('wl_settings_global', 'Thai'): ['tha', 'th'], + _tr('wl_settings_global', 'Tibetan'): ['bod', 'bo'], + _tr('wl_settings_global', 'Tigrinya'): ['tir', 'ti'], + _tr('wl_settings_global', 'Tswana'): ['tsn', 'tn'], + _tr('wl_settings_global', 'Turkish'): ['tur', 'tr'], + _tr('wl_settings_global', 'Ukrainian'): ['ukr', 'uk'], + _tr('wl_settings_global', 
+            _tr('wl_settings_global', 'Urdu'): ['urd', 'ur'],
+            _tr('wl_settings_global', 'Uyghur'): ['uig', 'ug'],
+            _tr('wl_settings_global', 'Vietnamese'): ['vie', 'vi'],
+            _tr('wl_settings_global', 'Welsh'): ['cym', 'cy'],
+            _tr('wl_settings_global', 'Wolof'): ['wol', 'wo'],
+            _tr('wl_settings_global', 'Yoruba'): ['yor', 'yo'],
+            _tr('wl_settings_global', 'Zulu'): ['zul', 'zu'],
+
+            _tr('wl_settings_global', 'Other languages'): ['other', 'other']
         },

         # Language and geographical names should be always capitalized
diff --git a/wordless/wl_utils/wl_conversion.py b/wordless/wl_utils/wl_conversion.py
index 6bcf6cab3..0927d370c 100644
--- a/wordless/wl_utils/wl_conversion.py
+++ b/wordless/wl_utils/wl_conversion.py
@@ -20,6 +20,8 @@

 _tr = QCoreApplication.translate

+# pylint: disable=broad-exception-raised
+
 # Languages
 def normalize_lang_code(lang_code):
     return lang_code.replace('-', '_').lower()
@@ -33,11 +35,11 @@ def to_lang_codes(main, lang_texts):
 def _to_lang_text(main, lang_code):
     lang_code = normalize_lang_code(lang_code)

-    for lang_text, (lang_code_639_3, _, _) in main.settings_global['langs'].items():
+    for lang_text, (lang_code_639_3, _) in main.settings_global['langs'].items():
         if lang_code_639_3 == lang_code:
             return lang_text

-    return ''
+    raise Exception('Failed to convert the language code to text!')

 def to_lang_text(main, lang_code):
     return _to_lang_text(main, lang_code)
@@ -48,27 +50,27 @@ def to_lang_texts(main, lang_codes):

 def to_iso_639_3(main, lang_code):
     lang_code = normalize_lang_code(lang_code)

-    for lang_code_639_3, lang_code_639_1, _ in main.settings_global['langs'].values():
+    for lang_code_639_3, lang_code_639_1 in main.settings_global['langs'].values():
         if lang_code_639_1 == lang_code:
             return lang_code_639_3

     # ISO 639-1 codes without country codes
-    for lang_code_639_3, lang_code_639_1, _ in main.settings_global['langs'].values():
+    for lang_code_639_3, lang_code_639_1 in main.settings_global['langs'].values():
         if lang_code_639_1.startswith(f'{lang_code}_'):
             return lang_code_639_3

-    return ''
+    raise Exception('Failed to convert the ISO 639-1 language code to ISO 639-3 code!')

 def to_iso_639_1(main, lang_code, no_suffix = False):
     lang_code = normalize_lang_code(lang_code)

     # Fuzzy matching without code suffixes
     if '_' in lang_code:
-        for lang_code_639_3, lang_code_639_1, _ in main.settings_global['langs'].values():
+        for lang_code_639_3, lang_code_639_1 in main.settings_global['langs'].values():
             if lang_code_639_3 == lang_code:
                 lang_code_converted = lang_code_639_1
     else:
-        for lang_code_639_3, lang_code_639_1, _ in main.settings_global['langs'].values():
+        for lang_code_639_3, lang_code_639_1 in main.settings_global['langs'].values():
             if remove_lang_code_suffixes(main, lang_code_639_3) == remove_lang_code_suffixes(main, lang_code):
                 lang_code_converted = lang_code_639_1
@@ -85,15 +87,6 @@ def remove_lang_code_suffixes(main, lang_code): # pylint: disable=unused-argumen
     else:
         return lang_code

-def get_lang_family(main, lang_code):
-    lang_code = normalize_lang_code(lang_code)
-
-    for lang_code_639_3, _, lang_family in main.settings_global['langs'].values():
-        if lang_code_639_3 == lang_code:
-            return lang_family
-
-    return ''
-
 # Encodings
 def to_encoding_code(main, encoding_text):
     return main.settings_global['encodings'][encoding_text]
@@ -103,7 +96,7 @@ def to_encoding_text(main, encoding_code):
         if encoding_code == code:
             return text

-    return ''
+    raise Exception('Failed to convert the encoding code to text!')

 # Yes/No
 def to_yes_no_code(yes_no_text):
@@ -111,13 +104,13 @@ def to_yes_no_code(yes_no_text):
         return True
     elif yes_no_text == _tr('wl_conversion', 'No'):
         return False
-    else:
-        return None
+
+    raise Exception('Failed to convert the Yes/No text to code!')

 def to_yes_no_text(yes_no_code):
     if yes_no_code is True:
         return _tr('wl_conversion', 'Yes')
     elif yes_no_code is False:
         return _tr('wl_conversion', 'No')
-    else:
-        return None
+
+    raise Exception('Failed to convert the Yes/No code to text!')
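For reviewers who want to see the new conversion behavior in isolation: language entries in settings_global['langs'] are now two-element [ISO 639-3, ISO 639-1] lists, and every lookup in wl_conversion.py raises instead of silently returning an empty string. The snippet below is a minimal, self-contained sketch of that lookup pattern against a made-up registry; the LANGS dict and the standalone function signature are illustrative only and do not mirror Wordless's real settings layout.

    # Hypothetical two-entry registry; Wordless's real table lives in settings_global['langs']
    LANGS = {
        'English (United States)': ['eng_us', 'en_us'],
        'Japanese': ['jpn', 'ja'],
    }

    def to_iso_639_3(lang_code):
        lang_code = lang_code.replace('-', '_').lower()

        # Exact match on the ISO 639-1 code
        for code_639_3, code_639_1 in LANGS.values():
            if code_639_1 == lang_code:
                return code_639_3

        # Fuzzy match: ISO 639-1 codes given without a country suffix
        for code_639_3, code_639_1 in LANGS.values():
            if code_639_1.startswith(f'{lang_code}_'):
                return code_639_3

        # New behavior: fail loudly instead of returning ''
        raise Exception('Failed to convert the ISO 639-1 language code to ISO 639-3 code!')

    assert to_iso_639_3('en-US') == 'eng_us'
    assert to_iso_639_3('ja') == 'jpn'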
diff --git a/wordless/wl_utils/wl_detection.py b/wordless/wl_utils/wl_detection.py
index dee923323..efce5f90a 100644
--- a/wordless/wl_utils/wl_detection.py
+++ b/wordless/wl_utils/wl_detection.py
@@ -33,10 +33,14 @@ def detect_encoding(main, file_path):
             else:
                 break

-    results = charset_normalizer.from_bytes(text)
+    result = charset_normalizer.from_bytes(text).best()

-    if results:
-        encoding = results.best().encoding
+    if result is not None:
+        encoding = result.encoding
+
+        if encoding == 'utf_8' and result.bom:
+            encoding = 'utf_8_sig'
+
+    # Fall back to UTF-8 without BOM if there are no results
     else:
         encoding = 'utf_8'
@@ -45,9 +49,10 @@

     try:
         with open(file_path, 'r', encoding = encoding) as f:
             f.read()
-    # Fall back to UTF-8 if fail
+    # Fall back to UTF-8 without BOM if not decodable
     except UnicodeDecodeError:
         encoding = 'utf_8'

     return encoding
@@ -67,28 +72,30 @@ def detect_lang_text(main, text):

     lang = lingua_detector.detect_language_of(text)

-    if lang.name == 'CHINESE':
-        converter = opencc.OpenCC('t2s')
-
-        if converter.convert(text) == text:
-            lang_code = 'zho_cn'
-        else:
-            lang_code = 'zho_tw'
-    elif lang.name == 'ENGLISH':
-        lang_code = 'eng_us'
-    elif lang.name == 'GERMAN':
-        lang_code = 'deu_de'
-    elif lang.name == 'PUNJABI':
-        lang_code = 'pan_guru'
-    elif lang.name == 'PORTUGUESE':
-        lang_code = 'por_pt'
-    elif lang.name == 'SERBIAN':
-        lang_code = 'srp_cyrl'
     # No results
-    elif lang is None:
+    if lang is None:
         lang_code = 'other'
     else:
-        lang_code = lang.iso_code_639_3.name.lower()
+        match lang.name:
+            case 'CHINESE':
+                converter = opencc.OpenCC('t2s')
+
+                if converter.convert(text) == text:
+                    lang_code = 'zho_cn'
+                else:
+                    lang_code = 'zho_tw'
+            case 'ENGLISH':
+                lang_code = 'eng_us'
+            case 'GERMAN':
+                lang_code = 'deu_de'
+            case 'PUNJABI':
+                lang_code = 'pan_guru'
+            case 'PORTUGUESE':
+                lang_code = 'por_pt'
+            case 'SERBIAN':
+                lang_code = 'srp_cyrl'
+            case _:
+                lang_code = lang.iso_code_639_3.name.lower()

     return lang_code
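The detect_encoding() change above is what the UTF-8-with-BOM support in this changeset hinges on: charset_normalizer's .best() returns a single CharsetMatch (or None), and its .bom flag reports whether a byte order mark was seen, in which case the codec is swapped to utf_8_sig so the BOM is stripped when the file is read. Below is a minimal standalone sketch of the same logic, assuming only that the charset-normalizer package is installed; the function name is illustrative and not part of Wordless.

    import charset_normalizer

    def guess_encoding(data):
        # Take the single best match rather than the whole result set
        result = charset_normalizer.from_bytes(data).best()

        if result is not None:
            encoding = result.encoding

            # Prefer utf_8_sig so the BOM is not decoded as part of the text
            if encoding == 'utf_8' and result.bom:
                encoding = 'utf_8_sig'
        # Fall back to UTF-8 without BOM if there are no results
        else:
            encoding = 'utf_8'

        return encoding

    # Typically prints 'utf_8_sig'; detection on very short inputs can vary
    print(guess_encoding('\ufeffHello, world!'.encode('utf-8')))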
diff --git a/wordless/wl_utils/wl_misc.py b/wordless/wl_utils/wl_misc.py
index 4e8815ed1..31fd78ce3 100644
--- a/wordless/wl_utils/wl_misc.py
+++ b/wordless/wl_utils/wl_misc.py
@@ -34,17 +34,6 @@

 _tr = QCoreApplication.translate

-def change_file_owner_to_user(file_path):
-    # pylint: disable=no-member
-    _, is_macos, is_linux = check_os()
-
-    # Available on Unix only
-    if (is_macos or is_linux) and os.getuid() == 0:
-        uid = int(os.environ.get('SUDO_UID'))
-        gid = int(os.environ.get('SUDO_GID'))
-
-        os.chown(file_path, uid, gid)
-
 def check_os():
     is_windows = False
     is_macos = False
@@ -60,6 +49,27 @@ def check_os():

     return is_windows, is_macos, is_linux

+def get_linux_distro():
+    try:
+        os_release = platform.freedesktop_os_release()
+    # Default to Ubuntu if undetermined
+    except OSError:
+        os_release = {'ID': 'ubuntu'}
+
+    return os_release['ID']
+
+def change_file_owner_to_user(file_path):
+    # pylint: disable=no-member
+    _, is_macos, is_linux = check_os()
+
+    # Available on Unix only
+    if (is_macos or is_linux) and os.getuid() == 0:
+        uid = int(os.environ.get('SUDO_UID'))
+        gid = int(os.environ.get('SUDO_GID'))
+
+        os.chown(file_path, uid, gid)
+
 def find_wl_main(widget):
     if 'main' in widget.__dict__:
         main = widget.main
@@ -71,23 +81,6 @@ def find_wl_main(widget):

     return main

-def flatten_list(list_to_flatten):
-    for item in list_to_flatten:
-        if isinstance(item, collections.abc.Iterable) and not isinstance(item, (str, bytes)):
-            yield from flatten_list(item)
-        else:
-            yield item
-
-def get_linux_distro():
-    try:
-        os_release = platform.freedesktop_os_release()
-    # Default to Ubuntu if undetermined
-    except OSError:
-        os_release = {'ID': 'ubuntu'}
-
-    return os_release['ID']
-
-
 def get_wl_ver():
     with open(wl_paths.get_path_file('VERSION'), 'r', encoding = 'utf_8') as f:
         for line in f:
@@ -98,6 +91,57 @@

     return packaging.version.Version(wl_ver)

+REQUESTS_TIMEOUT = 10
+
+def wl_get_proxies(main):
+    proxy_settings = main.settings_custom['general']['proxy_settings']
+
+    if proxy_settings['use_proxy']:
+        if proxy_settings['username']:
+            proxy_username = urllib.parse.quote(proxy_settings['username'])
+            proxy_password = urllib.parse.quote(proxy_settings['password'])
+
+            proxy = f"http://{proxy_username}:{proxy_password}@{proxy_settings['address']}:{proxy_settings['port']}"
+        else:
+            proxy = f"http://{proxy_settings['address']}:{proxy_settings['port']}"
+
+        proxies = {'http': proxy, 'https': proxy}
+    else:
+        proxies = None
+
+    return proxies
+
+def wl_download(main, url):
+    err_msg = ''
+
+    try:
+        r = requests.get(url, timeout = REQUESTS_TIMEOUT, proxies = wl_get_proxies(main))
+
+        if r.status_code != 200:
+            err_msg = 'A network error occurred!'
+    except requests.RequestException:
+        r = None
+        err_msg = traceback.format_exc()
+
+    return r, err_msg
+
+def wl_download_file_size(main, url):
+    file_size = 0
+
+    try:
+        r = requests.get(url, timeout = REQUESTS_TIMEOUT, stream = True, proxies = wl_get_proxies(main))
+
+        if r.status_code == 200:
+            file_size = int(r.headers['content-length'])
+
+        # See: https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow
+        r.close()
+    except requests.RequestException:
+        pass
+
+    # In megabytes
+    return file_size / 1024 / 1024
+
 def log_timing(func):
     def wrapper(widget, *args, **kwargs):
         if isinstance(widget, QMainWindow):
@@ -128,6 +172,13 @@ def wrapper(widget, *args, **kwargs):

     return wrapper

+def flatten_list(list_to_flatten):
+    for item in list_to_flatten:
+        if isinstance(item, collections.abc.Iterable) and not isinstance(item, (str, bytes)):
+            yield from flatten_list(item)
+        else:
+            yield item
+
 def merge_dicts(dicts_to_merge):
     dict_merged = {}
     len_dicts = len(dicts_to_merge)
@@ -162,7 +213,7 @@ def normalize_nums(nums, normalized_min, normalized_max, reverse = False):
     nums_min = min(nums)
     nums_max = max(nums)

-    if nums_max - nums_min == 0:
+    if nums_max == nums_min:
         nums_normalized = [normalized_min] * len(nums)
     else:
         if reverse:
@@ -177,54 +228,3 @@ def normalize_nums(nums, normalized_min, normalized_max, reverse = False):
     ]

     return nums_normalized
-
-REQUESTS_TIMEOUT = 10
-
-def wl_get_proxies(main):
-    proxy_settings = main.settings_custom['general']['proxy_settings']
-
-    if proxy_settings['use_proxy']:
-        if proxy_settings['username']:
-            proxy_username = urllib.parse.quote(proxy_settings['username'])
-            proxy_password = urllib.parse.quote(proxy_settings['password'])
-
-            proxy = f"http://{proxy_username}:{proxy_password}@{proxy_settings['address']}:{proxy_settings['port']}"
-        else:
-            proxy = f"http://{proxy_settings['address']}:{proxy_settings['port']}"
-
-        proxies = {'http': proxy, 'https': proxy}
-    else:
-        proxies = None
-
-    return proxies
-
-def wl_download(main, url):
-    err_msg = ''
-
-    try:
-        r = requests.get(url, timeout = REQUESTS_TIMEOUT, proxies = wl_get_proxies(main))
-
-        if r.status_code != 200:
-            err_msg = 'A network error occurred!'
-    except requests.RequestException:
-        r = None
-        err_msg = traceback.format_exc()
-
-    return r, err_msg
-
-def wl_download_file_size(main, url):
-    file_size = 0
-
-    try:
-        r = requests.get(url, timeout = REQUESTS_TIMEOUT, stream = True, proxies = wl_get_proxies(main))
-
-        if r.status_code == 200:
-            file_size = int(r.headers['content-length'])
-
-        # See: https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow
-        r.close()
-    except requests.RequestException:
-        pass
-
-    # In megabytes
-    return file_size / 1024 / 1024
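The relocated download helpers keep their behavior: wl_get_proxies() builds one proxy URL, percent-encoding the credentials so characters such as '@' or ':' in the username or password do not break the URL, and reuses it for both http and https; wl_download() then passes that mapping plus a 10-second timeout to requests. A minimal sketch of the proxy construction with made-up values, independent of Wordless's settings_custom structure:

    import urllib.parse

    def build_proxies(address, port, username = '', password = ''):
        if username:
            # Percent-encode credentials so reserved characters survive the URL
            username = urllib.parse.quote(username)
            password = urllib.parse.quote(password)

            proxy = f'http://{username}:{password}@{address}:{port}'
        else:
            proxy = f'http://{address}:{port}'

        # requests expects one entry per URL scheme
        return {'http': proxy, 'https': proxy}

    proxies = build_proxies('proxy.example.com', 8080, 'user', 'p@ss:word')
    # e.g. requests.get(url, timeout = 10, proxies = proxies)
    print(proxies)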