Settings: Add Settings - Measures - Readability - Coleman's Readability Formula; Work Area: Add Profiler - Readability - Coleman's Readability Formula
BLKSerene · Jul 27, 2023 · 9848685 · 9848685
1 parent 8ff158d
commit 9848685
Show file tree

Hide file tree

Showing 14 changed files with 788 additions and 306 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,13 +20,13 @@
 
 ## [3.3.0](https://github.com/BLKSerene/Wordless/releases/tag/3.3.0) - ??/??/2023
 ### 🎉 New Features
-- Settings: Add Settings - Measures - Readability - Bormuth's Grade Placement / Flesch Reading Ease
+- Settings: Add Settings - Measures - Readability - Bormuth's Grade Placement / Coleman's Readability Formula / Flesch Reading Ease
 - Utils: Add khmer-nltk's Khmer sentence tokenizer, word tokenizer, and part-of-speech tagger
 - Utils: Add PyThaiNLP's perceptron part-of-speech tagger (Blackboard)
 - Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
 - Utils: Add spaCy's Malay word tokenizer
 - Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser
-- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement
+- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula
 
 ### ✨ Improvements
 - Utils: Update Wordless's sentence and sentence segment splitters

diff --git a/doc/doc_eng.md b/doc/doc_eng.md
diff --git a/doc/measures/readability/colemans_readability_formula.svg b/doc/measures/readability/colemans_readability_formula.svg
diff --git a/doc/measures/readability/lensear_write.svg b/doc/measures/readability/lensear_write.svg
diff --git a/doc/measures/readability/re_simplified.svg b/doc/measures/readability/re_simplified.svg
diff --git a/doc/measures/readability/rgl.svg b/doc/measures/readability/rgl.svg
diff --git a/doc/measures/readability/smog_grade.svg b/doc/measures/readability/smog_grade.svg
diff --git a/doc/measures/readability/wstf.svg b/doc/measures/readability/wstf.svg
diff --git a/tests/wl_tests_measures/test_measures_readability.py b/tests/wl_tests_measures/test_measures_readability.py
@@ -21,8 +21,6 @@
 from tests import wl_test_init
 from wordless.wl_measures import wl_measures_readability
 
-main = wl_test_init.Wl_Test_Main()
-
 class Wl_Test_Text():
     def __init__(self, tokens_multilevel, lang = 'eng_us'):
         super().__init__()
@@ -31,6 +29,9 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'):
         self.lang = lang
         self.tokens_multilevel = tokens_multilevel
 
+main = wl_test_init.Wl_Test_Main()
+settings = main.settings_custom['measures']['readability']
+
 TOKENS_MULTILEVEL_0 = []
 TOKENS_MULTILEVEL_12 = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['This', 'is', 'a', 'sen-tence0', '.']]]]
 TOKENS_MULTILEVEL_12_PROPN = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['Louisiana', 'readability', 'boxes', 'created', '.']]]]
@@ -150,6 +151,33 @@ def test_coleman_liau_index():
     assert grade_level_eng_0 == 'text_too_short'
     assert grade_level_eng_12 == grade_level_spa_12 == -27.4004 * (est_cloze_pct / 100) + 23.06395
 
+# Checks all 4 variants of Coleman's Readability Formula on the 12-word English text,
+# plus the "text too short" (empty text) and "no support" (non-English text) results
+def test_colemans_readability_formula():
+    cloze_pct_eng_0 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_0)
+    # The variant is read from the settings by the measure, so it is switched before each call
+    settings['colemans_readability_formula']['variant'] = '1'
+    cloze_pct_eng_12_1 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12)
+    settings['colemans_readability_formula']['variant'] = '2'
+    cloze_pct_eng_12_2 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12)
+    settings['colemans_readability_formula']['variant'] = '3'
+    cloze_pct_eng_12_3 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12)
+    settings['colemans_readability_formula']['variant'] = '4'
+    cloze_pct_eng_12_4 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12)
+    cloze_pct_other_12 = wl_measures_readability.colemans_readability_formula(main, test_text_other_12)
+
+    print("Coleman's Readability Formula:")
+    print(f'\teng/0: {cloze_pct_eng_0}')
+    print(f'\teng/12-1: {cloze_pct_eng_12_1}')
+    print(f'\teng/12-2: {cloze_pct_eng_12_2}')
+    print(f'\teng/12-3: {cloze_pct_eng_12_3}')
+    print(f'\teng/12-4: {cloze_pct_eng_12_4}')
+    print(f'\tother/12: {cloze_pct_other_12}')
+
+    # Expected values: every predictor is a count per 100 words
+    # (9 one-syllable words, 3 sentences, 0 pronouns, and 0 prepositions out of 12 words)
+    assert cloze_pct_eng_0 == 'text_too_short'
+    assert cloze_pct_eng_12_1 == 1.29 * (9 / 12 * 100) - 38.45
+    assert cloze_pct_eng_12_2 == 1.16 * (9 / 12 * 100) + 1.48 * (3 / 12 * 100) - 37.95
+    assert cloze_pct_eng_12_3 == 1.07 * (9 / 12 * 100) + 1.18 * (3 / 12 * 100) + 0.76 * (0 / 12 * 100) - 34.02
+    assert cloze_pct_eng_12_4 == 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (0 / 12 * 100) - 0.36 * (0 / 12 * 100) - 26.01
+    assert cloze_pct_other_12 == 'no_support'
+
 def test_dale_chall_readability_score():
     x_c50_eng_0 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_0)
     x_c50_eng_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_12)
@@ -198,19 +226,19 @@ def test_flesch_reading_ease():
     flesch_re_eng_0 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_0)
     flesch_re_eng_12 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_12)
 
-    main.settings_custom['measures']['readability']['re']['variant_nld'] = 'Douma'
+    settings['re']['variant_nld'] = 'Douma'
     flesch_re_nld_12_douma = wl_measures_readability.flesch_reading_ease(main, test_text_nld_12)
-    main.settings_custom['measures']['readability']['re']['variant_nld'] = "Brouwer's Leesindex A"
+    settings['re']['variant_nld'] = "Brouwer's Leesindex A"
     flesch_re_nld_12_brouwer = wl_measures_readability.flesch_reading_ease(main, test_text_nld_12)
 
     flesch_re_fra_12 = wl_measures_readability.flesch_reading_ease(main, test_text_fra_12)
     flesch_re_deu_12 = wl_measures_readability.flesch_reading_ease(main, test_text_deu_12)
     flesch_re_ita_12 = wl_measures_readability.flesch_reading_ease(main, test_text_ita_12)
     flesch_re_rus_12 = wl_measures_readability.flesch_reading_ease(main, test_text_rus_12)
 
-    main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Fernández Huerta'
+    settings['re']['variant_spa'] = 'Fernández Huerta'
     flesch_re_spa_12_fh = wl_measures_readability.flesch_reading_ease(main, test_text_spa_12)
-    main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Szigriszt Pazos'
+    settings['re']['variant_spa'] = 'Szigriszt Pazos'
     flesch_re_spa_12_sp = wl_measures_readability.flesch_reading_ease(main, test_text_spa_12)
 
     flesch_re_afr_12 = wl_measures_readability.flesch_reading_ease(main, test_text_afr_12)
@@ -219,14 +247,14 @@ def test_flesch_reading_ease():
     print('Flesch Reading Ease:')
     print(f'\teng/0: {flesch_re_eng_0}')
     print(f'\teng/12: {flesch_re_eng_12}')
-    print(f'\tnld-douma/12: {flesch_re_nld_12_douma}')
-    print(f'\tnld-brouwer/12: {flesch_re_nld_12_brouwer}')
+    print(f'\tnld/12-douma: {flesch_re_nld_12_douma}')
+    print(f'\tnld/12-brouwer: {flesch_re_nld_12_brouwer}')
     print(f'\tfra/12: {flesch_re_fra_12}')
     print(f'\tdeu/12: {flesch_re_deu_12}')
     print(f'\tita/12: {flesch_re_ita_12}')
     print(f'\trus/12: {flesch_re_rus_12}')
-    print(f'\tspa-fh/12: {flesch_re_spa_12_fh}')
-    print(f'\tspa-sp/12: {flesch_re_spa_12_sp}')
+    print(f'\tspa/12-fh: {flesch_re_spa_12_fh}')
+    print(f'\tspa/12-sp: {flesch_re_spa_12_sp}')
     print(f'\tafr/12: {flesch_re_afr_12}')
     print(f'\tother/12: {flesch_re_other_12}')
 
@@ -453,18 +481,22 @@ def test_spache_grade_level():
 
 def test_wiener_sachtextformel():
     wstf_deu_0 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_0)
-    wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '1')
-    wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '2')
-    wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '3')
-    wstf_deu_12_4 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '4')
+    settings['wstf']['variant'] = '1'
+    wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
+    settings['wstf']['variant'] = '2'
+    wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
+    settings['wstf']['variant'] = '3'
+    wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
+    settings['wstf']['variant'] = '4'
+    wstf_deu_12_4 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
     wstf_eng_12 = wl_measures_readability.wiener_sachtextformel(main, test_text_eng_12)
 
     print('Wiener Sachtextformel:')
     print(f'\tdeu/0: {wstf_deu_0}')
-    print(f'\tdeu-1/12: {wstf_deu_12_1}')
-    print(f'\tdeu-2/12: {wstf_deu_12_2}')
-    print(f'\tdeu-3/12: {wstf_deu_12_3}')
-    print(f'\tdeu-4/12: {wstf_deu_12_4}')
+    print(f'\tdeu/12-1: {wstf_deu_12_1}')
+    print(f'\tdeu/12-2: {wstf_deu_12_2}')
+    print(f'\tdeu/12-3: {wstf_deu_12_3}')
+    print(f'\tdeu/12-4: {wstf_deu_12_4}')
     print(f'\teng/12: {wstf_eng_12}')
 
     ms = 0 / 12
@@ -485,6 +517,7 @@ def test_wiener_sachtextformel():
     test_bormuths_cloze_mean()
     test_bormuths_gp()
     test_coleman_liau_index()
+    test_colemans_readability_formula()
     test_dale_chall_readability_score()
     test_devereux_readability_index()
     test_flesch_kincaid_grade_level()

diff --git a/tests/wl_tests_work_area/test_profiler.py b/tests/wl_tests_work_area/test_profiler.py
@@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files):
         count_tokens_lens_syls.append(collections.Counter(len_tokens_syls))
         count_tokens_lens_chars.append(collections.Counter(len_tokens_chars))
 
-        assert len(readability_statistics) == 24
+        assert len(readability_statistics) == 25
 
         # Counts
         assert count_paras

diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py
@@ -129,9 +129,9 @@ def get_count_words_dale(words, num_easy_words):
 # Automated Arabic Readability Index
 # Reference: Al-Tamimi, A., Jaradat M., Aljarrah, N., & Ghanim, S. (2013). AARI: Automatic Arabic readability index. The International Arab Journal of Information Technology, 11(4), pp. 370–378.
 def automated_ara_readability_index(main, text):
-    text = get_counts(main, text)
-
     if text.lang == 'ara':
+        text = get_counts(main, text)
+
         if text.count_words and text.count_sentences:
             aari = (
                 3.28 * text.count_chars_alphanumeric
@@ -164,9 +164,9 @@ def automated_readability_index(main, text):
 # Bormuth's Cloze Mean & Grade Placement
 # Reference: Bormuth, J. R. (1969). Development of readability analyses. U.S. Department of Health, Education, and Welfare. http://files.eric.ed.gov/fulltext/ED029166.pdf
 def bormuths_cloze_mean(main, text):
-    text = get_counts(main, text)
-
     if text.lang.startswith('eng_'):
+        text = get_counts(main, text)
+
         if text.count_sentences and text.count_words:
             ddl = get_count_words_dale(text.words_flat, 3000)
             m = (
@@ -219,12 +219,62 @@ def coleman_liau_index(main, text):
 
     return grade_level
 
+# Coleman's Readability Formula
+# Reference: Liau, T. L., Bassin, C. B., Martin, C. J., & Coleman, E. B. (1976). Modification of the Coleman readability formulas. Journal of Reading Behavior, 8(4), 381–386. https://journals.sagepub.com/doi/pdf/10.1080/10862967609547193
+#
+# Estimates the cloze completion percentage of an English text using the variant ('1'–'4') read from
+# the custom settings (measures - readability - colemans_readability_formula - variant).
+# All predictors are counts per 100 words: one-syllable words (all variants), sentences (variants 2–4),
+# pronouns (variants 3–4), and prepositions (variant 4).
+# Returns 'no_support' if the language of the text is not English and 'text_too_short' if the text has no words.
+def colemans_readability_formula(main, text):
+    if text.lang.startswith('eng_'):
+        text = get_counts(main, text)
+
+        if text.count_words:
+            variant = main.settings_custom['measures']['readability']['colemans_readability_formula']['variant']
+            count_words_1_syl = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1)
+
+            # Part-of-speech tagging is only run for the variants that need pronoun/preposition counts
+            if variant in ['3', '4']:
+                pos_tags = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = text.lang, tagset = 'universal')
+                count_prons = sum((1 for _, pos in pos_tags if pos == 'PRON'))
+
+                if variant == '4':
+                    # Prepositions are counted as adpositions (ADP) in the universal tagset
+                    count_preps = sum((1 for _, pos in pos_tags if pos == 'ADP'))
+
+            if variant == '1':
+                cloze_pct = (
+                    1.29 * (count_words_1_syl / text.count_words * 100)
+                    - 38.45
+                )
+            elif variant == '2':
+                cloze_pct = (
+                    1.16 * (count_words_1_syl / text.count_words * 100)
+                    + 1.48 * (text.count_sentences / text.count_words * 100)
+                    - 37.95
+                )
+            elif variant == '3':
+                cloze_pct = (
+                    1.07 * (count_words_1_syl / text.count_words * 100)
+                    + 1.18 * (text.count_sentences / text.count_words * 100)
+                    + 0.76 * (count_prons / text.count_words * 100)
+                    - 34.02
+                )
+            elif variant == '4':
+                # The number of prepositions is per 100 words, like all other predictors
+                cloze_pct = (
+                    1.04 * (count_words_1_syl / text.count_words * 100)
+                    + 1.06 * (text.count_sentences / text.count_words * 100)
+                    + 0.56 * (count_prons / text.count_words * 100)
+                    - 0.36 * (count_preps / text.count_words * 100)
+                    - 26.01
+                )
+        else:
+            cloze_pct = 'text_too_short'
+    else:
+        cloze_pct = 'no_support'
+
+    return cloze_pct
+
 # Dale-Chall Readability Score
 # References:
 #     Dale, E., & Chall, J. S. (1948a). A formula for predicting readability. Educational Research Bulletin, 27(1), 11–20, 28.
 #     Dale, E., & Chall, J. S. (1948b). A formula for predicting readability: Instructions. Educational Research Bulletin, 27(2), 37–54.
 def dale_chall_readability_score(main, text):
-    if text.lang.startswith('eng'):
+    if text.lang.startswith('eng_'):
         text = get_counts(main, text)
 
         if text.count_words and text.count_sentences:
@@ -374,10 +424,10 @@ def flesch_reading_ease_simplified(main, text):
         text = get_counts(main, text)
 
         if text.count_words and text.count_sentences:
-            count_words_monosyllabic = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1)
+            count_words_1_syl = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1)
 
             flesch_re_simplified = (
-                1.599 * (count_words_monosyllabic / text.count_words * 100)
+                1.599 * (count_words_1_syl / text.count_words * 100)
                 - 1.015 * (text.count_words / text.count_sentences)
                 - 31.517
             )
@@ -398,8 +448,8 @@ def forcast_grade_level(main, text):
             sample_start = random.randint(0, text.count_words - 150)
             sample = text.syls_words[sample_start : sample_start + 150]
 
-            count_words_monosyllabic = get_count_words_syls(sample, len_min = 1, len_max = 1)
-            rgl = 20.43 - 0.11 * count_words_monosyllabic
+            count_words_1_syl = get_count_words_syls(sample, len_min = 1, len_max = 1)
+            rgl = 20.43 - 0.11 * count_words_1_syl
         else:
             rgl = 'text_too_short'
     else:
@@ -452,7 +502,7 @@ def formula_de_crawford(main, text):
 #     Lucisano, P., & Emanuela Piemontese, M. (1988). GULPEASE: A formula for the prediction of the difficulty of texts in Italian. Scuola e Città, 39(3), pp. 110–124.
 #     Indice Gulpease. (2021, July 9). In Wikipedia.https://it.wikipedia.org/w/index.php?title=Indice_Gulpease&oldid=121763335.
 def gulpease_index(main, text):
-    if text.lang.startswith('ita'):
+    if text.lang == 'ita':
         text = get_counts(main, text)
 
         if text.count_words:
@@ -470,13 +520,13 @@ def gulpease_index(main, text):
 # Polish variant:
 #     Pisarek, W. (1969). Jak mierzyć zrozumiałość tekstu?. Zeszyty Prasoznawcze, 4(42), 35–48.
 def gunning_fog_index(main, text):
-    if text.lang.startswith('eng') or text.lang == 'pol' and text.lang in main.settings_global['syl_tokenizers']:
+    if text.lang.startswith('eng_') or text.lang == 'pol' and text.lang in main.settings_global['syl_tokenizers']:
         text = get_counts(main, text)
 
         if text.count_sentences and text.count_words:
             count_hard_words = 0
 
-            if text.lang.startswith('eng'):
+            if text.lang.startswith('eng_'):
                 words_tagged = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = text.lang, tagset = 'universal')
 
                 for syls, (word, tag) in zip(text.syls_words, words_tagged):
@@ -529,7 +579,7 @@ def legibility_mu(main, text):
 # Lensear Write
 # Reference: O’Hayre, J. (1966). Gobbledygook has gotta go. U.S. Government Printing Office. https://www.governmentattic.org/15docs/Gobbledygook_Has_Gotta_Go_1966.pdf
 def lensear_write(main, text):
-    if text.lang.startswith('eng') and text.lang in main.settings_global['syl_tokenizers']:
+    if text.lang.startswith('eng_') and text.lang in main.settings_global['syl_tokenizers']:
         text = get_counts(main, text)
 
         if text.count_words > 0:
@@ -589,7 +639,7 @@ def lix(main, text):
 # McAlpine EFLAW Readability Score
 # Reference: Nirmaldasan. (2009, April 30). McAlpine EFLAW readability score. Readability Monitor. Retrieved November 15, 2022, from https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
 def mcalpine_eflaw(main, text):
-    if text.lang.startswith('eng'):
+    if text.lang.startswith('eng_'):
         text = get_counts(main, text)
 
         if text.count_sentences:
@@ -703,14 +753,14 @@ def smog_grade(main, text):
             )
 
             # Calculate the number of words with 3 or more syllables
-            count_words_polysyllabic = 0
+            count_words_3_plus_syls = 0
 
             for sentence in samples:
                 syls_words = wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang)
 
-                count_words_polysyllabic += get_count_words_syls(syls_words, len_min = 3)
+                count_words_3_plus_syls += get_count_words_syls(syls_words, len_min = 3)
 
-            g = 3.1291 + 1.043 * (count_words_polysyllabic ** 0.5)
+            g = 3.1291 + 1.043 * (count_words_3_plus_syls ** 0.5)
         else:
             g = 'text_too_short'
     else:
@@ -723,7 +773,7 @@ def smog_grade(main, text):
 #     Dale, E. (1931). A comparison of two word lists. Educational Research Bulletin, 10(18), 484–489.
 #     Spache, G. (1953). A new readability formula for primary-grade reading materials. Elementary School Journal, 53(7), 410–413. https://doi.org/10.1086/458513
 def spache_grade_level(main, text):
-    if text.lang.startswith('eng'):
+    if text.lang.startswith('eng_'):
         text = get_counts(main, text)
 
         if text.count_words >= 100:
@@ -771,14 +821,12 @@ def spache_grade_level(main, text):
 # References:
 #     Bamberger, R., & Vanecek, E. (1984). Lesen – Verstehen – Lernen – Schreiben. Jugend und Volk.
 #     Lesbarkeitsindex. (2022, July 21). In Wikipedia. https://de.wikipedia.org/w/index.php?title=Lesbarkeitsindex&oldid=224664667
-def wiener_sachtextformel(main, text, variant = None):
-    if text.lang.startswith('deu') and text.lang in main.settings_global['syl_tokenizers']:
+def wiener_sachtextformel(main, text):
+    if text.lang.startswith('deu_') and text.lang in main.settings_global['syl_tokenizers']:
         text = get_counts(main, text)
 
         if text.count_words and text.count_sentences:
-            if not variant:
-                variant = main.settings_custom['measures']['readability']['wstf']['variant']
-
+            variant = main.settings_custom['measures']['readability']['wstf']['variant']
             ms = get_count_words_syls(text.syls_words, len_min = 3) / text.count_words
             sl = text.count_words / text.count_sentences
             iw = get_count_words_letters(text.words_flat, len_min = 7) / text.count_words

diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py
@@ -362,6 +362,7 @@ def __init__(self, parent):
             _tr('wl_profiler', "Bormuth's Cloze Mean"),
             _tr('wl_profiler', "Bormuth's Grade Placement"),
             _tr('wl_profiler', 'Coleman-Liau Index'),
+            _tr('wl_profiler', "Coleman's Readability Formula"),
             _tr('wl_profiler', 'Dale-Chall Readability Score'),
             _tr('wl_profiler', 'Devereaux Readability Index'),
             _tr('wl_profiler', 'Flesch-Kincaid Grade Level'),
@@ -1178,6 +1179,7 @@ def run(self):
                         wl_measures_readability.bormuths_cloze_mean(self.main, text),
                         wl_measures_readability.bormuths_gp(self.main, text),
                         wl_measures_readability.coleman_liau_index(self.main, text),
+                        wl_measures_readability.colemans_readability_formula(self.main, text),
                         wl_measures_readability.dale_chall_readability_score(self.main, text),
                         wl_measures_readability.devereux_readability_index(self.main, text),
                         wl_measures_readability.flesch_kincaid_grade_level(self.main, text),