Work Area: Add Profiler - Readability - Dale-Chall Readability Formula (New)
BLKSerene · Aug 1, 2023 · 24c58cd · 24c58cd
1 parent 9848685
commit 24c58cd
Show file tree

Hide file tree

Showing 20 changed files with 387 additions and 238 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,7 +26,7 @@
 - Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
 - Utils: Add spaCy's Malay word tokenizer
 - Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser
-- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula
+- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Dale-Chall Readability Formula (New)
 
 ### ✨ Improvements
 - Utils: Update Wordless's sentence and sentence segment splitters

diff --git a/doc/doc_eng.md b/doc/doc_eng.md
diff --git a/doc/measures/readability/x_c50.svg b/doc/measures/readability/x_c50.svg
diff --git a/doc/measures/readability/x_c50_new.svg b/doc/measures/readability/x_c50_new.svg
diff --git a/doc/work_area/colligation_extractor_table.png b/doc/work_area/colligation_extractor_table.png
diff --git a/doc/work_area/collocation_extractor_fig_network_graph.png b/doc/work_area/collocation_extractor_fig_network_graph.png
diff --git a/doc/work_area/collocation_extractor_table.png b/doc/work_area/collocation_extractor_table.png
diff --git a/doc/work_area/concordancer_fig.png b/doc/work_area/concordancer_fig.png
diff --git a/doc/work_area/concordancer_parallel_table.png b/doc/work_area/concordancer_parallel_table.png
diff --git a/doc/work_area/concordancer_table.png b/doc/work_area/concordancer_table.png
diff --git a/doc/work_area/keyword_extractor_table.png b/doc/work_area/keyword_extractor_table.png
diff --git a/doc/work_area/ngram_generator_table.png b/doc/work_area/ngram_generator_table.png
diff --git a/doc/work_area/profiler_table.png b/doc/work_area/profiler_table.png
diff --git a/doc/work_area/wordlist_generator_fig_line_chart.png b/doc/work_area/wordlist_generator_fig_line_chart.png
diff --git a/doc/work_area/wordlist_generator_fig_word_cloud.png b/doc/work_area/wordlist_generator_fig_word_cloud.png
diff --git a/doc/work_area/wordlist_generator_table.png b/doc/work_area/wordlist_generator_table.png
diff --git a/tests/wl_tests_measures/test_measures_readability.py b/tests/wl_tests_measures/test_measures_readability.py
@@ -178,18 +178,32 @@ def test_colemans_readability_formula():
     assert cloze_pct_eng_12_4 == 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (0 / 12 * 100) - 0.36 * (0 / 12) - 26.01
     assert cloze_pct_other_12 == 'no_support'
 
-def test_dale_chall_readability_score():
-    x_c50_eng_0 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_0)
-    x_c50_eng_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_12)
-    x_c50_spa_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_spa_12)
+def test_dale_chall_readability_formula():
+    x_c50_eng_0 = wl_measures_readability.dale_chall_readability_formula(main, test_text_eng_0)
+    x_c50_eng_12 = wl_measures_readability.dale_chall_readability_formula(main, test_text_eng_12)
+    x_c50_spa_12 = wl_measures_readability.dale_chall_readability_formula(main, test_text_spa_12)
 
-    print('Dale-Chall Readibility Score:')
+    print('Dale-Chall Readability Formula:')
     print(f'\teng/0: {x_c50_eng_0}')
     print(f'\teng/12: {x_c50_eng_12}')
     print(f'\tspa/12: {x_c50_spa_12}')
 
     assert x_c50_eng_0 == 'text_too_short'
-    assert x_c50_eng_12 == 0.1579 * (1 / 12) + 0.0496 * (12 / 3) + 3.6365
+    assert x_c50_eng_12 == 0.1579 * (1 / 12 * 100) + 0.0496 * (12 / 3) + 3.6365
+    assert x_c50_spa_12 == 'no_support'
+
+def test_dale_chall_readability_formula_new():
+    x_c50_eng_0 = wl_measures_readability.dale_chall_readability_formula_new(main, test_text_eng_0)
+    x_c50_eng_12 = wl_measures_readability.dale_chall_readability_formula_new(main, test_text_eng_12)
+    x_c50_spa_12 = wl_measures_readability.dale_chall_readability_formula_new(main, test_text_spa_12)
+
+    print('Dale-Chall Readability Formula (New):')
+    print(f'\teng/0: {x_c50_eng_0}')
+    print(f'\teng/12: {x_c50_eng_12}')
+    print(f'\tspa/12: {x_c50_spa_12}')
+
+    assert x_c50_eng_0 == 'text_too_short'
+    assert x_c50_eng_12 == 64 - 0.95 * (1 / 12 * 100) - 0.69 * (12 / 3)
     assert x_c50_spa_12 == 'no_support'
 
 def test_devereux_readability_index():
@@ -308,7 +322,7 @@ def test_formula_de_comprensibilidad_de_gutierrez_de_polini():
     cp_spa_12 = wl_measures_readability.formula_de_comprensibilidad_de_gutierrez_de_polini(main, test_text_spa_12)
     cp_eng_12 = wl_measures_readability.formula_de_comprensibilidad_de_gutierrez_de_polini(main, test_text_eng_12)
 
-    print('Fórmula de comprensibilidad de Gutiérrez de Polini:')
+    print('Fórmula de Comprensibilidad de Gutiérrez de Polini:')
     print(f'\tspa/0: {cp_spa_0}')
     print(f'\tspa/12: {cp_spa_12}')
     print(f'\teng/12: {cp_eng_12}')
@@ -518,7 +532,8 @@ def test_wiener_sachtextformel():
     test_bormuths_gp()
     test_coleman_liau_index()
     test_colemans_readability_formula()
-    test_dale_chall_readability_score()
+    test_dale_chall_readability_formula()
+    test_dale_chall_readability_formula_new()
     test_devereux_readability_index()
     test_flesch_kincaid_grade_level()
     test_flesch_reading_ease()

diff --git a/tests/wl_tests_work_area/test_profiler.py b/tests/wl_tests_work_area/test_profiler.py
@@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files):
         count_tokens_lens_syls.append(collections.Counter(len_tokens_syls))
         count_tokens_lens_chars.append(collections.Counter(len_tokens_chars))
 
-        assert len(readability_statistics) == 25
+        assert len(readability_statistics) == 26
 
         # Counts
         assert count_paras

diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py
@@ -170,12 +170,12 @@ def bormuths_cloze_mean(main, text):
         if text.count_sentences and text.count_words:
             ddl = get_count_words_dale(text.words_flat, 3000)
             m = (
-                0.886593 -
-                0.083640 * (text.count_chars_alphabetic / text.count_words) +
-                0.161911 * ((ddl / text.count_words)**3) -
-                0.021401 * (text.count_words / text.count_sentences) +
-                0.000577 * ((text.count_words / text.count_sentences)**2) -
-                0.000005 * ((text.count_words / text.count_sentences)**3)
+                0.886593
+                - 0.083640 * (text.count_chars_alphabetic / text.count_words)
+                + 0.161911 * ((ddl / text.count_words)**3)
+                - 0.021401 * (text.count_words / text.count_sentences)
+                + 0.000577 * ((text.count_words / text.count_sentences)**2)
+                - 0.000005 * ((text.count_words / text.count_sentences)**3)
             )
         else:
             m = 'text_too_short'
@@ -193,9 +193,9 @@ def bormuths_gp(main, text):
             gp = m
         else:
             gp = (
-                4.275 + 12.881 * m - 34.934 * (m**2) + 20.388 * (m**3) +
-                26.194 * c - 2.046 * (c**2) - 11.767 * (c**3) -
-                44.285 * (m * c) + 97.620 * ((m * c)**2) - 59.538 * ((m * c)**3)
+                4.275 + 12.881 * m - 34.934 * (m**2) + 20.388 * (m**3)
+                + 26.194 * c - 2.046 * (c**2) - 11.767 * (c**3)
+                - 44.285 * (m * c) + 97.620 * ((m * c)**2) - 59.538 * ((m * c)**3)
             )
     else:
         gp = 'no_support'
@@ -238,29 +238,29 @@ def colemans_readability_formula(main, text):
 
             if variant == '1':
                 cloze_pct = (
-                    1.29 * (count_words_1_syl / text.count_words * 100) -
-                    38.45
+                    1.29 * (count_words_1_syl / text.count_words * 100)
+                    - 38.45
                 )
             elif variant == '2':
                 cloze_pct = (
-                    1.16 * (count_words_1_syl / text.count_words * 100) +
-                    1.48 * (text.count_sentences / text.count_words * 100) -
-                    37.95
+                    1.16 * (count_words_1_syl / text.count_words * 100)
+                    + 1.48 * (text.count_sentences / text.count_words * 100)
+                    - 37.95
                 )
             elif variant == '3':
                 cloze_pct = (
-                    1.07 * (count_words_1_syl / text.count_words * 100) +
-                    1.18 * (text.count_sentences / text.count_words * 100) +
-                    0.76 * (count_prons / text.count_words * 100) -
-                    34.02
+                    1.07 * (count_words_1_syl / text.count_words * 100)
+                    + 1.18 * (text.count_sentences / text.count_words * 100)
+                    + 0.76 * (count_prons / text.count_words * 100)
+                    - 34.02
                 )
             elif variant == '4':
                 cloze_pct = (
-                    1.04 * (count_words_1_syl / text.count_words * 100) +
-                    1.06 * (text.count_sentences / text.count_words * 100) +
-                    0.56 * (count_prons / text.count_words * 100) -
-                    0.36 * (count_preps / text.count_words) -
-                    26.01
+                    1.04 * (count_words_1_syl / text.count_words * 100)
+                    + 1.06 * (text.count_sentences / text.count_words * 100)
+                    + 0.56 * (count_prons / text.count_words * 100)
+                    - 0.36 * (count_preps / text.count_words)
+                    - 26.01
                 )
         else:
             cloze_pct = 'text_too_short'
@@ -269,18 +269,18 @@ def colemans_readability_formula(main, text):
 
     return cloze_pct
 
-# Dale-Chall Readability Score
+# Dale-Chall Readability Formula
 # References:
 #     Dale, E., & Chall, J. S. (1948a). A formula for predicting readability. Educational Research Bulletin, 27(1), 11–20, 28.
 #     Dale, E., & Chall, J. S. (1948b). A formula for predicting readability: Instructions. Educational Research Bulletin, 27(2), 37–54.
-def dale_chall_readability_score(main, text):
+def dale_chall_readability_formula(main, text):
     if text.lang.startswith('eng_'):
         text = get_counts(main, text)
 
         if text.count_words and text.count_sentences:
             count_difficult_words = get_count_words_dale(text.words_flat, 3000)
             x_c50 = (
-                0.1579 * (count_difficult_words / text.count_words)
+                0.1579 * (count_difficult_words / text.count_words * 100)
                 + 0.0496 * (text.count_words / text.count_sentences)
                 + 3.6365
             )
@@ -291,6 +291,26 @@ def dale_chall_readability_score(main, text):
 
     return x_c50
 
+# Dale-Chall Readability Formula (New)
+# Reference: Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall readability formula. Brookline Books.
+def dale_chall_readability_formula_new(main, text):
+    if text.lang.startswith('eng_'):
+        text = get_counts(main, text)
+
+        if text.count_words and text.count_sentences:
+            count_difficult_words = get_count_words_dale(text.words_flat, 3000)
+            x_c50 = (
+                64
+                - 0.95 * (count_difficult_words / text.count_words * 100)
+                - 0.69 * (text.count_words / text.count_sentences)
+            )
+        else:
+            x_c50 = 'text_too_short'
+    else:
+        x_c50 = 'no_support'
+
+    return x_c50
+
 # Devereux Readability Index
 # Reference: Smith, E. A. (1961). Devereaux readability index. Journal of Educational Research, 54(8), 298–303. https://doi.org/10.1080/00220671.1961.10882728
 def devereux_readability_index(main, text):
@@ -457,7 +477,7 @@ def forcast_grade_level(main, text):
 
     return rgl
 
-# Fórmula de comprensibilidad de Gutiérrez de Polini
+# Fórmula de Comprensibilidad de Gutiérrez de Polini
 # References:
 #     Gutiérrez de Polini, L. E. (1972). Investigación sobre lectura en Venezuela [Paper presentation]. Primeras Jornadas de Educación Primaria, Ministerio de Educación, Caracas, Venezuela.
 #     Rodríguez Trujillo, N. (1980). Determinación de la comprensibilidad de materiales de lectura por medio de variables lingüísticas. Lectura y Vida, 1(1). http://www.lecturayvida.fahce.unlp.edu.ar/numeros/a1n1/01_01_Rodriguez.pdf
@@ -506,7 +526,10 @@ def gulpease_index(main, text):
         text = get_counts(main, text)
 
         if text.count_words:
-            gulpease_index = 89 + (300 * text.count_sentences - 10 * text.count_chars_alphabetic) / text.count_words
+            gulpease_index = (
+                89
+                + (300 * text.count_sentences - 10 * text.count_chars_alphabetic) / text.count_words
+            )
         else:
             gulpease_index = 'text_too_short'
     else:
@@ -543,7 +566,10 @@ def gunning_fog_index(main, text):
                     if len(syls) >= 4:
                         count_hard_words += 1
 
-            fog_index = 0.4 * (text.count_words / text.count_sentences + count_hard_words / text.count_words * 100)
+            fog_index = (
+                0.4
+                * (text.count_words / text.count_sentences + count_hard_words / text.count_words * 100)
+            )
         else:
             fog_index = 'text_too_short'
     else:

diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py
@@ -363,13 +363,14 @@ def __init__(self, parent):
             _tr('wl_profiler', "Bormuth's Grade Placement"),
             _tr('wl_profiler', 'Coleman-Liau Index'),
             _tr('wl_profiler', "Coleman's Readability Formula"),
-            _tr('wl_profiler', 'Dale-Chall Readability Score'),
+            _tr('wl_profiler', 'Dale-Chall Readability Formula'),
+            _tr('wl_profiler', 'Dale-Chall Readability Formula (New)'),
             _tr('wl_profiler', 'Devereaux Readability Index'),
             _tr('wl_profiler', 'Flesch-Kincaid Grade Level'),
             _tr('wl_profiler', 'Flesch Reading Ease'),
             _tr('wl_profiler', 'Flesch Reading Ease (Simplified)'),
             _tr('wl_profiler', 'FORCAST Grade Level'),
-            _tr('wl_profiler', 'Fórmula de comprensibilidad de Gutiérrez de Polini'),
+            _tr('wl_profiler', 'Fórmula de Comprensibilidad de Gutiérrez de Polini'),
             _tr('wl_profiler', 'Fórmula de Crawford'),
             _tr('wl_profiler', 'Gulpease Index'),
             _tr('wl_profiler', 'Gunning Fog Index'),
@@ -1180,7 +1181,8 @@ def run(self):
                         wl_measures_readability.bormuths_gp(self.main, text),
                         wl_measures_readability.coleman_liau_index(self.main, text),
                         wl_measures_readability.colemans_readability_formula(self.main, text),
-                        wl_measures_readability.dale_chall_readability_score(self.main, text),
+                        wl_measures_readability.dale_chall_readability_formula(self.main, text),
+                        wl_measures_readability.dale_chall_readability_formula_new(self.main, text),
                         wl_measures_readability.devereux_readability_index(self.main, text),
                         wl_measures_readability.flesch_kincaid_grade_level(self.main, text),
                         wl_measures_readability.flesch_reading_ease(self.main, text),