Skip to content

Commit

Permalink
Work Area: 1. Add Profiler - Readability - Dickes-Steiwer Handformel …
Browse files Browse the repository at this point in the history
…2. Update Profiler - Readability - neue Wiener Sachtextformel / SMOG Grade / Tränkle & Bailer's Readability Formula
  • Loading branch information
BLKSerene committed Aug 12, 2023
1 parent 3be750b commit ee5197b
Show file tree
Hide file tree
Showing 12 changed files with 1,026 additions and 748 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@
- Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
- Utils: Add spaCy's Malay word tokenizer
- Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Danielson-Bryan's Readability Formula / Degrees of Reading Power / Easy Listening Formula / Fucks's Stilcharakteristik / Strain Index / Tränkle & Bailer's Readability Formula / Tuldava's Text Difficulty / Wheeler & Smith's Readability Formula
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Danielson-Bryan's Readability Formula / Degrees of Reading Power / Dickes-Steiwer Handformel / Easy Listening Formula / Fucks's Stilcharakteristik / Strain Index / Tränkle & Bailer's Readability Formula / Tuldava's Text Difficulty / Wheeler & Smith's Readability Formula

### ✨ Improvements
- Utils: Update spaCy's sentence recognizers, word tokenizers, part-of-speech taggers, lemmatizers, and dependency parsers
- Utils: Update Wordless's sentence and sentence segment splitters
- Work Area: Update Profiler - Readability - Automated Readability Index / Dale-Chall Readability Formula / Flesch Reading Ease / Flesch Reading Ease (Farr-Jenkins-Paterson) / Gunning Fog Index / Spache Grade Level
- Work Area: Update Profiler - Readability - Automated Readability Index / Dale-Chall Readability Formula / Flesch Reading Ease / Flesch Reading Ease (Farr-Jenkins-Paterson) / Gunning Fog Index / neue Wiener Sachtextformel / SMOG Grade / Spache Grade Level

### 📌 Bugfixes
- Work Area: Fix Concordancer - Generation Settings - Width unit - Character
Expand Down
163 changes: 86 additions & 77 deletions doc/doc_eng.md

Large diffs are not rendered by default.

179 changes: 179 additions & 0 deletions doc/measures/readability/dickes_steiwer_handformel.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
493 changes: 234 additions & 259 deletions doc/measures/readability/wstf.svg → doc/measures/readability/nws.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
163 changes: 108 additions & 55 deletions doc/measures/readability/smog_grade.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
503 changes: 262 additions & 241 deletions doc/measures/readability/trankle_bailers_readability_formula.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
91 changes: 54 additions & 37 deletions tests/wl_tests_measures/test_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'):

test_text_deu_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de')
test_text_deu_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de')
test_text_deu_120 = Wl_Test_Text(TOKENS_MULTILEVEL_120, lang = 'deu_de')

test_text_ita_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ita')
test_text_ita_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ita')
Expand Down Expand Up @@ -268,7 +269,22 @@ def test_devereux_readability_index():
print(f'\tspa/12: {grade_placement_spa_12}')

assert grade_placement_eng_0 == 'text_too_short'
assert grade_placement_eng_12 == grade_placement_spa_12 == 1.56 * (47 / 12) + 0.19 * (12 / 3) - 6.49
assert grade_placement_eng_12 == 1.56 * (47 / 12) + 0.19 * (12 / 3) - 6.49
assert grade_placement_spa_12 != 'text_too_short'

def test_dickes_steiwer_handformel():
    # Exercise the measure on an empty English text, a 12-token English text,
    # and a 12-token Spanish text (the measure is language-independent)
    score_eng_0, score_eng_12, score_spa_12 = (
        wl_measures_readability.dickes_steiwer_handformel(main, text)
        for text in (test_text_eng_0, test_text_eng_12, test_text_spa_12)
    )

    print('Dickes-Steiwer Handformel:')
    print(f'\teng/0: {score_eng_0}')
    print(f'\teng/12: {score_eng_12}')
    print(f'\tspa/12: {score_spa_12}')

    # Empty texts cannot be scored
    assert score_eng_0 == 'text_too_short'
    # eng/12 fixture: 45 alphabetic chars, 12 words (5 of them repeated types), 3 sentences
    assert score_eng_12 == 235.95993 - numpy.log(45 / 12 + 1) * 73.021 - numpy.log(12 / 3 + 1) * 12.56438 - 5 / 12 * 50.03293
    # Non-English texts are supported as well
    assert score_spa_12 != 'text_too_short'

def test_elf():
elf_eng_0 = wl_measures_readability.elf(main, test_text_eng_0)
Expand Down Expand Up @@ -538,6 +554,34 @@ def test_eflaw():
assert eflaw_eng_12 == (12 + 6) / 3
assert eflaw_spa_12 == 'no_support'

def test_nws():
    # neue Wiener Sachtextformel: German-only measure with 3 variants
    nws_deu_0 = wl_measures_readability.nws(main, test_text_deu_0)
    settings['nws']['variant'] = '1'
    nws_deu_12_1 = wl_measures_readability.nws(main, test_text_deu_12)
    settings['nws']['variant'] = '2'
    nws_deu_12_2 = wl_measures_readability.nws(main, test_text_deu_12)
    settings['nws']['variant'] = '3'
    nws_deu_12_3 = wl_measures_readability.nws(main, test_text_deu_12)
    nws_eng_12 = wl_measures_readability.nws(main, test_text_eng_12)

    print('neue Wiener Sachtextformel:')
    print(f'\tdeu/0: {nws_deu_0}')
    print(f'\tdeu/12-1: {nws_deu_12_1}')
    print(f'\tdeu/12-2: {nws_deu_12_2}')
    print(f'\tdeu/12-3: {nws_deu_12_3}')
    print(f'\teng/12: {nws_eng_12}')

    # Expected intermediate values for the 12-token German fixture
    ms = 0 / 12 * 100 # percentage of words with 3+ syllables
    sl = 12 / 3 # average sentence length in words
    iw = 3 / 12 * 100 # percentage of words with 7+ letters
    es = 9 / 12 * 100 # percentage of monosyllabic words

    assert nws_deu_0 == 'text_too_short'
    # Coefficient of ms is 0.1935 (Bamberger & Vanecek, 1984), matching the
    # implementation; ms == 0 here, so a wrong coefficient would go unnoticed
    assert nws_deu_12_1 == 0.1935 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
    assert nws_deu_12_2 == 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
    assert nws_deu_12_3 == 0.2963 * ms + 0.1905 * sl - 1.1144
    # Non-German texts are rejected
    assert nws_eng_12 == 'no_support'

def test_osman():
osman_ara_0 = wl_measures_readability.osman(main, test_text_ara_0)
osman_ara_12 = wl_measures_readability.osman(main, test_text_ara_12)
Expand Down Expand Up @@ -568,17 +612,21 @@ def test_rix():
def test_smog_grade():
    # SMOG Grade: requires syllable tokenization and at least 30 sentences;
    # German texts use the Bamberger & Vanecek (1984) variant
    g_eng_12 = wl_measures_readability.smog_grade(main, test_text_eng_12)
    g_eng_120 = wl_measures_readability.smog_grade(main, test_text_eng_120)
    g_deu_120 = wl_measures_readability.smog_grade(main, test_text_deu_120)
    g_spa_120 = wl_measures_readability.smog_grade(main, test_text_spa_120)
    g_other_12 = wl_measures_readability.smog_grade(main, test_text_other_12)

    print('SMOG Grade:')
    print(f'\teng/12: {g_eng_12}')
    print(f'\teng/120: {g_eng_120}')
    print(f'\tdeu/120: {g_deu_120}')
    print(f'\tspa/120: {g_spa_120}')
    print(f'\tother/12: {g_other_12}')

    # Fewer than 30 sentences
    assert g_eng_12 == 'text_too_short'
    # eng/120 fixture: 15 words with 3+ syllables in the sampled sentences
    assert g_eng_120 == 3.1291 + 1.043 * numpy.sqrt(15)
    # German variant: sqrt(polysyllables per 30 sentences) - 2
    assert g_deu_120 == numpy.sqrt(15 / 30 * 30) - 2
    assert g_spa_120 != 'no_support'
    # Languages without a syllable tokenizer are rejected
    assert g_other_12 == 'no_support'

Expand Down Expand Up @@ -635,8 +683,8 @@ def test_trankle_bailers_readability_formula():
print(f'\tother/100: {trankle_bailers_other_100}')

assert trankle_bailers_eng_0 == 'text_too_short'
assert trankle_bailers_eng_100_prep_1 == 224.6814 - 79.8304 * (372 / 100) - 12.24032 * (100 / 25) - 1.292857 * 1
assert trankle_bailers_eng_100_conj_2 == 234.1063 - 96.11069 * (374 / 100) - 2.05444 * 0 - 1.02805 * 1
assert trankle_bailers_eng_100_prep_1 == 224.6814 - numpy.log(372 / 100 + 1) * 79.8304 - numpy.log(100 / 25 + 1) * 12.24032 - 1 * 1.292857
assert trankle_bailers_eng_100_conj_2 == 234.1063 - numpy.log(374 / 100 + 1) * 96.11069 - 0 * 2.05444 - 1 * 1.02805
assert trankle_bailers_tha_100 != 'no_support'
assert trankle_bailers_other_100 == 'no_support'

Expand Down Expand Up @@ -674,38 +722,6 @@ def test_wheeler_smiths_readability_formula():
assert wheeler_smith_spa_12 != 'no_support'
assert wheeler_smith_other_12 == 'no_support'

def test_wstf():
wstf_deu_0 = wl_measures_readability.wstf(main, test_text_deu_0)
settings['wstf']['variant'] = '1'
wstf_deu_12_1 = wl_measures_readability.wstf(main, test_text_deu_12)
settings['wstf']['variant'] = '2'
wstf_deu_12_2 = wl_measures_readability.wstf(main, test_text_deu_12)
settings['wstf']['variant'] = '3'
wstf_deu_12_3 = wl_measures_readability.wstf(main, test_text_deu_12)
settings['wstf']['variant'] = '4'
wstf_deu_12_4 = wl_measures_readability.wstf(main, test_text_deu_12)
wstf_eng_12 = wl_measures_readability.wstf(main, test_text_eng_12)

print('Wiener Sachtextformel:')
print(f'\tdeu/0: {wstf_deu_0}')
print(f'\tdeu/12-1: {wstf_deu_12_1}')
print(f'\tdeu/12-2: {wstf_deu_12_2}')
print(f'\tdeu/12-3: {wstf_deu_12_3}')
print(f'\tdeu/12-4: {wstf_deu_12_4}')
print(f'\teng/12: {wstf_eng_12}')

ms = 0 / 12
sl = 12 / 3
iw = 3 / 12
es = 9 / 12

assert wstf_deu_0 == 'text_too_short'
assert wstf_deu_12_1 == 0.1925 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
assert wstf_deu_12_2 == 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
assert wstf_deu_12_3 == 0.2963 * ms + 0.1905 * sl - 1.1144
assert wstf_deu_12_4 == 0.2744 * ms + 0.2656 * sl - 1.693
assert wstf_eng_12 == 'no_support'

if __name__ == '__main__':
test_aari()
test_ari()
Expand All @@ -717,6 +733,7 @@ def test_wstf():
test_danielson_bryans_readability_formula()
test_drp()
test_devereux_readability_index()
test_dickes_steiwer_handformel()
test_elf()
test_gl()
test_re_flesch()
Expand All @@ -731,6 +748,7 @@ def test_wstf():
test_lensear_write()
test_lix()
test_eflaw()
test_nws()
test_osman()
test_rix()
test_smog_grade()
Expand All @@ -739,4 +757,3 @@ def test_wstf():
test_trankle_bailers_readability_formula()
test_td()
test_wheeler_smiths_readability_formula()
test_wstf()
4 changes: 2 additions & 2 deletions tests/wl_tests_work_area/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files):
count_tokens_lens_syls.append(collections.Counter(len_tokens_syls))
count_tokens_lens_chars.append(collections.Counter(len_tokens_chars))

assert len(readability_statistics) == 33
assert len(readability_statistics) == 34

# Counts
assert count_paras
Expand Down Expand Up @@ -125,7 +125,7 @@ def update_gui(err_msg, texts_stats_files):
assert all((len_syls == 1 for len_syls in len_tokens_syls))
assert all((len_syls == 1 for len_syls in len_types_syls))

# TTR/STTR
# TTRs
assert ttr
assert sttr

Expand Down
102 changes: 62 additions & 40 deletions wordless/wl_measures/wl_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def get_counts(main, text):
if 'count_words' not in text.__dict__:
text.words_flat = list(wl_misc.flatten_list(text.words_multilevel))
text.count_words = len(text.words_flat)
text.count_word_types = len(set(text.words_flat))

# Count of syllables
if 'count_syls' not in text.__dict__ and text.lang in main.settings_global['syl_tokenizers']:
Expand Down Expand Up @@ -401,6 +402,25 @@ def devereux_readability_index(main, text):

return grade_placement

# Dickes-Steiwer Handformel
# References:
#     Dickes, P. & Steiwer, L. (1977). Ausarbeitung von lesbarkeitsformeln für die deutsche sprache. Zeitschrift für Entwicklungspsychologie und Pädagogische Psychologie, 9(1), 20–28.
#     Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache (p. 57). Jugend und Volk.
def dickes_steiwer_handformel(main, text):
    text = get_counts(main, text)

    # The formula divides by both counts, so reject texts without words or sentences
    if not text.count_words or not text.count_sentences:
        return 'text_too_short'

    word_len = text.count_chars_alpha / text.count_words # average word length in letters
    sentence_len = text.count_words / text.count_sentences # average sentence length in words
    ttr = text.count_word_types / text.count_words # type-token ratio

    return (
        235.95993
        - numpy.log(word_len + 1) * 73.021
        - numpy.log(sentence_len + 1) * 12.56438
        - ttr * 50.03293
    )

# Easy Listening Formula
# Reference: Fang, I. E. (1966). The easy listening formula. Journal of Broadcasting, 11(1), 63–68. https://doi.org/10.1080/08838156609363529
def elf(main, text):
Expand Down Expand Up @@ -449,7 +469,7 @@ def gl(main, text):
# Kopient, A., & Grabar, N. (2020). Rated lexicon for the simplification of medical texts. In B. Gersbeck-Schierholz (ed.), HEALTHINFO 2020: The fifth international conference on informatics and assistive technologies for health-care, medical support and wellbeing (pp. 11–17). IARIA. https://hal.science/hal-03095275/document
# German:
# Amstad, T. (1978). Wie verständlich sind unsere Zeitungen? [Unpublished doctoral dissertation]. University of Zurich.
# Lesbarkeitsindex. (2023, February 2). In Wikipedia. https://de.wikipedia.org/w/index.php?title=Lesbarkeitsindex&oldid=230472824
# Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache (p. 56). Jugend und Volk.
# Italian:
# Franchina, V., & Vacca, R. (1986). Adaptation of Flesh readability index on a bilingual text written by the same author both in Italian and English languages. Linguaggi, 3, 47–49.
# Garais, E. (2011). Web applications readability. Journal of Information Systems and Operations Management, 5(1), 117–121. http://www.rebe.rau.ro/RePEc/rau/jisomg/SP11/JISOM-SP11-A13.pdf
Expand Down Expand Up @@ -839,6 +859,33 @@ def eflaw(main, text):

return eflaw

# neue Wiener Sachtextformel
# Reference: Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache. Jugend und Volk.
def nws(main, text):
    # Readability of German texts; returns 'no_support' for other languages
    # and 'text_too_short' when the text has no words or no sentences.
    if text.lang.startswith('deu_'):
        text = get_counts(main, text)

        if text.count_words and text.count_sentences:
            # User-selected formula variant: '1', '2', or '3'
            variant = main.settings_custom['measures']['readability']['nws']['variant']

            # MS: percentage of words with 3 or more syllables
            ms = get_count_words_syls(text.syls_words, len_min = 3) / text.count_words * 100
            # SL: average sentence length in words
            sl = text.count_words / text.count_sentences
            # IW: percentage of words with 7 or more letters
            iw = get_count_words_letters(text.words_flat, len_min = 7) / text.count_words * 100
            # ES: percentage of monosyllabic words
            es = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1) / text.count_words * 100

            if variant == '1':
                nws = 0.1935 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
            elif variant == '2':
                nws = 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
            elif variant == '3':
                nws = 0.2963 * ms + 0.1905 * sl - 1.1144
            # NOTE(review): any other variant value would leave nws unbound
            # (UnboundLocalError); presumably the settings UI restricts it to
            # '1'-'3' — confirm
        else:
            nws = 'text_too_short'
    else:
        nws = 'no_support'

    return nws

# Estimate number of syllables in Arabic texts by counting short, long, and stress syllables
# Reference: https://github.com/textstat/textstat/blob/9bf37414407bcaaa45c498478ee383c8738e5d0c/textstat/textstat.py#L569
def _get_count_syls_ara(text):
Expand Down Expand Up @@ -925,7 +972,10 @@ def rix(main, text):
return rix

# SMOG Grade
# Reference: McLaughlin, G. H. (1969). SMOG grading: A new readability formula. Journal of Reading, 12(8), pp. 639–646.
# References:
# McLaughlin, G. H. (1969). SMOG grading: A new readability formula. Journal of Reading, 12(8), pp. 639–646.
# German:
# Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache. Jugend und Volk.
def smog_grade(main, text):
if text.lang in main.settings_global['syl_tokenizers']:
text = get_counts(main, text)
Expand All @@ -947,7 +997,10 @@ def smog_grade(main, text):

count_words_3_plus_syls += get_count_words_syls(syls_words, len_min = 3)

g = 3.1291 + 1.043 * (count_words_3_plus_syls ** 0.5)
if text.lang.startswith('deu_'):
g = numpy.sqrt(count_words_3_plus_syls / text.count_sentences * 30) - 2
else:
g = 3.1291 + 1.043 * numpy.sqrt(count_words_3_plus_syls)
else:
g = 'text_too_short'
else:
Expand Down Expand Up @@ -1046,18 +1099,18 @@ def trankle_bailers_readability_formula(main, text):
if variant == '1':
trankle_bailers = (
224.6814
- 79.8304 * (count_chars_alnum / 100)
- 12.24032 * (100 / count_sentences)
- 1.292857 * count_preps
- numpy.log(count_chars_alnum / 100 + 1) * 79.8304
- numpy.log(100 / count_sentences + 1) * 12.24032
- count_preps * 1.292857
)
elif variant == '2':
count_conjs = sum((1 for _, pos in pos_tags if 'CONJ' in pos)) # CCONJ/SCONJ

trankle_bailers = (
234.1063
- 96.11069 * (count_chars_alnum / 100)
- 2.05444 * count_preps
- 1.02805 * count_conjs
- numpy.log(count_chars_alnum / 100 + 1) * 96.11069
- count_preps * 2.05444
- count_conjs * 1.02805
)
else:
trankle_bailers = 'text_too_short'
Expand Down Expand Up @@ -1193,34 +1246,3 @@ def wheeler_smiths_readability_formula(main, text):
wheeler_smith = 'no_support'

return wheeler_smith

# Wiener Sachtextformel
# References:
#     Bamberger, R., & Vanecek, E. (1984). Lesen – Verstehen – Lernen – Schreiben. Jugend und Volk.
#     Lesbarkeitsindex. (2022, July 21). In Wikipedia. https://de.wikipedia.org/w/index.php?title=Lesbarkeitsindex&oldid=224664667
def wstf(main, text):
    # Readability of German texts; returns 'no_support' for other languages
    # and 'text_too_short' when the text has no words or no sentences.
    if text.lang.startswith('deu_'):
        text = get_counts(main, text)

        if text.count_words and text.count_sentences:
            # User-selected formula variant: '1', '2', '3', or '4'
            variant = main.settings_custom['measures']['readability']['wstf']['variant']

            # MS: proportion of words with 3 or more syllables
            # NOTE(review): the published formula uses percentages (x 100),
            # not proportions — confirm against the reference
            ms = get_count_words_syls(text.syls_words, len_min = 3) / text.count_words
            # SL: average sentence length in words
            sl = text.count_words / text.count_sentences
            # IW: proportion of words with 7 or more letters
            iw = get_count_words_letters(text.words_flat, len_min = 7) / text.count_words
            # ES: proportion of monosyllabic words
            es = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1) / text.count_words

            if variant == '1':
                # NOTE(review): the reference gives 0.1935 as the MS coefficient — confirm
                wstf = 0.1925 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
            elif variant == '2':
                wstf = 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
            elif variant == '3':
                wstf = 0.2963 * ms + 0.1905 * sl - 1.1144
            elif variant == '4':
                wstf = 0.2744 * ms + 0.2656 * sl - 1.693
            # NOTE(review): any other variant value would leave wstf unbound
            # (UnboundLocalError); presumably the settings UI restricts it — confirm
        else:
            wstf = 'text_too_short'
    else:
        wstf = 'no_support'

    return wstf
Loading

0 comments on commit ee5197b

Please sign in to comment.