Skip to content

Commit

Permalink
Work Area: 1. Add Profiler - Readability - Dickes-Steiwer Handformel …
Browse files Browse the repository at this point in the history
…2. Update Profiler - Readability - neue Wiener Sachtextformel / SMOG Grade / Tränkle & Bailer's Readability Formula
  • Loading branch information
BLKSerene committed Aug 12, 2023
1 parent 3be750b commit ee5197b
Show file tree
Hide file tree
Showing 12 changed files with 1,026 additions and 748 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@
- Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
- Utils: Add spaCy's Malay word tokenizer
- Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Danielson-Bryan's Readability Formula / Degrees of Reading Power / Easy Listening Formula / Fucks's Stilcharakteristik / Strain Index / Tränkle & Bailer's Readability Formula / Tuldava's Text Difficulty / Wheeler & Smith's Readability Formula
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Danielson-Bryan's Readability Formula / Degrees of Reading Power / Dickes-Steiwer Handformel / Easy Listening Formula / Fucks's Stilcharakteristik / Strain Index / Tränkle & Bailer's Readability Formula / Tuldava's Text Difficulty / Wheeler & Smith's Readability Formula

### ✨ Improvements
- Utils: Update spaCy's sentence recognizers, word tokenizers, part-of-speech taggers, lemmatizers, and dependency parsers
- Utils: Update Wordless's sentence and sentence segment splitters
- Work Area: Update Profiler - Readability - Automated Readability Index / Dale-Chall Readability Formula / Flesch Reading Ease / Flesch Reading Ease (Farr-Jenkins-Paterson) / Gunning Fog Index / Spache Grade Level
- Work Area: Update Profiler - Readability - Automated Readability Index / Dale-Chall Readability Formula / Flesch Reading Ease / Flesch Reading Ease (Farr-Jenkins-Paterson) / Gunning Fog Index / neue Wiener Sachtextformel / SMOG Grade / Spache Grade Level

### 📌 Bugfixes
- Work Area: Fix Concordancer - Generation Settings - Width unit - Character
Expand Down
163 changes: 86 additions & 77 deletions doc/doc_eng.md

Large diffs are not rendered by default.

179 changes: 179 additions & 0 deletions doc/measures/readability/dickes_steiwer_handformel.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
493 changes: 234 additions & 259 deletions doc/measures/readability/wstf.svg → doc/measures/readability/nws.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
163 changes: 108 additions & 55 deletions doc/measures/readability/smog_grade.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
503 changes: 262 additions & 241 deletions doc/measures/readability/trankle_bailers_readability_formula.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
91 changes: 54 additions & 37 deletions tests/wl_tests_measures/test_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'):

test_text_deu_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de')
test_text_deu_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de')
test_text_deu_120 = Wl_Test_Text(TOKENS_MULTILEVEL_120, lang = 'deu_de')

test_text_ita_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ita')
test_text_ita_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ita')
Expand Down Expand Up @@ -268,7 +269,22 @@ def test_devereux_readability_index():
print(f'\tspa/12: {grade_placement_spa_12}')

assert grade_placement_eng_0 == 'text_too_short'
assert grade_placement_eng_12 == grade_placement_spa_12 == 1.56 * (47 / 12) + 0.19 * (12 / 3) - 6.49
assert grade_placement_eng_12 == 1.56 * (47 / 12) + 0.19 * (12 / 3) - 6.49
assert grade_placement_spa_12 != 'text_too_short'

def test_dickes_steiwer_handformel():
    # Exercise the measure on an empty English text, a 12-token English text,
    # and a 12-token Spanish text (the measure is language-independent)
    score_eng_0, score_eng_12, score_spa_12 = (
        wl_measures_readability.dickes_steiwer_handformel(main, text)
        for text in (test_text_eng_0, test_text_eng_12, test_text_spa_12)
    )

    print('Dickes-Steiwer Handformel:')
    print(f'\teng/0: {score_eng_0}')
    print(f'\teng/12: {score_eng_12}')
    print(f'\tspa/12: {score_spa_12}')

    # Empty texts cannot be scored
    assert score_eng_0 == 'text_too_short'
    # eng/12 fixture: 45 alphabetic chars, 12 words (5 of them repeated types), 3 sentences
    assert score_eng_12 == 235.95993 - numpy.log(45 / 12 + 1) * 73.021 - numpy.log(12 / 3 + 1) * 12.56438 - 5 / 12 * 50.03293
    # Non-English texts are supported as well
    assert score_spa_12 != 'text_too_short'

def test_elf():
elf_eng_0 = wl_measures_readability.elf(main, test_text_eng_0)
Expand Down Expand Up @@ -538,6 +554,34 @@ def test_eflaw():
assert eflaw_eng_12 == (12 + 6) / 3
assert eflaw_spa_12 == 'no_support'

def test_nws():
    # neue Wiener Sachtextformel: German-only measure with 3 variants
    nws_deu_0 = wl_measures_readability.nws(main, test_text_deu_0)
    settings['nws']['variant'] = '1'
    nws_deu_12_1 = wl_measures_readability.nws(main, test_text_deu_12)
    settings['nws']['variant'] = '2'
    nws_deu_12_2 = wl_measures_readability.nws(main, test_text_deu_12)
    settings['nws']['variant'] = '3'
    nws_deu_12_3 = wl_measures_readability.nws(main, test_text_deu_12)
    nws_eng_12 = wl_measures_readability.nws(main, test_text_eng_12)

    print('neue Wiener Sachtextformel:')
    print(f'\tdeu/0: {nws_deu_0}')
    print(f'\tdeu/12-1: {nws_deu_12_1}')
    print(f'\tdeu/12-2: {nws_deu_12_2}')
    print(f'\tdeu/12-3: {nws_deu_12_3}')
    print(f'\teng/12: {nws_eng_12}')

    # Expected intermediate values for the 12-token German fixture
    ms = 0 / 12 * 100 # percentage of words with 3+ syllables
    sl = 12 / 3 # average sentence length in words
    iw = 3 / 12 * 100 # percentage of words with 7+ letters
    es = 9 / 12 * 100 # percentage of monosyllabic words

    assert nws_deu_0 == 'text_too_short'
    # Coefficient of ms is 0.1935 (Bamberger & Vanecek, 1984), matching the
    # implementation; ms == 0 here, so a wrong coefficient would go unnoticed
    assert nws_deu_12_1 == 0.1935 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
    assert nws_deu_12_2 == 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
    assert nws_deu_12_3 == 0.2963 * ms + 0.1905 * sl - 1.1144
    # Non-German texts are rejected
    assert nws_eng_12 == 'no_support'

def test_osman():
osman_ara_0 = wl_measures_readability.osman(main, test_text_ara_0)
osman_ara_12 = wl_measures_readability.osman(main, test_text_ara_12)
Expand Down Expand Up @@ -568,17 +612,21 @@ def test_rix():
def test_smog_grade():
    # SMOG Grade: requires syllable tokenization and at least 30 sentences;
    # German texts use the Bamberger & Vanecek (1984) variant
    g_eng_12 = wl_measures_readability.smog_grade(main, test_text_eng_12)
    g_eng_120 = wl_measures_readability.smog_grade(main, test_text_eng_120)
    g_deu_120 = wl_measures_readability.smog_grade(main, test_text_deu_120)
    g_spa_120 = wl_measures_readability.smog_grade(main, test_text_spa_120)
    g_other_12 = wl_measures_readability.smog_grade(main, test_text_other_12)

    print('SMOG Grade:')
    print(f'\teng/12: {g_eng_12}')
    print(f'\teng/120: {g_eng_120}')
    print(f'\tdeu/120: {g_deu_120}')
    print(f'\tspa/120: {g_spa_120}')
    print(f'\tother/12: {g_other_12}')

    # Fewer than 30 sentences
    assert g_eng_12 == 'text_too_short'
    # eng/120 fixture: 15 words with 3+ syllables in the sampled sentences
    assert g_eng_120 == 3.1291 + 1.043 * numpy.sqrt(15)
    # German variant: sqrt(polysyllables per 30 sentences) - 2
    assert g_deu_120 == numpy.sqrt(15 / 30 * 30) - 2
    assert g_spa_120 != 'no_support'
    # Languages without a syllable tokenizer are rejected
    assert g_other_12 == 'no_support'

Expand Down Expand Up @@ -635,8 +683,8 @@ def test_trankle_bailers_readability_formula():
print(f'\tother/100: {trankle_bailers_other_100}')

assert trankle_bailers_eng_0 == 'text_too_short'
assert trankle_bailers_eng_100_prep_1 == 224.6814 - 79.8304 * (372 / 100) - 12.24032 * (100 / 25) - 1.292857 * 1
assert trankle_bailers_eng_100_conj_2 == 234.1063 - 96.11069 * (374 / 100) - 2.05444 * 0 - 1.02805 * 1
assert trankle_bailers_eng_100_prep_1 == 224.6814 - numpy.log(372 / 100 + 1) * 79.8304 - numpy.log(100 / 25 + 1) * 12.24032 - 1 * 1.292857
assert trankle_bailers_eng_100_conj_2 == 234.1063 - numpy.log(374 / 100 + 1) * 96.11069 - 0 * 2.05444 - 1 * 1.02805
assert trankle_bailers_tha_100 != 'no_support'
assert trankle_bailers_other_100 == 'no_support'

Expand Down Expand Up @@ -674,38 +722,6 @@ def test_wheeler_smiths_readability_formula():
assert wheeler_smith_spa_12 != 'no_support'
assert wheeler_smith_other_12 == 'no_support'

def test_wstf():
wstf_deu_0 = wl_measures_readability.wstf(main, test_text_deu_0)
settings['wstf']['variant'] = '1'
wstf_deu_12_1 = wl_measures_readability.wstf(main, test_text_deu_12)
settings['wstf']['variant'] = '2'
wstf_deu_12_2 = wl_measures_readability.wstf(main, test_text_deu_12)
settings['wstf']['variant'] = '3'
wstf_deu_12_3 = wl_measures_readability.wstf(main, test_text_deu_12)
settings['wstf']['variant'] = '4'
wstf_deu_12_4 = wl_measures_readability.wstf(main, test_text_deu_12)
wstf_eng_12 = wl_measures_readability.wstf(main, test_text_eng_12)

print('Wiener Sachtextformel:')
print(f'\tdeu/0: {wstf_deu_0}')
print(f'\tdeu/12-1: {wstf_deu_12_1}')
print(f'\tdeu/12-2: {wstf_deu_12_2}')
print(f'\tdeu/12-3: {wstf_deu_12_3}')
print(f'\tdeu/12-4: {wstf_deu_12_4}')
print(f'\teng/12: {wstf_eng_12}')

ms = 0 / 12
sl = 12 / 3
iw = 3 / 12
es = 9 / 12

assert wstf_deu_0 == 'text_too_short'
assert wstf_deu_12_1 == 0.1925 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
assert wstf_deu_12_2 == 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
assert wstf_deu_12_3 == 0.2963 * ms + 0.1905 * sl - 1.1144
assert wstf_deu_12_4 == 0.2744 * ms + 0.2656 * sl - 1.693
assert wstf_eng_12 == 'no_support'

if __name__ == '__main__':
test_aari()
test_ari()
Expand All @@ -717,6 +733,7 @@ def test_wstf():
test_danielson_bryans_readability_formula()
test_drp()
test_devereux_readability_index()
test_dickes_steiwer_handformel()
test_elf()
test_gl()
test_re_flesch()
Expand All @@ -731,6 +748,7 @@ def test_wstf():
test_lensear_write()
test_lix()
test_eflaw()
test_nws()
test_osman()
test_rix()
test_smog_grade()
Expand All @@ -739,4 +757,3 @@ def test_wstf():
test_trankle_bailers_readability_formula()
test_td()
test_wheeler_smiths_readability_formula()
test_wstf()
4 changes: 2 additions & 2 deletions tests/wl_tests_work_area/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files):
count_tokens_lens_syls.append(collections.Counter(len_tokens_syls))
count_tokens_lens_chars.append(collections.Counter(len_tokens_chars))

assert len(readability_statistics) == 33
assert len(readability_statistics) == 34

# Counts
assert count_paras
Expand Down Expand Up @@ -125,7 +125,7 @@ def update_gui(err_msg, texts_stats_files):
assert all((len_syls == 1 for len_syls in len_tokens_syls))
assert all((len_syls == 1 for len_syls in len_types_syls))

# TTR/STTR
# TTRs
assert ttr
assert sttr

Expand Down
102 changes: 62 additions & 40 deletions wordless/wl_measures/wl_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def get_counts(main, text):
if 'count_words' not in text.__dict__:
text.words_flat = list(wl_misc.flatten_list(text.words_multilevel))
text.count_words = len(text.words_flat)
text.count_word_types = len(set(text.words_flat))

# Count of syllables
if 'count_syls' not in text.__dict__ and text.lang in main.settings_global['syl_tokenizers']:
Expand Down Expand Up @@ -401,6 +402,25 @@ def devereux_readability_index(main, text):

return grade_placement

# Dickes-Steiwer Handformel
# References:
#     Dickes, P. & Steiwer, L. (1977). Ausarbeitung von lesbarkeitsformeln für die deutsche sprache. Zeitschrift für Entwicklungspsychologie und Pädagogische Psychologie, 9(1), 20–28.
#     Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache (p. 57). Jugend und Volk.
def dickes_steiwer_handformel(main, text):
    text = get_counts(main, text)

    # The formula divides by both counts, so reject texts without words or sentences
    if not text.count_words or not text.count_sentences:
        return 'text_too_short'

    word_len = text.count_chars_alpha / text.count_words # average word length in letters
    sentence_len = text.count_words / text.count_sentences # average sentence length in words
    ttr = text.count_word_types / text.count_words # type-token ratio

    return (
        235.95993
        - numpy.log(word_len + 1) * 73.021
        - numpy.log(sentence_len + 1) * 12.56438
        - ttr * 50.03293
    )

# Easy Listening Formula
# Reference: Fang, I. E. (1966). The easy listening formula. Journal of Broadcasting, 11(1), 63–68. https://doi.org/10.1080/08838156609363529
def elf(main, text):
Expand Down Expand Up @@ -449,7 +469,7 @@ def gl(main, text):
# Kopient, A., & Grabar, N. (2020). Rated lexicon for the simplification of medical texts. In B. Gersbeck-Schierholz (ed.), HEALTHINFO 2020: The fifth international conference on informatics and assistive technologies for health-care, medical support and wellbeing (pp. 11–17). IARIA. https://hal.science/hal-03095275/document
# German:
# Amstad, T. (1978). Wie verständlich sind unsere Zeitungen? [Unpublished doctoral dissertation]. University of Zurich.
# Lesbarkeitsindex. (2023, February 2). In Wikipedia. https://de.wikipedia.org/w/index.php?title=Lesbarkeitsindex&oldid=230472824
# Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache (p. 56). Jugend und Volk.
# Italian:
# Franchina, V., & Vacca, R. (1986). Adaptation of Flesh readability index on a bilingual text written by the same author both in Italian and English languages. Linguaggi, 3, 47–49.
# Garais, E. (2011). Web applications readability. Journal of Information Systems and Operations Management, 5(1), 117–121. http://www.rebe.rau.ro/RePEc/rau/jisomg/SP11/JISOM-SP11-A13.pdf
Expand Down Expand Up @@ -839,6 +859,33 @@ def eflaw(main, text):

return eflaw

# neue Wiener Sachtextformel
# Reference: Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache. Jugend und Volk.
def nws(main, text):
    # Readability of German texts; returns 'no_support' for other languages
    # and 'text_too_short' when the text has no words or no sentences.
    if text.lang.startswith('deu_'):
        text = get_counts(main, text)

        if text.count_words and text.count_sentences:
            # User-selected formula variant: '1', '2', or '3'
            variant = main.settings_custom['measures']['readability']['nws']['variant']

            # MS: percentage of words with 3 or more syllables
            ms = get_count_words_syls(text.syls_words, len_min = 3) / text.count_words * 100
            # SL: average sentence length in words
            sl = text.count_words / text.count_sentences
            # IW: percentage of words with 7 or more letters
            iw = get_count_words_letters(text.words_flat, len_min = 7) / text.count_words * 100
            # ES: percentage of monosyllabic words
            es = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1) / text.count_words * 100

            if variant == '1':
                nws = 0.1935 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
            elif variant == '2':
                nws = 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
            elif variant == '3':
                nws = 0.2963 * ms + 0.1905 * sl - 1.1144
            # NOTE(review): any other variant value would leave nws unbound
            # (UnboundLocalError); presumably the settings UI restricts it to
            # '1'-'3' — confirm
        else:
            nws = 'text_too_short'
    else:
        nws = 'no_support'

    return nws

# Estimate number of syllables in Arabic texts by counting short, long, and stress syllables
# Reference: https://github.com/textstat/textstat/blob/9bf37414407bcaaa45c498478ee383c8738e5d0c/textstat/textstat.py#L569
def _get_count_syls_ara(text):
Expand Down Expand Up @@ -925,7 +972,10 @@ def rix(main, text):
return rix

# SMOG Grade
# Reference: McLaughlin, G. H. (1969). SMOG grading: A new readability formula. Journal of Reading, 12(8), pp. 639–646.
# References:
# McLaughlin, G. H. (1969). SMOG grading: A new readability formula. Journal of Reading, 12(8), pp. 639–646.
# German:
# Bamberger, R., & Vanecek, E. (1984). Lesen-verstehen-lernen-schreiben: Die schwierigkeitsstufen von texten in deutscher sprache. Jugend und Volk.
def smog_grade(main, text):
if text.lang in main.settings_global['syl_tokenizers']:
text = get_counts(main, text)
Expand All @@ -947,7 +997,10 @@ def smog_grade(main, text):

count_words_3_plus_syls += get_count_words_syls(syls_words, len_min = 3)

g = 3.1291 + 1.043 * (count_words_3_plus_syls ** 0.5)
if text.lang.startswith('deu_'):
g = numpy.sqrt(count_words_3_plus_syls / text.count_sentences * 30) - 2
else:
g = 3.1291 + 1.043 * numpy.sqrt(count_words_3_plus_syls)
else:
g = 'text_too_short'
else:
Expand Down Expand Up @@ -1046,18 +1099,18 @@ def trankle_bailers_readability_formula(main, text):
if variant == '1':
trankle_bailers = (
224.6814
- 79.8304 * (count_chars_alnum / 100)
- 12.24032 * (100 / count_sentences)
- 1.292857 * count_preps
- numpy.log(count_chars_alnum / 100 + 1) * 79.8304
- numpy.log(100 / count_sentences + 1) * 12.24032
- count_preps * 1.292857
)
elif variant == '2':
count_conjs = sum((1 for _, pos in pos_tags if 'CONJ' in pos)) # CCONJ/SCONJ

trankle_bailers = (
234.1063
- 96.11069 * (count_chars_alnum / 100)
- 2.05444 * count_preps
- 1.02805 * count_conjs
- numpy.log(count_chars_alnum / 100 + 1) * 96.11069
- count_preps * 2.05444
- count_conjs * 1.02805
)
else:
trankle_bailers = 'text_too_short'
Expand Down Expand Up @@ -1193,34 +1246,3 @@ def wheeler_smiths_readability_formula(main, text):
wheeler_smith = 'no_support'

return wheeler_smith

# Wiener Sachtextformel
# References:
#     Bamberger, R., & Vanecek, E. (1984). Lesen – Verstehen – Lernen – Schreiben. Jugend und Volk.
#     Lesbarkeitsindex. (2022, July 21). In Wikipedia. https://de.wikipedia.org/w/index.php?title=Lesbarkeitsindex&oldid=224664667
def wstf(main, text):
    # Readability of German texts; returns 'no_support' for other languages
    # and 'text_too_short' when the text has no words or no sentences.
    if text.lang.startswith('deu_'):
        text = get_counts(main, text)

        if text.count_words and text.count_sentences:
            # User-selected formula variant: '1', '2', '3', or '4'
            variant = main.settings_custom['measures']['readability']['wstf']['variant']

            # MS: proportion of words with 3 or more syllables
            # NOTE(review): the published formula uses percentages (x 100),
            # not proportions — confirm against the reference
            ms = get_count_words_syls(text.syls_words, len_min = 3) / text.count_words
            # SL: average sentence length in words
            sl = text.count_words / text.count_sentences
            # IW: proportion of words with 7 or more letters
            iw = get_count_words_letters(text.words_flat, len_min = 7) / text.count_words
            # ES: proportion of monosyllabic words
            es = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1) / text.count_words

            if variant == '1':
                # NOTE(review): the reference gives 0.1935 as the MS coefficient — confirm
                wstf = 0.1925 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875
            elif variant == '2':
                wstf = 0.2007 * ms + 0.1682 * sl + 0.1373 * iw - 2.779
            elif variant == '3':
                wstf = 0.2963 * ms + 0.1905 * sl - 1.1144
            elif variant == '4':
                wstf = 0.2744 * ms + 0.2656 * sl - 1.693
            # NOTE(review): any other variant value would leave wstf unbound
            # (UnboundLocalError); presumably the settings UI restricts it — confirm
        else:
            wstf = 'text_too_short'
    else:
        wstf = 'no_support'

    return wstf
Loading

0 comments on commit ee5197b

Please sign in to comment.