Skip to content

Commit

Permalink
Work Area: Add Profiler - Readability - Dale-Chall Readability Formula (New)
Browse files Browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Aug 1, 2023
1 parent 9848685 commit 24c58cd
Show file tree
Hide file tree
Showing 20 changed files with 387 additions and 238 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
- Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
- Utils: Add spaCy's Malay word tokenizer
- Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Dale-Chall Readability Formula (New)

### ✨ Improvements
- Utils: Update Wordless's sentence and sentence segment splitters
Expand Down
162 changes: 74 additions & 88 deletions doc/doc_eng.md

Large diffs are not rendered by default.

224 changes: 116 additions & 108 deletions doc/measures/readability/x_c50.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
112 changes: 112 additions & 0 deletions doc/measures/readability/x_c50_new.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed doc/work_area/colligation_extractor_table.png
Binary file not shown.
Binary file not shown.
Binary file removed doc/work_area/collocation_extractor_table.png
Binary file not shown.
Binary file removed doc/work_area/concordancer_fig.png
Binary file not shown.
Binary file removed doc/work_area/concordancer_parallel_table.png
Binary file not shown.
Binary file removed doc/work_area/concordancer_table.png
Binary file not shown.
Binary file removed doc/work_area/keyword_extractor_table.png
Binary file not shown.
Binary file removed doc/work_area/ngram_generator_table.png
Binary file not shown.
Binary file removed doc/work_area/profiler_table.png
Binary file not shown.
Binary file removed doc/work_area/wordlist_generator_fig_line_chart.png
Binary file not shown.
Binary file removed doc/work_area/wordlist_generator_fig_word_cloud.png
Binary file not shown.
Binary file removed doc/work_area/wordlist_generator_table.png
Binary file not shown.
31 changes: 23 additions & 8 deletions tests/wl_tests_measures/test_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,18 +178,32 @@ def test_colemans_readability_formula():
assert cloze_pct_eng_12_4 == 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (0 / 12 * 100) - 0.36 * (0 / 12) - 26.01
assert cloze_pct_other_12 == 'no_support'

def test_dale_chall_readability_score():
x_c50_eng_0 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_0)
x_c50_eng_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_12)
x_c50_spa_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_spa_12)
def test_dale_chall_readability_formula():
x_c50_eng_0 = wl_measures_readability.dale_chall_readability_formula(main, test_text_eng_0)
x_c50_eng_12 = wl_measures_readability.dale_chall_readability_formula(main, test_text_eng_12)
x_c50_spa_12 = wl_measures_readability.dale_chall_readability_formula(main, test_text_spa_12)

print('Dale-Chall Readibility Score:')
print('Dale-Chall Readability Formula:')
print(f'\teng/0: {x_c50_eng_0}')
print(f'\teng/12: {x_c50_eng_12}')
print(f'\tspa/12: {x_c50_spa_12}')

assert x_c50_eng_0 == 'text_too_short'
assert x_c50_eng_12 == 0.1579 * (1 / 12) + 0.0496 * (12 / 3) + 3.6365
assert x_c50_eng_12 == 0.1579 * (1 / 12 * 100) + 0.0496 * (12 / 3) + 3.6365
assert x_c50_spa_12 == 'no_support'

def test_dale_chall_readability_formula_new():
    # Exercise the measure on three fixtures: an English text expected to be
    # reported as too short, a 12-word English text, and a Spanish text
    # expected to be reported as unsupported
    measure = wl_measures_readability.dale_chall_readability_formula_new

    x_c50_eng_0 = measure(main, test_text_eng_0)
    x_c50_eng_12 = measure(main, test_text_eng_12)
    x_c50_spa_12 = measure(main, test_text_spa_12)

    print('Dale-Chall Readability Formula (New):')
    print(f'\teng/0: {x_c50_eng_0}')
    print(f'\teng/12: {x_c50_eng_12}')
    print(f'\tspa/12: {x_c50_spa_12}')

    # The expected value is compared with ==, so it is spelled with the same
    # float operations in the same order as the implementation under test
    expected_eng_12 = 64 - 0.95 * (1 / 12 * 100) - 0.69 * (12 / 3)

    assert x_c50_eng_0 == 'text_too_short'
    assert x_c50_eng_12 == expected_eng_12
    assert x_c50_spa_12 == 'no_support'

def test_devereux_readability_index():
Expand Down Expand Up @@ -308,7 +322,7 @@ def test_formula_de_comprensibilidad_de_gutierrez_de_polini():
cp_spa_12 = wl_measures_readability.formula_de_comprensibilidad_de_gutierrez_de_polini(main, test_text_spa_12)
cp_eng_12 = wl_measures_readability.formula_de_comprensibilidad_de_gutierrez_de_polini(main, test_text_eng_12)

print('Fórmula de comprensibilidad de Gutiérrez de Polini:')
print('Fórmula de Comprensibilidad de Gutiérrez de Polini:')
print(f'\tspa/0: {cp_spa_0}')
print(f'\tspa/12: {cp_spa_12}')
print(f'\teng/12: {cp_eng_12}')
Expand Down Expand Up @@ -518,7 +532,8 @@ def test_wiener_sachtextformel():
test_bormuths_gp()
test_coleman_liau_index()
test_colemans_readability_formula()
test_dale_chall_readability_score()
test_dale_chall_readability_formula()
test_dale_chall_readability_formula_new()
test_devereux_readability_index()
test_flesch_kincaid_grade_level()
test_flesch_reading_ease()
Expand Down
2 changes: 1 addition & 1 deletion tests/wl_tests_work_area/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files):
count_tokens_lens_syls.append(collections.Counter(len_tokens_syls))
count_tokens_lens_chars.append(collections.Counter(len_tokens_chars))

assert len(readability_statistics) == 25
assert len(readability_statistics) == 26

# Counts
assert count_paras
Expand Down
84 changes: 55 additions & 29 deletions wordless/wl_measures/wl_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,12 @@ def bormuths_cloze_mean(main, text):
if text.count_sentences and text.count_words:
ddl = get_count_words_dale(text.words_flat, 3000)
m = (
0.886593 -
0.083640 * (text.count_chars_alphabetic / text.count_words) +
0.161911 * ((ddl / text.count_words)**3) -
0.021401 * (text.count_words / text.count_sentences) +
0.000577 * ((text.count_words / text.count_sentences)**2) -
0.000005 * ((text.count_words / text.count_sentences)**3)
0.886593
- 0.083640 * (text.count_chars_alphabetic / text.count_words)
+ 0.161911 * ((ddl / text.count_words)**3)
- 0.021401 * (text.count_words / text.count_sentences)
+ 0.000577 * ((text.count_words / text.count_sentences)**2)
- 0.000005 * ((text.count_words / text.count_sentences)**3)
)
else:
m = 'text_too_short'
Expand All @@ -193,9 +193,9 @@ def bormuths_gp(main, text):
gp = m
else:
gp = (
4.275 + 12.881 * m - 34.934 * (m**2) + 20.388 * (m**3) +
26.194 * c - 2.046 * (c**2) - 11.767 * (c**3) -
44.285 * (m * c) + 97.620 * ((m * c)**2) - 59.538 * ((m * c)**3)
4.275 + 12.881 * m - 34.934 * (m**2) + 20.388 * (m**3)
+ 26.194 * c - 2.046 * (c**2) - 11.767 * (c**3)
- 44.285 * (m * c) + 97.620 * ((m * c)**2) - 59.538 * ((m * c)**3)
)
else:
gp = 'no_support'
Expand Down Expand Up @@ -238,29 +238,29 @@ def colemans_readability_formula(main, text):

if variant == '1':
cloze_pct = (
1.29 * (count_words_1_syl / text.count_words * 100) -
38.45
1.29 * (count_words_1_syl / text.count_words * 100)
- 38.45
)
elif variant == '2':
cloze_pct = (
1.16 * (count_words_1_syl / text.count_words * 100) +
1.48 * (text.count_sentences / text.count_words * 100) -
37.95
1.16 * (count_words_1_syl / text.count_words * 100)
+ 1.48 * (text.count_sentences / text.count_words * 100)
- 37.95
)
elif variant == '3':
cloze_pct = (
1.07 * (count_words_1_syl / text.count_words * 100) +
1.18 * (text.count_sentences / text.count_words * 100) +
0.76 * (count_prons / text.count_words * 100) -
34.02
1.07 * (count_words_1_syl / text.count_words * 100)
+ 1.18 * (text.count_sentences / text.count_words * 100)
+ 0.76 * (count_prons / text.count_words * 100)
- 34.02
)
elif variant == '4':
cloze_pct = (
1.04 * (count_words_1_syl / text.count_words * 100) +
1.06 * (text.count_sentences / text.count_words * 100) +
0.56 * (count_prons / text.count_words * 100) -
0.36 * (count_preps / text.count_words) -
26.01
1.04 * (count_words_1_syl / text.count_words * 100)
+ 1.06 * (text.count_sentences / text.count_words * 100)
+ 0.56 * (count_prons / text.count_words * 100)
- 0.36 * (count_preps / text.count_words)
- 26.01
)
else:
cloze_pct = 'text_too_short'
Expand All @@ -269,18 +269,18 @@ def colemans_readability_formula(main, text):

return cloze_pct

# Dale-Chall Readability Score
# Dale-Chall Readability Formula
# References:
# Dale, E., & Chall, J. S. (1948a). A formula for predicting readability. Educational Research Bulletin, 27(1), 11–20, 28.
# Dale, E., & Chall, J. S. (1948b). A formula for predicting readability: Instructions. Educational Research Bulletin, 27(2), 37–54.
def dale_chall_readability_score(main, text):
def dale_chall_readability_formula(main, text):
if text.lang.startswith('eng_'):
text = get_counts(main, text)

if text.count_words and text.count_sentences:
count_difficult_words = get_count_words_dale(text.words_flat, 3000)
x_c50 = (
0.1579 * (count_difficult_words / text.count_words)
0.1579 * (count_difficult_words / text.count_words * 100)
+ 0.0496 * (text.count_words / text.count_sentences)
+ 3.6365
)
Expand All @@ -291,6 +291,26 @@ def dale_chall_readability_score(main, text):

return x_c50

# Dale-Chall Readability Formula (New)
# Reference: Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall readability formula. Brookline Books.
def dale_chall_readability_formula_new(main, text):
    # Returns the computed score as a number, or one of two marker strings:
    # 'no_support' when the language code does not start with 'eng_', and
    # 'text_too_short' when the word or sentence count is zero
    if not text.lang.startswith('eng_'):
        return 'no_support'

    text = get_counts(main, text)

    # Both ratios below divide by these counts, so neither may be zero
    if not (text.count_words and text.count_sentences):
        return 'text_too_short'

    count_difficult_words = get_count_words_dale(text.words_flat, 3000)

    # Keep the arithmetic in this exact order: the unit tests compare the
    # result against the same expression using ==
    pct_difficult_words = count_difficult_words / text.count_words * 100
    avg_sentence_len = text.count_words / text.count_sentences

    return 64 - 0.95 * pct_difficult_words - 0.69 * avg_sentence_len

# Devereux Readability Index
# Reference: Smith, E. A. (1961). Devereaux readability index. Journal of Educational Research, 54(8), 298–303. https://doi.org/10.1080/00220671.1961.10882728
def devereux_readability_index(main, text):
Expand Down Expand Up @@ -457,7 +477,7 @@ def forcast_grade_level(main, text):

return rgl

# Fórmula de comprensibilidad de Gutiérrez de Polini
# Fórmula de Comprensibilidad de Gutiérrez de Polini
# References:
# Gutiérrez de Polini, L. E. (1972). Investigación sobre lectura en Venezuela [Paper presentation]. Primeras Jornadas de Educación Primaria, Ministerio de Educación, Caracas, Venezuela.
# Rodríguez Trujillo, N. (1980). Determinación de la comprensibilidad de materiales de lectura por medio de variables lingüísticas. Lectura y Vida, 1(1). http://www.lecturayvida.fahce.unlp.edu.ar/numeros/a1n1/01_01_Rodriguez.pdf
Expand Down Expand Up @@ -506,7 +526,10 @@ def gulpease_index(main, text):
text = get_counts(main, text)

if text.count_words:
gulpease_index = 89 + (300 * text.count_sentences - 10 * text.count_chars_alphabetic) / text.count_words
gulpease_index = (
89
+ (300 * text.count_sentences - 10 * text.count_chars_alphabetic) / text.count_words
)
else:
gulpease_index = 'text_too_short'
else:
Expand Down Expand Up @@ -543,7 +566,10 @@ def gunning_fog_index(main, text):
if len(syls) >= 4:
count_hard_words += 1

fog_index = 0.4 * (text.count_words / text.count_sentences + count_hard_words / text.count_words * 100)
fog_index = (
0.4
* (text.count_words / text.count_sentences + count_hard_words / text.count_words * 100)
)
else:
fog_index = 'text_too_short'
else:
Expand Down
8 changes: 5 additions & 3 deletions wordless/wl_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,13 +363,14 @@ def __init__(self, parent):
_tr('wl_profiler', "Bormuth's Grade Placement"),
_tr('wl_profiler', 'Coleman-Liau Index'),
_tr('wl_profiler', "Coleman's Readability Formula"),
_tr('wl_profiler', 'Dale-Chall Readability Score'),
_tr('wl_profiler', 'Dale-Chall Readability Formula'),
_tr('wl_profiler', 'Dale-Chall Readability Formula (New)'),
_tr('wl_profiler', 'Devereaux Readability Index'),
_tr('wl_profiler', 'Flesch-Kincaid Grade Level'),
_tr('wl_profiler', 'Flesch Reading Ease'),
_tr('wl_profiler', 'Flesch Reading Ease (Simplified)'),
_tr('wl_profiler', 'FORCAST Grade Level'),
_tr('wl_profiler', 'Fórmula de comprensibilidad de Gutiérrez de Polini'),
_tr('wl_profiler', 'Fórmula de Comprensibilidad de Gutiérrez de Polini'),
_tr('wl_profiler', 'Fórmula de Crawford'),
_tr('wl_profiler', 'Gulpease Index'),
_tr('wl_profiler', 'Gunning Fog Index'),
Expand Down Expand Up @@ -1180,7 +1181,8 @@ def run(self):
wl_measures_readability.bormuths_gp(self.main, text),
wl_measures_readability.coleman_liau_index(self.main, text),
wl_measures_readability.colemans_readability_formula(self.main, text),
wl_measures_readability.dale_chall_readability_score(self.main, text),
wl_measures_readability.dale_chall_readability_formula(self.main, text),
wl_measures_readability.dale_chall_readability_formula_new(self.main, text),
wl_measures_readability.devereux_readability_index(self.main, text),
wl_measures_readability.flesch_kincaid_grade_level(self.main, text),
wl_measures_readability.flesch_reading_ease(self.main, text),
Expand Down

0 comments on commit 24c58cd

Please sign in to comment.