From c5e200ba51836570cfe6ea00fd533852475a6155 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Thu, 27 Jul 2023 21:34:04 +0800 Subject: [PATCH] Settings: Add Settings - Measures - Readability - Coleman's Readability Formula; Work Area: Add Profiler - Readability - Coleman's Readability Formula --- CHANGELOG.md | 4 +- doc/doc_eng.md | 96 ++-- .../colemans_readability_formula.svg | 411 ++++++++++++++++++ doc/measures/readability/lensear_write.svg | 53 +-- doc/measures/readability/re_simplified.svg | 120 +++-- doc/measures/readability/rgl.svg | 25 +- doc/measures/readability/smog_grade.svg | 30 +- doc/measures/readability/wstf.svg | 150 +++---- .../test_measures_readability.py | 69 ++- tests/wl_tests_work_area/test_profiler.py | 2 +- .../wl_measures/wl_measures_readability.py | 94 +++- wordless/wl_profiler.py | 2 + wordless/wl_settings/wl_settings_default.py | 4 + wordless/wl_settings/wl_settings_measures.py | 26 +- 14 files changed, 780 insertions(+), 306 deletions(-) create mode 100644 doc/measures/readability/colemans_readability_formula.svg diff --git a/CHANGELOG.md b/CHANGELOG.md index ba214f00e..49c1a8845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,13 +20,13 @@ ## [3.3.0](https://github.com/BLKSerene/Wordless/releases/tag/3.3.0) - ??/??/2023 ### 🎉 New Features -- Settings: Add Settings - Measures - Readability - Bormuth's Grade Placement / Flesch Reading Ease +- Settings: Add Settings - Measures - Readability - Bormuth's Grade Placement / Coleman's Readability Formula / Flesch Reading Ease - Utils: Add khmer-nltk's Khmer sentence tokenizer, word tokenizer, and part-of-speech tagger - Utils: Add PyThaiNLP's perceptron part-of-speech tagger (Blackboard) - Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser - Utils: Add spaCy's Malay word tokenizer - Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser -- Work Area: Add Profiler - 
Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement +- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula ### ✨ Improvements - Utils: Update Wordless's sentence and sentence segment splitters diff --git a/doc/doc_eng.md b/doc/doc_eng.md index 7ff37ec57..d614426a5 100644 --- a/doc/doc_eng.md +++ b/doc/doc_eng.md @@ -907,7 +907,8 @@ These variables are used in the following formulas:
**NumWords**: Number of words in the text or sample
**NumWordsDale₇₆₉**: Number of words outside the Dale list of 769 easy words ([Dale, 1931](#ref-dale-1931))
**NumWordsDale₃₀₀₀**: Number of words outside the Dale list of 3000 easy words ([Dale & Chall, 1948b](#ref-dale-chall-1948b))
-**NumWordsMonosyllabic**: Number of monosyllabic words
+**NumWords1Syl**: Number of monosyllabic words
+**NumWords3PlusSyls**: Number of words with 3 or more syllables
**NumSyls**: Number of syllable in the text or sample
**NumCharsAll**: Number of characters (including letters, CJK characters, etc., numerals, and punctuation marks) in the text or sample
**NumCharsAlphanumeric**: Number of alphanumeric characters (letters, CJK characters, etc., numerals) in the text or sample
@@ -934,6 +935,14 @@ Coleman-Liau Index: Grade \; Level &= -27.4004 \times \frac{Estimated \; Cloze \; %}{100} + 23.06395 \end{align*} +Coleman's Readability Formula: + \begin{align*} + Cloze \; % &= 1.29 \times \left(\frac{NumWords1Syl}{NumWords} \times 100\right) - 38.45 \\ + Cloze \; % &= 1.16 \times \left(\frac{NumWords1Syl}{NumWords} \times 100\right) + 1.48 \times \left(\frac{NumSentences}{NumWords} \times 100\right) - 37.95 \\ + Cloze \; % &= 1.07 \times \left(\frac{NumWords1Syl}{NumWords} \times 100\right) + 1.18 \times \left(\frac{NumSentences}{NumWords} \times 100\right) + 0.76 \times \left(\frac{NumProns}{NumWords} \times 100\right) - 34.02 \\ + Cloze \; % &= 1.04 \times \left(\frac{NumWords1Syl}{NumWords} \times 100\right) + 1.06 \times \left(\frac{NumSentences}{NumWords} \times 100\right) + 0.56 \times \left(\frac{NumProns}{NumWords} \times 100\right) - 0.36 \times \left(\frac{NumPreps}{NumWords} \times 100\right) - 26.01 + \end{align*} + Dale-Chall Readability Score: X_{c50} = 0.1579 \times \frac{NumWordsDale_{3000}}{NumWords} + 0.0496 \times \frac{NumWords}{NumSentences} + 3.6365 @@ -958,10 +967,10 @@ Flesch Reading Ease: \end{align*} Flesch Reading Ease (Simplified): - {RE = 1.599 \times \left(\frac{NumWordsMonosyllabic}{NumWords} \times 100\right) - 1.015 \times \frac{NumWords}{NumSentences} - 31.517} + {RE = 1.599 \times \left(\frac{NumWords1Syl}{NumWords} \times 100\right) - 1.015 \times \frac{NumWords}{NumSentences} - 31.517} FORCAST Grade Level: - RGL = 20.43 - 0.11 \times NumWordsMonosyllabic + RGL = 20.43 - 0.11 \times NumWords1Syl Fórmula de comprensibilidad de Gutiérrez de Polini: CP = 95.2 - 9.7 \times \frac{NumCharsAlphabetic}{NumWords} - 0.35 \times \frac{NumWords}{NumSentences} @@ -979,7 +988,7 @@ Legibilidad µ: \mu = \frac{NumWords}{NumWords - 1} \times \frac{LenWordsAvg}{LenWordsVar} \times 100 Lensear Write: - Score = NumWordsMonosyllabic + 3 \times NumSentences + Score = NumWords1Syl + 3 \times NumSentences Lix: Lix = 
\frac{NumWords}{NumSentences} + 100 \times \frac{NumLongWords}{NumWords} @@ -994,15 +1003,15 @@ Rix: Rix = \frac{NumLongWords}{NumSentences} SMOG Grade: - g = 3.1291 + 1.043 \times \sqrt{NumWordsPolysyllabic} + g = 3.1291 + 1.043 \times \sqrt{NumWords3PlusSyls} Spache Grade Level: {Grade \; Level = 0.141 \times \frac{100}{NumSentences} + 0.086 \times \left(\frac{NumWordsDale_{769}}{100} \times 100\right) + 0.839} Wiener Sachtextformel: \begin{align*} - MS &= \frac{NumWordsPolysyllabic}{NumWords} \qquad SL = \frac{NumWords}{NumSentences} \\ - IW &= \frac{NumLongWords}{NumWords} \qquad \qquad \quad ES = \frac{NumWordsMonosyllabic}{NumWords} \\ + MS &= \frac{NumWords3PlusSyls}{NumWords} \qquad SL = \frac{NumWords}{NumSentences} \\ + IW &= \frac{NumLongWords}{NumWords} \qquad \qquad \; ES = \frac{NumWords1Syl}{NumWords} \\ WSTF_1 &= 0.1925 \times MS + 0.1672 \times SL + 0.1297 \times IW - 0.0327 \times ES - 0.875 \\ WSTF_2 &= 0.2007 \times MS + 0.1682 \times SL + 0.1373 \times IW - 2.779 \\ WSTF_3 &= 0.2963 \times MS + 0.1905 \times SL - 1.1144 \\ @@ -1016,25 +1025,26 @@ Measure of Readability|Formula Automated Readability Index
([Smith & Senter, 1967](#ref-smith-senter-1967))|![Formula](/doc/measures/readability/ari.svg) Bormuth's Cloze Mean & Grade Placement
([Bormuth, 1969](#ref-bormuth-1969))|![Formula](/doc/measures/readability/bormuths_cloze_mean_gp.svg)
where **C** is the cloze criterion score, whose value could be changed via **Menu → Preferences → Settings → Measures → Readability → Bormuth's Grade Placement - Cloze criterion score**

* This measure applies only to **English texts**. Coleman-Liau Index
([Coleman & Liau, 1975](#ref-coleman-liau-1975))|![Formula](/doc/measures/readability/coleman_liau_index.svg) +Coleman's Readability Formula¹
([Liau et al., 1976](#ref-coleman-et-al-1976))|![Formula](/doc/measures/readability/colemans_readability_formula.svg)
where **NumProns** is the number of pronouns and **NumPreps** is the number of prepositions.

* This measure applies only to **English texts**.
* This measure has 4 variants, which you could select via **Menu → Preferences → Settings → Measures → Readability → Coleman's Readability Formula → Variant**. Dale-Chall Readability Score
([Dale & Chall, 1948a](#ref-dale-chall-1948a))|![Formula](/doc/measures/readability/x_c50.svg)

* This measure applies only to **English texts**. Devereux Readability Index
([Smith, 1961](#ref-smith-1961))|![Formula](/doc/measures/readability/devereux_readability_index.svg) -Flesch-Kincaid Grade Level1
([Kincaid et al., 1975](#ref-kincaid-et-al-1975))|![Formula](/doc/measures/readability/flesch_kincaid_grade_level.svg) -Flesch Reading Ease1
([Flesch, 1948](#ref-flesch-1948)
Dutch: [Douma, 1960](#ref-douma-1960); [Brouwer, 1963](#ref-brouwer-1963)
French: [Kandel & Moles, 1958](#ref-kandel-moles-1958)
German: [Amstad, 1978](#ref-amstad-1978)
Italian: [Franchina & Vacca, 1986](#ref-franchina-vacca-1986)
Russian: [Oborneva, 2006](#ref-oborneva-2006)
Spanish: [Fernández Huerta, 1959](#ref-fernandez-huerta-1959); [Szigriszt Pazos, 1993](#ref-szigrisze-pazos-1993))|![Formula](/doc/measures/readability/re.svg)

* This measure has multiple variants for some languages, which you could select via **Menu → Preferences → Settings → Measures → Readability → Flesch Reading Ease**. -Flesch Reading Ease (Simplified)1
([Farr et al., 1951](#ref-farr-et-al-1951))|![Formula](/doc/measures/readability/re_simplified.svg) -FORCAST Grade Level1
([Caylor et al., 1973](#ref-caylor-et-al-1973))|![Formula](/doc/measures/readability/rgl.svg)

* A sample of 150 words is taken randomly from the text, thus the text should be **at least 150 words long**. +Flesch-Kincaid Grade Level¹
([Kincaid et al., 1975](#ref-kincaid-et-al-1975))|![Formula](/doc/measures/readability/flesch_kincaid_grade_level.svg) +Flesch Reading Ease¹
([Flesch, 1948](#ref-flesch-1948)
Dutch: [Douma, 1960](#ref-douma-1960); [Brouwer, 1963](#ref-brouwer-1963)
French: [Kandel & Moles, 1958](#ref-kandel-moles-1958)
German: [Amstad, 1978](#ref-amstad-1978)
Italian: [Franchina & Vacca, 1986](#ref-franchina-vacca-1986)
Russian: [Oborneva, 2006](#ref-oborneva-2006)
Spanish: [Fernández Huerta, 1959](#ref-fernandez-huerta-1959); [Szigriszt Pazos, 1993](#ref-szigrisze-pazos-1993))|![Formula](/doc/measures/readability/re.svg)

* This measure has multiple variants for some languages, which you could select via **Menu → Preferences → Settings → Measures → Readability → Flesch Reading Ease**. +Flesch Reading Ease (Simplified)¹
([Farr et al., 1951](#ref-farr-et-al-1951))|![Formula](/doc/measures/readability/re_simplified.svg) +FORCAST Grade Level¹
([Caylor et al., 1973](#ref-caylor-et-al-1973))|![Formula](/doc/measures/readability/rgl.svg)

* A sample of 150 words is taken randomly from the text, thus the text should be **at least 150 words long**. Fórmula de comprensibilidad de Gutiérrez de Polini
([Gutiérrez de Polini, 1972](#ref-gutierrez-de-polini-1972))|![Formula](/doc/measures/readability/cp.svg)

* This measure applies only to **Spanish texts**. -Fórmula de Crawford1
([Crawford, 1985](#ref-crawford-1985))|![Formula](/doc/measures/readability/formula_de_crawford.svg)

* This measure applies only to **Spanish texts**. +Fórmula de Crawford¹
([Crawford, 1985](#ref-crawford-1985))|![Formula](/doc/measures/readability/formula_de_crawford.svg)

* This measure applies only to **Spanish texts**. Gulpease Index
([Lucisano & Emanuela Piemontese, 1988](#ref-lucisano-emanuela-piemontese-1988))|![Formula](/doc/measures/readability/gulpease_index.svg)

* This measure applies only to **Italian texts**. -Gunning Fog Index1
(English: [Gunning, 1968](#ref-gunning-1968)
Polish: [Pisarek, 1969](#ref-pisarek-1969))|![Formula](/doc/measures/readability/fog_index.svg)
where **NumHardWords** is the number of words with 3 or more syllables excluding all proper nouns and words with 3 syllables ending with *-ed* or *-es* for **English texts**, and the number of words with 4 or more syllables for **Polish texts**.

* This measure applies only to **English texts** and **Polish texts**. +Gunning Fog Index¹
(English: [Gunning, 1968](#ref-gunning-1968)
Polish: [Pisarek, 1969](#ref-pisarek-1969))|![Formula](/doc/measures/readability/fog_index.svg)
where **NumHardWords** is the number of words with 3 or more syllables excluding all proper nouns and words with 3 syllables ending with *-ed* or *-es* for **English texts**, and the number of words with 4 or more syllables for **Polish texts**.

* This measure applies only to **English texts** and **Polish texts**. Legibilidad µ
([Muñoz Baquedano, 2006](#ref-munoz-baquedano-2006))|![Formula](/doc/measures/readability/mu.svg)
where **LenWordsAvg** is the average word length in letters, and **LenWordsVar** is the variance of word lengths in letters.

* This measure applies only to **Spanish texts**.
* The text should be **at least 2 words long**. -Lensear Write1
([O’Hayre, 1966](#ref-o-hayre-1966))|![Formula](/doc/measures/readability/lensear_write.svg)
where **NumWordsMonosyllabic** is the number of monosyllabic words excluding *the*, *is*, *are*, *was*, *were*, and **NumSentences** is the number of sentences to the nearest period.

* This measure applies only to **English texts**.
* A sample of 100 words is taken randomly from the text.
* If the text is **shorter than 100 words**, **NumWordsMonosyllabic** and **NumSentences** need to be multiplied by 100 and then divided by the number of text. +Lensear Write¹
([O’Hayre, 1966](#ref-o-hayre-1966))|![Formula](/doc/measures/readability/lensear_write.svg)
where **NumWords1Syl** is the number of monosyllabic words excluding *the*, *is*, *are*, *was*, *were*, and **NumSentences** is the number of sentences to the nearest period.

* This measure applies only to **English texts**.
* A sample of 100 words is taken randomly from the text.
* If the text is **shorter than 100 words**, **NumWords1Syl** and **NumSentences** need to be multiplied by 100 and then divided by the number of words in the text. Lix
([Björnsson, 1968](#ref-bjornsson-1968))|![Formula](/doc/measures/readability/lix.svg)
where **NumLongWords** is the number of words with 7 or more letters. McAlpine EFLAW Readability Score
([Nirmaldasan, 2009](#ref-nirmaldasan-2009))|![Formula](/doc/measures/readability/eflaw.svg)

* This measure applies only to **English texts**. OSMAN
([El-Haj & Rayson, 2016](#ref-elhaj-rayson-2016))|![Formula](/doc/measures/readability/osman.svg)
where **NumLongWords** is the number of words with 6 or more letters, **NumComplexWords** is the number of words with 5 or more syllables, and **NumFaseehWords** is the number of complex words containing ء/ئ/ؤ/ذ/ظ or ending with وا/ون.

* This measure applies only to **Arabic texts**.
* The number of syllables in each Arabic word is estimated by adding the number of short syllables and twice the number of long and stress syllables. Rix
([Anderson, 1983](#ref-anderson-1983))|![Formula](/doc/measures/readability/rix.svg)
where **NumLongWords** is the number of words with 7 or more letters. -SMOG Grade1
([McLaughlin, 1969](#ref-mclaughlin-1969))|![Formula](/doc/measures/readability/smog_grade.svg)
where **NumWordsPolysyllabic** is the number of words with 3 or more syllables.

* A sample consisting of the first 10 sentences of the text, the last 10 sentences of the text, and 10 sentences at the middle of the text is taken from the text, thus the text should be **at least 30 sentences long**. +SMOG Grade¹
([McLaughlin, 1969](#ref-mclaughlin-1969))|![Formula](/doc/measures/readability/smog_grade.svg)

* A sample consisting of the first 10 sentences of the text, the last 10 sentences of the text, and 10 sentences at the middle of the text is taken from the text, thus the text should be **at least 30 sentences long**. Spache Grade Level
([Spache, 1953](#ref-spache-1953))|![Formula](/doc/measures/readability/spache_grade_level.svg)

* Three samples each of 100 words are taken randomly from the text and the mean of the three scores is calculated, thus the text should be **at least 100 words long**. -Wiener Sachtextformel1
([Bamberger & Vanecek, 1984](#ref-bamberger-vanecek-1984))|![Formula](/doc/measures/readability/wstf.svg)
where **NumWordsPolysyllabic** is the number of words with 3 or more syllables and **NumLongWords** is the numbers of words with 7 or more letters.

* This measure applies only to **German texts**.
* This measure has 4 variants, which you could select via **Menu → Preferences → Settings → Measures → Readability → Wiener Sachtextformel → Variant**. +Wiener Sachtextformel¹
([Bamberger & Vanecek, 1984](#ref-bamberger-vanecek-1984))|![Formula](/doc/measures/readability/wstf.svg)
where **NumLongWords** is the number of words with 7 or more letters.

* This measure applies only to **German texts**.
* This measure has 4 variants, which you could select via **Menu → Preferences → Settings → Measures → Readability → Wiener Sachtextformel → Variant**. **Notes:** 1. Requires **built-in syllable tokenization support** @@ -1455,55 +1465,57 @@ Cubic Association Ratio
([Daille, 1994](#ref-daille-1994), [1995](#ref-daille [45] [**^**](#ref-kromers-ur) Kromer, V. (2003). A usage measure based on psychophysical relations. *Journal of Quantitative Linguistics*, *10*(2), 177–186. https://doi.org/10.1076/jqul.10.2.177.16718
[46] [**^**](#ref-mi-log-f) Lexical Computing. (2015, July 8). *Statistics used in Sketch Engine*. Sketch Engine. https://www.sketchengine.eu/documentation/statistics-used-in-sketch-engine/
+ +[47] [**^**](#ref-colemans-readability-formula) Liau, T. L., Bassin, C. B., Martin, C. J., & Coleman, E. B. (1976). Modification of the Coleman readability formulas. *Journal of Reading Behavior*, *8*(4), 381–386. https://journals.sagepub.com/doi/pdf/10.1080/10862967609547193
-[47] [**^**](#ref-griess-dp-norm) Lijffijt, J., & Gries, S. T. (2012). Correction to Stefan Th. Gries’ “dispersions and adjusted frequencies in corpora”. *International Journal of Corpus Linguistics*, *17*(1), 147–149. https://doi.org/10.1075/ijcl.17.1.08lij
+[48] [**^**](#ref-griess-dp-norm) Lijffijt, J., & Gries, S. T. (2012). Correction to Stefan Th. Gries’ “dispersions and adjusted frequencies in corpora”. *International Journal of Corpus Linguistics*, *17*(1), 147–149. https://doi.org/10.1075/ijcl.17.1.08lij
-[48] [**^**](#ref-gulpease-index) Lucisano, P., & Emanuela Piemontese, M. (1988). GULPEASE: A formula for the prediction of the difficulty of texts in Italian. *Scuola e Città*, *39*(3), pp. 110–124.
+[49] [**^**](#ref-gulpease-index) Lucisano, P., & Emanuela Piemontese, M. (1988). GULPEASE: A formula for the prediction of the difficulty of texts in Italian. *Scuola e Città*, *39*(3), pp. 110–124.
-[49] [**^**](#ref-lynes-d3) Lyne, A. A. (1985). Dispersion. In *The vocabulary of French business correspondence: Word frequencies, collocations, and problems of lexicometric method* (pp. 101–124). Slatkine/Champion.
+[50] [**^**](#ref-lynes-d3) Lyne, A. A. (1985). Dispersion. In *The vocabulary of French business correspondence: Word frequencies, collocations, and problems of lexicometric method* (pp. 101–124). Slatkine/Champion.
-[50] [**^**](#ref-smog-grade) McLaughlin, G. H. (1969). SMOG grading: A new readability formula. *Journal of Reading*, *12*(8), pp. 639–646.
+[51] [**^**](#ref-smog-grade) McLaughlin, G. H. (1969). SMOG grading: A new readability formula. *Journal of Reading*, *12*(8), pp. 639–646.
-[51] [**^**](#ref-legibilidad-mu) Muñoz Baquedano, M. (2006). Legibilidad y variabilidad de los textos. *Boletín de Investigación Educacional, Pontificia Universidad Católica de Chile*, *21*(2), 13–26.
+[52] [**^**](#ref-legibilidad-mu) Muñoz Baquedano, M. (2006). Legibilidad y variabilidad de los textos. *Boletín de Investigación Educacional, Pontificia Universidad Católica de Chile*, *21*(2), 13–26.
-[52] [**^**](#ref-eflaw) Nirmaldasan. (2009, April 30). *McAlpine EFLAW readability score*. Readability Monitor. Retrieved November 15, 2022, from https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
+[53] [**^**](#ref-eflaw) Nirmaldasan. (2009, April 30). *McAlpine EFLAW readability score*. Readability Monitor. Retrieved November 15, 2022, from https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
-[53] [**^**](#ref-pearsons-chi-squared-test) Oakes, M. P. (1998). *Statistics for Corpus Linguistics*. Edinburgh University Press.
+[54] [**^**](#ref-pearsons-chi-squared-test) Oakes, M. P. (1998). *Statistics for Corpus Linguistics*. Edinburgh University Press.
-[54] [**^**](#ref-re) Oborneva, I. V. (2006). *Автоматизированная оценка сложности учебных текстов на основе статистических параметров* [Doctoral dissertation, Institute for Strategy of Education Development of the Russian Academy of Education]. Freereferats.ru. https://static.freereferats.ru/_avtoreferats/01002881899.pdf?ver=3
+[55] [**^**](#ref-re) Oborneva, I. V. (2006). *Автоматизированная оценка сложности учебных текстов на основе статистических параметров* [Doctoral dissertation, Institute for Strategy of Education Development of the Russian Academy of Education]. Freereferats.ru. https://static.freereferats.ru/_avtoreferats/01002881899.pdf?ver=3
-[55] [**^**](#ref-lensear-write) O’Hayre, J. (1966). *Gobbledygook has gotta go*. U.S. Government Printing Office. https://www.governmentattic.org/15docs/Gobbledygook_Has_Gotta_Go_1966.pdf
+[56] [**^**](#ref-lensear-write) O’Hayre, J. (1966). *Gobbledygook has gotta go*. U.S. Government Printing Office. https://www.governmentattic.org/15docs/Gobbledygook_Has_Gotta_Go_1966.pdf
-[56] [**^**](#ref-students-t-test-2-sample) Paquot, M., & Bestgen, Y. (2009). Distinctive words in academic writing: A comparison of three statistical tests for keyword extraction. *Language and Computers*, *68*, 247–269.
+[57] [**^**](#ref-students-t-test-2-sample) Paquot, M., & Bestgen, Y. (2009). Distinctive words in academic writing: A comparison of three statistical tests for keyword extraction. *Language and Computers*, *68*, 247–269.
-[57] [**^**](#ref-fishers-exact-test) Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), *Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference* (pp. 188–200). The South–Central Regional SAS Users' Group.
+[58] [**^**](#ref-fishers-exact-test) Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), *Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference* (pp. 188–200). The South–Central Regional SAS Users' Group.
-[58] [**^**](#ref-min-sensitivity) Pedersen, T. (1998). Dependent bigram identification. In *Proceedings of the Fifteenth National Conference on Artificial Intelligence* (p. 1197). AAAI Press.
+[59] [**^**](#ref-min-sensitivity) Pedersen, T. (1998). Dependent bigram identification. In *Proceedings of the Fifteenth National Conference on Artificial Intelligence* (p. 1197). AAAI Press.
-[59] [**^**](#ref-fog-index) Pisarek, W. (1969). Jak mierzyć zrozumiałość tekstu?. *Zeszyty Prasoznawcze*, *4*(42), 35–48.
+[60] [**^**](#ref-fog-index) Pisarek, W. (1969). Jak mierzyć zrozumiałość tekstu?. *Zeszyty Prasoznawcze*, *4*(42), 35–48.
-[60] [**^**](#ref-odds-ratio) Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. *Corpus Linguistics and Linguistic Theory*, *15*(1), pp. 133–167. https://doi.org/10.1515/cllt-2015-0030
+[61] [**^**](#ref-odds-ratio) Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. *Corpus Linguistics and Linguistic Theory*, *15*(1), pp. 133–167. https://doi.org/10.1515/cllt-2015-0030
-[61] [**^**](#ref-poisson-collocation-measure) Quasthoff, U., & Wolff, C. (2002). The poisson collocation measure and its applications. *Proceedings of 2nd International Workshop on Computational Approaches to Collocations*. IEEE.
+[62] [**^**](#ref-poisson-collocation-measure) Quasthoff, U., & Wolff, C. (2002). The poisson collocation measure and its applications. *Proceedings of 2nd International Workshop on Computational Approaches to Collocations*. IEEE.
-[62] [**^**](#ref-rosengrens-s)[**^**](#ref-rosengrens-kf) Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. *Études de linguistique appliquée*, *1*, 103–127.
+[63] [**^**](#ref-rosengrens-s)[**^**](#ref-rosengrens-kf) Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. *Études de linguistique appliquée*, *1*, 103–127.
-[63] [**^**](#ref-log-dice) Rychlý, P. (2008). A lexicographyer-friendly association score. In P. Sojka & A. Horák (Eds.), *Proceedings of Second Workshop on Recent Advances in Slavonic Natural Languages Processing*. Masaryk University
+[64] [**^**](#ref-log-dice) Rychlý, P. (2008). A lexicographyer-friendly association score. In P. Sojka & A. Horák (Eds.), *Proceedings of Second Workshop on Recent Advances in Slavonic Natural Languages Processing*. Masaryk University
-[64] [**^**](#ref-ald) [**^**](#ref-fald) [**^**](#ref-arf) [**^**](#ref-farf) [**^**](#ref-awt) [**^**](#ref-fawt) Savický, P., & Hlaváčová, J. (2002). Measures of word commonness. *Journal of Quantitative Linguistics*, *9*(3), 215–231. https://doi.org/10.1076/jqul.9.3.215.14124
+[65] [**^**](#ref-ald) [**^**](#ref-fald) [**^**](#ref-arf) [**^**](#ref-farf) [**^**](#ref-awt) [**^**](#ref-fawt) Savický, P., & Hlaváčová, J. (2002). Measures of word commonness. *Journal of Quantitative Linguistics*, *9*(3), 215–231. https://doi.org/10.1076/jqul.9.3.215.14124
-[65] [**^**](#ref-dices-coeff) Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. *Computational Linguistics*, *22*(1), pp. 1–38.
+[66] [**^**](#ref-dices-coeff) Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. *Computational Linguistics*, *22*(1), pp. 1–38.
-[66] [**^**](#ref-devereux-readability-index) Smith, E. A. (1961). Devereaux readability index. *Journal of Educational Research*, *54*(8), 298–303. https://doi.org/10.1080/00220671.1961.10882728
+[67] [**^**](#ref-devereux-readability-index) Smith, E. A. (1961). Devereaux readability index. *Journal of Educational Research*, *54*(8), 298–303. https://doi.org/10.1080/00220671.1961.10882728
-[67] [**^**](#ref-ari) Smith, E. A., & Senter, R. J. (1967). *Automated readability index*. Aerospace Medical Research Laboratories. https://apps.dtic.mil/sti/pdfs/AD0667273.pdf
+[68] [**^**](#ref-ari) Smith, E. A., & Senter, R. J. (1967). *Automated readability index*. Aerospace Medical Research Laboratories. https://apps.dtic.mil/sti/pdfs/AD0667273.pdf
-[68] [**^**](#ref-spache-grade-level) Spache, G. (1953). A new readability formula for primary-grade reading materials. *Elementary School Journal*, *53*(7), 410–413. https://doi.org/10.1086/458513
+[69] [**^**](#ref-spache-grade-level) Spache, G. (1953). A new readability formula for primary-grade reading materials. *Elementary School Journal*, *53*(7), 410–413. https://doi.org/10.1086/458513
-[69] [**^**](#ref-re) Szigriszt Pazos, F. (1993). *Sistemas predictivos de legibilidad del mensaje escrito: Formula de perspicuidad* [Doctoral dissertation, Complutense University of Madrid]. Biblos-e Archivo. https://repositorio.uam.es/bitstream/handle/10486/2488/3907_barrio_cantalejo_ines_maria.pdf?sequence=1&isAllowed=y
+[70] [**^**](#ref-re) Szigriszt Pazos, F. (1993). *Sistemas predictivos de legibilidad del mensaje escrito: Formula de perspicuidad* [Doctoral dissertation, Complutense University of Madrid]. Biblos-e Archivo. https://repositorio.uam.es/bitstream/handle/10486/2488/3907_barrio_cantalejo_ines_maria.pdf?sequence=1&isAllowed=y
-[70] [**^**](#ref-lfmd)[**^**](#ref-md) Thanopoulos, A., Fakotakis, N., & Kokkinakis, G. (2002). Comparative evaluation of collocation extraction metrics. In M. G. González & C. P. S. Araujo (Eds.), *Proceedings of the Third International Conference on Language Resources and Evaluation* (pp. 620–625). European Language Resources Association.
+[71] [**^**](#ref-lfmd)[**^**](#ref-md) Thanopoulos, A., Fakotakis, N., & Kokkinakis, G. (2002). Comparative evaluation of collocation extraction metrics. In M. G. González & C. P. S. Araujo (Eds.), *Proceedings of the Third International Conference on Language Resources and Evaluation* (pp. 620–625). European Language Resources Association.
-[71] [**^**](#ref-log-likehood-ratio-test-bayes-factor)[**^**](#ref-students-t-test-2-sample-bayes-factor) Wilson, A. (2013). Embracing Bayes Factors for key item analysis in corpus linguistics. In M. Bieswanger & A. Koll-Stobbe (Eds.), *New Approaches to the Study of Linguistic Variability* (pp. 3–11). Peter Lang.
+[72] [**^**](#ref-log-likehood-ratio-test-bayes-factor)[**^**](#ref-students-t-test-2-sample-bayes-factor) Wilson, A. (2013). Embracing Bayes Factors for key item analysis in corpus linguistics. In M. Bieswanger & A. Koll-Stobbe (Eds.), *New Approaches to the Study of Linguistic Variability* (pp. 3–11). Peter Lang.
-[72] [**^**](#ref-zhangs-distributional-consistency) Zhang, H., Huang, C., & Yu, S. (2004). Distributional consistency: As a general method for defining a core lexicon. In M. T. Lino, M. F. Xavier, F. Ferreira, R. Costa, & R. Silva (Eds.), *Proceedings of Fourth International Conference on Language Resources and Evaluation* (pp. 1119–1122). European Language Resources Association.
+[73] [**^**](#ref-zhangs-distributional-consistency) Zhang, H., Huang, C., & Yu, S. (2004). Distributional consistency: As a general method for defining a core lexicon. In M. T. Lino, M. F. Xavier, F. Ferreira, R. Costa, & R. Silva (Eds.), *Proceedings of Fourth International Conference on Language Resources and Evaluation* (pp. 1119–1122). European Language Resources Association.
diff --git a/doc/measures/readability/colemans_readability_formula.svg b/doc/measures/readability/colemans_readability_formula.svg new file mode 100644 index 000000000..c843cf41f --- /dev/null +++ b/doc/measures/readability/colemans_readability_formula.svg @@ -0,0 +1,411 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/lensear_write.svg b/doc/measures/readability/lensear_write.svg index 7f4984fc3..94d9943f7 100644 --- a/doc/measures/readability/lensear_write.svg +++ b/doc/measures/readability/lensear_write.svg @@ -1,21 +1,18 @@ - + + - - - - @@ -41,32 +38,24 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/re_simplified.svg b/doc/measures/readability/re_simplified.svg index 48d45896f..fd060f7b7 100644 --- a/doc/measures/readability/re_simplified.svg +++ b/doc/measures/readability/re_simplified.svg @@ -1,6 +1,6 @@ - + @@ -15,17 +15,13 @@ - - - - @@ -55,66 +51,58 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/rgl.svg b/doc/measures/readability/rgl.svg index bc59748cb..4985d2a28 100644 --- a/doc/measures/readability/rgl.svg +++ b/doc/measures/readability/rgl.svg @@ -1,6 +1,6 @@ - + @@ -13,18 +13,13 @@ - + - - - - - @@ -55,17 +50,9 @@ - - - - - - - - - - - - + + + + \ No newline at end of file diff --git a/doc/measures/readability/smog_grade.svg b/doc/measures/readability/smog_grade.svg index 041e37857..5bd6b7977 100644 --- a/doc/measures/readability/smog_grade.svg +++ b/doc/measures/readability/smog_grade.svg @@ -1,6 +1,6 @@ - + @@ -15,13 +15,10 @@ + - - - - @@ -47,7 +44,7 @@ - + @@ -56,17 +53,14 @@ - - - - - - - - - - - - + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/wstf.svg b/doc/measures/readability/wstf.svg index 43a0b56f1..86e0c4ea3 100644 --- a/doc/measures/readability/wstf.svg +++ b/doc/measures/readability/wstf.svg @@ -1,6 +1,6 @@ - + @@ -31,13 +31,10 @@ - - - @@ -48,7 +45,7 @@ - + @@ -60,51 +57,48 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -129,38 +123,30 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/wl_tests_measures/test_measures_readability.py b/tests/wl_tests_measures/test_measures_readability.py index a1a0e34d8..68d644e65 100644 --- a/tests/wl_tests_measures/test_measures_readability.py +++ b/tests/wl_tests_measures/test_measures_readability.py @@ -21,8 +21,6 @@ from tests import wl_test_init from wordless.wl_measures import wl_measures_readability -main = wl_test_init.Wl_Test_Main() - class Wl_Test_Text(): def __init__(self, tokens_multilevel, lang = 'eng_us'): super().__init__() @@ -31,6 +29,9 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'): self.lang = lang 
self.tokens_multilevel = tokens_multilevel +main = wl_test_init.Wl_Test_Main() +settings = main.settings_custom['measures']['readability'] + TOKENS_MULTILEVEL_0 = [] TOKENS_MULTILEVEL_12 = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['This', 'is', 'a', 'sen-tence0', '.']]]] TOKENS_MULTILEVEL_12_PROPN = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['Louisiana', 'readability', 'boxes', 'created', '.']]]] @@ -150,6 +151,33 @@ def test_coleman_liau_index(): assert grade_level_eng_0 == 'text_too_short' assert grade_level_eng_12 == grade_level_spa_12 == -27.4004 * (est_cloze_pct / 100) + 23.06395 +def test_colemans_readability_formula(): + cloze_pct_eng_0 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_0) + settings['colemans_readability_formula']['variant'] = '1' + cloze_pct_eng_12_1 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12) + settings['colemans_readability_formula']['variant'] = '2' + cloze_pct_eng_12_2 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12) + settings['colemans_readability_formula']['variant'] = '3' + cloze_pct_eng_12_3 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12) + settings['colemans_readability_formula']['variant'] = '4' + cloze_pct_eng_12_4 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12) + cloze_pct_other_12 = wl_measures_readability.colemans_readability_formula(main, test_text_other_12) + + print("Coleman's Readability Formula:") + print(f'\teng/0: {cloze_pct_eng_0}') + print(f'\teng/12-1: {cloze_pct_eng_12_1}') + print(f'\teng/12-2: {cloze_pct_eng_12_2}') + print(f'\teng/12-3: {cloze_pct_eng_12_3}') + print(f'\teng/12-4: {cloze_pct_eng_12_4}') + print(f'\tother/12: {cloze_pct_other_12}') + + assert cloze_pct_eng_0 == 'text_too_short' + assert cloze_pct_eng_12_1 == 1.29 * (9 / 12 * 100) - 38.45 + assert 
cloze_pct_eng_12_2 == 1.16 * (9 / 12 * 100) + 1.48 * (3 / 12 * 100) - 37.95 + assert cloze_pct_eng_12_3 == 1.07 * (9 / 12 * 100) + 1.18 * (3 / 12 * 100) + 0.76 * (0 / 12 * 100) - 34.02 + assert cloze_pct_eng_12_4 == 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (0 / 12 * 100) - 0.36 * (0 / 12) - 26.01 + assert cloze_pct_other_12 == 'no_support' + def test_dale_chall_readability_score(): x_c50_eng_0 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_0) x_c50_eng_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_12) @@ -198,9 +226,9 @@ def test_flesch_reading_ease(): flesch_re_eng_0 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_0) flesch_re_eng_12 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_12) - main.settings_custom['measures']['readability']['re']['variant_nld'] = 'Douma' + settings['re']['variant_nld'] = 'Douma' flesch_re_nld_12_douma = wl_measures_readability.flesch_reading_ease(main, test_text_nld_12) - main.settings_custom['measures']['readability']['re']['variant_nld'] = "Brouwer's Leesindex A" + settings['re']['variant_nld'] = "Brouwer's Leesindex A" flesch_re_nld_12_brouwer = wl_measures_readability.flesch_reading_ease(main, test_text_nld_12) flesch_re_fra_12 = wl_measures_readability.flesch_reading_ease(main, test_text_fra_12) @@ -208,9 +236,9 @@ def test_flesch_reading_ease(): flesch_re_ita_12 = wl_measures_readability.flesch_reading_ease(main, test_text_ita_12) flesch_re_rus_12 = wl_measures_readability.flesch_reading_ease(main, test_text_rus_12) - main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Fernández Huerta' + settings['re']['variant_spa'] = 'Fernández Huerta' flesch_re_spa_12_fh = wl_measures_readability.flesch_reading_ease(main, test_text_spa_12) - main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Szigriszt Pazos' + settings['re']['variant_spa'] = 'Szigriszt Pazos' flesch_re_spa_12_sp = 
wl_measures_readability.flesch_reading_ease(main, test_text_spa_12) flesch_re_afr_12 = wl_measures_readability.flesch_reading_ease(main, test_text_afr_12) @@ -219,14 +247,14 @@ def test_flesch_reading_ease(): print('Flesch Reading Ease:') print(f'\teng/0: {flesch_re_eng_0}') print(f'\teng/12: {flesch_re_eng_12}') - print(f'\tnld-douma/12: {flesch_re_nld_12_douma}') - print(f'\tnld-brouwer/12: {flesch_re_nld_12_brouwer}') + print(f'\tnld/12-douma: {flesch_re_nld_12_douma}') + print(f'\tnld/12-brouwer: {flesch_re_nld_12_brouwer}') print(f'\tfra/12: {flesch_re_fra_12}') print(f'\tdeu/12: {flesch_re_deu_12}') print(f'\tita/12: {flesch_re_ita_12}') print(f'\trus/12: {flesch_re_rus_12}') - print(f'\tspa-fh/12: {flesch_re_spa_12_fh}') - print(f'\tspa-sp/12: {flesch_re_spa_12_sp}') + print(f'\tspa/12-fh: {flesch_re_spa_12_fh}') + print(f'\tspa/12-sp: {flesch_re_spa_12_sp}') print(f'\tafr/12: {flesch_re_afr_12}') print(f'\tother/12: {flesch_re_other_12}') @@ -453,18 +481,22 @@ def test_spache_grade_level(): def test_wiener_sachtextformel(): wstf_deu_0 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_0) - wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '1') - wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '2') - wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '3') - wstf_deu_12_4 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '4') + settings['wstf']['variant'] = '1' + wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12) + settings['wstf']['variant'] = '2' + wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12) + settings['wstf']['variant'] = '3' + wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12) + settings['wstf']['variant'] = '4' + wstf_deu_12_4 = 
wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12) wstf_eng_12 = wl_measures_readability.wiener_sachtextformel(main, test_text_eng_12) print('Wiener Sachtextformel:') print(f'\tdeu/0: {wstf_deu_0}') - print(f'\tdeu-1/12: {wstf_deu_12_1}') - print(f'\tdeu-2/12: {wstf_deu_12_2}') - print(f'\tdeu-3/12: {wstf_deu_12_3}') - print(f'\tdeu-4/12: {wstf_deu_12_4}') + print(f'\tdeu/12-1: {wstf_deu_12_1}') + print(f'\tdeu/12-2: {wstf_deu_12_2}') + print(f'\tdeu/12-3: {wstf_deu_12_3}') + print(f'\tdeu/12-4: {wstf_deu_12_4}') print(f'\teng/12: {wstf_eng_12}') ms = 0 / 12 @@ -485,6 +517,7 @@ def test_wiener_sachtextformel(): test_bormuths_cloze_mean() test_bormuths_gp() test_coleman_liau_index() + test_colemans_readability_formula() test_dale_chall_readability_score() test_devereux_readability_index() test_flesch_kincaid_grade_level() diff --git a/tests/wl_tests_work_area/test_profiler.py b/tests/wl_tests_work_area/test_profiler.py index 2a4cc0644..4cdbc1d29 100644 --- a/tests/wl_tests_work_area/test_profiler.py +++ b/tests/wl_tests_work_area/test_profiler.py @@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files): count_tokens_lens_syls.append(collections.Counter(len_tokens_syls)) count_tokens_lens_chars.append(collections.Counter(len_tokens_chars)) - assert len(readability_statistics) == 24 + assert len(readability_statistics) == 25 # Counts assert count_paras diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py index d61166722..8c7888562 100644 --- a/wordless/wl_measures/wl_measures_readability.py +++ b/wordless/wl_measures/wl_measures_readability.py @@ -129,9 +129,9 @@ def get_count_words_dale(words, num_easy_words): # Automated Arabic Readability Index # Reference: Al-Tamimi, A., Jaradat M., Aljarrah, N., & Ghanim, S. (2013). AARI: Automatic Arabic readability index. The International Arab Journal of Information Technology, 11(4), pp. 370–378. 
def automated_ara_readability_index(main, text): - text = get_counts(main, text) - if text.lang == 'ara': + text = get_counts(main, text) + if text.count_words and text.count_sentences: aari = ( 3.28 * text.count_chars_alphanumeric @@ -164,9 +164,9 @@ def automated_readability_index(main, text): # Bormuth's Cloze Mean & Grade Placement # Reference: Bormuth, J. R. (1969). Development of readability analyses. U.S. Department of Health, Education, and Welfare. http://files.eric.ed.gov/fulltext/ED029166.pdf def bormuths_cloze_mean(main, text): - text = get_counts(main, text) - if text.lang.startswith('eng_'): + text = get_counts(main, text) + if text.count_sentences and text.count_words: ddl = get_count_words_dale(text.words_flat, 3000) m = ( @@ -219,12 +219,62 @@ def coleman_liau_index(main, text): return grade_level +# Coleman's Readability Formula +# Reference: Liau, T. L., Bassin, C. B., Martin, C. J., & Coleman, E. B. (1976). Modification of the Coleman readability formulas. Journal of Reading Behavior, 8(4), 381–386. 
https://journals.sagepub.com/doi/pdf/10.1080/10862967609547193 +def colemans_readability_formula(main, text): + if text.lang.startswith('eng_'): + text = get_counts(main, text) + + if text.count_words: + variant = main.settings_custom['measures']['readability']['colemans_readability_formula']['variant'] + count_words_1_syl = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1) + + if variant in ['3', '4']: + pos_tags = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = text.lang, tagset = 'universal') + count_prons = sum((1 for _, pos in pos_tags if pos == 'PRON')) + + if variant == '4': + count_preps = sum((1 for _, pos in pos_tags if pos == 'ADP')) + + if variant == '1': + cloze_pct = ( + 1.29 * (count_words_1_syl / text.count_words * 100) - + 38.45 + ) + elif variant == '2': + cloze_pct = ( + 1.16 * (count_words_1_syl / text.count_words * 100) + + 1.48 * (text.count_sentences / text.count_words * 100) - + 37.95 + ) + elif variant == '3': + cloze_pct = ( + 1.07 * (count_words_1_syl / text.count_words * 100) + + 1.18 * (text.count_sentences / text.count_words * 100) + + 0.76 * (count_prons / text.count_words * 100) - + 34.02 + ) + elif variant == '4': + cloze_pct = ( + 1.04 * (count_words_1_syl / text.count_words * 100) + + 1.06 * (text.count_sentences / text.count_words * 100) + + 0.56 * (count_prons / text.count_words * 100) - + 0.36 * (count_preps / text.count_words) - + 26.01 + ) + else: + cloze_pct = 'text_too_short' + else: + cloze_pct = 'no_support' + + return cloze_pct + # Dale-Chall Readability Score # References: # Dale, E., & Chall, J. S. (1948a). A formula for predicting readability. Educational Research Bulletin, 27(1), 11–20, 28. # Dale, E., & Chall, J. S. (1948b). A formula for predicting readability: Instructions. Educational Research Bulletin, 27(2), 37–54. 
def dale_chall_readability_score(main, text): - if text.lang.startswith('eng'): + if text.lang.startswith('eng_'): text = get_counts(main, text) if text.count_words and text.count_sentences: @@ -374,10 +424,10 @@ def flesch_reading_ease_simplified(main, text): text = get_counts(main, text) if text.count_words and text.count_sentences: - count_words_monosyllabic = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1) + count_words_1_syl = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1) flesch_re_simplified = ( - 1.599 * (count_words_monosyllabic / text.count_words * 100) + 1.599 * (count_words_1_syl / text.count_words * 100) - 1.015 * (text.count_words / text.count_sentences) - 31.517 ) @@ -398,8 +448,8 @@ def forcast_grade_level(main, text): sample_start = random.randint(0, text.count_words - 150) sample = text.syls_words[sample_start : sample_start + 150] - count_words_monosyllabic = get_count_words_syls(sample, len_min = 1, len_max = 1) - rgl = 20.43 - 0.11 * count_words_monosyllabic + count_words_1_syl = get_count_words_syls(sample, len_min = 1, len_max = 1) + rgl = 20.43 - 0.11 * count_words_1_syl else: rgl = 'text_too_short' else: @@ -452,7 +502,7 @@ def formula_de_crawford(main, text): # Lucisano, P., & Emanuela Piemontese, M. (1988). GULPEASE: A formula for the prediction of the difficulty of texts in Italian. Scuola e Città, 39(3), pp. 110–124. # Indice Gulpease. (2021, July 9). In Wikipedia.https://it.wikipedia.org/w/index.php?title=Indice_Gulpease&oldid=121763335. def gulpease_index(main, text): - if text.lang.startswith('ita'): + if text.lang == 'ita': text = get_counts(main, text) if text.count_words: @@ -470,13 +520,13 @@ def gulpease_index(main, text): # Polish variant: # Pisarek, W. (1969). Jak mierzyć zrozumiałość tekstu?. Zeszyty Prasoznawcze, 4(42), 35–48. 
def gunning_fog_index(main, text): - if text.lang.startswith('eng') or text.lang == 'pol' and text.lang in main.settings_global['syl_tokenizers']: + if text.lang.startswith('eng_') or text.lang == 'pol' and text.lang in main.settings_global['syl_tokenizers']: text = get_counts(main, text) if text.count_sentences and text.count_words: count_hard_words = 0 - if text.lang.startswith('eng'): + if text.lang.startswith('eng_'): words_tagged = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = text.lang, tagset = 'universal') for syls, (word, tag) in zip(text.syls_words, words_tagged): @@ -529,7 +579,7 @@ def legibility_mu(main, text): # Lensear Write # Reference: O’Hayre, J. (1966). Gobbledygook has gotta go. U.S. Government Printing Office. https://www.governmentattic.org/15docs/Gobbledygook_Has_Gotta_Go_1966.pdf def lensear_write(main, text): - if text.lang.startswith('eng') and text.lang in main.settings_global['syl_tokenizers']: + if text.lang.startswith('eng_') and text.lang in main.settings_global['syl_tokenizers']: text = get_counts(main, text) if text.count_words > 0: @@ -589,7 +639,7 @@ def lix(main, text): # McAlpine EFLAW Readability Score # Reference: Nirmaldasan. (2009, April 30). McAlpine EFLAW readability score. Readability Monitor. 
Retrieved November 15, 2022, from https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/ def mcalpine_eflaw(main, text): - if text.lang.startswith('eng'): + if text.lang.startswith('eng_'): text = get_counts(main, text) if text.count_sentences: @@ -703,14 +753,14 @@ def smog_grade(main, text): ) # Calculate the number of words with 3 or more syllables - count_words_polysyllabic = 0 + count_words_3_plus_syls = 0 for sentence in samples: syls_words = wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang) - count_words_polysyllabic += get_count_words_syls(syls_words, len_min = 3) + count_words_3_plus_syls += get_count_words_syls(syls_words, len_min = 3) - g = 3.1291 + 1.043 * (count_words_polysyllabic ** 0.5) + g = 3.1291 + 1.043 * (count_words_3_plus_syls ** 0.5) else: g = 'text_too_short' else: @@ -723,7 +773,7 @@ def smog_grade(main, text): # Dale, E. (1931). A comparison of two word lists. Educational Research Bulletin, 10(18), 484–489. # Spache, G. (1953). A new readability formula for primary-grade reading materials. Elementary School Journal, 53(7), 410–413. https://doi.org/10.1086/458513 def spache_grade_level(main, text): - if text.lang.startswith('eng'): + if text.lang.startswith('eng_'): text = get_counts(main, text) if text.count_words >= 100: @@ -771,14 +821,12 @@ def spache_grade_level(main, text): # References: # Bamberger, R., & Vanecek, E. (1984). Lesen – Verstehen – Lernen – Schreiben. Jugend und Volk. # Lesbarkeitsindex. (2022, July 21). In Wikipedia. 
https://de.wikipedia.org/w/index.php?title=Lesbarkeitsindex&oldid=224664667 -def wiener_sachtextformel(main, text, variant = None): - if text.lang.startswith('deu') and text.lang in main.settings_global['syl_tokenizers']: +def wiener_sachtextformel(main, text): + if text.lang.startswith('deu_') and text.lang in main.settings_global['syl_tokenizers']: text = get_counts(main, text) if text.count_words and text.count_sentences: - if not variant: - variant = main.settings_custom['measures']['readability']['wstf']['variant'] - + variant = main.settings_custom['measures']['readability']['wstf']['variant'] ms = get_count_words_syls(text.syls_words, len_min = 3) / text.count_words sl = text.count_words / text.count_sentences iw = get_count_words_letters(text.words_flat, len_min = 7) / text.count_words diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py index 2b23f8fe5..23fc0543a 100644 --- a/wordless/wl_profiler.py +++ b/wordless/wl_profiler.py @@ -362,6 +362,7 @@ def __init__(self, parent): _tr('wl_profiler', "Bormuth's Cloze Mean"), _tr('wl_profiler', "Bormuth's Grade Placement"), _tr('wl_profiler', 'Coleman-Liau Index'), + _tr('wl_profiler', "Coleman's Readability Formula"), _tr('wl_profiler', 'Dale-Chall Readability Score'), _tr('wl_profiler', 'Devereaux Readability Index'), _tr('wl_profiler', 'Flesch-Kincaid Grade Level'), @@ -1178,6 +1179,7 @@ def run(self): wl_measures_readability.bormuths_cloze_mean(self.main, text), wl_measures_readability.bormuths_gp(self.main, text), wl_measures_readability.coleman_liau_index(self.main, text), + wl_measures_readability.colemans_readability_formula(self.main, text), wl_measures_readability.dale_chall_readability_score(self.main, text), wl_measures_readability.devereux_readability_index(self.main, text), wl_measures_readability.flesch_kincaid_grade_level(self.main, text), diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index aacbd32de..f4a89f0b8 100644 --- 
a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -1745,6 +1745,10 @@ def init_settings_default(main): 'cloze_criterion_score': 35 }, + 'colemans_readability_formula': { + 'variant': '2' + }, + 're': { 'variant_nld': 'Douma', 'variant_spa': 'Fernández Huerta' diff --git a/wordless/wl_settings/wl_settings_measures.py b/wordless/wl_settings/wl_settings_measures.py index 1749c7638..36d6fdb03 100644 --- a/wordless/wl_settings/wl_settings_measures.py +++ b/wordless/wl_settings/wl_settings_measures.py @@ -46,6 +46,19 @@ def __init__(self, main): self.group_box_bormuths_gp.layout().setColumnStretch(2, 1) + # Coleman's Readability Formula + self.group_box_colemans_readability_formula = QGroupBox(self.tr("Coleman's Readability Formula"), self) + + self.label_colemans_readability_formula_variant = QLabel(self.tr('Variant:'), self) + self.combo_box_colemans_readability_formula_variant = wl_boxes.Wl_Combo_Box(self) + + self.combo_box_colemans_readability_formula_variant.addItems(['1', '2', '3', '4']) + + self.group_box_colemans_readability_formula.setLayout(wl_layouts.Wl_Layout()) + self.group_box_colemans_readability_formula.layout().addWidget(self.label_colemans_readability_formula_variant, 0, 0) + self.group_box_colemans_readability_formula.layout().addWidget(self.combo_box_colemans_readability_formula_variant, 0, 1) + self.group_box_colemans_readability_formula.layout().setColumnStretch(2, 1) + # Flesch Reading Ease self.group_box_re = QGroupBox(self.tr('Flesch Reading Ease'), self) @@ -87,11 +100,12 @@ def __init__(self, main): self.setLayout(wl_layouts.Wl_Layout()) self.layout().addWidget(self.group_box_bormuths_gp, 0, 0) - self.layout().addWidget(self.group_box_re, 1, 0) - self.layout().addWidget(self.group_box_wstf, 2, 0) + self.layout().addWidget(self.group_box_colemans_readability_formula, 1, 0) + self.layout().addWidget(self.group_box_re, 2, 0) + self.layout().addWidget(self.group_box_wstf, 3, 0) 
self.layout().setContentsMargins(6, 4, 6, 4) - self.layout().setRowStretch(3, 1) + self.layout().setRowStretch(4, 1) def load_settings(self, defaults = False): if defaults: @@ -102,6 +116,9 @@ def load_settings(self, defaults = False): # Bormuth's Grade Placement self.spin_box_cloze_criterion_score.setValue(settings['bormuths_gp']['cloze_criterion_score']) + # Coleman's Readability Formula + self.combo_box_colemans_readability_formula_variant.setCurrentText(settings['colemans_readability_formula']['variant']) + # Flesch Reading Ease self.combo_box_re_variant_nld.setCurrentText(settings['re']['variant_nld']) self.combo_box_re_variant_spa.setCurrentText(settings['re']['variant_spa']) @@ -113,6 +130,9 @@ def apply_settings(self): # Bormuth's Grade Placement self.settings_custom['bormuths_gp']['cloze_criterion_score'] = self.spin_box_cloze_criterion_score.value() + # Coleman's Readability Formula + self.settings_custom['colemans_readability_formula']['variant'] = self.combo_box_colemans_readability_formula_variant.currentText() + # Flesch Reading Ease self.settings_custom['re']['variant_nld'] = self.combo_box_re_variant_nld.currentText() self.settings_custom['re']['variant_spa'] = self.combo_box_re_variant_spa.currentText()