From 99b596c66af2ac432a519bb3e7d1ef9d172a1408 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Mon, 24 Jul 2023 21:45:20 +0800 Subject: [PATCH] =?UTF-8?q?Settings:=20Add=20Settings=20-=20Measures=20-?= =?UTF-8?q?=20Readability=20-=20Flesch=20Reading=20Ease;=20Work=20Area:=20?= =?UTF-8?q?Remove=20Profiler=20-=20Fern=C3=A1ndez=20Huerta's=20Readability?= =?UTF-8?q?=20Score=20/=20Szigriszt's=20Perspicuity=20Index?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 + doc/doc_eng.md | 95 +-- .../fernandez_huertas_readability_score.svg | 91 --- doc/measures/readability/re.svg | 740 +++++++++--------- doc/measures/readability/wstf.svg | 470 +++++------ .../test_measures_readability.py | 249 +++--- tests/wl_tests_work_area/test_profiler.py | 2 +- .../wl_measures/wl_measures_readability.py | 105 +-- wordless/wl_profiler.py | 4 - wordless/wl_settings/wl_settings_default.py | 4 + wordless/wl_settings/wl_settings_measures.py | 42 +- 11 files changed, 852 insertions(+), 952 deletions(-) delete mode 100644 doc/measures/readability/fernandez_huertas_readability_score.svg diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b1da0323..f9824162f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ ## [3.3.0](https://github.com/BLKSerene/Wordless/releases/tag/3.3.0) - ??/??/2023 ### 🎉 New Features +- Settings: Add Settings - Measures - Readability - Flesch Reading Ease - Utils: Add khmer-nltk's Khmer sentence tokenizer, word tokenizer, and part-of-speech tagger - Utils: Add PyThaiNLP's perceptron part-of-speech tagger (Blackboard) - Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser @@ -35,6 +36,7 @@ ### ❌ Removals - Utils: Remove PyThaiNLP's perceptron part-of-speech tagger (LST20) +- Work Area: Remove Profiler - Fernández Huerta's Readability Score / Szigriszt's Perspicuity Index ### ⏫ Dependency Changes - Dependencies: Add khmer-nltk diff --git 
a/doc/doc_eng.md b/doc/doc_eng.md index ae6ce9881..1865a327d 100644 --- a/doc/doc_eng.md +++ b/doc/doc_eng.md @@ -930,20 +930,20 @@ Dale-Chall Readability Score: Devereux Readability Index: Grade \; Placement = 1.56 \times \frac{NumCharsAll}{NumWords} + 0.19 \times \frac{NumWords}{NumSentences} - 6.49 -Fernández Huerta's Readability Score: - Score = 206.84 - 60 \times \frac{NumSyls}{NumWords} - 102 \times \frac{NumSentences}{NumWords} - Flesch-Kincaid Grade Level: GL = 0.39 \times \frac{NumWords}{NumSentences} + 11.8 \times \frac{NumSyls}{NumWords} - 15.59 Flesch Reading Ease: \begin{align*} - RE &= 206.835 - 0.846 \times \left(\frac{NumSyls}{NumWords} \times 100\right) - 1.015 \times \frac{NumWords}{NumSentences} \\ - RE_{Dutch} &= 206.84 - 77 \times \frac{NumSyls}{NumWords} - 0.93 \times \frac{NumWords}{NumSentences} \\ - RE_{French} &= 207 - 73.6 \times \frac{NumSyls}{NumWords} - 1.015 \times \frac{NumWords}{NumSentences} \\ - RE_{German} &= 180 - 58.5 \times \frac{NumSyls}{NumWords} - \frac{NumWords}{NumSentences} \\ - RE_{Italian} &= 217 - 60 \times \frac{NumSyls}{NumWords} - 1.3 \times \frac{NumWords}{NumSentences} \\ - RE_{Russian} &= 206.835 - 60.1 \times \frac{NumSyls}{NumWords} - 1.3 \times \frac{NumWords}{NumSentences} + ASW &= \frac{NumSyls}{NumWords} \qquad ASL = \frac{NumWords}{NumSentences} \\ + RE &= 206.835 - 0.846 \times \left(ASW \times 100\right) - 1.015 \times ASL \\ + RE_{Dutch} &= 206.84 - 77 \times ASW - 0.93 \times ASL \\ + RE_{French} &= 207 - 73.6 \times ASW - 1.015 \times ASL \\ + RE_{German} &= 180 - 58.5 \times ASW - ASL \\ + RE_{Italian} &= 217 - 60 \times ASW - 1.3 \times ASL \\ + RE_{Russian} &= 206.835 - 60.1 \times ASW - 1.3 \times ASL \\ + RE_{Spanish-Fernández \; Huerta} &= 206.84 - 60 \times ASW - 1.02 \times ASL \\ + RE_{Spanish-Szigriszt \; Pazos} &= 207 - 62.3 \times ASW - ASL \end{align*} Flesch Reading Ease (Simplified): @@ -988,15 +988,10 @@ SMOG Grade: Spache Grade Level: {Grade \; Level = 0.141 \times 
\frac{100}{NumSentences} + 0.086 \times \left(\frac{NumDifficultWords}{100} \times 100\right) + 0.839} -Szigriszt's Perspicuity Index: - p = 207 - 62.3 \times \frac{NumSyls}{NumWords} - \frac{NumWords}{NumSentences} - Wiener Sachtextformel: \begin{align*} - MS &= \frac{NumWordsPolysyllabic}{NumWords} \\ - SL &= \frac{NumWords}{NumSentences} \\ - IW &= \frac{NumLongWords}{NumWords} \\ - ES &= \frac{NumWordsMonosyllabic}{NumWords} \\ + MS &= \frac{NumWordsPolysyllabic}{NumWords} \qquad SL = \frac{NumWords}{NumSentences} \\ + IW &= \frac{NumLongWords}{NumWords} \qquad \qquad \quad ES = \frac{NumWordsMonosyllabic}{NumWords} \\ WSTF_1 &= 0.1925 \times MS + 0.1672 \times SL + 0.1297 \times IW - 0.0327 \times ES - 0.875 \\ WSTF_2 &= 0.2007 \times MS + 0.1682 \times SL + 0.1373 \times IW - 2.779 \\ WSTF_3 &= 0.2963 \times MS + 0.1905 \times SL - 1.1144 \\ @@ -1011,15 +1006,14 @@ Measure of Readability|Formula Coleman-Liau Index
([Coleman & Liau, 1975](#ref-coleman-liau-1975))|![Formula](/doc/measures/readability/coleman_liau_index.svg) Dale-Chall Readability Score
([Dale & Chall, 1948a](#ref-dale-chall-1948a))|![Formula](/doc/measures/readability/xc50.svg)
where **NumDifficultWords** is the number of words outside the Dale list of 3000 easy words ([Dale & Chall, 1948b](#ref-dale-chall-1948b)).

* This test applies only to **English texts**. Devereux Readability Index
([Smith, 1961](#ref-smith-1961))|![Formula](/doc/measures/readability/devereux_readability_index.svg) -Fernández Huerta's Readability Score1
([Fernández Huerta, 1959](#ref-fernandez-huerta-1959); [Law, 2011](#ref-law-2011))|![Formula](/doc/measures/readability/fernandez_huertas_readability_score.svg)

* This test applies only to **Spanish texts**. Flesch-Kincaid Grade Level1
([Kincaid et al., 1975](#ref-kincaid-et-al-1975))|![Formula](/doc/measures/readability/flesch_kincaid_grade_level.svg) -Flesch Reading Ease1
(English: [Flesch, 1948](#ref-flesch-1948); Dutch: [Douma, 1960](#ref-douma-1960); French: [Kandel & Moles, 1958](#ref-kandel-moles-1958); German: [Amstad, 1978](#ref-amstad-1978); Italian: [Franchina & Vacca, 1986](#ref-franchina-vacca-1986); Russian: [Oborneva, 2006](#ref-oborneva-2006))|![Formula](/doc/measures/readability/re.svg) +Flesch Reading Ease1
([Flesch, 1948](#ref-flesch-1948)
Dutch: [Douma, 1960](#ref-douma-1960)
French: [Kandel & Moles, 1958](#ref-kandel-moles-1958)
German: [Amstad, 1978](#ref-amstad-1978)
Italian: [Franchina & Vacca, 1986](#ref-franchina-vacca-1986)
Russian: [Oborneva, 2006](#ref-oborneva-2006)
Spanish: [Fernández Huerta, 1959](#ref-fernandez-huerta-1959); [Szigriszt Pazos, 1993](#ref-szigrisze-pazos-1993))|![Formula](/doc/measures/readability/re.svg)

* This test has multiple variants for some languages, which you could select via **Menu → Preferences → Settings → Measures → Readability → Flesch Reading Ease**. Flesch Reading Ease (Simplified)1
([Farr et al., 1951](#ref-farr-et-al-1951))|![Formula](/doc/measures/readability/re_simplified.svg) -FORCAST Grade Level1
([Caylor et al., 1973](#ref-caylor-et-al-1973))|![Formula](/doc/measures/readability/rgl.svg)
* A sample of 150 words is taken randomly from the text, thus the text should be **at least 150 words long**. +FORCAST Grade Level1
([Caylor et al., 1973](#ref-caylor-et-al-1973))|![Formula](/doc/measures/readability/rgl.svg)

* A sample of 150 words is taken randomly from the text, thus the text should be **at least 150 words long**. Fórmula de comprensibilidad de Gutiérrez de Polini
([Gutiérrez de Polini, 1972](#ref-gutierrez-de-polini-1972))|![Formula](/doc/measures/readability/cp.svg)

* This test applies only to **Spanish texts**. Fórmula de Crawford1
([Crawford, 1985](#ref-crawford-1985))|![Formula](/doc/measures/readability/formula_de_crawford.svg)

* This test applies only to **Spanish texts**. Gulpease Index
([Lucisano & Emanuela Piemontese, 1988](#ref-lucisano-emanuela-piemontese-1988))|![Formula](/doc/measures/readability/gulpease_index.svg)

* This test applies only to **Italian texts**. -Gunning Fog Index1
(English: [Gunning, 1968](#ref-gunning-1968); Polish: [Pisarek, 1969](#ref-pisarek-1969))|![Formula](/doc/measures/readability/fog_index.svg)
where **NumHardWords** is the number of words with 3 or more syllables excluding all proper nouns and words with 3 syllables ending with *-ed* or *-es* for **English texts**, and the number of words with 4 or more syllables for **Polish texts**.

* This test applies only to **English texts** and **Polish texts**. +Gunning Fog Index1
(English: [Gunning, 1968](#ref-gunning-1968)
Polish: [Pisarek, 1969](#ref-pisarek-1969))|![Formula](/doc/measures/readability/fog_index.svg)
where **NumHardWords** is the number of words with 3 or more syllables excluding all proper nouns and words with 3 syllables ending with *-ed* or *-es* for **English texts**, and the number of words with 4 or more syllables for **Polish texts**.

* This test applies only to **English texts** and **Polish texts**. Legibilidad µ
([Muñoz Baquedano, 2006](#ref-munoz-baquedano-2006))|![Formula](/doc/measures/readability/mu.svg)
where **LenWordsAvg** is the average word length in letters, and **LenWordsVar** is the variance of word lengths in letters.

* This test applies only to **Spanish texts**.
* The text should be **at least 2 words long**. Lensear Write1
([O’Hayre, 1966](#ref-o-hayre-1966))|![Formula](/doc/measures/readability/lensear_write.svg)
where **NumWordsMonosyllabic** is the number of monosyllabic words excluding *the*, *is*, *are*, *was*, *were*, and **NumSentences** is the number of sentences to the nearest period.

* This test applies only to **English texts**.
* A sample of 100 words is taken randomly from the text.
If the text is **shorter than 100 words**, **NumWordsMonosyllabic** and **NumSentences** need to be multiplied by 100 and then divided by the number of words in the text. Lix
([Björnsson, 1968](#ref-bjornsson-1968))|![Formula](/doc/measures/readability/lix.svg)
where **NumLongWords** is the number of words with 7 or more letters. @@ -1028,7 +1022,6 @@ Measure of Readability|Formula Rix
([Anderson, 1983](#ref-anderson-1983))|![Formula](/doc/measures/readability/rix.svg)
where **NumLongWords** is the number of words with 7 or more letters. SMOG Grade1
([McLaughlin, 1969](#ref-mclaughlin-1969))|![Formula](/doc/measures/readability/smog_grade.svg)
where **NumWordsPolysyllabic** is the number of words with 3 or more syllables.

* A sample consisting of the first 10 sentences of the text, the last 10 sentences of the text, and 10 sentences at the middle of the text is taken from the text, thus the text should be **at least 30 sentences long**. Spache Grade Level
([Spache, 1953](#ref-spache-1953))|![Formula](/doc/measures/readability/spache_grade_level.svg)
where **NumDifficultWords** is the number of words outside the Dale list of 769 easy words ([Dale, 1931](#ref-dale-1931)).

* Three samples each of 100 words are taken randomly from the text and the mean of the three scores is calculated, thus the text should be **at least 100 words long**. -Szigriszt's Perspicuity Index1
([Szigriszt Pazos, 1993](#ref-szigrisze-pazos-1993))|![Formula](/doc/measures/readability/szigriszts_perspicuity_index.svg)

* This test applies only to **Spanish texts**. Wiener Sachtextformel1
([Bamberger & Vanecek, 1984](#ref-bamberger-vanecek-1984))|![Formula](/doc/measures/readability/wstf.svg)
where **NumWordsPolysyllabic** is the number of words with 3 or more syllables and **NumLongWords** is the number of words with 7 or more letters.

* This test applies only to **German texts**.
* This test has 4 variants, which you could select via **Menu → Preferences → Settings → Measures → Readability → Wiener Sachtextformel → Variant**. **Notes:** @@ -1358,7 +1351,6 @@ Cubic Association Ratio
([Daille, 1994](#ref-daille-1994), [1995](#ref-daille ## [5 References](#doc) - [1] [**^**](#ref-aari) Al-Tamimi, A., Jaradat M., Aljarrah, N., & Ghanim, S. (2013). AARI: Automatic Arabic readability index. *The International Arab Journal of Information Technology*, *11*(4), pp. 370–378.
@@ -1412,7 +1404,7 @@ Cubic Association Ratio
([Daille, 1994](#ref-daille-1994), [1995](#ref-daille [26] [**^**](#ref-re-simplified) Farr, J. N., Jenkins, J. J., & Paterson, D. G. (1951). Simplification of Flesch reading ease formula. *Journal of Applied Psychology*, *35*(5), 333–337. https://doi.org/10.1037/h0062427
-[27] [**^**](#ref-fernandez-huertas-readability-score) Fernández Huerta, J. (1959). Medidas sencillas de lecturabilidad. *Consigna*, *214*, 29–32.
+[27] [**^**](#ref-re) Fernández Huerta, J. (1959). Medidas sencillas de lecturabilidad. *Consigna*, *214*, 29–32.
[28] [**^**](#ref-re) Flesch, R. (1948). A new readability yardstick. *Journal of Applied Psychology*, *32*(3), 221–233. https://doi.org/10.1037/h0057532
@@ -1435,7 +1427,6 @@ Cubic Association Ratio
([Daille, 1994](#ref-daille-1994), [1995](#ref-daille [37] [**^**](#ref-juillands-d)[**^**](#ref-juillands-u) Juilland, A., & Chang-Rodriguez, E. (1964). *Frequency dictionary of Spanish words*. Mouton.
[38] [**^**](#ref-re) Kandel, L., & Moles, A. (1958). Application de l’indice de Flesch à la langue française [Applying the Flesch index to the French language]. *The Journal of Educational Research*, *21*, 283–287.
- [39] [**^**](#ref-mann-whiteney-u-test) Kilgarriff, A. (2001). Comparing corpora. *International Journal of Corpus Linguistics*, *6*(1), 232–263. https://doi.org/10.1075/ijcl.6.1.05kil
@@ -1446,59 +1437,57 @@ Cubic Association Ratio
([Daille, 1994](#ref-daille-1994), [1995](#ref-daille [42] [**^**](#ref-flesch-kincaid-grade-level) Kincaid, J. P., Fishburne, R. P., Rogers, R. L., & Chissom, B. S. (1975). *Derivation of new readability formulas (automated readability index, fog count, and Flesch reading ease formula) for navy enlisted personnel*. Naval Air Station Memphis.
[43] [**^**](#ref-kromers-ur) Kromer, V. (2003). A usage measure based on psychophysical relations. *Journal of Quantitative Linguistics*, *10*(2), 177–186. https://doi.org/10.1076/jqul.10.2.177.16718
- -[44] [**^**](#ref-fernandez-huertas-readability-score) Law, Gwillim. (2011, May 27). *Error in the Fernandez Huerta readability formula*. LINGUIST List. https://linguistlist.org/issues/22/22-2332/
-[45] [**^**](#ref-mi-log-f) Lexical Computing. (2015, July 8). *Statistics used in Sketch Engine*. Sketch Engine. https://www.sketchengine.eu/documentation/statistics-used-in-sketch-engine/
+[44] [**^**](#ref-mi-log-f) Lexical Computing. (2015, July 8). *Statistics used in Sketch Engine*. Sketch Engine. https://www.sketchengine.eu/documentation/statistics-used-in-sketch-engine/
-[46] [**^**](#ref-griess-dp-norm) Lijffijt, J., & Gries, S. T. (2012). Correction to Stefan Th. Gries’ “dispersions and adjusted frequencies in corpora”. *International Journal of Corpus Linguistics*, *17*(1), 147–149. https://doi.org/10.1075/ijcl.17.1.08lij
+[45] [**^**](#ref-griess-dp-norm) Lijffijt, J., & Gries, S. T. (2012). Correction to Stefan Th. Gries’ “dispersions and adjusted frequencies in corpora”. *International Journal of Corpus Linguistics*, *17*(1), 147–149. https://doi.org/10.1075/ijcl.17.1.08lij
-[47] [**^**](#ref-gulpease-index) Lucisano, P., & Emanuela Piemontese, M. (1988). GULPEASE: A formula for the prediction of the difficulty of texts in Italian. *Scuola e Città*, *39*(3), pp. 110–124.
+[46] [**^**](#ref-gulpease-index) Lucisano, P., & Emanuela Piemontese, M. (1988). GULPEASE: A formula for the prediction of the difficulty of texts in Italian. *Scuola e Città*, *39*(3), pp. 110–124.
-[48] [**^**](#ref-lynes-d3) Lyne, A. A. (1985). Dispersion. In *The vocabulary of French business correspondence: Word frequencies, collocations, and problems of lexicometric method* (pp. 101–124). Slatkine/Champion.
+[47] [**^**](#ref-lynes-d3) Lyne, A. A. (1985). Dispersion. In *The vocabulary of French business correspondence: Word frequencies, collocations, and problems of lexicometric method* (pp. 101–124). Slatkine/Champion.
-[49] [**^**](#ref-smog-grade) McLaughlin, G. H. (1969). SMOG grading: A new readability formula. *Journal of Reading*, *12*(8), pp. 639–646.
+[48] [**^**](#ref-smog-grade) McLaughlin, G. H. (1969). SMOG grading: A new readability formula. *Journal of Reading*, *12*(8), pp. 639–646.
-[50] [**^**](#ref-legibilidad-mu) Muñoz Baquedano, M. (2006). Legibilidad y variabilidad de los textos. *Boletín de Investigación Educacional, Pontificia Universidad Católica de Chile*, *21*(2), 13–26.
+[49] [**^**](#ref-legibilidad-mu) Muñoz Baquedano, M. (2006). Legibilidad y variabilidad de los textos. *Boletín de Investigación Educacional, Pontificia Universidad Católica de Chile*, *21*(2), 13–26.
-[51] [**^**](#ref-eflaw) Nirmaldasan. (2009, April 30). *McAlpine EFLAW readability score*. Readability Monitor. Retrieved November 15, 2022, from https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
+[50] [**^**](#ref-eflaw) Nirmaldasan. (2009, April 30). *McAlpine EFLAW readability score*. Readability Monitor. Retrieved November 15, 2022, from https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
-[52] [**^**](#ref-pearsons-chi-squared-test) Oakes, M. P. (1998). *Statistics for Corpus Linguistics*. Edinburgh University Press.
+[51] [**^**](#ref-pearsons-chi-squared-test) Oakes, M. P. (1998). *Statistics for Corpus Linguistics*. Edinburgh University Press.
-[53] [**^**](#ref-re) Oborneva, I. V. (2006). *Автоматизированная оценка сложности учебных текстов на основе статистических параметров* [Doctoral dissertation, Institute for Strategy of Education Development of the Russian Academy of Education]. Freereferats.ru. https://static.freereferats.ru/_avtoreferats/01002881899.pdf?ver=3 +[52] [**^**](#ref-re) Oborneva, I. V. (2006). *Автоматизированная оценка сложности учебных текстов на основе статистических параметров* [Doctoral dissertation, Institute for Strategy of Education Development of the Russian Academy of Education]. Freereferats.ru. https://static.freereferats.ru/_avtoreferats/01002881899.pdf?ver=3 -[54] [**^**](#ref-lensear-write) O’Hayre, J. (1966). *Gobbledygook has gotta go*. U.S. Government Printing Office. https://www.governmentattic.org/15docs/Gobbledygook_Has_Gotta_Go_1966.pdf
+[53] [**^**](#ref-lensear-write) O’Hayre, J. (1966). *Gobbledygook has gotta go*. U.S. Government Printing Office. https://www.governmentattic.org/15docs/Gobbledygook_Has_Gotta_Go_1966.pdf
-[55] [**^**](#ref-students-t-test-2-sample) Paquot, M., & Bestgen, Y. (2009). Distinctive words in academic writing: A comparison of three statistical tests for keyword extraction. *Language and Computers*, *68*, 247–269.
+[54] [**^**](#ref-students-t-test-2-sample) Paquot, M., & Bestgen, Y. (2009). Distinctive words in academic writing: A comparison of three statistical tests for keyword extraction. *Language and Computers*, *68*, 247–269.
-[56] [**^**](#ref-fishers-exact-test) Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), *Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference* (pp. 188–200). The South–Central Regional SAS Users' Group.
+[55] [**^**](#ref-fishers-exact-test) Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), *Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference* (pp. 188–200). The South–Central Regional SAS Users' Group.
-[57] [**^**](#ref-min-sensitivity) Pedersen, T. (1998). Dependent bigram identification. In *Proceedings of the Fifteenth National Conference on Artificial Intelligence* (p. 1197). AAAI Press.
+[56] [**^**](#ref-min-sensitivity) Pedersen, T. (1998). Dependent bigram identification. In *Proceedings of the Fifteenth National Conference on Artificial Intelligence* (p. 1197). AAAI Press.
-[58] [**^**](#ref-fog-index) Pisarek, W. (1969). Jak mierzyć zrozumiałość tekstu?. *Zeszyty Prasoznawcze*, *4*(42), 35–48.
+[57] [**^**](#ref-fog-index) Pisarek, W. (1969). Jak mierzyć zrozumiałość tekstu? *Zeszyty Prasoznawcze*, *4*(42), 35–48.
-[59] [**^**](#ref-odds-ratio) Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. *Corpus Linguistics and Linguistic Theory*, *15*(1), pp. 133–167. https://doi.org/10.1515/cllt-2015-0030
+[58] [**^**](#ref-odds-ratio) Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. *Corpus Linguistics and Linguistic Theory*, *15*(1), pp. 133–167. https://doi.org/10.1515/cllt-2015-0030
-[60] [**^**](#ref-poisson-collocation-measure) Quasthoff, U., & Wolff, C. (2002). The poisson collocation measure and its applications. *Proceedings of 2nd International Workshop on Computational Approaches to Collocations*. IEEE.
+[59] [**^**](#ref-poisson-collocation-measure) Quasthoff, U., & Wolff, C. (2002). The poisson collocation measure and its applications. *Proceedings of 2nd International Workshop on Computational Approaches to Collocations*. IEEE.
-[61] [**^**](#ref-rosengrens-s)[**^**](#ref-rosengrens-kf) Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. *Études de linguistique appliquée*, *1*, 103–127.
+[60] [**^**](#ref-rosengrens-s)[**^**](#ref-rosengrens-kf) Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. *Études de linguistique appliquée*, *1*, 103–127.
-[62] [**^**](#ref-log-dice) Rychlý, P. (2008). A lexicographyer-friendly association score. In P. Sojka & A. Horák (Eds.), *Proceedings of Second Workshop on Recent Advances in Slavonic Natural Languages Processing*. Masaryk University
+[61] [**^**](#ref-log-dice) Rychlý, P. (2008). A lexicographer-friendly association score. In P. Sojka & A. Horák (Eds.), *Proceedings of Second Workshop on Recent Advances in Slavonic Natural Languages Processing*. Masaryk University.
-[63] [**^**](#ref-ald) [**^**](#ref-fald) [**^**](#ref-arf) [**^**](#ref-farf) [**^**](#ref-awt) [**^**](#ref-fawt) Savický, P., & Hlaváčová, J. (2002). Measures of word commonness. *Journal of Quantitative Linguistics*, *9*(3), 215–231. https://doi.org/10.1076/jqul.9.3.215.14124 +[62] [**^**](#ref-ald) [**^**](#ref-fald) [**^**](#ref-arf) [**^**](#ref-farf) [**^**](#ref-awt) [**^**](#ref-fawt) Savický, P., & Hlaváčová, J. (2002). Measures of word commonness. *Journal of Quantitative Linguistics*, *9*(3), 215–231. https://doi.org/10.1076/jqul.9.3.215.14124 -[64] [**^**](#ref-dices-coeff) Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. *Computational Linguistics*, *22*(1), pp. 1–38.
+[63] [**^**](#ref-dices-coeff) Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. *Computational Linguistics*, *22*(1), pp. 1–38.
-[65] [**^**](#ref-devereux-readability-index) Smith, E. A. (1961). Devereaux readability index. *Journal of Educational Research*, *54*(8), 298–303. https://doi.org/10.1080/00220671.1961.10882728
+[64] [**^**](#ref-devereux-readability-index) Smith, E. A. (1961). Devereaux readability index. *Journal of Educational Research*, *54*(8), 298–303. https://doi.org/10.1080/00220671.1961.10882728
-[66] [**^**](#ref-ari) Smith, E. A., & Senter, R. J. (1967). *Automated readability index*. Aerospace Medical Research Laboratories. https://apps.dtic.mil/sti/pdfs/AD0667273.pdf
+[65] [**^**](#ref-ari) Smith, E. A., & Senter, R. J. (1967). *Automated readability index*. Aerospace Medical Research Laboratories. https://apps.dtic.mil/sti/pdfs/AD0667273.pdf
-[67] [**^**](#ref-spache-grade-level) Spache, G. (1953). A new readability formula for primary-grade reading materials. *Elementary School Journal*, *53*(7), 410–413. https://doi.org/10.1086/458513
+[66] [**^**](#ref-spache-grade-level) Spache, G. (1953). A new readability formula for primary-grade reading materials. *Elementary School Journal*, *53*(7), 410–413. https://doi.org/10.1086/458513
-[68] [**^**](#ref-szigriszts-perspicuity_index) Szigriszt Pazos, F. (1993). *Sistemas predictivos de legibilidad del mensaje escrito: Formula de perspicuidad* [Doctoral dissertation, Complutense University of Madrid]. Biblos-e Archivo. https://repositorio.uam.es/bitstream/handle/10486/2488/3907_barrio_cantalejo_ines_maria.pdf?sequence=1&isAllowed=y
+[67] [**^**](#ref-re) Szigriszt Pazos, F. (1993). *Sistemas predictivos de legibilidad del mensaje escrito: Formula de perspicuidad* [Doctoral dissertation, Complutense University of Madrid]. Biblos-e Archivo. https://repositorio.uam.es/bitstream/handle/10486/2488/3907_barrio_cantalejo_ines_maria.pdf?sequence=1&isAllowed=y
-[69] [**^**](#ref-lfmd)[**^**](#ref-md) Thanopoulos, A., Fakotakis, N., & Kokkinakis, G. (2002). Comparative evaluation of collocation extraction metrics. In M. G. González & C. P. S. Araujo (Eds.), *Proceedings of the Third International Conference on Language Resources and Evaluation* (pp. 620–625). European Language Resources Association.
+[68] [**^**](#ref-lfmd)[**^**](#ref-md) Thanopoulos, A., Fakotakis, N., & Kokkinakis, G. (2002). Comparative evaluation of collocation extraction metrics. In M. G. González & C. P. S. Araujo (Eds.), *Proceedings of the Third International Conference on Language Resources and Evaluation* (pp. 620–625). European Language Resources Association.
-[70] [**^**](#ref-log-likehood-ratio-test-bayes-factor)[**^**](#ref-students-t-test-2-sample-bayes-factor) Wilson, A. (2013). Embracing Bayes Factors for key item analysis in corpus linguistics. In M. Bieswanger & A. Koll-Stobbe (Eds.), *New Approaches to the Study of Linguistic Variability* (pp. 3–11). Peter Lang.
+[69] [**^**](#ref-log-likehood-ratio-test-bayes-factor)[**^**](#ref-students-t-test-2-sample-bayes-factor) Wilson, A. (2013). Embracing Bayes Factors for key item analysis in corpus linguistics. In M. Bieswanger & A. Koll-Stobbe (Eds.), *New Approaches to the Study of Linguistic Variability* (pp. 3–11). Peter Lang.
-[71] [**^**](#ref-zhangs-distributional-consistency) Zhang, H., Huang, C., & Yu, S. (2004). Distributional consistency: As a general method for defining a core lexicon. In M. T. Lino, M. F. Xavier, F. Ferreira, R. Costa, & R. Silva (Eds.), *Proceedings of Fourth International Conference on Language Resources and Evaluation* (pp. 1119–1122). European Language Resources Association.
+[70] [**^**](#ref-zhangs-distributional-consistency) Zhang, H., Huang, C., & Yu, S. (2004). Distributional consistency: As a general method for defining a core lexicon. In M. T. Lino, M. F. Xavier, F. Ferreira, R. Costa, & R. Silva (Eds.), *Proceedings of Fourth International Conference on Language Resources and Evaluation* (pp. 1119–1122). European Language Resources Association.
diff --git a/doc/measures/readability/fernandez_huertas_readability_score.svg b/doc/measures/readability/fernandez_huertas_readability_score.svg deleted file mode 100644 index 8c177dbf2..000000000 --- a/doc/measures/readability/fernandez_huertas_readability_score.svg +++ /dev/null @@ -1,91 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/doc/measures/readability/re.svg b/doc/measures/readability/re.svg index 037e3b7e8..75d68d665 100644 --- a/doc/measures/readability/re.svg +++ b/doc/measures/readability/re.svgo newline at end of file diff --git a/doc/measures/readability/wstf.svg b/doc/measures/readability/wstf.svg index 491c0e675..43a0b56f1 100644 --- a/doc/measures/readability/wstf.svg +++ b/doc/measures/readability/wstf.svgo newline at end of file diff --git a/tests/wl_tests_measures/test_measures_readability.py b/tests/wl_tests_measures/test_measures_readability.py index d192332a7..9ba4925b7 100644 --- a/tests/wl_tests_measures/test_measures_readability.py +++ b/tests/wl_tests_measures/test_measures_readability.py @@ -45,22 +45,36 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'): test_text_eng_120 = Wl_Test_Text(TOKENS_MULTILEVEL_120) test_text_eng_150 = Wl_Test_Text(TOKENS_MULTILEVEL_150) +test_text_ara_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ara') +test_text_ara_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ara') + +test_text_deu_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de') +test_text_deu_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de') + +test_text_ita_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ita') +test_text_ita_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ita') + test_text_spa_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'spa') test_text_spa_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'spa') test_text_spa_120 = 
Wl_Test_Text(TOKENS_MULTILEVEL_120, lang = 'spa') test_text_spa_150 = Wl_Test_Text(TOKENS_MULTILEVEL_150, lang = 'spa') +test_text_afr_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'afr') +test_text_nld_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'nld') +test_text_fra_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'fra') +test_text_pol_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'pol') +test_text_rus_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'rus') test_text_other_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'other') def test_automated_ara_readability_index(): - aari_ara_0 = wl_measures_readability.automated_ara_readability_index(main, Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ara')) - aari_ara_12 = wl_measures_readability.automated_ara_readability_index(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ara')) + aari_ara_0 = wl_measures_readability.automated_ara_readability_index(main, test_text_ara_0) + aari_ara_12 = wl_measures_readability.automated_ara_readability_index(main, test_text_ara_12) aari_eng_12 = wl_measures_readability.automated_ara_readability_index(main, test_text_eng_12) print('Automated Arabic Readability Index:') - print(f'\t{aari_ara_0} (ara/0)') - print(f'\t{aari_ara_12} (ara/12)') - print(f'\t{aari_eng_12} (eng/12)') + print(f'\tara/0: {aari_ara_0}') + print(f'\tara/12: {aari_ara_12}') + print(f'\teng/12: {aari_eng_12}') assert aari_ara_0 == 'text_too_short' assert aari_ara_12 == 3.28 * 46 + 1.43 * (46 / 12) + 1.24 * (12 / 3) @@ -72,9 +86,9 @@ def test_automated_readability_index(): ari_spa_12 = wl_measures_readability.automated_readability_index(main, test_text_spa_12) print('Automated Readability Index:') - print(f'\t{ari_eng_0} (0)') - print(f'\t{ari_eng_12} (eng/12)') - print(f'\t{ari_spa_12} (spa/12)') + print(f'\teng/0: {ari_eng_0}') + print(f'\teng/12: {ari_eng_12}') + print(f'\tspa/12: {ari_spa_12}') assert ari_eng_0 == 'text_too_short' assert ari_eng_12 == ari_spa_12 == 0.5 * (12 / 3) + 4.71 * (47 / 12) - 21.43 @@ -85,9 
+99,9 @@ def test_coleman_liau_index(): grade_level_spa_12 = wl_measures_readability.coleman_liau_index(main, test_text_spa_12) print('Coleman-Liau Index:') - print(f'\t{grade_level_eng_0} (0)') - print(f'\t{grade_level_eng_12} (eng/12)') - print(f'\t{grade_level_spa_12} (spa/12)') + print(f'\teng/0: {grade_level_eng_0}') + print(f'\teng/12: {grade_level_eng_12}') + print(f'\tspa/12: {grade_level_spa_12}') est_cloze_pct = 141.8401 - 0.21459 * (45 / 12 * 100) + 1.079812 * (3 / 12 * 100) @@ -100,9 +114,9 @@ def test_dale_chall_readability_score(): x_c50_spa_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_spa_12) print('Dale-Chall Readibility Score:') - print(f'\t{x_c50_eng_0} (0)') - print(f'\t{x_c50_eng_12} (eng/12)') - print(f'\t{x_c50_spa_12} (spa/12)') + print(f'\teng/0: {x_c50_eng_0}') + print(f'\teng/12: {x_c50_eng_12}') + print(f'\tspa/12: {x_c50_spa_12}') assert x_c50_eng_0 == 'text_too_short' assert x_c50_eng_12 == 0.1579 * (1 / 12) + 0.0496 * (12 / 3) + 3.6365 @@ -114,27 +128,13 @@ def test_devereux_readability_index(): grade_placement_spa_12 = wl_measures_readability.devereux_readability_index(main, test_text_spa_12) print('Devereux Readability Index:') - print(f'\t{grade_placement_eng_0} (0)') - print(f'\t{grade_placement_eng_12} (eng/12)') - print(f'\t{grade_placement_spa_12} (spa/12)') + print(f'\teng/0: {grade_placement_eng_0}') + print(f'\teng/12: {grade_placement_eng_12}') + print(f'\tspa/12: {grade_placement_spa_12}') assert grade_placement_eng_0 == 'text_too_short' assert grade_placement_eng_12 == grade_placement_spa_12 == 1.56 * (47 / 12) + 0.19 * (12 / 3) - 6.49 -def test_fernandez_huertas_readability_score(): - score_spa_0 = wl_measures_readability.fernandez_huertas_readability_score(main, test_text_spa_0) - score_spa_12 = wl_measures_readability.fernandez_huertas_readability_score(main, test_text_spa_12) - score_eng_12 = wl_measures_readability.fernandez_huertas_readability_score(main, test_text_eng_12) - - 
print("Fernández Huerta's Readability Score:") - print(f'\t{score_spa_0} (spa/0)') - print(f'\t{score_spa_12} (spa/12)') - print(f'\t{score_eng_12} (eng/12)') - - assert score_spa_0 == 'text_too_short' - assert score_spa_12 == 206.84 - 60 * (18 / 12) - 102 * (3 / 12) - assert score_eng_12 == 'no_support' - def test_flesch_kincaid_grade_level(): gl_eng_0 = wl_measures_readability.flesch_kincaid_grade_level(main, test_text_eng_0) gl_eng_12 = wl_measures_readability.flesch_kincaid_grade_level(main, test_text_eng_12) @@ -142,10 +142,10 @@ def test_flesch_kincaid_grade_level(): gl_other_12 = wl_measures_readability.flesch_kincaid_grade_level(main, test_text_other_12) print('Flesch-Kincaid Grade Level:') - print(f'\t{gl_eng_0} (0)') - print(f'\t{gl_eng_12} (eng/12)') - print(f'\t{gl_spa_12} (spa/12)') - print(f'\t{gl_other_12} (other/12)') + print(f'\teng/0: {gl_eng_0}') + print(f'\teng/12: {gl_eng_12}') + print(f'\tspa/12: {gl_spa_12}') + print(f'\tother/12: {gl_other_12}') assert gl_eng_0 == 'text_too_short' assert gl_eng_12 == 0.39 * (12 / 3) + 11.8 * (15 / 12) - 15.59 @@ -155,18 +155,43 @@ def test_flesch_kincaid_grade_level(): def test_flesch_reading_ease(): flesch_re_eng_0 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_0) flesch_re_eng_12 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_12) - flesch_re_spa_12 = wl_measures_readability.flesch_reading_ease(main, test_text_spa_12) + flesch_re_nld_12 = wl_measures_readability.flesch_reading_ease(main, test_text_nld_12) + flesch_re_fra_12 = wl_measures_readability.flesch_reading_ease(main, test_text_fra_12) + flesch_re_deu_12 = wl_measures_readability.flesch_reading_ease(main, test_text_deu_12) + flesch_re_ita_12 = wl_measures_readability.flesch_reading_ease(main, test_text_ita_12) + flesch_re_rus_12 = wl_measures_readability.flesch_reading_ease(main, test_text_rus_12) + + main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Fernández Huerta' + flesch_re_spa_12_fh = 
wl_measures_readability.flesch_reading_ease(main, test_text_spa_12) + main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Szigriszt Pazos' + flesch_re_spa_12_sp = wl_measures_readability.flesch_reading_ease(main, test_text_spa_12) + + flesch_re_afr_12 = wl_measures_readability.flesch_reading_ease(main, test_text_afr_12) flesch_re_other_12 = wl_measures_readability.flesch_reading_ease(main, test_text_other_12) print('Flesch Reading Ease:') - print(f'\t{flesch_re_eng_0} (0)') - print(f'\t{flesch_re_eng_12} (eng/12)') - print(f'\t{flesch_re_spa_12} (spa/12)') - print(f'\t{flesch_re_other_12} (other/12)') + print(f'\teng/0: {flesch_re_eng_0}') + print(f'\teng/12: {flesch_re_eng_12}') + print(f'\tnld/12: {flesch_re_nld_12}') + print(f'\tfra/12: {flesch_re_fra_12}') + print(f'\tdeu/12: {flesch_re_deu_12}') + print(f'\tita/12: {flesch_re_ita_12}') + print(f'\trus/12: {flesch_re_rus_12}') + print(f'\tspa-FH/12: {flesch_re_spa_12_fh}') + print(f'\tspa-SP/12: {flesch_re_spa_12_sp}') + print(f'\tafr/12: {flesch_re_afr_12}') + print(f'\tother/12: {flesch_re_other_12}') assert flesch_re_eng_0 == 'text_too_short' assert flesch_re_eng_12 == 206.835 - 0.846 * (15 / 12 * 100) - 1.015 * (12 / 3) - assert flesch_re_spa_12 != 'no_support' + assert flesch_re_nld_12 == 206.84 - 77 * (18 / 12) - 0.93 * (12 / 3) + assert flesch_re_fra_12 == 207 - 73.6 * (16 / 12) - 1.015 * (12 / 3) + assert flesch_re_deu_12 == 180 - 58.5 * (15 / 12) - (12 / 3) + assert flesch_re_ita_12 == 217 - 60 * (19 / 12) - 1.3 * (12 / 3) + assert flesch_re_rus_12 == 206.835 - 60.1 * (13 / 12) - 1.3 * (12 / 3) + assert flesch_re_spa_12_fh == 206.84 - 60 * (18 / 12) - 1.02 * (12 / 3) + assert flesch_re_spa_12_sp == 207 - 62.3 * (18 / 12) - (12 / 3) + assert flesch_re_afr_12 == 206.835 - 0.846 * (18 / 12 * 100) - 1.015 * (12 / 3) assert flesch_re_other_12 == 'no_support' def test_flesch_reading_ease_simplified(): @@ -176,10 +201,10 @@ def test_flesch_reading_ease_simplified():
flesch_re_simplified_other_12 = wl_measures_readability.flesch_reading_ease_simplified(main, test_text_other_12) print('Flesch Reading Ease (Simplified):') - print(f'\t{flesch_re_simplified_eng_0} (0)') - print(f'\t{flesch_re_simplified_eng_12} (eng/12)') - print(f'\t{flesch_re_simplified_spa_12} (spa/12)') - print(f'\t{flesch_re_simplified_other_12} (other/12)') + print(f'\teng/0: {flesch_re_simplified_eng_0}') + print(f'\teng/12: {flesch_re_simplified_eng_12}') + print(f'\tspa/12: {flesch_re_simplified_spa_12}') + print(f'\tother/12: {flesch_re_simplified_other_12}') assert flesch_re_simplified_eng_0 == 'text_too_short' assert flesch_re_simplified_eng_12 == flesch_re_simplified_spa_12 == 1.599 * (9 / 12 * 100) - 1.015 * (12 / 3) - 31.517 @@ -192,10 +217,10 @@ def test_forcast_grade_level(): rgl_other_12 = wl_measures_readability.forcast_grade_level(main, test_text_other_12) print('FORCAST Grade Level:') - print(f'\t{rgl_eng_12} (eng/12)') - print(f'\t{rgl_eng_150} (eng/150)') - print(f'\t{rgl_spa_150} (spa/150)') - print(f'\t{rgl_other_12} (other/12)') + print(f'\teng/12: {rgl_eng_12}') + print(f'\teng/150: {rgl_eng_150}') + print(f'\tspa/150: {rgl_spa_150}') + print(f'\tother/12: {rgl_other_12}') assert rgl_eng_12 == 'text_too_short' assert rgl_eng_150 == rgl_spa_150 == 20.43 - 0.11 * (6 * 18 + 4) @@ -207,9 +232,9 @@ def test_formula_de_comprensibilidad_de_gutierrez_de_polini(): cp_eng_12 = wl_measures_readability.formula_de_comprensibilidad_de_gutierrez_de_polini(main, test_text_eng_12) print('Fórmula de comprensibilidad de Gutiérrez de Polini:') - print(f'\t{cp_spa_0} (spa/0)') - print(f'\t{cp_spa_12} (spa/12)') - print(f'\t{cp_eng_12} (eng/12)') + print(f'\tspa/0: {cp_spa_0}') + print(f'\tspa/12: {cp_spa_12}') + print(f'\teng/12: {cp_eng_12}') assert cp_spa_0 == 'text_too_short' assert cp_spa_12 == 95.2 - 9.7 * (45 / 12) - 0.35 * (12 / 3) @@ -221,23 +246,23 @@ def test_formula_de_crawford(): grade_level_eng_12 =
wl_measures_readability.formula_de_crawford(main, test_text_eng_12) print('Fórmula de Crawford:') - print(f'\t{grade_level_spa_0} (spa/0)') - print(f'\t{grade_level_spa_12} (spa/12)') - print(f'\t{grade_level_eng_12} (eng/12)') + print(f'\tspa/0: {grade_level_spa_0}') + print(f'\tspa/12: {grade_level_spa_12}') + print(f'\teng/12: {grade_level_eng_12}') assert grade_level_spa_0 == 'text_too_short' assert grade_level_spa_12 == 3 / 12 * 100 * (-0.205) + 18 / 12 * 100 * 0.049 - 3.407 assert grade_level_eng_12 == 'no_support' def test_gulpease_index(): - gulpease_index_ita_0 = wl_measures_readability.gulpease_index(main, Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ita')) - gulpease_index_ita_12 = wl_measures_readability.gulpease_index(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ita')) + gulpease_index_ita_0 = wl_measures_readability.gulpease_index(main, test_text_ita_0) + gulpease_index_ita_12 = wl_measures_readability.gulpease_index(main, test_text_ita_12) gulpease_index_eng_12 = wl_measures_readability.gulpease_index(main, test_text_eng_12) print('Gulpease Index:') - print(f'\t{gulpease_index_ita_0} (ita/0)') - print(f'\t{gulpease_index_ita_12} (ita/12)') - print(f'\t{gulpease_index_eng_12} (eng/12)') + print(f'\tita/0: {gulpease_index_ita_0}') + print(f'\tita/12: {gulpease_index_ita_12}') + print(f'\teng/12: {gulpease_index_eng_12}') assert gulpease_index_ita_0 == 'text_too_short' assert gulpease_index_ita_12 == 89 + (300 * 3 - 10 * 45) / 12 @@ -246,14 +271,14 @@ def test_gulpease_index(): def test_gunning_fog_index(): fog_index_eng_0 = wl_measures_readability.gunning_fog_index(main, test_text_eng_0) fog_index_eng_12_propn = wl_measures_readability.gunning_fog_index(main, test_text_eng_12_propn) - fog_index_pol_12 = wl_measures_readability.gunning_fog_index(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'pol')) + fog_index_pol_12 = wl_measures_readability.gunning_fog_index(main, test_text_pol_12) fog_index_spa_12 = wl_measures_readability.gunning_fog_index(main,
test_text_spa_12) print('Gunning Fog Index:') - print(f'\t{fog_index_eng_0} (0)') - print(f'\t{fog_index_eng_12_propn} (eng/12)') - print(f'\t{fog_index_pol_12} (pol/12)') - print(f'\t{fog_index_spa_12} (spa/12)') + print(f'\teng/0: {fog_index_eng_0}') + print(f'\teng/12: {fog_index_eng_12_propn}') + print(f'\tpol/12: {fog_index_pol_12}') + print(f'\tspa/12: {fog_index_spa_12}') assert fog_index_eng_0 == 'text_too_short' assert fog_index_eng_12_propn == 0.4 * (12 / 3 + 1 / 12 * 100) @@ -266,9 +291,9 @@ def test_legibility_mu(): mu_eng_12 = wl_measures_readability.legibility_mu(main, test_text_eng_12) print('Legibilidad µ:') - print(f'\t{mu_spa_0} (spa/0)') - print(f'\t{mu_spa_12} (spa/12)') - print(f'\t{mu_eng_12} (eng/12)') + print(f'\tspa/0: {mu_spa_0}') + print(f'\tspa/12: {mu_spa_12}') + print(f'\teng/12: {mu_eng_12}') assert mu_spa_0 == 'text_too_short' assert mu_spa_12 == (12 / 11) * (3.75 / 7.1875) * 100 @@ -281,10 +306,10 @@ def test_lensear_write(): score_other_12 = wl_measures_readability.lensear_write(main, test_text_other_12) print('Lensear Write:') - print(f'\t{score_eng_0} (eng/0)') - print(f'\t{score_eng_12} (eng/12)') - print(f'\t{score_eng_100} (eng/100)') - print(f'\t{score_other_12} (other/12)') + print(f'\teng/0: {score_eng_0}') + print(f'\teng/12: {score_eng_12}') + print(f'\teng/100: {score_eng_100}') + print(f'\tother/12: {score_other_12}') assert score_eng_0 == 'text_too_short' assert score_eng_12 == 6 * (100 / 12) + 3 * 3 * (100 / 12) @@ -297,9 +322,9 @@ def test_lix(): lix_spa_12 = wl_measures_readability.lix(main, test_text_spa_12) print('Lix:') - print(f'\t{lix_eng_0} (eng/0)') - print(f'\t{lix_eng_12} (eng/12)') - print(f'\t{lix_spa_12} (spa/12)') + print(f'\teng/0: {lix_eng_0}') + print(f'\teng/12: {lix_eng_12}') + print(f'\tspa/12: {lix_spa_12}') assert lix_eng_0 == 'text_too_short' assert lix_eng_12 == 12 / 3 + 100 * (3 / 12) @@ -311,23 +336,23 @@ def test_mcalpine_eflaw(): eflaw_spa_12 = wl_measures_readability.mcalpine_eflaw(main,
test_text_spa_12) print('McAlpine EFLAW Readability Score:') - print(f'\t{eflaw_eng_0} (eng/0)') - print(f'\t{eflaw_eng_12} (eng/12)') - print(f'\t{eflaw_spa_12} (spa/12)') + print(f'\teng/0: {eflaw_eng_0}') + print(f'\teng/12: {eflaw_eng_12}') + print(f'\tspa/12: {eflaw_spa_12}') assert eflaw_eng_0 == 'text_too_short' assert eflaw_eng_12 == (12 + 6) / 3 assert eflaw_spa_12 == 'no_support' def test_osman(): - osman_ara_0 = wl_measures_readability.osman(main, Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ara')) - osman_ara_12 = wl_measures_readability.osman(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ara')) + osman_ara_0 = wl_measures_readability.osman(main, test_text_ara_0) + osman_ara_12 = wl_measures_readability.osman(main, test_text_ara_12) osman_eng_12 = wl_measures_readability.osman(main, test_text_eng_12) print('OSMAN:') - print(f'\t{osman_ara_0} (ara/0)') - print(f'\t{osman_ara_12} (ara/12)') - print(f'\t{osman_eng_12} (eng/12)') + print(f'\tara/0: {osman_ara_0}') + print(f'\tara/12: {osman_ara_12}') + print(f'\teng/12: {osman_eng_12}') assert osman_ara_0 == 'text_too_short' assert osman_ara_12 == 200.791 - 1.015 * (12 / 3) - 24.181 * ((3 + 23 + 3 + 0) / 12) @@ -339,9 +364,9 @@ def test_rix(): rix_spa_12 = wl_measures_readability.rix(main, test_text_spa_12) print('Rix:') - print(f'\t{rix_eng_0} (eng/0)') - print(f'\t{rix_eng_12} (eng/12)') - print(f'\t{rix_spa_12} (spa/12)') + print(f'\teng/0: {rix_eng_0}') + print(f'\teng/12: {rix_eng_12}') + print(f'\tspa/12: {rix_spa_12}') assert rix_eng_0 == 'text_too_short' assert rix_eng_12 == rix_spa_12 == 3 / 3 @@ -353,10 +378,10 @@ def test_smog_grade(): g_other_12 = wl_measures_readability.smog_grade(main, test_text_other_12) print('SMOG Grade:') - print(f'\t{g_eng_12} (eng/12)') - print(f'\t{g_eng_120} (eng/120)') - print(f'\t{g_spa_120} (spa/120)') - print(f'\t{g_other_12} (other/12)') + print(f'\teng/12: {g_eng_12}') + print(f'\teng/120: {g_eng_120}') + print(f'\tspa/120: {g_spa_120}') + print(f'\tother/12:
{g_other_12}') assert g_eng_12 == 'text_too_short' assert g_eng_120 == 3.1291 + 1.043 * (15 ** 0.5) @@ -369,43 +394,29 @@ def test_spache_grade_level(): grade_level_spa_12 = wl_measures_readability.spache_grade_level(main, test_text_spa_12) print('Spache Grade Level:') - print(f'\t{grade_level_eng_12} (eng/12)') - print(f'\t{grade_level_eng_100} (eng/100)') - print(f'\t{grade_level_spa_12} (spa/12)') + print(f'\teng/12: {grade_level_eng_12}') + print(f'\teng/100: {grade_level_eng_100}') + print(f'\tspa/12: {grade_level_spa_12}') assert grade_level_eng_12 == 'text_too_short' assert grade_level_eng_100 == numpy.mean([0.141 * (100 / 25) + 0.086 * (25 / 100 * 100) + 0.839] * 3) assert grade_level_spa_12 == 'no_support' -def test_szigriszts_perspicuity_index(): - p_spa_0 = wl_measures_readability.szigriszts_perspicuity_index(main, test_text_spa_0) - p_spa_12 = wl_measures_readability.szigriszts_perspicuity_index(main, test_text_spa_12) - p_eng_12 = wl_measures_readability.szigriszts_perspicuity_index(main, test_text_eng_12) - - print("Szigriszt's Perspicuity Index:") - print(f'\t{p_spa_0} (spa/0)') - print(f'\t{p_spa_12} (spa/12)') - print(f'\t{p_eng_12} (eng/12)') - - assert p_spa_0 == 'text_too_short' - assert p_spa_12 == 207 - 62.3 * (18 / 12) - (12 / 3) - assert p_eng_12 == 'no_support' - def test_wiener_sachtextformel(): - wstf_deu_0 = wl_measures_readability.wiener_sachtextformel(main, Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de')) - wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de'), variant = '1') - wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de'), variant = '2') - wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de'), variant = '3') - wstf_deu_12_4 = wl_measures_readability.wiener_sachtextformel(main, Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de'),
variant = '4') + wstf_deu_0 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_0) + wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '1') + wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '2') + wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '3') + wstf_deu_12_4 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '4') wstf_eng_12 = wl_measures_readability.wiener_sachtextformel(main, test_text_eng_12) print('Wiener Sachtextformel:') - print(f'\t{wstf_deu_0} (deu/0)') - print(f'\t{wstf_deu_12_1} (deu-1/12)') - print(f'\t{wstf_deu_12_2} (deu-2/12)') - print(f'\t{wstf_deu_12_3} (deu-3/12)') - print(f'\t{wstf_deu_12_4} (deu-4/12)') - print(f'\t{wstf_eng_12} (eng/12)') + print(f'\tdeu/0: {wstf_deu_0}') + print(f'\tdeu-1/12: {wstf_deu_12_1}') + print(f'\tdeu-2/12: {wstf_deu_12_2}') + print(f'\tdeu-3/12: {wstf_deu_12_3}') + print(f'\tdeu-4/12: {wstf_deu_12_4}') + print(f'\teng/12: {wstf_eng_12}') ms = 0 / 12 sl = 12 / 3 @@ -425,7 +436,6 @@ def test_wiener_sachtextformel(): test_coleman_liau_index() test_dale_chall_readability_score() test_devereux_readability_index() - test_fernandez_huertas_readability_score() test_flesch_kincaid_grade_level() test_flesch_reading_ease() test_flesch_reading_ease_simplified() @@ -442,5 +452,4 @@ def test_wiener_sachtextformel(): test_rix() test_smog_grade() test_spache_grade_level() - test_szigriszts_perspicuity_index() test_wiener_sachtextformel() diff --git a/tests/wl_tests_work_area/test_profiler.py b/tests/wl_tests_work_area/test_profiler.py index 2a4cc0644..86ffa7bf2 100644 --- a/tests/wl_tests_work_area/test_profiler.py +++ b/tests/wl_tests_work_area/test_profiler.py @@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files): count_tokens_lens_syls.append(collections.Counter(len_tokens_syls))
count_tokens_lens_chars.append(collections.Counter(len_tokens_chars)) - assert len(readability_statistics) == 24 + assert len(readability_statistics) == 22 # Counts assert count_paras diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py index 6d9a14ad7..fe3c08b2d 100644 --- a/wordless/wl_measures/wl_measures_readability.py +++ b/wordless/wl_measures/wl_measures_readability.py @@ -216,27 +216,6 @@ def devereux_readability_index(main, text): return grade_placement -# Fernández Huerta's Readability Score -# References: -# Fernández Huerta, J. (1959). Medidas sencillas de lecturabilidad. Consigna, 214, 29–32. -# Law, Gwillim. (2011, May 27). Error in the Fernandez Huerta readability formula. LINGUIST List. https://linguistlist.org/issues/22/22-2332/ -def fernandez_huertas_readability_score(main, text): - if text.lang == 'spa' and text.lang in main.settings_global['syl_tokenizers']: - text = get_counts(main, text) - - if text.count_words and text.count_sentences: - p = ( - 206.84 - - 60 * (text.count_syls / text.count_words) - - 102 * (text.count_sentences / text.count_words) - ) - else: - p = 'text_too_short' - else: - p = 'no_support' - - return p - # Flesch-Kincaid Grade Level # Reference: Kincaid, J. P., Fishburne, R. P., Rogers, R. L., & Chissom, B. S. (1975). Derivation of new readability formulas (automated readability index, fog count, and Flesch reading ease formula) for navy enlisted personnel. Naval Air Station Memphis. https://apps.dtic.mil/sti/pdfs/ADA006655.pdf def flesch_kincaid_grade_level(main, text): @@ -272,22 +251,71 @@ def flesch_kincaid_grade_level(main, text): # Garais, E. (2011). Web applications readability. Journal of Information Systems and Operations Management, 5(1), 117–121. http://www.rebe.rau.ro/RePEc/rau/jisomg/SP11/JISOM-SP11-A13.pdf # Russian variant: # Oborneva, I. V. (2006).
Автоматизированная оценка сложности учебных текстов на основе статистических параметров [Doctoral dissertation, Institute for Strategy of Education Development of the Russian Academy of Education]. Freereferats.ru. https://static.freereferats.ru/_avtoreferats/01002881899.pdf?ver=3 +# Spanish variant (Fernández Huerta): +# Fernández Huerta, J. (1959). Medidas sencillas de lecturabilidad. Consigna, 214, 29–32. +# Garais, E. (2011). Web applications readability. Journal of Information Systems and Operations Management, 5(1), 117–121. http://www.rebe.rau.ro/RePEc/rau/jisomg/SP11/JISOM-SP11-A13.pdf +# Spanish variant (Szigriszt Pazos): +# Szigriszt Pazos, F. (1993). Sistemas predictivos de legibilidad del mensaje escrito: Formula de perspicuidad [Doctoral dissertation, Complutense University of Madrid]. Biblos-e Archivo. https://repositorio.uam.es/bitstream/handle/10486/2488/3907_barrio_cantalejo_ines_maria.pdf?sequence=1&isAllowed=y def flesch_reading_ease(main, text): if text.lang in main.settings_global['syl_tokenizers']: text = get_counts(main, text) if text.count_words and text.count_sentences: - flesch_re = ( - 206.835 - - 0.846 * (text.count_syls / text.count_words * 100) - - 1.015 * (text.count_words / text.count_sentences) - ) + if text.lang == 'nld': + re = ( + 206.84 + - 77 * (text.count_syls / text.count_words) + - 0.93 * (text.count_words / text.count_sentences) + ) + elif text.lang == 'fra': + re = ( + 207 + - 73.6 * (text.count_syls / text.count_words) + - 1.015 * (text.count_words / text.count_sentences) + ) + elif text.lang.startswith('deu_'): + re = ( + 180 + - 58.5 * (text.count_syls / text.count_words) + - (text.count_words / text.count_sentences) + ) + elif text.lang == 'ita': + re = ( + 217 + - 60 * (text.count_syls / text.count_words) + - 1.3 * (text.count_words / text.count_sentences) + ) + elif text.lang == 'rus': + re = ( + 206.835 + - 60.1 * (text.count_syls / text.count_words) + - 1.3 * (text.count_words / text.count_sentences) + ) + elif
text.lang == 'spa': + if main.settings_custom['measures']['readability']['re']['variant_spa'] == 'Fernández Huerta': + re = ( + 206.84 + - 60 * (text.count_syls / text.count_words) + - 1.02 * (text.count_words / text.count_sentences) + ) + elif main.settings_custom['measures']['readability']['re']['variant_spa'] == 'Szigriszt Pazos': + re = ( + 207 + - 62.3 * (text.count_syls / text.count_words) + - (text.count_words / text.count_sentences) + ) + else: + re = ( + 206.835 + - 0.846 * (text.count_syls / text.count_words * 100) + - 1.015 * (text.count_words / text.count_sentences) + ) else: - flesch_re = 'text_too_short' + re = 'text_too_short' else: - flesch_re = 'no_support' + re = 'no_support' - return flesch_re + return re # Flesch Reading Ease (Simplified) # Reference: Farr, J. N., Jenkins, J. J., & Paterson, D. G. (1951). Simplification of Flesch reading ease formula. Journal of Applied Psychology, 35(5), 333–337. https://doi.org/10.1037/h0062427 @@ -689,25 +717,6 @@ def spache_grade_level(main, text): return grade_level -# Szigriszt's Perspicuity Index -# Reference: Szigriszt Pazos, F. (1993). Sistemas predictivos de legibilidad del mensaje escrito: Formula de perspicuidad [Doctoral dissertation, Complutense University of Madrid]. Biblos-e Archivo. https://repositorio.uam.es/bitstream/handle/10486/2488/3907_barrio_cantalejo_ines_maria.pdf?sequence=1&isAllowed=y -def szigriszts_perspicuity_index(main, text): - if text.lang == 'spa' and text.lang in main.settings_global['syl_tokenizers']: - text = get_counts(main, text) - - if text.count_words and text.count_sentences: - p = ( - 207 - - 62.3 * (text.count_syls / text.count_words) - - (text.count_words / text.count_sentences) - ) - else: - p = 'text_too_short' - else: - p = 'no_support' - - return p - # Wiener Sachtextformel # References: # Bamberger, R., & Vanecek, E. (1984). Lesen – Verstehen – Lernen – Schreiben. Jugend und Volk.
diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py index f77de1efd..cc0faa82e 100644 --- a/wordless/wl_profiler.py +++ b/wordless/wl_profiler.py @@ -362,7 +362,6 @@ def __init__(self, parent): _tr('wl_profiler', 'Coleman-Liau Index'), _tr('wl_profiler', 'Dale-Chall Readability Score'), _tr('wl_profiler', 'Devereaux Readability Index'), - _tr('wl_profiler', "Fernández Huerta's Readability Score"), _tr('wl_profiler', 'Flesch-Kincaid Grade Level'), _tr('wl_profiler', 'Flesch Reading Ease'), _tr('wl_profiler', 'Flesch Reading Ease (Simplified)'), @@ -379,7 +378,6 @@ def __init__(self, parent): _tr('wl_profiler', 'Rix'), _tr('wl_profiler', 'SMOG Grade'), _tr('wl_profiler', 'Spache Grade Level'), - _tr('wl_profiler', "Szigriszt's Perspicuity Index"), _tr('wl_profiler', 'Wiener Sachtextformel') ] @@ -1178,7 +1176,6 @@ def run(self): wl_measures_readability.coleman_liau_index(self.main, text), wl_measures_readability.dale_chall_readability_score(self.main, text), wl_measures_readability.devereux_readability_index(self.main, text), - wl_measures_readability.fernandez_huertas_readability_score(self.main, text), wl_measures_readability.flesch_kincaid_grade_level(self.main, text), wl_measures_readability.flesch_reading_ease(self.main, text), wl_measures_readability.flesch_reading_ease_simplified(self.main, text), @@ -1195,7 +1192,6 @@ def run(self): wl_measures_readability.rix(self.main, text), wl_measures_readability.smog_grade(self.main, text), wl_measures_readability.spache_grade_level(self.main, text), - wl_measures_readability.szigriszts_perspicuity_index(self.main, text), wl_measures_readability.wiener_sachtextformel(self.main, text) ] else: diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index c86e25590..f78fbfd2a 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -1741,6 +1741,10 @@ def init_settings_default(main): 'measures': { # Settings - 
Measures - Readability 'readability': { + 're': { + 'variant_spa': 'Fernández Huerta' + }, + 'wstf': { 'variant': '1' } diff --git a/wordless/wl_settings/wl_settings_measures.py b/wordless/wl_settings/wl_settings_measures.py index c3f3ea692..9131e529c 100644 --- a/wordless/wl_settings/wl_settings_measures.py +++ b/wordless/wl_settings/wl_settings_measures.py @@ -31,25 +31,43 @@ def __init__(self, main): self.settings_default = self.main.settings_default['measures']['readability'] self.settings_custom = self.main.settings_custom['measures']['readability'] + # Flesch Reading Ease + self.group_box_re = QGroupBox(self.tr('Flesch Reading Ease'), self) + + self.label_re_variant_spa = QLabel(self.tr('Spanish variant:'), self) + self.combo_box_re_variant_spa = wl_boxes.Wl_Combo_Box(self) + + self.combo_box_re_variant_spa.addItems([ + 'Fernández Huerta', + 'Szigriszt Pazos' + ]) + + self.group_box_re.setLayout(wl_layouts.Wl_Layout()) + self.group_box_re.layout().addWidget(self.label_re_variant_spa, 0, 0) + self.group_box_re.layout().addWidget(self.combo_box_re_variant_spa, 0, 1) + + self.group_box_re.layout().setColumnStretch(2, 1) + # Wiener Sachtextformel self.group_box_wstf = QGroupBox(self.tr('Wiener Sachtextformel'), self) - self.label_variant = QLabel(self.tr('Variant:'), self) - self.combo_box_variant = wl_boxes.Wl_Combo_Box(self) + self.label_wstf_variant = QLabel(self.tr('Variant:'), self) + self.combo_box_wstf_variant = wl_boxes.Wl_Combo_Box(self) - self.combo_box_variant.addItems(['1', '2', '3', '4']) + self.combo_box_wstf_variant.addItems(['1', '2', '3', '4']) self.group_box_wstf.setLayout(wl_layouts.Wl_Layout()) - self.group_box_wstf.layout().addWidget(self.label_variant, 0, 0) - self.group_box_wstf.layout().addWidget(self.combo_box_variant, 0, 1) + self.group_box_wstf.layout().addWidget(self.label_wstf_variant, 0, 0) + self.group_box_wstf.layout().addWidget(self.combo_box_wstf_variant, 0, 1) self.group_box_wstf.layout().setColumnStretch(2, 1) 
self.setLayout(wl_layouts.Wl_Layout()) - self.layout().addWidget(self.group_box_wstf, 0, 0) + self.layout().addWidget(self.group_box_re, 0, 0) + self.layout().addWidget(self.group_box_wstf, 1, 0) self.layout().setContentsMargins(6, 4, 6, 4) - self.layout().setRowStretch(1, 1) + self.layout().setRowStretch(2, 1) def load_settings(self, defaults = False): if defaults: @@ -57,12 +75,18 @@ def load_settings(self, defaults = False): else: settings = copy.deepcopy(self.settings_custom) + # Flesch Reading Ease + self.combo_box_re_variant_spa.setCurrentText(settings['re']['variant_spa']) + # Wiener Sachtextformel - self.combo_box_variant.setCurrentText(settings['wstf']['variant']) + self.combo_box_wstf_variant.setCurrentText(settings['wstf']['variant']) def apply_settings(self): + # Flesch Reading Ease + self.settings_custom['re']['variant_spa'] = self.combo_box_re_variant_spa.currentText() + # Wiener Sachtextformel - self.settings_custom['wstf']['variant'] = self.combo_box_variant.currentText() + self.settings_custom['wstf']['variant'] = self.combo_box_wstf_variant.currentText() return True