From 815cc98104dad14b17e1031ef64285d2bd84b1be Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Fri, 5 Jan 2024 23:30:13 +0800 Subject: [PATCH] =?UTF-8?q?Work=20Area:=20Add=20Profiler=20-=20Lexical=20D?= =?UTF-8?q?iversity=20-=20Brun=C3=A9t's=20Index=20/=20Honor=C3=A9's=20stat?= =?UTF-8?q?istic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + doc/doc.md | 13 ++++ .../lexical_diversity/brunets_index.svg | 58 +++++++++++++++ .../lexical_diversity/honores_stat.svg | 71 +++++++++++++++++++ tests/test_profiler.py | 2 +- .../test_measures_lexical_diversity.py | 12 ++++ .../wl_measures_lexical_diversity.py | 24 +++++++ wordless/wl_profiler.py | 6 +- 8 files changed, 184 insertions(+), 3 deletions(-) create mode 100644 doc/measures/lexical_diversity/brunets_index.svg create mode 100644 doc/measures/lexical_diversity/honores_stat.svg diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e71625ac..196eb7068 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ ### 🎉 New Features - Utils: Add Stanza's Sindhi part-of-speech tagger - Utils: Add VADER's sentiment analyzers +- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic ### 📌 Bugfixes - Utils: Fix downloading of Stanza models diff --git a/doc/doc.md b/doc/doc.md index 929f39ea3..a859892ae 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -1215,6 +1215,9 @@ The following variables would be used in formulas:
**NumTokens**: Number of tokens
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/lexical_diversity/honores_stat.svg b/doc/measures/lexical_diversity/honores_stat.svg new file mode 100644 index 000000000..9cfda980a --- /dev/null +++ b/doc/measures/lexical_diversity/honores_stat.svg @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_profiler.py b/tests/test_profiler.py index b585a7d67..f96ffaf10 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -136,7 +136,7 @@ def update_gui(err_msg, texts_stats_files): assert all((len_syls == 1 for len_syls in len_types_syls)) # Lexical Diversity - assert len(stats_lexical_diversity) == 25 + assert len(stats_lexical_diversity) == 27 for i, lexical_diversity in enumerate(stats_lexical_diversity): assert ( diff --git a/tests/tests_measures/test_measures_lexical_diversity.py b/tests/tests_measures/test_measures_lexical_diversity.py index acc63be01..b2d63d1f3 100644 --- a/tests/tests_measures/test_measures_lexical_diversity.py +++ b/tests/tests_measures/test_measures_lexical_diversity.py @@ -33,6 +33,11 @@ # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 26). Mouton de Gruyter. 
TOKENS_225 = [1] * 11 + [2, 3] * 9 + [4] * 7 + [5, 6] * 6 + [7, 8] * 5 + list(range(9, 16)) * 4 + list(range(16, 22)) * 3 + list(range(22, 40)) * 2 + list(range(40, 125)) +def test_brunets_index(): + w = wl_measures_lexical_diversity.brunets_index(main, TOKENS_100) + + assert w == numpy.power(100, numpy.power(5, -0.165)) + def test_cttr(): cttr = wl_measures_lexical_diversity.cttr(main, TOKENS_100) @@ -55,6 +60,11 @@ def test_hdd(): assert hdd_100 == (1 - scipy.stats.hypergeom.pmf(k = 0, M = 100, n = 20, N = 42)) * (1 / 42) * 5 +def test_honores_stat(): + r = wl_measures_lexical_diversity.honores_stat(main, TOKENS_100) + + assert r == 100 * numpy.log(100 / (1 - 0 / 5)) + def test_logttr(): settings['logttr']['variant'] = 'Herdan' logttr_herdan = wl_measures_lexical_diversity.logttr(main, TOKENS_100) @@ -185,10 +195,12 @@ def test_yules_index_of_diversity(): assert index_of_diversity == (100 ** 2) / (5 * 20 ** 2 - 100) if __name__ == '__main__': + test_brunets_index() test_cttr() test_fishers_index_of_diversity() test_herdans_vm() test_hdd() + test_honores_stat() test_logttr() test_msttr() test_mtld() diff --git a/wordless/wl_measures/wl_measures_lexical_diversity.py b/wordless/wl_measures/wl_measures_lexical_diversity.py index aa8c34272..4602b73bf 100644 --- a/wordless/wl_measures/wl_measures_lexical_diversity.py +++ b/wordless/wl_measures/wl_measures_lexical_diversity.py @@ -29,6 +29,13 @@ _tr = QCoreApplication.translate +# Brunét's Index +# References: +# Brunét, E. (1978). Le vocabulaire de Jean Giraudoux: Structure et evolution. Slatkine. +# Bucks, R. S., Singh, S., Cuerden, J. M., & Wilcock, G. K. (2000). Analysis of spontaneous, conversational speech in dementia of Alzheimer type: Evaluation of an objective technique for analysing lexical performance. Aphasiology, 14(1), 71–91.
https://doi.org/10.1080/026870300401603 +def brunets_index(main, tokens): + return numpy.power(len(tokens), numpy.power(len(set(tokens)), -0.165)) + # Corrected TTR # References: # Carroll, J. B. (1964). Language and thought. Prentice-Hall. @@ -99,6 +106,23 @@ def hdd(main, tokens): return sum(ttrs) +# Honoré's statistic +# References: +# Honoré, A. (1979). Some simple measures of richness of vocabulary. Association of Literary and Linguistic Computing Bulletin, 7(2), 172–177. +# Bucks, R. S., Singh, S., Cuerden, J. M., & Wilcock, G. K. (2000). Analysis of spontaneous, conversational speech in dementia of Alzheimer type: Evaluation of an objective technique for analysing lexical performance. Aphasiology, 14(1), 71–91. https://doi.org/10.1080/026870300401603 +def honores_stat(main, tokens): + num_tokens = len(tokens) + types_freqs = collections.Counter(tokens) + num_types = len(types_freqs) + freqs_nums_types = collections.Counter(types_freqs.values()) + + if (denominator := 1 - freqs_nums_types[1] / num_types): + r = 100 * numpy.log(num_tokens / denominator) + else: + r = 0 + + return r + # LogTTR # Herdan: # Herdan, G. (1960). Type-token mathematics: A textbook of mathematical linguistics (p. 28). Mouton.
diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py index 6c218eb8d..b26f6623f 100644 --- a/wordless/wl_profiler.py +++ b/wordless/wl_profiler.py @@ -1294,15 +1294,17 @@ def run(self): if self.profiler_tab in ['lexical_diversity', 'all']: if tokens: stats_lexical_diversity = [ + wl_measures_lexical_diversity.brunets_index(self.main, tokens), wl_measures_lexical_diversity.cttr(self.main, tokens), wl_measures_lexical_diversity.fishers_index_of_diversity(self.main, tokens), wl_measures_lexical_diversity.herdans_vm(self.main, tokens), wl_measures_lexical_diversity.hdd(self.main, tokens), + wl_measures_lexical_diversity.honores_stat(self.main, tokens), wl_measures_lexical_diversity.logttr(self.main, tokens), wl_measures_lexical_diversity.msttr(self.main, tokens), wl_measures_lexical_diversity.mtld(self.main, tokens), wl_measures_lexical_diversity.mattr(self.main, tokens), - * wl_measures_lexical_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(self.main, tokens), + *wl_measures_lexical_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(self.main, tokens), wl_measures_lexical_diversity.popescus_r1(self.main, tokens), wl_measures_lexical_diversity.popescus_r2(self.main, tokens), wl_measures_lexical_diversity.popescus_r3(self.main, tokens), @@ -1317,7 +1319,7 @@ def run(self): wl_measures_lexical_diversity.yules_index_of_diversity(self.main, tokens) ] else: - stats_lexical_diversity = [0] * 25 + stats_lexical_diversity = [0] * 27 else: stats_lexical_diversity = None