diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e71625ac..196eb7068 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@
### 🎉 New Features
- Utils: Add Stanza's Sindhi part-of-speech tagger
- Utils: Add VADER's sentiment analyzers
+- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic
### 📌 Bugfixes
- Utils: Fix downloading of Stanza models
diff --git a/doc/doc.md b/doc/doc.md
index 929f39ea3..a859892ae 100644
--- a/doc/doc.md
+++ b/doc/doc.md
@@ -1215,6 +1215,9 @@ The following variables would be used in formulas:
**NumTokens**: Number of tokens
+
\ No newline at end of file
diff --git a/doc/measures/lexical_diversity/honores_stat.svg b/doc/measures/lexical_diversity/honores_stat.svg
new file mode 100644
index 000000000..9cfda980a
--- /dev/null
+++ b/doc/measures/lexical_diversity/honores_stat.svg
@@ -0,0 +1,71 @@
+
+
+
\ No newline at end of file
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index b585a7d67..f96ffaf10 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -136,7 +136,7 @@ def update_gui(err_msg, texts_stats_files):
assert all((len_syls == 1 for len_syls in len_types_syls))
# Lexical Diversity
- assert len(stats_lexical_diversity) == 25
+ assert len(stats_lexical_diversity) == 27
for i, lexical_diversity in enumerate(stats_lexical_diversity):
assert (
diff --git a/tests/tests_measures/test_measures_lexical_diversity.py b/tests/tests_measures/test_measures_lexical_diversity.py
index acc63be01..b2d63d1f3 100644
--- a/tests/tests_measures/test_measures_lexical_diversity.py
+++ b/tests/tests_measures/test_measures_lexical_diversity.py
@@ -33,6 +33,11 @@
# Reference: Popescu, I.-I. (2009). Word frequency studies (p. 26). Mouton de Gruyter.
TOKENS_225 = [1] * 11 + [2, 3] * 9 + [4] * 7 + [5, 6] * 6 + [7, 8] * 5 + list(range(9, 16)) * 4 + list(range(16, 22)) * 3 + list(range(22, 40)) * 2 + list(range(40, 125))
+def test_brunets_index():
+    # The expected value assumes TOKENS_100 has 100 tokens and 5 types,
+    # as implied by the constants used in the assertion
+    num_tokens, num_types = 100, 5
+    expected_w = numpy.power(num_tokens, numpy.power(num_types, -0.165))
+
+    assert wl_measures_lexical_diversity.brunets_index(main, TOKENS_100) == expected_w
+
def test_cttr():
cttr = wl_measures_lexical_diversity.cttr(main, TOKENS_100)
@@ -55,6 +60,11 @@ def test_hdd():
assert hdd_100 == (1 - scipy.stats.hypergeom.pmf(k = 0, M = 100, n = 20, N = 42)) * (1 / 42) * 5
+def test_honores_stat():
+    r = wl_measures_lexical_diversity.honores_stat(main, TOKENS_100)
+
+    # The expected value uses N = 100, V = 5, and V1 = 0 (no hapax legomena)
+    assert r == 100 * numpy.log(100 / (1 - 0 / 5))
+
+    # When every type occurs exactly once, 1 - V1 / V is 0 and the measure falls back to 0
+    tokens_all_hapaxes = list(range(10))
+
+    assert wl_measures_lexical_diversity.honores_stat(main, tokens_all_hapaxes) == 0
+
def test_logttr():
settings['logttr']['variant'] = 'Herdan'
logttr_herdan = wl_measures_lexical_diversity.logttr(main, TOKENS_100)
@@ -185,10 +195,12 @@ def test_yules_index_of_diversity():
assert index_of_diversity == (100 ** 2) / (5 * 20 ** 2 - 100)
if __name__ == '__main__':
+ test_brunets_index()
test_cttr()
test_fishers_index_of_diversity()
test_herdans_vm()
test_hdd()
+ test_honores_stat()
test_logttr()
test_msttr()
test_mtld()
diff --git a/wordless/wl_measures/wl_measures_lexical_diversity.py b/wordless/wl_measures/wl_measures_lexical_diversity.py
index aa8c34272..4602b73bf 100644
--- a/wordless/wl_measures/wl_measures_lexical_diversity.py
+++ b/wordless/wl_measures/wl_measures_lexical_diversity.py
@@ -29,6 +29,13 @@
_tr = QCoreApplication.translate
+# Brunét's Index
+# References:
+#     Brunét, E. (1978). Le vocabulaire de Jean Giraudoux: Structure et évolution. Slatkine.
+#     Bucks, R. S., Singh, S., Cuerden, J. M., & Wilcock, G. K. (2000). Analysis of spontaneous, conversational speech in dementia of Alzheimer type: Evaluation of an objective technique for analysing lexical performance. Aphasiology, 14(1), 71–91. https://doi.org/10.1080/026870300401603
+def brunets_index(main, tokens):
+    num_tokens = len(tokens)
+    num_types = len(set(tokens))
+
+    # W = N ^ (V ^ -0.165), where N is the number of tokens and V is the number of types
+    exponent = numpy.power(num_types, -0.165)
+
+    return numpy.power(num_tokens, exponent)
+
# Corrected TTR
# References:
# Carroll, J. B. (1964). Language and thought. Prentice-Hall.
@@ -99,6 +106,23 @@ def hdd(main, tokens):
return sum(ttrs)
+# Honoré's statistic
+# R = 100 * ln(N) / (1 - V1 / V), where N is the number of tokens, V is the number of types,
+# and V1 is the number of types occurring exactly once (hapax legomena)
+# Returns 0 for empty input and when every type is a hapax legomenon (R is undefined in both cases)
+# References:
+#     Honoré, A. (1979). Some simple measures of richness of vocabulary. Association of Literary and Linguistic Computing Bulletin, 7(2), 172–177.
+#     Bucks, R. S., Singh, S., Cuerden, J. M., & Wilcock, G. K. (2000). Analysis of spontaneous, conversational speech in dementia of Alzheimer type: Evaluation of an objective technique for analysing lexical performance. Aphasiology, 14(1), 71–91. https://doi.org/10.1080/026870300401603
+def honores_stat(main, tokens):
+    num_tokens = len(tokens)
+    types_freqs = collections.Counter(tokens)
+    num_types = len(types_freqs)
+
+    # Empty input has no types, so the ratio V1 / V cannot be computed
+    if not num_types:
+        return 0
+
+    # Number of types that occur exactly once
+    num_hapaxes = sum(1 for freq in types_freqs.values() if freq == 1)
+
+    # A zero denominator means that every type is a hapax legomenon
+    if (denominator := 1 - num_hapaxes / num_types):
+        # The logarithm applies to the number of tokens only, not to the whole quotient
+        r = 100 * numpy.log(num_tokens) / denominator
+    else:
+        r = 0
+
+    return r
+
# LogTTR
# Herdan:
# Herdan, G. (1960). Type-token mathematics: A textbook of mathematical linguistics (p. 28). Mouton.
diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py
index 6c218eb8d..b26f6623f 100644
--- a/wordless/wl_profiler.py
+++ b/wordless/wl_profiler.py
@@ -1294,15 +1294,17 @@ def run(self):
if self.profiler_tab in ['lexical_diversity', 'all']:
if tokens:
stats_lexical_diversity = [
+ wl_measures_lexical_diversity.brunets_index(self.main, tokens),
wl_measures_lexical_diversity.cttr(self.main, tokens),
wl_measures_lexical_diversity.fishers_index_of_diversity(self.main, tokens),
wl_measures_lexical_diversity.herdans_vm(self.main, tokens),
wl_measures_lexical_diversity.hdd(self.main, tokens),
+ wl_measures_lexical_diversity.honores_stat(self.main, tokens),
wl_measures_lexical_diversity.logttr(self.main, tokens),
wl_measures_lexical_diversity.msttr(self.main, tokens),
wl_measures_lexical_diversity.mtld(self.main, tokens),
wl_measures_lexical_diversity.mattr(self.main, tokens),
- * wl_measures_lexical_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(self.main, tokens),
+ *wl_measures_lexical_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(self.main, tokens),
wl_measures_lexical_diversity.popescus_r1(self.main, tokens),
wl_measures_lexical_diversity.popescus_r2(self.main, tokens),
wl_measures_lexical_diversity.popescus_r3(self.main, tokens),
@@ -1317,7 +1319,7 @@ def run(self):
wl_measures_lexical_diversity.yules_index_of_diversity(self.main, tokens)
]
else:
- stats_lexical_diversity = [0] * 25
+ stats_lexical_diversity = [0] * 27
else:
stats_lexical_diversity = None