diff --git a/CHANGELOG.md b/CHANGELOG.md index e180c15c5..85fed6c14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ - Work Area: Fix Dependency Parser - analysis of files whose first token is a punctuation mark ### ❌ Removals +- Menu: Remove Settings - Measures - Statistical Significance - Welch's t-test +- Work Area: Remove Collocation Extractor / Colligation Extractor / Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test - Utils: Remove Dostoevsky's Russian sentiment analyzer ### ⏫ Dependency Changes diff --git a/doc/doc.md b/doc/doc.md index 63692c0d2..15029c3cc 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -39,8 +39,8 @@ - [4.4 Supported Measures](#doc-4-4) - [4.4.1 Measures of Readability](#doc-4-4-1) - [4.4.2 Measures of Lexical Diversity](#doc-4-4-2) - - [4.4.3 Measures of Dispersion & Adjusted Frequency](#doc-4-4-3) - - [4.4.4 Tests of Statistical Significance, Measures of Bayes Factor, & Measures of Effect Size](#doc-4-4-4) + - [4.4.3 Measures of Dispersion and Adjusted Frequency](#doc-4-4-3) + - [4.4.4 Tests of Statistical Significance, Measures of Bayes Factor, and Measures of Effect Size](#doc-4-4-4) - [5 References](#doc-5) @@ -1272,7 +1272,7 @@ Measure of Lexical Diversity|Formula > 1. Variants available and can be selected via **Menu - Preferences - Settings - Measures - Lexical Diversity** -#### [4.4.3 Measures of Dispersion & Adjusted Frequency](#doc) +#### [4.4.3 Measures of Dispersion and Adjusted Frequency](#doc) For parts-based measures, each file is divided into **n** (whose value you could modify via **Menu → Preferences → Settings → Measures → Dispersion / Adjusted Frequency → General Settings → Divide each file into subsections**) sub-sections and the frequency of the word in each part is counted and denoted by **F₁**, **F₂**, **F₃**, ..., **Fₙ** respectively. The total frequency of the word in each file is denoted by **F** and the mean value of the frequencies over all sub-sections is denoted by **F̅**. @@ -1357,7 +1357,7 @@ Measure of Dispersion (Distance-based)|Measure of Adjusted Frequency (Distance-b Average Waiting Time
([Savický & Hlaváčová, 2002](#ref-savicky-hlavacova-2002))|Average Waiting Time
([Savický & Hlaváčová, 2002](#ref-savicky-hlavacova-2002))|![Formula](/doc/measures/dispersion_adjusted_frequency/awt.svg) -#### [4.4.4 Tests of Statistical Significance, Measures of Bayes Factor, & Measures of Effect Size](#doc) +#### [4.4.4 Tests of Statistical Significance, Measures of Bayes Factor, and Measures of Effect Size](#doc) In order to calculate the statistical significance, Bayes factor, and effect size (except **Mann-Whitney U Test**, **Student's t-test (2-sample)**, and **Welch's t-test**) for two words in the same file (collocates) or for one specific word in two different files (keywords), two contingency tables must be constructed first, one for observed values, the other for expected values. @@ -1443,7 +1443,6 @@ Test of Statistical Significance|Measure of Bayes Factor|Formula Pearson's Chi-squared Test
([Hofland & Johanson, 1982](#ref-hofland-johanson-1982); [Oakes, 1998](#ref-oakes-1998))||![Formula](/doc/measures/statistical_significance/pearsons_chi_squared_test.svg) Student's t-test (1-sample)
([Church et al., 1991](#ref-church-et-al-1991))||![Formula](/doc/measures/statistical_significance/students_t_test_1_sample.svg) Student's t-test (2-sample)
([Paquot & Bestgen, 2009](#ref-paquot-bestgen-2009))|Student's t-test (2-sample)
([Wilson, 2013](#ref-wilson-2013))|![Formula](/doc/measures/statistical_significance/students_t_test_2_sample.svg) -Welch's t-test||* Same as Student's t-test (2-sample), but with different degrees of freedom (hence a different p-value). z-score
([Dennis, 1964](#ref-dennis-1964))||![Formula](/doc/measures/statistical_significance/z_score.svg) z-score (Berry-Rogghe)
([Berry-Rogghe, 1973](#ref-berry-rogghe-1973))||![Formula](/doc/measures/statistical_significance/z_score_berry_rogghe.svg)
where **S** is the average span size on both sides of the node word. diff --git a/tests/tests_measures/test_measure_utils.py b/tests/tests_measures/test_measure_utils.py index 4042c143b..265d04a0c 100644 --- a/tests/tests_measures/test_measure_utils.py +++ b/tests/tests_measures/test_measure_utils.py @@ -85,10 +85,6 @@ def test_to_freqs_sections_statistical_significance(): main, ITEMS_TO_SEARCH, ITEMS_X1, ITEMS_X2, test_statistical_significance = 'students_t_test_2_sample' ) == FREQS_SECTIONS_2_SAMPLE_RELATIVE - assert wl_measure_utils.to_freqs_sections_statistical_significance( - main, ITEMS_TO_SEARCH, ITEMS_X1, ITEMS_X2, - test_statistical_significance = 'welchs_t_test' - ) == FREQS_SECTIONS_2_SAMPLE_RELATIVE def test_to_freqs_sections_bayes_factor(): assert wl_measure_utils.to_freqs_sections_bayes_factor( diff --git a/tests/tests_measures/test_measures_statistical_significance.py b/tests/tests_measures/test_measures_statistical_significance.py index 12129f70c..52462dfc3 100644 --- a/tests/tests_measures/test_measures_statistical_significance.py +++ b/tests/tests_measures/test_measures_statistical_significance.py @@ -207,16 +207,6 @@ def test_students_t_test_2_sample(): numpy.testing.assert_array_equal(t_stats, numpy.array([0] * 2)) numpy.testing.assert_array_equal(p_vals, numpy.array([1] * 2)) -def test_welchs_t_test(): - t_stats, p_vals = wl_measures_statistical_significance.welchs_t_test( - main, - numpy.array([[0] * 5] * 2), - numpy.array([[0] * 5] * 2) - ) - - numpy.testing.assert_array_equal(t_stats, numpy.array([0] * 2)) - numpy.testing.assert_array_equal(p_vals, numpy.array([1] * 2)) - def test__z_score_p_val(): numpy.testing.assert_array_equal( wl_measures_statistical_significance._z_score_p_val(numpy.array([0] * 2), 'Two-tailed'), @@ -260,7 +250,6 @@ def test_z_score_berry_rogghe(): test__students_t_test_2_sample_alt() test_students_t_test_2_sample() - test_welchs_t_test() test__z_score_p_val() test_z_score() diff --git a/wordless/wl_measures/wl_measure_utils.py b/wordless/wl_measures/wl_measure_utils.py index 0eb465d90..370bbf9dd 100644 --- a/wordless/wl_measures/wl_measure_utils.py +++ b/wordless/wl_measures/wl_measure_utils.py @@ -107,9 +107,6 @@ def to_freqs_sections_statistical_significance(main, items_to_search, items_x1, elif test_statistical_significance == 'students_t_test_2_sample': num_sub_sections = main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']['num_sub_sections'] use_data = main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']['use_data'] - elif test_statistical_significance == 'welchs_t_test': - num_sub_sections = main.settings_custom['measures']['statistical_significance']['welchs_t_test']['num_sub_sections'] - use_data = main.settings_custom['measures']['statistical_significance']['welchs_t_test']['use_data'] return to_freqs_sections_2_sample(items_to_search, items_x1, items_x2, num_sub_sections, use_data) diff --git a/wordless/wl_measures/wl_measures_statistical_significance.py b/wordless/wl_measures/wl_measures_statistical_significance.py index 9ee65cc73..29836f010 100644 --- a/wordless/wl_measures/wl_measures_statistical_significance.py +++ b/wordless/wl_measures/wl_measures_statistical_significance.py @@ -211,26 +211,6 @@ def students_t_test_2_sample(main, freqs_x1s, freqs_x2s): return t_stats, p_vals -def welchs_t_test(main, freqs_x1s, freqs_x2s): - direction = main.settings_custom['measures']['statistical_significance']['welchs_t_test']['direction'] - alt = _students_t_test_2_sample_alt(direction) - - num_types = len(freqs_x1s) - t_stats = numpy.empty(shape = num_types, dtype = numpy.float64) - p_vals = numpy.empty(shape = num_types, dtype = numpy.float64) - - for i, (freqs_x1, freqs_x2) in enumerate(zip(freqs_x1s, freqs_x2s)): - if any(freqs_x1) or any(freqs_x2): - t_stat, p_val = scipy.stats.ttest_ind(freqs_x1, freqs_x2, equal_var = False, alternative = alt) - else: - t_stat = 0 - p_val = 1 - - t_stats[i] = t_stat - p_vals[i] = p_val - - return t_stats, p_vals - def _z_score_p_val(z_scores, direction): p_vals = numpy.empty_like(z_scores) diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index b587d66d0..ceb13e278 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -2292,12 +2292,6 @@ def init_settings_default(main): 'direction': _tr('wl_settings_default', 'Two-tailed') }, - 'welchs_t_test': { - 'num_sub_sections': 5, - 'use_data': _tr('wl_settings_default', 'Relative frequency'), - 'direction': _tr('wl_settings_default', 'Two-tailed') - }, - 'z_score': { 'direction': _tr('wl_settings_default', 'Two-tailed') }, diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index e6059ed42..b818da39d 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -3534,7 +3534,6 @@ _tr('wl_settings_global', "Pearson's chi-squared test"): 'pearsons_chi_squared_test', _tr('wl_settings_global', "Student's t-test (1-sample)"): 'students_t_test_1_sample', _tr('wl_settings_global', "Student's t-test (2-sample)"): 'students_t_test_2_sample', - _tr('wl_settings_global', "Welch's t-test"): 'welchs_t_test', _tr('wl_settings_global', 'z-score'): 'z_score', _tr('wl_settings_global', 'z-score (Berry-Rogghe)'): 'z_score_berry_rogghe' }, @@ -3744,14 +3743,6 @@ 'keyword_extractor': True }, - 'welchs_t_test': { - 'col_text': _tr('wl_settings_global', 't-statistic'), - 'func': wl_measures_statistical_significance.welchs_t_test, - 'to_sections': True, - 'collocation_extractor': False, - 'keyword_extractor': True - }, - 'z_score': { 'col_text': _tr('wl_settings_global', 'z-score'), 'func': wl_measures_statistical_significance.z_score, diff --git a/wordless/wl_settings/wl_settings_measures.py b/wordless/wl_settings/wl_settings_measures.py index e9650e47b..527fa9c87 100644 --- a/wordless/wl_settings/wl_settings_measures.py +++ b/wordless/wl_settings/wl_settings_measures.py @@ -760,39 +760,6 @@ def __init__(self, main): self.group_box_students_t_test_2_sample.layout().setColumnStretch(2, 1) - # Welch's t-test - self.group_box_welchs_t_test = QGroupBox(self.tr("Welch's t-test"), self) - - ( - self.label_welchs_t_test_divide_each_file_into, - self.spin_box_welchs_t_test_num_sub_sections, - self.label_welchs_t_test_sub_sections - ) = wl_widgets.wl_widgets_num_sub_sections(self) - ( - self.label_welchs_t_test_use_data, - self.combo_box_welchs_t_test_use_data - ) = wl_widgets.wl_widgets_use_data_freq(self) - ( - self.label_welchs_t_test_direction, - self.combo_box_welchs_t_test_direction - ) = wl_widgets.wl_widgets_direction(self) - - layout_welchs_t_test_num_sub_sections = wl_layouts.Wl_Layout() - layout_welchs_t_test_num_sub_sections.addWidget(self.label_welchs_t_test_divide_each_file_into, 0, 0) - layout_welchs_t_test_num_sub_sections.addWidget(self.spin_box_welchs_t_test_num_sub_sections, 0, 1) - layout_welchs_t_test_num_sub_sections.addWidget(self.label_welchs_t_test_sub_sections, 0, 2) - - layout_welchs_t_test_num_sub_sections.setColumnStretch(3, 1) - - self.group_box_welchs_t_test.setLayout(wl_layouts.Wl_Layout()) - self.group_box_welchs_t_test.layout().addLayout(layout_welchs_t_test_num_sub_sections, 0, 0, 1, 3) - self.group_box_welchs_t_test.layout().addWidget(self.label_welchs_t_test_use_data, 1, 0) - self.group_box_welchs_t_test.layout().addWidget(self.combo_box_welchs_t_test_use_data, 1, 1) - self.group_box_welchs_t_test.layout().addWidget(self.label_welchs_t_test_direction, 2, 0) - self.group_box_welchs_t_test.layout().addWidget(self.combo_box_welchs_t_test_direction, 2, 1) - - self.group_box_welchs_t_test.layout().setColumnStretch(2, 1) - # z-score self.group_box_z_score = QGroupBox(self.tr('z-score'), self) @@ -828,12 +795,11 @@ def __init__(self, main): self.layout().addWidget(self.group_box_pearsons_chi_squared_test, 3, 0) self.layout().addWidget(self.group_box_students_t_test_1_sample, 4, 0) self.layout().addWidget(self.group_box_students_t_test_2_sample, 5, 0) - self.layout().addWidget(self.group_box_welchs_t_test, 6, 0) - self.layout().addWidget(self.group_box_z_score, 7, 0) - self.layout().addWidget(self.group_box_z_score_berry_rogghe, 8, 0) + self.layout().addWidget(self.group_box_z_score, 6, 0) + self.layout().addWidget(self.group_box_z_score_berry_rogghe, 7, 0) self.layout().setContentsMargins(6, 4, 6, 4) - self.layout().setRowStretch(9, 1) + self.layout().setRowStretch(8, 1) def load_settings(self, defaults = False): if defaults: @@ -864,11 +830,6 @@ def load_settings(self, defaults = False): self.combo_box_students_t_test_2_sample_use_data.setCurrentText(settings['students_t_test_2_sample']['use_data']) self.combo_box_students_t_test_2_sample_direction.setCurrentText(settings['students_t_test_2_sample']['direction']) - # Welch's t-test - self.spin_box_welchs_t_test_num_sub_sections.setValue(settings['welchs_t_test']['num_sub_sections']) - self.combo_box_welchs_t_test_use_data.setCurrentText(settings['welchs_t_test']['use_data']) - self.combo_box_welchs_t_test_direction.setCurrentText(settings['welchs_t_test']['direction']) - # z-score self.combo_box_z_score_direction.setCurrentText(settings['z_score']['direction']) @@ -899,11 +860,6 @@ def apply_settings(self): self.settings_custom['students_t_test_2_sample']['use_data'] = self.combo_box_students_t_test_2_sample_use_data.currentText() self.settings_custom['students_t_test_2_sample']['direction'] = self.combo_box_students_t_test_2_sample_direction.currentText() - # Welch's t-test - self.settings_custom['welchs_t_test']['num_sub_sections'] = self.spin_box_welchs_t_test_num_sub_sections.value() - self.settings_custom['welchs_t_test']['use_data'] = self.combo_box_welchs_t_test_use_data.currentText() - self.settings_custom['welchs_t_test']['direction'] = self.combo_box_welchs_t_test_direction.currentText() - # z-score self.settings_custom['z_score']['direction'] = self.combo_box_z_score_direction.currentText()