From c487c9139bd0fa8d75977583ff39b8656f98d932 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Mon, 20 May 2024 01:15:57 +0800 Subject: [PATCH] Work Area: Add Collocation/Colligation Extractor - Filter results - Node/Collocation length --- CHANGELOG.md | 3 +- .../test_file_area_file_types.py | 8 +- tests/tests_utils/test_detection.py | 4 +- wordless/wl_colligation_extractor.py | 20 +- wordless/wl_collocation_extractor.py | 2 +- wordless/wl_concordancer.py | 53 ++++- wordless/wl_concordancer_parallel.py | 46 ++-- wordless/wl_dependency_parser.py | 58 +++-- wordless/wl_keyword_extractor.py | 4 +- wordless/wl_ngram_generator.py | 2 +- wordless/wl_nlp/wl_dependency_parsing.py | 14 +- wordless/wl_nlp/wl_texts.py | 99 +++++--- wordless/wl_nlp/wl_token_processing.py | 151 +++++++----- wordless/wl_profiler.py | 2 +- wordless/wl_results/wl_results_filter.py | 219 ++++++++++++------ wordless/wl_results/wl_results_search.py | 46 ++-- wordless/wl_settings/wl_settings_default.py | 22 +- wordless/wl_utils/wl_detection.py | 1 + wordless/wl_widgets/wl_tables.py | 124 ++++------ wordless/wl_widgets/wl_widgets.py | 56 ++--- 20 files changed, 570 insertions(+), 364 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c21f5f2e0..b707bd15c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ - Utils: Add PyThaiNLP's Han-solo - Utils: Add Stanza's Sindhi part-of-speech tagger - Utils: Add VADER's sentiment analyzers +- Work Area: Add Collocation/Colligation Extractor - Filter results / - Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic ### ✨ Improvements @@ -40,7 +41,7 @@ ### ❌ Removals - Menu: Remove Settings - Measures - Statistical Significance - Welch's t-test -- Work Area: Remove Collocation Extractor / Colligation Extractor / Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test +- Work Area: Remove Collocation/Colligation/Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test - Utils: Remove Dostoevsky's Russian sentiment analyzer ### ⏫ Dependency Changes diff --git a/tests/tests_file_area/test_file_area_file_types.py b/tests/tests_file_area/test_file_area_file_types.py index 04e1d3daf..82dde64d3 100644 --- a/tests/tests_file_area/test_file_area_file_types.py +++ b/tests/tests_file_area/test_file_area_file_types.py @@ -153,7 +153,7 @@ def update_gui_file_types(err_msg, new_files): file_text = new_files[0]['text'] tokens = file_text.to_token_texts() - tags = file_text.get_token_properties('tag') + tags = file_text.get_token_properties('tag', flat = True) print(tokens) @@ -192,7 +192,7 @@ def update_gui_file_types(err_msg, new_files): file_text_tgt = new_files[1]['text'] tokens_src = file_text_src.to_token_texts() - tags_src = file_text_src.get_token_properties('tag') + tags_src = file_text_src.get_token_properties('tag', flat = True) # Source files print(file_text_src.lang) @@ -204,7 +204,7 @@ def update_gui_file_types(err_msg, new_files): # Target files tokens_tgt = file_text_tgt.to_token_texts() - tags_tgt = file_text_tgt.get_token_properties('tag') + tags_tgt = file_text_tgt.get_token_properties('tag', flat = True) print(file_text_tgt.lang) print(tokens_tgt) @@ -226,7 +226,7 @@ def update_gui_tags(err_msg, new_files): file_text = new_files[0]['text'] tokens = file_text.to_token_texts() - tags = file_text.get_token_properties('tag') + tags = file_text.get_token_properties('tag', flat = True) print(tokens) print(tags) diff --git a/tests/tests_utils/test_detection.py 
b/tests/tests_utils/test_detection.py index 08e22e436..c26fcbf17 100644 --- a/tests/tests_utils/test_detection.py +++ b/tests/tests_utils/test_detection.py @@ -290,11 +290,11 @@ def test_lingua(): langs_exceptions = {'bokmal', 'ganda', 'nynorsk', 'slovene'} langs_extra = set() - for lang in lingua.Language.all(): + for lang in lingua.Language.all(): # pylint: disable=no-member if lang.name.lower() not in langs | langs_exceptions: langs_extra.add(lang.name) - print(f"Extra languages: {', '.join(langs_extra)}\n") + print(f"\nExtra languages: {', '.join(langs_extra)}\n") assert langs_extra == {'BOSNIAN', 'MAORI', 'SHONA', 'SOMALI', 'SOTHO', 'TSONGA', 'XHOSA'} diff --git a/wordless/wl_colligation_extractor.py b/wordless/wl_colligation_extractor.py index d556a4adf..6e95dd1c1 100644 --- a/wordless/wl_colligation_extractor.py +++ b/wordless/wl_colligation_extractor.py @@ -909,7 +909,7 @@ def run(self): colligations_freqs_file = {} colligations_freqs_file_all = {} - text = wl_token_processing.wl_process_tokens( + text = wl_token_processing.wl_process_tokens_colligation_extractor( self.main, file['text'], token_settings = settings['token_settings'] ) @@ -976,23 +976,21 @@ def run(self): tags_left = [] tags_right = [] - tags = wl_texts.to_tokens(wl_texts.get_token_properties(tokens, 'tag'), lang = file['lang']) - if window_left < 0 < window_right: # Limit Searching if settings_limit_searching == _tr('wl_colligation_extractor', 'None'): - tags_left = tags[max(0, i + window_left) : i] - tags_right = tags[i + ngram_size : i + ngram_size + window_right] + tags_left = text.tags[max(0, i + window_left) : i] + tags_right = text.tags[i + ngram_size : i + ngram_size + window_right] else: # Span positions (Left) for position in range(max(0, i + window_left), i): if i_unit_start <= position <= i_unit_end: - tags_left.append(tags[position]) + tags_left.append(text.tags[position]) # Span positions (Right) for position in range(i + ngram_size, i + ngram_size + window_right): if i_unit_start <= position <= i_unit_end: - tags_right.append(tags[position]) + tags_right.append(text.tags[position]) for j, collocate in enumerate(reversed(tags_left)): if wl_matching.check_context( @@ -1024,12 +1022,12 @@ def run(self): elif window_left < 0 and window_right < 0: # Limit Searching if settings_limit_searching == _tr('wl_colligation_extractor', 'None'): - tags_left = tags[max(0, i + window_left) : max(0, i + window_right + 1)] + tags_left = text.tags[max(0, i + window_left) : max(0, i + window_right + 1)] else: # Span positions (Left) for position in range(max(0, i + window_left), max(0, i + window_right + 1)): if i_unit_start <= position <= i_unit_end: - tags_left.append(tags[position]) + tags_left.append(text.tags[position]) for j, collocate in enumerate(reversed(tags_left)): if wl_matching.check_context( @@ -1047,12 +1045,12 @@ def run(self): elif window_left > 0 and window_right > 0: # Limit Searching if settings_limit_searching == _tr('wl_colligation_extractor', 'None'): - tags_right = tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right] + tags_right = text.tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right] else: # Span positions (Right) for position in range(i + ngram_size + window_left - 1, i + ngram_size + window_right): if i_unit_start <= position <= i_unit_end: - tags_right.append(tags[position]) + tags_right.append(text.tags[position]) for j, collocate in enumerate(tags_right): if wl_matching.check_context( diff --git a/wordless/wl_collocation_extractor.py 
b/wordless/wl_collocation_extractor.py index 5eeb9ea27..643b46d42 100644 --- a/wordless/wl_collocation_extractor.py +++ b/wordless/wl_collocation_extractor.py @@ -906,7 +906,7 @@ def run(self): collocations_freqs_file = {} collocations_freqs_file_all = {} - text = wl_token_processing.wl_process_tokens( + text = wl_token_processing.wl_process_tokens_ngram_generator( self.main, file['text'], token_settings = settings['token_settings'] ) diff --git a/wordless/wl_concordancer.py b/wordless/wl_concordancer.py index 446e8fc09..501b7c737 100644 --- a/wordless/wl_concordancer.py +++ b/wordless/wl_concordancer.py @@ -727,9 +727,10 @@ def run(self): no_sentence = bisect.bisect(offsets_sentences, i) no_para = bisect.bisect(offsets_paras, i) - # Search in Results (Node) - node_tokens_search = list(ngram) - node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(ngram)) + node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts( + ngram, + punc_mark = True + )) # Width Unit if settings['generation_settings']['width_unit'] == self.tr('Character'): @@ -811,16 +812,46 @@ def run(self): left_tokens_raw = tokens[offset_start:i] right_tokens_raw = tokens[i + len_search_term : offset_end] - # Search in results (Left & Right) - left_tokens_search = copy.deepcopy(left_tokens_raw) - right_tokens_search = copy.deepcopy(right_tokens_raw) + if settings['token_settings']['punc_marks']: + node_tokens_search = list(ngram) + + # Remove empty tokens for searching in results + left_tokens_search = [token for token in copy.deepcopy(left_tokens_raw) if token] + right_tokens_search = [token for token in copy.deepcopy(right_tokens_raw) if token] + # Convert trailing punctuation marks, if any, to separate tokens for searching + else: + node_tokens_search = [] + left_tokens_search = [] + right_tokens_search = [] + + for token in list(ngram): + node_tokens_search.append(token) + + if token.punc_mark: + node_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang)) - # Remove empty tokens for searching in results - left_tokens_search = [token for token in left_tokens_search if token] - right_tokens_search = [token for token in right_tokens_search if token] + for token in copy.deepcopy(left_tokens_raw): + if token: + left_tokens_search.append(token) - left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(left_tokens_raw)) - right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(right_tokens_raw)) + if token.punc_mark: + left_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang)) + + for token in copy.deepcopy(right_tokens_raw): + if token: + right_tokens_search.append(token) + + if token.punc_mark: + right_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang)) + + left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts( + left_tokens_raw, + punc_mark = True + )) + right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts( + right_tokens_raw, + punc_mark = True + )) # Left concordance_line.append([left_tokens_raw, left_tokens_search]) diff --git a/wordless/wl_concordancer_parallel.py b/wordless/wl_concordancer_parallel.py index d4de43b4a..f60cfb47a 100644 --- a/wordless/wl_concordancer_parallel.py +++ b/wordless/wl_concordancer_parallel.py @@ -408,9 +408,10 @@ def run(self): parallel_unit_no = bisect.bisect(offsets_paras, j) if parallel_unit_no not in parallel_units: + # Save all nodes if multiple nodes are found in the same parallel unit parallel_units[parallel_unit_no] = [[] for 
_ in range(len_files)] - parallel_units[parallel_unit_no][i] = ngram + parallel_units[parallel_unit_no][i].append(ngram) # Search for additions & deletions else: for j, para in enumerate(text.tokens_multilevel): @@ -428,22 +429,37 @@ def run(self): len_parallel_units = len(offsets_paras) for parallel_unit_no, parallel_unit_nodes in parallel_units.items(): - node = parallel_unit_nodes[i] + nodes = parallel_unit_nodes[i] if parallel_unit_no <= len_parallel_units: - parallel_unit_tokens_raw = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1])) - parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(parallel_unit_tokens_raw)) - # Search in Results - parallel_unit_tokens_search = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1])) - - # Highlight node if found - if node: - len_node = len(node) - - for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit_tokens_search, len_node)): - if ngram == tuple(node): - parallel_unit_tokens_raw[j] = f'{parallel_unit_tokens_raw[j]}' - parallel_unit_tokens_raw[j + len_node - 1] += '' + parallel_unit = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1])) + + if settings['token_settings']['punc_marks']: + parallel_unit_tokens_search = copy.deepcopy(parallel_unit) + # Convert trailing punctuation marks, if any, to separate tokens for searching + else: + parallel_unit_tokens_search = [] + + for token in copy.deepcopy(parallel_unit): + parallel_unit_tokens_search.append(token) + + if token.punc_mark: + parallel_unit_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang)) + + parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts( + parallel_unit, + punc_mark = True + )) + + # Highlight nodes if found + if nodes: + for node in nodes: + len_node = len(node) + + for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit, len_node)): + if ngram == tuple(node): + parallel_unit_tokens_raw[j] = f'{parallel_unit_tokens_raw[j]}' + parallel_unit_tokens_raw[j + len_node - 1] += '' else: parallel_unit_tokens_raw = [] parallel_unit_tokens_search = [] diff --git a/wordless/wl_dependency_parser.py b/wordless/wl_dependency_parser.py index fde297ae4..5dba30661 100644 --- a/wordless/wl_dependency_parser.py +++ b/wordless/wl_dependency_parser.py @@ -29,7 +29,7 @@ from wordless.wl_checks import wl_checks_work_area from wordless.wl_dialogs import wl_dialogs_misc -from wordless.wl_nlp import wl_dependency_parsing, wl_matching, wl_token_processing +from wordless.wl_nlp import wl_dependency_parsing, wl_matching, wl_texts, wl_token_processing from wordless.wl_utils import wl_misc, wl_threading from wordless.wl_widgets import wl_labels, wl_layouts, wl_tables, wl_widgets @@ -383,7 +383,7 @@ def update_gui_table(self, err_msg, results): for i, ( head, dependent, dependency_relation, dependency_len, - sentence_tokens_raw, sentence_tokens_search, + sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search, no_sentence, len_sentences, file ) in enumerate(results): # Head @@ -394,7 +394,7 @@ def update_gui_table(self, err_msg, results): # Dependency Relation self.model().setItem(i, 2, wl_tables.Wl_Table_Item(dependency_relation)) - # Dependency Distance + # Dependency Length self.set_item_num(i, 3, dependency_len) self.set_item_num(i, 4, numpy.abs(dependency_len)) @@ -404,6 +404,7 @@ def update_gui_table(self, err_msg, results): wl_labels.Wl_Label_Html(' '.join(sentence_tokens_raw), self.main) ) self.indexWidget(self.model().index(i, 5)).tokens_raw = 
sentence_tokens_raw + self.indexWidget(self.model().index(i, 5)).tokens_fig = sentence_tokens_fig self.indexWidget(self.model().index(i, 5)).tokens_search = sentence_tokens_search # Sentence No. @@ -432,7 +433,7 @@ def generate_fig(self): fig_settings = self.main.settings_custom['dependency_parser']['fig_settings'] for row in self.get_selected_rows(): - sentence = tuple(self.model().item(row, 5).tokens_search) + sentence = tuple(self.indexWidget(self.model().index(row, 5)).tokens_fig) if sentence not in sentences_rendered: for file in self.settings['file_area']['files_open']: @@ -512,11 +513,11 @@ def run(self): if any((token in search_terms for token in sentence)): dependencies = [ - (token, token.head, token.dependency_relation, token.dependency_len) + (token, token.head, token.dependency_relation) for token in sentence ] - for i, (token, head, dependency_relation, dependency_len) in enumerate(dependencies): + for i, (token, head, dependency_relation) in enumerate(dependencies): j = i_token + i if ( @@ -535,25 +536,46 @@ def run(self): # Sentence sentence_tokens_raw = [] - - for sentence_token in sentence: - if sentence_token == head: + sentence_tokens_fig = [] + # Calculate dependency length based on modified tokens + i_head = -1 + i_dependent = -1 + + # Highlight heads and dependents + for i, sentence_token in enumerate(sentence): + if sentence_token is head: sentence_tokens_raw.append(f''' - {sentence_token.display_text()} + {sentence_token.display_text(punc_mark = True)} ''') - elif sentence_token == token: + + i_head = i + elif sentence_token is token: sentence_tokens_raw.append(f''' - {sentence_token.display_text()} + {sentence_token.display_text(punc_mark = True)} ''') + + i_dependent = i else: - sentence_tokens_raw.append(sentence_token.display_text()) + sentence_tokens_raw.append(sentence_token.display_text(punc_mark = True)) + + sentence_tokens_fig.append(copy.deepcopy(sentence_token)) + + if settings['token_settings']['punc_marks']: + # Remove empty tokens for searching in results + sentence_tokens_search = [token for token in copy.deepcopy(sentence) if token] + # Convert trailing punctuation marks, if any, to separate tokens for searching + else: + sentence_tokens_search = [] + + for sentence_token in copy.deepcopy(sentence): + sentence_tokens_search.append(sentence_token) - # Remove empty tokens for searching in results - sentence_tokens_search = [token for token in sentence if token] + if sentence_token.punc_mark: + sentence_tokens_search.append(wl_texts.Wl_Token(sentence_token.punc_mark, lang = sentence_token.lang)) # Head results[-1].append(head.display_text()) @@ -561,10 +583,10 @@ def run(self): results[-1].append(token.display_text()) # Dependency Relation results[-1].append(dependency_relation) - # Dependency Distance - results[-1].append(dependency_len) + # Dependency Length + results[-1].append(i_head - i_dependent) # Sentence - results[-1].extend([sentence_tokens_raw, sentence_tokens_search]) + results[-1].extend([sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search]) # Sentence No. 
results[-1].extend([no_sentence, len_sentences]) # File diff --git a/wordless/wl_keyword_extractor.py b/wordless/wl_keyword_extractor.py index fdd6f65c1..2d7321e4f 100644 --- a/wordless/wl_keyword_extractor.py +++ b/wordless/wl_keyword_extractor.py @@ -693,7 +693,7 @@ def run(self): tokens_ref = [] for file_ref in files_ref: - text = wl_token_processing.wl_process_tokens( + text = wl_token_processing.wl_process_tokens_ngram_generator( self.main, file_ref['text'], token_settings = settings['token_settings'] ) @@ -709,7 +709,7 @@ def run(self): # Frequency (Observed files) for file_observed in files_observed: - text = wl_token_processing.wl_process_tokens( + text = wl_token_processing.wl_process_tokens_ngram_generator( self.main, file_observed['text'], token_settings = settings['token_settings'] ) diff --git a/wordless/wl_ngram_generator.py b/wordless/wl_ngram_generator.py index 6e4564bd0..e68a3cc02 100644 --- a/wordless/wl_ngram_generator.py +++ b/wordless/wl_ngram_generator.py @@ -781,7 +781,7 @@ def run(self): for file in files: ngrams_is = [] - text = wl_token_processing.wl_process_tokens( + text = wl_token_processing.wl_process_tokens_ngram_generator( self.main, file['text'], token_settings = settings['token_settings'] ) diff --git a/wordless/wl_nlp/wl_dependency_parsing.py b/wordless/wl_nlp/wl_dependency_parsing.py index b85d915ce..6f99d1a23 100644 --- a/wordless/wl_nlp/wl_dependency_parsing.py +++ b/wordless/wl_nlp/wl_dependency_parsing.py @@ -404,10 +404,7 @@ def wl_dependency_parse_fig_tokens( ): htmls = [] - if inputs and isinstance(list(inputs)[0], wl_texts.Wl_Token): - inputs, token_properties = wl_texts.split_texts_properties(inputs) - else: - token_properties = [] + inputs, token_properties = wl_texts.split_texts_properties(inputs) options = { 'fine_grained': show_fine_grained_pos_tags, @@ -437,11 +434,12 @@ def wl_dependency_parse_fig_tokens( if show_in_separate_tab: for doc in nlp.pipe(docs): for sentence in doc.sents: - displacy_dict = spacy.displacy.parse_deps(sentence.as_doc(), options = options) + displacy_dict = spacy.displacy.parse_deps(sentence, options = options) if token_properties: for token, word in zip(sentence, displacy_dict['words']): - word['text'] += token_properties[i_tag_start + token.i] + properties = token_properties[i_tag_start + token.i] + word['text'] += (properties['punc_mark'] or '') + (properties['tag'] or '') htmls.append(spacy.displacy.render( displacy_dict, @@ -458,12 +456,12 @@ def wl_dependency_parse_fig_tokens( for doc in nlp.pipe(docs): for sentence in doc.sents: - displacy_dict = spacy.displacy.parse_deps(sentence.as_doc(), options = options) + displacy_dict = spacy.displacy.parse_deps(sentence, options = options) if token_properties: for token, word in zip(sentence, displacy_dict['words']): properties = token_properties[i_tag_start + token.i] - word['text'] += (properties['punc_mark'] or '') + (properties['tag']) + word['text'] += (properties['punc_mark'] or '') + (properties['tag'] or '') sentences.append(displacy_dict) diff --git a/wordless/wl_nlp/wl_texts.py b/wordless/wl_nlp/wl_texts.py index 05dc26082..d30b526bf 100644 --- a/wordless/wl_nlp/wl_texts.py +++ b/wordless/wl_nlp/wl_texts.py @@ -58,8 +58,11 @@ def __hash__(self): def __eq__(self, other): return self.display_text() == other.display_text() - def display_text(self): - return str(self) + (self.punc_mark or '') + (self.tag or '') + def display_text(self, punc_mark = False): + if punc_mark: + return str(self) + (self.punc_mark or '') + (self.tag or '') + else: + return str(self) + 
(self.tag or '') def update_properties(self, token): self.lang = token.lang @@ -134,8 +137,8 @@ def combine_texts_properties(texts, token_properties): def to_token_texts(tokens): return [str(token) for token in tokens] -def to_display_texts(tokens): - return [token.display_text() for token in tokens] +def to_display_texts(tokens, punc_mark = False): + return [token.display_text(punc_mark = punc_mark) for token in tokens] def set_token_text(token, text): _, token_properties = split_texts_properties([token]) @@ -148,11 +151,18 @@ def set_token_texts(tokens, texts): for i, token in enumerate(combine_texts_properties(texts, token_properties)): tokens[i] = token +def has_token_properties(tokens, name): + for token in tokens: + if getattr(token, name) is not None: + return True + + return False + def get_token_properties(tokens, name): return [getattr(token, name) for token in tokens] def set_token_properties(tokens, name, vals): - if isinstance(vals, str): + if isinstance(vals, str) or vals is None: vals = [vals] * len(tokens) for token, val in zip(tokens, vals): @@ -398,39 +408,45 @@ def get_tokens_flat(self): return list(wl_misc.flatten_list(self.tokens_multilevel)) def set_tokens(self, tokens): - i_start_token = 0 + i_token = 0 for para in self.tokens_multilevel: for sentence in para: for sentence_seg in sentence: for i, _ in enumerate(sentence_seg): - sentence_seg[i] = tokens[i_start_token + i] + sentence_seg[i] = tokens[i_token] - i_start_token += len(sentence_seg) + i_token += 1 - def to_token_texts(self): - return [ - [ + def to_token_texts(self, flat = False): + if flat: + return to_token_texts(self.get_tokens_flat()) + else: + return [ [ - [str(token) for token in sentence_seg] - for sentence_seg in sentence + [ + [str(token) for token in sentence_seg] + for sentence_seg in sentence + ] + for sentence in para ] - for sentence in para + for para in self.tokens_multilevel ] - for para in self.tokens_multilevel - ] - def to_display_texts(self): - return [ - [ + def to_display_texts(self, punc_mark = False, flat = False): + if flat: + return to_display_texts(self.get_tokens_flat()) + else: + return [ [ - [token.display_text() for token in sentence_seg] - for sentence_seg in sentence + [ + [token.display_text(punc_mark = punc_mark) for token in sentence_seg] + for sentence_seg in sentence + ] + for sentence in para ] - for sentence in para + for para in self.tokens_multilevel ] - for para in self.tokens_multilevel - ] def set_token_texts(self, texts): tokens = self.get_tokens_flat() @@ -440,11 +456,26 @@ def set_token_texts(self, texts): self.set_tokens(tokens) - def get_token_properties(self, name): - return [getattr(token, name) for token in self.get_tokens_flat()] + def has_token_properties(self, name): + return has_token_properties(self.get_tokens_flat(), name) + + def get_token_properties(self, name, flat = False): + if flat: + return get_token_properties(self.get_tokens_flat(), name) + else: + return [ + [ + [ + [getattr(token, name) for token in sentence_seg] + for sentence_seg in sentence + ] + for sentence in para + ] + for para in self.tokens_multilevel + ] def set_token_properties(self, name, vals): - if isinstance(vals, str): + if isinstance(vals, str) or vals is None: vals = [vals] * self.num_tokens i_val = 0 @@ -458,15 +489,15 @@ def set_token_properties(self, name, vals): i_val += 1 def update_token_properties(self, tokens): - i_start_token = 0 + i_token = 0 for para in self.tokens_multilevel: for sentence in para: for sentence_seg in sentence: - for i, token in 
enumerate(sentence_seg): - token.update_properties(tokens[i_start_token + i]) + for token in sentence_seg: + token.update_properties(tokens[i_token]) - i_start_token += len(sentence_seg) + i_token += 1 def get_offsets(self): offsets_paras = [] @@ -566,11 +597,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called ] # Remove empty tokens and whitespace around tokens - self.tokens_multilevel[0][0][0] = [ - token_clean - for token in self.tokens_multilevel[0][0][0] - if (token_clean := token.strip()) - ] + self.tokens_multilevel[0][0][0] = clean_texts(self.tokens_multilevel[0][0][0]) self.tokens_multilevel[0][0][0] = to_tokens(self.tokens_multilevel[0][0][0], self.lang) self.num_tokens = len(self.get_tokens_flat()) diff --git a/wordless/wl_nlp/wl_token_processing.py b/wordless/wl_nlp/wl_token_processing.py index 022a36a4c..95d3f5279 100644 --- a/wordless/wl_nlp/wl_token_processing.py +++ b/wordless/wl_nlp/wl_token_processing.py @@ -25,6 +25,66 @@ ) from wordless.wl_utils import wl_misc +# Assign part-of-speech tags +def text_pos_tag(main, text, settings): + if settings['assign_pos_tags'] and not text.tagged: + tokens = wl_pos_tagging.wl_pos_tag( + main, + inputs = text.get_tokens_flat(), + lang = text.lang + ) + + text.update_token_properties(tokens) + +# Syllable tokenization +def text_syl_tokenize(main, text): + tokens = wl_syl_tokenization.wl_syl_tokenize( + main, + inputs = text.get_tokens_flat(), + lang = text.lang, + ) + + text.update_token_properties(tokens) + +# Ignore tags +def text_ignore_tags(text, settings): + if settings['ignore_tags']: + text.set_token_properties('tag', None) + +# Use tags only +def text_use_tags_only(text, settings): + if settings['use_tags']: + # Calculate head references + if text.has_token_properties('head'): + head_refs = [] + + for i_para, para in enumerate(text.tokens_multilevel): + for i_sentence, sentence in enumerate(para): + for sentence_seg in sentence: + for token in sentence_seg: + head = token.head + + for i_sentence_seg, sentence_seg in enumerate(sentence): + for i_token, token in enumerate(sentence_seg): + if head is token: + head_refs.append((i_para, i_sentence, i_sentence_seg, i_token)) + + text.set_token_texts(text.get_token_properties('tag', flat = True)) + text.set_token_properties('tag', None) + + # Update head references + if text.has_token_properties('head'): + i_token = 0 + + for para in text.tokens_multilevel: + for sentence in para: + for sentence_seg in sentence: + for token in sentence_seg: + refs = head_refs[i_token] + token.head = text.tokens_multilevel[refs[0]][refs[1]][refs[2]][refs[3]] + + i_token += 1 + def wl_process_tokens(main, text, token_settings): settings = copy.deepcopy(token_settings) @@ -33,20 +93,10 @@ def wl_process_tokens(main, text, token_settings): settings['all_uppercase'] = False settings['title_case'] = False - if settings['ignore_tags']: - settings['use_tags'] = False - elif settings['use_tags']: + if settings['use_tags']: settings['apply_lemmatization'] = False - settings['ignore_tags'] = False - # Assign part-of-speech tags - if settings['assign_pos_tags'] and not text.tagged: - tokens = wl_pos_tagging.wl_pos_tag( - main, - inputs = text.get_tokens_flat(), - lang = text.lang - ) - text.update_token_properties(tokens) + text_pos_tag(main, text, token_settings) # Apply lemmatization if settings['apply_lemmatization']: @@ -153,16 +203,17 @@ def wl_process_tokens(main, text, token_settings): # Replace tokens with their lemmas if settings['apply_lemmatization']: - 
text_modified.set_token_texts(text_modified.get_token_properties('lemma')) + text_modified.set_token_texts(text_modified.get_token_properties('lemma', flat = True)) - # Ignore tags - if settings['ignore_tags']: - text_modified.set_token_properties('tag', '') + text_modified.update_num_tokens() - # Use tags only - if settings['use_tags']: - text_modified.set_token_texts(text_modified.get_token_properties('tag')) - text_modified.set_token_properties('tag', '') + return text_modified + +def wl_process_tokens_ngram_generator(main, text, token_settings): + text_modified = wl_process_tokens(main, text, token_settings) + + text_ignore_tags(text_modified, token_settings) + text_use_tags_only(text_modified, token_settings) text_modified.update_num_tokens() @@ -206,15 +257,9 @@ def wl_process_tokens_profiler(main, text, token_settings): # Punctuation marks must be preserved for some readability measures (e.g. Wheeler & Smith's Readability Formula) text.tokens_multilevel_with_puncs = copy.deepcopy(text.tokens_multilevel) - # Syllable tokenization - tokens = wl_syl_tokenization.wl_syl_tokenize( - main, - inputs = text.get_tokens_flat(), - lang = text.lang, - ) - text.update_token_properties(tokens) + text_syl_tokenize(main, text) - text_modified = wl_process_tokens(main, text, token_settings) + text_modified = wl_process_tokens_ngram_generator(main, text, token_settings) text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel) text_modified.update_num_tokens() @@ -223,14 +268,7 @@ def wl_process_tokens_profiler(main, text, token_settings): def wl_process_tokens_concordancer(main, text, token_settings, preserve_blank_lines = False): settings = copy.deepcopy(token_settings) - # Assign part-of-speech tags - if settings['assign_pos_tags'] and not text.tagged: - tokens = wl_pos_tagging.wl_pos_tag( - main, - inputs = text.get_tokens_flat(), - lang = text.lang - ) - text.update_token_properties(tokens) + text_pos_tag(main, text, token_settings) text_modified = copy.deepcopy(text) @@ -272,14 +310,8 @@ def wl_process_tokens_concordancer(main, text, token_settings, preserve_blank_li if not preserve_blank_lines: text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel, empty_tokens = False) - # Ignore tags - if settings['ignore_tags']: - text_modified.set_token_properties('tag', '') - - # Use tags only - if settings['use_tags']: - text_modified.set_token_texts(text_modified.get_token_properties('tag')) - text_modified.set_token_properties('tag', '') + text_ignore_tags(text_modified, token_settings) + text_use_tags_only(text_modified, token_settings) text_modified.update_num_tokens() @@ -302,17 +334,32 @@ def wl_process_tokens_dependency_parser(main, text, token_settings): return wl_process_tokens_concordancer(main, text, token_settings) def wl_process_tokens_wordlist_generator(main, text, token_settings, generation_settings): - # Syllable tokenization + # Syllabification if generation_settings['syllabification']: - tokens = wl_syl_tokenization.wl_syl_tokenize( - main, - inputs = text.get_tokens_flat(), - lang = text.lang, - ) - text.update_token_properties(tokens) + text_syl_tokenize(main, text) - text_modified = wl_process_tokens(main, text, token_settings) + text_modified = wl_process_tokens_ngram_generator(main, text, token_settings) text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel) text_modified.update_num_tokens() return text_modified + +def 
wl_process_tokens_colligation_extractor(main, text, token_settings): + # Do not modify custom settings, as adding new options would clear user's custom settings + settings = copy.deepcopy(token_settings) + # Always assign part-of-speech tags + settings['assign_pos_tags'] = True + + text_modified = wl_process_tokens(main, text, settings) + + text_modified.tags = wl_texts.to_tokens( + text_modified.get_token_properties('tag', flat = True), + lang = text.lang + ) + + text_ignore_tags(text_modified, token_settings) + text_use_tags_only(text_modified, token_settings) + + text_modified.update_num_tokens() + + return text_modified diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py index e4b58f2f8..26fbb29b8 100644 --- a/wordless/wl_profiler.py +++ b/wordless/wl_profiler.py @@ -1252,7 +1252,7 @@ def run(self): for sentence_seg in sentence ] - syls_tokens = text.get_token_properties('syls') + syls_tokens = text.get_token_properties('syls', flat = True) # Remove punctuation marks for i, syls in enumerate(syls_tokens): diff --git a/wordless/wl_results/wl_results_filter.py b/wordless/wl_results/wl_results_filter.py index 4e593c5b1..f52a279ea 100644 --- a/wordless/wl_results/wl_results_filter.py +++ b/wordless/wl_results/wl_results_filter.py @@ -19,7 +19,7 @@ import copy import math -from PyQt5.QtCore import QCoreApplication, Qt +from PyQt5.QtCore import QCoreApplication from PyQt5.QtWidgets import QLabel, QPushButton from wordless.wl_dialogs import wl_dialogs, wl_dialogs_misc @@ -160,7 +160,7 @@ def __init__(self, main, tab, table): ) = wl_widgets.wl_widgets_filter( self, filter_min = 1, - filter_max = 100 + filter_max = 1000 ) if self.tab == 'wordlist_generator': @@ -175,7 +175,7 @@ def __init__(self, main, tab, table): ) = wl_widgets.wl_widgets_filter( self, filter_min = 1, - filter_max = 100 + filter_max = 1000 ) self.label_freq = QLabel(self.tr('Frequency:'), self) @@ -567,11 +567,6 @@ def __init__(self, main, tab, table): self.Worker_Filter_Results = Wl_Worker_Results_Filter_Collocation_Extractor - if tab in ['collocation_extractor', 'colligation_extractor']: - self.type_node = 'collocate' - elif tab == 'keyword_extractor': - self.type_node = 'keyword' - settings = self.table.settings[self.tab] test_statistical_significance = settings['generation_settings']['test_statistical_significance'] @@ -586,10 +581,20 @@ def __init__(self, main, tab, table): self.has_bayes_factor = measure_bayes_factor != 'none' self.has_effect_size = measure_effect_size != 'none' - if self.type_node == 'collocate': - self.label_len_node = QLabel(self.tr('Collocate length:'), self) - elif self.type_node == 'keyword': - self.label_len_node = QLabel(self.tr('Keyword length:'), self) + match tab: + case 'collocation_extractor': + self.type_node = 'node' + self.type_collocation = 'collocation' + self.label_len_node = QLabel(self.tr('Node length:'), self) + self.label_len_collocation = QLabel(self.tr('Collocation length:'), self) + case 'colligation_extractor': + self.type_node = 'node' + self.type_collocation = 'colligation' + self.label_len_node = QLabel(self.tr('Node length:'), self) + self.label_len_collocation = QLabel(self.tr('Colligation length:'), self) + case 'keyword_extractor': + self.type_node = 'keyword' + self.label_len_node = QLabel(self.tr('Keyword length:'), self) ( self.label_len_node_min, @@ -601,9 +606,37 @@ def __init__(self, main, tab, table): ) = wl_widgets.wl_widgets_filter( self, filter_min = 1, - filter_max = 100 + filter_max = 1000 ) + if self.type_node == 'node': + 
self.label_len_collocate = QLabel(self.tr('Collocate length:'), self) + ( + self.label_len_collocate_min, + self.spin_box_len_collocate_min, + self.checkbox_len_collocate_min_no_limit, + self.label_len_collocate_max, + self.spin_box_len_collocate_max, + self.checkbox_len_collocate_max_no_limit + ) = wl_widgets.wl_widgets_filter( + self, + filter_min = 1, + filter_max = 1000 + ) + + ( + self.label_len_collocation_min, + self.spin_box_len_collocation_min, + self.checkbox_len_collocation_min_no_limit, + self.label_len_collocation_max, + self.spin_box_len_collocation_max, + self.checkbox_len_collocation_max_no_limit + ) = wl_widgets.wl_widgets_filter( + self, + filter_min = 2, + filter_max = 2000 + ) + self.label_freq = QLabel(self.tr('Frequency:'), self) ( self.label_freq_min, @@ -619,7 +652,7 @@ def __init__(self, main, tab, table): ) # Frequency position - if self.type_node == 'collocate': + if self.type_node == 'node': self.combo_box_freq_position = wl_boxes.Wl_Combo_Box(self) for i in range( @@ -696,7 +729,17 @@ def __init__(self, main, tab, table): self.spin_box_len_node_max.valueChanged.connect(self.filters_changed) self.checkbox_len_node_max_no_limit.stateChanged.connect(self.filters_changed) - if self.type_node == 'collocate': + if self.type_node == 'node': + self.spin_box_len_collocate_min.valueChanged.connect(self.filters_changed) + self.checkbox_len_collocate_min_no_limit.stateChanged.connect(self.filters_changed) + self.spin_box_len_collocate_max.valueChanged.connect(self.filters_changed) + self.checkbox_len_collocate_max_no_limit.stateChanged.connect(self.filters_changed) + + self.spin_box_len_collocation_min.valueChanged.connect(self.filters_changed) + self.checkbox_len_collocation_min_no_limit.stateChanged.connect(self.filters_changed) + self.spin_box_len_collocation_max.valueChanged.connect(self.filters_changed) + self.checkbox_len_collocation_max_no_limit.stateChanged.connect(self.filters_changed) + self.combo_box_freq_position.currentTextChanged.connect(self.filters_changed) self.spin_box_freq_min.valueChanged.connect(self.filters_changed) @@ -736,17 +779,29 @@ def __init__(self, main, tab, table): # Close the dialog when data in the table are re-generated self.table.button_generate_table.clicked.connect(self.close) - widgets_filter = [ - [ - self.label_len_node, - self.label_len_node_min, self.spin_box_len_node_min, self.checkbox_len_node_min_no_limit, - self.label_len_node_max, self.spin_box_len_node_max, self.checkbox_len_node_max_no_limit - ], [ - self.label_freq, - self.label_freq_min, self.spin_box_freq_min, self.checkbox_freq_min_no_limit, - self.label_freq_max, self.spin_box_freq_max, self.checkbox_freq_max_no_limit - ] - ] + widgets_filter = [[ + self.label_len_node, + self.label_len_node_min, self.spin_box_len_node_min, self.checkbox_len_node_min_no_limit, + self.label_len_node_max, self.spin_box_len_node_max, self.checkbox_len_node_max_no_limit + ]] + + if self.type_node == 'node': + widgets_filter.append([ + self.label_len_collocate, + self.label_len_collocate_min, self.spin_box_len_collocate_min, self.checkbox_len_collocate_min_no_limit, + self.label_len_collocate_max, self.spin_box_len_collocate_max, self.checkbox_len_collocate_max_no_limit + ]) + widgets_filter.append([ + self.label_len_collocation, + self.label_len_collocation_min, self.spin_box_len_collocation_min, self.checkbox_len_collocation_min_no_limit, + self.label_len_collocation_max, self.spin_box_len_collocation_max, self.checkbox_len_collocation_max_no_limit + ]) + + widgets_filter.append([ + 
self.label_freq, + self.label_freq_min, self.spin_box_freq_min, self.checkbox_freq_min_no_limit, + self.label_freq_max, self.spin_box_freq_max, self.checkbox_freq_max_no_limit + ]) if self.has_test_stat: widgets_filter.append([ @@ -784,14 +839,11 @@ def __init__(self, main, tab, table): add_widgets_filter(self, widgets_filter = widgets_filter, layout = self.layout_filters) - if self.type_node == 'collocate': + if self.type_node == 'node': self.layout_filters.removeWidget(self.label_freq) - layout_freq_position = wl_layouts.Wl_Layout() - layout_freq_position.addWidget(self.label_freq, 0, 0) - layout_freq_position.addWidget(self.combo_box_freq_position, 0, 1, Qt.AlignRight) - - self.layout_filters.addLayout(layout_freq_position, 3, 0, 1, 3) + self.layout_filters.addWidget(self.label_freq, 9, 0, 1, 2) + self.layout_filters.addWidget(self.combo_box_freq_position, 9, 2) self.load_settings() @@ -808,7 +860,17 @@ def load_settings(self, defaults = False): self.spin_box_len_node_max.setValue(settings[f'len_{self.type_node}_max']) self.checkbox_len_node_max_no_limit.setChecked(settings[f'len_{self.type_node}_max_no_limit']) - if self.type_node == 'collocate': + if self.type_node == 'node': + self.spin_box_len_collocate_min.setValue(settings['len_collocate_min']) + self.checkbox_len_collocate_min_no_limit.setChecked(settings['len_collocate_min_no_limit']) + self.spin_box_len_collocate_max.setValue(settings['len_collocate_max']) + self.checkbox_len_collocate_max_no_limit.setChecked(settings['len_collocate_max_no_limit']) + + self.spin_box_len_collocation_min.setValue(settings[f'len_{self.type_collocation}_min']) + self.checkbox_len_collocation_min_no_limit.setChecked(settings[f'len_{self.type_collocation}_min_no_limit']) + self.spin_box_len_collocation_max.setValue(settings[f'len_{self.type_collocation}_max']) + self.checkbox_len_collocation_max_no_limit.setChecked(settings[f'len_{self.type_collocation}_max_no_limit']) + self.combo_box_freq_position.setCurrentText(settings['freq_position']) self.spin_box_freq_min.setValue(settings['freq_min']) @@ -851,7 +913,17 @@ def filters_changed(self): self.settings[f'len_{self.type_node}_max'] = self.spin_box_len_node_max.value() self.settings[f'len_{self.type_node}_max_no_limit'] = self.checkbox_len_node_max_no_limit.isChecked() - if self.type_node == 'collocate': + if self.type_node == 'node': + self.settings['len_collocate_min'] = self.spin_box_len_collocate_min.value() + self.settings['len_collocate_min_no_limit'] = self.checkbox_len_collocate_min_no_limit.isChecked() + self.settings['len_collocate_max'] = self.spin_box_len_collocate_max.value() + self.settings['len_collocate_max_no_limit'] = self.checkbox_len_collocate_max_no_limit.isChecked() + + self.settings[f'len_{self.type_collocation}_min'] = self.spin_box_len_collocation_min.value() + self.settings[f'len_{self.type_collocation}_min_no_limit'] = self.checkbox_len_collocation_min_no_limit.isChecked() + self.settings[f'len_{self.type_collocation}_max'] = self.spin_box_len_collocation_max.value() + self.settings[f'len_{self.type_collocation}_max_no_limit'] = self.checkbox_len_collocation_max_no_limit.isChecked() + self.settings['freq_position'] = self.combo_box_freq_position.currentText() self.settings['freq_min'] = self.spin_box_freq_min.value() @@ -896,8 +968,9 @@ def run(self): col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text'] col_text_effect_size = 
self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text'] - if self.dialog.type_node == 'collocate': - col_node = self.dialog.table.find_header_hor(self.tr('Collocate')) + if self.dialog.type_node == 'node': + col_node = self.dialog.table.find_header_hor(self.tr('Node')) + col_collocate = self.dialog.table.find_header_hor(self.tr('Collocate')) if self.dialog.settings['freq_position'] == self.tr('Total'): col_freq = self.dialog.table.find_header_hor( @@ -910,8 +983,8 @@ def run(self): else: col_node = self.dialog.table.find_header_hor(self.tr('Keyword')) col_freq = self.dialog.table.find_header_hor( - self.tr('[{}]\nFrequency').format(self.dialog.settings['file_to_filter']) - ) + self.tr('[{}]\nFrequency').format(self.dialog.settings['file_to_filter']) + ) if self.dialog.has_test_stat: col_test_stat = self.dialog.table.find_header_hor( @@ -946,6 +1019,29 @@ def run(self): else self.dialog.settings[f'len_{self.dialog.type_node}_max'] ) + if self.dialog.type_node == 'node': + len_collocate_min = ( + float('-inf') + if self.dialog.settings['len_collocate_min_no_limit'] + else self.dialog.settings['len_collocate_min'] + ) + len_collocate_max = ( + float('inf') + if self.dialog.settings['len_collocate_max_no_limit'] + else self.dialog.settings['len_collocate_max'] + ) + + len_collocation_min = ( + float('-inf') + if self.dialog.settings[f'len_{self.dialog.type_collocation}_min_no_limit'] + else self.dialog.settings[f'len_{self.dialog.type_collocation}_min'] + ) + len_collocation_max = ( + float('inf') + if self.dialog.settings[f'len_{self.dialog.type_collocation}_max_no_limit'] + else self.dialog.settings[f'len_{self.dialog.type_collocation}_max'] + ) + freq_min = ( float('-inf') if self.dialog.settings['freq_min_no_limit'] @@ -1015,61 +1111,54 @@ def run(self): self.dialog.table.row_filters = [] for i in range(self.dialog.table.model().rowCount()): + filters = [] + # Calculate length of token texts only when filtering tagged tokens and when filtering tags len_node = sum(( len(str(token)) for token in self.dialog.table.model().item(i, col_node).tokens_filter )) - filter_len_node = len_node_min <= len_node <= len_node_max - filter_freq = ( + filters.append(len_node_min <= len_node <= len_node_max) + + if self.dialog.type_node == 'node': + len_collocate = sum(( + len(str(token)) + for token in self.dialog.table.model().item(i, col_collocate).tokens_filter + )) + + filters.append(len_collocate_min <= len_collocate <= len_collocate_max) + filters.append(len_collocation_min <= len_node + len_collocate <= len_collocation_max) + + filters.append( freq_min <= self.dialog.table.model().item(i, col_freq).val <= freq_max ) if self.dialog.has_test_stat: - filter_test_stat = ( + filters.append( test_stat_min <= self.dialog.table.model().item(i, col_test_stat).val <= test_stat_max ) - else: - filter_test_stat = True if self.dialog.has_p_val: - filter_p_val = ( + filters.append( p_val_min <= self.dialog.table.model().item(i, col_p_value).val <= p_val_max ) - else: - filter_p_val = True if self.dialog.has_bayes_factor: - filter_bayes_factor = ( + filters.append( bayes_factor_min <= self.dialog.table.model().item(i, col_bayes_factor).val <= bayes_factor_max ) - else: - filter_bayes_factor = True if self.dialog.has_effect_size: - filter_effect_size = ( + filters.append( effect_size_min <= self.dialog.table.model().item(i, col_effect_size).val <= effect_size_max ) - else: - filter_effect_size = True - filter_num_files_found = ( + filters.append( num_files_found_min <= 
self.dialog.table.model().item(i, col_num_files_found).val <= num_files_found_max ) - if ( - filter_len_node - and filter_freq - and filter_test_stat - and filter_p_val - and filter_bayes_factor - and filter_effect_size - and filter_num_files_found - ): - self.dialog.table.row_filters.append(True) - else: - self.dialog.table.row_filters.append(False) + self.dialog.table.row_filters.append(all(filters)) self.progress_updated.emit(self.tr('Updating table...')) self.worker_done.emit() diff --git a/wordless/wl_results/wl_results_search.py b/wordless/wl_results/wl_results_search.py index 3c09c0ba1..4f8997f30 100644 --- a/wordless/wl_results/wl_results_search.py +++ b/wordless/wl_results/wl_results_search.py @@ -37,6 +37,7 @@ def __init__(self, main, tab, table): self.tab = tab self.tables = [table] self.settings = self.main.settings_custom[self.tab]['search_results'] + self.last_search_settings = [] self.items_found = [] self.main.wl_work_area.currentChanged.connect(self.reject) @@ -86,7 +87,7 @@ def __init__(self, main, tab, table): self.button_find_next.clicked.connect(lambda: self.find_next()) # pylint: disable=unnecessary-lambda self.button_find_prev.clicked.connect(lambda: self.find_prev()) # pylint: disable=unnecessary-lambda self.button_find_all.clicked.connect(lambda: self.find_all()) # pylint: disable=unnecessary-lambda - self.button_clr_hightlights.clicked.connect(self.clr_highlights) + self.button_clr_hightlights.clicked.connect(lambda: self.clr_highlights()) # pylint: disable=unnecessary-lambda self.button_close.clicked.connect(self.reject) @@ -123,8 +124,8 @@ def __init__(self, main, tab, table): self.layout().addWidget(wl_layouts.Wl_Separator(self), 9, 0, 1, 4) self.layout().addLayout(layout_buttons_bottom, 10, 0, 1, 4) - for table in self.tables: # pylint: disable=redefined-argument-from-local - table.model().itemChanged.connect(self.table_item_changed) + for table_to_search in self.tables: + table_to_search.model().itemChanged.connect(self.table_item_changed) self.load_settings() @@ -240,9 +241,7 @@ def find_prev(self): selected_rows = [] for table in self.tables: - table.hide() - table.blockSignals(True) - table.setUpdatesEnabled(False) + table.disable_updates() for table in self.tables: if table.get_selected_rows(): @@ -277,24 +276,24 @@ def find_prev(self): self.tables[-1].selectRow(self.items_found[-1][1]) for table in self.tables: - table.blockSignals(False) - table.setUpdatesEnabled(True) - table.show() + table.enable_updates() @wl_misc.log_timing def find_all(self): - self.clr_highlights() + # Search only when there are no search history or search settings have been changed + if not self.items_found or self.last_search_settings != copy.deepcopy(self.settings): + self.clr_highlights() - dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress(self.main, text = self.tr('Searching in results...')) + dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress(self.main, text = self.tr('Searching in results...')) - worker_results_search = Wl_Worker_Results_Search( - self.main, - dialog_progress = dialog_progress, - update_gui = self.update_gui, - dialog = self - ) + worker_results_search = Wl_Worker_Results_Search( + self.main, + dialog_progress = dialog_progress, + update_gui = self.update_gui, + dialog = self + ) - wl_threading.Wl_Thread(worker_results_search).start_worker() + wl_threading.Wl_Thread(worker_results_search).start_worker() def update_gui(self): if self.items_found: @@ -324,11 +323,15 @@ def update_gui(self): self.button_clr_hightlights.setEnabled(False) + # Save search 
settings + self.last_search_settings = copy.deepcopy(self.settings) + len_items_found = len(self.items_found) msg_item = self.tr('item') if len_items_found == 1 else self.tr('items') self.main.statusBar().showMessage(self.tr('Found {} {}.').format(len_items_found, msg_item)) + @wl_misc.log_timing def clr_highlights(self): if self.items_found: for table in self.tables: @@ -345,11 +348,16 @@ def clr_highlights(self): for table in self.tables: table.enable_updates() - self.items_found.clear() + self.clr_history() + self.main.statusBar().showMessage(self.tr('Highlights cleared.')) self.button_clr_hightlights.setEnabled(False) + def clr_history(self): + self.last_search_settings.clear() + self.items_found.clear() + def load(self): self.show() diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index 811b20fac..f601a36ab 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -783,11 +783,21 @@ def init_settings_default(main): 'filter_results': { 'file_to_filter': _tr('wl_settings_default', 'Total'), + 'len_node_min': 1, + 'len_node_min_no_limit': True, + 'len_node_max': 20, + 'len_node_max_no_limit': True, + 'len_collocate_min': 1, 'len_collocate_min_no_limit': True, 'len_collocate_max': 20, 'len_collocate_max_no_limit': True, + 'len_collocation_min': 1, + 'len_collocation_min_no_limit': True, + 'len_collocation_max': 20, + 'len_collocation_max_no_limit': True, + 'freq_position': _tr('wl_settings_default', 'Total'), 'freq_min': 0, 'freq_min_no_limit': True, @@ -847,8 +857,6 @@ def init_settings_default(main): 'apply_lemmatization': False, 'filter_stop_words': False, - # Always assign POS tags - 'assign_pos_tags': True, 'ignore_tags': False, 'use_tags': False }, @@ -941,11 +949,21 @@ def init_settings_default(main): 'filter_results': { 'file_to_filter': _tr('wl_settings_default', 'Total'), + 'len_node_min': 1, + 'len_node_min_no_limit': True, + 'len_node_max': 20, + 'len_node_max_no_limit': True, + 'len_collocate_min': 1, 'len_collocate_min_no_limit': True, 'len_collocate_max': 20, 'len_collocate_max_no_limit': True, + 'len_colligation_min': 1, + 'len_colligation_min_no_limit': True, + 'len_colligation_max': 20, + 'len_colligation_max_no_limit': True, + 'freq_position': _tr('wl_settings_default', 'Total'), 'freq_min': 0, 'freq_min_no_limit': True, diff --git a/wordless/wl_utils/wl_detection.py b/wordless/wl_utils/wl_detection.py index a34ddc59e..dee923323 100644 --- a/wordless/wl_utils/wl_detection.py +++ b/wordless/wl_utils/wl_detection.py @@ -51,6 +51,7 @@ def detect_encoding(main, file_path): return encoding +# pylint: disable=no-member lingua_detector = lingua.LanguageDetectorBuilder.from_all_languages_without( lingua.Language.BOSNIAN, lingua.Language.MAORI, diff --git a/wordless/wl_widgets/wl_tables.py b/wordless/wl_widgets/wl_tables.py index 1377a5998..0c19452d5 100644 --- a/wordless/wl_widgets/wl_tables.py +++ b/wordless/wl_widgets/wl_tables.py @@ -40,6 +40,8 @@ _tr = QCoreApplication.translate +# pylint: disable=unnecessary-lambda + class Wl_Table(QTableView): def __init__( self, parent, @@ -513,16 +515,16 @@ def run(self): if '*.csv' in self.file_type: encoding = self.main.settings_custom['general']['exp']['tables']['default_encoding'] - # Concordancer - if self.table.tab == 'concordancer': - with open(self.file_path, 'w', encoding = encoding, newline = '') as f: - csv_writer = csv.writer(f) + with open(self.file_path, 'w', encoding = encoding, newline = '') as f: + csv_writer = 
csv.writer(f)
 
+                    if self.table.header_orientation == 'hor':
                         # Horizontal headers
-                        csv_writer.writerow([
-                            self.table.model().horizontalHeaderItem(col).text().strip()
+                        headers_hor = [
+                            self.table.model().horizontalHeaderItem(col).text()
                             for col in cols
-                        ])
+                        ]
+
+                        csv_writer.writerow(self.clean_text_csv(headers_hor))
 
                         # Cells
                         for i, row in enumerate(self.rows_to_exp):
@@ -537,77 +539,28 @@ def run(self):
                                 row_to_exp.append(cell_text)
 
-                            csv_writer.writerow(row_to_exp)
+                            csv_writer.writerow(self.clean_text_csv(row_to_exp))
 
                             self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
-                # Parallel Concordancer
-                elif self.table.tab == 'concordancer_parallel':
-                    with open(self.file_path, 'w', encoding = encoding, newline = '') as f:
-                        csv_writer = csv.writer(f)
-
-                        # Horizontal Headers
-                        csv_writer.writerow([
-                            self.table.model().horizontalHeaderItem(col).text().strip()
+                    # Profiler
+                    else:
+                        # Horizontal headers
+                        headers_hor = [
+                            self.table.model().horizontalHeaderItem(col).text()
                             for col in cols
-                        ])
+                        ]
+
+                        csv_writer.writerow([''] + self.clean_text_csv(headers_hor))
 
-                        # Cells
+                        # Vertical headers and cells
                         for i, row in enumerate(self.rows_to_exp):
-                            row_to_exp = []
+                            row_to_exp = [self.table.model().verticalHeaderItem(row).text()]
 
                             for col in cols:
-                                if self.table.model().item(row, col):
-                                    cell_text = self.table.model().item(row, col).text()
-                                else:
-                                    cell_text = self.table.indexWidget(self.table.model().index(row, col)).text()
-                                    cell_text = wl_nlp_utils.html_to_text(cell_text)
-
-                                row_to_exp.append(cell_text)
+                                row_to_exp.append(self.table.model().item(row, col).text())
 
-                            csv_writer.writerow(row_to_exp)
+                            csv_writer.writerow(self.clean_text_csv(row_to_exp))
 
                             self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
-
-                else:
-                    with open(self.file_path, 'w', encoding = encoding, newline = '') as f:
-                        csv_writer = csv.writer(f)
-
-                        if self.table.header_orientation == 'hor':
-                            # Horizontal headers
-                            csv_writer.writerow([
-                                self.table.model().horizontalHeaderItem(col).text().strip()
-                                for col in cols
-                            ])
-
-                            # Cells
-                            for i, row in enumerate(self.rows_to_exp):
-                                row_to_exp = []
-
-                                for col in cols:
-                                    row_to_exp.append(self.table.model().item(row, col).text().strip())
-
-                                csv_writer.writerow(row_to_exp)
-
-                                self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
-                        else:
-                            # Horizontal headers
-                            csv_writer.writerow(
-                                ['']
-                                + [
-                                    self.table.model().horizontalHeaderItem(col).text().strip()
-                                    for col in cols
-                                ]
-                            )
-
-                            # Vertical headers & cells
-                            for i, row in enumerate(self.rows_to_exp):
-                                row_to_exp = [self.table.model().verticalHeaderItem(row).text().strip()]
-
-                                for col in cols:
-                                    row_to_exp.append(self.table.model().item(row, col).text().strip())
-
-                                csv_writer.writerow(row_to_exp)
-
-                                self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
 
             # Excel workbooks
             elif '*.xlsx' in self.file_type:
                 workbook = openpyxl.Workbook()
@@ -735,7 +688,7 @@ def run(self):
                     para_text = []
 
                     for col in range(3):
-                        para_text.append(self.table.indexWidget(self.table.model().index(row, col)).text())
+                        para_text.append(self.table.indexWidget(self.table.model().index(row, col)).text().strip())
 
                     # Zapping
                     if settings_concordancer['zapping']:
@@ -764,14 +717,15 @@ def run(self):
             # Parallel Concordancer
             elif self.table.tab == 'concordancer_parallel':
                 for i, row in enumerate(self.rows_to_exp):
+                    if i > 0:
+                        self.add_para(doc)
+
                     for col in range(2, self.table.model().columnCount()):
-                        para_text = self.table.indexWidget(self.table.model().index(row, col)).text()
+                        para_text = self.table.indexWidget(self.table.model().index(row, col)).text().strip()
 
                         para = self.add_para(doc)
                         self.style_para_rich_text(para, para_text, self.table.indexWidget(self.table.model().index(row, col)))
 
-                    self.add_para(doc)
-
                     self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
 
             # Add the last empty paragraph
@@ -789,6 +743,15 @@ def run(self):
 
         self.worker_done.emit(err_msg, self.file_path)
 
+    # Clean text before writing to CSV files
+    def clean_text_csv(self, items):
+        for i, item in enumerate(items):
+            items[i] = item.replace('\n', ' ')
+            items[i] = re.sub(r'\s+', ' ', items[i])
+            items[i] = items[i].strip()
+
+        return items
+
     # Remove invalid XML characters
     def remove_invalid_xml_chars(self, text):
         # openpyxl.cell.cell.ILLEGAL_CHARACTERS_RE is not complete
@@ -1081,9 +1044,9 @@ def __init__(self, parent, headers, col_edit = None):
         self.button_del = QPushButton(_tr('wl_tables', 'Remove'), self)
         self.button_clr = QPushButton(_tr('wl_tables', 'Clear'), self)
 
-        self.button_add.clicked.connect(lambda: self.add_row()) # pylint: disable=unnecessary-lambda
-        self.button_ins.clicked.connect(lambda: self.ins_row()) # pylint: disable=unnecessary-lambda
-        self.button_del.clicked.connect(lambda: self.del_row()) # pylint: disable=unnecessary-lambda
+        self.button_add.clicked.connect(lambda: self.add_row())
+        self.button_ins.clicked.connect(lambda: self.ins_row())
+        self.button_del.clicked.connect(lambda: self.del_row())
         self.button_clr.clicked.connect(lambda: self.clr_table(0))
 
     def item_changed(self):
@@ -1181,10 +1144,10 @@ def __init__(
         if not generate_fig:
             self.button_generate_fig.hide()
 
-        self.button_generate_table.clicked.connect(lambda: self.generate_table()) # pylint: disable=unnecessary-lambda
-        self.button_generate_fig.clicked.connect(lambda: self.generate_fig()) # pylint: disable=unnecessary-lambda
-        self.button_exp_selected_cells.clicked.connect(self.exp_selected_cells)
-        self.button_exp_all_cells.clicked.connect(lambda: self.exp_all_cells()) # pylint: disable=unnecessary-lambda
+        self.button_generate_table.clicked.connect(lambda: self.generate_table())
+        self.button_generate_fig.clicked.connect(lambda: self.generate_fig())
+        self.button_exp_selected_cells.clicked.connect(lambda: self.exp_selected_cells())
+        self.button_exp_all_cells.clicked.connect(lambda: self.exp_all_cells())
         self.button_clr_table.clicked.connect(lambda: self.clr_table(confirm = True))
 
         self.main.wl_file_area.table_files.model().itemChanged.connect(self.file_changed)
@@ -1871,6 +1834,7 @@ def __init__(
 
         self.button_results_search.setMinimumWidth(140)
 
+        self.button_generate_table.clicked.connect(self.dialog_results_search.clr_history)
         self.button_results_search.clicked.connect(self.dialog_results_search.load)
 
         self.results_changed()
@@ -1924,6 +1888,7 @@ def __init__(
         self.button_results_search.setMinimumWidth(140)
         self.button_results_sort.setMinimumWidth(140)
 
+        self.button_generate_table.clicked.connect(self.dialog_results_search.clr_history)
         self.button_results_search.clicked.connect(self.dialog_results_search.load)
         self.button_results_sort.clicked.connect(self.dialog_results_sort.show)
 
@@ -1976,6 +1941,7 @@ def __init__(
         self.button_results_filter.setMinimumWidth(140)
         self.button_results_search.setMinimumWidth(140)
 
+        self.button_generate_table.clicked.connect(self.dialog_results_search.clr_history)
         self.button_results_filter.clicked.connect(self.results_filter_clicked)
         self.button_results_search.clicked.connect(self.dialog_results_search.load)
 
diff --git a/wordless/wl_widgets/wl_widgets.py b/wordless/wl_widgets/wl_widgets.py
index d646b0372..a8974000d 100644
--- a/wordless/wl_widgets/wl_widgets.py
+++ b/wordless/wl_widgets/wl_widgets.py
@@ -338,10 +338,18 @@ def words_changed():
             checkbox_all_uppercase.setEnabled(False)
             checkbox_title_case.setEnabled(False)
 
+    def assign_pos_tags_changed():
+        if checkbox_assign_pos_tags.isChecked():
+            checkbox_ignore_tags.setEnabled(False)
+        else:
+            checkbox_ignore_tags.setEnabled(not checkbox_use_tags.isChecked())
+
     def ignore_tags_changed():
         if checkbox_ignore_tags.isChecked():
+            checkbox_assign_pos_tags.setEnabled(False)
             checkbox_use_tags.setEnabled(False)
         else:
+            checkbox_assign_pos_tags.setEnabled(True)
             checkbox_use_tags.setEnabled(True)
 
     def use_tags_changed():
@@ -350,7 +358,7 @@ def use_tags_changed():
             checkbox_ignore_tags.setEnabled(False)
         else:
             checkbox_apply_lemmatization.setEnabled(True)
-            checkbox_ignore_tags.setEnabled(True)
+            checkbox_ignore_tags.setEnabled(not checkbox_assign_pos_tags.isChecked())
 
     checkbox_words = QCheckBox(_tr('wl_widgets', 'Words'), parent)
     checkbox_all_lowercase = QCheckBox(_tr('wl_widgets', 'All lowercase'), parent)
@@ -368,6 +376,7 @@ def use_tags_changed():
     checkbox_use_tags = QCheckBox(_tr('wl_widgets', 'Use tags only'), parent)
 
     checkbox_words.stateChanged.connect(words_changed)
+    checkbox_assign_pos_tags.stateChanged.connect(assign_pos_tags_changed)
     checkbox_ignore_tags.stateChanged.connect(ignore_tags_changed)
     checkbox_use_tags.stateChanged.connect(use_tags_changed)
 
@@ -393,56 +402,33 @@ def use_tags_changed():
     )
 
 def wl_widgets_token_settings_concordancer(parent):
-    def ignore_tags_changed():
-        if checkbox_ignore_tags.isChecked():
-            checkbox_use_tags.setEnabled(False)
-        else:
-            checkbox_use_tags.setEnabled(True)
-
-    def use_tags_changed():
-        if checkbox_use_tags.isChecked():
+    def assign_pos_tags_changed():
+        if checkbox_assign_pos_tags.isChecked():
             checkbox_ignore_tags.setEnabled(False)
         else:
-            checkbox_ignore_tags.setEnabled(True)
+            checkbox_ignore_tags.setEnabled(not checkbox_use_tags.isChecked())
 
-    checkbox_punc_marks = QCheckBox(_tr('wl_widgets', 'Punctuation marks'), parent)
-
-    checkbox_assign_pos_tags = QCheckBox(_tr('wl_widgets', 'Assign part-of-speech tags'), parent)
-    checkbox_ignore_tags = QCheckBox(_tr('wl_widgets', 'Ignore tags'), parent)
-    checkbox_use_tags = QCheckBox(_tr('wl_widgets', 'Use tags only'), parent)
-
-    checkbox_ignore_tags.stateChanged.connect(ignore_tags_changed)
-    checkbox_use_tags.stateChanged.connect(use_tags_changed)
-
-    ignore_tags_changed()
-    use_tags_changed()
-
-    return (
-        checkbox_punc_marks,
-
-        checkbox_assign_pos_tags,
-        checkbox_ignore_tags,
-        checkbox_use_tags
-    )
-
-def wl_widgets_token_settings_concordancer1(parent):
     def ignore_tags_changed():
         if checkbox_ignore_tags.isChecked():
+            checkbox_assign_pos_tags.setEnabled(False)
             checkbox_use_tags.setEnabled(False)
         else:
+            checkbox_assign_pos_tags.setEnabled(True)
             checkbox_use_tags.setEnabled(True)
 
     def use_tags_changed():
         if checkbox_use_tags.isChecked():
             checkbox_ignore_tags.setEnabled(False)
         else:
-            checkbox_ignore_tags.setEnabled(True)
+            checkbox_ignore_tags.setEnabled(not checkbox_assign_pos_tags.isChecked())
 
     checkbox_punc_marks = QCheckBox(_tr('wl_widgets', 'Punctuation marks'), parent)
 
+    checkbox_assign_pos_tags = QCheckBox(_tr('wl_widgets', 'Assign part-of-speech tags'), parent)
     checkbox_ignore_tags = QCheckBox(_tr('wl_widgets', 'Ignore tags'), parent)
     checkbox_use_tags = QCheckBox(_tr('wl_widgets', 'Use tags only'), parent)
 
+    checkbox_assign_pos_tags.stateChanged.connect(assign_pos_tags_changed)
     checkbox_ignore_tags.stateChanged.connect(ignore_tags_changed)
     checkbox_use_tags.stateChanged.connect(use_tags_changed)
 
@@ -452,6 +438,7 @@ def use_tags_changed():
     return (
         checkbox_punc_marks,
 
+        checkbox_assign_pos_tags,
         checkbox_ignore_tags,
         checkbox_use_tags
     )
@@ -496,10 +483,7 @@ def token_settings_changed(token_settings = None):
         match_tags_changed()
 
     def match_without_tags_changed():
-        if checkbox_match_without_tags.isChecked():
-            checkbox_match_tags.setEnabled(False)
-        else:
-            checkbox_match_tags.setEnabled(True)
+        checkbox_match_tags.setEnabled(not checkbox_match_without_tags.isChecked())
 
     def match_tags_changed():
         if checkbox_match_tags.isChecked():