diff --git a/CHANGELOG.md b/CHANGELOG.md
index c21f5f2e0..b707bd15c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@
- Utils: Add PyThaiNLP's Han-solo
- Utils: Add Stanza's Sindhi part-of-speech tagger
- Utils: Add VADER's sentiment analyzers
+- Work Area: Add Collocation/Colligation Extractor - Filter results - Node length / Collocation length
- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic
### ✨ Improvements
@@ -40,7 +41,7 @@
### ❌ Removals
- Menu: Remove Settings - Measures - Statistical Significance - Welch's t-test
-- Work Area: Remove Collocation Extractor / Colligation Extractor / Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test
+- Work Area: Remove Collocation/Colligation/Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test
- Utils: Remove Dostoevsky's Russian sentiment analyzer
### ⏫ Dependency Changes
diff --git a/tests/test_dependency_parser.py b/tests/test_dependency_parser.py
index cea0d338a..1c61f2eb3 100644
--- a/tests/test_dependency_parser.py
+++ b/tests/test_dependency_parser.py
@@ -62,7 +62,7 @@ def update_gui(err_msg, results):
for (
head, dependent, dependency_relation, dependency_len,
- sentence_display, sentence_search,
+ sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search,
no_sentence, len_sentences, file
) in results:
# Head
@@ -75,8 +75,9 @@ def update_gui(err_msg, results):
assert isinstance(dependency_len, int)
# Sentence
- assert all(sentence_display)
- assert all(sentence_search)
+ assert all(sentence_tokens_raw)
+ assert all(sentence_tokens_fig)
+ assert all(sentence_tokens_search)
# Sentence No.
assert no_sentence >= 1
diff --git a/tests/tests_file_area/test_file_area_file_types.py b/tests/tests_file_area/test_file_area_file_types.py
index 04e1d3daf..82dde64d3 100644
--- a/tests/tests_file_area/test_file_area_file_types.py
+++ b/tests/tests_file_area/test_file_area_file_types.py
@@ -153,7 +153,7 @@ def update_gui_file_types(err_msg, new_files):
file_text = new_files[0]['text']
tokens = file_text.to_token_texts()
- tags = file_text.get_token_properties('tag')
+ tags = file_text.get_token_properties('tag', flat = True)
print(tokens)
@@ -192,7 +192,7 @@ def update_gui_file_types(err_msg, new_files):
file_text_tgt = new_files[1]['text']
tokens_src = file_text_src.to_token_texts()
- tags_src = file_text_src.get_token_properties('tag')
+ tags_src = file_text_src.get_token_properties('tag', flat = True)
# Source files
print(file_text_src.lang)
@@ -204,7 +204,7 @@ def update_gui_file_types(err_msg, new_files):
# Target files
tokens_tgt = file_text_tgt.to_token_texts()
- tags_tgt = file_text_tgt.get_token_properties('tag')
+ tags_tgt = file_text_tgt.get_token_properties('tag', flat = True)
print(file_text_tgt.lang)
print(tokens_tgt)
@@ -226,7 +226,7 @@ def update_gui_tags(err_msg, new_files):
file_text = new_files[0]['text']
tokens = file_text.to_token_texts()
- tags = file_text.get_token_properties('tag')
+ tags = file_text.get_token_properties('tag', flat = True)
print(tokens)
print(tags)
diff --git a/tests/tests_utils/test_detection.py b/tests/tests_utils/test_detection.py
index 08e22e436..c26fcbf17 100644
--- a/tests/tests_utils/test_detection.py
+++ b/tests/tests_utils/test_detection.py
@@ -290,11 +290,11 @@ def test_lingua():
langs_exceptions = {'bokmal', 'ganda', 'nynorsk', 'slovene'}
langs_extra = set()
- for lang in lingua.Language.all():
+ for lang in lingua.Language.all(): # pylint: disable=no-member
if lang.name.lower() not in langs | langs_exceptions:
langs_extra.add(lang.name)
- print(f"Extra languages: {', '.join(langs_extra)}\n")
+ print(f"\nExtra languages: {', '.join(langs_extra)}\n")
assert langs_extra == {'BOSNIAN', 'MAORI', 'SHONA', 'SOMALI', 'SOTHO', 'TSONGA', 'XHOSA'}
diff --git a/wordless/wl_colligation_extractor.py b/wordless/wl_colligation_extractor.py
index d556a4adf..6e95dd1c1 100644
--- a/wordless/wl_colligation_extractor.py
+++ b/wordless/wl_colligation_extractor.py
@@ -909,7 +909,7 @@ def run(self):
colligations_freqs_file = {}
colligations_freqs_file_all = {}
- text = wl_token_processing.wl_process_tokens(
+ text = wl_token_processing.wl_process_tokens_colligation_extractor(
self.main, file['text'],
token_settings = settings['token_settings']
)
@@ -976,23 +976,21 @@ def run(self):
tags_left = []
tags_right = []
- tags = wl_texts.to_tokens(wl_texts.get_token_properties(tokens, 'tag'), lang = file['lang'])
-
if window_left < 0 < window_right:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
- tags_left = tags[max(0, i + window_left) : i]
- tags_right = tags[i + ngram_size : i + ngram_size + window_right]
+ tags_left = text.tags[max(0, i + window_left) : i]
+ tags_right = text.tags[i + ngram_size : i + ngram_size + window_right]
else:
# Span positions (Left)
for position in range(max(0, i + window_left), i):
if i_unit_start <= position <= i_unit_end:
- tags_left.append(tags[position])
+ tags_left.append(text.tags[position])
# Span positions (Right)
for position in range(i + ngram_size, i + ngram_size + window_right):
if i_unit_start <= position <= i_unit_end:
- tags_right.append(tags[position])
+ tags_right.append(text.tags[position])
for j, collocate in enumerate(reversed(tags_left)):
if wl_matching.check_context(
@@ -1024,12 +1022,12 @@ def run(self):
elif window_left < 0 and window_right < 0:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
- tags_left = tags[max(0, i + window_left) : max(0, i + window_right + 1)]
+ tags_left = text.tags[max(0, i + window_left) : max(0, i + window_right + 1)]
else:
# Span positions (Left)
for position in range(max(0, i + window_left), max(0, i + window_right + 1)):
if i_unit_start <= position <= i_unit_end:
- tags_left.append(tags[position])
+ tags_left.append(text.tags[position])
for j, collocate in enumerate(reversed(tags_left)):
if wl_matching.check_context(
@@ -1047,12 +1045,12 @@ def run(self):
elif window_left > 0 and window_right > 0:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
- tags_right = tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right]
+ tags_right = text.tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right]
else:
# Span positions (Right)
for position in range(i + ngram_size + window_left - 1, i + ngram_size + window_right):
if i_unit_start <= position <= i_unit_end:
- tags_right.append(tags[position])
+ tags_right.append(text.tags[position])
for j, collocate in enumerate(tags_right):
if wl_matching.check_context(
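The colligation extractor now slices the precomputed `text.tags` list (built once in `wl_process_tokens_colligation_extractor`) instead of rebuilding a tag list for every node. A minimal sketch of the spanning-window case (`window_left < 0 < window_right`) above, with plain strings standing in for tag tokens and illustrative positions:

```python
# A sketch of the left/right window slicing; i is the node position and
# ngram_size is the node length, following the conventions in the diff
tags = ['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT']
i = 3             # node starts at tags[3]
ngram_size = 1    # single-token node
window_left = -2  # two positions to the left of the node
window_right = 2  # two positions to the right of the node

tags_left = tags[max(0, i + window_left) : i]
tags_right = tags[i + ngram_size : i + ngram_size + window_right]

print(tags_left)   # ['ADJ', 'NOUN']
print(tags_right)  # ['DET', 'NOUN']
```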
diff --git a/wordless/wl_collocation_extractor.py b/wordless/wl_collocation_extractor.py
index 5eeb9ea27..643b46d42 100644
--- a/wordless/wl_collocation_extractor.py
+++ b/wordless/wl_collocation_extractor.py
@@ -906,7 +906,7 @@ def run(self):
collocations_freqs_file = {}
collocations_freqs_file_all = {}
- text = wl_token_processing.wl_process_tokens(
+ text = wl_token_processing.wl_process_tokens_ngram_generator(
self.main, file['text'],
token_settings = settings['token_settings']
)
diff --git a/wordless/wl_concordancer.py b/wordless/wl_concordancer.py
index 446e8fc09..501b7c737 100644
--- a/wordless/wl_concordancer.py
+++ b/wordless/wl_concordancer.py
@@ -727,9 +727,10 @@ def run(self):
no_sentence = bisect.bisect(offsets_sentences, i)
no_para = bisect.bisect(offsets_paras, i)
- # Search in Results (Node)
- node_tokens_search = list(ngram)
- node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(ngram))
+ node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
+ ngram,
+ punc_mark = True
+ ))
# Width Unit
if settings['generation_settings']['width_unit'] == self.tr('Character'):
@@ -811,16 +812,46 @@ def run(self):
left_tokens_raw = tokens[offset_start:i]
right_tokens_raw = tokens[i + len_search_term : offset_end]
- # Search in results (Left & Right)
- left_tokens_search = copy.deepcopy(left_tokens_raw)
- right_tokens_search = copy.deepcopy(right_tokens_raw)
+ if settings['token_settings']['punc_marks']:
+ node_tokens_search = list(ngram)
+
+ # Remove empty tokens for searching in results
+ left_tokens_search = [token for token in copy.deepcopy(left_tokens_raw) if token]
+ right_tokens_search = [token for token in copy.deepcopy(right_tokens_raw) if token]
+ # Convert trailing punctuation marks, if any, to separate tokens for searching
+ else:
+ node_tokens_search = []
+ left_tokens_search = []
+ right_tokens_search = []
+
+ for token in list(ngram):
+ node_tokens_search.append(token)
+
+ if token.punc_mark:
+ node_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))
- # Remove empty tokens for searching in results
- left_tokens_search = [token for token in left_tokens_search if token]
- right_tokens_search = [token for token in right_tokens_search if token]
+ for token in copy.deepcopy(left_tokens_raw):
+ if token:
+ left_tokens_search.append(token)
- left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(left_tokens_raw))
- right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(right_tokens_raw))
+ if token.punc_mark:
+ left_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))
+
+ for token in copy.deepcopy(right_tokens_raw):
+ if token:
+ right_tokens_search.append(token)
+
+ if token.punc_mark:
+ right_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))
+
+ left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
+ left_tokens_raw,
+ punc_mark = True
+ ))
+ right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
+ right_tokens_raw,
+ punc_mark = True
+ ))
# Left
concordance_line.append([left_tokens_raw, left_tokens_search])
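A minimal sketch of the trailing-punctuation handling introduced above; `Token` is a hypothetical stand-in for `wl_texts.Wl_Token`, keeping only the attributes the logic touches:

```python
# When punctuation marks are merged into tokens, trailing punctuation is
# split back out as separate tokens so that searches for "," or "!" match
class Token(str):
    def __new__(cls, text, lang = 'eng_us', punc_mark = None):
        token = super().__new__(cls, text)
        token.lang = lang
        token.punc_mark = punc_mark

        return token

tokens_raw = [Token('Hello', punc_mark = ','), Token('world', punc_mark = '!')]
tokens_search = []

for token in tokens_raw:
    tokens_search.append(token)

    if token.punc_mark:
        tokens_search.append(Token(token.punc_mark, lang = token.lang))

print(tokens_search)  # ['Hello', ',', 'world', '!']
```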
diff --git a/wordless/wl_concordancer_parallel.py b/wordless/wl_concordancer_parallel.py
index d4de43b4a..f60cfb47a 100644
--- a/wordless/wl_concordancer_parallel.py
+++ b/wordless/wl_concordancer_parallel.py
@@ -408,9 +408,10 @@ def run(self):
parallel_unit_no = bisect.bisect(offsets_paras, j)
if parallel_unit_no not in parallel_units:
+                            # Save all nodes in case multiple nodes are found in the same parallel unit
parallel_units[parallel_unit_no] = [[] for _ in range(len_files)]
- parallel_units[parallel_unit_no][i] = ngram
+ parallel_units[parallel_unit_no][i].append(ngram)
# Search for additions & deletions
else:
for j, para in enumerate(text.tokens_multilevel):
@@ -428,22 +429,37 @@ def run(self):
len_parallel_units = len(offsets_paras)
for parallel_unit_no, parallel_unit_nodes in parallel_units.items():
- node = parallel_unit_nodes[i]
+ nodes = parallel_unit_nodes[i]
if parallel_unit_no <= len_parallel_units:
- parallel_unit_tokens_raw = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))
- parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(parallel_unit_tokens_raw))
- # Search in Results
- parallel_unit_tokens_search = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))
-
- # Highlight node if found
- if node:
- len_node = len(node)
-
- for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit_tokens_search, len_node)):
- if ngram == tuple(node):
-                            parallel_unit_tokens_raw[j] = f'<span style="color: #F00; font-weight: bold;">{parallel_unit_tokens_raw[j]}'
-                            parallel_unit_tokens_raw[j + len_node - 1] += '</span>'
+ parallel_unit = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))
+
+ if settings['token_settings']['punc_marks']:
+ parallel_unit_tokens_search = copy.deepcopy(parallel_unit)
+ # Convert trailing punctuation marks, if any, to separate tokens for searching
+ else:
+ parallel_unit_tokens_search = []
+
+ for token in copy.deepcopy(parallel_unit):
+ parallel_unit_tokens_search.append(token)
+
+ if token.punc_mark:
+ parallel_unit_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))
+
+ parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
+ parallel_unit,
+ punc_mark = True
+ ))
+
+ # Highlight nodes if found
+ if nodes:
+ for node in nodes:
+ len_node = len(node)
+
+ for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit, len_node)):
+ if ngram == tuple(node):
+                            parallel_unit_tokens_raw[j] = f'<span style="color: #F00; font-weight: bold;">{parallel_unit_tokens_raw[j]}'
+                            parallel_unit_tokens_raw[j + len_node - 1] += '</span>'
else:
parallel_unit_tokens_raw = []
parallel_unit_tokens_search = []
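The highlighting loop above matches every occurrence of every node against a sliding n-gram window over the parallel unit, which is what lets multiple nodes in one unit all be highlighted. A self-contained sketch, with `ngrams()` assumed to behave like `wl_nlp_utils.ngrams`:

```python
# A plain sliding-window stand-in for wl_nlp_utils.ngrams
def ngrams(tokens, n):
    return zip(*(tokens[i:] for i in range(n)))

parallel_unit = ['the', 'quick', 'brown', 'fox', 'the', 'quick', 'dog']
nodes = [['the', 'quick']]

for node in nodes:
    len_node = len(node)

    # Every window position is checked, so both occurrences are found
    for j, ngram in enumerate(ngrams(parallel_unit, len_node)):
        if ngram == tuple(node):
            print(f'Node found at tokens {j}-{j + len_node - 1}')

# Node found at tokens 0-1
# Node found at tokens 4-5
```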
diff --git a/wordless/wl_dependency_parser.py b/wordless/wl_dependency_parser.py
index fde297ae4..5dba30661 100644
--- a/wordless/wl_dependency_parser.py
+++ b/wordless/wl_dependency_parser.py
@@ -29,7 +29,7 @@
from wordless.wl_checks import wl_checks_work_area
from wordless.wl_dialogs import wl_dialogs_misc
-from wordless.wl_nlp import wl_dependency_parsing, wl_matching, wl_token_processing
+from wordless.wl_nlp import wl_dependency_parsing, wl_matching, wl_texts, wl_token_processing
from wordless.wl_utils import wl_misc, wl_threading
from wordless.wl_widgets import wl_labels, wl_layouts, wl_tables, wl_widgets
@@ -383,7 +383,7 @@ def update_gui_table(self, err_msg, results):
for i, (
head, dependent, dependency_relation, dependency_len,
- sentence_tokens_raw, sentence_tokens_search,
+ sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search,
no_sentence, len_sentences, file
) in enumerate(results):
# Head
@@ -394,7 +394,7 @@ def update_gui_table(self, err_msg, results):
# Dependency Relation
self.model().setItem(i, 2, wl_tables.Wl_Table_Item(dependency_relation))
- # Dependency Distance
+ # Dependency Length
self.set_item_num(i, 3, dependency_len)
self.set_item_num(i, 4, numpy.abs(dependency_len))
@@ -404,6 +404,7 @@ def update_gui_table(self, err_msg, results):
wl_labels.Wl_Label_Html(' '.join(sentence_tokens_raw), self.main)
)
self.indexWidget(self.model().index(i, 5)).tokens_raw = sentence_tokens_raw
+ self.indexWidget(self.model().index(i, 5)).tokens_fig = sentence_tokens_fig
self.indexWidget(self.model().index(i, 5)).tokens_search = sentence_tokens_search
# Sentence No.
@@ -432,7 +433,7 @@ def generate_fig(self):
fig_settings = self.main.settings_custom['dependency_parser']['fig_settings']
for row in self.get_selected_rows():
- sentence = tuple(self.model().item(row, 5).tokens_search)
+ sentence = tuple(self.indexWidget(self.model().index(row, 5)).tokens_fig)
if sentence not in sentences_rendered:
for file in self.settings['file_area']['files_open']:
@@ -512,11 +513,11 @@ def run(self):
if any((token in search_terms for token in sentence)):
dependencies = [
- (token, token.head, token.dependency_relation, token.dependency_len)
+ (token, token.head, token.dependency_relation)
for token in sentence
]
- for i, (token, head, dependency_relation, dependency_len) in enumerate(dependencies):
+ for i, (token, head, dependency_relation) in enumerate(dependencies):
j = i_token + i
if (
@@ -535,25 +536,46 @@ def run(self):
# Sentence
sentence_tokens_raw = []
-
- for sentence_token in sentence:
- if sentence_token == head:
+ sentence_tokens_fig = []
+            # Calculate dependency length based on the modified tokens
+ i_head = -1
+ i_dependent = -1
+
+ # Highlight heads and dependents
+ for i, sentence_token in enumerate(sentence):
+ if sentence_token is head:
sentence_tokens_raw.append(f'''
-                            <span style="color: #F00; font-weight: bold;">{sentence_token.display_text()}</span>
+                            <span style="color: #F00; font-weight: bold;">{sentence_token.display_text(punc_mark = True)}</span>
''')
- elif sentence_token == token:
+
+ i_head = i
+ elif sentence_token is token:
sentence_tokens_raw.append(f'''
-                            <span style="color: #00F; font-weight: bold;">{sentence_token.display_text()}</span>
+                            <span style="color: #00F; font-weight: bold;">{sentence_token.display_text(punc_mark = True)}</span>
''')
+
+ i_dependent = i
else:
- sentence_tokens_raw.append(sentence_token.display_text())
+ sentence_tokens_raw.append(sentence_token.display_text(punc_mark = True))
+
+ sentence_tokens_fig.append(copy.deepcopy(sentence_token))
+
+ if settings['token_settings']['punc_marks']:
+ # Remove empty tokens for searching in results
+ sentence_tokens_search = [token for token in copy.deepcopy(sentence) if token]
+ # Convert trailing punctuation marks, if any, to separate tokens for searching
+ else:
+ sentence_tokens_search = []
+
+ for sentence_token in copy.deepcopy(sentence):
+ sentence_tokens_search.append(sentence_token)
- # Remove empty tokens for searching in results
- sentence_tokens_search = [token for token in sentence if token]
+ if sentence_token.punc_mark:
+ sentence_tokens_search.append(wl_texts.Wl_Token(sentence_token.punc_mark, lang = sentence_token.lang))
# Head
results[-1].append(head.display_text())
@@ -561,10 +583,10 @@ def run(self):
results[-1].append(token.display_text())
# Dependency Relation
results[-1].append(dependency_relation)
- # Dependency Distance
- results[-1].append(dependency_len)
+ # Dependency Length
+ results[-1].append(i_head - i_dependent)
# Sentence
- results[-1].extend([sentence_tokens_raw, sentence_tokens_search])
+ results[-1].extend([sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search])
# Sentence No.
results[-1].extend([no_sentence, len_sentences])
# File
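Dependency length is now recomputed from token positions in the (possibly modified) sentence instead of being read off the parser, so it stays consistent after tokens are filtered or replaced. A toy illustration of the signed value and the absolute value shown in the adjacent column:

```python
# The recomputed dependency length is the signed distance between the
# head's and the dependent's positions in the sentence
sentence = ['I', 'saw', 'a', 'cat']
i_head = sentence.index('saw')       # position of the head
i_dependent = sentence.index('cat')  # position of the dependent

dependency_len = i_head - i_dependent
print(dependency_len)       # -2: the head precedes the dependent
print(abs(dependency_len))  # 2: the absolute length
```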
diff --git a/wordless/wl_keyword_extractor.py b/wordless/wl_keyword_extractor.py
index fdd6f65c1..2d7321e4f 100644
--- a/wordless/wl_keyword_extractor.py
+++ b/wordless/wl_keyword_extractor.py
@@ -693,7 +693,7 @@ def run(self):
tokens_ref = []
for file_ref in files_ref:
- text = wl_token_processing.wl_process_tokens(
+ text = wl_token_processing.wl_process_tokens_ngram_generator(
self.main, file_ref['text'],
token_settings = settings['token_settings']
)
@@ -709,7 +709,7 @@ def run(self):
# Frequency (Observed files)
for file_observed in files_observed:
- text = wl_token_processing.wl_process_tokens(
+ text = wl_token_processing.wl_process_tokens_ngram_generator(
self.main, file_observed['text'],
token_settings = settings['token_settings']
)
diff --git a/wordless/wl_ngram_generator.py b/wordless/wl_ngram_generator.py
index 6e4564bd0..e68a3cc02 100644
--- a/wordless/wl_ngram_generator.py
+++ b/wordless/wl_ngram_generator.py
@@ -781,7 +781,7 @@ def run(self):
for file in files:
ngrams_is = []
- text = wl_token_processing.wl_process_tokens(
+ text = wl_token_processing.wl_process_tokens_ngram_generator(
self.main, file['text'],
token_settings = settings['token_settings']
)
diff --git a/wordless/wl_nlp/wl_dependency_parsing.py b/wordless/wl_nlp/wl_dependency_parsing.py
index b85d915ce..6f99d1a23 100644
--- a/wordless/wl_nlp/wl_dependency_parsing.py
+++ b/wordless/wl_nlp/wl_dependency_parsing.py
@@ -404,10 +404,7 @@ def wl_dependency_parse_fig_tokens(
):
htmls = []
- if inputs and isinstance(list(inputs)[0], wl_texts.Wl_Token):
- inputs, token_properties = wl_texts.split_texts_properties(inputs)
- else:
- token_properties = []
+ inputs, token_properties = wl_texts.split_texts_properties(inputs)
options = {
'fine_grained': show_fine_grained_pos_tags,
@@ -437,11 +434,12 @@ def wl_dependency_parse_fig_tokens(
if show_in_separate_tab:
for doc in nlp.pipe(docs):
for sentence in doc.sents:
- displacy_dict = spacy.displacy.parse_deps(sentence.as_doc(), options = options)
+ displacy_dict = spacy.displacy.parse_deps(sentence, options = options)
if token_properties:
for token, word in zip(sentence, displacy_dict['words']):
- word['text'] += token_properties[i_tag_start + token.i]
+ properties = token_properties[i_tag_start + token.i]
+ word['text'] += (properties['punc_mark'] or '') + (properties['tag'] or '')
htmls.append(spacy.displacy.render(
displacy_dict,
@@ -458,12 +456,12 @@ def wl_dependency_parse_fig_tokens(
for doc in nlp.pipe(docs):
for sentence in doc.sents:
- displacy_dict = spacy.displacy.parse_deps(sentence.as_doc(), options = options)
+ displacy_dict = spacy.displacy.parse_deps(sentence, options = options)
if token_properties:
for token, word in zip(sentence, displacy_dict['words']):
properties = token_properties[i_tag_start + token.i]
- word['text'] += (properties['punc_mark'] or '') + (properties['tag'])
+ word['text'] += (properties['punc_mark'] or '') + (properties['tag'] or '')
sentences.append(displacy_dict)
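A sketch of the displaCy label augmentation above, assuming spaCy with the `en_core_web_sm` model installed; `token_properties` here is a hypothetical stand-in for the list returned by `wl_texts.split_texts_properties()`:

```python
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('The quick brown fox jumps over the lazy dog.')

# Hypothetical per-token properties stripped off Wl_Token objects
token_properties = [{'punc_mark': None, 'tag': f'_{token.tag_}'} for token in doc]

for sentence in doc.sents:
    # parse_deps() accepts the Span directly, so as_doc() is no longer needed
    displacy_dict = spacy.displacy.parse_deps(sentence)

    # Append the stored punctuation marks and tags back onto the word labels
    for token, word in zip(sentence, displacy_dict['words']):
        properties = token_properties[token.i]
        word['text'] += (properties['punc_mark'] or '') + (properties['tag'] or '')

    html = spacy.displacy.render(displacy_dict, style = 'dep', manual = True)
```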
diff --git a/wordless/wl_nlp/wl_texts.py b/wordless/wl_nlp/wl_texts.py
index 05dc26082..d30b526bf 100644
--- a/wordless/wl_nlp/wl_texts.py
+++ b/wordless/wl_nlp/wl_texts.py
@@ -58,8 +58,11 @@ def __hash__(self):
def __eq__(self, other):
return self.display_text() == other.display_text()
- def display_text(self):
- return str(self) + (self.punc_mark or '') + (self.tag or '')
+ def display_text(self, punc_mark = False):
+ if punc_mark:
+ return str(self) + (self.punc_mark or '') + (self.tag or '')
+ else:
+ return str(self) + (self.tag or '')
def update_properties(self, token):
self.lang = token.lang
@@ -134,8 +137,8 @@ def combine_texts_properties(texts, token_properties):
def to_token_texts(tokens):
return [str(token) for token in tokens]
-def to_display_texts(tokens):
- return [token.display_text() for token in tokens]
+def to_display_texts(tokens, punc_mark = False):
+ return [token.display_text(punc_mark = punc_mark) for token in tokens]
def set_token_text(token, text):
_, token_properties = split_texts_properties([token])
@@ -148,11 +151,18 @@ def set_token_texts(tokens, texts):
for i, token in enumerate(combine_texts_properties(texts, token_properties)):
tokens[i] = token
+def has_token_properties(tokens, name):
+ for token in tokens:
+ if getattr(token, name) is not None:
+ return True
+
+ return False
+
def get_token_properties(tokens, name):
return [getattr(token, name) for token in tokens]
def set_token_properties(tokens, name, vals):
- if isinstance(vals, str):
+ if isinstance(vals, str) or vals is None:
vals = [vals] * len(tokens)
for token, val in zip(tokens, vals):
@@ -398,39 +408,45 @@ def get_tokens_flat(self):
return list(wl_misc.flatten_list(self.tokens_multilevel))
def set_tokens(self, tokens):
- i_start_token = 0
+ i_token = 0
for para in self.tokens_multilevel:
for sentence in para:
for sentence_seg in sentence:
for i, _ in enumerate(sentence_seg):
- sentence_seg[i] = tokens[i_start_token + i]
+ sentence_seg[i] = tokens[i_token]
- i_start_token += len(sentence_seg)
+ i_token += 1
- def to_token_texts(self):
- return [
- [
+ def to_token_texts(self, flat = False):
+ if flat:
+ return to_token_texts(self.get_tokens_flat())
+ else:
+ return [
[
- [str(token) for token in sentence_seg]
- for sentence_seg in sentence
+ [
+ [str(token) for token in sentence_seg]
+ for sentence_seg in sentence
+ ]
+ for sentence in para
]
- for sentence in para
+ for para in self.tokens_multilevel
]
- for para in self.tokens_multilevel
- ]
- def to_display_texts(self):
- return [
- [
+ def to_display_texts(self, punc_mark = False, flat = False):
+ if flat:
+            return to_display_texts(self.get_tokens_flat(), punc_mark = punc_mark)
+ else:
+ return [
[
- [token.display_text() for token in sentence_seg]
- for sentence_seg in sentence
+ [
+ [token.display_text(punc_mark = punc_mark) for token in sentence_seg]
+ for sentence_seg in sentence
+ ]
+ for sentence in para
]
- for sentence in para
+ for para in self.tokens_multilevel
]
- for para in self.tokens_multilevel
- ]
def set_token_texts(self, texts):
tokens = self.get_tokens_flat()
@@ -440,11 +456,26 @@ def set_token_texts(self, texts):
self.set_tokens(tokens)
- def get_token_properties(self, name):
- return [getattr(token, name) for token in self.get_tokens_flat()]
+ def has_token_properties(self, name):
+ return has_token_properties(self.get_tokens_flat(), name)
+
+ def get_token_properties(self, name, flat = False):
+ if flat:
+ return get_token_properties(self.get_tokens_flat(), name)
+ else:
+ return [
+ [
+ [
+ [getattr(token, name) for token in sentence_seg]
+ for sentence_seg in sentence
+ ]
+ for sentence in para
+ ]
+ for para in self.tokens_multilevel
+ ]
def set_token_properties(self, name, vals):
- if isinstance(vals, str):
+ if isinstance(vals, str) or vals is None:
vals = [vals] * self.num_tokens
i_val = 0
@@ -458,15 +489,15 @@ def set_token_properties(self, name, vals):
i_val += 1
def update_token_properties(self, tokens):
- i_start_token = 0
+ i_token = 0
for para in self.tokens_multilevel:
for sentence in para:
for sentence_seg in sentence:
- for i, token in enumerate(sentence_seg):
- token.update_properties(tokens[i_start_token + i])
+ for token in sentence_seg:
+ token.update_properties(tokens[i_token])
- i_start_token += len(sentence_seg)
+ i_token += 1
def get_offsets(self):
offsets_paras = []
@@ -566,11 +597,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called
]
# Remove empty tokens and whitespace around tokens
- self.tokens_multilevel[0][0][0] = [
- token_clean
- for token in self.tokens_multilevel[0][0][0]
- if (token_clean := token.strip())
- ]
+ self.tokens_multilevel[0][0][0] = clean_texts(self.tokens_multilevel[0][0][0])
self.tokens_multilevel[0][0][0] = to_tokens(self.tokens_multilevel[0][0][0], self.lang)
self.num_tokens = len(self.get_tokens_flat())
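The `punc_mark` flag on `display_text()` is the behavioral core of this file's changes: trailing punctuation is only rendered on request, while tags are always appended. A stand-in sketch mirroring the logic above (the real `Wl_Token` carries more properties):

```python
class Token(str):
    def __new__(cls, text, tag = None, punc_mark = None):
        token = super().__new__(cls, text)
        token.tag = tag
        token.punc_mark = punc_mark

        return token

    def display_text(self, punc_mark = False):
        if punc_mark:
            return str(self) + (self.punc_mark or '') + (self.tag or '')
        else:
            return str(self) + (self.tag or '')

token = Token('fox', tag = '_NN', punc_mark = ',')
print(token.display_text())                  # fox_NN
print(token.display_text(punc_mark = True))  # fox,_NN
```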
diff --git a/wordless/wl_nlp/wl_token_processing.py b/wordless/wl_nlp/wl_token_processing.py
index 022a36a4c..95d3f5279 100644
--- a/wordless/wl_nlp/wl_token_processing.py
+++ b/wordless/wl_nlp/wl_token_processing.py
@@ -25,6 +25,66 @@
)
from wordless.wl_utils import wl_misc
+# Assign part-of-speech tags
+def text_pos_tag(main, text, settings):
+ if settings['assign_pos_tags'] and not text.tagged:
+ tokens = wl_pos_tagging.wl_pos_tag(
+ main,
+ inputs = text.get_tokens_flat(),
+ lang = text.lang
+ )
+
+ text.update_token_properties(tokens)
+
+# Syllable tokenization
+def text_syl_tokenize(main, text):
+ tokens = wl_syl_tokenization.wl_syl_tokenize(
+ main,
+ inputs = text.get_tokens_flat(),
+ lang = text.lang,
+ )
+
+ text.update_token_properties(tokens)
+
+# Ignore tags
+def text_ignore_tags(text, settings):
+ if settings['ignore_tags']:
+ text.set_token_properties('tag', None)
+
+# Use tags only
+def text_use_tags_only(text, settings):
+ if settings['use_tags']:
+ # Calculate head references
+ if text.has_token_properties('head'):
+ head_refs = []
+
+ for i_para, para in enumerate(text.tokens_multilevel):
+ for i_sentence, sentence in enumerate(para):
+ for sentence_seg in sentence:
+ for token in sentence_seg:
+ head = token.head
+
+                            for i_sentence_seg, sentence_seg_head in enumerate(sentence):
+                                for i_token, token_head in enumerate(sentence_seg_head):
+                                    if head is token_head:
+ head_refs.append((i_para, i_sentence, i_sentence_seg, i_token))
+
+ text.set_token_texts(text.get_token_properties('tag', flat = True))
+ text.set_token_properties('tag', None)
+
+ # Update head references
+ if text.has_token_properties('head'):
+ i_token = 0
+
+ for para in text.tokens_multilevel:
+ for sentence in para:
+ for sentence_seg in sentence:
+ for token in sentence_seg:
+ refs = head_refs[i_token]
+ token.head = text.tokens_multilevel[refs[0]][refs[1]][refs[2]][refs[3]]
+
+ i_token += 1
+
def wl_process_tokens(main, text, token_settings):
settings = copy.deepcopy(token_settings)
@@ -33,20 +93,10 @@ def wl_process_tokens(main, text, token_settings):
settings['all_uppercase'] = False
settings['title_case'] = False
- if settings['ignore_tags']:
- settings['use_tags'] = False
- elif settings['use_tags']:
+ if settings['use_tags']:
settings['apply_lemmatization'] = False
- settings['ignore_tags'] = False
- # Assign part-of-speech tags
- if settings['assign_pos_tags'] and not text.tagged:
- tokens = wl_pos_tagging.wl_pos_tag(
- main,
- inputs = text.get_tokens_flat(),
- lang = text.lang
- )
- text.update_token_properties(tokens)
+ text_pos_tag(main, text, token_settings)
# Apply lemmatization
if settings['apply_lemmatization']:
@@ -153,16 +203,17 @@ def wl_process_tokens(main, text, token_settings):
# Replace tokens with their lemmas
if settings['apply_lemmatization']:
- text_modified.set_token_texts(text_modified.get_token_properties('lemma'))
+ text_modified.set_token_texts(text_modified.get_token_properties('lemma', flat = True))
- # Ignore tags
- if settings['ignore_tags']:
- text_modified.set_token_properties('tag', '')
+ text_modified.update_num_tokens()
- # Use tags only
- if settings['use_tags']:
- text_modified.set_token_texts(text_modified.get_token_properties('tag'))
- text_modified.set_token_properties('tag', '')
+ return text_modified
+
+def wl_process_tokens_ngram_generator(main, text, token_settings):
+ text_modified = wl_process_tokens(main, text, token_settings)
+
+ text_ignore_tags(text_modified, token_settings)
+ text_use_tags_only(text_modified, token_settings)
text_modified.update_num_tokens()
@@ -206,15 +257,9 @@ def wl_process_tokens_profiler(main, text, token_settings):
# Punctuation marks must be preserved for some readability measures (e.g. Wheeler & Smith's Readability Formula)
text.tokens_multilevel_with_puncs = copy.deepcopy(text.tokens_multilevel)
- # Syllable tokenization
- tokens = wl_syl_tokenization.wl_syl_tokenize(
- main,
- inputs = text.get_tokens_flat(),
- lang = text.lang,
- )
- text.update_token_properties(tokens)
+ text_syl_tokenize(main, text)
- text_modified = wl_process_tokens(main, text, token_settings)
+ text_modified = wl_process_tokens_ngram_generator(main, text, token_settings)
text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel)
text_modified.update_num_tokens()
@@ -223,14 +268,7 @@ def wl_process_tokens_profiler(main, text, token_settings):
def wl_process_tokens_concordancer(main, text, token_settings, preserve_blank_lines = False):
settings = copy.deepcopy(token_settings)
- # Assign part-of-speech tags
- if settings['assign_pos_tags'] and not text.tagged:
- tokens = wl_pos_tagging.wl_pos_tag(
- main,
- inputs = text.get_tokens_flat(),
- lang = text.lang
- )
- text.update_token_properties(tokens)
+ text_pos_tag(main, text, token_settings)
text_modified = copy.deepcopy(text)
@@ -272,14 +310,8 @@ def wl_process_tokens_concordancer(main, text, token_settings, preserve_blank_li
if not preserve_blank_lines:
text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel, empty_tokens = False)
- # Ignore tags
- if settings['ignore_tags']:
- text_modified.set_token_properties('tag', '')
-
- # Use tags only
- if settings['use_tags']:
- text_modified.set_token_texts(text_modified.get_token_properties('tag'))
- text_modified.set_token_properties('tag', '')
+ text_ignore_tags(text_modified, token_settings)
+ text_use_tags_only(text_modified, token_settings)
text_modified.update_num_tokens()
@@ -302,17 +334,32 @@ def wl_process_tokens_dependency_parser(main, text, token_settings):
return wl_process_tokens_concordancer(main, text, token_settings)
def wl_process_tokens_wordlist_generator(main, text, token_settings, generation_settings):
- # Syllable tokenization
+ # Syllabification
if generation_settings['syllabification']:
- tokens = wl_syl_tokenization.wl_syl_tokenize(
- main,
- inputs = text.get_tokens_flat(),
- lang = text.lang,
- )
- text.update_token_properties(tokens)
+ text_syl_tokenize(main, text)
- text_modified = wl_process_tokens(main, text, token_settings)
+ text_modified = wl_process_tokens_ngram_generator(main, text, token_settings)
text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel)
text_modified.update_num_tokens()
return text_modified
+
+def wl_process_tokens_colligation_extractor(main, text, token_settings):
+    # Do not modify the shared token settings in place, as adding new options to them would clear the user's custom settings
+ settings = copy.deepcopy(token_settings)
+ # Always assign part-of-speech tags
+ settings['assign_pos_tags'] = True
+
+ text_modified = wl_process_tokens(main, text, settings)
+
+ text_modified.tags = wl_texts.to_tokens(
+ text_modified.get_token_properties('tag', flat = True),
+ lang = text.lang
+ )
+
+ text_ignore_tags(text_modified, token_settings)
+ text_use_tags_only(text_modified, token_settings)
+
+ text_modified.update_num_tokens()
+
+ return text_modified
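`text_use_tags_only()` has to repair head references because replacing token texts creates new token objects, so heads recorded by object identity would dangle. The diff records each head's position first, then re-resolves it against the replacements. A simplified flat-list sketch of the same technique (the real code records four-level `(i_para, i_sentence, i_sentence_seg, i_token)` tuples):

```python
class Token(str):
    def __new__(cls, text, head = None):
        token = super().__new__(cls, text)
        token.head = head

        return token

saw = Token('saw')
tokens = [Token('I', head = saw), saw, Token('cats', head = saw)]
tokens[1].head = tokens[1]  # the root is its own head

# Record head positions by object identity while the originals are in place
head_refs = [
    next(i for i, candidate in enumerate(tokens) if token.head is candidate)
    for token in tokens
]

# Replace every token, e.g. with its part-of-speech tag
tokens = [Token(tag) for tag in ['PRON', 'VERB', 'NOUN']]

# Re-point every head at the replacement object
for token, i_head in zip(tokens, head_refs):
    token.head = tokens[i_head]

print([(str(token), str(token.head)) for token in tokens])
# [('PRON', 'VERB'), ('VERB', 'VERB'), ('NOUN', 'VERB')]
```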
diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py
index e4b58f2f8..26fbb29b8 100644
--- a/wordless/wl_profiler.py
+++ b/wordless/wl_profiler.py
@@ -1252,7 +1252,7 @@ def run(self):
for sentence_seg in sentence
]
- syls_tokens = text.get_token_properties('syls')
+ syls_tokens = text.get_token_properties('syls', flat = True)
# Remove punctuation marks
for i, syls in enumerate(syls_tokens):
diff --git a/wordless/wl_results/wl_results_filter.py b/wordless/wl_results/wl_results_filter.py
index 4e593c5b1..f52a279ea 100644
--- a/wordless/wl_results/wl_results_filter.py
+++ b/wordless/wl_results/wl_results_filter.py
@@ -19,7 +19,7 @@
import copy
import math
-from PyQt5.QtCore import QCoreApplication, Qt
+from PyQt5.QtCore import QCoreApplication
from PyQt5.QtWidgets import QLabel, QPushButton
from wordless.wl_dialogs import wl_dialogs, wl_dialogs_misc
@@ -160,7 +160,7 @@ def __init__(self, main, tab, table):
) = wl_widgets.wl_widgets_filter(
self,
filter_min = 1,
- filter_max = 100
+ filter_max = 1000
)
if self.tab == 'wordlist_generator':
@@ -175,7 +175,7 @@ def __init__(self, main, tab, table):
) = wl_widgets.wl_widgets_filter(
self,
filter_min = 1,
- filter_max = 100
+ filter_max = 1000
)
self.label_freq = QLabel(self.tr('Frequency:'), self)
@@ -567,11 +567,6 @@ def __init__(self, main, tab, table):
self.Worker_Filter_Results = Wl_Worker_Results_Filter_Collocation_Extractor
- if tab in ['collocation_extractor', 'colligation_extractor']:
- self.type_node = 'collocate'
- elif tab == 'keyword_extractor':
- self.type_node = 'keyword'
-
settings = self.table.settings[self.tab]
test_statistical_significance = settings['generation_settings']['test_statistical_significance']
@@ -586,10 +581,20 @@ def __init__(self, main, tab, table):
self.has_bayes_factor = measure_bayes_factor != 'none'
self.has_effect_size = measure_effect_size != 'none'
- if self.type_node == 'collocate':
- self.label_len_node = QLabel(self.tr('Collocate length:'), self)
- elif self.type_node == 'keyword':
- self.label_len_node = QLabel(self.tr('Keyword length:'), self)
+ match tab:
+ case 'collocation_extractor':
+ self.type_node = 'node'
+ self.type_collocation = 'collocation'
+ self.label_len_node = QLabel(self.tr('Node length:'), self)
+ self.label_len_collocation = QLabel(self.tr('Collocation length:'), self)
+ case 'colligation_extractor':
+ self.type_node = 'node'
+ self.type_collocation = 'colligation'
+ self.label_len_node = QLabel(self.tr('Node length:'), self)
+ self.label_len_collocation = QLabel(self.tr('Colligation length:'), self)
+ case 'keyword_extractor':
+ self.type_node = 'keyword'
+ self.label_len_node = QLabel(self.tr('Keyword length:'), self)
(
self.label_len_node_min,
@@ -601,9 +606,37 @@ def __init__(self, main, tab, table):
) = wl_widgets.wl_widgets_filter(
self,
filter_min = 1,
- filter_max = 100
+ filter_max = 1000
)
+ if self.type_node == 'node':
+ self.label_len_collocate = QLabel(self.tr('Collocate length:'), self)
+ (
+ self.label_len_collocate_min,
+ self.spin_box_len_collocate_min,
+ self.checkbox_len_collocate_min_no_limit,
+ self.label_len_collocate_max,
+ self.spin_box_len_collocate_max,
+ self.checkbox_len_collocate_max_no_limit
+ ) = wl_widgets.wl_widgets_filter(
+ self,
+ filter_min = 1,
+ filter_max = 1000
+ )
+
+ (
+ self.label_len_collocation_min,
+ self.spin_box_len_collocation_min,
+ self.checkbox_len_collocation_min_no_limit,
+ self.label_len_collocation_max,
+ self.spin_box_len_collocation_max,
+ self.checkbox_len_collocation_max_no_limit
+ ) = wl_widgets.wl_widgets_filter(
+ self,
+ filter_min = 2,
+ filter_max = 2000
+ )
+
self.label_freq = QLabel(self.tr('Frequency:'), self)
(
self.label_freq_min,
@@ -619,7 +652,7 @@ def __init__(self, main, tab, table):
)
# Frequency position
- if self.type_node == 'collocate':
+ if self.type_node == 'node':
self.combo_box_freq_position = wl_boxes.Wl_Combo_Box(self)
for i in range(
@@ -696,7 +729,17 @@ def __init__(self, main, tab, table):
self.spin_box_len_node_max.valueChanged.connect(self.filters_changed)
self.checkbox_len_node_max_no_limit.stateChanged.connect(self.filters_changed)
- if self.type_node == 'collocate':
+ if self.type_node == 'node':
+ self.spin_box_len_collocate_min.valueChanged.connect(self.filters_changed)
+ self.checkbox_len_collocate_min_no_limit.stateChanged.connect(self.filters_changed)
+ self.spin_box_len_collocate_max.valueChanged.connect(self.filters_changed)
+ self.checkbox_len_collocate_max_no_limit.stateChanged.connect(self.filters_changed)
+
+ self.spin_box_len_collocation_min.valueChanged.connect(self.filters_changed)
+ self.checkbox_len_collocation_min_no_limit.stateChanged.connect(self.filters_changed)
+ self.spin_box_len_collocation_max.valueChanged.connect(self.filters_changed)
+ self.checkbox_len_collocation_max_no_limit.stateChanged.connect(self.filters_changed)
+
self.combo_box_freq_position.currentTextChanged.connect(self.filters_changed)
self.spin_box_freq_min.valueChanged.connect(self.filters_changed)
@@ -736,17 +779,29 @@ def __init__(self, main, tab, table):
# Close the dialog when data in the table are re-generated
self.table.button_generate_table.clicked.connect(self.close)
- widgets_filter = [
- [
- self.label_len_node,
- self.label_len_node_min, self.spin_box_len_node_min, self.checkbox_len_node_min_no_limit,
- self.label_len_node_max, self.spin_box_len_node_max, self.checkbox_len_node_max_no_limit
- ], [
- self.label_freq,
- self.label_freq_min, self.spin_box_freq_min, self.checkbox_freq_min_no_limit,
- self.label_freq_max, self.spin_box_freq_max, self.checkbox_freq_max_no_limit
- ]
- ]
+ widgets_filter = [[
+ self.label_len_node,
+ self.label_len_node_min, self.spin_box_len_node_min, self.checkbox_len_node_min_no_limit,
+ self.label_len_node_max, self.spin_box_len_node_max, self.checkbox_len_node_max_no_limit
+ ]]
+
+ if self.type_node == 'node':
+ widgets_filter.append([
+ self.label_len_collocate,
+ self.label_len_collocate_min, self.spin_box_len_collocate_min, self.checkbox_len_collocate_min_no_limit,
+ self.label_len_collocate_max, self.spin_box_len_collocate_max, self.checkbox_len_collocate_max_no_limit
+ ])
+ widgets_filter.append([
+ self.label_len_collocation,
+ self.label_len_collocation_min, self.spin_box_len_collocation_min, self.checkbox_len_collocation_min_no_limit,
+ self.label_len_collocation_max, self.spin_box_len_collocation_max, self.checkbox_len_collocation_max_no_limit
+ ])
+
+ widgets_filter.append([
+ self.label_freq,
+ self.label_freq_min, self.spin_box_freq_min, self.checkbox_freq_min_no_limit,
+ self.label_freq_max, self.spin_box_freq_max, self.checkbox_freq_max_no_limit
+ ])
if self.has_test_stat:
widgets_filter.append([
@@ -784,14 +839,11 @@ def __init__(self, main, tab, table):
add_widgets_filter(self, widgets_filter = widgets_filter, layout = self.layout_filters)
- if self.type_node == 'collocate':
+ if self.type_node == 'node':
self.layout_filters.removeWidget(self.label_freq)
- layout_freq_position = wl_layouts.Wl_Layout()
- layout_freq_position.addWidget(self.label_freq, 0, 0)
- layout_freq_position.addWidget(self.combo_box_freq_position, 0, 1, Qt.AlignRight)
-
- self.layout_filters.addLayout(layout_freq_position, 3, 0, 1, 3)
+ self.layout_filters.addWidget(self.label_freq, 9, 0, 1, 2)
+ self.layout_filters.addWidget(self.combo_box_freq_position, 9, 2)
self.load_settings()
@@ -808,7 +860,17 @@ def load_settings(self, defaults = False):
self.spin_box_len_node_max.setValue(settings[f'len_{self.type_node}_max'])
self.checkbox_len_node_max_no_limit.setChecked(settings[f'len_{self.type_node}_max_no_limit'])
- if self.type_node == 'collocate':
+ if self.type_node == 'node':
+ self.spin_box_len_collocate_min.setValue(settings['len_collocate_min'])
+ self.checkbox_len_collocate_min_no_limit.setChecked(settings['len_collocate_min_no_limit'])
+ self.spin_box_len_collocate_max.setValue(settings['len_collocate_max'])
+ self.checkbox_len_collocate_max_no_limit.setChecked(settings['len_collocate_max_no_limit'])
+
+ self.spin_box_len_collocation_min.setValue(settings[f'len_{self.type_collocation}_min'])
+ self.checkbox_len_collocation_min_no_limit.setChecked(settings[f'len_{self.type_collocation}_min_no_limit'])
+ self.spin_box_len_collocation_max.setValue(settings[f'len_{self.type_collocation}_max'])
+ self.checkbox_len_collocation_max_no_limit.setChecked(settings[f'len_{self.type_collocation}_max_no_limit'])
+
self.combo_box_freq_position.setCurrentText(settings['freq_position'])
self.spin_box_freq_min.setValue(settings['freq_min'])
@@ -851,7 +913,17 @@ def filters_changed(self):
self.settings[f'len_{self.type_node}_max'] = self.spin_box_len_node_max.value()
self.settings[f'len_{self.type_node}_max_no_limit'] = self.checkbox_len_node_max_no_limit.isChecked()
- if self.type_node == 'collocate':
+ if self.type_node == 'node':
+ self.settings['len_collocate_min'] = self.spin_box_len_collocate_min.value()
+ self.settings['len_collocate_min_no_limit'] = self.checkbox_len_collocate_min_no_limit.isChecked()
+ self.settings['len_collocate_max'] = self.spin_box_len_collocate_max.value()
+ self.settings['len_collocate_max_no_limit'] = self.checkbox_len_collocate_max_no_limit.isChecked()
+
+ self.settings[f'len_{self.type_collocation}_min'] = self.spin_box_len_collocation_min.value()
+ self.settings[f'len_{self.type_collocation}_min_no_limit'] = self.checkbox_len_collocation_min_no_limit.isChecked()
+ self.settings[f'len_{self.type_collocation}_max'] = self.spin_box_len_collocation_max.value()
+ self.settings[f'len_{self.type_collocation}_max_no_limit'] = self.checkbox_len_collocation_max_no_limit.isChecked()
+
self.settings['freq_position'] = self.combo_box_freq_position.currentText()
self.settings['freq_min'] = self.spin_box_freq_min.value()
@@ -896,8 +968,9 @@ def run(self):
col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text']
col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']
- if self.dialog.type_node == 'collocate':
- col_node = self.dialog.table.find_header_hor(self.tr('Collocate'))
+ if self.dialog.type_node == 'node':
+ col_node = self.dialog.table.find_header_hor(self.tr('Node'))
+ col_collocate = self.dialog.table.find_header_hor(self.tr('Collocate'))
if self.dialog.settings['freq_position'] == self.tr('Total'):
col_freq = self.dialog.table.find_header_hor(
@@ -910,8 +983,8 @@ def run(self):
else:
col_node = self.dialog.table.find_header_hor(self.tr('Keyword'))
col_freq = self.dialog.table.find_header_hor(
- self.tr('[{}]\nFrequency').format(self.dialog.settings['file_to_filter'])
- )
+ self.tr('[{}]\nFrequency').format(self.dialog.settings['file_to_filter'])
+ )
if self.dialog.has_test_stat:
col_test_stat = self.dialog.table.find_header_hor(
@@ -946,6 +1019,29 @@ def run(self):
else self.dialog.settings[f'len_{self.dialog.type_node}_max']
)
+ if self.dialog.type_node == 'node':
+ len_collocate_min = (
+ float('-inf')
+ if self.dialog.settings['len_collocate_min_no_limit']
+ else self.dialog.settings['len_collocate_min']
+ )
+ len_collocate_max = (
+ float('inf')
+ if self.dialog.settings['len_collocate_max_no_limit']
+ else self.dialog.settings['len_collocate_max']
+ )
+
+ len_collocation_min = (
+ float('-inf')
+ if self.dialog.settings[f'len_{self.dialog.type_collocation}_min_no_limit']
+ else self.dialog.settings[f'len_{self.dialog.type_collocation}_min']
+ )
+ len_collocation_max = (
+ float('inf')
+ if self.dialog.settings[f'len_{self.dialog.type_collocation}_max_no_limit']
+ else self.dialog.settings[f'len_{self.dialog.type_collocation}_max']
+ )
+
freq_min = (
float('-inf')
if self.dialog.settings['freq_min_no_limit']
@@ -1015,61 +1111,54 @@ def run(self):
self.dialog.table.row_filters = []
for i in range(self.dialog.table.model().rowCount()):
+ filters = []
+
                # Calculate the length of token texts only, both when filtering tagged tokens and when filtering tags
len_node = sum((
len(str(token))
for token in self.dialog.table.model().item(i, col_node).tokens_filter
))
- filter_len_node = len_node_min <= len_node <= len_node_max
- filter_freq = (
+ filters.append(len_node_min <= len_node <= len_node_max)
+
+ if self.dialog.type_node == 'node':
+ len_collocate = sum((
+ len(str(token))
+ for token in self.dialog.table.model().item(i, col_collocate).tokens_filter
+ ))
+
+ filters.append(len_collocate_min <= len_collocate <= len_collocate_max)
+ filters.append(len_collocation_min <= len_node + len_collocate <= len_collocation_max)
+
+ filters.append(
freq_min <= self.dialog.table.model().item(i, col_freq).val <= freq_max
)
if self.dialog.has_test_stat:
- filter_test_stat = (
+ filters.append(
test_stat_min <= self.dialog.table.model().item(i, col_test_stat).val <= test_stat_max
)
- else:
- filter_test_stat = True
if self.dialog.has_p_val:
- filter_p_val = (
+ filters.append(
p_val_min <= self.dialog.table.model().item(i, col_p_value).val <= p_val_max
)
- else:
- filter_p_val = True
if self.dialog.has_bayes_factor:
- filter_bayes_factor = (
+ filters.append(
bayes_factor_min <= self.dialog.table.model().item(i, col_bayes_factor).val <= bayes_factor_max
)
- else:
- filter_bayes_factor = True
if self.dialog.has_effect_size:
- filter_effect_size = (
+ filters.append(
effect_size_min <= self.dialog.table.model().item(i, col_effect_size).val <= effect_size_max
)
- else:
- filter_effect_size = True
- filter_num_files_found = (
+ filters.append(
num_files_found_min <= self.dialog.table.model().item(i, col_num_files_found).val <= num_files_found_max
)
- if (
- filter_len_node
- and filter_freq
- and filter_test_stat
- and filter_p_val
- and filter_bayes_factor
- and filter_effect_size
- and filter_num_files_found
- ):
- self.dialog.table.row_filters.append(True)
- else:
- self.dialog.table.row_filters.append(False)
+ self.dialog.table.row_filters.append(all(filters))
self.progress_updated.emit(self.tr('Updating table...'))
self.worker_done.emit()
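The rewritten loop collects per-row booleans in a list and keeps the row only if `all(filters)` holds, so optional criteria (test statistic, p-value, Bayes factor, effect size) are simply never appended instead of defaulting to `True`. A minimal sketch of the pattern, with illustrative thresholds:

```python
def keep_row(
    freq, len_node, p_val = None,
    freq_min = 1, freq_max = 1000,
    len_node_min = 1, len_node_max = 20,
    p_val_max = 0.05
):
    filters = []

    filters.append(len_node_min <= len_node <= len_node_max)
    filters.append(freq_min <= freq <= freq_max)

    # Optional criteria are only appended when the table provides them
    if p_val is not None:
        filters.append(p_val <= p_val_max)

    return all(filters)

print(keep_row(freq = 15, len_node = 4))               # True
print(keep_row(freq = 15, len_node = 4, p_val = 0.2))  # False
```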
diff --git a/wordless/wl_results/wl_results_search.py b/wordless/wl_results/wl_results_search.py
index 3c09c0ba1..4f8997f30 100644
--- a/wordless/wl_results/wl_results_search.py
+++ b/wordless/wl_results/wl_results_search.py
@@ -37,6 +37,7 @@ def __init__(self, main, tab, table):
self.tab = tab
self.tables = [table]
self.settings = self.main.settings_custom[self.tab]['search_results']
+ self.last_search_settings = []
self.items_found = []
self.main.wl_work_area.currentChanged.connect(self.reject)
@@ -86,7 +87,7 @@ def __init__(self, main, tab, table):
self.button_find_next.clicked.connect(lambda: self.find_next()) # pylint: disable=unnecessary-lambda
self.button_find_prev.clicked.connect(lambda: self.find_prev()) # pylint: disable=unnecessary-lambda
self.button_find_all.clicked.connect(lambda: self.find_all()) # pylint: disable=unnecessary-lambda
- self.button_clr_hightlights.clicked.connect(self.clr_highlights)
+ self.button_clr_hightlights.clicked.connect(lambda: self.clr_highlights()) # pylint: disable=unnecessary-lambda
self.button_close.clicked.connect(self.reject)
@@ -123,8 +124,8 @@ def __init__(self, main, tab, table):
self.layout().addWidget(wl_layouts.Wl_Separator(self), 9, 0, 1, 4)
self.layout().addLayout(layout_buttons_bottom, 10, 0, 1, 4)
- for table in self.tables: # pylint: disable=redefined-argument-from-local
- table.model().itemChanged.connect(self.table_item_changed)
+ for table_to_search in self.tables:
+ table_to_search.model().itemChanged.connect(self.table_item_changed)
self.load_settings()
@@ -240,9 +241,7 @@ def find_prev(self):
selected_rows = []
for table in self.tables:
- table.hide()
- table.blockSignals(True)
- table.setUpdatesEnabled(False)
+ table.disable_updates()
for table in self.tables:
if table.get_selected_rows():
@@ -277,24 +276,24 @@ def find_prev(self):
self.tables[-1].selectRow(self.items_found[-1][1])
for table in self.tables:
- table.blockSignals(False)
- table.setUpdatesEnabled(True)
- table.show()
+ table.enable_updates()
@wl_misc.log_timing
def find_all(self):
- self.clr_highlights()
+        # Search only when there is no search history or the search settings have been changed
+ if not self.items_found or self.last_search_settings != copy.deepcopy(self.settings):
+ self.clr_highlights()
- dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress(self.main, text = self.tr('Searching in results...'))
+ dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress(self.main, text = self.tr('Searching in results...'))
- worker_results_search = Wl_Worker_Results_Search(
- self.main,
- dialog_progress = dialog_progress,
- update_gui = self.update_gui,
- dialog = self
- )
+ worker_results_search = Wl_Worker_Results_Search(
+ self.main,
+ dialog_progress = dialog_progress,
+ update_gui = self.update_gui,
+ dialog = self
+ )
- wl_threading.Wl_Thread(worker_results_search).start_worker()
+ wl_threading.Wl_Thread(worker_results_search).start_worker()
def update_gui(self):
if self.items_found:
@@ -324,11 +323,15 @@ def update_gui(self):
self.button_clr_hightlights.setEnabled(False)
+ # Save search settings
+ self.last_search_settings = copy.deepcopy(self.settings)
+
len_items_found = len(self.items_found)
msg_item = self.tr('item') if len_items_found == 1 else self.tr('items')
self.main.statusBar().showMessage(self.tr('Found {} {}.').format(len_items_found, msg_item))
+ @wl_misc.log_timing
def clr_highlights(self):
if self.items_found:
for table in self.tables:
@@ -345,11 +348,16 @@ def clr_highlights(self):
for table in self.tables:
table.enable_updates()
- self.items_found.clear()
+ self.clr_history()
+
self.main.statusBar().showMessage(self.tr('Highlights cleared.'))
self.button_clr_hightlights.setEnabled(False)
+ def clr_history(self):
+ self.last_search_settings.clear()
+ self.items_found.clear()
+
def load(self):
self.show()
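`find_all()` now short-circuits repeated identical searches by comparing the settings against a deep copy saved after the last run, and `clr_history()` clears both the saved settings and the matches whenever the table is regenerated. A minimal sketch of the caching logic, with hypothetical settings keys:

```python
import copy

class Searcher:
    def __init__(self):
        self.settings = {'search_term': '', 'match_case': False}
        self.last_search_settings = []
        self.items_found = []

    def find_all(self):
        # Search only when there is no history or the settings have changed
        if not self.items_found or self.last_search_settings != self.settings:
            print(f"Searching for {self.settings['search_term']!r}...")
            self.items_found = ['dummy item']
            # Save the settings so an identical follow-up search is a no-op
            self.last_search_settings = copy.deepcopy(self.settings)
        else:
            print('Results reused from the last search.')

searcher = Searcher()
searcher.find_all()  # Searching for ''...
searcher.find_all()  # Results reused from the last search.

searcher.settings['search_term'] = 'fox'
searcher.find_all()  # Searching for 'fox'...
```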
diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py
index 811b20fac..f601a36ab 100644
--- a/wordless/wl_settings/wl_settings_default.py
+++ b/wordless/wl_settings/wl_settings_default.py
@@ -783,11 +783,21 @@ def init_settings_default(main):
'filter_results': {
'file_to_filter': _tr('wl_settings_default', 'Total'),
+ 'len_node_min': 1,
+ 'len_node_min_no_limit': True,
+ 'len_node_max': 20,
+ 'len_node_max_no_limit': True,
+
'len_collocate_min': 1,
'len_collocate_min_no_limit': True,
'len_collocate_max': 20,
'len_collocate_max_no_limit': True,
+ 'len_collocation_min': 1,
+ 'len_collocation_min_no_limit': True,
+ 'len_collocation_max': 20,
+ 'len_collocation_max_no_limit': True,
+
'freq_position': _tr('wl_settings_default', 'Total'),
'freq_min': 0,
'freq_min_no_limit': True,
@@ -847,8 +857,6 @@ def init_settings_default(main):
'apply_lemmatization': False,
'filter_stop_words': False,
- # Always assign POS tags
- 'assign_pos_tags': True,
'ignore_tags': False,
'use_tags': False
},
@@ -941,11 +949,21 @@ def init_settings_default(main):
'filter_results': {
'file_to_filter': _tr('wl_settings_default', 'Total'),
+ 'len_node_min': 1,
+ 'len_node_min_no_limit': True,
+ 'len_node_max': 20,
+ 'len_node_max_no_limit': True,
+
'len_collocate_min': 1,
'len_collocate_min_no_limit': True,
'len_collocate_max': 20,
'len_collocate_max_no_limit': True,
+ 'len_colligation_min': 1,
+ 'len_colligation_min_no_limit': True,
+ 'len_colligation_max': 20,
+ 'len_colligation_max_no_limit': True,
+
'freq_position': _tr('wl_settings_default', 'Total'),
'freq_min': 0,
'freq_min_no_limit': True,
diff --git a/wordless/wl_utils/wl_detection.py b/wordless/wl_utils/wl_detection.py
index a34ddc59e..dee923323 100644
--- a/wordless/wl_utils/wl_detection.py
+++ b/wordless/wl_utils/wl_detection.py
@@ -51,6 +51,7 @@ def detect_encoding(main, file_path):
return encoding
+# pylint: disable=no-member
lingua_detector = lingua.LanguageDetectorBuilder.from_all_languages_without(
lingua.Language.BOSNIAN,
lingua.Language.MAORI,
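For context, a sketch of how the module-level detector above is built and used, assuming the `lingua-language-detector` package; the full exclusion list in `wl_detection.py` is longer:

```python
import lingua

# pylint: disable=no-member
detector = lingua.LanguageDetectorBuilder.from_all_languages_without(
    lingua.Language.BOSNIAN,
    lingua.Language.MAORI
).build()

print(detector.detect_language_of('languages are awesome'))
# Language.ENGLISH
```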
diff --git a/wordless/wl_widgets/wl_tables.py b/wordless/wl_widgets/wl_tables.py
index 1377a5998..0c19452d5 100644
--- a/wordless/wl_widgets/wl_tables.py
+++ b/wordless/wl_widgets/wl_tables.py
@@ -40,6 +40,8 @@
_tr = QCoreApplication.translate
+# pylint: disable=unnecessary-lambda
+
class Wl_Table(QTableView):
def __init__(
self, parent,
@@ -513,16 +515,16 @@ def run(self):
if '*.csv' in self.file_type:
encoding = self.main.settings_custom['general']['exp']['tables']['default_encoding']
- # Concordancer
- if self.table.tab == 'concordancer':
- with open(self.file_path, 'w', encoding = encoding, newline = '') as f:
- csv_writer = csv.writer(f)
+ with open(self.file_path, 'w', encoding = encoding, newline = '') as f:
+ csv_writer = csv.writer(f)
+ if self.table.header_orientation == 'hor':
# Horizontal headers
- csv_writer.writerow([
- self.table.model().horizontalHeaderItem(col).text().strip()
+ headers_hor = [
+ self.table.model().horizontalHeaderItem(col).text()
for col in cols
- ])
+ ]
+ csv_writer.writerow(self.clean_text_csv(headers_hor))
# Cells
for i, row in enumerate(self.rows_to_exp):
@@ -537,77 +539,28 @@ def run(self):
row_to_exp.append(cell_text)
- csv_writer.writerow(row_to_exp)
+ csv_writer.writerow(self.clean_text_csv(row_to_exp))
self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
- # Parallel Concordancer
- elif self.table.tab == 'concordancer_parallel':
- with open(self.file_path, 'w', encoding = encoding, newline = '') as f:
- csv_writer = csv.writer(f)
-
- # Horizontal Headers
- csv_writer.writerow([
- self.table.model().horizontalHeaderItem(col).text().strip()
+ # Profiler
+ else:
+ # Horizontal headers
+ headers_hor = [
+ self.table.model().horizontalHeaderItem(col).text()
for col in cols
- ])
+ ]
+ csv_writer.writerow([''] + self.clean_text_csv(headers_hor))
- # Cells
+ # Vertical headers and cells
for i, row in enumerate(self.rows_to_exp):
- row_to_exp = []
+ row_to_exp = [self.table.model().verticalHeaderItem(row).text()]
for col in cols:
- if self.table.model().item(row, col):
- cell_text = self.table.model().item(row, col).text()
- else:
- cell_text = self.table.indexWidget(self.table.model().index(row, col)).text()
- cell_text = wl_nlp_utils.html_to_text(cell_text)
-
- row_to_exp.append(cell_text)
+ row_to_exp.append(self.table.model().item(row, col).text())
- csv_writer.writerow(row_to_exp)
+ csv_writer.writerow(self.clean_text_csv(row_to_exp))
self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
- else:
- with open(self.file_path, 'w', encoding = encoding, newline = '') as f:
- csv_writer = csv.writer(f)
-
- if self.table.header_orientation == 'hor':
- # Horizontal headers
- csv_writer.writerow([
- self.table.model().horizontalHeaderItem(col).text().strip()
- for col in cols
- ])
-
- # Cells
- for i, row in enumerate(self.rows_to_exp):
- row_to_exp = []
-
- for col in cols:
- row_to_exp.append(self.table.model().item(row, col).text().strip())
-
- csv_writer.writerow(row_to_exp)
-
- self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
- else:
- # Horizontal headers
- csv_writer.writerow(
- ['']
- + [
- self.table.model().horizontalHeaderItem(col).text().strip()
- for col in cols
- ]
- )
-
- # Vertical headers & cells
- for i, row in enumerate(self.rows_to_exp):
- row_to_exp = [self.table.model().verticalHeaderItem(row).text().strip()]
-
- for col in cols:
- row_to_exp.append(self.table.model().item(row, col).text().strip())
-
- csv_writer.writerow(row_to_exp)
-
- self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
# Excel workbooks
elif '*.xlsx' in self.file_type:
workbook = openpyxl.Workbook()
@@ -735,7 +688,7 @@ def run(self):
para_text = []
for col in range(3):
- para_text.append(self.table.indexWidget(self.table.model().index(row, col)).text())
+ para_text.append(self.table.indexWidget(self.table.model().index(row, col)).text().strip())
# Zapping
if settings_concordancer['zapping']:
@@ -764,14 +717,15 @@ def run(self):
# Parallel Concordancer
elif self.table.tab == 'concordancer_parallel':
for i, row in enumerate(self.rows_to_exp):
+ if i > 0:
+ self.add_para(doc)
+
for col in range(2, self.table.model().columnCount()):
- para_text = self.table.indexWidget(self.table.model().index(row, col)).text()
+ para_text = self.table.indexWidget(self.table.model().index(row, col)).text().strip()
para = self.add_para(doc)
self.style_para_rich_text(para, para_text, self.table.indexWidget(self.table.model().index(row, col)))
- self.add_para(doc)
-
self.progress_updated.emit(self.tr('Exporting table... ({} / {})').format(i + 1, len_rows))
# Add the last empty paragraph
@@ -789,6 +743,15 @@ def run(self):
self.worker_done.emit(err_msg, self.file_path)
+ # Clean text before writing to CSV files
+ def clean_text_csv(self, items):
+ for i, item in enumerate(items):
+ items[i] = item.replace('\n', ' ')
+ items[i] = re.sub(r'\s+', ' ', items[i])
+ items[i] = items[i].strip()
+
+ return items
+
# Remove invalid XML characters
def remove_invalid_xml_chars(self, text):
# openpyxl.cell.cell.ILLEGAL_CHARACTERS_RE is not complete
@@ -1081,9 +1044,9 @@ def __init__(self, parent, headers, col_edit = None):
self.button_del = QPushButton(_tr('wl_tables', 'Remove'), self)
self.button_clr = QPushButton(_tr('wl_tables', 'Clear'), self)
- self.button_add.clicked.connect(lambda: self.add_row()) # pylint: disable=unnecessary-lambda
- self.button_ins.clicked.connect(lambda: self.ins_row()) # pylint: disable=unnecessary-lambda
- self.button_del.clicked.connect(lambda: self.del_row()) # pylint: disable=unnecessary-lambda
+ self.button_add.clicked.connect(lambda: self.add_row())
+ self.button_ins.clicked.connect(lambda: self.ins_row())
+ self.button_del.clicked.connect(lambda: self.del_row())
self.button_clr.clicked.connect(lambda: self.clr_table(0))
def item_changed(self):
@@ -1181,10 +1144,10 @@ def __init__(
if not generate_fig:
self.button_generate_fig.hide()
- self.button_generate_table.clicked.connect(lambda: self.generate_table()) # pylint: disable=unnecessary-lambda
- self.button_generate_fig.clicked.connect(lambda: self.generate_fig()) # pylint: disable=unnecessary-lambda
- self.button_exp_selected_cells.clicked.connect(self.exp_selected_cells)
- self.button_exp_all_cells.clicked.connect(lambda: self.exp_all_cells()) # pylint: disable=unnecessary-lambda
+ self.button_generate_table.clicked.connect(lambda: self.generate_table())
+ self.button_generate_fig.clicked.connect(lambda: self.generate_fig())
+ self.button_exp_selected_cells.clicked.connect(lambda: self.exp_selected_cells())
+ self.button_exp_all_cells.clicked.connect(lambda: self.exp_all_cells())
self.button_clr_table.clicked.connect(lambda: self.clr_table(confirm = True))
self.main.wl_file_area.table_files.model().itemChanged.connect(self.file_changed)
@@ -1871,6 +1834,7 @@ def __init__(
self.button_results_search.setMinimumWidth(140)
+ self.button_generate_table.clicked.connect(self.dialog_results_search.clr_history)
self.button_results_search.clicked.connect(self.dialog_results_search.load)
self.results_changed()
@@ -1924,6 +1888,7 @@ def __init__(
self.button_results_search.setMinimumWidth(140)
self.button_results_sort.setMinimumWidth(140)
+ self.button_generate_table.clicked.connect(self.dialog_results_search.clr_history)
self.button_results_search.clicked.connect(self.dialog_results_search.load)
self.button_results_sort.clicked.connect(self.dialog_results_sort.show)
@@ -1976,6 +1941,7 @@ def __init__(
self.button_results_filter.setMinimumWidth(140)
self.button_results_search.setMinimumWidth(140)
+ self.button_generate_table.clicked.connect(self.dialog_results_search.clr_history)
self.button_results_filter.clicked.connect(self.results_filter_clicked)
self.button_results_search.clicked.connect(self.dialog_results_search.load)
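The new `clean_text_csv()` helper normalizes every header and cell before it reaches the CSV writer: newlines become spaces, runs of whitespace collapse, and ends are stripped. A standalone version with example input:

```python
import re

def clean_text_csv(items):
    for i, item in enumerate(items):
        items[i] = item.replace('\n', ' ')
        items[i] = re.sub(r'\s+', ' ', items[i])
        items[i] = items[i].strip()

    return items

print(clean_text_csv([' [File]\nFrequency ', 'p-value  \n(two-tailed)']))
# ['[File] Frequency', 'p-value (two-tailed)']
```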
diff --git a/wordless/wl_widgets/wl_widgets.py b/wordless/wl_widgets/wl_widgets.py
index d646b0372..a8974000d 100644
--- a/wordless/wl_widgets/wl_widgets.py
+++ b/wordless/wl_widgets/wl_widgets.py
@@ -338,10 +338,18 @@ def words_changed():
checkbox_all_uppercase.setEnabled(False)
checkbox_title_case.setEnabled(False)
+ def assign_pos_tags_changed():
+ if checkbox_assign_pos_tags.isChecked():
+ checkbox_ignore_tags.setEnabled(False)
+ else:
+ checkbox_ignore_tags.setEnabled(not checkbox_use_tags.isChecked())
+
def ignore_tags_changed():
if checkbox_ignore_tags.isChecked():
+ checkbox_assign_pos_tags.setEnabled(False)
checkbox_use_tags.setEnabled(False)
else:
+ checkbox_assign_pos_tags.setEnabled(True)
checkbox_use_tags.setEnabled(True)
def use_tags_changed():
@@ -350,7 +358,7 @@ def use_tags_changed():
checkbox_ignore_tags.setEnabled(False)
else:
checkbox_apply_lemmatization.setEnabled(True)
- checkbox_ignore_tags.setEnabled(True)
+ checkbox_ignore_tags.setEnabled(not checkbox_assign_pos_tags.isChecked())
checkbox_words = QCheckBox(_tr('wl_widgets', 'Words'), parent)
checkbox_all_lowercase = QCheckBox(_tr('wl_widgets', 'All lowercase'), parent)
@@ -368,6 +376,7 @@ def use_tags_changed():
checkbox_use_tags = QCheckBox(_tr('wl_widgets', 'Use tags only'), parent)
checkbox_words.stateChanged.connect(words_changed)
+ checkbox_assign_pos_tags.stateChanged.connect(assign_pos_tags_changed)
checkbox_ignore_tags.stateChanged.connect(ignore_tags_changed)
checkbox_use_tags.stateChanged.connect(use_tags_changed)
@@ -393,56 +402,33 @@ def use_tags_changed():
)
def wl_widgets_token_settings_concordancer(parent):
- def ignore_tags_changed():
- if checkbox_ignore_tags.isChecked():
- checkbox_use_tags.setEnabled(False)
- else:
- checkbox_use_tags.setEnabled(True)
-
- def use_tags_changed():
- if checkbox_use_tags.isChecked():
+ def assign_pos_tags_changed():
+ if checkbox_assign_pos_tags.isChecked():
checkbox_ignore_tags.setEnabled(False)
else:
- checkbox_ignore_tags.setEnabled(True)
+ checkbox_ignore_tags.setEnabled(not checkbox_use_tags.isChecked())
- checkbox_punc_marks = QCheckBox(_tr('wl_widgets', 'Punctuation marks'), parent)
-
- checkbox_assign_pos_tags = QCheckBox(_tr('wl_widgets', 'Assign part-of-speech tags'), parent)
- checkbox_ignore_tags = QCheckBox(_tr('wl_widgets', 'Ignore tags'), parent)
- checkbox_use_tags = QCheckBox(_tr('wl_widgets', 'Use tags only'), parent)
-
- checkbox_ignore_tags.stateChanged.connect(ignore_tags_changed)
- checkbox_use_tags.stateChanged.connect(use_tags_changed)
-
- ignore_tags_changed()
- use_tags_changed()
-
- return (
- checkbox_punc_marks,
-
- checkbox_assign_pos_tags,
- checkbox_ignore_tags,
- checkbox_use_tags
- )
-
-def wl_widgets_token_settings_concordancer1(parent):
def ignore_tags_changed():
if checkbox_ignore_tags.isChecked():
+ checkbox_assign_pos_tags.setEnabled(False)
checkbox_use_tags.setEnabled(False)
else:
+ checkbox_assign_pos_tags.setEnabled(True)
checkbox_use_tags.setEnabled(True)
def use_tags_changed():
if checkbox_use_tags.isChecked():
checkbox_ignore_tags.setEnabled(False)
else:
- checkbox_ignore_tags.setEnabled(True)
+ checkbox_ignore_tags.setEnabled(not checkbox_assign_pos_tags.isChecked())
checkbox_punc_marks = QCheckBox(_tr('wl_widgets', 'Punctuation marks'), parent)
+ checkbox_assign_pos_tags = QCheckBox(_tr('wl_widgets', 'Assign part-of-speech tags'), parent)
checkbox_ignore_tags = QCheckBox(_tr('wl_widgets', 'Ignore tags'), parent)
checkbox_use_tags = QCheckBox(_tr('wl_widgets', 'Use tags only'), parent)
+ checkbox_assign_pos_tags.stateChanged.connect(assign_pos_tags_changed)
checkbox_ignore_tags.stateChanged.connect(ignore_tags_changed)
checkbox_use_tags.stateChanged.connect(use_tags_changed)
@@ -452,6 +438,7 @@ def use_tags_changed():
return (
checkbox_punc_marks,
+ checkbox_assign_pos_tags,
checkbox_ignore_tags,
checkbox_use_tags
)
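To summarize the interlock the three `*_changed()` handlers implement: "Ignore tags" stays available only while neither "Assign part-of-speech tags" nor "Use tags only" is checked, and checking "Ignore tags" locks the other two. A PyQt-free sketch of the resulting enabled states (ignoring the "Apply lemmatization" coupling):

```python
# Which checkboxes remain enabled for a given combination of checked states
def enabled_states(assign_pos_tags, ignore_tags, use_tags):
    return {
        'assign_pos_tags': not ignore_tags,
        'ignore_tags': not assign_pos_tags and not use_tags,
        'use_tags': not ignore_tags
    }

# Checking "Assign part-of-speech tags" locks "Ignore tags"
print(enabled_states(assign_pos_tags = True, ignore_tags = False, use_tags = False))
# {'assign_pos_tags': True, 'ignore_tags': False, 'use_tags': True}
```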
@@ -496,10 +483,7 @@ def token_settings_changed(token_settings = None):
match_tags_changed()
def match_without_tags_changed():
- if checkbox_match_without_tags.isChecked():
- checkbox_match_tags.setEnabled(False)
- else:
- checkbox_match_tags.setEnabled(True)
+ checkbox_match_tags.setEnabled(not checkbox_match_without_tags.isChecked())
def match_tags_changed():
if checkbox_match_tags.isChecked():