Work Area: Add Collocation/Colligation Extractor - Filter results - Node/Collocation length
BLKSerene committed May 19, 2024
1 parent 0d95bf9 commit 6f8aa74
Showing 21 changed files with 574 additions and 367 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -26,6 +26,7 @@
- Utils: Add PyThaiNLP's Han-solo
- Utils: Add Stanza's Sindhi part-of-speech tagger
- Utils: Add VADER's sentiment analyzers
- Work Area: Add Collocation/Colligation Extractor - Filter results - Node/Collocation length
- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic

### ✨ Improvements
@@ -40,7 +41,7 @@

### ❌ Removals
- Menu: Remove Settings - Measures - Statistical Significance - Welch's t-test
- Work Area: Remove Collocation Extractor / Colligation Extractor / Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test
- Work Area: Remove Collocation/Colligation/Keyword Extractor - Generation Settings - Test of Statistical Significance - Welch's t-test
- Utils: Remove Dostoevsky's Russian sentiment analyzer

### ⏫ Dependency Changes
7 changes: 4 additions & 3 deletions tests/test_dependency_parser.py
@@ -62,7 +62,7 @@ def update_gui(err_msg, results):

for (
head, dependent, dependency_relation, dependency_len,
sentence_display, sentence_search,
sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search,
no_sentence, len_sentences, file
) in results:
# Head
@@ -75,8 +75,9 @@ def update_gui(err_msg, results):
assert isinstance(dependency_len, int)

# Sentence
assert all(sentence_display)
assert all(sentence_search)
assert all(sentence_tokens_raw)
assert all(sentence_tokens_fig)
assert all(sentence_tokens_search)

# Sentence No.
assert no_sentence >= 1
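The updated test unpacks each dependency-parsing result into raw, figure, and search variants of the sentence tokens and asserts that none of them contain empty elements. Below is a minimal sketch of that assertion pattern; the result tuple is made up for illustration and does not come from Wordless's actual worker output.

```python
# Sketch of the assertion pattern in the updated test.
# The sample tuple is illustrative, not real Wordless output.

def check_result(result):
    (
        head, dependent, dependency_relation, dependency_len,
        sentence_tokens_raw, sentence_tokens_fig, sentence_tokens_search,
        no_sentence, len_sentences, file
    ) = result

    # Head, dependent, and relation must be non-empty
    assert head and dependent and dependency_relation
    # Dependency length is a (possibly negative) integer offset
    assert isinstance(dependency_len, int)
    # All three sentence token variants must contain no empty tokens
    assert all(sentence_tokens_raw)
    assert all(sentence_tokens_fig)
    assert all(sentence_tokens_search)
    # Sentence numbering is 1-based
    assert 1 <= no_sentence <= len_sentences

check_result((
    'likes', 'She', 'nsubj', -1,
    ['She', 'likes', 'apples', '.'],
    ['She', 'likes', 'apples', '.'],
    ['She', 'likes', 'apples', '.'],
    1, 1, 'example.txt'
))
```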
8 changes: 4 additions & 4 deletions tests/tests_file_area/test_file_area_file_types.py
@@ -153,7 +153,7 @@ def update_gui_file_types(err_msg, new_files):
file_text = new_files[0]['text']

tokens = file_text.to_token_texts()
tags = file_text.get_token_properties('tag')
tags = file_text.get_token_properties('tag', flat = True)

print(tokens)

@@ -192,7 +192,7 @@ def update_gui_file_types(err_msg, new_files):
file_text_tgt = new_files[1]['text']

tokens_src = file_text_src.to_token_texts()
tags_src = file_text_src.get_token_properties('tag')
tags_src = file_text_src.get_token_properties('tag', flat = True)

# Source files
print(file_text_src.lang)
@@ -204,7 +204,7 @@

# Target files
tokens_tgt = file_text_tgt.to_token_texts()
tags_tgt = file_text_tgt.get_token_properties('tag')
tags_tgt = file_text_tgt.get_token_properties('tag', flat = True)

print(file_text_tgt.lang)
print(tokens_tgt)
@@ -226,7 +226,7 @@ def update_gui_tags(err_msg, new_files):
file_text = new_files[0]['text']

tokens = file_text.to_token_texts()
tags = file_text.get_token_properties('tag')
tags = file_text.get_token_properties('tag', flat = True)

print(tokens)
print(tags)
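These tests now call `get_token_properties('tag', flat = True)`. The exact behaviour of the flag lives in Wordless's `wl_texts` module; as an assumption, a flat lookup would collapse the paragraph/sentence nesting into a single list of per-token property values. The sketch below uses a hypothetical `TextStub` class, not the real `Wl_Text` API.

```python
# Hypothetical sketch of what a flat token-property lookup might do.
# TextStub mimics a text whose tokens are nested as paragraphs > sentences,
# which is an assumption about Wordless's internal structure.

class TextStub:
    def __init__(self, tokens_multilevel):
        # tokens_multilevel: list of paragraphs, each a list of sentences,
        # each a list of (token, properties) pairs
        self.tokens_multilevel = tokens_multilevel

    def get_token_properties(self, name, flat = False):
        nested = [
            [[props.get(name) for _, props in sentence] for sentence in para]
            for para in self.tokens_multilevel
        ]

        if not flat:
            return nested

        # Flatten paragraphs and sentences into one list of property values
        return [prop for para in nested for sentence in para for prop in sentence]

text = TextStub([[[('I', {'tag': '_PRP'}), ('run', {'tag': '_VBP'})]]])
print(text.get_token_properties('tag', flat = True))  # ['_PRP', '_VBP']
```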
4 changes: 2 additions & 2 deletions tests/tests_utils/test_detection.py
@@ -290,11 +290,11 @@ def test_lingua():
langs_exceptions = {'bokmal', 'ganda', 'nynorsk', 'slovene'}
langs_extra = set()

for lang in lingua.Language.all():
for lang in lingua.Language.all(): # pylint: disable=no-member
if lang.name.lower() not in langs | langs_exceptions:
langs_extra.add(lang.name)

print(f"Extra languages: {', '.join(langs_extra)}\n")
print(f"\nExtra languages: {', '.join(langs_extra)}\n")

assert langs_extra == {'BOSNIAN', 'MAORI', 'SHONA', 'SOMALI', 'SOTHO', 'TSONGA', 'XHOSA'}

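The lingua test iterates over `lingua.Language.all()` (with a pylint `no-member` suppression, since the enum members are generated at runtime) and records any languages the detector supports that Wordless does not map. Here is a hedged sketch of that set-difference check using made-up language inventories in place of lingua's and Wordless's real ones.

```python
# Sketch of the set-difference check in test_lingua, with made-up
# language inventories standing in for the real ones.

langs_supported = {'english', 'french', 'german'}          # languages the app maps
langs_exceptions = {'bokmal', 'nynorsk'}                    # known aliases to ignore
langs_detector = ['ENGLISH', 'FRENCH', 'GERMAN', 'MAORI']   # what the detector reports

langs_extra = set()

for lang in langs_detector:
    if lang.lower() not in langs_supported | langs_exceptions:
        langs_extra.add(lang)

print(f"\nExtra languages: {', '.join(sorted(langs_extra))}\n")
assert langs_extra == {'MAORI'}
```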
20 changes: 9 additions & 11 deletions wordless/wl_colligation_extractor.py
@@ -909,7 +909,7 @@ def run(self):
colligations_freqs_file = {}
colligations_freqs_file_all = {}

text = wl_token_processing.wl_process_tokens(
text = wl_token_processing.wl_process_tokens_colligation_extractor(
self.main, file['text'],
token_settings = settings['token_settings']
)
@@ -976,23 +976,21 @@ def run(self):
tags_left = []
tags_right = []

tags = wl_texts.to_tokens(wl_texts.get_token_properties(tokens, 'tag'), lang = file['lang'])

if window_left < 0 < window_right:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
tags_left = tags[max(0, i + window_left) : i]
tags_right = tags[i + ngram_size : i + ngram_size + window_right]
tags_left = text.tags[max(0, i + window_left) : i]
tags_right = text.tags[i + ngram_size : i + ngram_size + window_right]
else:
# Span positions (Left)
for position in range(max(0, i + window_left), i):
if i_unit_start <= position <= i_unit_end:
tags_left.append(tags[position])
tags_left.append(text.tags[position])

# Span positions (Right)
for position in range(i + ngram_size, i + ngram_size + window_right):
if i_unit_start <= position <= i_unit_end:
tags_right.append(tags[position])
tags_right.append(text.tags[position])

for j, collocate in enumerate(reversed(tags_left)):
if wl_matching.check_context(
@@ -1024,12 +1022,12 @@ def run(self):
elif window_left < 0 and window_right < 0:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
tags_left = tags[max(0, i + window_left) : max(0, i + window_right + 1)]
tags_left = text.tags[max(0, i + window_left) : max(0, i + window_right + 1)]
else:
# Span positions (Left)
for position in range(max(0, i + window_left), max(0, i + window_right + 1)):
if i_unit_start <= position <= i_unit_end:
tags_left.append(tags[position])
tags_left.append(text.tags[position])

for j, collocate in enumerate(reversed(tags_left)):
if wl_matching.check_context(
@@ -1047,12 +1045,12 @@
elif window_left > 0 and window_right > 0:
# Limit Searching
if settings_limit_searching == _tr('wl_colligation_extractor', 'None'):
tags_right = tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right]
tags_right = text.tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right]
else:
# Span positions (Right)
for position in range(i + ngram_size + window_left - 1, i + ngram_size + window_right):
if i_unit_start <= position <= i_unit_end:
tags_right.append(tags[position])
tags_right.append(text.tags[position])

for j, collocate in enumerate(tags_right):
if wl_matching.check_context(
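The colligation extractor now reads part-of-speech tags from a precomputed `text.tags` sequence instead of rebuilding them per file, and gathers collocate tags from windows to the left and right of each node. The sketch below illustrates only the first branch in the diff (a window that straddles the node, with no limit on searching); the variable names are plain Python stand-ins, not Wordless's objects.

```python
# Sketch of collecting left/right collocate tags around a node, mirroring
# the straddling-window slice in the diff. All values are illustrative.

def window_tags(tags, i, ngram_size, window_left, window_right):
    """Return tags in the spans [i + window_left, i) and
    [i + ngram_size, i + ngram_size + window_right)."""
    # Window positions are negative to the left of the node and positive
    # to the right, e.g. window_left = -2, window_right = 2.
    tags_left = tags[max(0, i + window_left) : i]
    tags_right = tags[i + ngram_size : i + ngram_size + window_right]

    return tags_left, tags_right

tags = ['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT']
# Node is the single token at index 3 ('VERB')
print(window_tags(tags, i = 3, ngram_size = 1, window_left = -2, window_right = 2))
# (['ADJ', 'NOUN'], ['DET', 'NOUN'])
```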
2 changes: 1 addition & 1 deletion wordless/wl_collocation_extractor.py
@@ -906,7 +906,7 @@ def run(self):
collocations_freqs_file = {}
collocations_freqs_file_all = {}

text = wl_token_processing.wl_process_tokens(
text = wl_token_processing.wl_process_tokens_ngram_generator(
self.main, file['text'],
token_settings = settings['token_settings']
)
53 changes: 42 additions & 11 deletions wordless/wl_concordancer.py
@@ -727,9 +727,10 @@ def run(self):
no_sentence = bisect.bisect(offsets_sentences, i)
no_para = bisect.bisect(offsets_paras, i)

# Search in Results (Node)
node_tokens_search = list(ngram)
node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(ngram))
node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
ngram,
punc_mark = True
))

# Width Unit
if settings['generation_settings']['width_unit'] == self.tr('Character'):
@@ -811,16 +812,46 @@ def run(self):
left_tokens_raw = tokens[offset_start:i]
right_tokens_raw = tokens[i + len_search_term : offset_end]

# Search in results (Left & Right)
left_tokens_search = copy.deepcopy(left_tokens_raw)
right_tokens_search = copy.deepcopy(right_tokens_raw)
if settings['token_settings']['punc_marks']:
node_tokens_search = list(ngram)

# Remove empty tokens for searching in results
left_tokens_search = [token for token in copy.deepcopy(left_tokens_raw) if token]
right_tokens_search = [token for token in copy.deepcopy(right_tokens_raw) if token]
# Convert trailing punctuation marks, if any, to separate tokens for searching
else:
node_tokens_search = []
left_tokens_search = []
right_tokens_search = []

for token in list(ngram):
node_tokens_search.append(token)

if token.punc_mark:
node_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

# Remove empty tokens for searching in results
left_tokens_search = [token for token in left_tokens_search if token]
right_tokens_search = [token for token in right_tokens_search if token]
for token in copy.deepcopy(left_tokens_raw):
if token:
left_tokens_search.append(token)

left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(left_tokens_raw))
right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(right_tokens_raw))
if token.punc_mark:
left_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

for token in copy.deepcopy(right_tokens_raw):
if token:
right_tokens_search.append(token)

if token.punc_mark:
right_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
left_tokens_raw,
punc_mark = True
))
right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
right_tokens_raw,
punc_mark = True
))

# Left
concordance_line.append([left_tokens_raw, left_tokens_search])
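When punctuation marks are not kept as separate tokens, the concordancer now splits a token's trailing punctuation back out into its own search token so that queries can still match it, while the display text keeps the punctuation attached. Below is a minimal sketch of that conversion; the `Token` class is a hypothetical stand-in for `wl_texts.Wl_Token`, and its `punc_mark` attribute is only assumed here from the diff.

```python
# Sketch of converting trailing punctuation marks into separate tokens
# for searching, as the diff does when punctuation marks are hidden.
# Token is a hypothetical stand-in for wl_texts.Wl_Token.

from dataclasses import dataclass

@dataclass
class Token:
    text: str
    lang: str = 'eng_us'
    punc_mark: str = ''   # trailing punctuation folded into this token, if any

def to_search_tokens(tokens):
    tokens_search = []

    for token in tokens:
        if not token.text:
            continue  # drop empty tokens so they cannot match searches

        tokens_search.append(token)

        # Re-emit the folded punctuation as its own token
        if token.punc_mark:
            tokens_search.append(Token(token.punc_mark, lang = token.lang))

    return tokens_search

tokens = [Token('Hello', punc_mark = ','), Token('world', punc_mark = '!')]
print([token.text for token in to_search_tokens(tokens)])
# ['Hello', ',', 'world', '!']
```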
46 changes: 31 additions & 15 deletions wordless/wl_concordancer_parallel.py
@@ -408,9 +408,10 @@ def run(self):
parallel_unit_no = bisect.bisect(offsets_paras, j)

if parallel_unit_no not in parallel_units:
# Save all nodes if multiple nodes are found in the same parallel unit
parallel_units[parallel_unit_no] = [[] for _ in range(len_files)]

parallel_units[parallel_unit_no][i] = ngram
parallel_units[parallel_unit_no][i].append(ngram)
# Search for additions & deletions
else:
for j, para in enumerate(text.tokens_multilevel):
@@ -428,22 +429,37 @@
len_parallel_units = len(offsets_paras)

for parallel_unit_no, parallel_unit_nodes in parallel_units.items():
node = parallel_unit_nodes[i]
nodes = parallel_unit_nodes[i]

if parallel_unit_no <= len_parallel_units:
parallel_unit_tokens_raw = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))
parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(parallel_unit_tokens_raw))
# Search in Results
parallel_unit_tokens_search = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))

# Highlight node if found
if node:
len_node = len(node)

for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit_tokens_search, len_node)):
if ngram == tuple(node):
parallel_unit_tokens_raw[j] = f'<span style="color: {node_color}; font-weight: bold;">{parallel_unit_tokens_raw[j]}'
parallel_unit_tokens_raw[j + len_node - 1] += '</span>'
parallel_unit = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1]))

if settings['token_settings']['punc_marks']:
parallel_unit_tokens_search = copy.deepcopy(parallel_unit)
# Convert trailing punctuation marks, if any, to separate tokens for searching
else:
parallel_unit_tokens_search = []

for token in copy.deepcopy(parallel_unit):
parallel_unit_tokens_search.append(token)

if token.punc_mark:
parallel_unit_tokens_search.append(wl_texts.Wl_Token(token.punc_mark, lang = token.lang))

parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(
parallel_unit,
punc_mark = True
))

# Highlight nodes if found
if nodes:
for node in nodes:
len_node = len(node)

for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit, len_node)):
if ngram == tuple(node):
parallel_unit_tokens_raw[j] = f'<span style="color: {node_color}; font-weight: bold;">{parallel_unit_tokens_raw[j]}'
parallel_unit_tokens_raw[j + len_node - 1] += '</span>'
else:
parallel_unit_tokens_raw = []
parallel_unit_tokens_search = []
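The parallel concordancer now stores every node found in a parallel unit (a list per file) rather than only one, and highlights each occurrence by wrapping the matching n-gram in an HTML span. The sketch below runs that highlighting pass over plain string tokens; the span markup mirrors the diff, while the small `ngrams` helper and the default colour stand in for `wl_nlp_utils.ngrams` and the configured node colour.

```python
# Sketch of highlighting every node n-gram inside a parallel unit, as the
# updated parallel concordancer does. Tokens are plain strings here;
# Wordless uses its own token objects and wl_nlp_utils.ngrams.

def ngrams(tokens, n):
    return zip(*(tokens[i:] for i in range(n)))

def highlight_nodes(tokens, nodes, node_color = '#FF0000'):
    tokens_html = list(tokens)

    for node in nodes:
        len_node = len(node)

        # Match against the original tokens, modify the HTML copy
        for j, ngram in enumerate(ngrams(tokens, len_node)):
            if ngram == tuple(node):
                tokens_html[j] = (
                    f'<span style="color: {node_color}; font-weight: bold;">'
                    f'{tokens_html[j]}'
                )
                tokens_html[j + len_node - 1] += '</span>'

    return tokens_html

unit = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(highlight_nodes(unit, nodes = [['the', 'cat'], ['the', 'mat']]))
```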