
Commit

Work Area: Fix Dependency Parser - analysis of files whose first token is a punctuation mark
BLKSerene committed Dec 10, 2023
1 parent 347b5b6 commit 011847d
Showing 15 changed files with 63 additions and 48 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -21,6 +21,7 @@
## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - ??/??/2023
### 📌 Bugfixes
- Utils: Fix downloading of Stanza models
+- Work Area: Fix Dependency Parser - analysis of files whose first token is a punctuation mark

### ⏫ Dependency Changes
- Dependencies: Remove jieba
@@ -128,7 +129,7 @@

### 📌 Bugfixes
- Work Area: Remove all invalid XML characters when exporting tables to Excel workbooks
-- Work Area: Fix Parallel Concordancer - Search Settings - Search for additions and deletions
+- Work Area: Fix Parallel Concordancer - searching for additions and deletions

### ⏫ Dependency Changes
- Dependencies: Add python-mecab-ko
@@ -0,0 +1 @@
"The first token is a punctuation mark.
10 changes: 5 additions & 5 deletions tests/test_colligation_extractor.py
@@ -47,14 +47,14 @@ def test_colligation_extractor():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

main.settings_custom['colligation_extractor']['generation_settings']['test_statistical_significance'] = test_statistical_significance
main.settings_custom['colligation_extractor']['generation_settings']['measure_bayes_factor'] = measure_bayes_factor
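The switch from a three-way to a four-way rotation is what pulls an extra test file into coverage: when i % 4 is 2 or 3, the new else branch selects no_files = [3] or [4], so the rotation now reaches a fourth file alongside the original single-file and multiple-file cases. A small illustrative sketch of the selection cycle (not part of the test suite; only the arithmetic from the hunk above is reproduced):

    # Illustrative only: reproduce the file-selection cycle used by the updated tests
    for i in range(8):
        # Single file
        if i % 4 == 0:
            no_files = [0]
        # Multiple files
        elif i % 4 == 1:
            no_files = [1, 2]
        # Miscellaneous: i % 4 + 1 evaluates to 3 or 4
        else:
            no_files = [i % 4 + 1]

        print(i, no_files)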
10 changes: 5 additions & 5 deletions tests/test_collocation_extractor.py
@@ -47,14 +47,14 @@ def test_collocation_extractor():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

main.settings_custom['collocation_extractor']['generation_settings']['test_statistical_significance'] = test_statistical_significance
main.settings_custom['collocation_extractor']['generation_settings']['measure_bayes_factor'] = measure_bayes_factor
14 changes: 10 additions & 4 deletions tests/test_concordancer.py
@@ -28,16 +28,16 @@ def test_concordancer():
main.settings_custom['concordancer']['search_settings']['multi_search_mode'] = True
main.settings_custom['concordancer']['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS

-for i in range(3):
+for i in range(4):
# Single file
if i == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
elif i == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i + 1])

global main_global # pylint: disable=global-statement
main_global = main
@@ -80,8 +80,14 @@ def update_gui_table(err_msg, concordance_lines):
assert node_text_search
# Left & Right
assert left_text or right_text
+assert left_text == [] or all(left_text)
+assert right_text == [] or all(right_text)
assert left_text_raw or right_text_raw
+assert left_text_raw == [] or all(left_text_raw)
+assert right_text_raw == [] or all(right_text_raw)
assert left_text_search or right_text_search
+assert left_text_search == [] or all(left_text_search)
+assert right_text_search == [] or all(right_text_search)

# Sentiment
assert sentiment == 'No language support' or -1 <= sentiment <= 1
2 changes: 1 addition & 1 deletion tests/test_concordancer_parallel.py
@@ -30,7 +30,7 @@ def test_concordancer_parallel():
if i == 0:
wl_test_init.select_test_files(main, no_files = [0, 1, 2])
elif i == 1:
-wl_test_init.select_test_files(main, no_files = [1, 2, 3])
+wl_test_init.select_test_files(main, no_files = [1, 2, 3, 4])

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")

8 changes: 4 additions & 4 deletions tests/test_dependency_parser.py
@@ -28,16 +28,16 @@ def test_dependency_parser():
main.settings_custom['dependency_parser']['search_settings']['multi_search_mode'] = True
main.settings_custom['dependency_parser']['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS

-for i in range(3):
+for i in range(4):
# Single file
if i == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
elif i == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i + 1])

global main_global # pylint: disable=global-statement
main_global = main
14 changes: 7 additions & 7 deletions tests/test_keyword_extractor.py
@@ -46,24 +46,24 @@ def test_keyword_extractor():
fillvalue = 'none'
)):
# Single observed file & single reference file
-if i % 5 == 0:
+if i % 6 == 0:
wl_test_init.select_test_files(main, no_files = [0])
wl_test_init.select_test_files(main, no_files = [0], ref = True)
# Single observed file & multiple reference files
-elif i % 5 == 1:
+elif i % 6 == 1:
wl_test_init.select_test_files(main, no_files = [0])
wl_test_init.select_test_files(main, no_files = [1, 2], ref = True)
# Multiple observed files & single reference file
-elif i % 5 == 2:
+elif i % 6 == 2:
wl_test_init.select_test_files(main, no_files = [1, 2])
wl_test_init.select_test_files(main, no_files = [0], ref = True)
# Multiple observed files & multiple reference files
-elif i % 5 == 3:
+elif i % 6 == 3:
wl_test_init.select_test_files(main, no_files = [1, 2])
wl_test_init.select_test_files(main, no_files = [1, 2], ref = True)
-# TTR = 1
-elif i % 5 == 4:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 6 - 1])
wl_test_init.select_test_files(main, no_files = [0], ref = True)

main.settings_custom['keyword_extractor']['generation_settings']['test_statistical_significance'] = test_statistical_significance
10 changes: 5 additions & 5 deletions tests/test_ngram_generator.py
@@ -39,14 +39,14 @@ def test_ngram_generator():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

global main_global # pylint: disable=global-statement
main_global = main
8 changes: 4 additions & 4 deletions tests/test_profiler.py
@@ -32,16 +32,16 @@
def test_profiler():
main = wl_test_init.Wl_Test_Main()

-for i in range(3):
+for i in range(4):
# Single file
if i == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
elif i == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i + 1])

global main_global # pylint: disable=global-statement
main_global = main
10 changes: 5 additions & 5 deletions tests/test_wordlist_generator.py
@@ -36,14 +36,14 @@ def test_wordlist_generator():
fillvalue = 'none'
)):
# Single file
-if i % 3 == 0:
+if i % 4 == 0:
wl_test_init.select_test_files(main, no_files = [0])
# Multiple files
-elif i % 3 == 1:
+elif i % 4 == 1:
wl_test_init.select_test_files(main, no_files = [1, 2])
-# TTR = 1
-elif i % 3 == 2:
-wl_test_init.select_test_files(main, no_files = [3])
+# Miscellaneous
+else:
+wl_test_init.select_test_files(main, no_files = [i % 4 + 1])

global main_global # pylint: disable=global-statement
main_global = main
10 changes: 7 additions & 3 deletions wordless/wl_concordancer.py
@@ -777,7 +777,7 @@ def run(self):

len_context_right += len_token_next

-# Search in Results (Left & Right)
+# Search in results (Left & Right)
text_search_left = copy.deepcopy(context_left)
text_search_right = copy.deepcopy(context_right)

@@ -821,7 +821,7 @@ def run(self):
context_left = text.tokens_flat_punc_marks_merged[max(0, i - width_left_token) : i]
context_right = text.tokens_flat_punc_marks_merged[i + len_search_term : i + len_search_term + width_right_token]

-# Search in Results (Left & Right)
+# Search in results (Left & Right)
if settings['token_settings']['punc_marks']:
text_search_left = copy.deepcopy(context_left)
text_search_right = copy.deepcopy(context_right)
@@ -858,14 +858,18 @@ def run(self):
context_left = text.tokens_flat_punc_marks_merged[offset_start:i]
context_right = text.tokens_flat_punc_marks_merged[i + len_search_term : offset_end]

-# Search in Results (Left & Right)
+# Search in results (Left & Right)
if settings['token_settings']['punc_marks']:
text_search_left = copy.deepcopy(context_left)
text_search_right = copy.deepcopy(context_right)
else:
text_search_left = tokens[offset_start:i]
text_search_right = tokens[i + len_search_term : offset_end]

+# Remove empty tokens for searching in results
+text_search_left = [token for token in text_search_left if token]
+text_search_right = [token for token in text_search_right if token]

context_left = wl_nlp_utils.escape_tokens(context_left)
context_right = wl_nlp_utils.escape_tokens(context_right)

3 changes: 2 additions & 1 deletion wordless/wl_dependency_parser.py
@@ -535,7 +535,8 @@ def run(self):
offset_end = offsets_sentences[no_sentence]

sentence_display = text.tokens_flat_punc_marks_merged[offsets_sentences[no_sentence - 1]:offset_end]
-sentence_search = sentence
+# Remove empty tokens for searching in results
+sentence_search = [token for token in sentence if token]

# Head
results[-1].append(head)
4 changes: 3 additions & 1 deletion wordless/wl_nlp/wl_dependency_parsing.py
@@ -96,6 +96,9 @@ def wl_dependency_parse_text(main, inputs, lang, dependency_parser):
def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged):
dependencies = []

+# Discard empty tokens since they are useless for dependency parsing and spacy.tokens.Doc does not accept empty strings
+inputs = [token for token in inputs if token]
+
if tagged:
inputs, tags = wl_matching.split_tokens_tags(main, inputs)

@@ -142,7 +145,6 @@ def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged):
token.head - token.id if token.head > 0 else 0
))

-# Put back tokens and tags
if tagged:
for i, dependency in enumerate(dependencies):
token, head, dependency_relation, dependency_dist = dependency
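The comment added above states the constraint directly: spacy.tokens.Doc does not accept empty strings, so empty tokens are discarded before parsing. A minimal sketch of that filtering step, assuming spaCy is installed, using a blank English pipeline and made-up tokens (this is not Wordless code):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank('en')

    # Made-up tokens; the empty string stands in for a token stripped out upstream
    tokens = ['"', 'The', 'first', 'token', '', 'is', 'a', 'punctuation', 'mark', '.']

    # Discard empty tokens before constructing the Doc, mirroring the fix above
    tokens = [token for token in tokens if token]

    doc = Doc(nlp.vocab, words = tokens)
    print([token.text for token in doc])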
4 changes: 2 additions & 2 deletions wordless/wl_nlp/wl_lemmatization.py
@@ -157,16 +157,16 @@ def wl_lemmatize_text(main, inputs, lang, lemmatizer):
return lemmas

def wl_lemmatize_tokens(main, inputs, lang, lemmatizer, tagged):
-empty_offsets = []
lemma_tokens = []
lemmas = []
+empty_offsets = []

if tagged:
inputs, tags = wl_matching.split_tokens_tags(main, inputs)
else:
tags = [''] * len(inputs)

-# Record positions of empty tokens and tags
+# Record positions of empty tokens and tags since spacy.tokens.Doc does not accept empty strings
for i, token in reversed(list(enumerate(inputs))):
if not token.strip():
empty_offsets.append(i)
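The lemmatizer handles the same constraint differently: rather than dropping empty tokens for good, it records their offsets so they can be accounted for after spaCy has processed the non-empty ones. The diff only shows the offsets being recorded, so the removal and reinsertion in the following sketch are illustrative assumptions, with a lower-casing step standing in for the real lemmatizer:

    tokens = ['The', '', 'first', '', 'token']
    tags = ['_DT', '', '_JJ', '', '_NN']

    empty_offsets = []

    # Record positions of empty tokens and tags, iterating in reverse so that
    # deleting an item does not shift the offsets yet to be visited
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)
            del tokens[i]
            del tags[i]

    # Stand-in for lemmatizing the remaining non-empty tokens
    lemmas = [token.lower() for token in tokens]

    # Assumed restore step: put empty placeholders back at their original positions
    for i in sorted(empty_offsets):
        lemmas.insert(i, '')

    print(lemmas)  # ['the', '', 'first', '', 'token']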
