Utils: Update sentence tokenizers, word tokenizers, syllable tokenizers, part-of-speech taggers, lemmatizers, dependency parsers, and sentiment analyzers
BLKSerene committed May 16, 2024
1 parent f5693ac commit 93f9cb0
Showing 61 changed files with 2,809 additions and 2,877 deletions.
29 changes: 15 additions & 14 deletions pylintrc
@@ -24,23 +24,23 @@ extension-pkg-whitelist=
[MESSAGES CONTROL]

disable=
-# C103, C114, C115, C116
+# C0103, C0114, C0115, C0116
invalid-name,
missing-module-docstring,
missing-class-docstring,
missing-function-docstring,
-# C301, C302
+# C0301, C0302
line-too-long,
too-many-lines,
-# C413, C415
+# C0413, C0415
wrong-import-position,
import-outside-toplevel,

-# R401
+# R0401
cyclic-import,
-# R801
+# R0801
duplicate-code,
-# R901, R902, R903, R904, R912, R913, R914, R915, R916
+# R0901, R0902, R0903, R0904, R0912, R0913, R0914, R0915, R0916, R0917
too-many-ancestors,
too-many-instance-attributes,
too-few-public-methods,
@@ -50,21 +50,22 @@ disable=
too-many-locals,
too-many-statements,
too-many-branches,
-# R1702, R1705, R1723, R1724
+too-many-positional,
+# R1702, R1705, R1720, R1723, R1724
too-many-nested-blocks,
no-else-return,
+no-else-raise,
no-else-break,
no-else-continue,

-# W201, W212
+# W0201, W0212
attribute-defined-outside-init,
protected-access,
# W511
fixme,
-# W621
+# W0603, W0621
+global-statement,
redefined-outer-name,

-# E401
+# E0401
import-error,
-# E1121
-too-many-function-args
+# E0606
+possibly-used-before-assignment,
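
A note on the comment fixes above: pylint message IDs are a letter followed by four digits, so C103, R401, W621, and E401 were typos for C0103, R0401, W0621, and E0401; only the comments change, since disable= uses the symbolic names. Newly disabling global-statement (W0603) repo-wide also explains why the per-line # pylint: disable=global-statement pragmas disappear from the test files below. A small verification sketch, my addition, assuming only that pylint is installed (--help-msg is pylint's built-in message lookup):

    import subprocess

    # Symbols newly added to disable= in this commit
    symbols = ['too-many-positional', 'global-statement', 'possibly-used-before-assignment']

    for symbol in symbols:
        result = subprocess.run(
            ['python', '-m', 'pylint', f'--help-msg={symbol}'],
            capture_output=True, text=True, check=False,
        )
        # On success pylint prints ':symbol (ID): ...'; otherwise the symbol
        # is unknown to the installed pylint version
        known = f':{symbol} (' in result.stdout
        print(f'{symbol}: {"known" if known else "unknown"}')
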
87 changes: 87 additions & 0 deletions tests/files/file_area/misc/[eng_gb] Tagged.txt

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions tests/test_colligation_extractor.py
@@ -86,14 +86,18 @@ def update_gui(err_msg, colligations_freqs_files, colligations_stats_files):
assert node
# Collocate
assert collocate

# Frequency (span positions)
for freqs_file in freqs_files:
assert len(freqs_file) == 10

# Frequency (total)
assert sum((sum(freqs_file) for freqs_file in freqs_files)) >= 0

# p-value
for _, p_value, _, _ in stats_files:
assert p_value is None or 0 <= p_value <= 1

# Number of Files Found
assert len([freqs_file for freqs_file in freqs_files[:-1] if sum(freqs_file)]) >= 1

…
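
A note on the p-value check above: each stats entry is a 4-tuple whose second slot holds the p-value, and measures that provide no significance test leave it as None. A minimal, self-contained illustration (the other three slots are placeholders; this diff does not identify them):

    stats_files = [
        (6.63, 0.0099, 0.0, 0.35),  # a measure that yields a p-value
        (0.42, None, 0.0, 0.10),    # a measure with no significance test
    ]

    for _, p_value, _, _ in stats_files:
        assert p_value is None or 0 <= p_value <= 1
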
4 changes: 4 additions & 0 deletions tests/test_collocation_extractor.py
@@ -86,14 +86,18 @@ def update_gui(err_msg, collocations_freqs_files, collocations_stats_files):
assert node
# Collocate
assert collocate

# Frequency (span positions)
for freqs_file in freqs_files:
assert len(freqs_file) == 10

# Frequency (total)
assert sum((sum(freqs_file) for freqs_file in freqs_files)) >= 0

# p-value
for _, p_value, _, _ in stats_files:
assert p_value is None or 0 <= p_value <= 1

# Number of Files Found
assert len([freqs_file for freqs_file in freqs_files[:-1] if sum(freqs_file)]) >= 1

…
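
The len(freqs_file) == 10 assertion in both extractor tests reflects a window of five span positions on each side of the node (L5..L1, R1..R5). A self-contained sketch of per-position counting under that assumption (illustrative only, not Wordless's implementation):

    from collections import Counter

    def count_by_span_position(tokens, node, window=5):
        # Positions 0-4 hold L5-L1, positions 5-9 hold R1-R5
        freqs = [Counter() for _ in range(window * 2)]

        for i, token in enumerate(tokens):
            if token != node:
                continue

            for offset in range(-window, window + 1):
                if offset == 0 or not 0 <= i + offset < len(tokens):
                    continue

                pos = offset + window if offset < 0 else offset + window - 1
                freqs[pos][tokens[i + offset]] += 1

        return freqs

    freqs_file = count_by_span_position('the cat sat on the mat near the cat'.split(), 'cat')
    assert len(freqs_file) == 10
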
29 changes: 13 additions & 16 deletions tests/test_concordancer.py
@@ -42,7 +42,7 @@ def test_concordancer():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
@@ -66,9 +66,9 @@ def update_gui_table(err_msg, concordance_lines):
file_names_selected = list(main_global.wl_file_area.get_selected_file_names())

for concordance_line in concordance_lines:
-left_text, left_text_raw, left_text_search = concordance_line[0]
-node_text, node_text_raw, node_text_search = concordance_line[1]
-right_text, right_text_raw, right_text_search = concordance_line[2]
+left_tokens_raw, left_tokens_search = concordance_line[0]
+node_tokens_raw, node_tokens_search = concordance_line[1]
+right_tokens_raw, right_tokens_search = concordance_line[2]

sentiment = concordance_line[3]
no_token, len_tokens = concordance_line[4]
@@ -78,19 +78,16 @@ def update_gui_table(err_msg, concordance_lines):
file_name = concordance_line[8]

# Node
-assert node_text
-assert node_text_raw
-assert node_text_search
+assert node_tokens_raw
+assert node_tokens_search

# Left & Right
-assert left_text or right_text
-assert left_text == [] or all(left_text)
-assert right_text == [] or all(right_text)
-assert left_text_raw or right_text_raw
-assert left_text_raw == [] or all(left_text_raw)
-assert right_text_raw == [] or all(right_text_raw)
-assert left_text_search or right_text_search
-assert left_text_search == [] or all(left_text_search)
-assert right_text_search == [] or all(right_text_search)
+assert left_tokens_raw or right_tokens_raw
+assert left_tokens_raw == [] or all(left_tokens_raw)
+assert right_tokens_raw == [] or all(right_tokens_raw)
+assert left_tokens_search or right_tokens_search
+assert left_tokens_search == [] or all(left_tokens_search)
+assert right_tokens_search == [] or all(right_tokens_search)

# Sentiment
assert sentiment == 'No language support' or -1 <= sentiment <= 1
…
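
Two reading notes on the concordancer changes. First, the rename from *_text/*_text_raw/*_text_search triples to *_tokens_raw/*_tokens_search pairs suggests each context now carries just a raw form for display and a normalized form for matching. A sketch of that split, assuming case-folding as the normalization (both the names and the rule are my guesses, not Wordless's code):

    def to_search_forms(tokens_raw):
        return [token.casefold() for token in tokens_raw]

    left_tokens_raw = ['The', 'Quick', 'Brown']
    left_tokens_search = to_search_forms(left_tokens_raw)

    assert left_tokens_raw == [] or all(left_tokens_raw)
    assert left_tokens_search == ['the', 'quick', 'brown']

Second, the sentiment assertion expects scores in [-1, 1] unless a language is unsupported. That range matches VADER-style compound scores; a sketch with NLTK's VADER as a representative analyzer (my choice of example, since the commit does not name the updated analyzers; requires nltk.download('vader_lexicon') once):

    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    # The 'compound' score is normalized to [-1, 1]
    sentiment = analyzer.polarity_scores('The results are surprisingly good!')['compound']

    assert -1 <= sentiment <= 1
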
2 changes: 1 addition & 1 deletion tests/test_dependency_parser.py
@@ -42,7 +42,7 @@ def test_dependency_parser():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
…
2 changes: 0 additions & 2 deletions tests/test_keyword_extractor.py
@@ -16,8 +16,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

-# pylint: disable=unsupported-assignment-operation
-
import random

from tests import wl_test_init
…
2 changes: 1 addition & 1 deletion tests/test_ngram_generator.py
@@ -47,7 +47,7 @@ def test_ngram_generator():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

settings['generation_settings']['measure_dispersion'] = random.choice(measures_dispersion)
…
2 changes: 1 addition & 1 deletion tests/test_profiler.py
@@ -44,7 +44,7 @@ def test_profiler():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
…
2 changes: 1 addition & 1 deletion tests/test_wordlist_generator.py
@@ -44,7 +44,7 @@ def test_wordlist_generator():
case _:
wl_test_init.select_test_files(main, no_files = [i + 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

settings['generation_settings']['measure_dispersion'] = random.choice(measures_dispersion)
…
4 changes: 2 additions & 2 deletions tests/tests_figs/test_figs.py
@@ -38,10 +38,10 @@ def test_get_data_ranks():
assert wl_figs.get_data_ranks(data_files_items, fig_settings) == [(str(i), i) for i in range(50)]

def test_generate_line_chart():
-main = wl_test_init.Wl_Test_Main()
+main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')
wl_test_init.select_test_files(main, no_files = [0, 1])

-global main_global # pylint: disable=global-statement
+global main_global
main_global = main

wl_figs.generate_line_chart(
…
14 changes: 10 additions & 4 deletions tests/tests_figs/test_figs_freqs.py
@@ -20,9 +20,10 @@

from tests import wl_test_init
from wordless.wl_figs import wl_figs_freqs
+from wordless.wl_nlp import wl_texts

def test_wl_fig_freqs():
-main = wl_test_init.Wl_Test_Main()
+main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')

for tab in [
'wordlist_generator',
@@ -45,30 +46,35 @@ def test_wl_fig_freqs():

if graph_type == 'Network Graph':
for node in range(10):
+node = wl_texts.Wl_Token(str(node))
+
for collocate in range(10):
+collocate = wl_texts.Wl_Token(str(collocate))
freq_1, freq_2 = random.sample(range(10000), 2)

-freq_files_items[(str(node), str(collocate))] = [
+freq_files_items[(node, collocate)] = [
max(freq_1, freq_2) - min(freq_1, freq_2),
min(freq_1, freq_2),
max(freq_1, freq_2)
]
else:
if tab == 'keyword_extractor':
for item in range(100):
+item = wl_texts.Wl_Token(str(item))
freq_1, freq_2 = random.sample(range(100), 2)

-freq_files_items[str(item)] = [
+freq_files_items[item] = [
random.randint(0, 100),
max(freq_1, freq_2) - min(freq_1, freq_2),
min(freq_1, freq_2),
max(freq_1, freq_2)
]
else:
for item in range(100):
+item = wl_texts.Wl_Token(str(item))
freq_1, freq_2 = random.sample(range(100), 2)

-freq_files_items[str(item)] = [
+freq_files_items[item] = [
max(freq_1, freq_2) - min(freq_1, freq_2),
min(freq_1, freq_2),
max(freq_1, freq_2)
…
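
The [max - min, min, max] values built above look arbitrary at first; with two test files selected, the three slots read naturally as (frequency in file 1, frequency in file 2, total), generated so the total always equals the sum of the parts (the keyword-extractor branch prepends one extra slot). That reading is my inference from the arithmetic, not from Wordless's docs:

    import random

    freq_1, freq_2 = random.sample(range(10000), 2)

    freq_file_1 = max(freq_1, freq_2) - min(freq_1, freq_2)
    freq_file_2 = min(freq_1, freq_2)
    freq_total = max(freq_1, freq_2)

    # The generated total is consistent with the per-file parts by construction
    assert freq_file_1 + freq_file_2 == freq_total
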
11 changes: 8 additions & 3 deletions tests/tests_figs/test_figs_stats.py
@@ -20,9 +20,10 @@

from tests import wl_test_init
from wordless.wl_figs import wl_figs_stats
+from wordless.wl_nlp import wl_texts

def test_wl_fig_stats():
-main = wl_test_init.Wl_Test_Main()
+main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')

for tab in [
'wordlist_generator',
@@ -55,15 +56,19 @@ def test_wl_fig_stats():

if graph_type == 'Network Graph':
for node in range(10):
+node = wl_texts.Wl_Token(str(node))
+
for collocate in range(10):
-stat_files_items[(str(node), str(collocate))] = [
+collocate = wl_texts.Wl_Token(str(collocate))
+stat_files_items[(node, collocate)] = [
random.uniform(0, val_max),
random.uniform(0, val_max),
random.uniform(0, val_max)
]
else:
for item in range(100):
-stat_files_items[str(item)] = [
+item = wl_texts.Wl_Token(str(item))
+stat_files_items[item] = [
random.uniform(0, val_max),
random.uniform(0, val_max),
random.uniform(0, val_max)
…
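
In both figure tests, plain string keys become wl_texts.Wl_Token objects without any change to the surrounding dict logic. That works cleanly if Wl_Token behaves like the string it wraps; a stand-in class illustrating the idea (an assumption about Wordless's design, not its actual code):

    # Stand-in only: a str subclass hashes and compares like the plain string
    # it replaces, so dict keys such as (node, collocate) keep working
    class Wl_Token(str):
        def __new__(cls, text, tag=None):
            token = super().__new__(cls, text)
            token.tag = tag  # room for token-level metadata, e.g. a POS tag
            return token

    node, collocate = Wl_Token('1'), Wl_Token('2')
    stat_files_items = {(node, collocate): [0.1, 0.2, 0.3]}

    assert ('1', '2') in stat_files_items
    assert node.tag is None
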