Skip to content

Commit

Permalink
Utils: Update spaCy's sentence recognizers, word tokenizers, part-of-…
Browse files Browse the repository at this point in the history
…speech taggers, lemmatizers, and dependency parsers
  • Loading branch information
BLKSerene committed Aug 8, 2023
1 parent 047d5a4 commit 9c09ca9
Show file tree
Hide file tree
Showing 67 changed files with 2,179 additions and 1,202 deletions.
45 changes: 21 additions & 24 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,15 @@ jobs:
# Run tests and collect coverage
- name: Run Tests and collect coverage
run: |
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_dependency_parsing.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_lemmatization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_matching.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_nlp_utils.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_pos_tagging.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_sentence_tokenization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_stop_word_lists.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_syl_tokenization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_word_detokenization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_word_tokenization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_syl_tokenization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_pos_tagging.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_lemmatization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_file_area
pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_work_area
Expand Down Expand Up @@ -103,16 +102,15 @@ jobs:
# Run tests
- name: Run Tests
run: |
pytest tests/wl_tests_nlp/test_dependency_parsing.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/test_matching.py
pytest tests/wl_tests_nlp/test_nlp_utils.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
pytest tests/wl_tests_nlp/test_sentence_tokenization.py
pytest tests/wl_tests_nlp/test_stop_word_lists.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_word_detokenization.py
pytest tests/wl_tests_nlp/test_word_tokenization.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_file_area
# Ignore tests of Profiler due to unknown errors
Expand Down Expand Up @@ -153,16 +151,15 @@ jobs:
# Fix PyQt
export QT_QPA_PLATFORM=offscreen
pytest tests/wl_tests_nlp/test_dependency_parsing.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/test_matching.py
pytest tests/wl_tests_nlp/test_nlp_utils.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
pytest tests/wl_tests_nlp/test_sentence_tokenization.py
pytest tests/wl_tests_nlp/test_stop_word_lists.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_word_detokenization.py
pytest tests/wl_tests_nlp/test_word_tokenization.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_file_area
pytest tests/wl_tests_work_area
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Danielson-Bryan's Readability Formula / Degrees of Reading Power / Easy Listening Formula / Fucks's Stilcharakteristik / Strain Index / Tränkle & Bailer's Readability Formula / Tuldava's Text Difficulty / Wheeler & Smith's Readability Formula

### ✨ Improvements
- Utils: Update spaCy's sentence recognizers, word tokenizers, part-of-speech taggers, lemmatizers, and dependency parsers
- Utils: Update Wordless's sentence and sentence segment splitters
- Work Area: Update Profiler - Readability - Automated Readability Index / Dale-Chall Readability Formula / Flesch Reading Ease / Flesch Reading Ease (Farr-Jenkins-Paterson) / Gunning Fog Index / Spache Grade Level

Expand Down
45 changes: 21 additions & 24 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,15 @@ for:

# Run tests
test_script:
- pytest tests/wl_tests_nlp/test_dependency_parsing.py
- pytest tests/wl_tests_nlp/test_lemmatization.py
- pytest tests/wl_tests_nlp/test_matching.py
- pytest tests/wl_tests_nlp/test_nlp_utils.py
- pytest tests/wl_tests_nlp/test_pos_tagging.py
- pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py

- pytest tests/wl_tests_nlp/test_sentence_tokenization.py
- pytest tests/wl_tests_nlp/test_stop_word_lists.py
- pytest tests/wl_tests_nlp/test_syl_tokenization.py
- pytest tests/wl_tests_nlp/test_word_detokenization.py
- pytest tests/wl_tests_nlp/test_word_tokenization.py
- pytest tests/wl_tests_nlp/test_syl_tokenization.py
- pytest tests/wl_tests_nlp/test_pos_tagging.py
- pytest tests/wl_tests_nlp/test_lemmatization.py

- pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py

- pytest tests/wl_tests_file_area
- pytest tests/wl_tests_work_area
Expand All @@ -94,16 +93,15 @@ for:

# Run tests
test_script:
- pytest tests/wl_tests_nlp/test_dependency_parsing.py
- pytest tests/wl_tests_nlp/test_lemmatization.py
- pytest tests/wl_tests_nlp/test_matching.py
- pytest tests/wl_tests_nlp/test_nlp_utils.py
- pytest tests/wl_tests_nlp/test_pos_tagging.py
- pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py

- pytest tests/wl_tests_nlp/test_sentence_tokenization.py
- pytest tests/wl_tests_nlp/test_stop_word_lists.py
- pytest tests/wl_tests_nlp/test_syl_tokenization.py
- pytest tests/wl_tests_nlp/test_word_detokenization.py
- pytest tests/wl_tests_nlp/test_word_tokenization.py
- pytest tests/wl_tests_nlp/test_syl_tokenization.py
- pytest tests/wl_tests_nlp/test_pos_tagging.py
- pytest tests/wl_tests_nlp/test_lemmatization.py

- pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py

- pytest tests/wl_tests_file_area
# Ignore tests of Profiler due to unknown errors
Expand Down Expand Up @@ -132,16 +130,15 @@ for:
# Fix PyQt
- export QT_QPA_PLATFORM=offscreen

- pytest tests/wl_tests_nlp/test_dependency_parsing.py
- pytest tests/wl_tests_nlp/test_lemmatization.py
- pytest tests/wl_tests_nlp/test_matching.py
- pytest tests/wl_tests_nlp/test_nlp_utils.py
- pytest tests/wl_tests_nlp/test_pos_tagging.py
- pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py

- pytest tests/wl_tests_nlp/test_sentence_tokenization.py
- pytest tests/wl_tests_nlp/test_stop_word_lists.py
- pytest tests/wl_tests_nlp/test_syl_tokenization.py
- pytest tests/wl_tests_nlp/test_word_detokenization.py
- pytest tests/wl_tests_nlp/test_word_tokenization.py
- pytest tests/wl_tests_nlp/test_syl_tokenization.py
- pytest tests/wl_tests_nlp/test_pos_tagging.py
- pytest tests/wl_tests_nlp/test_lemmatization.py

- pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py

- pytest tests/wl_tests_file_area
- pytest tests/wl_tests_work_area
Expand Down
45 changes: 21 additions & 24 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,15 @@ jobs:
# Run tests
- script: |
pytest tests/wl_tests_nlp/test_dependency_parsing.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/test_matching.py
pytest tests/wl_tests_nlp/test_nlp_utils.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
pytest tests/wl_tests_nlp/test_sentence_tokenization.py
pytest tests/wl_tests_nlp/test_stop_word_lists.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_word_detokenization.py
pytest tests/wl_tests_nlp/test_word_tokenization.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_file_area
pytest tests/wl_tests_work_area
Expand Down Expand Up @@ -104,16 +103,15 @@ jobs:
# Run tests
- script: |
pytest tests/wl_tests_nlp/test_dependency_parsing.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/test_matching.py
pytest tests/wl_tests_nlp/test_nlp_utils.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
pytest tests/wl_tests_nlp/test_sentence_tokenization.py
pytest tests/wl_tests_nlp/test_stop_word_lists.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_word_detokenization.py
pytest tests/wl_tests_nlp/test_word_tokenization.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_file_area
# Ignore tests of Profiler due to unknown errors
Expand Down Expand Up @@ -158,16 +156,15 @@ jobs:
# Fix PyQt
export QT_QPA_PLATFORM=offscreen
pytest tests/wl_tests_nlp/test_dependency_parsing.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/test_matching.py
pytest tests/wl_tests_nlp/test_nlp_utils.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
pytest tests/wl_tests_nlp/test_sentence_tokenization.py
pytest tests/wl_tests_nlp/test_stop_word_lists.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_word_detokenization.py
pytest tests/wl_tests_nlp/test_word_tokenization.py
pytest tests/wl_tests_nlp/test_syl_tokenization.py
pytest tests/wl_tests_nlp/test_pos_tagging.py
pytest tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
pytest tests/wl_tests_file_area
pytest tests/wl_tests_work_area
Expand Down
16 changes: 16 additions & 0 deletions tests/wl_test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,19 @@ def select_random_files_ref(main, num_files):
def clean_import_caches():
    """Delete cached import files left in the ``imports`` directory.

    Every path matching ``imports/*.*`` (resolved against the current
    working directory) is removed. Entries whose names contain no dot are
    not matched by the pattern and are therefore left untouched.
    """
    for file in glob.glob('imports/*.*'):
        os.remove(file)

def change_default_tokenizers(main):
    """Override the sentence and word tokenizers stored in ``main.settings_custom``.

    For each language code listed below, the sentence tokenizer is set to
    ``'spacy_sentencizer'``. The word tokenizer is set to ``'pkuseg_zho'``
    for the two Chinese codes (``zho_cn`` and ``zho_tw``) and to
    ``'nltk_nltk'`` for every other listed code.

    Args:
        main: Object exposing a ``settings_custom`` mapping that contains
            ``['sentence_tokenization']['sentence_tokenizer_settings']`` and
            ``['word_tokenization']['word_tokenizer_settings']``. Both
            mappings are modified in place.
    """
    for lang in [
        'cat', 'zho_cn', 'zho_tw', 'hrv', 'dan',
        'nld', 'eng_gb', 'eng_us', 'fin', 'fra',
        'deu_at', 'deu_de', 'deu_ch', 'ell', 'ita',
        'jpn', 'kor', 'lit', 'mkd', 'nob',
        'pol', 'por_br', 'por_pt', 'ron', 'rus',
        'slv', 'spa', 'swe', 'ukr', 'other'
    ]:
        main.settings_custom['sentence_tokenization']['sentence_tokenizer_settings'][lang] = 'spacy_sentencizer'

        # The Chinese codes get a dedicated word tokenizer; all others share one
        if lang in ['zho_cn', 'zho_tw']:
            main.settings_custom['word_tokenization']['word_tokenizer_settings'][lang] = 'pkuseg_zho'
        else:
            main.settings_custom['word_tokenization']['word_tokenizer_settings'][lang] = 'nltk_nltk'
Loading

0 comments on commit 9c09ca9

Please sign in to comment.