Utils: Update spaCy's sentence recognizers, word tokenizers, part-of-speech taggers, lemmatizers, and dependency parsers
BLKSerene · Aug 8, 2023 · 9c09ca9
1 parent 047d5a4
commit 9c09ca9
Show file tree

Hide file tree

Showing 67 changed files with 2,179 additions and 1,202 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -53,16 +53,15 @@ jobs:
       # Run tests and collect coverage
       - name: Run Tests and collect coverage
         run: |
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_dependency_parsing.py
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_lemmatization.py
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_matching.py
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_nlp_utils.py
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_pos_tagging.py
+          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
           pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_sentence_tokenization.py
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_stop_word_lists.py
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_syl_tokenization.py
-          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_word_detokenization.py
           pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_word_tokenization.py
+          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_syl_tokenization.py
+          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_pos_tagging.py
+          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/test_lemmatization.py
+
+          pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
           pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_file_area
           pytest --cov=./ --cov-report=xml --cov-append tests/wl_tests_work_area
@@ -103,16 +102,15 @@ jobs:
       # Run tests
       - name: Run Tests
         run: |
-          pytest tests/wl_tests_nlp/test_dependency_parsing.py
-          pytest tests/wl_tests_nlp/test_lemmatization.py
-          pytest tests/wl_tests_nlp/test_matching.py
-          pytest tests/wl_tests_nlp/test_nlp_utils.py
-          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
           pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-          pytest tests/wl_tests_nlp/test_stop_word_lists.py
-          pytest tests/wl_tests_nlp/test_syl_tokenization.py
-          pytest tests/wl_tests_nlp/test_word_detokenization.py
           pytest tests/wl_tests_nlp/test_word_tokenization.py
+          pytest tests/wl_tests_nlp/test_syl_tokenization.py
+          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/test_lemmatization.py
+
+          pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
           pytest tests/wl_tests_file_area
           # Ignore tests of Profiler due to unknown errors
@@ -153,16 +151,15 @@ jobs:
           # Fix PyQt
           export QT_QPA_PLATFORM=offscreen
 
-          pytest tests/wl_tests_nlp/test_dependency_parsing.py
-          pytest tests/wl_tests_nlp/test_lemmatization.py
-          pytest tests/wl_tests_nlp/test_matching.py
-          pytest tests/wl_tests_nlp/test_nlp_utils.py
-          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
           pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-          pytest tests/wl_tests_nlp/test_stop_word_lists.py
-          pytest tests/wl_tests_nlp/test_syl_tokenization.py
-          pytest tests/wl_tests_nlp/test_word_detokenization.py
           pytest tests/wl_tests_nlp/test_word_tokenization.py
+          pytest tests/wl_tests_nlp/test_syl_tokenization.py
+          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/test_lemmatization.py
+
+          pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
           pytest tests/wl_tests_file_area
           pytest tests/wl_tests_work_area

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,7 @@
 - Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula / Danielson-Bryan's Readability Formula / Degrees of Reading Power / Easy Listening Formula / Fucks's Stilcharakteristik / Strain Index / Tränkle & Bailer's Readability Formula / Tuldava's Text Difficulty / Wheeler & Smith's Readability Formula
 
 ### ✨ Improvements
+- Utils: Update spaCy's sentence recognizers, word tokenizers, part-of-speech taggers, lemmatizers, and dependency parsers
 - Utils: Update Wordless's sentence and sentence segment splitters
 - Work Area: Update Profiler - Readability - Automated Readability Index / Dale-Chall Readability Formula / Flesch Reading Ease / Flesch Reading Ease (Farr-Jenkins-Paterson) / Gunning Fog Index / Spache Grade Level
 

diff --git a/appveyor.yml b/appveyor.yml
@@ -60,16 +60,15 @@ for:
 
     # Run tests
     test_script:
-      - pytest tests/wl_tests_nlp/test_dependency_parsing.py
-      - pytest tests/wl_tests_nlp/test_lemmatization.py
-      - pytest tests/wl_tests_nlp/test_matching.py
-      - pytest tests/wl_tests_nlp/test_nlp_utils.py
-      - pytest tests/wl_tests_nlp/test_pos_tagging.py
+      - pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
       - pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-      - pytest tests/wl_tests_nlp/test_stop_word_lists.py
-      - pytest tests/wl_tests_nlp/test_syl_tokenization.py
-      - pytest tests/wl_tests_nlp/test_word_detokenization.py
       - pytest tests/wl_tests_nlp/test_word_tokenization.py
+      - pytest tests/wl_tests_nlp/test_syl_tokenization.py
+      - pytest tests/wl_tests_nlp/test_pos_tagging.py
+      - pytest tests/wl_tests_nlp/test_lemmatization.py
+
+      - pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
       - pytest tests/wl_tests_file_area
       - pytest tests/wl_tests_work_area
@@ -94,16 +93,15 @@ for:
 
     # Run tests
     test_script:
-      - pytest tests/wl_tests_nlp/test_dependency_parsing.py
-      - pytest tests/wl_tests_nlp/test_lemmatization.py
-      - pytest tests/wl_tests_nlp/test_matching.py
-      - pytest tests/wl_tests_nlp/test_nlp_utils.py
-      - pytest tests/wl_tests_nlp/test_pos_tagging.py
+      - pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
       - pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-      - pytest tests/wl_tests_nlp/test_stop_word_lists.py
-      - pytest tests/wl_tests_nlp/test_syl_tokenization.py
-      - pytest tests/wl_tests_nlp/test_word_detokenization.py
       - pytest tests/wl_tests_nlp/test_word_tokenization.py
+      - pytest tests/wl_tests_nlp/test_syl_tokenization.py
+      - pytest tests/wl_tests_nlp/test_pos_tagging.py
+      - pytest tests/wl_tests_nlp/test_lemmatization.py
+
+      - pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
       - pytest tests/wl_tests_file_area
       # Ignore tests of Profiler due to unknown errors
@@ -132,16 +130,15 @@ for:
       # Fix PyQt
       - export QT_QPA_PLATFORM=offscreen
 
-      - pytest tests/wl_tests_nlp/test_dependency_parsing.py
-      - pytest tests/wl_tests_nlp/test_lemmatization.py
-      - pytest tests/wl_tests_nlp/test_matching.py
-      - pytest tests/wl_tests_nlp/test_nlp_utils.py
-      - pytest tests/wl_tests_nlp/test_pos_tagging.py
+      - pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
       - pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-      - pytest tests/wl_tests_nlp/test_stop_word_lists.py
-      - pytest tests/wl_tests_nlp/test_syl_tokenization.py
-      - pytest tests/wl_tests_nlp/test_word_detokenization.py
       - pytest tests/wl_tests_nlp/test_word_tokenization.py
+      - pytest tests/wl_tests_nlp/test_syl_tokenization.py
+      - pytest tests/wl_tests_nlp/test_pos_tagging.py
+      - pytest tests/wl_tests_nlp/test_lemmatization.py
+
+      - pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
       - pytest tests/wl_tests_file_area
       - pytest tests/wl_tests_work_area

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -54,16 +54,15 @@ jobs:
 
       # Run tests
       - script: |
-          pytest tests/wl_tests_nlp/test_dependency_parsing.py
-          pytest tests/wl_tests_nlp/test_lemmatization.py
-          pytest tests/wl_tests_nlp/test_matching.py
-          pytest tests/wl_tests_nlp/test_nlp_utils.py
-          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
           pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-          pytest tests/wl_tests_nlp/test_stop_word_lists.py
-          pytest tests/wl_tests_nlp/test_syl_tokenization.py
-          pytest tests/wl_tests_nlp/test_word_detokenization.py
           pytest tests/wl_tests_nlp/test_word_tokenization.py
+          pytest tests/wl_tests_nlp/test_syl_tokenization.py
+          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/test_lemmatization.py
+
+          pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
           pytest tests/wl_tests_file_area
           pytest tests/wl_tests_work_area
@@ -104,16 +103,15 @@ jobs:
 
       # Run tests
       - script: |
-          pytest tests/wl_tests_nlp/test_dependency_parsing.py
-          pytest tests/wl_tests_nlp/test_lemmatization.py
-          pytest tests/wl_tests_nlp/test_matching.py
-          pytest tests/wl_tests_nlp/test_nlp_utils.py
-          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
           pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-          pytest tests/wl_tests_nlp/test_stop_word_lists.py
-          pytest tests/wl_tests_nlp/test_syl_tokenization.py
-          pytest tests/wl_tests_nlp/test_word_detokenization.py
           pytest tests/wl_tests_nlp/test_word_tokenization.py
+          pytest tests/wl_tests_nlp/test_syl_tokenization.py
+          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/test_lemmatization.py
+
+          pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
           pytest tests/wl_tests_file_area
           # Ignore tests of Profiler due to unknown errors
@@ -158,16 +156,15 @@ jobs:
           # Fix PyQt
           export QT_QPA_PLATFORM=offscreen
 
-          pytest tests/wl_tests_nlp/test_dependency_parsing.py
-          pytest tests/wl_tests_nlp/test_lemmatization.py
-          pytest tests/wl_tests_nlp/test_matching.py
-          pytest tests/wl_tests_nlp/test_nlp_utils.py
-          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/wl_tests_spacy/test_spacy_eng.py
+
           pytest tests/wl_tests_nlp/test_sentence_tokenization.py
-          pytest tests/wl_tests_nlp/test_stop_word_lists.py
-          pytest tests/wl_tests_nlp/test_syl_tokenization.py
-          pytest tests/wl_tests_nlp/test_word_detokenization.py
           pytest tests/wl_tests_nlp/test_word_tokenization.py
+          pytest tests/wl_tests_nlp/test_syl_tokenization.py
+          pytest tests/wl_tests_nlp/test_pos_tagging.py
+          pytest tests/wl_tests_nlp/test_lemmatization.py
+
+          pytest tests/wl_tests_nlp/ --ignore=tests/wl_tests_nlp/wl_tests_spacy --ignore=tests/wl_tests_nlp/test_sentence_tokenization.py --ignore=tests/wl_tests_nlp/test_word_tokenization.py --ignore=tests/wl_tests_nlp/test_syl_tokenization.py --ignore=tests/wl_tests_nlp/test_pos_tagging.py --ignore=tests/wl_tests_nlp/test_lemmatization.py
 
           pytest tests/wl_tests_file_area
           pytest tests/wl_tests_work_area

diff --git a/tests/wl_test_init.py b/tests/wl_test_init.py
@@ -127,3 +127,19 @@ def select_random_files_ref(main, num_files):
 def clean_import_caches():
     for file in glob.glob('imports/*.*'):
         os.remove(file)
+
+def change_default_tokenizers(main):
+    for lang in [
+        'cat', 'zho_cn', 'zho_tw', 'hrv', 'dan',
+        'nld', 'eng_gb', 'eng_us', 'fin', 'fra',
+        'deu_at', 'deu_de', 'deu_ch', 'ell', 'ita',
+        'jpn', 'kor', 'lit', 'mkd', 'nob',
+        'pol', 'por_br', 'por_pt', 'ron', 'rus',
+        'slv', 'spa', 'swe', 'ukr', 'other'
+    ]:
+        main.settings_custom['sentence_tokenization']['sentence_tokenizer_settings'][lang] = 'spacy_sentencizer'
+
+        if lang in ['zho_cn', 'zho_tw']:
+            main.settings_custom['word_tokenization']['word_tokenizer_settings'][lang] = 'pkuseg_zho'
+        else:
+            main.settings_custom['word_tokenization']['word_tokenizer_settings'][lang] = 'nltk_nltk'