diff --git a/Jenkinsfile b/Jenkinsfile index 4b7271dd24..6461c0f069 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -18,6 +18,8 @@ node('cuda-module') { docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py36 py37 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py38 py39 + docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 + docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310 docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 || exit 0 """ currentBuild.result = 'SUCCESS' diff --git a/README.md b/README.md index 63777998d0..61ff5e7369 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) -![Python 3.6, 3.7, 3.8, 3.9](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-green.svg) +![Python 3.6, 3.7, 3.8, 3.9, 3.10](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-green.svg) [![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov) @@ -34,9 +34,7 @@ Please leave us [your feedback](https://forms.gle/i64fowQmiVhMMC7f9) on how we c [Automatic Spelling Correction](http://docs.deeppavlov.ai/en/master/features/models/spelling_correction.html) | [Entity Linking](http://docs.deeppavlov.ai/en/master/features/models/entity_linking.html) -[Open Domain Questions Answering](http://docs.deeppavlov.ai/en/master/features/models/odqa.html) | [Frequently Asked Questions Answering](http://docs.deeppavlov.ai/en/master/features/models/faq.html) - -[Russian SuperGLUE](http://docs.deeppavlov.ai/en/master/features/models/superglue.html) +[Open Domain Questions Answering](http://docs.deeppavlov.ai/en/master/features/models/odqa.html) | [Russian SuperGLUE](http://docs.deeppavlov.ai/en/master/features/models/superglue.html) **Embeddings** @@ -58,7 +56,7 @@ Please leave us [your feedback](https://forms.gle/i64fowQmiVhMMC7f9) on how we c ## Installation -0. We support `Linux` platform, `Python 3.6`, `3.7`, `3.8` and `3.9` +0. We support `Linux` platform, `Python 3.6`, `3.7`, `3.8`, `3.9` and `3.10` * **`Python 3.5` is not supported!** 1. Create and activate a virtual environment: diff --git a/deeppavlov/_meta.py b/deeppavlov/_meta.py index 98a5ec4400..1d9fbe7a1b 100644 --- a/deeppavlov/_meta.py +++ b/deeppavlov/_meta.py @@ -1,4 +1,4 @@ -__version__ = '1.0.2' +__version__ = '1.1.0' __author__ = 'Neural Networks and Deep Learning lab, MIPT' __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.' 
__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'] diff --git a/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json b/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json index 16b20476c0..bf7ec99716 100644 --- a/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json +++ b/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json @@ -137,7 +137,7 @@ }, "download": [ { - "url": "https://files.deeppavlov.ai/0.16/classifiers/glue_mnli.tar.gz", + "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_mnli.tar.gz", "subdir": "{MODELS_PATH}" } ] diff --git a/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json b/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json index 6001c5cce7..4559394435 100644 --- a/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json +++ b/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json @@ -137,7 +137,7 @@ }, "download": [ { - "url": "https://files.deeppavlov.ai/0.16/classifiers/glue_rte.tar.gz", + "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_rte.tar.gz", "subdir": "{MODELS_PATH}" } ] diff --git a/deeppavlov/configs/cv/cv_tfidf_autofaq.json b/deeppavlov/configs/cv/cv_tfidf_autofaq.json deleted file mode 100644 index eaf4f32491..0000000000 --- a/deeppavlov/configs/cv/cv_tfidf_autofaq.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "dataset_reader": { - "class_name": "faq_reader", - "x_col_name": "Question", - "y_col_name": "Answer", - "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": "q", - "in_y": "y", - "pipe": [ - { - "class_name": "ru_tokenizer", - "id": "my_tokenizer", - "in": "q", - "lemmas": true, - "out": "q_token_lemmas" - }, - { - "ref": "my_tokenizer", - "in": "q_token_lemmas", - "out": "q_lem" - }, - { - "in": [ - "q_lem" - ], - "out": [ - "q_vect" - ], - "fit_on": [ - "q_lem" - ], - "id": "tfidf_vec", - "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", - "infer_method": "transform" - }, - { - "class_name": "cos_sim_classifier", - "in": [ - "q_vect" - ], - "fit_on": [ - "q_vect", - "y" - ], - "top_n": 1, - "save_path": "{MODELS_PATH}/faq/tfidf_cos_sim_classifier.pkl", - "load_path": "{MODELS_PATH}/faq/tfidf_cos_sim_classifier.pkl", - "out": [ - "answer", - "score" - ] - } - ], - "out": "answer" - }, - "train": { - "metrics": [ - {"name": "accuracy"} - ], - "evaluation_targets": [ - "valid" - ], - "class_name": "fit_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/faq/school/tfidf_cos_sim_classifier.pkl", - "subdir": "{MODELS_PATH}/faq" - }, - { - "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "subdir": "{MODELS_PATH}/vectorizer" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json b/deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json index 1ba5da819f..8b7849a645 100644 --- a/deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json +++ b/deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json @@ -33,8 +33,11 @@ "save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix.npz", "load_path": 
"{MODELS_PATH}/odqa/ruwiki_tfidf_matrix.npz", "tokenizer": { - "class_name": "ru_tokenizer", + "class_name": "stream_spacy_tokenizer", + "spacy_model": "ru_core_news_sm", "lemmas": true, + "lowercase": true, + "filter_stopwords": true, "ngram_range": [ 1, 2 diff --git a/deeppavlov/configs/embedder/tfidf_vectorizer.json b/deeppavlov/configs/embedder/tfidf_vectorizer.json deleted file mode 100644 index 898f2454f3..0000000000 --- a/deeppavlov/configs/embedder/tfidf_vectorizer.json +++ /dev/null @@ -1,61 +0,0 @@ -{ - "dataset_reader": { - "class_name": "line_reader", - "data_path": "{DOWNLOADS_PATH}/wiki/wikitext_ru/ru.wiki.train.txt" - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": "q", - "pipe": [ - { - "class_name": "ru_tokenizer", - "id": "my_tokenizer", - "in": "q", - "lemmas": true, - "out": "q_token_lemmas" - }, - { - "ref": "my_tokenizer", - "in": "q_token_lemmas", - "out": "q_lem" - }, - { - "in": [ - "q_lem" - ], - "out": [ - "q_vect" - ], - "fit_on": [ - "q_lem" - ], - "id": "tfidf_vec", - "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", - "infer_method": "transform" - } - ], - "out": "q_vect" - }, - "train": { - "evaluation_targets": [], - "class_name": "fit_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/wikitext_ru.zip", - "subdir": "{DOWNLOADS_PATH}/wiki" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/faq/fasttext_avg_autofaq.json b/deeppavlov/configs/faq/fasttext_avg_autofaq.json deleted file mode 100644 index 59fcb0ac47..0000000000 --- a/deeppavlov/configs/faq/fasttext_avg_autofaq.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "dataset_reader": { - "class_name": "faq_reader", - "x_col_name": "Question", - "y_col_name": "Answer", - "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": "question", - "pipe": [ - { - "class_name": "ru_tokenizer", - "in": "question", - "lemmas": true, - "out": "q_token_lemmas" - }, - { - "class_name": "fasttext", - "in": "q_token_lemmas", - "load_path": "{DOWNLOADS_PATH}/embeddings/lenta_lower_100.bin", - "mean": true, - "out": "question_vector" - }, - { - "class_name": "cos_sim_classifier", - "in": "question_vector", - "fit_on": [ - "question_vector", - "y" - ], - "top_n": 1, - "save_path": "{MODELS_PATH}/faq/fasttext_cos_classifier.pkl", - "load_path": "{MODELS_PATH}/faq/fasttext_cos_classifier.pkl", - "out": [ - "answer", - "score" - ] - } - ], - "out": [ - "answer", - "score" - ] - }, - "train": { - "evaluation_targets": [], - "class_name": "fit_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/faq/school/fasttext_cos_classifier.pkl", - "subdir": "{MODELS_PATH}/faq" - }, - { - "url": "http://files.deeppavlov.ai/embeddings/lenta_lower_100.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/faq/fasttext_logreg.json 
b/deeppavlov/configs/faq/fasttext_logreg.json new file mode 100644 index 0000000000..541329246d --- /dev/null +++ b/deeppavlov/configs/faq/fasttext_logreg.json @@ -0,0 +1,112 @@ +{ + "dataset_reader": { + "class_name": "basic_classification_reader", + "format": "json", + "orient": "split", + "x": "text", + "y": "category", + "data_path": "{DOWNLOADS_PATH}/massive/{LANGUAGE}", + "train": "train.json", + "valid": "dev.json", + "test": "test.json" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42, + "shuffle": true, + "shot": 5 + }, + "chainer": { + "in": ["text"], + "in_y": ["category"], + "pipe": [ + { + "class_name": "stream_spacy_tokenizer", + "in": ["text"], + "id": "my_tokenizer", + "lemmas": false, + "out": "token_lemmas", + "spacy_model": "{SPACY_MODEL}" + }, + { + "ref": "my_tokenizer", + "in": ["token_lemmas"], + "out": ["text_lem"] + }, + { + "class_name": "fasttext", + "in": ["token_lemmas"], + "load_path": "{DOWNLOADS_PATH}/embeddings/fasttext/{LANGUAGE}.bin", + "mean": true, + "out": ["text_vector"] + }, + { + "id": "answers_vocab", + "class_name": "simple_vocab", + "fit_on": "category", + "save_path": "{MODEL_PATH}/cat_answers.dict", + "load_path": "{MODEL_PATH}/cat_answers.dict", + "in": ["category"], + "out": ["y_ids"] + }, + { + "in": ["text_vector"], + "fit_on": ["text_vector", "y_ids"], + "out": ["y_pred_proba"], + "class_name": "sklearn_component", + "main": true, + "save_path": "{MODEL_PATH}/model.pkl", + "load_path": "{MODEL_PATH}/model.pkl", + "model_class": "sklearn.linear_model:LogisticRegression", + "infer_method": "predict_proba", + "C": 10, + "penalty": "l2" + }, + { + "in": ["y_pred_proba"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_category"], + "ref": "answers_vocab" + } + ], + "out": ["y_pred_category"] + }, + "train": { + "evaluation_targets": ["train", "valid", "test"], + "class_name": "fit_trainer", + "metrics": [ + { + "name": "accuracy", + "inputs": ["category", "y_pred_category"] + } + ] + }, + "metadata": { + "variables": { + "LANGUAGE": "en", + "ROOT_PATH": "~/.deeppavlov", + "SPACY_MODEL": "en_core_web_sm", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODEL_PATH": "{ROOT_PATH}/models/faq/{LANGUAGE}/fasttext_logreg" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/embeddings/fasttext/{LANGUAGE}.bin", + "subdir": "{DOWNLOADS_PATH}/embeddings/fasttext" + }, + { + "url": "http://files.deeppavlov.ai/datasets/massive-{LANGUAGE}.tar.gz", + "subdir": "{DOWNLOADS_PATH}/massive/{LANGUAGE}" + }, + { + "url": "https://files.deeppavlov.ai/faq/fasttext_logreg_{LANGUAGE}.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/deeppavlov/configs/faq/fasttext_tfidf_autofaq.json b/deeppavlov/configs/faq/fasttext_tfidf_autofaq.json deleted file mode 100644 index 0eb3e112fc..0000000000 --- a/deeppavlov/configs/faq/fasttext_tfidf_autofaq.json +++ /dev/null @@ -1,108 +0,0 @@ -{ - "dataset_reader": { - "class_name": "faq_reader", - "x_col_name": "Question", - "y_col_name": "Answer", - "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": "question", - "in_y": "y", - "pipe": [ - { - "class_name": "ru_tokenizer", - "id": "my_tokenizer", - "in": "question", - "lemmas": true, - "out": "q_token_lemmas" - }, - { - "ref": "my_tokenizer", - "in": "q_token_lemmas", - "out": "q_lem" - }, - { - "in": [ - "q_lem" - ], - "out": [ 
- "question_vector" - ], - "fit_on": [ - "q_lem", - "y" - ], - "id": "tfidf_vec", - "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", - "infer_method": "transform", - "analyzer": "word" - }, - { - "class_name": "fasttext", - "id": "my_embedder", - "in": "q_token_lemmas", - "load_path": "{DOWNLOADS_PATH}/embeddings/lenta_lower_100.bin", - "out": "tokens_fasttext_vectors" - }, - { - "class_name": "tfidf_weighted", - "in": "q_token_lemmas", - "vectorizer": "#tfidf_vec", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer", - "mean": true, - "out": "question_vector" - }, - { - "class_name": "cos_sim_classifier", - "in": "question_vector", - "fit_on": [ - "question_vector", - "y" - ], - "top_n": 1, - "save_path": "{MODELS_PATH}/faq/fasttext_cos_classifier.pkl", - "load_path": "{MODELS_PATH}/faq/fasttext_cos_classifier.pkl", - "out": [ - "answer", - "score" - ] - } - ], - "out": [ - "answer", - "score" - ] - }, - "train": { - "evaluation_targets": [], - "class_name": "fit_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/faq/school/fasttext_cos_classifier.pkl", - "subdir": "{MODELS_PATH}/faq" - }, - { - "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "subdir": "{MODELS_PATH}/vectorizer" - }, - { - "url": "http://files.deeppavlov.ai/embeddings/lenta_lower_100.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/faq/tfidf_autofaq.json b/deeppavlov/configs/faq/tfidf_autofaq.json deleted file mode 100644 index ffe1996af0..0000000000 --- a/deeppavlov/configs/faq/tfidf_autofaq.json +++ /dev/null @@ -1,85 +0,0 @@ -{ - "dataset_reader": { - "class_name": "faq_reader", - "x_col_name": "Question", - "y_col_name": "Answer", - "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": "q", - "pipe": [ - { - "class_name": "ru_tokenizer", - "id": "my_tokenizer", - "in": "q", - "lemmas": true, - "out": "q_token_lemmas" - }, - { - "ref": "my_tokenizer", - "in": "q_token_lemmas", - "out": "q_lem" - }, - { - "in": [ - "q_lem" - ], - "out": [ - "q_vect" - ], - "fit_on": [ - "q_lem" - ], - "id": "tfidf_vec", - "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", - "infer_method": "transform" - }, - { - "class_name": "cos_sim_classifier", - "in": "q_vect", - "fit_on": [ - "q_vect", - "y" - ], - "top_n": 1, - "save_path": "{MODELS_PATH}/faq/tfidf_cos_sim_classifier.pkl", - "load_path": "{MODELS_PATH}/faq/tfidf_cos_sim_classifier.pkl", - "out": [ - "answer", - "score" - ] - } - ], - "out": [ - "answer", - "score" - ] - }, - "train": { - "class_name": "fit_trainer", - "evaluation_targets": [] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/faq/school/tfidf_cos_sim_classifier.pkl", - "subdir": 
"{MODELS_PATH}/faq" - }, - { - "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "subdir": "{MODELS_PATH}/vectorizer" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/faq/tfidf_logreg_autofaq.json b/deeppavlov/configs/faq/tfidf_logreg_autofaq.json deleted file mode 100644 index a41ada103a..0000000000 --- a/deeppavlov/configs/faq/tfidf_logreg_autofaq.json +++ /dev/null @@ -1,116 +0,0 @@ -{ - "dataset_reader": { - "class_name": "faq_reader", - "x_col_name": "Question", - "y_col_name": "Answer", - "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" - }, - "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42 - }, - "chainer": { - "in": "q", - "in_y": "y", - "pipe": [ - { - "class_name": "ru_tokenizer", - "id": "my_tokenizer", - "in": "q", - "lemmas": true, - "out": "q_token_lemmas" - }, - { - "ref": "my_tokenizer", - "in": "q_token_lemmas", - "out": "q_lem" - }, - { - "in": [ - "q_lem" - ], - "out": [ - "q_vect" - ], - "fit_on": [ - "q_lem" - ], - "id": "tfidf_vec", - "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v2.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v2.pkl", - "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", - "infer_method": "transform" - }, - { - "id": "answers_vocab", - "class_name": "simple_vocab", - "fit_on": [ - "y" - ], - "save_path": "{MODELS_PATH}/faq/ru_mipt_answers.dict", - "load_path": "{MODELS_PATH}/faq/ru_mipt_answers.dict", - "in": "y", - "out": "y_ids" - }, - { - "in": "q_vect", - "fit_on": [ - "q_vect", - "y_ids" - ], - "out": [ - "y_pred_proba" - ], - "class_name": "sklearn_component", - "main": true, - "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl", - "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl", - "model_class": "sklearn.linear_model:LogisticRegression", - "infer_method": "predict_proba", - "C": 1000, - "penalty": "l2" - }, - { - "in": "y_pred_proba", - "out": "y_pred_ids", - "class_name": "proba2labels", - "max_proba": true - }, - { - "in": "y_pred_ids", - "out": "y_pred_answers", - "ref": "answers_vocab" - } - ], - "out": [ - "y_pred_answers", - "y_pred_proba" - ] - }, - "train": { - "class_name": "fit_trainer", - "evaluation_targets": [] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v4.pkl", - "subdir": "{MODELS_PATH}/faq" - }, - { - "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki_v2.pkl", - "subdir": "{MODELS_PATH}/vectorizer" - }, - { - "url": "http://files.deeppavlov.ai/faq/mipt/ru_mipt_answers.dict", - "subdir": "{MODELS_PATH}/faq" - } - ] - } -} diff --git a/deeppavlov/configs/faq/tfidf_logreg_en_faq.json b/deeppavlov/configs/faq/tfidf_logreg_en_faq.json deleted file mode 100644 index 8abccda06e..0000000000 --- a/deeppavlov/configs/faq/tfidf_logreg_en_faq.json +++ /dev/null @@ -1,107 +0,0 @@ -{ - "dataset_reader": { - "class_name": "faq_reader", - "x_col_name": "Question", - "y_col_name": "Answer", - "data_url": "http://files.deeppavlov.ai/faq/school/faq_school_en.csv" - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": "q", - "in_y": "y", - "pipe": [ - { - "class_name": "stream_spacy_tokenizer", - "in": "q", - "id": "my_tokenizer", - "lemmas": true, - "out": 
"q_token_lemmas" - }, - { - "ref": "my_tokenizer", - "in": "q_token_lemmas", - "out": "q_lem" - }, - { - "in": [ - "q_lem" - ], - "out": [ - "q_vect" - ], - "fit_on": [ - "q_lem" - ], - "id": "tfidf_vec", - "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/tfidf.pkl", - "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/tfidf.pkl", - "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", - "infer_method": "transform" - }, - { - "id": "answers_vocab", - "class_name": "simple_vocab", - "fit_on": [ - "y" - ], - "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/en_mipt_answers.dict", - "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/en_mipt_answers.dict", - "in": "y", - "out": "y_ids" - }, - { - "in": "q_vect", - "fit_on": [ - "q_vect", - "y_ids" - ], - "out": [ - "y_pred_proba" - ], - "class_name": "sklearn_component", - "main": true, - "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/logreg.pkl", - "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/logreg.pkl", - "model_class": "sklearn.linear_model:LogisticRegression", - "infer_method": "predict_proba", - "C": 1000, - "penalty": "l2" - }, - { - "in": "y_pred_proba", - "out": "y_pred_ids", - "class_name": "proba2labels", - "max_proba": true - }, - { - "in": "y_pred_ids", - "out": "y_pred_answers", - "ref": "answers_vocab" - } - ], - "out": [ - "y_pred_answers", - "y_pred_proba" - ] - }, - "train": { - "evaluation_targets": [], - "class_name": "fit_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/faq/mipt/en_mipt_faq_v5.tar.gz", - "subdir": "{MODELS_PATH}/faq/mipt" - } - ] - } -} diff --git a/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json b/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json deleted file mode 100644 index bf65d82229..0000000000 --- a/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json +++ /dev/null @@ -1,106 +0,0 @@ -{ - "dataset_reader": { - "class_name": "faq_reader", - "x_col_name": "Question", - "y_col_name": "Answer", - "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": "q", - "pipe": [ - { - "class_name": "ru_tokenizer", - "id": "my_tokenizer", - "in": "q", - "lemmas": true, - "out": "q_token_lemmas" - }, - { - "ref": "my_tokenizer", - "in": "q_token_lemmas", - "out": "q_lem" - }, - { - "in": [ - "q_lem" - ], - "out": [ - "q_vect" - ], - "fit_on": [ - "q_lem" - ], - "id": "tfidf_vec", - "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v3.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v3.pkl", - "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", - "infer_method": "transform" - }, - { - "in": [ - "q_vect" - ], - "out": [ - "y_pred" - ], - "fit_on": [ - "q_vect", - "y" - ], - "class_name": "sklearn_component", - "main": true, - "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl", - "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl", - "model_class": "sklearn.linear_model:LogisticRegression", - "infer_method": "predict", - "C": { - "search_choice": [ - 1, - 10, - 100, - 1000 - ] - }, - "penalty": { - "search_choice": [ - "l1", - "l2" - ] - } - } - ], - "out": [ - "y_pred" - ] - }, - "train": { - "metrics": [ - "accuracy" - 
], - "evaluation_targets": [ - "valid" - ], - "class_name": "fit_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v4.pkl", - "subdir": "{MODELS_PATH}/faq" - }, - { - "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki_v3.pkl", - "subdir": "{MODELS_PATH}/vectorizer" - } - ] - } -} diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 42f0df484e..c45ddfdf81 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -50,7 +50,6 @@ "rel_ranking_reader": "deeppavlov.dataset_readers.rel_ranking_reader:ParaphraserReader", "response_base_loader": "deeppavlov.models.preprocessors.response_base_loader:ResponseBaseLoader", "ru_adj_to_noun": "deeppavlov.models.kbqa.tree_to_sparql:RuAdjToNoun", - "ru_tokenizer": "deeppavlov.models.tokenizers.ru_tokenizer:RussianTokenizer", "rured_reader": "deeppavlov.dataset_readers.rured_reader:RuREDDatasetReader", "russian_words_vocab": "deeppavlov.vocabs.typos:RussianWordsVocab", "sanitizer": "deeppavlov.models.preprocessors.sanitizer:Sanitizer", diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index d65eba771e..0abdd3b308 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ b/deeppavlov/core/common/requirements_registry.json @@ -5,7 +5,9 @@ ], "entity_linker": [ "{DEEPPAVLOV_PATH}/requirements/hdt.txt", - "{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt" + "{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt", + "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", + "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "fasttext": [ "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" @@ -58,6 +60,7 @@ "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "ru_adj_to_noun": [ + "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt", "{DEEPPAVLOV_PATH}/requirements/udapi.txt" ], "russian_words_vocab": [ @@ -76,7 +79,8 @@ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "stream_spacy_tokenizer": [ - "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt" + "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", + "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "torch_bert_ranker": [ "{DEEPPAVLOV_PATH}/requirements/pytorch.txt", @@ -147,7 +151,9 @@ "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], "tree_to_sparql": [ - "{DEEPPAVLOV_PATH}/requirements/udapi.txt" + "{DEEPPAVLOV_PATH}/requirements/udapi.txt", + "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt", + "{DEEPPAVLOV_PATH}/requirements/ru_core_news_sm.txt" ], "typos_custom_reader": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" diff --git a/deeppavlov/core/data/simple_vocab.py b/deeppavlov/core/data/simple_vocab.py index d10162db65..66efc4bede 100644 --- a/deeppavlov/core/data/simple_vocab.py +++ b/deeppavlov/core/data/simple_vocab.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from collections import Counter, defaultdict, Iterable +from collections import Counter, defaultdict from itertools import chain from logging import getLogger -from typing import Optional, Tuple, List +from typing import Iterable, Optional, Tuple import numpy as np diff --git a/deeppavlov/dataset_iterators/basic_classification_iterator.py b/deeppavlov/dataset_iterators/basic_classification_iterator.py index 390a6ba442..dd91fb3f11 100644 --- a/deeppavlov/dataset_iterators/basic_classification_iterator.py +++ b/deeppavlov/dataset_iterators/basic_classification_iterator.py @@ -13,6 +13,7 @@ # limitations under the License. +from collections import defaultdict from logging import getLogger from typing import List @@ -40,6 +41,7 @@ class BasicClassificationDatasetIterator(DataLearningIterator): shuffle: whether to shuffle examples in batches split_seed: random seed for splitting dataset, if ``split_seed`` is None, division is based on `seed`. stratify: whether to use stratified split + shot: number of examples to sample for each class in training data. If None, all examples will remain in data. *args: arguments **kwargs: arguments @@ -52,6 +54,7 @@ def __init__(self, data: dict, field_to_split: str = None, split_fields: List[str] = None, split_proportions: List[float] = None, seed: int = None, shuffle: bool = True, split_seed: int = None, stratify: bool = None, + shot: int = None, *args, **kwargs): """ Initialize dataset using data from DatasetReader, @@ -80,6 +83,21 @@ def __init__(self, data: dict, stratify=stratify) else: raise IOError("Given field to split BUT not given names of split fields") + + if shot is not None: + train_data = self.data['train'] + self.random.shuffle(train_data) + self.random.seed(seed) + + data_dict = defaultdict(list) + for text, label in train_data: + if len(data_dict[label]) < shot: + data_dict[label].append(text) + + if min(len(x) for x in data_dict.values()) < shot: + log.warning(f"Some labels have less than {shot} examples") + + self.data['train'] = [(text, label) for label in data_dict for text in data_dict[label]] def _split_data(self, field_to_split: str = None, split_fields: List[str] = None, split_proportions: List[float] = None, split_seed: int = None, stratify: bool = None) -> bool: diff --git a/deeppavlov/models/entity_extraction/entity_linking.py b/deeppavlov/models/entity_extraction/entity_linking.py index 4d60fa1470..b91e1ea412 100644 --- a/deeppavlov/models/entity_extraction/entity_linking.py +++ b/deeppavlov/models/entity_extraction/entity_linking.py @@ -14,19 +14,19 @@ import re import sqlite3 +from collections import defaultdict from logging import getLogger from typing import List, Dict, Tuple, Union, Any -from collections import defaultdict -import pymorphy2 +import spacy from hdt import HDTDocument from nltk.corpus import stopwords from rapidfuzz import fuzz +from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.core.models.serializable import Serializable -from deeppavlov.core.commands.utils import expand_path log = getLogger(__name__) @@ -75,7 +75,6 @@ def __init__( **kwargs: """ super().__init__(save_path=None, load_path=load_path) - self.morph = pymorphy2.MorphAnalyzer() self.lemmatize = lemmatize self.entities_database_filename = entities_database_filename self.num_entities_for_bert_ranking = num_entities_for_bert_ranking @@ -86,8 +85,10 @@ def __init__( self.lang = f"@{lang}" if self.lang == "@en": self.stopwords 
= set(stopwords.words("english")) + self.nlp = spacy.load("en_core_web_sm") elif self.lang == "@ru": self.stopwords = set(stopwords.words("russian")) + self.nlp = spacy.load("ru_core_news_sm") self.use_descriptions = use_descriptions self.use_connections = use_connections self.max_paragraph_len = max_paragraph_len @@ -198,7 +199,7 @@ def link_entities( ): cand_ent_scores = [] if len(entity_substr) > 1: - entity_substr_split_lemm = [self.morph.parse(tok)[0].normal_form for tok in entity_substr_split] + entity_substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in entity_substr_split] cand_ent_init = self.find_exact_match(entity_substr, tag) if not cand_ent_init or entity_substr_split != entity_substr_split_lemm: cand_ent_init = self.find_fuzzy_match(entity_substr_split, tag) @@ -297,28 +298,23 @@ def find_exact_match(self, entity_substr, tag): entity_substr_split = entity_substr_split[1:] entities_and_ids = self.find_title(entity_substr) cand_ent_init = self.process_cand_ent(cand_ent_init, entities_and_ids, entity_substr_split, tag) - if self.lang == "@ru": - entity_substr_split_lemm = [self.morph.parse(tok)[0].normal_form for tok in entity_substr_split] - entity_substr_lemm = " ".join(entity_substr_split_lemm) - if entity_substr_lemm != entity_substr: - entities_and_ids = self.find_title(entity_substr_lemm) - if entities_and_ids: - cand_ent_init = self.process_cand_ent( - cand_ent_init, entities_and_ids, entity_substr_split_lemm, tag - ) + + entity_substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in entity_substr_split] + entity_substr_lemm = " ".join(entity_substr_split_lemm) + if entity_substr_lemm != entity_substr: + entities_and_ids = self.find_title(entity_substr_lemm) + if entities_and_ids: + cand_ent_init = self.process_cand_ent(cand_ent_init, entities_and_ids, entity_substr_split_lemm, tag) return cand_ent_init def find_fuzzy_match(self, entity_substr_split, tag): - if self.lang == "@ru": - entity_substr_split_lemm = [self.morph.parse(tok)[0].normal_form for tok in entity_substr_split] - else: - entity_substr_split_lemm = entity_substr_split + entity_substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in entity_substr_split] cand_ent_init = defaultdict(set) for word in entity_substr_split: part_entities_and_ids = self.find_title(word) cand_ent_init = self.process_cand_ent(cand_ent_init, part_entities_and_ids, entity_substr_split, tag) if self.lang == "@ru": - word_lemm = self.morph.parse(word)[0].normal_form + word_lemm = self.nlp(word)[0].lemma_ if word != word_lemm: part_entities_and_ids = self.find_title(word_lemm) cand_ent_init = self.process_cand_ent( @@ -329,11 +325,6 @@ def find_fuzzy_match(self, entity_substr_split, tag): ) return cand_ent_init - def morph_parse(self, word): - morph_parse_tok = self.morph.parse(word)[0] - normal_form = morph_parse_tok.normal_form - return normal_form - def calc_substr_score(self, cand_entity_title, entity_substr_split): label_tokens = cand_entity_title.split() cnt = 0.0 diff --git a/deeppavlov/models/kbqa/tree_to_sparql.py b/deeppavlov/models/kbqa/tree_to_sparql.py index b5ff26c44b..d406ce7368 100644 --- a/deeppavlov/models/kbqa/tree_to_sparql.py +++ b/deeppavlov/models/kbqa/tree_to_sparql.py @@ -19,7 +19,7 @@ from typing import Any, List, Tuple, Dict, Union import numpy as np -import pymorphy2 +import spacy from navec import Navec from scipy.sparse import csr_matrix from slovnet import Syntax @@ -66,11 +66,10 @@ def __init__(self, freq_dict_filename: str, candidate_nouns: int = 10, **kwargs) self.adj_set = set([word for word, freq in 
pos_freq_dict["a"]]) self.nouns = [noun[0] for noun in self.nouns_with_freq] self.matrix = self.make_sparse_matrix(self.nouns).transpose() - self.morph = pymorphy2.MorphAnalyzer() + self.nlp = spacy.load("ru_core_news_sm") def search(self, word: str): - word = self.morph.parse(word)[0] - word = word.normal_form + word = self.nlp(word)[0].lemma_ if word in self.adj_set: q_matrix = self.make_sparse_matrix([word]) scores = q_matrix * self.matrix @@ -190,6 +189,7 @@ def __init__(self, sparql_queries_filename: str, lang: str = "rus", adj_to_noun: self.begin_tokens = {"начинать", "начать"} self.end_tokens = {"завершить", "завершать", "закончить"} self.ranking_tokens = {"самый"} + self.nlp = spacy.load("ru_core_news_sm") elif self.lang == "eng": self.q_pronouns = {"what", "who", "how", "when", "where", "which"} self.how_many = "how many" @@ -199,12 +199,12 @@ def __init__(self, sparql_queries_filename: str, lang: str = "rus", adj_to_noun: self.begin_tokens = set() self.end_tokens = set() self.ranking_tokens = set() + self.nlp = spacy.load("en_core_web_sm") else: raise ValueError(f"unsupported language {lang}") self.sparql_queries_filename = expand_path(sparql_queries_filename) self.template_queries = read_json(self.sparql_queries_filename) self.adj_to_noun = adj_to_noun - self.morph = pymorphy2.MorphAnalyzer() def __call__(self, syntax_tree_batch: List[str], positions_batch: List[List[List[int]]]) -> Tuple[ @@ -274,7 +274,7 @@ def __call__(self, syntax_tree_batch: List[str], self.root_entity = True temporal_order = self.find_first_last(new_root) - new_root_nf = self.morph.parse(new_root.form)[0].normal_form + new_root_nf = self.nlp(new_root.form)[0].lemma_ if new_root_nf in self.begin_tokens or new_root_nf in self.end_tokens: temporal_order = new_root_nf ranking_tokens = self.find_ranking_tokens(new_root) @@ -288,7 +288,7 @@ def __call__(self, syntax_tree_batch: List[str], question = [] for node in tree.descendants: if node.ord in ranking_tokens or node.form.lower() in self.q_pronouns: - question.append(self.morph.parse(node.form)[0].normal_form) + question.append(self.nlp(node.form)[0].lemma_) else: question.append(node.form) question = ' '.join(question) @@ -496,9 +496,9 @@ def find_first_last(self, node: Node) -> str: for node in nodes: node_desc = defaultdict(set) for elem in node.children: - parsed_elem = self.morph.parse(elem.form.lower())[0].inflect({"masc", "sing", "nomn"}) + parsed_elem = self.nlp(elem.form.lower())[0].lemma_ if parsed_elem is not None: - node_desc[elem.deprel].add(parsed_elem.word) + node_desc[elem.deprel].add(parsed_elem) else: node_desc[elem.deprel].add(elem.form) if "amod" in node_desc.keys() and "nmod" in node_desc.keys() and \ @@ -511,7 +511,7 @@ def find_first_last(self, node: Node) -> str: def find_ranking_tokens(self, node: Node) -> list: ranking_tokens = [] for elem in node.descendants: - if self.morph.parse(elem.form)[0].normal_form in self.ranking_tokens: + if self.nlp(elem.form)[0].lemma_ in self.ranking_tokens: ranking_tokens.append(elem.ord) ranking_tokens.append(elem.parent.ord) return ranking_tokens diff --git a/deeppavlov/models/kbqa/type_define.py b/deeppavlov/models/kbqa/type_define.py index 7e9ab41be5..1ccdd9b388 100644 --- a/deeppavlov/models/kbqa/type_define.py +++ b/deeppavlov/models/kbqa/type_define.py @@ -15,7 +15,6 @@ import pickle from typing import List -import pymorphy2 import spacy from nltk.corpus import stopwords @@ -43,7 +42,6 @@ def __init__(self, lang: str, types_filename: str, types_sets_filename: str, self.types_filename = 
str(expand_path(types_filename)) self.types_sets_filename = str(expand_path(types_sets_filename)) self.num_types_to_return = num_types_to_return - self.morph = pymorphy2.MorphAnalyzer() if self.lang == "@en": self.stopwords = set(stopwords.words("english")) self.nlp = spacy.load("en_core_web_sm") @@ -102,7 +100,7 @@ def __call__(self, questions_batch: List[str], entity_substr_batch: List[List[st types_substr_tokens = types_substr.split() types_substr_tokens = [tok for tok in types_substr_tokens if tok not in self.stopwords] if self.lang == "@ru": - types_substr_tokens = [self.morph.parse(tok)[0].normal_form for tok in types_substr_tokens] + types_substr_tokens = [self.nlp(tok)[0].lemma_ for tok in types_substr_tokens] types_substr_tokens = set(types_substr_tokens) types_scores = [] for entity in self.types_dict: diff --git a/deeppavlov/models/tokenizers/ru_tokenizer.py b/deeppavlov/models/tokenizers/ru_tokenizer.py deleted file mode 100644 index e51478a079..0000000000 --- a/deeppavlov/models/tokenizers/ru_tokenizer.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from logging import getLogger -from typing import List, Generator, Any, Optional, Union, Tuple - -# from nltk.corpus import stopwords -# STOPWORDS = stopwords.words('russian') -import pymorphy2 -from nltk.tokenize.toktok import ToktokTokenizer - -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.component import Component -from deeppavlov.models.tokenizers.utils import detokenize, ngramize - -logger = getLogger(__name__) - - -@register('ru_tokenizer') -class RussianTokenizer(Component): - """Tokenize or lemmatize a list of documents for Russian language. Default models are - :class:`ToktokTokenizer` tokenizer and :mod:`pymorphy2` lemmatizer. - Return a list of tokens or lemmas for a whole document. - If is called onto ``List[str]``, performs detokenizing procedure. 
- - Args: - stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing - and ngrams creation - ngram_range: size of ngrams to create; only unigrams are returned by default - lemmas: whether to perform lemmatizing or not - lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize` - and :meth:`_lemmatize` methods - alphas_only: whether to filter out non-alpha tokens; is performed by default by :meth:`_filter` - method - - Attributes: - stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing - and ngrams creation - tokenizer: an instance of :class:`ToktokTokenizer` tokenizer class - lemmatizer: an instance of :class:`pymorphy2.MorphAnalyzer` lemmatizer class - ngram_range: size of ngrams to create; only unigrams are returned by default - lemmas: whether to perform lemmatizing or not - lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize` - and :meth:`_lemmatize` methods - alphas_only: whether to filter out non-alpha tokens; is performed by default by :meth:`_filter` - method - tok2morph: token-to-lemma cache - - """ - - def __init__(self, stopwords: Optional[List[str]] = None, ngram_range: List[int] = None, - lemmas: bool = False, lowercase: Optional[bool] = None, - alphas_only: Optional[bool] = None, **kwargs): - - if ngram_range is None: - ngram_range = [1, 1] - self.stopwords = stopwords or [] - self.tokenizer = ToktokTokenizer() - self.lemmatizer = pymorphy2.MorphAnalyzer() - self.ngram_range = tuple(ngram_range) # cast JSON array to tuple - self.lemmas = lemmas - self.lowercase = lowercase - self.alphas_only = alphas_only - self.tok2morph = {} - - def __call__(self, batch: Union[List[str], List[List[str]]]) -> \ - Union[List[List[str]], List[str]]: - """Tokenize or detokenize strings, depends on the type structure of passed arguments. - - Args: - batch: a batch of documents to perform tokenizing/lemmatizing; - or a batch of lists of tokens/lemmas to perform detokenizing - - Returns: - a batch of lists of tokens/lemmas; or a batch of detokenized strings - - Raises: - TypeError: If the first element of ``batch`` is neither ``List``, nor ``str``. - - """ - if isinstance(batch[0], str): - if self.lemmas: - return list(self._lemmatize(batch)) - else: - return list(self._tokenize(batch)) - if isinstance(batch[0], list): - return [detokenize(doc) for doc in batch] - raise TypeError( - "StreamSpacyTokenizer.__call__() is not implemented for `{}`".format(type(batch[0]))) - - def _tokenize(self, data: List[str], ngram_range: Tuple[int, int] = (1, 1), lowercase: bool = True) \ - -> Generator[List[str], Any, None]: - """Tokenize a list of documents. 
- - Args: - data: a list of documents to tokenize - ngram_range: size of ngrams to create; only unigrams are returned by default - lowercase: whether to perform lowercasing or not; is performed by default by - :meth:`_tokenize` and :meth:`_lemmatize` methods - - Yields: - list of lists of ngramized tokens or list of detokenized strings - - Returns: - None - - """ - # DEBUG - # size = len(data) - _ngram_range = self.ngram_range or ngram_range - - if self.lowercase is None: - _lowercase = lowercase - else: - _lowercase = self.lowercase - - for i, doc in enumerate(data): - # DEBUG - # logger.info("Tokenize doc {} from {}".format(i, size)) - tokens = self.tokenizer.tokenize(doc) - if _lowercase: - tokens = [t.lower() for t in tokens] - filtered = self._filter(tokens) - processed_doc = ngramize(filtered, ngram_range=_ngram_range) - yield from processed_doc - - def _lemmatize(self, data: List[str], ngram_range: Tuple[int, int] = (1, 1)) -> \ - Generator[List[str], Any, None]: - """Lemmatize a list of documents. - - Args: - data: a list of documents to tokenize - ngram_range: size of ngrams to create; only unigrams are returned by default - - Yields: - list of lists of ngramized tokens or list of detokenized strings - - Returns: - None - - """ - # DEBUG - # size = len(data) - _ngram_range = self.ngram_range or ngram_range - - tokenized_data = list(self._tokenize(data)) - - for i, doc in enumerate(tokenized_data): - # DEBUG - # logger.info("Lemmatize doc {} from {}".format(i, size)) - lemmas = [] - for token in doc: - try: - lemma = self.tok2morph[token] - except KeyError: - lemma = self.lemmatizer.parse(token)[0].normal_form - self.tok2morph[token] = lemma - lemmas.append(lemma) - filtered = self._filter(lemmas) - processed_doc = ngramize(filtered, ngram_range=_ngram_range) - yield from processed_doc - - def _filter(self, items: List[str], alphas_only: bool = True) -> List[str]: - """Filter a list of tokens/lemmas. - - Args: - items: a list of tokens/lemmas to filter - alphas_only: whether to filter out non-alpha tokens - - Returns: - a list of filtered tokens/lemmas - - """ - if self.alphas_only is None: - _alphas_only = alphas_only - else: - _alphas_only = self.alphas_only - - if _alphas_only: - filter_fn = lambda x: x.isalpha() and not x.isspace() and x not in self.stopwords - else: - filter_fn = lambda x: not x.isspace() and x not in self.stopwords - - return list(filter(filter_fn, items)) - - def set_stopwords(self, stopwords: List[str]) -> None: - """Redefine a list of stopwords. 
- - Args: - stopwords: a list of stopwords - - Returns: - None - - """ - self.stopwords = stopwords diff --git a/deeppavlov/models/tokenizers/spacy_tokenizer.py b/deeppavlov/models/tokenizers/spacy_tokenizer.py index f0d65c81a7..247c0ff938 100644 --- a/deeppavlov/models/tokenizers/spacy_tokenizer.py +++ b/deeppavlov/models/tokenizers/spacy_tokenizer.py @@ -48,8 +48,7 @@ class StreamSpacyTokenizer(Component): Args: disable: spacy pipeline elements to disable, serves a purpose of performing; if nothing - stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing - and ngrams creation + filter_stopwords: whether to ignore stopwords during tokenizing/lemmatizing and ngrams creation batch_size: a batch size for spaCy buffering ngram_range: size of ngrams to create; only unigrams are returned by default lemmas: whether to perform lemmatizing or not @@ -86,8 +85,7 @@ def __init__(self, disable: Optional[Iterable[str]] = None, filter_stopwords: bo if ngram_range is None: ngram_range = [1, 1] self.model = _try_load_spacy_model(spacy_model, disable=disable) - self.filter_stopwords = filter_stopwords - self.stopwords = spacy.lang.en.stop_words.STOP_WORDS if self.filter_stopwords else [] + self.stopwords = self.model.Defaults.stop_words if filter_stopwords else set() self.batch_size = batch_size self.ngram_range = tuple(ngram_range) # cast JSON array to tuple self.lemmas = lemmas diff --git a/deeppavlov/requirements/en_core_web_sm.txt b/deeppavlov/requirements/en_core_web_sm.txt index 6e4830cd98..a4c69643c7 100644 --- a/deeppavlov/requirements/en_core_web_sm.txt +++ b/deeppavlov/requirements/en_core_web_sm.txt @@ -1,2 +1,2 @@ -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl spacy diff --git a/deeppavlov/requirements/kenlm.txt b/deeppavlov/requirements/kenlm.txt index 9d57c24888..8bd21c6112 100644 --- a/deeppavlov/requirements/kenlm.txt +++ b/deeppavlov/requirements/kenlm.txt @@ -1 +1 @@ -pypi-kenlm==0.1.20210121 +pypi-kenlm==0.1.20220713 diff --git a/deeppavlov/requirements/pytorch.txt b/deeppavlov/requirements/pytorch.txt index 93197394a4..68e45f0c26 100644 --- a/deeppavlov/requirements/pytorch.txt +++ b/deeppavlov/requirements/pytorch.txt @@ -1 +1 @@ -torch>=1.6.0,<1.13.0 +torch>=1.6.0,<1.14.0 diff --git a/deeppavlov/requirements/ru_core_news_sm.txt b/deeppavlov/requirements/ru_core_news_sm.txt index d7e3dd11c9..f12bf5b6fb 100644 --- a/deeppavlov/requirements/ru_core_news_sm.txt +++ b/deeppavlov/requirements/ru_core_news_sm.txt @@ -1,2 +1,2 @@ -https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.3.0/ru_core_news_sm-3.3.0-py3-none-any.whl +https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl spacy diff --git a/deeppavlov/requirements/transformers.txt b/deeppavlov/requirements/transformers.txt index 65c2816393..29dddb74ee 100644 --- a/deeppavlov/requirements/transformers.txt +++ b/deeppavlov/requirements/transformers.txt @@ -1 +1 @@ -transformers>=4.13.0,<4.21.0 +transformers>=4.13.0,<4.25.0 diff --git a/docs/apiref/models/tokenizers.rst b/docs/apiref/models/tokenizers.rst index 99e735acda..3ce7353261 100644 --- a/docs/apiref/models/tokenizers.rst +++ b/docs/apiref/models/tokenizers.rst @@ -13,8 +13,4 @@ deeppavlov.models.tokenizers .. 
autoclass:: deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer - .. automethod:: __call__ - -.. autoclass:: deeppavlov.models.tokenizers.ru_tokenizer.RussianTokenizer - - .. automethod:: __call__ + .. automethod:: __call__ \ No newline at end of file diff --git a/docs/features/models/NER.ipynb b/docs/features/models/NER.ipynb index 42cffa710d..70ee221786 100644 --- a/docs/features/models/NER.ipynb +++ b/docs/features/models/NER.ipynb @@ -129,13 +129,16 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "`ner_ontonotes_bert_torch` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", "\n", "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n", - "The full list of NER models with their config names can be found in the [table](#6.-Models-list)." + "The full list of NER models with their config names can be found in the [table](#6.-Models-list).\n", + "\n", + "There are alternative ways to install the model's packages that do not require executing a separate command -- see the options in the next sections of this page. " ] }, { @@ -168,7 +171,17 @@ "source": [ "from deeppavlov import build_model\n", "\n", - "ner_model = build_model('ner_ontonotes_bert_torch', download=True)" + "ner_model = build_model('ner_ontonotes_bert_torch', download=True, install=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `download` argument defines whether it is necessary to download the files defined in the `download` section of the config: usually it provides the links to the train and test data, to the pretrained models, or to the embeddings.\n", + "\n", + "Setting the `install` argument to `True` is equivalent to executing the command line `install` command. If set to `True`, it will first install all the required packages." ] }, { @@ -568,10 +581,10 @@ "| **CARDINAL** | Numerals that do not fall under another type |" ] } - ], - "metadata": { - "accelerator": "GPU" - }, - "nbformat": 4, - "nbformat_minor": 4 -} +], +"metadata": { + "accelerator": "GPU" +}, +"nbformat": 4, +"nbformat_minor": 4 +} \ No newline at end of file diff --git a/docs/features/models/classification.ipynb b/docs/features/models/classification.ipynb index 4353b4a8b7..e0e0f1ae3b 100644 --- a/docs/features/models/classification.ipynb +++ b/docs/features/models/classification.ipynb @@ -25,7 +25,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Table of contents \n", + "# Table of contents\n", "\n", "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "2. [Get started with the model](#2.-Get-started-with-the-model)\n", @@ -35,16 +35,23 @@ "\n", " 3.2. [Predict using CLI](#3.2-Predict-using-CLI)\n", "4. [Evaluation](#4.-Evaluation)\n", - " \n", + "\n", " 4.1. [from Python](#4.1-Evaluate-from-Python)\n", - " \n", + "\n", " 4.2. [from CLI](#4.2-Evaluate-from-CLI)\n", "5. [Train the model on your data](#5.-Train-the-model-on-your-data)\n", - " \n", + "\n", " 5.1. [from Python](#5.1-Train-your-model-from-Python)\n", - " \n", + "\n", " 5.2. [from CLI](#5.2-Train-your-model-from-CLI)\n", - "6. [Models list](#6.-Models-list)" + "6. [Models list](#6.-Models-list)\n", + "7. [Simple few-shot classifiers](#7.-Simple-few-shot-classifiers)\n", + "\n", + " 7.1. [Few-shot setting](#7.1-Few-shot-setting)\n", + "\n", + " 7.2. 
[Multiple languages support](#7.2-Multiple-languages-support)\n", + "\n", + " 7.3. [Dataset and Scores](#7.3-Dataset-and-Scores)" ] }, { @@ -97,7 +104,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`insults_kaggle_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", + "`insults_kaggle_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html)\n", "\n", "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n", "The full list of NER models with their config names can be found in the [table](#6.-Models-list)." @@ -387,6 +394,60 @@ "| rusentiment_convers_bert | Ru | Sentiment | [RuSentiment](https://text-machine.cs.uml.edu/projects/rusentiment/) | 1.5 GB | F1-weighted | 0.7724 |\n", "| topics_distilbert_base_uncased | En | Topics | [DeepPavlov Topics](https://deeppavlov.ai/datasets/topics) | 6.2 GB | F1-macro | 0.9961 |\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 7. Simple few-shot classifiers\n", + "\n", + "Additionally, in the [faq](https://github.com/deeppavlov/DeepPavlov/tree/master/deeppavlov/configs/faq) section you can find a config for a fast and simple pre-BERT model, which consists of a fasttext vectorizer and a simple logistic regression classifier.\n", + "\n", + "## 7.1 Few-shot setting\n", + "\n", + "In the current setting the config can be used for few-shot classification - a task, in which only a few training examples are available for each class (usually from 5 to 10). Note that the config takes the full version of the dataset as the input and samples N examples for each class of the train data in the iterator.\n", + "\n", + "The sampling is done within the `basic_classification_iterator` component of the pipeline and the `shot` parameter defines the number of examples to be sampled. By default the `shot` parameter is set to `None` (no sampling applied).\n", + "\n", + "## 7.2 Multiple languages support\n", + "\n", + "By default `fasttext_logreg` supports classification in English, but can be modified for classification in Russian.\n", + "\n", + "In order to change `fasttext_logreg` language to Russian, change `LANGUAGE` variable in the `metadata.variables` section from `en` to `ru` and change the Spacy model by changing `SPACY_MODEL` variable from `en_core_web_sm` to `ru_core_news_sm`.\n", + "\n", + "You can do that by directly editing the config file through an editor or change it through Python (example below). N.B. `read_json` and `find_config` combination is intentionally used instead of `parse_config` to read config in the example, because `parse_config` will replace all `LANGUAGE` and `SPACY_MODEL` usages in the config with the default values from `metadata.variables`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov import build_model\n", + "from deeppavlov.core.common.file import read_json, find_config\n", + "\n", + "model_config = read_json(find_config('fasttext_logreg'))\n", + "model_config['metadata']['variables']['LANGUAGE'] = 'ru'\n", + "model_config['metadata']['variables']['SPACY_MODEL'] = 'ru_core_news_sm'\n", + "model = build_model(model_config, install=True, download=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7.3 Dataset and Scores\n", + "\n", + "To demonstrate the performance of the model in two languages, we use the English and Russian subsets of [the MASSIVE dataset](https://github.com/alexa/massive).\n", + "\n", + "MASSIVE is a parallel dataset of utterrances in 52 languages with annotations for the Natural Language Understanding tasks of intent prediction and slot annotation. We only employ the intent classification data. You can see the results of the given configs in 5-shot classification setting in the table below.\n", + "\n", + "| Config name | Language | Train accuracy | Validation accuracy | Test accuracy |\n", + "| :--- | --- | --- | --- | ---: |\n", + "| fasttext_logreg | en | 0.9632 | 0.5239 | 0.5155 |\n", + "| fasttext_logreg | ru | 0.9231 | 0.4565 | 0.4304 |" + ] } ], "metadata": { @@ -394,4 +455,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/features/models/faq.rst b/docs/features/models/faq.rst deleted file mode 100644 index d42e5535c5..0000000000 --- a/docs/features/models/faq.rst +++ /dev/null @@ -1,158 +0,0 @@ -================================ -Frequently Asked Questions (FAQ) -================================ - -This is implementation of FAQ model which helps to classify incoming questions. - -:: - - :: What is your open hours? - >> 8am - 8pm - - -Quick Start -=========== - -Building --------- - -.. code:: python - - from deeppavlov import build_model, configs - - faq = build_model(configs.faq.tfidf_logreg_en_faq, download=True) - - -Inference ---------- - -.. code:: python - - result = faq(['What is your open hours?']) - -If some required packages are missing, install all the requirements by running in command line: - -.. code:: bash - - python -m deeppavlov install fasttext_avg_autofaq - python -m deeppavlov install fasttext_tfidf_autofaq - python -m deeppavlov install tfidf_autofaq - python -m deeppavlov install tfidf_logreg_autofaq - python -m deeppavlov install tfidf_logreg_en_faq - -Config -====== - -As usual, config consists of: - -- **dataset_reader** -- **dataset_iterator** -- **chainer** - -You can use you own dataset_reader, dataset_iterator for speficic data. -Let's consider chainer in more details. - -Config Structure ----------------- - -- **chainer** - pipeline manager - - - **in** - pipeline input data: question - - **out** - pipeline output data: answer + score[0,1] - -- **preprocessing** - it can be tokenization, lemmatization, stemming and etc. In example tfidf_logreg_autofaq.json there are tokenization and lemmatization. - -- **vectorizer** - vectorizer of incoming sentences. It can be word embeddings vectorizer, bag of words vectorizer, tf-idf vectorizer and etc. Th output is vectorized sentences (numeric vectors). - -- **classifier** - This is faq model that classify incoming question. Model receive vectorized train sentences and vectorized question for inference. Output is classified answer from train dataset. 
- - -Vectorizers ------------ - -Vectorizers produce numeric vectors of input sentences - -- **sentence2vector_v2w_tfidf** - Sentence vectorizer: weighted sum of word embeddings from sentence - - - **in** - input data: question - - **fit_on** - train data: [token lemmas of question, word embeddings] - - **save_path** - path where to save model - - **load_path** - path where to load model - - **out** - output data: vectorized sentence - - -Classifiers for FAQ -------------------- - -This is models that classify incoming question and find corresponding answer - -- **cos_sim_classifier** - Classifier based on cosine similarity - - - **in** - input data: question - - **fit_on** - train data: [vectorized sentences, answers] - - **save_path** - path where to save model - - **load_path** - path where to load model - - **out** - output data: [answer, score] - - -- **logreg_classifier** - Logistic Regression classifier, that output most probable answer with score - - - **in** - input data: question - - **fit_on** - train data: [vectorized sentences, answers] - - **c** - regularization parameter for logistic regression model - - **penalty** - regularization type: 'l1' or 'l2' - - **save_path** - path where to save model - - **load_path** - path where to load model - - **out** - output data: [answer, score] - - - -Running FAQ -=========== - - -Training --------- - -To train your own model by running command `train`, for example: - -.. code:: bash - - python -m deeppavlov train tfidf_autofaq - - -Interacting ------------ - -After model has trained, you can use it for inference: model will return answers from FAQ data that used for train. - -.. code:: bash - - python -m deeppavlov interact tfidf_autofaq -d - - -Inference example: - -:: - - :: What is your open hours? - >> 8am - 8pm - - -Available Data and Pretrained Models -==================================== - -As an example you can try pretrained models on FAQ dataset in English: MIPT FAQ for entrants - https://mipt.ru/english/edu/faqs/ - -- `tfidf_logreg_classifier_en_mipt_faq.pkl `__ - pre-trained logistic regression classifier for classifying input question (vectorized by tfidf) -- `tfidf_vectorizer_en_mipt_faq.pkl `__ - pre-trained model for TF-IDF vectorizer based on MIPT FAQ - -Example config - :config:`tfidf_logreg_en_faq.json ` - - -Also you can use pretrained model on Russan FAQ dataset from school-site: https://gobu.ftl.name/page/1279/ - -- `tfidf_cos_sim_classifier.pkl `__ - pre-trained cosine similarity classifier for classifying input question (vectorized by tfidf) -- `tfidf_logreg_classifier_v2.pkl `__ - pre-trained logistic regression classifier for classifying input question (vectorized by tfidf) -- `fasttext_cos_classifier.pkl `__ - pre-trained cosine similarity classifier for classifying input question (vectorized by word embeddings) -- `tfidf_vectorizer_ruwiki_v2.pkl `__ - pre-trained model for TF-IDF vectorizer based on Russian Wikipedia diff --git a/docs/features/overview.rst b/docs/features/overview.rst index 57efca893c..331eb4ac4a 100644 --- a/docs/features/overview.rst +++ b/docs/features/overview.rst @@ -206,12 +206,6 @@ In the case when answer is not necessary present in given context we have :confi model. This model outputs empty string in case if there is no answer in context. 
-Frequently Asked Questions (FAQ) model :doc:`[docs] ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Set of pipelines for FAQ task: classifying incoming question into set of known questions and return prepared answer. -You can build different pipelines based on: tf-idf, weighted fasttext, cosine similarity, logistic regression. - ODQA :doc:`[docs] ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/index.rst b/docs/index.rst index 4acc86479c..391bbc58ab 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -41,7 +41,6 @@ Welcome to DeepPavlov's documentation! Relation Extraction SuperGLUE Submission Open-Domain Question Answering - Frequently Asked Questions Answering .. toctree:: diff --git a/docs/intro/configuration.rst b/docs/intro/configuration.rst index 88cca82df8..b028338c7a 100644 --- a/docs/intro/configuration.rst +++ b/docs/intro/configuration.rst @@ -362,10 +362,6 @@ string). ``nltk.tokenize.moses.MosesDetokenizer``, ``nltk.tokenize.moses.MosesTokenizer``. - - :class:`~deeppavlov.models.tokenizers.ru_tokenizer.RussianTokenizer` - (registered as ``ru_tokenizer``) tokenizes or lemmatizes Russian texts - using ``nltk.tokenize.toktok.ToktokTokenizer``. - - :class:`~deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer` (registered as ``stream_spacy_tokenizer``) tokenizes or lemmatizes texts with spacy ``en_core_web_sm`` models by default. diff --git a/docs/intro/installation.rst b/docs/intro/installation.rst index 335d7cd6c8..e79698e626 100644 --- a/docs/intro/installation.rst +++ b/docs/intro/installation.rst @@ -2,7 +2,7 @@ Installation ============ -We support ``Linux`` platform, ``Python 3.6``, ``3.7``, ``3.8`` and ``3.9``. +We support ``Linux`` platform, ``Python 3.6``, ``3.7``, ``3.8``, ``3.9`` and ``3.10``. .. note:: diff --git a/docs/intro/quick_start.rst b/docs/intro/quick_start.rst index 54e2318350..402381ff55 100644 --- a/docs/intro/quick_start.rst +++ b/docs/intro/quick_start.rst @@ -2,7 +2,7 @@ QuickStart ------------ First, follow instructions on :doc:`Installation page ` -to install ``deeppavlov`` package for Python 3.6/3.7/3.8/3.9. +to install ``deeppavlov`` package for Python 3.6/3.7/3.8/3.9/3.10. DeepPavlov contains a bunch of great pre-trained NLP models. Each model is determined by its config file. 
List of models is available on diff --git a/requirements.txt b/requirements.txt index 92488368eb..6707575de1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,15 @@ aio-pika>=3.2.2,<6.9.0 fastapi>=0.47.0,<0.78.0 -filelock>=3.0.0,<3.8.0 -nltk>=3.2.5,<3.8.0 +filelock>=3.0.0,<3.10.0 +nltk>=3.2.5,<3.10.0 numpy<1.24 overrides==4.1.2 -pandas>=1.0.0,<1.5.0 +pandas>=1.0.0,<1.6.0 prometheus-client>=0.13.0,<0.15.0 pydantic -pymorphy2==0.9.1 pybind11==2.2.4 requests>=2.19.0,<3.0.0 scikit-learn>=0.24,<1.1.0 -scipy<1.9.0 +scipy<1.10.0 tqdm>=4.42.0,<4.65.0 uvicorn>=0.13.0,<0.19.0 diff --git a/setup.py b/setup.py index 282181e33c..3a6f642e79 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,8 @@ def readme(): 'pexpect' ], 'docs': [ - 'sphinx==3.5.4', + 'sphinx==3.5.4;python_version<"3.10"', + 'sphinx==4.5.0;python_version>="3.10"', 'sphinx_rtd_theme==0.5.2', 'nbsphinx==0.8.4', 'ipykernel==5.5.4', diff --git a/tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json b/tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json index 18348d9389..3b8e5bd774 100644 --- a/tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json +++ b/tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json @@ -33,8 +33,11 @@ "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz", "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz", "tokenizer": { - "class_name": "ru_tokenizer", + "class_name": "stream_spacy_tokenizer", + "spacy_model": "ru_core_news_sm", "lemmas": true, + "lowercase": true, + "filter_stopwords": true, "ngram_range": [ 1, 2 diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index bd4f73ef41..654984c6b8 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -2,7 +2,6 @@ import json import logging import os -import pickle import shutil import signal import socket @@ -21,7 +20,7 @@ import deeppavlov from deeppavlov import build_model -from deeppavlov.core.commands.utils import parse_config +from deeppavlov.core.commands.utils import parse_config, parse_value_with_config from deeppavlov.core.common.aliases import ALIASES from deeppavlov.core.data.utils import get_all_elems_from_json from deeppavlov.download import deep_download @@ -88,11 +87,7 @@ ] }, "faq": { - ("faq/tfidf_logreg_en_faq.json", "faq_tfidf_logreg_en", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], - ("faq/tfidf_autofaq.json", "faq_tfidf_cos", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], - ("faq/tfidf_logreg_autofaq.json", "faq_tfidf_logreg", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], - ("faq/fasttext_avg_autofaq.json", "faq_fasttext_avg", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], - ("faq/fasttext_tfidf_autofaq.json", "faq_fasttext_tfidf", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK] + ("faq/fasttext_logreg.json", "fasttext_logreg", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], # TODO: add ru test }, "spelling_correction": { ("spelling_correction/brillmoore_wikitypos_en.json", "error_model", ALL_MODES): @@ -256,7 +251,7 @@ ("kbqa/kbqa_cq_ru.json", "kbqa", ('IP',)): [ ("Кто такой Оксимирон?", ("российский рэп-исполнитель",)), - ("Чем питаются коалы?", ("Лист",)), + ("Кто написал «Евгений Онегин»?", ("Александр Сергеевич Пушкин",)), ("абв", ("Not Found",)) ] }, @@ -266,8 +261,7 @@ "doc_retrieval": { ("doc_retrieval/en_ranker_tfidf_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("doc_retrieval/ru_ranker_tfidf_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("doc_retrieval/en_ranker_pop_wiki_test.json", "doc_retrieval", ('TI',)): [ - 
ONE_ARGUMENT_INFER_CHECK] + ("doc_retrieval/en_ranker_pop_wiki_test.json", "doc_retrieval", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] }, "squad": { ("squad/squad_ru_bert.json", "squad_ru_bert", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK], @@ -567,7 +561,7 @@ def test_consecutive_training_and_inferring(self, model, conf_file, model_dir, m def test_crossvalidation(): model_dir = 'faq' - conf_file = 'cv/cv_tfidf_autofaq.json' + conf_file = 'faq/fasttext_logreg.json' download_config(conf_file) @@ -589,39 +583,17 @@ def test_crossvalidation(): shutil.rmtree(str(download_path), ignore_errors=True) -def test_param_search(): - model_dir = 'faq' - conf_file = 'paramsearch/tfidf_logreg_autofaq_psearch.json' - - download_config(conf_file) - - c = test_configs_path / conf_file - model_path = download_path / model_dir - - install_config(c) - deep_download(c) - - shutil.rmtree(str(model_path), ignore_errors=True) - - logfile = io.BytesIO(b'') - p = pexpect.popen_spawn.PopenSpawn(sys.executable + f" -m deeppavlov.paramsearch {c} --folds 2", - timeout=None, logfile=logfile) - p.readlines() - if p.wait() != 0: - raise RuntimeError('Training process of {} returned non-zero exit code: \n{}' - .format(model_dir, logfile.getvalue().decode())) - - shutil.rmtree(str(download_path), ignore_errors=True) - - def test_hashes_existence(): all_configs = list(src_dir.glob('**/*.json')) + list(test_src_dir.glob('**/*.json')) url_root = 'http://files.deeppavlov.ai/' downloads_urls = set() for config in all_configs: config = json.loads(config.read_text(encoding='utf-8')) - downloads_urls |= {d if isinstance(d, str) else d['url'] for d in - config.get('metadata', {}).get('download', [])} + # TODO: replace with get downloads from config + # TODO: download only headers + # TODO: make requests in async mode + config_urls = {d if isinstance(d, str) else d['url'] for d in config.get('metadata', {}).get('download', [])} + downloads_urls |= {parse_value_with_config(url, config) for url in config_urls} downloads_urls = [url + '.md5' for url in downloads_urls if url.startswith(url_root)] messages = [] diff --git a/utils/Docker/docker-compose.yml b/utils/Docker/docker-compose.yml index a8ba5ba925..b4680dc009 100644 --- a/utils/Docker/docker-compose.yml +++ b/utils/Docker/docker-compose.yml @@ -19,7 +19,7 @@ services: dockerfile: utils/Docker/Dockerfile args: - EPOCH=$EPOCH - - PYTHON_VERSION=3.7.11 + - PYTHON_VERSION=3.7.16 - BASE_IMAGE=nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 user: '${UID}:${GID}' environment: @@ -32,7 +32,7 @@ services: dockerfile: utils/Docker/Dockerfile args: - EPOCH=$EPOCH - - PYTHON_VERSION=3.8.12 + - PYTHON_VERSION=3.8.16 - BASE_IMAGE=nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 user: '${UID}:${GID}' environment: @@ -45,10 +45,23 @@ services: dockerfile: utils/Docker/Dockerfile args: - EPOCH=$EPOCH - - PYTHON_VERSION=3.9.10 + - PYTHON_VERSION=3.9.16 - BASE_IMAGE=nvidia/cuda:11.5.1-cudnn8-runtime-ubuntu20.04 user: '${UID}:${GID}' environment: - CUDA_VISIBLE_DEVICES=$TEST_GPU_1 - PYTEST_ARGS=$PYTEST_ARGS - DP_PYTEST_NO_CACHE=True + py310: + build: + context: ../../ + dockerfile: utils/Docker/Dockerfile + args: + - EPOCH=$EPOCH + - PYTHON_VERSION=3.10.9 + - BASE_IMAGE=nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 + user: '${UID}:${GID}' + environment: + - CUDA_VISIBLE_DEVICES=$TEST_GPU_0 + - PYTEST_ARGS=$PYTEST_ARGS + - DP_PYTEST_NO_CACHE=True
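The few-shot setting described in the notebook cells above (section 7.1) is only explained in prose there; the code cell shows just the language switch. Below is a minimal sketch of how the shot count could be set programmatically. It assumes the iterator options of `fasttext_logreg` live under the standard top-level `dataset_iterator` key and that `train_model` accepts the same `download` flag as `build_model`; both are assumptions for illustration, not part of this changeset.

.. code:: python

    # Hypothetical sketch: train fasttext_logreg in a 5-shot setting.
    from deeppavlov import train_model
    from deeppavlov.core.common.file import read_json, find_config

    model_config = read_json(find_config('fasttext_logreg'))
    # Assumed key names: sample 5 training examples per class in the iterator.
    model_config['dataset_iterator']['shot'] = 5
    model = train_model(model_config, download=True)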
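The accuracy figures in the notebook's section 7.3 table could in principle be recomputed with DeepPavlov's `evaluate_model` helper. The sketch below is only illustrative: it assumes the `fasttext_logreg` config already points its dataset reader at the MASSIVE subset used in the notebook; if it does not, the reader section would have to be adjusted first.

.. code:: python

    # Hypothetical sketch: score the Russian variant of fasttext_logreg on its splits.
    from deeppavlov import evaluate_model
    from deeppavlov.core.common.file import read_json, find_config

    model_config = read_json(find_config('fasttext_logreg'))
    model_config['metadata']['variables']['LANGUAGE'] = 'ru'
    model_config['metadata']['variables']['SPACY_MODEL'] = 'ru_core_news_sm'
    scores = evaluate_model(model_config, download=True)  # metric values per data split
    print(scores)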