diff --git a/russian_g2p/Accentor.py b/russian_g2p/Accentor.py
index 6d998c3..15ca6d6 100644
--- a/russian_g2p/Accentor.py
+++ b/russian_g2p/Accentor.py
@@ -11,9 +11,11 @@ import dawg
 import logging
 
+from russian_g2p.ner_accentuation.NerAccentor import NerAccentor
+
 
 class Accentor:
-    def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki=True):
+    def __init__(self, mode='one', debug='no', exception_for_unknown=False):
         if debug == 'no':
             logging.basicConfig()
         else:
@@ -26,7 +28,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
             'я'}
         self.__russian_vowels = {'а', 'о', 'у', 'э', 'ы', 'и', 'я', 'ё', 'ю', 'е'}
         self.exception_for_unknown = exception_for_unknown
-        self.use_wiki = use_wiki
+        #self.use_wiki = use_wiki
         self.__homonyms = None
         self.__simple_words_dawg = None
         self.__function_words = None
@@ -35,6 +37,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
         self.__bad_words = []
         self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
         self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)
+        self.__ner_acc = NerAccentor()
         assert mode in ('one', 'many'), 'Set either "one" or "many" variant mode!'
         assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'
         homograph_dictionary_name = os.path.join(os.path.dirname(__file__), 'data', 'homographs.json')
@@ -93,174 +96,6 @@ def __del__(self):
         del self.__bad_words
         del self.__function_words
 
-    def get_correct_omograph_wiki(self, root_text, cur_word, morphotag='X'):
-        '''
-        Разбор омографии.
-        Использование морфологической информации о
-        слове для их разграничения.
-        '''
-        langs = root_text.split('<hr />')
-        #print('hello?')
-        root = None
-        for lang in langs:
-            #print(lang)
-            head_pos = lang.find('<span class="mw-headline" id="Russian">Russian')
-            if head_pos != -1:
-                root = lxml.html.document_fromstring(lang[head_pos:])
-        if root == None:
-            #print(':^(')
-            return []
-        good_headers = []
-        shallow_vars = set()
-        results = set()
-        for header in root.findall('.//*[@class="mw-headline"]'):
-            #print(cur_word, morphotag)
-            if header.text_content() in ['Noun', 'Verb', 'Adjective', 'Adverb', 'Conjunction', 'Determiner', 'Interjection',
-                                         'Morpheme', 'Numeral', 'Particle', 'Predicative', 'Preposition', 'Pronoun']:
-                good_headers.append(header.text_content())
-                acc_word = header.getparent().getnext()
-                while acc_word.tag != 'p':
-                    acc_word = acc_word.getnext()
-                #print(acc_word)
-                result = []
-                hyphen = 0
-                for part in acc_word.find_class('Cyrl headword'):
-                    result += [part.text_content()]
-                    if part.text_content().find('-') != -1:
-                        hyphen = 1
-                #print(result)
-                if (hyphen == 1) or (len(result) == 1):
-                    result = ''.join(result)
-                else:
-                    continue
-                if result.replace('ё', 'е́').find('́') != -1:
-                    shallow_vars.add(result)
-                if header.text_content()[0] == morphotag[0]:
-                    #print('The tags are equal')
-                    if header.text_content()[0] == 'N':
-                        gramm_info = acc_word.getnext()
-                        if gramm_info.text_content().find('of') != -1:
-                            for variant in gramm_info.find_class('form-of-definition'):
-                                info = variant.findall('a')
-                                #print(variant.text_content())
-                                try:
-                                    if info[0].text_content()[0] == 'p':
-                                        case = 'l'
-                                    else:
-                                        case = info[0].text_content()[0]
-                                    #print(case)
-                                    number = info[1].text_content()[0]
-                                    #print(number + case, morphotag)
-                                    if case == morphotag[morphotag.find('Case=') + 5].lower():
-                                        results.add(result)
-                                except IndexError:
-                                    continue
-                        else:
-                            if morphotag[morphotag.find('Case=') + 5].lower() == 'n':
-                                results.add(result)
-                    elif header.text_content()[0] == 'V':
-                        gramm_info = acc_word.getnext()
-                        if morphotag.find('Mood=Inf') != -1:
-                            results.add(result)
-                            #print('Wut', morphotag, results)
-                        for variant in gramm_info.find_class('form-of-definition'):
-                            #print(variant.text_content())
-                            t = 0
-                            if (variant.text_content().find('indicative') != -1) and (morphotag.find('Mood=Ind') != -1):
-                                if ((variant.text_content().find('future') != -1) or (variant.text_content().find('present') != -1)) and (morphotag.find('Tense=Notpast') != -1):
-                                    #print('I should be here')
-                                    results.add(result)
-                                    #print(1, results)
-                                elif (variant.text_content().find('past') != -1) and (morphotag.find('Tense=Past') != -1):
-                                    results.add(result)
-                                    #print(2, results)
-                            elif (variant.text_content().find('imperative') != -1) and (morphotag.find('Mood=Imp') != -1):
-                                results.add(result)
-                            else:
-                                results.add(result)
-                    elif (header.text_content()[0] == 'D') and (morphotag.find('PRON') != -1):
-                        acc_word = header.getparent().getnext()
-                        result = ''
-                        for part in acc_word.find_class('Cyrl headword'):
-                            result += part.text_content()
-                        results.add(result)
-                elif (header.text_content().lower().find(morphotag.split()[0].lower()) != -1):
-                    acc_word = header.getparent().getnext()
-                    result = ''
-                    for part in acc_word.find_class('Cyrl headword'):
-                        result += part.text_content()
-                    results.add(result)
-        #print(shallow_vars)
-        if len(list(shallow_vars)) == 1:
-            if list(shallow_vars)[0].replace('ё', 'е+').replace('́', '') == cur_word:
-                return [list(shallow_vars)[0].replace('ё', 'ё+').replace('́', '+').replace('̀', '+')]
-        #print(results)
-        if len(list(results)) != 1:
-            return []
-        best_results = [variant.replace('́', '+') for variant in results]
-        return list(best_results)
-
-    def get_simple_form_wiki(self, root_text, form):
-        '''
-        Непосредственное нахождение релевантной формы
-        и ударение без морфологической информации.
-        '''
-        root = lxml.html.document_fromstring(root_text)
-        rel_forms = set()
-        for header in root.findall('.//*[@class="Cyrl headword"][@lang="ru"]'):
-            header_text = header.text_content().replace('ё', 'е́')
-            header_text_best = header.text_content().replace('ё', 'ё+').replace('́', '+')
-            if header_text.replace('́', '') == form:
-                if header_text.find('́') != -1:
-                    rel_forms.add(header_text_best)
-        for mention in root.findall('.//i[@class="Cyrl mention"][@lang="ru"]'):
-            mention_text = mention.text_content().replace('ё', 'е́')
-            mention_text_best = mention.text_content().replace('ё', 'ё+').replace('́', '+')
-            if mention_text.replace('́', '') == form:
-                if mention_text.replace('ё', 'е́').find('́') != -1:
-                    rel_forms.add(mention_text_best)
-        for mention in root.findall('.//b[@class="Cyrl"][@lang="ru"]'):
-            mention_text = mention.text_content().replace('ё', 'е́')
-            mention_text_best = mention.text_content().replace('ё', 'ё+').replace('́', '+')
-            if mention_text.replace('́', '') == form:
-                if mention_text.replace('ё', 'е́').find('́') != -1:
-                    rel_forms.add(mention_text_best)
-                elif mention_text.find('(') != -1:
-                    if mention_text.replace('́', '').find(form) != -1:
-                        if mention_text.find('́') != -1:
-                            rel_forms.add(mention_text_best[mention_text.replace('́', '').find(form):])
-            elif re.sub(r'[\(\)́]', '', mention_text) == form:
-                rel_forms.add(re.sub(r'[\(\)]', '', mention_text_best))
-        for target in root.xpath('.//span[@class="Cyrl"][@lang="ru"]'):
-            one_form = target.text_content()
-            if one_form.replace('ё', 'е́').replace('́', '') == form:
-                if one_form.replace('ё', 'е́').find('́') != -1:
-                    rel_forms.add(one_form.replace('ё', 'ё́').replace('́', '+'))
-        results = list(rel_forms)
-        if len(results) == 2:
-            if results[0].replace('ё', 'е') == results[1].replace('ё', 'е'):
-                rel_forms = set()
-                for var in results:
-                    if var.find('ё') != -1:
-                        rel_forms.add(var)
-        return list(rel_forms)
-
-    def load_wiki_page(self, cur_form):
-        if not self.use_wiki:
-            if self.exception_for_unknown:
-                raise ValueError(f'Word `{cur_form}` is unknown!')
-            return
-        query = urllib.parse.urlencode({'title': cur_form})
-        try:
-            http_exception_type = urllib.error.HTTPError
-        except:
-            http_exception_type = urllib.request.HTTPError
-        try:
-            with urllib.request.urlopen(f'https://en.wiktionary.org/w/index.php?{query}&printable=yes') as f:
-                root_text = f.read().decode('utf-8')
-                return root_text
-        except http_exception_type:
-            return
 
     def do_accents(self, source_phrase_and_morphotags: list) -> list:
         self.logger.debug('Checking the source phrase...')
@@ -484,31 +319,29 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
                         accented_wordforms += [self.__homonyms[cur_word][morpho_variants[best_ind]]]
                         accented_wordforms_many.append([self.__homonyms[cur_word][morpho_variants[best_ind]]])
                     else:
-                        root_text = self.load_wiki_page(cur_word)
-                        if root_text != None:
-                            #print('am I even here?')
-                            cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
-                            if len(cur_accented_wordforms) == 1:
-                                accented_wordforms += [cur_accented_wordforms[0]]
-                                accented_wordforms_many.append([cur_accented_wordforms[0]])
-                                self.__new_homonyms[cur_word] = {morphotags_list[0]: cur_accented_wordforms[0]}
-                            elif len(cur_accented_wordforms) > 1:
-                                accented_wordforms += [cur_word]
-                                accented_wordforms_many.append([cur_accented_wordforms])
-                                warn = 'many'
-                            else:
-                                accented_wordforms += [cur_word]
-                                accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
-                                warn = 'many'
+                        #print('am I even here?')
+                        cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])  # sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
+                        if len(cur_accented_wordforms) == 1:
+                            accented_wordforms += [cur_accented_wordforms[0]]
+                            accented_wordforms_many.append([cur_accented_wordforms[0]])
+                            self.__new_homonyms[cur_word] = {morphotags_list[0]: cur_accented_wordforms[0]}
+                        elif len(cur_accented_wordforms) > 1:
+                            accented_wordforms += [cur_word]
+                            accented_wordforms_many.append([cur_accented_wordforms])
+                            warn = 'many'
                         else:
                             accented_wordforms += [cur_word]
-                            accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
+                            accented_wordforms_many.append(cur_accented_wordforms)  # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]])
                             warn = 'many'
+
                 else:
-                    self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse wictionary page...')
-                    root_text = self.load_wiki_page(cur_word)
-                    if root_text != None:
-                        cur_accented_wordforms = sorted(self.get_simple_form_wiki(root_text, cur_word))
+                    #self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse wictionary page...')
+                    if morphotags_list is None:
+                        err_msg = f'Word `{cur_word}` has no morphotags; specify them and try again'
+                        raise ValueError(err_msg)
+                    else:
+                        cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])  # sorted(self.get_simple_form_wiki(root_text, cur_word))
+                        #print(cur_accented_wordforms)
                         if len(cur_accented_wordforms) == 1:
                             accented_wordforms += [cur_accented_wordforms[0]]
                             accented_wordforms_many.append([cur_accented_wordforms[0]])
@@ -518,20 +351,16 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
                             accented_wordforms_many.append([cur_word])
                             warn = 'no'
                         else:
-                            cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
-                            if len(cur_accented_wordforms) == 1:
-                                accented_wordforms += [cur_accented_wordforms[0]]
-                                accented_wordforms_many.append([cur_accented_wordforms[0]])
-                                self.__new_homonyms[cur_word] = {morphotags_list[0]: cur_accented_wordforms[0]}
-                            else:
-                                accented_wordforms += [cur_word]
-                                accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
-                                warn = 'many'
+                            #cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])  # sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
+                            #if len(cur_accented_wordforms) == 1:
+                            #    accented_wordforms += [cur_accented_wordforms[0]]
+                            #    accented_wordforms_many.append([cur_accented_wordforms[0]])
+                            #    self.__new_homonyms[cur_word] = {morphotags_list[0]: cur_accented_wordforms[0]}
+                            #else:
+                            accented_wordforms += [cur_word]
+                            accented_wordforms_many.append(cur_accented_wordforms)  # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]])
+                            warn = 'many'
 
-                    else:
-                        accented_wordforms += [cur_word]
-                        accented_wordforms_many.append([cur_word])
-                        warn = 'no'
             if i == 0:
                 if (accented_wordforms[0].find('+') != -1) or (len(separate_tokens) == 2):
                     break
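
Note: with this change an out-of-vocabulary word is accented by the neural model rather than by a Wiktionary lookup, so each such word must be accompanied by a UD-style morphotag string ('POS Feature=Value|...'). A minimal usage sketch, with the expected output taken from test_do_accents_positive07 below:

    from russian_g2p.Accentor import Accentor

    accentor = Accentor()
    phrase = [['хракозябр', 'NOUN Animacy=Inan|Case=Gen|Gender=Fem|Number=Plur'],
              ['впулил', 'VERB Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin'],
              ['куздру', 'NOUN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing']]
    print(accentor.do_accents(phrase))
    # [['хракозябр', 'впули+л', 'куздру+']]: a word comes back unaccented
    # when no letter position clears the model's decision threshold
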
diff --git a/russian_g2p/ner_accentuation/NerAccentor.py b/russian_g2p/ner_accentuation/NerAccentor.py
new file mode 100644
index 0000000..a73e37d
--- /dev/null
+++ b/russian_g2p/ner_accentuation/NerAccentor.py
@@ -0,0 +1,30 @@
+import numpy as np
+from russian_g2p.ner_accentuation.ner_utils import load_place_stress_model, create_index_of_letters, create_morph_vector, create_index_of_con_vow
+
+
+class NerAccentor:
+    def __init__(self):
+        self._place_stress_model = load_place_stress_model()
+
+    def define_stress(self, word, morph_inf):
+        all_word_vectors = [create_index_of_letters(word), create_morph_vector(morph_inf), create_index_of_con_vow(word)]
+        # binarise the per-letter sigmoid scores; the model answers with a nested list of shape (1, word_length, 1)
+        stress_vector_two_list = np.asarray(self._place_stress_model.predict(all_word_vectors, verbose=1) >= 0.34,
+                                            dtype=np.int32)
+        stress_vector = stress_vector_two_list.reshape(stress_vector_two_list.shape[0], stress_vector_two_list.shape[1])[0]
+        stress_index = []
+        for i in range(len(stress_vector)):
+            if stress_vector[i] == 1:
+                stress_index.append(i)
+
+        word_with_stress = [letter for letter in word]
+        words_return = []
+        if len(stress_index) > 0:
+            # one output variant per predicted position: insert '+' after the stressed letter
+            for index in stress_index:
+                word_with_stress.insert(index + 1, '+')
+                words_return.append(''.join(word_with_stress))
+                word_with_stress.pop(index + 1)
+        else:
+            # no position cleared the threshold: return the word unchanged
+            words_return.append(''.join(word_with_stress))
+
+        return words_return
diff --git a/russian_g2p/ner_accentuation/Placement_of_stress_best_model.h5 b/russian_g2p/ner_accentuation/Placement_of_stress_best_model.h5
new file mode 100644
index 0000000..5080874
Binary files /dev/null and b/russian_g2p/ner_accentuation/Placement_of_stress_best_model.h5 differ
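
Note: define_stress yields one variant per letter position whose sigmoid score clears the 0.34 threshold, inserting '+' after the predicted letter. A sketch of a direct call (the actual output depends on the trained weights shipped in this diff; 'куздру+' is the answer the test suite expects):

    from russian_g2p.ner_accentuation.NerAccentor import NerAccentor

    ner_acc = NerAccentor()  # builds the network and loads the .h5 weights once
    print(ner_acc.define_stress('куздру', 'NOUN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing'))
    # e.g. ['куздру+']: a single predicted position yields a single variant
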
diff --git a/russian_g2p/ner_accentuation/ner_utils.py b/russian_g2p/ner_accentuation/ner_utils.py
new file mode 100644
index 0000000..4c30d0d
--- /dev/null
+++ b/russian_g2p/ner_accentuation/ner_utils.py
@@ -0,0 +1,183 @@
+import os
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.layers import Embedding, Input, Dense, GRU, TimeDistributed
+from tensorflow_addons.losses import SigmoidFocalCrossEntropy
+
+
+alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
+char_to_int = dict((c, i) for i, c in enumerate(alphabet, start=1))  # map each letter of the alphabet to an integer (0 is reserved for padding)
+
+pos_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'SCONJ', 'VERB', 'X']
+pos = 'ADJ ADP ADV AUX CCONJ CONJ DET INTJ NOUN NUM PART PRON PROPN SCONJ VERB X'  # the list of part-of-speech tags
+pos_to_int = dict((pos.split(' ')[i], i) for i in range(len(pos.split(' '))))  # 16 tags
+
+features_list = [
+'Abbr=Yes',
+'Gender=Masc', 'Gender=Fem', 'Gender=Neut',
+'Animacy=Anim', 'Animacy=Inan',
+'Number=Sing', 'Number=Plur', 'Number=Ptan', 'Number=Coll',
+'Case=Nom', 'Case=Gen', 'Case=Par', 'Case=Dat', 'Case=Acc', 'Case=Loc', 'Case=Ins', 'Case=Voc',
+'Degree=Pos', 'Degree=Cmp', 'Degree=Sup',
+'VerbForm=Conv', 'VerbForm=Fin', 'VerbForm=Inf', 'VerbForm=Part', 'VerbForm=PartRes', 'VerbForm=Trans',
+'Mood=Ind', 'Mood=Imp', 'Mood=Cnd',
+'Tense=Past', 'Tense=Pres', 'Tense=Fut',
+'Aspect=Imp', 'Aspect=Perf',
+'Voice=Act', 'Voice=Pass', 'Voice=Mid',
+'Person=1', 'Person=2', 'Person=3',
+'Variant=Full', 'Variant=Brev',
+]
+features = " ".join(features_list)  # the list of morphological features
+features_to_int = dict((features.split(' ')[i], i) for i in range(len(features.split(' '))))  # 43 features
+
+vowels = 'аеёиоуыэюя'
+consonants = 'бвгджзйклмнпрстфхцчшщъь'
+
+
+class MaskCalculator(tf.keras.layers.Layer):
+    def __init__(self, output_dim, **kwargs):
+        self.output_dim = output_dim
+        super(MaskCalculator, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        super(MaskCalculator, self).build(input_shape)
+
+    def call(self, inputs, **kwargs):
+        # expand a 0/1 sequence into a (batch, time, output_dim) mask:
+        # positions with a value greater than 0 keep their features, the rest are zeroed
+        return tf.keras.backend.permute_dimensions(
+            x=tf.keras.backend.repeat(
+                x=tf.keras.backend.cast(
+                    x=tf.keras.backend.greater(
+                        x=inputs,
+                        y=0
+                    ),
+                    dtype='float32'
+                ),
+                n=self.output_dim
+            ),
+            pattern=(0, 2, 1)
+        )
+
+    def compute_output_shape(self, input_shape):
+        assert len(input_shape) == 1
+        shape = list(input_shape)
+        shape.append(self.output_dim)
+        return tuple(shape)
+
+
+def create_place_stress_model():
+    # (I) Model architecture
+    input_words = Input(shape=(None,), name='InputWords', dtype='int32')
+    embedding = Embedding(input_dim=len(alphabet) + 1, output_dim=256, mask_zero=True,
+                          name='EmbeddingMaskForWords')  # step 1: embed the letter-index sequences with zero-padding masked out
+    output_mask_words = embedding(input_words)
+
+    input_morph_inf = Input(shape=(59,), name='InputMorphInf',
+                            dtype='float32')  # step 2: the morphological-information vector is processed by a Dense layer
+    dense_morph_inf = Dense(units=256, name='DenseMorphInf')(
+        input_morph_inf)  # shapes are specified without the mini-batch dimension
+
+    gru = GRU(units=256, return_sequences=True, name='RecurrentLayerGRU')(
+        output_mask_words, initial_state=dense_morph_inf)  # step 3: recurrent layer, initialised from the morphology encoding
+
+    input_cons_vow = Input(shape=(None,), name='InputConsonantVowel', dtype='int32')  # step 4: mask out the consonants
+    output_mask_cons_vow = MaskCalculator(output_dim=256, trainable=False, name='OutMaskCalculator')(
+        input_cons_vow)  # hand-built, non-trainable mask layer
+    masked_sequence_output = tf.keras.layers.Multiply(name='OutMaskMultiplicator')([output_mask_cons_vow, gru])
+    masked_sequence_output = tf.keras.layers.Masking(name='OutMasking')(masked_sequence_output)
+
+    cls_layer = TimeDistributed(  # step 5: per-letter classification with a TimeDistributed layer
+        Dense(units=1, activation='sigmoid'),
+        name='ClassificationLayer')(masked_sequence_output)
+
+    place_stress_model = tf.keras.Model(
+        inputs=[input_words, input_morph_inf, input_cons_vow],
+        outputs=cls_layer,
+        name='Placement_of_stress_model')
+
+    place_stress_model.compile(loss=SigmoidFocalCrossEntropy(), optimizer='adam')  # (II) compile the model
+    # place_stress_model.summary()
+
+    return place_stress_model
+
+
+def load_place_stress_model():
+    place_stress_model = create_place_stress_model()
+    # resolve the weights relative to this package, not the current working directory
+    weights_path = os.path.join(os.path.dirname(__file__), 'Placement_of_stress_best_model.h5')
+    place_stress_model.load_weights(filepath=weights_path)
+
+    return place_stress_model
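
Note: the network consumes three parallel inputs per word: a letter-index sequence (embedded, with zero-padding masked), a 59-dimensional morphology vector (16 POS slots + 43 feature slots) that initialises the GRU state, and a 0/1 consonant/vowel sequence that zeroes the outputs over consonants; it emits one sigmoid score per letter. A shape check with random stand-in inputs (illustrative only; real inputs come from the encoder helpers below):

    import numpy as np
    from russian_g2p.ner_accentuation.ner_utils import create_place_stress_model

    model = create_place_stress_model()
    letters = np.random.randint(1, 34, size=(1, 6), dtype='int32')  # six letters of a word
    morph = np.random.rand(1, 59).astype('float32')                 # POS + morphological features
    con_vow = np.random.randint(0, 2, size=(1, 6), dtype='int32')   # 1 = vowel, 0 = consonant
    print(model.predict([letters, morph, con_vow]).shape)           # (1, 6, 1): one stress score per letter
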
+
+
+def create_index_of_letters(word):
+    integer_encoded = []
+    integer_encoded.append([char_to_int[char] for char in word if char != '-'])  # integer-encode the letters of the given word
+
+    return np.array(integer_encoded, dtype='int32')
+
+
+def create_morph_vector(morph_inf):
+    if ' ' not in morph_inf:  # the case when only half of the information is given: just a POS tag or just morphological features
+        if morph_inf in pos_list:
+            integer_encoded_pos = pos_to_int[morph_inf.split(' ')[0]]  # build the one-hot vector for the POS tag
+            pos_vector = [0 for _ in range(len(pos.split(' ')))]  # 16
+            pos_vector[integer_encoded_pos] = 1
+
+            features_vector = [0 for _ in range(len(features.split(' ')))]
+        else:
+            pos_vector = [0 for _ in range(len(pos.split(' ')))]
+
+            integer_encoded_features = []
+            for char in morph_inf.split('|'):
+                if '(2)' in char or '(3)' in char:
+                    char = char[:-3]
+                if char in features_list:
+                    integer_encoded_features.append(features_to_int[char])
+                else:
+                    print(char, '1) this morphological feature is absent from', morph_inf)
+            features_vector = [0 for _ in range(len(features.split(' ')))]
+            for value in integer_encoded_features:
+                features_vector[value] = 1
+
+    else:
+
+        if morph_inf.split(' ')[0] in pos_list:
+            integer_encoded_pos = pos_to_int[morph_inf.split(' ')[0]]  # build the one-hot vector for the POS tag
+            pos_vector = [0 for _ in range(len(pos.split(' ')))]  # 16
+            pos_vector[integer_encoded_pos] = 1
+        else:
+            print('No such part of speech in the list:', morph_inf.split(' ')[0])
+            pos_vector = [0 for _ in range(len(pos.split(' ')))]
+
+        if morph_inf.split(' ')[1] == '_' or morph_inf.split(' ')[1] == '_(2)':  # the case when only a POS tag is given, without morphological features
+            features_vector = [0 for _ in range(len(features.split(' ')))]
+        else:
+            integer_encoded_features = []
+            for char in morph_inf.split(' ')[1].split('|'):
+                if '(2)' in char or '(3)' in char:
+                    char = char[:-3]
+                if char in features_list:
+                    integer_encoded_features.append(features_to_int[char])
+                else:
+                    print(char, 'this morphological feature is absent from', morph_inf)
+            features_vector = [0 for _ in range(len(features.split(' ')))]
+            for value in integer_encoded_features:
+                features_vector[value] = 1
+
+    morph_inf_vector = pos_vector + features_vector
+
+    morph_inf_vector_list = []
+    morph_inf_vector_list.append(morph_inf_vector)
+    return np.array(morph_inf_vector_list, dtype='float32')
+
+
+def create_index_of_con_vow(word):
+    one_hot_encoded = []
+    for char in word:
+        if char != '-':
+            if char in vowels:
+                one_hot_encoded.append(1)
+            elif char in consonants:
+                one_hot_encoded.append(0)
+
+    one_hot_encoded_list = []
+    one_hot_encoded_list.append(one_hot_encoded)
+    return np.array(one_hot_encoded_list, dtype='int32')
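
Note: all three encoders return batch-of-one arrays, so their outputs can be fed to the model as-is; letter indices are 1-based because 0 is the padding value, and hyphens are skipped. For example:

    from russian_g2p.ner_accentuation.ner_utils import (create_index_of_letters,
                                                        create_morph_vector,
                                                        create_index_of_con_vow)

    word, morph = 'куздру', 'NOUN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing'
    print(create_index_of_letters(word))     # [[12 21  9  5 18 21]]
    print(create_index_of_con_vow(word))     # [[0 1 0 0 0 1]]
    print(create_morph_vector(morph).shape)  # (1, 59): 16 POS slots + 43 feature slots
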
diff --git a/russian_g2p/tests/test_Accentor.py b/russian_g2p/tests/test_Accentor.py
index bac157e..4146c4d 100644
--- a/russian_g2p/tests/test_Accentor.py
+++ b/russian_g2p/tests/test_Accentor.py
@@ -62,10 +62,13 @@ def test_do_accents_positive06(self):
         real_variants = self.__accentor.do_accents(source_phrase)
         self.assertEqual(target_variants, real_variants)
 
-    def test_do_accents_positive07(self):
-        source_phrase = [['хракозябр'], ['впулил'], ['куздру']]
+    def test_do_accents_positive07(self):  # neural network
+        source_phrase = [['хракозябр', 'NOUN Animacy=Inan|Case=Gen|Gender=Fem|Number=Plur'],
+                         ['впулил', 'VERB Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin'],
+                         ['куздру', 'NOUN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing']
+                         ]
         target_variants = [
-            ['хракозябр', 'впулил', 'куздру']
+            ['хракозябр', 'впули+л', 'куздру+']
         ]
         real_variants = self.__accentor.do_accents(source_phrase)
         self.assertEqual(target_variants, real_variants)
@@ -76,10 +79,10 @@ def test_do_accents_positive08(self):
         with self.assertRaises(ValueError):
             _ = accentor.do_accents(source_phrase)
 
-    def test_do_accents_positive09(self):
-        source_phrase = [['серебристо-белый'], ['цвет']]
+    def test_do_accents_positive09(self):  # neural network for 'серебристо-белый', dictionary for 'цвет'
+        source_phrase = [['серебристо-белый', 'ADJ Case=Nom|Gender=Masc|Number=Sing'], ['цвет', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing']]
         target_variants = [
-            ['серебри+сто-бе+лый', 'цве+т']
+            ['серебри+сто-белый', 'цве+т']  # previously 'серебри+сто-бе+лый'
         ]
         real_variants = self.__accentor.do_accents(source_phrase)
         self.assertEqual(target_variants, real_variants)
@@ -101,6 +104,14 @@ def test_do_accents_positive11(self):
         real_variants = accentor.do_accents(source_phrase)
         self.assertEqual(target_variants, real_variants)
 
+    def test_do_accents_positive12(self):  # neural network
+        source_phrase = [['а-зе', 'PROPN Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing']]
+        target_variants = [
+            ['а-зе+']
+        ]
+        real_variants = self.__accentor.do_accents(source_phrase)
+        self.assertEqual(target_variants, real_variants)
+
     def test_do_accents_negative01(self):
         source_phrase_n_morphotags = [['подарок', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
                                       ['для', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
@@ -121,10 +132,10 @@ def test_do_accents_negative03(self):
         with self.assertRaisesRegex(AssertionError, target_err_msg):
             self.__accentor.do_accents(source_phrase)
 
-    def test_do_accents_negative04(self):
+    def test_do_accents_negative04(self):  # neural network
         source_phrase = [['а-зе']]
-        target_err_msg = re.escape('Word `а-зе` is unknown!')
-        accentor = Accentor(exception_for_unknown=True, use_wiki=False)
+        target_err_msg = re.escape('Word `а-зе` has no morphotags; specify them and try again')
+        accentor = Accentor(exception_for_unknown=True)  # the use_wiki argument has been removed
         with self.assertRaisesRegex(ValueError, target_err_msg):
             accentor.do_accents(source_phrase)