diff --git a/russian_g2p/Accentor.py b/russian_g2p/Accentor.py
index 6d998c3..15ca6d6 100644
--- a/russian_g2p/Accentor.py
+++ b/russian_g2p/Accentor.py
@@ -11,9 +11,11 @@
import dawg
import logging
+from russian_g2p.ner_accentuation.NerAccentor import NerAccentor
+
class Accentor:
- def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki=True):
+ def __init__(self, mode='one', debug='no', exception_for_unknown=False):
if debug == 'no':
logging.basicConfig()
else:
@@ -26,7 +28,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
'я'}
self.__russian_vowels = {'а', 'о', 'у', 'э', 'ы', 'и', 'я', 'ё', 'ю', 'е'}
self.exception_for_unknown = exception_for_unknown
- self.use_wiki = use_wiki
+ #self.use_wiki = use_wiki
self.__homonyms = None
self.__simple_words_dawg = None
self.__function_words = None
@@ -35,6 +37,7 @@ def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki
self.__bad_words = []
self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)
+ self.__ner_acc = NerAccentor()
assert mode in ('one', 'many'), 'Set either "one" or "many" variant mode!'
assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'
homograph_dictionary_name = os.path.join(os.path.dirname(__file__), 'data', 'homographs.json')
@@ -93,174 +96,6 @@ def __del__(self):
del self.__bad_words
del self.__function_words
- def get_correct_omograph_wiki(self, root_text, cur_word, morphotag='X'):
- '''
- Разбор омографии.
- Использование морфологической информации о
- слове для их разграничения.
- '''
- langs = root_text.split('
')
- #print('hello?')
- root = None
- for lang in langs:
- #print(lang)
- head_pos = lang.find('Russian')
- if head_pos != -1:
- root = lxml.html.document_fromstring(lang[head_pos:])
- if root == None:
- #print(':^(')
- return []
- good_headers = []
- shallow_vars = set()
- results = set()
- for header in root.findall('.//*[@class="mw-headline"]'):
- #print(cur_word, morphotag)
- if header.text_content() in ['Noun', 'Verb', 'Adjective', 'Adverb', 'Conjunction', 'Determiner', 'Interjection',
- 'Morpheme', 'Numeral', 'Particle', 'Predicative', 'Preposition', 'Pronoun']:
- good_headers.append(header.text_content())
- acc_word = header.getparent().getnext()
- while acc_word.tag != 'p':
- acc_word = acc_word.getnext()
- #print(acc_word)
- result = []
- hyphen = 0
- for part in acc_word.find_class('Cyrl headword'):
- result += [part.text_content()]
- if part.text_content().find('-') != -1:
- hyphen = 1
- #print(result)
- if (hyphen == 1) or (len(result) == 1):
- result = ''.join(result)
- else:
- continue
- if result.replace('ё', 'е́').find('') != -1:
- shallow_vars.add(result)
- if header.text_content()[0] == morphotag[0]:
- #print('The tags are equal')
- if header.text_content()[0] == 'N':
- gramm_info = acc_word.getnext()
- if gramm_info.text_content().find('of') != -1:
- for variant in gramm_info.find_class('form-of-definition'):
- info = variant.findall('a')
- #print(variant.text_content())
- try:
- if info[0].text_content()[0] == 'p':
- case = 'l'
- else:
- case = info[0].text_content()[0]
- #print(case)
- number = info[1].text_content()[0]
- #print(number + case, morphotag)
- if case == morphotag[morphotag.find('Case=') + 5].lower():
- results.add(result)
- except IndexError:
- continue
- else:
- if morphotag[morphotag.find('Case=') + 5].lower() == 'n':
- results.add(result)
- elif header.text_content()[0] == 'V':
- gramm_info = acc_word.getnext()
- if morphotag.find('Mood=Inf') != -1:
- results.add(result)
- #print('Wut',morphotag, results)
- for variant in gramm_info.find_class('form-of-definition'):
- #print(variant.text_content())
- t = 0
- if (variant.text_content().find('indicative') != -1) and (morphotag.find('Mood=Ind') != -1):
- if ((variant.text_content().find('future') != -1) or (variant.text_content().find('present') != -1)) and (morphotag.find('Tense=Notpast') != -1):
- #print('I should be here')
- results.add(result)
- #print(1, results)
- elif (variant.text_content().find('past') != -1) and (morphotag.find('Tense=Past') != -1):
- results.add(result)
- #print(2, results)
- elif (variant.text_content().find('imperative') != -1) and (morphotag.find('Mood=Imp') != -1):
- results.add(result)
- else:
- results.add(result)
- elif (header.text_content()[0] == 'D') and (morphotag.find('PRON') != -1):
- acc_word = header.getparent().getnext()
- result = ''
- for part in acc_word.find_class('Cyrl headword'):
- result += part.text_content()
- results.add(result)
- elif (header.text_content().lower().find(morphotag.split()[0].lower()) != -1):
- acc_word = header.getparent().getnext()
- result = ''
- for part in acc_word.find_class('Cyrl headword'):
- result += part.text_content()
- results.add(result)
- #print(shallow_vars)
- if len(list(shallow_vars)) == 1:
- if list(shallow_vars)[0].replace('ё', 'е+').replace('', '') == cur_word:
- return [list(shallow_vars)[0].replace('ё', 'ё+').replace('', '+').replace('', '+')]
- #print(results)
- if len(list(results)) != 1:
- return []
- best_results = [variant.replace('', '+') for variant in results]
- return list(best_results)
-
- def get_simple_form_wiki(self, root_text, form):
- '''
- Непосредственное нахождение релевантной формы
- и ударение без морфологической информации.
- '''
- root = lxml.html.document_fromstring(root_text)
- rel_forms = set()
- for header in root.findall('.//*[@class="Cyrl headword"][@lang="ru"]'):
- header_text = header.text_content().replace('ё', 'е́')
- header_text_best = header.text_content().replace('ё', 'ё+').replace('', '+')
- if header_text.replace('', '') == form:
- if header_text.find('') != -1:
- rel_forms.add(header_text_best)
- for mention in root.findall('.//i[@class="Cyrl mention"][@lang="ru"]'):
- mention_text = mention.text_content().replace('ё', 'е́')
- mention_text_best = mention.text_content().replace('ё', 'ё+').replace('', '+')
- if mention_text.replace('', '') == form:
- if mention_text.replace('ё', 'е́').find('') != -1:
- rel_forms.add(mention_text_best)
- for mention in root.findall('.//b[@class="Cyrl"][@lang="ru"]'):
- mention_text = mention.text_content().replace('ё', 'е́')
- mention_text_best = mention.text_content().replace('ё', 'ё+').replace('', '+')
- if mention_text.replace('', '') == form:
- if mention_text.replace('ё', 'е́').find('') != -1:
- rel_forms.add(mention_text_best)
- elif mention_text.find('(') != -1:
- if mention_text.replace('', '').find(form) != -1:
- if mention_text.find('') != -1:
- rel_forms.add(mention_text_best[mention_text.replace('', '').find(form):])
- elif re.sub(r'[\(\)́]', '', mention_text) == form:
- rel_forms.add(re.sub(r'[\(\)]', '', mention_text_best))
- for target in root.xpath('.//span[@class="Cyrl"][@lang="ru"]'):
- one_form = target.text_content()
- if one_form.replace('ё', 'е́').replace('', '') == form:
- if one_form.replace('ё', 'е́').find('') != -1:
- rel_forms.add(one_form.replace('ё', 'ё́').replace('', '+'))
- results = list(rel_forms)
- if len(results) == 2:
- if results[0].replace('ё', 'е') == results[1].replace('ё', 'е'):
- rel_forms = set()
- for var in results:
- if var.find('ё') != -1:
- rel_forms.add(var)
- return list(rel_forms)
-
- def load_wiki_page(self, cur_form):
- if not self.use_wiki:
- if self.exception_for_unknown:
- raise ValueError(f'Word `{cur_form}` is unknown!')
- return
- query = urllib.parse.urlencode({ 'title' : cur_form })
- try:
- http_exception_type = urllib.error.HTTPError
- except:
- http_exception_type = urllib.request.HTTPError
- try:
- with urllib.request.urlopen(f'https://en.wiktionary.org/w/index.php?{query}printable=yes') as f:
- root_text = f.read().decode('utf-8')
- return root_text
- except http_exception_type:
- return
def do_accents(self, source_phrase_and_morphotags: list) -> list:
self.logger.debug('Checking the source phrase...')
@@ -484,31 +319,29 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
accented_wordforms += [self.__homonyms[cur_word][morpho_variants[best_ind]]]
accented_wordforms_many.append([self.__homonyms[cur_word][morpho_variants[best_ind]]])
else:
- root_text = self.load_wiki_page(cur_word)
- if root_text != None:
- #print('am I even here?')
- cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
- if len(cur_accented_wordforms) == 1:
- accented_wordforms += [cur_accented_wordforms[0]]
- accented_wordforms_many.append([cur_accented_wordforms[0]])
- self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
- elif len(cur_accented_wordforms) > 1:
- accented_wordforms += [cur_word]
- accented_wordforms_many.append([cur_accented_wordforms])
- warn = 'many'
- else:
- accented_wordforms += [cur_word]
- accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
- warn = 'many'
+ #print('am I even here?')
+ cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0]) # sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
+ if len(cur_accented_wordforms) == 1:
+ accented_wordforms += [cur_accented_wordforms[0]]
+ accented_wordforms_many.append([cur_accented_wordforms[0]])
+ self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
+ elif len(cur_accented_wordforms) > 1:
+ accented_wordforms += [cur_word]
+ accented_wordforms_many.append([cur_accented_wordforms])
+ warn = 'many'
else:
accented_wordforms += [cur_word]
- accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
+ accented_wordforms_many.append(cur_accented_wordforms) # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
warn = 'many'
+
else:
- self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse wictionary page...')
- root_text = self.load_wiki_page(cur_word)
- if root_text != None:
- cur_accented_wordforms = sorted(self.get_simple_form_wiki(root_text, cur_word))
+ #self.logger.debug(f'The word `{cur_word}` was not found in any of the dictionaries\nTrying to parse wictionary page...')
+ if morphotags_list is None:
+ err_msg = f'Word `{cur_word}` has no morphotags. Try again by specifying it'
+ raise ValueError(err_msg)
+ else:
+ cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0]) # sorted(self.get_simple_form_wiki(root_text, cur_word))
+ #print(cur_accented_wordforms)
if len(cur_accented_wordforms) == 1:
accented_wordforms += [cur_accented_wordforms[0]]
accented_wordforms_many.append([cur_accented_wordforms[0]])
@@ -518,20 +351,16 @@ def __do_accents(self, words_list: list, morphotags_list: list=None) -> list:
accented_wordforms_many.append([cur_word])
warn = 'no'
else:
- cur_accented_wordforms = sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
- if len(cur_accented_wordforms) == 1:
- accented_wordforms += [cur_accented_wordforms[0]]
- accented_wordforms_many.append([cur_accented_wordforms[0]])
- self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
- else:
- accented_wordforms += [cur_word]
- accented_wordforms_many.append(sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
- warn = 'many'
+ #cur_accented_wordforms = self.__ner_acc.define_stress(cur_word, morphotags_list[0])#sorted(self.get_correct_omograph_wiki(root_text, cur_word, morphotags_list[0]))
+ #if len(cur_accented_wordforms) == 1:
+ # accented_wordforms += [cur_accented_wordforms[0]]
+ # accented_wordforms_many.append([cur_accented_wordforms[0]])
+ # self.__new_homonyms[cur_word] = {morphotags_list[0] : cur_accented_wordforms[0]}
+ #else:
+ accented_wordforms += [cur_word]
+ accented_wordforms_many.append(cur_accented_wordforms) # sorted([self.__homonyms[cur_word][it] for it in self.__homonyms[cur_word]]))
+ warn = 'many'
- else:
- accented_wordforms += [cur_word]
- accented_wordforms_many.append([cur_word])
- warn = 'no'
if i == 0:
if (accented_wordforms[0].find('+') != -1) or (len(separate_tokens) == 2):
break
diff --git a/russian_g2p/ner_accentuation/NerAccentor.py b/russian_g2p/ner_accentuation/NerAccentor.py
new file mode 100644
index 0000000..a73e37d
--- /dev/null
+++ b/russian_g2p/ner_accentuation/NerAccentor.py
@@ -0,0 +1,30 @@
+import numpy as np
+from russian_g2p.ner_accentuation.ner_utils import load_place_stress_model, create_index_of_letters, create_morph_vector, create_index_of_con_vow
+
+
+class NerAccentor:
+ def __init__(self):
+ self._place_stress_model = load_place_stress_model()
+
+ def define_stress(self, word, morph_inf):
+ all_word_vectors = [create_index_of_letters(word), create_morph_vector(morph_inf), create_index_of_con_vow(word)]
+ stress_vector_two_list = np.asarray(self._place_stress_model.predict(all_word_vectors, verbose=1) >= 0.34,
+ dtype=np.int32) # список в списке
+ stress_vector = stress_vector_two_list.reshape(stress_vector_two_list.shape[0], stress_vector_two_list.shape[1], )[
+ 0]
+ stress_index = []
+ for i in range(len(stress_vector)):
+ if stress_vector[i] == 1:
+ stress_index.append(i)
+
+ word_with_stress = [letter for letter in word]
+ words_return = []
+ if len(stress_index) > 0:
+ for index in stress_index:
+ word_with_stress.insert(index + 1, '+')
+ words_return.append(''.join(word_with_stress))
+ word_with_stress.pop(index + 1)
+ else:
+ words_return.append(''.join(word_with_stress))
+
+ return words_return
diff --git a/russian_g2p/ner_accentuation/Placement_of_stress_best_model.h5 b/russian_g2p/ner_accentuation/Placement_of_stress_best_model.h5
new file mode 100644
index 0000000..5080874
Binary files /dev/null and b/russian_g2p/ner_accentuation/Placement_of_stress_best_model.h5 differ
diff --git a/russian_g2p/ner_accentuation/ner_utils.py b/russian_g2p/ner_accentuation/ner_utils.py
new file mode 100644
index 0000000..4c30d0d
--- /dev/null
+++ b/russian_g2p/ner_accentuation/ner_utils.py
@@ -0,0 +1,183 @@
+import numpy as np
+import tensorflow as tf
+from keras.models import Sequential
+from keras.layers import Embedding, Input, Dense, GRU, TimeDistributed
+from tensorflow_addons.losses import SigmoidFocalCrossEntropy
+
+
+alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
+char_to_int = dict((c, i) for i, c in enumerate(alphabet, start=1)) # определяем буквы алфавита в цифры
+
+pos_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'CONJ','DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'SCONJ', 'VERB', 'X']
+pos = 'ADJ ADP ADV AUX CCONJ CONJ DET INTJ NOUN NUM PART PRON PROPN SCONJ VERB X' # список частей речи
+pos_to_int = dict((pos.split(' ')[i], i) for i in range(len(pos.split(' ')))) # 16
+
+features_list = [
+'Abbr=Yes',
+'Gender=Masc', 'Gender=Fem', 'Gender=Neut',
+'Animacy=Anim', 'Animacy=Inan',
+'Number=Sing', 'Number=Plur', 'Number=Ptan', 'Number=Coll',
+'Case=Nom', 'Case=Gen', 'Case=Par', 'Case=Dat', 'Case=Acc', 'Case=Loc', 'Case=Ins', 'Case=Voc',
+'Degree=Pos', 'Degree=Cmp', 'Degree=Sup',
+'VerbForm=Conv', 'VerbForm=Fin', 'VerbForm=Inf', 'VerbForm=Part', 'VerbForm=PartRes', 'VerbForm=Trans',
+'Mood=Ind', 'Mood=Imp', 'Mood=Cnd',
+'Tense=Past', 'Tense=Pres', 'Tense=Fut',
+'Aspect=Imp', 'Aspect=Perf',
+'Voice=Act', 'Voice=Pass', 'Voice=Mid',
+'Person=1', 'Person=2', 'Person=3',
+'Variant=Full', 'Variant=Brev',
+]
+features = " ".join(features_list) # список морф признаков
+features_to_int = dict((features.split(' ')[i], i) for i in range(len(features.split(' ')))) # 41
+
+vowels = 'аеёиоуыэюя'
+consonants = 'бвгджзйклмнпрстфхцчшщъь'
+
+
+class MaskCalculator(tf.keras.layers.Layer):
+ def __init__(self, output_dim, **kwargs):
+ self.output_dim = output_dim
+ super(MaskCalculator, self).__init__(**kwargs)
+
+ def build(self, input_shape):
+ super(MaskCalculator, self).build(input_shape)
+
+ def call(self, inputs, **kwargs):
+ return tf.keras.backend.permute_dimensions(
+ x=tf.keras.backend.repeat(
+ x=tf.keras.backend.cast(
+ x=tf.keras.backend.greater(
+ x=inputs,
+ y=0
+ ),
+ dtype='float32'
+ ),
+ n=self.output_dim
+ ),
+ pattern=(0, 2, 1)
+ )
+
+ def compute_output_shape(self, input_shape):
+ assert len(input_shape) == 1
+ shape = list(input_shape)
+ shape.append(self.output_dim)
+ return tuple(shape)
+
+
+def create_place_stress_model():
+    input_words = Input(shape=(None,), name='InputWords', dtype='int32') # (I) Архитектура модели
+ embedding = Embedding(input_dim=len(alphabet) + 1, output_dim=256, mask_zero=True,
+ name='EmbeddingMaskForWords') # шаг 1: маскирование матриц Цепочек Индексов Символов
+ output_mask_words = embedding(input_words)
+
+ input_morph_inf = Input(shape=(59,), name='InputMorphInf',
+ dtype='float32') # шаг 2: вход матрицы векторов МорфИнф и обработка его Dense-слоем
+ dense_morph_inf = Dense(units=256, name='DenseMorphInf')(
+ input_morph_inf) # подаем размерность без учёта мини-батчей
+
+ gru = GRU(units=256, return_sequences=True, name='RecurrentLayerGRU')(output_mask_words,
+                                                                          initial_state=dense_morph_inf) # шаг 3: рекуррентный слой
+
+ input_cons_vow = Input(shape=(None,), name='InputConsonantVowel', dtype='int32') # шаг 4: маскирование согласных
+ output_mask_cons_vow = MaskCalculator(output_dim=256, trainable=False, name='OutMaskCalculator')(
+        input_cons_vow) # ручной слой маски
+ masked_sequence_output = tf.keras.layers.Multiply(name='OutMaskMultiplicator')([output_mask_cons_vow, gru])
+ masked_sequence_output = tf.keras.layers.Masking(name='OutMasking')(masked_sequence_output)
+
+ cls_layer = TimeDistributed( # шаг 5: слой TimeDistributed
+ Dense(units=1, activation='sigmoid'),
+ name='ClassificationLayer')(masked_sequence_output)
+
+ place_stress_model = tf.keras.Model(
+ inputs=[input_words, input_morph_inf, input_cons_vow],
+ outputs=cls_layer,
+ name='Placement_of_stress_model')
+
+ place_stress_model.compile(loss=SigmoidFocalCrossEntropy(), optimizer='adam') # (II) Скомпилирование модели
+ # place_stress_model.summary()
+
+ return place_stress_model
+
+
+def load_place_stress_model():
+ place_stress_model = create_place_stress_model()
+ place_stress_model.load_weights(filepath='russian_g2p/ner_accentuation/Placement_of_stress_best_model.h5') # russian_g2p/
+
+ return place_stress_model
+
+
+def create_index_of_letters(word):
+ integer_encoded = []
+ integer_encoded.append([char_to_int[char] for char in word if char != '-']) # Integer Encoding: проводим для заданного слова целочисленное кодирование
+
+ return np.array(integer_encoded, dtype='int32')
+
+
+def create_morph_vector(morph_inf):
+ if ' ' not in morph_inf: # случай, если указана только половина информации: только часть речи / только морф инф
+ if morph_inf in pos_list:
+            integer_encoded_pos = pos_to_int[morph_inf.split(' ')[0]] # создаем вектор для части речи
+ pos_vector = [0 for _ in range(len(pos.split(' ')))] # 15
+ pos_vector[integer_encoded_pos] = 1
+
+ features_vector = [0 for _ in range(len(features.split(' ')))]
+ else:
+ pos_vector = [0 for _ in range(len(pos.split(' ')))]
+
+ integer_encoded_features = []
+ for char in morph_inf.split('|'):
+ if '(2)' in char or '(3)' in char:
+ char = char[:-3]
+ if char in features_list:
+ integer_encoded_features.append(features_to_int[char])
+ else:
+ print(char, '1) эта морф информация отсутствует в ', morph_inf)
+ features_vector = [0 for _ in range(len(features.split(' ')))]
+ for value in integer_encoded_features:
+ features_vector[value] = 1
+
+ else:
+
+ if morph_inf.split(' ')[0] in pos_list:
+            integer_encoded_pos = pos_to_int[morph_inf.split(' ')[0]] # создаем вектор для части речи
+ pos_vector = [0 for _ in range(len(pos.split(' ')))] # 15
+ pos_vector[integer_encoded_pos] = 1
+ else:
+ print('Нет такой части речи в списке: ', morph_inf.split(' ')[0])
+ pos_vector = [0 for _ in range(len(pos.split(' ')))]
+
+ if morph_inf.split(' ')[1] == '_' or morph_inf.split(' ')[
+ 1] == '_(2)': # случай, в которых не указана морф инф, а только часть речи
+ features_vector = [0 for _ in range(len(features.split(' ')))]
+ else:
+ integer_encoded_features = []
+ for char in morph_inf.split(' ')[1].split('|'):
+ if '(2)' in char or '(3)' in char:
+ char = char[:-3]
+ if char in features_list:
+ integer_encoded_features.append(features_to_int[char])
+ else:
+ print(char, 'эта морф информация отсутствует в ', morph_inf)
+ features_vector = [0 for _ in range(len(features.split(' ')))]
+ for value in integer_encoded_features:
+ features_vector[value] = 1
+
+ morph_inf_vector = pos_vector + features_vector
+
+ morph_inf_vector_list = []
+ morph_inf_vector_list.append(morph_inf_vector)
+ return np.array(morph_inf_vector_list, dtype='float32')
+
+
+def create_index_of_con_vow(word):
+ one_hot_encoded = []
+ for char in word:
+ if char != '-':
+ if char in vowels:
+ one_hot_encoded.append(1)
+ elif char in consonants:
+ one_hot_encoded.append(0)
+
+ one_hot_encoded_list = []
+ one_hot_encoded_list.append(one_hot_encoded)
+ return np.array(one_hot_encoded_list, dtype='int32')
diff --git a/russian_g2p/tests/test_Accentor.py b/russian_g2p/tests/test_Accentor.py
index bac157e..4146c4d 100644
--- a/russian_g2p/tests/test_Accentor.py
+++ b/russian_g2p/tests/test_Accentor.py
@@ -62,10 +62,13 @@ def test_do_accents_positive06(self):
real_variants = self.__accentor.do_accents(source_phrase)
self.assertEqual(target_variants, real_variants)
- def test_do_accents_positive07(self):
- source_phrase = [['хракозябр'], ['впулил'], ['куздру']]
+ def test_do_accents_positive07(self): # нейросеть
+ source_phrase = [['хракозябр', 'NOUN Animacy=Inan|Case=Gen|Gender=Fem|Number=Plur'],
+ ['впулил', 'VERB Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin'],
+ ['куздру', 'NOUN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing']
+ ]
target_variants = [
- ['хракозябр', 'впулил', 'куздру']
+ ['хракозябр', 'впули+л', 'куздру+']
]
real_variants = self.__accentor.do_accents(source_phrase)
self.assertEqual(target_variants, real_variants)
@@ -76,10 +79,10 @@ def test_do_accents_positive08(self):
with self.assertRaises(ValueError):
_ = accentor.do_accents(source_phrase)
- def test_do_accents_positive09(self):
- source_phrase = [['серебристо-белый'], ['цвет']]
+ def test_do_accents_positive09(self): # нейросеть для серебристо-белый, словарь для цвет
+ source_phrase = [['серебристо-белый', 'ADJ Case=Nom|Gender=Masc|Number=Sing'], ['цвет', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing']]
target_variants = [
- ['серебри+сто-бе+лый', 'цве+т']
+ ['серебри+сто-белый', 'цве+т'] # 'серебри+сто-бе+лый'
]
real_variants = self.__accentor.do_accents(source_phrase)
self.assertEqual(target_variants, real_variants)
@@ -101,6 +104,14 @@ def test_do_accents_positive11(self):
real_variants = accentor.do_accents(source_phrase)
self.assertEqual(target_variants, real_variants)
+ def test_do_accents_positive12(self): # нейросеть
+ source_phrase = [['а-зе', 'PROPN Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing']]
+ target_variants = [
+ ['а-зе+']
+ ]
+ real_variants = self.__accentor.do_accents(source_phrase)
+ self.assertEqual(target_variants, real_variants)
+
def test_do_accents_negative01(self):
source_phrase_n_morphotags = [['подарок', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
['для', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
@@ -121,10 +132,10 @@ def test_do_accents_negative03(self):
with self.assertRaisesRegex(AssertionError, target_err_msg):
self.__accentor.do_accents(source_phrase)
- def test_do_accents_negative04(self):
+ def test_do_accents_negative04(self): # нейросеть
source_phrase = [['а-зе']]
- target_err_msg = re.escape('Word `а-зе` is unknown!')
- accentor = Accentor(exception_for_unknown=True, use_wiki=False)
+ target_err_msg = re.escape('Word `а-зе` has no morphotags. Try again by specifying it')
+ accentor = Accentor(exception_for_unknown=True)#, use_wiki=False)
with self.assertRaisesRegex(ValueError, target_err_msg):
accentor.do_accents(source_phrase)