From d5f0e07a0825e0f3fdf0ffca85c0821449cc2376 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Mon, 19 Aug 2024 07:27:24 +0200 Subject: [PATCH 1/7] Cardinals up to a hundred trillions, timeFST and transliteration Signed-off-by: kurt0cougar --- .../text_normalization/rw/__init__.py | 18 + .../text_normalization/rw/data/__init__.py | 14 + .../rw/data/whitelist/__init__.py | 14 + .../data/whitelist/kinya_transliterations.tsv | 175 ++++++++++ .../text_normalization/rw/taggers/__init__.py | 14 + .../text_normalization/rw/taggers/cardinal.py | 319 ++++++++++++++++++ .../text_normalization/rw/taggers/time.py | 106 ++++++ .../rw/taggers/tokenize_and_classify.py | 71 ++++ .../rw/taggers/whitelist.py | 30 ++ .../text_normalization/rw/utils.py | 30 ++ .../rw/verbalizers/__init__.py | 14 + .../text_normalization/rw/verbalizers/time.py | 32 ++ .../rw/verbalizers/verbalize.py | 32 ++ .../rw/verbalizers/verbalize_final.py | 56 +++ tests/nemo_text_processing/rw/__init__.py | 13 + .../test_cases_cardinal.txt | 57 ++++ .../test_cases_time.txt | 14 + .../test_cases_whitelist.txt | 3 + .../test_cases_word.txt | 29 ++ .../nemo_text_processing/rw/test_cardinal.py | 40 +++ .../rw/test_sparrowhawk_normalization.sh | 60 ++++ tests/nemo_text_processing/rw/test_time.py | 36 ++ .../nemo_text_processing/rw/test_whitelist.py | 36 ++ tests/nemo_text_processing/rw/test_word.py | 36 ++ .../pynini_export.py | 9 +- 25 files changed, 1257 insertions(+), 1 deletion(-) create mode 100644 nemo_text_processing/text_normalization/rw/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv create mode 100644 nemo_text_processing/text_normalization/rw/taggers/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/cardinal.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/time.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/whitelist.py create mode 100644 nemo_text_processing/text_normalization/rw/utils.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/time.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py create mode 100644 tests/nemo_text_processing/rw/__init__.py create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt create mode 100644 tests/nemo_text_processing/rw/test_cardinal.py create mode 100644 tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh create mode 100644 tests/nemo_text_processing/rw/test_time.py create mode 100644 tests/nemo_text_processing/rw/test_whitelist.py create mode 100644 tests/nemo_text_processing/rw/test_word.py diff --git a/nemo_text_processing/text_normalization/rw/__init__.py b/nemo_text_processing/text_normalization/rw/__init__.py new 
file mode 100644 index 000000000..b136ce06b --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.rw.verbalizers.verbalize_final import VerbalizeFinalFst + diff --git a/nemo_text_processing/text_normalization/rw/data/__init__.py b/nemo_text_processing/text_normalization/rw/data/__init__.py new file mode 100644 index 000000000..9fb50331b --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py new file mode 100644 index 000000000..9fb50331b --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
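Note: once built, these grammars are driven through NeMo's existing Normalizer entry point, exactly as the unit tests added below do. A minimal usage sketch (the expected output for "1:59" is taken from test_cases_time.txt):

    from nemo_text_processing.text_normalization.normalize import Normalizer

    normalizer = Normalizer(input_case='cased', lang='rw')
    print(normalizer.normalize("1:59", verbose=False))
    # -> saa saba n'iminota mirongo itanu n'icyenda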
diff --git a/nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv b/nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv new file mode 100644 index 000000000..e550214cd --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv @@ -0,0 +1,175 @@ +Chris kiris +Hipkins Hipikineze +Dexamethasone dekisametazone +corticosteroid koritikositeroyide +immunosuppressant iminosupuresa +CDC sidisi +RBC arabisi +RISA risa +minisante minisante +sars sarisi +pfizer pifiza +BionTech biyoniteki +dollar dorari +ADN ade eni +BBC bibisi +Victoria vikitoria +Espagne esipanye +Nouvelle-Zélande nuveli zerandi +lopinavir lopinaviri +rotinavir rotinaviri +HIV heci ayivi +seychelles seyisheli +maroc maroki +sputnik siputinike +la crosse la korosi +paul pawulo +www wawawa +gov govu +rw rwa +http hecititipi +Berlin iberile +Remdesivir remudesivire +coranavirus koronavirusi +covid kovide +quarantine karantine +oms o e mesi +basketball basiketibalu +football futibolu +cholera kolera +radio radiyo +television televiziyo +service serivise +prof purofu +royal ruyolo +college koreji +health ubuzima +SARS-CoV-kabiri sarisi-kov-kabiri +recovery rekoveri +Dr dogiteri +protein puroteyine +spike sipiyike +victoria vigitoriya +technique tekinike +cell selile +electro erekitoro +sanitizer sanitayiza +Orthocoronavirinae oritocoronavirinaye +coronavirinae coronavirinaye +nidovirales nidoviralesi +Covs covuse +antibody antibodi +Hydroxychloroquine hidurokulorokine +company kompani +oxygen ogisijeni +Carolina karolina +jonathan jonatani +hyper hiperi +micro mikoro +microscope mikorosikope +microchip mikorocipu +glycoproteine gilicopuroteyine +sport siporo +lockdown lokidawuno +email imeli +japan japani +science siyansi +pubmed pubimedi +koica koyika +jica jika +DNA diyeniyi +RNA araneyi +wuhan wuhani +huanan hwanani +thermoregulation terimoregulashiyoni +alveolar aliviyola +hypoxemia hipokisemiya +PCR pisiyara +rapid-test rapidi-tesite +sepsis sepusisi +septique seputike +pulmonary pirimonari +extra egisitura +Real riyo +Time tayimu +Polymerase porimerase +poly pori +Chain ceyini +Reaction reyakishoni +hypoxic hipokisike +ICU ayisiyu +ambulance amburansi +antibiotic antibiyotike +vaccine vagisine +MEDAIR medayire +guardian garidiyani +covax covagise +paris parisi +transplant turansipulanti +laboratoire laboratuwari +Tedros tewodurosi +Ghebreyesus gebureyesusi +polybasic poribazike +china chinwa +RT-PCR arati-pisiyara +UNICEF yunicefu +HCR hashiseyeri +UNESCO yunesico +UN oni +World woridi +bank banki +FMI efu emi +new-york nuyoriki +times tayimuze +MERS merise +electron erekituronu +RDB aradibi +Platelet-fibrin puratele-fibirini +arterial ariteriyo +coagulopathie kowagulopati +RBD arabidi +RDF aradiyefu +module modile +Oxford ogisiforudu +AstraZeneca asutarazeneka +Astra-Zeneca asutarazeneka +astra asutara +zeneca zeneka +chlorine kulorakine +acide aside +peroxyacetic perukisiyatike +chlorhexidine kulorekidine +chloroform kuloroforume +disinfectant dezenkifekita +carbon kariboni +Hopkins hopikinze +communist komunisite +Tanzania tanzaniya +Africa afurika +VOA vi o aye +Jean yohana +Marie mariya +Vianney viyane +chimiotherapie kimyoterapi +sinopharm sinofarume +bus busi +ventilator ventirata +ventilators ventirataze +mRNA emu araneyi +Favipiravir favipiravire +command komandi +center santire +app apu +phone fone +telephone terefone +clinical kiliniko +clinique kilinike +lymphocytes lemfosite +twitter tuwita +youtube yutubi 
+facebook fasibuki +google gugoli +com komu +Antibodies antibodize +COVID-CUMI kovide-cumi +COVID-CUMI-N'ICYENDA kovide-cumi-n'icyenda diff --git a/nemo_text_processing/text_normalization/rw/taggers/__init__.py b/nemo_text_processing/text_normalization/rw/taggers/__init__.py new file mode 100644 index 000000000..90380542f --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/taggers/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py new file mode 100644 index 000000000..68abc5fbd --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py @@ -0,0 +1,319 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
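+# CardinalFst tags digit strings (e.g. "10000") with their spelled-out
+# Kinyarwanda form, producing tokens such as: cardinal { integer: "ibihumbi icumi" }
+# Word-to-digit maps are declared per magnitude below, combined into graphs,
+# inverted, and post-processed by cdrewrite rules that insert the
+# connectives "na"/"n'" between words.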
+
+import pynini
+from pynini.lib import pynutil
+import string
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, NEMO_CHAR, insert_space
+from nemo_text_processing.text_normalization.rw.utils import get_abs_path
+
+def apply_fst(text, fst):
+    try:
+        print(pynini.shortestpath(text @ fst).string())
+        print(len(pynini.shortestpath(text @ fst).string()))
+    except pynini.FstOpError:
+        print(f"Error: no valid output with given input: '{text}'")
+
+class CardinalFst(GraphFst):
+    def __init__(self):
+        super().__init__(name="cardinal", kind="classify")
+        alphabet = string.ascii_letters
+        rewrite_na_fst = pynini.cdrewrite(pynini.cross(" ", " na "), pynini.union(*"aeiouAEIOU "), pynini.union(*"BCDFGHJKLMNPQRSTVWXYZbcdfghjklmnpqrstvwxyz"), NEMO_CHAR.closure())
+        rewrite_n_fst = pynini.cdrewrite(pynini.cross(" ", " n'"), pynini.union(*"aeiouAEIOU "), pynini.union(*"aeiouAEIOU"), NEMO_CHAR.closure())
+        remove_underscore_fst = pynini.cdrewrite(pynini.cross("_", " "), pynini.union(*alphabet), pynini.union(*alphabet), NEMO_CHAR.closure())
+        remove_extra_space_fst = pynini.cdrewrite(pynini.cross("  ", " "), pynini.union(*alphabet), pynini.union(*alphabet), NEMO_CHAR.closure())
+        remove_trailing_space_fst = pynini.cdrewrite(pynini.cross(pynini.accep(" ").closure(), ""), pynini.union(*alphabet).closure(), "[EOS]", NEMO_CHAR.closure())
+
+        rewrite_add_separator_fst = pynini.compose(rewrite_na_fst, rewrite_n_fst)
+        ten_thousand = pynini.string_map([("ibihumbi_icumi", "10")])
+        ten = pynini.string_map([("icumi", "10")])
+        digits = pynini.string_map([
+            ("rimwe", "1"),
+            ("kabiri", "2"),
+            ("gatatu", "3"),
+            ("kane", "4"),
+            ("gatanu", "5"),
+            ("gatandatu", "6"),
+            ("karindwi", "7"),
+            ("umunani", "8"),
+            ("icyenda", "9"),
+        ])
+        digits_for_thousands = pynini.string_map([
+            ("", "0"),
+            ("kimwe", "1"),
+            ("bibiri", "2"),
+            ("bitatu", "3"),
+            ("bine", "4"),
+            ("bitanu", "5"),
+            ("bitandatu", "6"),
+            ("birindwi", "7"),
+            ("umunani", "8"),
+            ("icyenda", "9")
+        ])
+        digits_millions_trillions = pynini.string_map([
+            ("", "0"),
+            ("imwe", "1"),
+            ("ebyiri", "2"),
+            ("eshatu", "3"),
+            ("enye", "4"),
+            ("eshanu", "5"),
+            ("esheshatu", "6"),
+            ("zirindwi", "7"),
+            ("umunani", "8"),
+            ("icyenda", "9")
+        ])
+        tens = pynini.string_map([
+            (" ", "0"),
+            ("makumyabiri", "2"),
+            ("mirongo_itatu", "3"),
+            ("mirongo_ine", "4"),
+            ("mirongo_itanu", "5"),
+            ("mirongo_itandatu", "6"),
+            ("mirongo_irindwi", "7"),
+            ("mirongo_inani", "8"),
+            ("mirongo_icyenda", "9")
+        ])
+        tens_for_ends = pynini.string_map([("icumi", "1")]) | tens
+        tens_for_beginnings = pynini.string_map([("cumi", "1")]) | tens
+        hundreds = pynini.string_map([
+            ("ijana", "1"),
+            ("magana_abiri", "2"),
+            ("magana_atatu", "3"),
+            ("magana_ane", "4"),
+            ("magana_atanu", "5"),
+            ("magana_atandatu", "6"),
+            ("magana_arindwi", "7"),
+            ("magana_inani", "8"),
+            ("magana_cyenda", "9")
+        ])
+        thousands = pynini.string_map([
+            ("igihumbi", "1"),
+            ("ibihumbi_bibiri", "2"),
+            ("ibihumbi_bitatu", "3"),
+            ("ibihumbi_bine", "4"),
+            ("ibihumbi_bitanu", "5"),
+            ("ibihumbi_bitandatu", "6"),
+            ("ibihumbi_birindwi", "7"),
+            ("ibihumbi_umunani", "8"),
+            ("ibihumbi_icyenda", "9")
+        ])
+        tens_of_thousands = pynini.string_map([
+            ("ibihumbi_cumi", "1"),
+            ("ibihumbi_makumyabiri", "2"),
+            ("ibihumbi_mirongo_itatu", "3"),
+            ("ibihumbi_mirongo_ine", "4"),
+            ("ibihumbi_mirongo_itanu", "5"),
+            ("ibihumbi_mirongo_itandatu", "6"),
+            ("ibihumbi_mirongo_irindwi", "7"),
+            ("ibihumbi_mirongo_inani", "8"),
+            ("ibihumbi_mirongo_icyenda", "9")
+        ])
+        hundreds_of_thousands = pynini.string_map([
+            ("ibihumbi_ijana", "1"),
+            ("ibihumbi_magana_abiri", "2"),
+            ("ibihumbi_magana_atatu", "3"),
+            ("ibihumbi_magana_ane", "4"),
+            ("ibihumbi_magana_atanu", "5"),
+            ("ibihumbi_magana_atandatu", "6"),
+            ("ibihumbi_magana_arindwi", "7"),
+            ("ibihumbi_magana_inani", "8"),
+            ("ibihumbi_magana_cyenda", "9")
+        ])
+        millions = pynini.string_map([
+            ("miliyoni", "1"),
+            ("miliyoni_ebyiri", "2"),
+            ("miliyoni_eshatu", "3"),
+            ("miliyoni_enye", "4"),
+            ("miliyoni_eshanu", "5"),
+            ("miliyoni_esheshatu", "6"),
+            ("miliyoni_zirindwi", "7"),
+            ("miliyoni_umunani", "8"),
+            ("miliyoni_icyenda", "9")
+        ])
+        tens_of_millions = pynini.string_map([
+            ("miliyoni_cumi", "1"),
+            ("miliyoni_makumyabiri", "2"),
+            ("miliyoni_mirongo_itatu", "3"),
+            ("miliyoni_mirongo_ine", "4"),
+            ("miliyoni_mirongo_itanu", "5"),
+            ("miliyoni_mirongo_itandatu", "6"),
+            ("miliyoni_mirongo_irindwi", "7"),
+            ("miliyoni_mirongo_inani", "8"),
+            ("miliyoni_mirongo_icyenda", "9")
+        ])
+        hundreds_of_millions = pynini.string_map([
+            ("miliyoni_ijana", "1"),
+            ("miliyoni_magana_abiri", "2"),
+            ("miliyoni_magana_atatu", "3"),
+            ("miliyoni_magana_ane", "4"),
+            ("miliyoni_magana_atanu", "5"),
+            ("miliyoni_magana_atandatu", "6"),
+            ("miliyoni_magana_arindwi", "7"),
+            ("miliyoni_magana_inani", "8"),
+            ("miliyoni_magana_cyenda", "9")
+        ])
+        trillions = pynini.string_map([
+            ("tiriyoni", "1"),
+            ("tiriyoni_ebyiri", "2"),
+            ("tiriyoni_eshatu", "3"),
+            ("tiriyoni_enye", "4"),
+            ("tiriyoni_eshanu", "5"),
+            ("tiriyoni_esheshatu", "6"),
+            ("tiriyoni_zirindwi", "7"),
+            ("tiriyoni_umunani", "8"),
+            ("tiriyoni_icyenda", "9")
+        ])
+        tens_of_trillions = pynini.string_map([
+            ("tiriyoni_icumi", "1"),
+            ("tiriyoni_makumyabiri", "2"),
+            ("tiriyoni_mirongo_itatu", "3"),
+            ("tiriyoni_mirongo_ine", "4"),
+            ("tiriyoni_mirongo_itanu", "5"),
+            ("tiriyoni_mirongo_itandatu", "6"),
+            ("tiriyoni_mirongo_irindwi", "7"),
+            ("tiriyoni_mirongo_inani", "8"),
+            ("tiriyoni_mirongo_icyenda", "9")
+        ])
+        hundreds_of_trillions = pynini.string_map([
+            ("tiriyoni_ijana", "1"),
+            ("tiriyoni_magana_abiri", "2"),
+            ("tiriyoni_magana_atatu", "3"),
+            ("tiriyoni_magana_ane", "4"),
+            ("tiriyoni_magana_atanu", "5"),
+            ("tiriyoni_magana_atandatu", "6"),
+            ("tiriyoni_magana_arindwi", "7"),
+            ("tiriyoni_magana_inani", "8"),
+            ("tiriyoni_magana_cyenda", "9")
+        ])
+        THREE_ZEROS = "000"
+        FOUR_ZEROS = "0000"
+        FIVE_ZEROS = "00000"
+        SIX_ZEROS = "000000"
+        SEVEN_ZEROS = "0000000"
+        EIGHT_ZEROS = "00000000"
+        NINE_ZEROS = "000000000"
+
+        zero = pynini.string_map([("zeru", "0")])
+        rewrite_remove_comma_fst = pynini.cdrewrite(pynini.cross(",", ""), pynini.union(*"0123456789"), pynini.union(*"0123456789"), NEMO_CHAR.closure())
+        single_digits_graph = pynini.invert(digits | zero)
+        single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph)
+        remove_comma = rewrite_remove_comma_fst @ single_digits_graph
+
+        graph_tens_ends = tens_for_ends + pynutil.delete(" ") + digits | tens_for_ends + pynutil.insert("0")
+        graph_tens_starts = tens_for_beginnings + pynutil.delete(" ") + digits | tens_for_beginnings + pynutil.insert("0")
+
+        graph_tens_for_thousands = tens_for_beginnings + pynutil.delete(" ") + digits_for_thousands | tens_for_beginnings + pynutil.insert("0")
+
+        graph_tens_for_millions_trillions = tens_for_beginnings + pynutil.delete(" ") + digits_millions_trillions \
+            | tens_for_beginnings + pynutil.insert("0")
+        graph_hundreds = hundreds + pynutil.delete(" ") + graph_tens_ends | hundreds + pynutil.insert("00") \
+            | hundreds + pynutil.delete(" ") + pynutil.insert("0") + digits
+        graph_thousands = thousands + pynutil.delete(" ") + graph_hundreds | thousands + pynutil.insert(THREE_ZEROS) \
+            | thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends \
+            | thousands + pynutil.delete(" ") + pynutil.insert("00") + digits
+
+        graph_ten_thousand_and_hundreds = ten_thousand + pynutil.insert(THREE_ZEROS) | ten_thousand + pynutil.delete(" ") + graph_hundreds \
+            | ten_thousand + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends \
+            | ten_thousand + pynutil.delete(" ") + pynutil.insert("00") + digits
+        prefix_tens_of_thousands = tens_of_thousands + pynutil.delete(" ") + digits_for_thousands
+        graph_tens_of_thousands = pynutil.add_weight(graph_ten_thousand_and_hundreds, weight=-0.1) \
+            | prefix_tens_of_thousands + pynutil.delete(" ") + graph_hundreds \
+            | prefix_tens_of_thousands + pynutil.insert(THREE_ZEROS) \
+            | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_hundreds \
+            | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends \
+            | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("00") + digits
+
+        prefix_hundreds_of_thousands = hundreds_of_thousands + pynutil.delete(" ") + graph_tens_for_thousands
+        graph_hundreds_of_thousands = hundreds_of_thousands + pynutil.insert(FIVE_ZEROS) \
+            | prefix_hundreds_of_thousands + pynutil.insert(THREE_ZEROS) \
+            | prefix_hundreds_of_thousands + pynutil.delete(" ") + graph_hundreds \
+            | pynutil.add_weight(prefix_hundreds_of_thousands + pynutil.delete(" ") + pynutil.insert("00") + digits, weight=-0.1) \
+            | prefix_hundreds_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_for_thousands
+
+        graph_millions = millions + pynutil.delete(" ") + graph_hundreds_of_thousands | millions + pynutil.insert(SIX_ZEROS) \
+            | millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands \
+            | millions + pynutil.delete(" ") + pynutil.insert("00") + graph_thousands \
+            | millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds \
+            | millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends \
+            | millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + digits
+
+        prefix_tens_of_millions = tens_of_millions + pynutil.delete(" ") + digits_millions_trillions
+        graph_tens_of_millions = prefix_tens_of_millions + pynutil.delete(" ") + graph_hundreds_of_thousands \
+            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) \
+            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands \
+            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds \
+            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends \
+            | tens_of_millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_tens_ends \
+            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + digits
+
+        prefix_hundreds_of_millions = hundreds_of_millions + pynutil.delete(" ") + graph_tens_for_millions_trillions
+        graph_hundreds_of_millions = prefix_hundreds_of_millions + pynutil.delete(" ") + graph_hundreds_of_thousands \
+            | prefix_hundreds_of_millions + pynutil.insert(SIX_ZEROS) \
+            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands \
+            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert("00") + graph_thousands \
+            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds \
+            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends
+
+        graph_trillions = trillions + pynutil.delete(" ") + graph_hundreds_of_millions | trillions + pynutil.insert(NINE_ZEROS) \
+            | trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions \
+            | trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions \
+            | trillions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds_of_thousands \
+            | trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands \
+            | trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands \
+            | trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds \
+            | trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends \
+            | trillions + pynutil.delete(" ") + pynutil.insert(EIGHT_ZEROS) + digits
+
+        prefix_tens_of_trillions = tens_of_trillions + pynutil.delete(" ") + digits_millions_trillions
+        graph_tens_of_trillions = prefix_tens_of_trillions + pynutil.delete(" ") + graph_hundreds_of_millions \
+            | prefix_tens_of_trillions + pynutil.insert(NINE_ZEROS) \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds_of_thousands \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends \
+            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(EIGHT_ZEROS) + digits
+
+        prefix_hundreds_of_trillions = hundreds_of_trillions + pynutil.delete(" ") + graph_tens_for_millions_trillions
+        graph_hundreds_of_trillions = prefix_hundreds_of_trillions + pynutil.delete(" ") + graph_hundreds_of_millions \
+            | prefix_hundreds_of_trillions + pynutil.insert(NINE_ZEROS) \
+            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions \
+            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions \
+            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds_of_thousands \
+            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands \
+            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands \
+            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds \
+            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends
+
+        graph_all = graph_hundreds_of_trillions | graph_tens_of_trillions | graph_trillions | graph_hundreds_of_millions | graph_tens_of_millions \
+            | graph_millions | graph_hundreds_of_thousands | graph_tens_of_thousands \
+            | graph_thousands | graph_hundreds | pynutil.add_weight(ten, weight=-0.1) \
+            | graph_tens_starts | digits | pynini.cross("zeru", "0")
+
+        inverted_graph_all = pynini.compose(pynini.invert(graph_all), rewrite_add_separator_fst)
+        inverted_graph_all = pynini.compose(inverted_graph_all, remove_extra_space_fst)
+        inverted_graph_all = pynini.compose(inverted_graph_all, remove_trailing_space_fst)
+        inverted_graph_all = pynini.compose(inverted_graph_all, remove_underscore_fst) | pynutil.add_weight(remove_comma, 0.0001)
+
+        inverted_graph_all = inverted_graph_all.optimize()
+        final_graph = pynutil.insert("integer: \"") + inverted_graph_all + pynutil.insert("\"")
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph
+
+
diff --git a/nemo_text_processing/text_normalization/rw/taggers/time.py b/nemo_text_processing/text_normalization/rw/taggers/time.py new file mode 100644 index 000000000..6b2a0d531 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/taggers/time.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +import pynini +from pynini.lib import pynutil + + +class TimeFst(GraphFst): + def __init__(self): + super().__init__(name="time", kind="classify") + + hours = pynini.string_map([ + ('1', 'saa saba'), + ('2', 'saa munani'), + ('3', 'saa cyenda'), + ('4', 'saa cumi'), + ('5', "saa cumi n'imwe"), + ('6', "saa cumi n'ebyiri"), + ('7', 'saa moya'), + ('8', 'saa mbiri'), + ('9', 'saa tatu'), + ('10', 'saa ine'), + ('11', 'saa tanu'), + ('12', 'saa sita'), + ]) + + minutes = pynini.string_map([ + ('00', ' '), + ('01', " n'umunota umwe") , + ('02', " n'iminota ibiri") , + ('03', " n'iminota itatu") , + ('04', " n'iminota ine") , + ('05', " n'iminota itanu") , + ('06', " n'iminota itandatu") , + ('07', " n'iminota irindwi") , + ('08', " n'iminota umunani") , + ('09', " n'iminota icyenda") , + ('10', " n'iminota icumi") , + ('11', " n'iminota cumi n'umwe") , + ('12', " n'iminota cumi n'ibiri") , + ('13', " n'iminota cumi n'itatu") , + ('14', " n'iminota cumi n'ine") , + ('15', " n'iminota cumi n'itanu") , + ('16', " n'iminota cumi n'itandatu") , + ('17', " n'iminota cumi n'irindwi") , + ('18', " n'iminota cumi n'umunani") , + ('19', " n'iminota cumi n'icyenda") , + ('20', " n'iminota makumyabiri") , + ('21', " n'iminota makumyabiri na rimwe") , + ('22', " n'iminota makumyabiri n'ibiri") , + ('23', " n'iminota makumyabiri n'itatu") , + ('24', " n'iminota makumyabiri n'ine") , + ('25', " n'iminota makumyabiri n'itanu") , + ('26', " n'iminota makumyabiri n'itandatu") , + ('27', " n'iminota makumyabiri n'irindwi") , + ('28', " n'iminota makumyabiri n'umunani") , + ('29', " n'iminota makumyabiri n'icyenda") , + ('30', " n'iminota mirongo itatu") , + ('31', " n'iminota mirongo itatu n'umwe") , + ('32', " n'iminota mirongo itatu n'ibiri") , + ('33', " n'iminota mirongo itatu n'itatu") , + ('34', " n'iminota mirongo itatu n'ine") , + ('35', " n'iminota mirongo itatu n'itanu") , + ('36', " n'iminota mirongo itatu n'itandatu") , + ('37', " n'iminota mirongo itatu n'irindwi") , + ('38', " n'iminota mirongo itatu n'umunani") , + ('39', " n'iminota mirongo itatu n'icyenda") , + ('40', " n'iminota mirongo ine") , + ('41', " n'iminota mirongo ine n'umwe") , + ('42', " n'iminota mirongo ine n'ibiri") , + ('43', " n'iminota mirongo ine n'itatu") , + ('44', " n'iminota mirongo ine n'ine") , + ('45', " n'iminota mirongo ine n'itanu") , + ('46', " n'iminota mirongo ine n'itandatu") , + ('47', " n'iminota mirongo ine n'irindwi") , + ('48', " n'iminota mirongo ine n'umunani") , + ('49', " n'iminota mirongo ine 
n'icyenda") , + ('50', " n'iminota mirongo itanu") , + ('51', " n'iminota mirongo itanu n'umwe") , + ('52', " n'iminota mirongo itanu n'ibiri") , + ('53', " n'iminota mirongo itanu n'itatu") , + ('54', " n'iminota mirongo itanu n'ine") , + ('55', " n'iminota mirongo itanu n'itanu") , + ('56', " n'iminota mirongo itanu n'itandatu") , + ('57', " n'iminota mirongo itanu n'irindwi") , + ('58', " n'iminota mirongo itanu n'umunani") , + ('59', " n'iminota mirongo itanu n'icyenda") , + ]) + + final_graph = pynutil.insert("hours:\"")+hours+pynutil.insert("\"")+pynutil.delete(":")+pynutil.insert(" minutes:\"")+minutes+pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..3a034af13 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.taggers.time import TimeFst +from nemo_text_processing.text_normalization.rw.taggers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.rw.taggers.cardinal import CardinalFst + + +from nemo_text_processing.text_normalization.en.taggers.word import WordFst +from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst +import pynini +from pynini.lib import pynutil +import os +from nemo_text_processing.text_normalization.en.graph_utils import ( + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) + +class ClassifyFst(GraphFst): + def __init__(self,input_case: str,cache_dir: str = None, overwrite_cache: bool = False,deterministic: bool = True,whitelist: str = None, +): + super().__init__(name='tokenize_and_classify',kind='classify',deterministic=deterministic) + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, "rw_tn_tokenize_and_classify.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + print("FAR file: ",far_file) + self.fst = pynini.Far(far_file, mode="r")["TOKENIZE_AND_CLASSIFY"] + else: + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + time_graph = TimeFst().fst + punctuation = PunctuationFst() + punct_graph = punctuation.fst + + word_graph = WordFst(punctuation=punctuation).fst + + whitelist_graph = WhiteListFst().fst + classify = ( + pynutil.add_weight(time_graph, 1.05) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(word_graph, 1.50) + | pynutil.add_weight(whitelist_graph,1.01) + ) + + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, 
weight=1.1) + pynutil.insert(" }") + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + pynini.closure(punct + pynutil.insert(" ")) + token+ pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = delete_space + graph + delete_space + self.fst = graph.optimize() + if far_file: + generator_main(far_file, {"TOKENIZE_AND_CLASSIFY":self.fst}) diff --git a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py new file mode 100644 index 000000000..0355d9741 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +import pynini +from pynini.lib import pynutil +from nemo_text_processing.text_normalization.rw.utils import get_abs_path + + +transliterations = pynini.string_file(get_abs_path("data/whitelist/kinya_transliterations.tsv")) + +class WhiteListFst(GraphFst): + def __init__(self): + super().__init__(name="whitelist", kind="classify") + + whitelist = transliterations + graph = pynutil.insert("name: \"") + whitelist + pynutil.insert("\"") + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/rw/utils.py b/nemo_text_processing/text_normalization/rw/utils.py new file mode 100644 index 000000000..148d2de51 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/utils.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + + diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py new file mode 100644 index 000000000..26cff59aa --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/time.py b/nemo_text_processing/text_normalization/rw/verbalizers/time.py new file mode 100644 index 000000000..90d1c17e4 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/verbalizers/time.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pynini +from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import ( + GraphFst, + delete_space, + NEMO_CHAR +) + +class VerbalizeTimeFst(GraphFst): + def __init__(self): + super().__init__(name="time",kind="verbalize") + hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")+delete_space \ + +pynutil.delete("minutes:")+delete_space+pynutil.delete("\"") + pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")) + + graph = hour + delete_tokens = self.delete_tokens(graph) + + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py new file mode 100644 index 000000000..94bf7a038 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
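+# VerbalizeFst composes the verbalizers available so far: the English
+# cardinal verbalizer (reused unchanged) and the Kinyarwanda time
+# verbalizer from rw/verbalizers/time.py.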
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.rw.verbalizers.time import VerbalizeTimeFst
+from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst
+
+class VerbalizeFst(GraphFst):
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
+        cardinal = CardinalFst()
+        cardinal_graph = cardinal.fst
+        time = VerbalizeTimeFst().fst
+
+        graph = (
+            cardinal_graph
+            | time
+        )
+        self.fst = graph
+
+
diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py
new file mode 100644
index 000000000..e191fbf32
--- /dev/null
+++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024, DIGITAL UMUGANDA
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pynini
+from pynini.lib import pynutil
+from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+import os
+
+class VerbalizeFinalFst(GraphFst):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True):
+        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, "rw_tn_verbalizer.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["ALL"]
+        else:
+            verbalize = VerbalizeFst().fst
+            word = WordFst().fst
+
+            types = verbalize | word
+            graph = (
+                pynutil.delete("tokens")
+                + delete_space
+                + pynutil.delete("{")
+                + delete_space
+                + types
+                + delete_space
+                + pynutil.delete("}")
+            )
+            graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+
+            self.fst = graph
+
+            if far_file:
+                generator_main(far_file, {"ALL": self.fst, "REDUP": pynini.accep("REDUP")})
diff --git a/tests/nemo_text_processing/rw/__init__.py b/tests/nemo_text_processing/rw/__init__.py
new file mode 100644
index 000000000..9e3250071
--- /dev/null
+++ b/tests/nemo_text_processing/rw/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..baca7cbe4 --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,57 @@ +1~rimwe +2~kabiri +3~gatatu +4~kane +5~gatanu +6~gatandatu +7~karindwi +8~umunani +9~icyenda +10~icumi +002~zeru zeru kabiri +11~cumi na rimwe +12~cumi na kabiri +13~cumi na gatatu +2,3,4,5~kabiri gatatu kane gatanu +14~cumi na kane +15~cumi na gatanu +16~cumi na gatandatu +17~cumi na karindwi +18~cumi n'umunani +19~cumi n'icyenda +20~makumyabiri +89~mirongo inani n'icyenda +123~ijana na makumyabiri na gatatu +730~magana arindwi na mirongo itatu +100~ijana +2004~ibihumbi bibiri na kane +9041~ibihumbi icyenda na mirongo ine na rimwe +5324~ibihumbi bitanu na magana atatu na makumyabiri na kane +8567~ibihumbi umunani na magana atanu na mirongo itandatu na karindwi +10000~ibihumbi icumi +14000~ibihumbi cumi na bine +24404~ibihumbi makumyabiri na bine na magana ane na kane +9000~ibihumbi icyenda +9700~ibihumbi icyenda na magana arindwi +250~magana abiri na mirongo itanu +367~magana atatu na mirongo itandatu na karindwi +90104~ibihumbi mirongo icyenda n'ijana na kane +111001~ibihumbi ijana na cumi na kimwe na rimwe +10999~ibihumbi icumi na magana cyenda na mirongo icyenda n'icyenda +100000~ibihumbi ijana +200000~ibihumbi magana abiri +101000~ibihumbi ijana na kimwe +130000~ibihumbi ijana na mirongo itatu +531000~ibihumbi magana atanu na mirongo itatu na kimwe +2200345~miliyoni ebyiri n'ibihumbi magana abiri na magana atatu na mirongo ine na gatanu +7000000~miliyoni zirindwi +9101100~miliyoni icyenda n'ibihumbi ijana na kimwe n'ijana +19034004~miliyoni cumi n'icyenda n'ibihumbi mirongo itatu na bine na kane +29000000~miliyoni makumyabiri n'icyenda +40000000~miliyoni mirongo ine +400000000~miliyoni magana ane +100000001~miliyoni ijana na rimwe +340000000~miliyoni magana atatu na mirongo ine +783100000~miliyoni magana arindwi na mirongo inani n'eshatu n'ibihumbi ijana +340010010~miliyoni magana atatu na mirongo ine n'ibihumbi icumi n'icumi +9374514510~tiriyoni icyenda na miliyoni magana atatu na mirongo irindwi n'enye n'ibihumbi magana atanu na cumi na bine na magana atanu n'icumi diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..4a4ec27bc --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt @@ -0,0 +1,14 @@ +1:00~saa saba +2:01~saa munani n'umunota umwe +3:30~saa cyenda n'iminota mirongo itatu +4:21~saa cumi n'iminota makumyabiri na rimwe +5:12~saa cumi n'imwe n'iminota cumi n'ibiri +6:49~saa cumi n'ebyiri n'iminota mirongo ine n'icyenda +7:05~saa moya n'iminota itanu +8:23~saa mbiri n'iminota makumyabiri n'itatu +9:47~saa tatu n'iminota mirongo ine n'irindwi +10:56~saa ine n'iminota mirongo itanu n'itandatu +11:00~saa tanu +12:09~saa sita n'iminota icyenda +1:59~saa saba 
n'iminota mirongo itanu n'icyenda +12:31~saa sita n'iminota mirongo itatu n'umwe \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..b9b597932 --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,3 @@ +www~wawawa +maroc~maroki +television~televiziyo \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..02f67151c --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt @@ -0,0 +1,29 @@ +~ + ~ + no~no +x ~x +X!~X! +—~— +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aalem~aalem +a'ali~a'ali +aaliyan's~aaliyan's +mother-in-law~mother-in-law + 1~rimwe +1~rimwe +!1~! rimwe +mar~mar +mar~mar +umwangavu~umwangavu diff --git a/tests/nemo_text_processing/rw/test_cardinal.py b/tests/nemo_text_processing/rw/test_cardinal.py new file mode 100644 index 000000000..4c04cfaa6 --- /dev/null +++ b/tests/nemo_text_processing/rw/test_cardinal.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + + + normalizer_rw = Normalizer( + input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + + @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer_rw.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" + print(pred) + + \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh new file mode 100644 index 000000000..c67b247e6 --- /dev/null +++ b/tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh @@ -0,0 +1,60 @@ +#! /bin/sh +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/rw"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read written spoken <<< $testcase + # replace non breaking space with breaking space + # Use below if postprocessor is not used. 
Comment if it is used + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # Use below if postprocessor is used. Comment if it is not used + #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + + + +testTNCardinal() { + input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + + +testTNTime() { + input=$TEST_DIR/data_text_normalization/test_cases_time.txt + runtest $input +} + + +testTNWhitelist() { + input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testTNWord() { + input=$TEST_DIR/data_text_normalization/test_cases_word.txt + runtest $input +} + + + + + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 diff --git a/tests/nemo_text_processing/rw/test_time.py b/tests/nemo_text_processing/rw/test_time.py new file mode 100644 index 000000000..99b6f62b5 --- /dev/null +++ b/tests/nemo_text_processing/rw/test_time.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + + + normalizer_rw = Normalizer(input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer_rw.normalize(test_input, verbose=False) + assert pred == expected + + \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/test_whitelist.py b/tests/nemo_text_processing/rw/test_whitelist.py new file mode 100644 index 000000000..35a43197c --- /dev/null +++ b/tests/nemo_text_processing/rw/test_whitelist.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + + + normalizer_rw = Normalizer(input_case='cased',lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer_rw.normalize(test_input, verbose=False) + assert pred == expected + diff --git a/tests/nemo_text_processing/rw/test_word.py b/tests/nemo_text_processing/rw/test_word.py new file mode 100644 index 000000000..65f990ca6 --- /dev/null +++ b/tests/nemo_text_processing/rw/test_word.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWord: + + + normalizer_rw = Normalizer(input_case='cased',lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer_rw.normalize(test_input, verbose=False) + assert pred == expected + diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index d6ceb84f2..596723091 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -86,7 +86,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy', 'mr', 'ja'], + choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy', 'mr', 'ja','rw'], type=str, default='en', ) @@ -270,6 +270,13 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'rw': + from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import ( + VerbalizeFst as TNVerbalizeFst, + ) output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, From 44d726c5e0164693f51585a96e96faa25b62be4c Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Mon, 2 Sep 2024 06:23:23 +0200 
Subject: [PATCH 2/7] Cardinals up to a hundred trillions, timeFST and transliteration Signed-off-by: kurt0cougar --- tests/nemo_text_processing/rw/__init__.py | 4 +++- .../rw/data_text_normalization/test_cases_word.txt | 4 ++-- tests/nemo_text_processing/rw/test_cardinal.py | 1 + tests/nemo_text_processing/rw/test_time.py | 1 + tests/nemo_text_processing/rw/test_whitelist.py | 1 + tests/nemo_text_processing/rw/test_word.py | 1 + 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/nemo_text_processing/rw/__init__.py b/tests/nemo_text_processing/rw/__init__.py index 9e3250071..4f53d71f2 100644 --- a/tests/nemo_text_processing/rw/__init__.py +++ b/tests/nemo_text_processing/rw/__init__.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt index 02f67151c..a4c1f2c6a 100644 --- a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt @@ -23,7 +23,7 @@ aaliyan's~aaliyan's mother-in-law~mother-in-law 1~rimwe 1~rimwe -!1~! rimwe +!1~!rimwe mar~mar mar~mar -umwangavu~umwangavu +umwangavu~umwangavu \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/test_cardinal.py b/tests/nemo_text_processing/rw/test_cardinal.py index 4c04cfaa6..d1d290cb4 100644 --- a/tests/nemo_text_processing/rw/test_cardinal.py +++ b/tests/nemo_text_processing/rw/test_cardinal.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/rw/test_time.py b/tests/nemo_text_processing/rw/test_time.py index 99b6f62b5..ff49a3dc8 100644 --- a/tests/nemo_text_processing/rw/test_time.py +++ b/tests/nemo_text_processing/rw/test_time.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/rw/test_whitelist.py b/tests/nemo_text_processing/rw/test_whitelist.py index 35a43197c..b5850ab6a 100644 --- a/tests/nemo_text_processing/rw/test_whitelist.py +++ b/tests/nemo_text_processing/rw/test_whitelist.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/nemo_text_processing/rw/test_word.py b/tests/nemo_text_processing/rw/test_word.py index 65f990ca6..06fff29b1 100644 --- a/tests/nemo_text_processing/rw/test_word.py +++ b/tests/nemo_text_processing/rw/test_word.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 81d0e8362ef04242973ccc4ee19017eaf384de98 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Mon, 2 Sep 2024 06:28:51 +0200 Subject: [PATCH 3/7] Cardinals up to a hundred trillions, timeFST and transliteration (moving constants to data files). Signed-off-by: kurt0cougar --- .../text_normalization/normalize.py | 3 + .../text_normalization/rw/__init__.py | 1 + .../text_normalization/rw/data/__init__.py | 1 + .../rw/data/cardinal/__init__.py | 15 + .../rw/data/cardinal/digits.tsv | 9 + .../rw/data/cardinal/digits_for_thousands.tsv | 10 + .../cardinal/digits_millions_trillions.tsv | 10 + .../rw/data/cardinal/hundreds.tsv | 9 + .../rw/data/cardinal/hundreds_of_millions.tsv | 9 + .../data/cardinal/hundreds_of_thousands.tsv | 9 + .../data/cardinal/hundreds_of_trillions.tsv | 9 + .../rw/data/cardinal/millions.tsv | 9 + .../rw/data/cardinal/tens.tsv | 9 + .../rw/data/cardinal/tens_of_millions.tsv | 9 + .../rw/data/cardinal/tens_of_thousands.tsv | 9 + .../rw/data/cardinal/tens_of_trillions.tsv | 9 + .../rw/data/cardinal/thousands.tsv | 10 + .../rw/data/cardinal/trillions.tsv | 9 + .../rw/data/time/__init__.py | 15 + .../text_normalization/rw/data/time/hours.tsv | 12 + .../rw/data/time/minutes.tsv | 60 ++++ .../rw/data/whitelist/__init__.py | 1 + .../text_normalization/rw/graph_utils.py | 311 ++++++++++++++++++ .../text_normalization/rw/taggers/__init__.py | 1 + .../text_normalization/rw/taggers/cardinal.py | 196 ++--------- .../text_normalization/rw/taggers/time.py | 82 +---- .../rw/taggers/tokenize_and_classify.py | 5 +- .../rw/taggers/whitelist.py | 3 +- .../rw/verbalizers/__init__.py | 1 + .../text_normalization/rw/verbalizers/time.py | 7 +- .../rw/verbalizers/verbalize.py | 3 +- .../rw/verbalizers/verbalize_final.py | 30 +- .../pynini_export.py | 7 +- 33 files changed, 611 insertions(+), 272 deletions(-) create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv create mode 100644 
nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/time/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/time/hours.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/time/minutes.tsv create mode 100644 nemo_text_processing/text_normalization/rw/graph_utils.py diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 14093dadf..c6d19f82f 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -165,6 +165,9 @@ def __init__( elif lang == 'hy': from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.hy.verbalizers.verbalize_final import VerbalizeFinalFst + elif lang == 'rw': + from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.rw.verbalizers.verbalize_final import VerbalizeFinalFst else: raise NotImplementedError(f"Language {lang} has not been supported yet.") diff --git a/nemo_text_processing/text_normalization/rw/__init__.py b/nemo_text_processing/text_normalization/rw/__init__.py index b136ce06b..c921ca1b8 100644 --- a/nemo_text_processing/text_normalization/rw/__init__.py +++ b/nemo_text_processing/text_normalization/rw/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/rw/data/__init__.py b/nemo_text_processing/text_normalization/rw/data/__init__.py index 9fb50331b..9c4313114 100644 --- a/nemo_text_processing/text_normalization/rw/data/__init__.py +++ b/nemo_text_processing/text_normalization/rw/data/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py b/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv new file mode 100644 index 000000000..bf85b743b --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv @@ -0,0 +1,9 @@ +rimwe 1 +kabiri 2 +gatatu 3 +kane 4 +gatanu 5 +gatandatu 6 +karindwi 7 +umunani 8 +icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv new file mode 100644 index 000000000..ee31aadee --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv @@ -0,0 +1,10 @@ + 0 +kimwe 1 +bibiri 2 +bitatu 3 +bine 4 +bitanu 5 +bitandatu 6 +birindwi 7 +umunani 8 +icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv new file mode 100644 index 000000000..126ad90a3 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv @@ -0,0 +1,10 @@ + 0 +imwe 1 +ebyiri 2 +eshatu 3 +enye 4 +eshanu 5 +esheshatu 6 +zirindwi 7 +umunani 8 +icyenda 9 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv new file mode 100644 index 000000000..a46623cc1 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv @@ -0,0 +1,9 @@ +ijana 1 +magana_abiri 2 +magana_atatu 3 +magana_ane 4 +magana_atanu 5 +magana_atandatu 6 +magana_arindwi 7 +magana_inani 8 +magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv new file mode 100644 index 000000000..6e38c3ceb --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv @@ -0,0 +1,9 @@ +miliyoni_ijana 1 +miliyoni_magana_abiri 2 +miliyoni_magana_atatu 3 +miliyoni_magana_ane 4 +miliyoni_magana_atanu 5 +miliyoni_magana_atandatu 6 +miliyoni_magana_arindwi 7 +miliyoni_magana_inani 8 +miliyoni_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv new file mode 100644 index 000000000..a73477c14 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv @@ -0,0 +1,9 @@ +ibihumbi_ijana 1 +ibihumbi_magana_abiri 2 +ibihumbi_magana_atatu 3 +ibihumbi_magana_ane 4 +ibihumbi_magana_atanu 5 +ibihumbi_magana_atandatu 6 +ibihumbi_magana_arindwi 7 +ibihumbi_magana_inani 8 +ibihumbi_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv new file mode 100644 index 000000000..00fc01aa4 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni_ijana 1 +tiriyoni_magana_abiri 2 +tiriyoni_magana_atatu 3 +tiriyoni_magana_ane 4 +tiriyoni_magana_atanu 5 +tiriyoni_magana_atandatu 6 +tiriyoni_magana_arindwi 7 
+tiriyoni_magana_inani 8 +tiriyoni_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv new file mode 100644 index 000000000..fded5ed55 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv @@ -0,0 +1,9 @@ +miliyoni 1 +miliyoni_ebyiri 2 +miliyoni_eshatu 3 +miliyoni_enye 4 +miliyoni_eshanu 5 +miliyoni_esheshatu 6 +miliyoni_zirindwi 7 +miliyoni_umunani 8 +miliyoni_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv new file mode 100644 index 000000000..6e63c3875 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv @@ -0,0 +1,9 @@ + 0 +makumyabiri 2 +mirongo_itatu 3 +mirongo_ine 4 +mirongo_itanu 5 +mirongo_itandatu 6 +mirongo_irindwi 7 +mirongo_inani 8 +mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv new file mode 100644 index 000000000..36f077d00 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv @@ -0,0 +1,9 @@ +miliyoni_cumi 1 +miliyoni_makumyabiri 2 +miliyoni_mirongo_itatu 3 +miliyoni_mirongo_ine 4 +miliyoni_mirongo_itanu 5 +miliyoni_mirongo_itandatu 6 +miliyoni_mirongo_irindwi 7 +miliyoni_mirongo_inani 8 +miliyoni_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv new file mode 100644 index 000000000..f230751bf --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv @@ -0,0 +1,9 @@ +ibihumbi_cumi 1 +ibihumbi_makumyabiri 2 +ibihumbi_mirongo_itatu 3 +ibihumbi_mirongo_ine 4 +ibihumbi_mirongo_itanu 5 +ibihumbi_mirongo_itandatu 6 +ibihumbi_mirongo_irindwi 7 +ibihumbi_mirongo_inani 8 +ibihumbi_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv new file mode 100644 index 000000000..3cf483594 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni_icumi 1 +tiriyoni_makumyabiri 2 +tiriyoni_mirongo_itatu 3 +tiriyoni_mirongo_ine 4 +tiriyoni_mirongo_itanu 5 +tiriyoni_mirongo_itandatu 6 +tiriyoni_mirongo_irindwi 7 +tiriyoni_mirongo_inani 8 +tiriyoni_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv new file mode 100644 index 000000000..39d262443 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv @@ -0,0 +1,10 @@ +igihumbi 1 +ibihumbi_bibiri 2 +ibihumbi_bitatu 3 +ibihumbi_bine 4 +ibihumbi_bitanu 5 +ibihumbi_bitandatu 6 +ibihumbi_birindwi 7 +ibihumbi_umunani 8 +ibihumbi_icyenda 9 + diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv new file mode 100644 index 000000000..8098158df --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni 1 +tiriyoni_ebyiri 2 +tiriyoni_eshatu 3 +tiriyoni_enye 4 +tiriyoni_eshanu 5
+tiriyoni_esheshatu 6 +tiriyoni_zirindwi 7 +tiriyoni_umunani 8 +tiriyoni_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/time/__init__.py b/nemo_text_processing/text_normalization/rw/data/time/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/data/time/hours.tsv b/nemo_text_processing/text_normalization/rw/data/time/hours.tsv new file mode 100644 index 000000000..fae6f0898 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/hours.tsv @@ -0,0 +1,12 @@ +1 saa saba +2 saa munani +3 saa cyenda +4 saa cumi +5 saa cumi n'imwe +6 saa cumi n'ebyiri +7 saa moya +8 saa mbiri +9 saa tatu +10 saa ine +11 saa tanu +12 saa sita \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv b/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv new file mode 100644 index 000000000..c30327106 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv @@ -0,0 +1,60 @@ +00 +01 n'umunota umwe +02 n'iminota ibiri +03 n'iminota itatu +04 n'iminota ine +05 n'iminota itanu +06 n'iminota itandatu +07 n'iminota irindwi +08 n'iminota umunani +09 n'iminota icyenda +10 n'iminota icumi +11 n'iminota cumi n'umwe +12 n'iminota cumi n'ibiri +13 n'iminota cumi n'itatu +14 n'iminota cumi n'ine +15 n'iminota cumi n'itanu +16 n'iminota cumi n'itandatu +17 n'iminota cumi n'irindwi +18 n'iminota cumi n'umunani +19 n'iminota cumi n'icyenda +20 n'iminota makumyabiri +21 n'iminota makumyabiri na rimwe +22 n'iminota makumyabiri n'ibiri +23 n'iminota makumyabiri n'itatu +24 n'iminota makumyabiri n'ine +25 n'iminota makumyabiri n'itanu +26 n'iminota makumyabiri n'itandatu +27 n'iminota makumyabiri n'irindwi +28 n'iminota makumyabiri n'umunani +29 n'iminota makumyabiri n'icyenda +30 n'iminota mirongo itatu +31 n'iminota mirongo itatu n'umwe +32 n'iminota mirongo itatu n'ibiri +33 n'iminota mirongo itatu n'itatu +34 n'iminota mirongo itatu n'ine +35 n'iminota mirongo itatu n'itanu +36 n'iminota mirongo itatu n'itandatu +37 n'iminota mirongo itatu n'irindwi +38 n'iminota mirongo itatu n'umunani +39 n'iminota mirongo itatu n'icyenda +40 n'iminota mirongo ine +41 n'iminota mirongo ine n'umwe +42 n'iminota mirongo ine n'ibiri +43 n'iminota mirongo ine n'itatu +44 n'iminota mirongo ine n'ine +45 n'iminota mirongo ine n'itanu +46 n'iminota mirongo ine n'itandatu +47 n'iminota mirongo ine n'irindwi +48 n'iminota mirongo ine n'umunani +49 n'iminota mirongo ine n'icyenda +50 n'iminota mirongo itanu +51 n'iminota mirongo itanu n'umwe +52 n'iminota mirongo itanu n'ibiri +53 n'iminota mirongo itanu n'itatu +54 n'iminota mirongo itanu n'ine +55 n'iminota 
mirongo itanu n'itanu +56 n'iminota mirongo itanu n'itandatu +57 n'iminota mirongo itanu n'irindwi +58 n'iminota mirongo itanu n'umunani +59 n'iminota mirongo itanu n'icyenda \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py index 9fb50331b..9c4313114 100644 --- a/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py +++ b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py new file mode 100644 index 000000000..3744580d5 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -0,0 +1,311 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
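+#
+# This module is a Kinyarwanda-local copy of the shared English graph_utils.
+# The rw-specific additions are NEMO_VOWELS and NEMO_CONSONANTS, which
+# taggers/cardinal.py uses to choose the connector between number words:
+# " na " before a consonant-initial word, " n'" before a vowel-initial word.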
+ +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels +from nemo_text_processing.utils.logging import logger + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_VOWELS = pynini.union(*"aeiouAEIOU").optimize() +NEMO_CONSONANTS = pynini.union(*"BCDFGHJKLMNPQRSTVWXYZbcdfghjklmnpqrstvwxyz").optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) +NEMO_LOWER_NOT_A = pynini.union( + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", +).optimize() + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_space_or_punct = NEMO_PUNCT | delete_space +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"')) +) + + +# Common string literals; expand as you see fit. +username_string = "username" +double_quotes = '"' +domain_string = "domain" +protocol_string = "protocol" +slash = "/" +double_slash = "//" +triple_slash = "///" +file = "file" +period = "." 
+at = "@" +colon = ":" +https = "https" +http = "http" +www = "www" + + +suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) +# _v = pynini.union("a", "e", "i", "o", "u") +_c = pynini.union( + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", +) +_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") +_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") +_s = NEMO_SIGMA + pynutil.insert("s") + +graph_plural = plurals._priority_union( + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, +).optimize() + +SINGULAR_TO_PLURAL = graph_plural +PLURAL_TO_SINGULAR = pynini.invert(graph_plural) +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("minus", "Minus").optimize() + + +def capitalized_input_graph( + graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, +) -> "pynini.FstLike": + """ + Allow graph input to be capitalized, e.g. for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logger.info(f"Created {file_name}") + + +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + +def convert_space(fst) -> "pynini.FstLike": + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. 
+ + Args: + fst: input fst + + Returns an output fst where breaking spaces are converted to non-breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. "BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logger.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True, will provide a single transduction option; + if False, multiple transductions are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if the FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps the class name around the given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes the class name wrapped around the output of the given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/rw/taggers/__init__.py b/nemo_text_processing/text_normalization/rw/taggers/__init__.py index 90380542f..96d45783e 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/rw/taggers/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
diff --git a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py index 68abc5fbd..c80097a8e 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,189 +16,40 @@ import pynini from pynini.lib import pynutil -import string -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst,NEMO_CHAR,insert_space +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst,NEMO_CHAR,insert_space,NEMO_DIGIT,NEMO_ALPHA,NEMO_CONSONANTS,NEMO_VOWELS,delete_extra_space,delete_space from nemo_text_processing.text_normalization.rw.utils import get_abs_path -def apply_fst(text, fst): - try: - print(pynini.shortestpath(text @ fst).string()) - print(len(pynini.shortestpath(text @ fst).string())) - - except pynini.FstOpError: - print(f"Error: no valid output with given'input: '{text}'") class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - alphabet = string.ascii_letters - rewrite_na_fst = pynini.cdrewrite(pynini.cross(" "," na "),pynini.union(*"aeiouAEIOU "),pynini.union(*"BCDFGHJKLMNPQRSTVWXYZbcdfghjklmnpqrstvwxyz"),NEMO_CHAR.closure()) - rewrite_n_fst = pynini.cdrewrite(pynini.cross(" "," n'"),pynini.union(*"aeiouAEIOU "),pynini.union(*"aeiouAEIOU"),NEMO_CHAR.closure()) - remove_underscore_fst = pynini.cdrewrite(pynini.cross("_"," "),pynini.union(*alphabet),pynini.union(*alphabet),NEMO_CHAR.closure()) - remove_extra_space_fst = pynini.cdrewrite(pynini.cross(" "," "),pynini.union(*alphabet),pynini.union(*alphabet),NEMO_CHAR.closure()) - remove_trailing_space_fst = pynini.cdrewrite(pynini.cross(pynini.accep(' ').closure(),''),pynini.union(*alphabet).closure(),'[EOS]',NEMO_CHAR.closure()) + vowels_or_space = NEMO_VOWELS | " " + rewrite_na_fst = pynini.cdrewrite(pynini.cross(" "," na "),vowels_or_space,NEMO_CONSONANTS,NEMO_CHAR.closure()) + rewrite_n_fst = pynini.cdrewrite(pynini.cross(" "," n'"),vowels_or_space,NEMO_VOWELS,NEMO_CHAR.closure()) + remove_underscore_fst = pynini.cdrewrite(pynini.cross("_"," "),pynini.union(NEMO_ALPHA),pynini.union(NEMO_ALPHA),NEMO_CHAR.closure()) + remove_extra_space_fst = pynini.cdrewrite(delete_extra_space,pynini.union(NEMO_ALPHA),pynini.union(NEMO_ALPHA),NEMO_CHAR.closure()) + remove_trailing_space_fst = pynini.cdrewrite(delete_space,pynini.union(NEMO_ALPHA).closure(),'[EOS]',NEMO_CHAR.closure()) rewrite_add_separator_fst = pynini.compose(rewrite_na_fst,rewrite_n_fst) ten_thousand = pynini.string_map([("ibihumbi_icumi","10")]) ten = pynini.string_map([("icumi","10")]) - digits = pynini.string_map([ - ("rimwe","1"), - ("kabiri","2"), - ("gatatu","3"), - ("kane","4"), - ("gatanu","5"), - ("gatandatu","6"), - ("karindwi","7"), - ("umunani","8"), - ("icyenda","9"), - ]) - digits_for_thousands = pynini.string_map([ - ("","0"), - ("kimwe","1"), - ("bibiri","2"), - ("bitatu","3"), - ("bine","4"), - ("bitanu","5"), - ("bitandatu","6"), - ("birindwi","7"), - ("umunani","8"), - ("icyenda","9") - ]) - digits_millions_trillions= pynini.string_map([ - ("","0"), - ("imwe","1"), - ("ebyiri","2"), - ("eshatu","3"), - ("enye","4"), - 
("eshanu","5"), - ("esheshatu","6"), - ("zirindwi","7"), - ("umunani","8"), - ("icyenda","9") - ]) - tens = pynini.string_map([ - (" ","0"), - ("makumyabiri","2"), - ("mirongo_itatu","3"), - ("mirongo_ine","4"), - ("mirongo_itanu","5"), - ("mirongo_itandatu","6"), - ("mirongo_irindwi","7"), - ("mirongo_inani","8"), - ("mirongo_icyenda","9") - ]) + digits = pynini.string_file(get_abs_path("data/cardinal/digits.tsv")) + digits_for_thousands = pynini.string_file(get_abs_path("data/cardinal/digits_for_thousands.tsv")) + digits_millions_trillions= pynini.string_file(get_abs_path("data/cardinal/digits_millions_trillions.tsv")) + tens = pynini.string_file(get_abs_path("data/cardinal/tens.tsv")) tens_for_ends = pynini.string_map([("icumi","1")])|tens tens_for_beginnings= pynini.string_map([("cumi","1")])|tens - hundreds = pynini.string_map([ - ("ijana","1"), - ("magana_abiri","2"), - ("magana_atatu","3"), - ("magana_ane","4"), - ("magana_atanu","5"), - ("magana_atandatu","6"), - ("magana_arindwi","7"), - ("magana_inani","8"), - ("magana_cyenda","9") - ]) - thousands = pynini.string_map([ - ("igihumbi","1"), - ("ibihumbi_bibiri","2"), - ("ibihumbi_bitatu","3"), - ("ibihumbi_bine","4"), - ("ibihumbi_bitanu","5"), - ("ibihumbi_bitandatu","6"), - ("ibihumbi_birindwi","7"), - ("ibihumbi_umunani","8"), - ("ibihumbi_icyenda","9") - ]) - tens_of_thousands = pynini.string_map([ - ("ibihumbi_cumi","1"), - ("ibihumbi_makumyabiri","2"), - ("ibihumbi_mirongo_itatu","3"), - ("ibihumbi_mirongo_ine","4"), - ("ibihumbi_mirongo_itanu","5"), - ("ibihumbi_mirongo_itandatatu","6"), - ("ibihumbi_mirongo_irindwi","7"), - ("ibihumbi_mirongo_inani","8"), - ("ibihumbi_mirongo_icyenda","9") - ]) - hundreds_of_thousands = pynini.string_map([ - ("ibihumbi_ijana","1"), - ("ibihumbi_magana_abiri","2"), - ("ibihumbi_magana_atatu","3"), - ("ibihumbi_magana_ane","4"), - ("ibihumbi_magana_atanu","5"), - ("ibihumbi_magana_atandatu","6"), - ("ibihumbi_magana_arindwi","7"), - ("ibihumbi_magana_inani","8"), - ("ibihumbi_magana_cyenda","9") - ]) - millions = pynini.string_map([ - ("miliyoni","1"), - ("miliyoni_ebyiri","2"), - ("miliyoni_eshatu","3"), - ("miliyoni_enye","4"), - ("miliyoni_eshanu","5"), - ("miliyoni_esheshatu","6"), - ("miliyoni_zirindwi","7"), - ("miliyoni_umunani","8"), - ("miliyoni_icyenda","9") - ]) - tens_of_millions = pynini.string_map([ - ("miliyoni_cumi","1"), - ("miliyoni_makumyabiri","2"), - ("miliyoni_mirongo_itatu","3"), - ("miliyoni_mirongo_ine","4"), - ("miliyoni_mirongo_itanu","5"), - ("miliyoni_mirongo_itandatatu","6"), - ("miliyoni_mirongo_irindwi","7"), - ("miliyoni_mirongo_inani","8"), - ("miliyoni_mirongo_icyenda","9") - ]) - hundreds_of_millions = pynini.string_map([ - ("miliyoni_ijana","1"), - ("miliyoni_magana_abiri","2"), - ("miliyoni_magana_atatu","3"), - ("miliyoni_magana_ane","4"), - ("miliyoni_magana_atanu","5"), - ("miliyoni_magana_atandatu","6"), - ("miliyoni_magana_arindwi","7"), - ("miliyoni_magana_inani","8"), - ("miliyoni_magana_cyenda","9") - ]) - trillions = pynini.string_map([ - ("tiriyoni","1"), - ("tiriyoni_ebyiri","2"), - ("tiriyoni_eshatu","3"), - ("tiriyoni_enye","4"), - ("tiriyoni_eshanu","5"), - ("tiriyoni_esheshatu","6"), - ("tiriyoni_zirindwi","7"), - ("tiriyoni_umunani","8"), - ("tiriyoni_icyenda","9") - ]) - tens_of_trillions = pynini.string_map([ - ("tiriyoni_icumi","1"), - ("tiriyoni_makumyabiri","2"), - ("tiriyoni_mirongo_itatu","3"), - ("tiriyoni_mirongo_ine","4"), - ("tiriyoni_mirongo_itanu","5"), - ("tiriyoni_mirongo_itandatatu","6"), - 
("tiriyoni_mirongo_irindwi","7"), - ("tiriyoni_mirongo_inani","8"), - ("tiriyoni_mirongo_icyenda","9") - ]) - hundreds_of_trillions = pynini.string_map([ - ("tiriyoni_ijana","1"), - ("tiriyoni_magana_abiri","2"), - ("tiriyoni_magana_atatu","3"), - ("tiriyoni_magana_ane","4"), - ("tiriyoni_magana_atanu","5"), - ("tiriyoni_magana_atandatu","6"), - ("tiriyoni_magana_arindwi","7"), - ("tiriyoni_magana_inani","8"), - ("tiriyoni_magana_cyenda","9") - ]) + hundreds = pynini.string_file(get_abs_path("data/cardinal/hundreds.tsv")) + thousands = pynini.string_file(get_abs_path("data/cardinal/thousands.tsv")) + tens_of_thousands = pynini.string_file(get_abs_path("data/cardinal/tens_of_thousands.tsv")) + hundreds_of_thousands = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_thousands.tsv")) + millions = pynini.string_file(get_abs_path("data/cardinal/millions.tsv")) + tens_of_millions = pynini.string_file(get_abs_path("data/cardinal/tens_of_millions.tsv")) + hundreds_of_millions = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_millions.tsv")) + trillions = pynini.string_file(get_abs_path("data/cardinal/trillions.tsv")) + tens_of_trillions = pynini.string_file(get_abs_path("data/cardinal/tens_of_trillions.tsv")) + hundreds_of_trillions = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_trillions.tsv")) + THREE_ZEROS = "000" FOUR_ZEROS = "0000" FIVE_ZEROS = "00000" @@ -208,7 +60,7 @@ def __init__(self): NINE_ZEROS = "000000000" zero = pynini.string_map([("zeru","0")]) - rewrite_remove_comma_fst = pynini.cdrewrite(pynini.cross(",",""),pynini.union(*"0123456789"),pynini.union(*"0123456789"),NEMO_CHAR.closure()) + rewrite_remove_comma_fst = pynini.cdrewrite(pynini.cross(",",""),pynini.union(NEMO_DIGIT),pynini.union(NEMO_DIGIT),NEMO_CHAR.closure()) single_digits_graph = pynini.invert(digits | zero) single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph) remove_comma = rewrite_remove_comma_fst@single_digits_graph diff --git a/nemo_text_processing/text_normalization/rw/taggers/time.py b/nemo_text_processing/text_normalization/rw/taggers/time.py index 6b2a0d531..a07ae059e 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/time.py +++ b/nemo_text_processing/text_normalization/rw/taggers/time.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,92 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst import pynini from pynini.lib import pynutil +from nemo_text_processing.text_normalization.rw.utils import get_abs_path class TimeFst(GraphFst): def __init__(self): super().__init__(name="time", kind="classify") - hours = pynini.string_map([ - ('1', 'saa saba'), - ('2', 'saa munani'), - ('3', 'saa cyenda'), - ('4', 'saa cumi'), - ('5', "saa cumi n'imwe"), - ('6', "saa cumi n'ebyiri"), - ('7', 'saa moya'), - ('8', 'saa mbiri'), - ('9', 'saa tatu'), - ('10', 'saa ine'), - ('11', 'saa tanu'), - ('12', 'saa sita'), - ]) + hours = pynini.string_file(get_abs_path("data/time/hours.tsv")) - minutes = pynini.string_map([ - ('00', ' '), - ('01', " n'umunota umwe") , - ('02', " n'iminota ibiri") , - ('03', " n'iminota itatu") , - ('04', " n'iminota ine") , - ('05', " n'iminota itanu") , - ('06', " n'iminota itandatu") , - ('07', " n'iminota irindwi") , - ('08', " n'iminota umunani") , - ('09', " n'iminota icyenda") , - ('10', " n'iminota icumi") , - ('11', " n'iminota cumi n'umwe") , - ('12', " n'iminota cumi n'ibiri") , - ('13', " n'iminota cumi n'itatu") , - ('14', " n'iminota cumi n'ine") , - ('15', " n'iminota cumi n'itanu") , - ('16', " n'iminota cumi n'itandatu") , - ('17', " n'iminota cumi n'irindwi") , - ('18', " n'iminota cumi n'umunani") , - ('19', " n'iminota cumi n'icyenda") , - ('20', " n'iminota makumyabiri") , - ('21', " n'iminota makumyabiri na rimwe") , - ('22', " n'iminota makumyabiri n'ibiri") , - ('23', " n'iminota makumyabiri n'itatu") , - ('24', " n'iminota makumyabiri n'ine") , - ('25', " n'iminota makumyabiri n'itanu") , - ('26', " n'iminota makumyabiri n'itandatu") , - ('27', " n'iminota makumyabiri n'irindwi") , - ('28', " n'iminota makumyabiri n'umunani") , - ('29', " n'iminota makumyabiri n'icyenda") , - ('30', " n'iminota mirongo itatu") , - ('31', " n'iminota mirongo itatu n'umwe") , - ('32', " n'iminota mirongo itatu n'ibiri") , - ('33', " n'iminota mirongo itatu n'itatu") , - ('34', " n'iminota mirongo itatu n'ine") , - ('35', " n'iminota mirongo itatu n'itanu") , - ('36', " n'iminota mirongo itatu n'itandatu") , - ('37', " n'iminota mirongo itatu n'irindwi") , - ('38', " n'iminota mirongo itatu n'umunani") , - ('39', " n'iminota mirongo itatu n'icyenda") , - ('40', " n'iminota mirongo ine") , - ('41', " n'iminota mirongo ine n'umwe") , - ('42', " n'iminota mirongo ine n'ibiri") , - ('43', " n'iminota mirongo ine n'itatu") , - ('44', " n'iminota mirongo ine n'ine") , - ('45', " n'iminota mirongo ine n'itanu") , - ('46', " n'iminota mirongo ine n'itandatu") , - ('47', " n'iminota mirongo ine n'irindwi") , - ('48', " n'iminota mirongo ine n'umunani") , - ('49', " n'iminota mirongo ine n'icyenda") , - ('50', " n'iminota mirongo itanu") , - ('51', " n'iminota mirongo itanu n'umwe") , - ('52', " n'iminota mirongo itanu n'ibiri") , - ('53', " n'iminota mirongo itanu n'itatu") , - ('54', " n'iminota mirongo itanu n'ine") , - ('55', " n'iminota mirongo itanu n'itanu") , - ('56', " n'iminota mirongo itanu n'itandatu") , - ('57', " n'iminota mirongo itanu n'irindwi") , - ('58', " n'iminota mirongo itanu n'umunani") , - ('59', " n'iminota mirongo itanu n'icyenda") , - ]) + minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv")) final_graph = pynutil.insert("hours:\"")+hours+pynutil.insert("\"")+pynutil.delete(":")+pynutil.insert(" minutes:\"")+minutes+pynutil.insert("\"") final_graph = self.add_tokens(final_graph) 
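A minimal sketch for exercising the new time grammar end to end (the expected reading below is pieced together from the hours.tsv and minutes.tsv entries above as an illustration; it is not quoted from the test suite):

    from nemo_text_processing.text_normalization.normalize import Normalizer

    # lang='rw' is wired into Normalizer by the normalize.py hunk above; the
    # first run builds the grammars from scratch, so it can take a while.
    normalizer = Normalizer(input_case='cased', lang='rw')

    # hours.tsv maps "7" to "saa moya" and minutes.tsv maps "30" to
    # "n'iminota mirongo itatu", so "7:30" should verbalize as
    # "saa moya n'iminota mirongo itatu".
    print(normalizer.normalize("7:30", verbose=False))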
diff --git a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py index 3a034af13..e17841e10 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst from nemo_text_processing.text_normalization.rw.taggers.time import TimeFst from nemo_text_processing.text_normalization.rw.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.rw.taggers.cardinal import CardinalFst @@ -23,7 +24,7 @@ import pynini from pynini.lib import pynutil import os -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.rw.graph_utils import ( GraphFst, delete_extra_space, delete_space, diff --git a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py index 0355d9741..288a1edda 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst import pynini from pynini.lib import pynutil from nemo_text_processing.text_normalization.rw.utils import get_abs_path diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py index 26cff59aa..2931cfd9b 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/time.py b/nemo_text_processing/text_normalization/rw/verbalizers/time.py index 90d1c17e4..99bcd7808 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/time.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +15,7 @@ # limitations under the License. import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.rw.graph_utils import ( GraphFst, delete_space, NEMO_CHAR @@ -23,8 +24,8 @@ class VerbalizeTimeFst(GraphFst): def __init__(self): super().__init__(name="time",kind="verbalize") - hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")+delete_space \ - +pynutil.delete("minutes:")+delete_space+pynutil.delete("\"") + pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")) + hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR)+pynutil.delete("\"")+delete_space \ + +pynutil.delete("minutes:")+delete_space+pynutil.delete("\"") + pynini.closure(NEMO_CHAR)+pynutil.delete("\"")) graph = hour delete_tokens = self.delete_tokens(graph) diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py index 94bf7a038..9d3e69cd9 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst from nemo_text_processing.text_normalization.rw.verbalizers.time import VerbalizeTimeFst from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py index e191fbf32..953bffdfe 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,11 +17,13 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.rw.graph_utils import ( GraphFst, delete_extra_space, + delete_space_or_punct, delete_space, + NEMO_PUNCT, generator_main, ) import os @@ -34,20 +37,20 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False,determin if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["verbalize"] else: - verbalize = VerbalizeFst().fst - word = WordFst().fst - + verbalize = VerbalizeFst(deterministic=deterministic).fst + word = WordFst(deterministic=deterministic).fst types = verbalize | word graph = ( - pynutil.delete("tokens") - + delete_space - + pynutil.delete("{") - + delete_space - + types - + delete_space - + pynutil.delete("}") - ) - graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_space) + graph + delete_space + self.fst = graph diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 596723091..0cbd53349 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -1,4 +1,5 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA # Copyright 2015 and onwards Google, Inc.
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +21,7 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import generator_main +from nemo_text_processing.text_normalization.rw.graph_utils import generator_main # This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files # tokenize_and_classify.far and verbalize.far for production purposes @@ -270,7 +271,7 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst - elif args.language == 'rw': + elif args.language == 'rw': from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) From 32b9ddf7093b5b1bc9764bf2b6c1dff604c0ab16 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Thu, 12 Sep 2024 04:52:38 +0200 Subject: [PATCH 4/7] Update test_cases_word.txt Signed-off-by: kurt0cougar --- .../rw/data_text_normalization/test_cases_word.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt index a4c1f2c6a..1c97057aa 100644 --- a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt @@ -21,9 +21,6 @@ aalem~aalem a'ali~a'ali aaliyan's~aaliyan's mother-in-law~mother-in-law - 1~rimwe 1~rimwe -!1~!rimwe mar~mar -mar~mar -umwangavu~umwangavu \ No newline at end of file +umwangavu~umwangavu From 2b8c220b2f0dd988e5636ec388fd875e7d0b78e4 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Thu, 12 Sep 2024 04:54:41 +0200 Subject: [PATCH 5/7] Update graph_utils.py Signed-off-by: kurt0cougar --- .../text_normalization/rw/graph_utils.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py index 3744580d5..46ab24f7c 100644 --- a/nemo_text_processing/text_normalization/rw/graph_utils.py +++ b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -204,43 +204,6 @@ def convert_space(fst) -> "pynini.FstLike": return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) -def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): - labels = load_labels(input_file) - - if input_case == INPUT_CASED: - additional_labels = [] - for written, spoken, *weight in labels: - written_capitalized = written[0].upper() + written[1:] - additional_labels.extend( - [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized - [ - written_capitalized, - spoken.upper().replace(" AND ", " and "), - ], # # add pairs with the all letters capitalized - ] - ) - - spoken_no_space = spoken.replace(" ", "") - # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" - if len(spoken) == (2 * len(spoken_no_space) - 1): - logger.debug(f"This is weight {weight}") - if len(weight) == 0: - additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] - ) - else: - additional_labels.extend( - [ - [written, spoken_no_space, weight[0]], - [written_capitalized, spoken_no_space.upper(), weight[0]], - ] - ) - labels += additional_labels - - whitelist = pynini.string_map(labels).invert().optimize() - return whitelist - class GraphFst: """ From 3d46312abaedc479f7af8e04a525a299f390f8a9 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Fri, 13 Sep 2024 00:59:42 +0200 Subject: [PATCH 6/7] Cardinals up to a hundred trillions, timeFST and transliteration - reformatteda Signed-off-by: kurt0cougar --- .../fst_alignment/alignment.py | 2 +- nemo_text_processing/hybrid/mlm_scorer.py | 2 +- nemo_text_processing/hybrid/model_utils.py | 2 +- nemo_text_processing/hybrid/utils.py | 23 +- .../hybrid/wfst_lm_rescoring.py | 2 +- .../ar/taggers/cardinal.py | 8 +- .../ar/taggers/decimal.py | 4 +- .../ar/taggers/fraction.py | 2 +- .../inverse_text_normalization/ar/utils.py | 4 +- .../ar/verbalizers/verbalize_final.py | 2 +- .../de/taggers/cardinal.py | 4 +- .../de/taggers/electronic.py | 2 +- .../de/taggers/fraction.py | 2 +- .../de/taggers/telephone.py | 4 +- .../de/taggers/time.py | 2 +- .../de/verbalizers/time.py | 2 +- .../de/verbalizers/verbalize_final.py | 2 +- .../en/clean_eval_data.py | 2 +- .../en/taggers/cardinal.py | 7 +- .../en/taggers/date.py | 8 +- .../en/taggers/decimal.py | 6 +- .../en/taggers/electronic.py | 8 +- .../en/taggers/measure.py | 4 +- .../en/taggers/money.py | 2 +- .../en/taggers/telephone.py | 16 +- .../en/taggers/time.py | 22 +- .../inverse_text_normalization/en/utils.py | 4 +- .../en/verbalizers/fraction.py | 2 +- .../en/verbalizers/telephone.py | 2 +- .../en/verbalizers/verbalize_final.py | 2 +- .../es/taggers/cardinal.py | 15 +- .../es/taggers/date.py | 4 +- .../es/taggers/decimal.py | 4 +- .../es/taggers/electronic.py | 10 +- .../es/taggers/fraction.py | 10 +- .../es/taggers/measure.py | 2 +- .../es/taggers/ordinal.py | 10 +- .../es/taggers/telephone.py | 9 +- .../es/taggers/time.py | 12 +- .../inverse_text_normalization/es/utils.py | 2 +- .../es/verbalizers/telephone.py | 2 +- .../es/verbalizers/verbalize_final.py | 2 +- .../inverse_text_normalization/es_en/utils.py | 2 +- .../es_en/verbalizers/verbalize_final.py | 2 +- .../fr/taggers/cardinal.py | 19 +- .../fr/taggers/date.py | 4 +- .../fr/taggers/decimal.py | 6 +- .../fr/taggers/fraction.py | 2 +- .../fr/taggers/ordinal.py | 2 +- .../fr/taggers/telephone.py | 2 +- .../inverse_text_normalization/fr/utils.py | 2 +- .../fr/verbalizers/decimal.py | 6 +- .../fr/verbalizers/ordinal.py | 4 +- .../fr/verbalizers/telephone.py | 2 +- .../fr/verbalizers/time.py | 2 +- .../fr/verbalizers/verbalize_final.py | 2 +- .../inverse_text_normalization/hy/utils.py | 4 +- .../hy/verbalizers/ordinal.py | 7 +- .../ja/taggers/cardinal.py | 7 +- .../ja/taggers/date.py | 2 +- .../ja/taggers/decimal.py | 4 +- .../ja/taggers/fraction.py | 6 +- .../ja/taggers/fraction_old.py | 6 +- .../ja/taggers/ordinal.py | 2 +- .../ja/taggers/preprocessor.py | 18 +- .../ja/taggers/time.py | 8 +- .../inverse_text_normalization/ja/utils.py | 2 +- .../ja/verbalizers/cardinal.py | 2 +- .../ja/verbalizers/date.py | 2 +- .../ja/verbalizers/fraction.py | 2 +- .../ja/verbalizers/fraction_old.py | 2 +- .../ja/verbalizers/post_processing.py | 6 +- .../ja/verbalizers/postprocessor.py | 14 +- 
.../ja/verbalizers/time.py | 16 +- .../ja/verbalizers/verbalize_final.py | 11 +- .../ja/verbalizers/whitelist.py | 2 +- .../ja/verbalizers/word.py | 2 +- .../mr/taggers/cardinal.py | 6 +- .../mr/taggers/date.py | 6 +- .../mr/taggers/decimal.py | 14 +- .../mr/taggers/time.py | 16 +- .../inverse_text_normalization/mr/utils.py | 4 +- .../mr/verbalizers/time.py | 10 +- .../pt/taggers/cardinal.py | 15 +- .../ru/taggers/cardinal.py | 2 +- .../ru/verbalizers/verbalize_final.py | 2 +- .../sv/taggers/cardinal.py | 2 +- .../sv/taggers/date.py | 3 +- .../sv/taggers/decimal.py | 10 +- .../sv/taggers/electronic.py | 2 +- .../sv/taggers/fraction.py | 2 +- .../sv/taggers/telephone.py | 4 +- .../sv/taggers/time.py | 2 +- .../inverse_text_normalization/sv/utils.py | 2 +- .../sv/verbalizers/verbalize_final.py | 2 +- .../vi/taggers/cardinal.py | 7 +- .../vi/taggers/date.py | 5 +- .../vi/taggers/decimal.py | 6 +- .../vi/verbalizers/time.py | 4 +- .../zh/graph_utils.py | 10 +- .../zh/taggers/cardinal.py | 9 +- .../zh/taggers/date.py | 4 +- .../zh/taggers/fraction.py | 2 +- .../zh/taggers/money.py | 4 +- .../zh/taggers/tokenize_and_classify.py | 8 +- .../inverse_text_normalization/zh/utils.py | 4 +- .../zh/verbalizers/cardinal.py | 2 +- .../zh/verbalizers/decimal.py | 2 +- .../zh/verbalizers/verbalize.py | 2 +- .../zh/verbalizers/verbalize_final.py | 2 +- .../text_normalization/ar/taggers/cardinal.py | 2 +- .../text_normalization/ar/taggers/decimal.py | 4 +- .../text_normalization/ar/taggers/fraction.py | 2 +- .../text_normalization/ar/taggers/measure.py | 21 +- .../text_normalization/ar/taggers/money.py | 7 +- .../text_normalization/ar/utils.py | 4 +- .../ar/verbalizers/measure.py | 2 +- .../ar/verbalizers/verbalize_final.py | 2 +- .../text_normalization/data_loader_utils.py | 12 +- .../text_normalization/de/taggers/cardinal.py | 6 +- .../text_normalization/de/taggers/date.py | 4 +- .../text_normalization/de/taggers/decimal.py | 4 +- .../text_normalization/de/taggers/measure.py | 4 +- .../text_normalization/de/taggers/ordinal.py | 2 +- .../de/taggers/telephone.py | 6 +- .../text_normalization/de/taggers/time.py | 4 +- .../de/taggers/tokenize_and_classify.py | 14 +- .../text_normalization/de/utils.py | 2 +- .../de/verbalizers/decimal.py | 4 +- .../de/verbalizers/measure.py | 2 +- .../de/verbalizers/ordinal.py | 5 +- .../de/verbalizers/telephone.py | 2 +- .../de/verbalizers/verbalize_final.py | 2 +- .../text_normalization/en/clean_eval_data.py | 2 +- .../text_normalization/en/graph_utils.py | 40 ++- .../text_normalization/en/taggers/cardinal.py | 6 +- .../text_normalization/en/taggers/date.py | 14 +- .../text_normalization/en/taggers/decimal.py | 4 +- .../en/taggers/electronic.py | 18 +- .../text_normalization/en/taggers/measure.py | 26 +- .../text_normalization/en/taggers/money.py | 6 +- .../text_normalization/en/taggers/ordinal.py | 2 +- .../text_normalization/en/taggers/range.py | 15 +- .../text_normalization/en/taggers/serial.py | 4 +- .../en/taggers/telephone.py | 6 +- .../text_normalization/en/taggers/time.py | 2 +- .../en/taggers/tokenize_and_classify.py | 15 +- .../en/taggers/tokenize_and_classify_lm.py | 2 +- .../tokenize_and_classify_with_audio.py | 2 +- .../text_normalization/en/utils.py | 4 +- .../en/verbalizers/measure.py | 2 +- .../en/verbalizers/ordinal.py | 5 +- .../en/verbalizers/post_processing.py | 6 +- .../text_normalization/es/graph_utils.py | 5 +- .../text_normalization/es/taggers/cardinal.py | 4 +- .../text_normalization/es/taggers/date.py | 2 +- 
.../text_normalization/es/taggers/fraction.py | 56 +++- .../text_normalization/es/taggers/measure.py | 4 +- .../text_normalization/es/taggers/ordinal.py | 2 +- .../es/taggers/telephone.py | 4 +- .../text_normalization/es/taggers/time.py | 4 +- .../es/taggers/tokenize_and_classify.py | 20 +- .../es/verbalizers/cardinal.py | 12 +- .../es/verbalizers/decimals.py | 12 +- .../es/verbalizers/fraction.py | 20 +- .../es/verbalizers/ordinal.py | 2 +- .../text_normalization/fr/taggers/ordinal.py | 2 +- .../fr/taggers/tokenize_and_classify.py | 9 +- .../text_normalization/fr/utils.py | 4 +- .../fr/verbalizers/cardinal.py | 12 +- .../fr/verbalizers/decimals.py | 12 +- .../fr/verbalizers/fraction.py | 14 +- .../text_normalization/hu/taggers/cardinal.py | 10 +- .../text_normalization/hu/taggers/date.py | 6 +- .../text_normalization/hu/taggers/decimal.py | 6 +- .../text_normalization/hu/taggers/measure.py | 4 +- .../text_normalization/hu/taggers/ordinal.py | 2 +- .../hu/taggers/telephone.py | 4 +- .../text_normalization/hu/taggers/time.py | 8 +- .../hu/taggers/tokenize_and_classify.py | 20 +- .../text_normalization/hu/utils.py | 6 +- .../hu/verbalizers/measure.py | 2 +- .../hu/verbalizers/telephone.py | 10 +- .../text_normalization/hy/utils.py | 4 +- .../hy/verbalizers/verbalize.py | 14 +- .../hy/verbalizers/verbalize_final.py | 16 +- .../text_normalization/it/taggers/cardinal.py | 4 +- .../text_normalization/it/taggers/decimals.py | 3 +- .../text_normalization/it/taggers/measure.py | 6 +- .../text_normalization/it/taggers/money.py | 2 +- .../text_normalization/it/taggers/time.py | 2 +- .../it/taggers/tokenize_and_classify.py | 15 +- .../text_normalization/it/utils.py | 2 +- .../it/verbalizers/decimal.py | 12 +- .../it/verbalizers/measure.py | 2 +- .../it/verbalizers/money.py | 2 +- .../normalize_with_audio.py | 15 +- .../text_normalization/ru/taggers/cardinal.py | 2 +- .../text_normalization/ru/taggers/date.py | 4 +- .../text_normalization/ru/taggers/decimals.py | 2 +- .../text_normalization/ru/taggers/ordinal.py | 2 +- .../ru/taggers/telephone.py | 12 +- .../text_normalization/ru/taggers/time.py | 2 +- .../text_normalization/ru/utils.py | 2 +- .../ru/verbalizers/measure.py | 2 +- .../ru/verbalizers/verbalize_final.py | 2 +- .../text_normalization/rw/__init__.py | 1 - .../text_normalization/rw/graph_utils.py | 33 +- .../text_normalization/rw/taggers/__init__.py | 2 +- .../text_normalization/rw/taggers/cardinal.py | 307 +++++++++++------- .../text_normalization/rw/taggers/time.py | 19 +- .../rw/taggers/tokenize_and_classify.py | 40 ++- .../rw/taggers/whitelist.py | 5 +- .../text_normalization/rw/utils.py | 5 +- .../rw/verbalizers/__init__.py | 2 +- .../text_normalization/rw/verbalizers/time.py | 29 +- .../rw/verbalizers/verbalize.py | 14 +- .../rw/verbalizers/verbalize_final.py | 20 +- .../text_normalization/sv/taggers/cardinal.py | 28 +- .../text_normalization/sv/taggers/measure.py | 4 +- .../text_normalization/sv/taggers/ordinal.py | 9 +- .../sv/taggers/telephone.py | 4 +- .../text_normalization/sv/taggers/time.py | 6 +- .../sv/verbalizers/decimals.py | 12 +- .../sv/verbalizers/telephone.py | 6 +- .../text_normalization/token_parser.py | 12 +- .../text_normalization/zh/taggers/cardinal.py | 34 +- .../text_normalization/zh/taggers/date.py | 2 +- .../text_normalization/zh/taggers/decimal.py | 2 +- .../text_normalization/zh/taggers/fraction.py | 2 +- .../text_normalization/zh/taggers/measure.py | 2 +- .../zh/taggers/preprocessor.py | 18 +- .../zh/taggers/tokenize_and_classify.py | 4 +- 
.../text_normalization/zh/utils.py | 4 +- .../zh/verbalizers/measure.py | 2 +- .../zh/verbalizers/post_processing.py | 6 +- .../zh/verbalizers/postprocessor.py | 14 +- .../zh/verbalizers/verbalize.py | 2 +- .../zh/verbalizers/verbalize_final.py | 11 +- .../zh/verbalizers/whitelist.py | 2 +- .../text_normalization/zh/verbalizers/word.py | 2 +- setup.py | 19 +- tests/conftest.py | 14 +- tests/nemo_text_processing/ar/test_money.py | 4 +- tests/nemo_text_processing/en/test_address.py | 4 +- .../nemo_text_processing/en/test_cardinal.py | 4 +- tests/nemo_text_processing/en/test_decimal.py | 4 +- .../en/test_electronic.py | 4 +- .../nemo_text_processing/en/test_fraction.py | 4 +- tests/nemo_text_processing/en/test_math.py | 4 +- tests/nemo_text_processing/en/test_measure.py | 4 +- tests/nemo_text_processing/en/test_money.py | 4 +- tests/nemo_text_processing/en/test_ordinal.py | 4 +- .../en/test_punctuation.py | 6 +- tests/nemo_text_processing/en/test_range.py | 4 +- tests/nemo_text_processing/en/test_roman.py | 4 +- tests/nemo_text_processing/en/test_serial.py | 4 +- .../en/test_special_text.py | 4 +- tests/nemo_text_processing/es/test_ordinal.py | 4 +- .../nemo_text_processing/rw/test_cardinal.py | 4 - tests/nemo_text_processing/rw/test_time.py | 3 - .../nemo_text_processing/rw/test_whitelist.py | 4 +- tests/nemo_text_processing/rw/test_word.py | 4 +- .../pynini_export.py | 24 +- 264 files changed, 1310 insertions(+), 791 deletions(-) diff --git a/nemo_text_processing/fst_alignment/alignment.py b/nemo_text_processing/fst_alignment/alignment.py index 3100cf49f..5e76f66eb 100644 --- a/nemo_text_processing/fst_alignment/alignment.py +++ b/nemo_text_processing/fst_alignment/alignment.py @@ -200,7 +200,7 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: st alignment: alignment generated by FST with shortestpath, is longer than original string since including eps transitions start: inclusive start position in input string end: exclusive end position in input string - mode: grammar type for either tn or itn + mode: grammar type for either tn or itn Returns: output_og_start_index: inclusive start position in output string diff --git a/nemo_text_processing/hybrid/mlm_scorer.py b/nemo_text_processing/hybrid/mlm_scorer.py index 2986f3562..b2c94598e 100644 --- a/nemo_text_processing/hybrid/mlm_scorer.py +++ b/nemo_text_processing/hybrid/mlm_scorer.py @@ -93,7 +93,7 @@ def score_sentence(self, sentence: str): def __mask_text__(self, idx: int, tokens: List[str]): """ - replaces string at index idx in list `tokens` with a masked token and returns the modified list. + replaces string at index idx in list `tokens` with a masked token and returns the modified list. """ masked = tokens.copy() masked[idx] = self.MASK_LABEL diff --git a/nemo_text_processing/hybrid/model_utils.py b/nemo_text_processing/hybrid/model_utils.py index 7b2f8e960..b81d59b2a 100644 --- a/nemo_text_processing/hybrid/model_utils.py +++ b/nemo_text_processing/hybrid/model_utils.py @@ -74,7 +74,7 @@ def get_masked_score(text, model, do_lower=True): def _get_ambiguous_positions(sentences: List[str]): """returns None or index list of ambigous semiotic tokens for list of sentences. - E.g. if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only + E.g. 
if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only the first semiotic span / is ambiguous.""" l_sets = [set([x]) for x in re.findall(r"<\s.+?\s>", sentences[0])] for sentence in sentences[1:]: diff --git a/nemo_text_processing/hybrid/utils.py b/nemo_text_processing/hybrid/utils.py index ced823510..82c96aa6f 100644 --- a/nemo_text_processing/hybrid/utils.py +++ b/nemo_text_processing/hybrid/utils.py @@ -390,8 +390,8 @@ def clean_post_norm( def clean_libri_tts(target: str): """ - Replace abbreviations in LibriTTS dataset - """ + Replace abbreviations in LibriTTS dataset + """ # Normalized text in LibriTTS by Google which contains abbreviations from `libri_sometimes_converts_abbrs` sometimes wasn't converted. libri_sometimes_converts_abbrs = {"St.": "saint", "Rev.": "reverend"} @@ -515,7 +515,11 @@ def _relax_diff(text): return acceptable -def get_labels(targets: List[str], norm_texts_weights: List[Tuple[str, str]], lang="en",) -> List[List[str]]: +def get_labels( + targets: List[str], + norm_texts_weights: List[Tuple[str, str]], + lang="en", +) -> List[List[str]]: """ Assign labels to generated normalization options (1 - for ground truth, 0 - other options) Args: @@ -605,7 +609,14 @@ def print_df(df): prints data frame """ with pd.option_context( - "display.max_rows", None, "display.max_columns", None, "display.width", 1000, "display.max_colwidth", 400, + "display.max_rows", + None, + "display.max_columns", + None, + "display.width", + 1000, + "display.max_colwidth", + 400, ): print(df) @@ -641,7 +652,7 @@ def get_diff(a: str, b: str): def diff_pred_gt(pred: str, gt: str): """returns list of different substrings between prediction and gt - relies on that prediction uses '< ' ' >' + relies on that prediction uses '< ' ' >' Args: pred (str): prediction @@ -649,7 +660,7 @@ def diff_pred_gt(pred: str, gt: str): Returns: list of Tuple(pred start and end, gt start and end) subsections - + e.g. pred="< Edward third >., king Our own . loss had been < two thousand two hundred >" gt ="Edward III., king Our own loss had been twenty two hundred" --> [([0, 16], [0, 10]), ([32, 34], [26, 26]), ([48, 76], [40, 58])] diff --git a/nemo_text_processing/hybrid/wfst_lm_rescoring.py b/nemo_text_processing/hybrid/wfst_lm_rescoring.py index 86f375058..7f001e6a2 100644 --- a/nemo_text_processing/hybrid/wfst_lm_rescoring.py +++ b/nemo_text_processing/hybrid/wfst_lm_rescoring.py @@ -73,7 +73,7 @@ def threshold_weights(norm_texts_weights, delta: float = 0.2): delta: delta to add to minimum weight in options to compose upper limit for threshhold returns: - filter list of same format as input + filter list of same format as input """ # threshold value is factor applied to lowest/first weight of all normalization options for every input res = [] diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py index 40ee1acf0..2c58df6a9 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py @@ -22,8 +22,8 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals e.g. سالب تسعة وتسعون -> cardinal { integer: "99" negative: "-" } } - Numbers below thirteen are not converted. - Args: + Numbers below thirteen are not converted. 
+ Args: tn_cardinal: cardinal FST for TN """ @@ -33,7 +33,9 @@ def __init__(self, tn_cardinal): self.graph = pynini.invert(tn_cardinal.cardinal_numbers).optimize() optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, 0, 1, + pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, + 0, + 1, ) final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"') diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py index f0d641d14..3b22ece05 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py @@ -36,7 +36,9 @@ def __init__(self, tn_decimal): super().__init__(name="decimal", kind="classify") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, + 0, + 1, ) graph_fractional_part = pynini.invert(tn_decimal.graph_fractional).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py index beefe52ee..db14cc9a8 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py @@ -29,7 +29,7 @@ class FractionFst(GraphFst): """ Finite state transducer for classifying fraction e.g. واحد و نصف -> tokens { integer_part: "1" numerator: "1" denominator: "2" } - + Args: tn_cardinal: TN cardinal tagger diff --git a/nemo_text_processing/inverse_text_normalization/ar/utils.py b/nemo_text_processing/inverse_text_normalization/ar/utils.py index ca6210150..67594bf55 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ar/utils.py @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]): Args x: integer - Returns: spoken representation + Returns: spoken representation """ if isinstance(x, int): x = str(x) @@ -41,7 +41,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py index 326d49df8..7f557096b 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py @@ -21,7 +21,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py index 0670090b8..46fdca4e3 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py @@ -20,7 +20,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals. Numbers below ten are not converted. + Finite state transducer for classifying cardinals. Numbers below ten are not converted. Allows both compound numeral strings or separated by whitespace. "und" (en: "and") can be inserted between "hundert" and following number or "tausend" and following single or double digit number. @@ -32,7 +32,7 @@ class CardinalFst(GraphFst): e.g. ein tausend -> cardinal { integer: "1000" } } e.g. eintausend -> cardinal { integer: "1000" } } e.g. ein tausend zwanzig -> cardinal { integer: "1020" } } - + Args: tn_cardinal_tagger: TN cardinal tagger """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py index 38ca80ca5..dc9f96bd1 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py @@ -22,7 +22,7 @@ class ElectronicFst(GraphFst): """ Finite state transducer for classifying electronic: email addresses, etc. e.g. c d f eins at a b c punkt e d u -> tokens { name: "cdf1.abc.edu" } - + Args: tn_electronic_tagger: TN eletronic tagger tn_electronic_verbalizer: TN eletronic verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py index 14e06a5be..960c9ffa9 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py @@ -29,7 +29,7 @@ class FractionFst(GraphFst): e.g. ein halb -> tokens { name: "1/2" } e.g. ein ein halb -> tokens { name: "1 1/2" } e.g. drei zwei ein hundertstel -> tokens { name: "3 2/100" } - + Args: itn_cardinal_tagger: ITN cardinal tagger tn_fraction_verbalizer: TN fraction verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py index 22474376f..dd7f79878 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py @@ -20,9 +20,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. null vier eins eins eins zwei drei vier eins zwei drei vier -> tokens { name: "(0411) 1234-1234" } - + Args: tn_cardinal_tagger: TN Cardinal Tagger """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/time.py b/nemo_text_processing/inverse_text_normalization/de/taggers/time.py index 571edd724..db2edb66b 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/time.py @@ -31,7 +31,7 @@ class TimeFst(GraphFst): e.g. 
drei vor zwölf -> time { minutes: "57" hours: "11" } e.g. drei nach zwölf -> time { minutes: "3" hours: "12" } e.g. drei uhr zehn minuten zehn sekunden -> time { hours: "3" hours: "10" sekunden: "10"} - + Args: tn_time_verbalizer: TN time verbalizer """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py index 3031ac2b4..ac67928ce 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py @@ -23,7 +23,7 @@ class TimeFst(GraphFst): Finite state transducer for verbalizing time, e.g. time { hours: "8" minutes: "30" zone: "e s t" } -> 08:30 Uhr est time { hours: "8" } -> 8 Uhr - time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr + time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py index ab2576934..beb9b1e7c 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "jetzt" } tokens { name: "ist" } tokens { time { hours: "12" minutes: "30" } } -> jetzt ist 12:30 Uhr """ diff --git a/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py index ab2969f98..e9dd16034 100644 --- a/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py @@ -67,7 +67,7 @@ def process(self, instance: Instance) -> Instance: Args: processes given instance with process function - + Returns: processed instance if instance belongs to expected class type or original instance """ if instance.token_type != self.class_type: diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py index 36f424208..5eea89af1 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py @@ -207,7 +207,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): + graph_in_thousands ) - graph = pynini.union((graph_int | graph_ind) + delete_space + graph_hundreds, graph_zero,) + graph = pynini.union( + (graph_int | graph_ind) + delete_space + graph_hundreds, + graph_zero, + ) graph = graph @ pynini.union( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" @@ -243,7 +246,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): self.fst = final_graph.optimize() def delete_word(self, word: str): - """ Capitalizes word for `cased` input""" + """Capitalizes word for `cased` input""" delete_graph = pynutil.delete(word).optimize() if self.input_case == INPUT_CASED: if len(word) > 0: diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py index 
8d8a4f444..b1ace40ce 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py @@ -137,7 +137,7 @@ def _get_thousands_graph(): class DateFst(GraphFst): """ - Finite state transducer for classifying date, + Finite state transducer for classifying date, e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true } e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true } e.g. twenty twenty -> date { year: "2012" preserve_order: true } @@ -165,7 +165,11 @@ def __init__(self, ordinal: GraphFst, input_case: str): + pynutil.add_weight(year_graph, -YEAR_WEIGHT) + pynutil.insert("\"") ) - optional_graph_year = pynini.closure(graph_year, 0, 1,) + optional_graph_year = pynini.closure( + graph_year, + 0, + 1, + ) graph_mdy = month_graph + ( (delete_extra_space + day_graph) | graph_year | (delete_extra_space + day_graph + graph_year) ) diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py index 2c6ee7a62..6e5de2418 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py @@ -41,7 +41,7 @@ def get_quantity( e.g. one million -> integer_part: "1" quantity: "million" e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST input_case: accepting either "lower_cased" or "cased" input. @@ -97,7 +97,9 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): point = pynutil.delete("point") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py index a2373d9d7..0a41b4702 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py @@ -106,7 +106,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): delete_extra_space + url_symbols + delete_extra_space - + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) + + ( + domain + | pynini.closure( + accepted_username + delete_extra_space, + ) + + accepted_username + ) ) protocol_default = ( diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py index 2d9d5e02c..69eeaa56e 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py @@ -58,7 +58,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize() optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + 
pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) unit_singular = convert_space(graph_unit_singular) diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py index 2a1e32a49..2c5d5ad78 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU # "one fifty" -> "one hundred fifty" with_hundred = pynini.compose( pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA, - pynini.compose(cardinal_graph, NEMO_DIGIT ** 3), + pynini.compose(cardinal_graph, NEMO_DIGIT**3), ) cardinal_graph |= with_hundred graph_decimal_final = decimal.final_graph_wo_negative diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py index dba4c0201..9a106ca78 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py @@ -40,7 +40,7 @@ def get_serial_number(cardinal): """ digit = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT) - two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT ** 2), 0.002) + two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT**2), 0.002) character = digit | two_digit | NEMO_ALPHA sequence = (NEMO_LOWER_NOT_A | digit) + pynini.closure(pynutil.delete(" ") + character, 2) sequence |= character + pynini.closure(pynutil.delete(" ") + (digit | NEMO_ALPHA), 2) @@ -61,7 +61,7 @@ def get_serial_number(cardinal): class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. one two three one two three five six seven eight -> { number_part: "123-123-5678" } This class also support card number and IP format. 
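The NEMO_DIGIT**3 compositions above (the money tagger's "one fifty" -> "one hundred fifty" trick, the 3-3-4 phone-number grouping below) all rely on the same idea: composing a grammar with a fixed-length digit acceptor keeps only outputs of exactly that length. A toy sketch of the mechanism, with an illustrative mapping rather than the real cardinal graph:

import pynini

digit = pynini.union(*"0123456789")
# Toy spoken -> written mapping; "fifteen" -> "15" is filtered out below.
words_to_nums = pynini.string_map([("one fifty", "150"), ("fifteen", "15")])
# Composing with digit ** 3 keeps only three-digit outputs.
three_digit_only = pynini.compose(words_to_nums, digit ** 3)
assert pynini.shortestpath("one fifty" @ three_digit_only).string() == "150"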
@@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): triple_digit.invert() # to handle cases like "one twenty three" - two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2) + two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT**2) double_digit_to_digit = ( pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal ) @@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): number_part = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4, + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"") @@ -156,16 +156,16 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): graph = optional_country_code + number_part # credit card number - space_four_digits = insert_space + NEMO_DIGIT ** 4 + space_four_digits = insert_space + NEMO_DIGIT**4 space_five_digits = space_four_digits + NEMO_DIGIT space_six_digits = space_five_digits + NEMO_DIGIT credit_card_graph = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 4 + (space_six_digits | (space_four_digits ** 2)) + space_four_digits, + NEMO_DIGIT**4 + (space_six_digits | (space_four_digits**2)) + space_four_digits, ).optimize() credit_card_graph |= pynini.compose( - single_double_or_triple_digit, NEMO_DIGIT ** 4 + space_six_digits + space_five_digits + single_double_or_triple_digit, NEMO_DIGIT**4 + space_six_digits + space_five_digits ).optimize() graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"") @@ -173,7 +173,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): # SSN ssn_graph = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4, + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**2 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py index 53d3dd931..46dc71bc8 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py @@ -71,14 +71,32 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_minute_double = pynini.union(*labels_minute_double) @ cardinal graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15") - oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock", "hundred hours",), "",) + oclock = pynini.cross( + pynini.union( + "o' clock", + "o clock", + "o'clock", + "oclock", + "hundred hours", + ), + "", + ) if input_case == INPUT_CASED: minute_to_graph = capitalized_input_graph(minute_to_graph) graph_minute_single = capitalized_input_graph(graph_minute_single) graph_minute_double = capitalized_input_graph(graph_minute_double) graph_minute_verbose |= pynini.cross("Half", "30") | pynini.cross("Quarter", "15") - oclock |= pynini.cross(pynini.union("O' clock", "O clock", "O'clock", "Oclock", "Hundred hours",), "",) + oclock |= pynini.cross( + pynini.union( 
+ "O' clock", + "O clock", + "O'clock", + "Oclock", + "Hundred hours", + ), + "", + ) final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"") graph_minute = ( diff --git a/nemo_text_processing/inverse_text_normalization/en/utils.py b/nemo_text_processing/inverse_text_normalization/en/utils.py index 00b6a636f..cd54850ce 100644 --- a/nemo_text_processing/inverse_text_normalization/en/utils.py +++ b/nemo_text_processing/inverse_text_normalization/en/utils.py @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]): Args x: integer - Returns: spoken representation + Returns: spoken representation """ if isinstance(x, int): x = str(x) @@ -41,7 +41,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py index ca2bdcee2..780185325 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py @@ -18,7 +18,7 @@ class FractionFst(GraphFst): """ - Finite state transducer for verbalizing fraction, + Finite state transducer for verbalizing fraction, """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py index e8d622e3c..141e41fe5 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py @@ -23,7 +23,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for verbalizing telephone, e.g. telephone { number_part: "123-123-5678" } - -> 123-123-5678 + -> 123-123-5678 """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py index 467329001..86c1b575b 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py @@ -23,7 +23,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 2f62d589d..d3082509a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -31,10 +31,10 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"} + e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"} This class converts cardinals up to (but not including) "un cuatrillón", i.e up to "one septillion" in English (10^{24}). - Cardinals below ten are not converted (in order to avoid + Cardinals below ten are not converted (in order to avoid "vivo en una casa" --> "vivo en 1 casa" and any other odd conversions.) 
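A toy sketch of the guard described above, where small-number words are subtracted from the grammar's input projection so they pass through unconverted; the words here are illustrative, not the Spanish grammar's actual exception list:

import pynini

numbers = pynini.string_map([("una", "1"), ("veintitrés", "23")])
protected = pynini.union("una", "dos").optimize()  # words we refuse to convert
# Remove protected inputs, then compose back onto the original transducer.
safe = (pynini.project(numbers, "input") - protected) @ numbers
assert pynini.shortestpath("veintitrés" @ safe).string() == "23"
# "una" no longer matches, so "vivo en una casa" stays as plain words.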
Although technically Spanish grammar requires that "y" only comes after @@ -160,18 +160,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand.optimize() # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million.optimize() @@ -199,7 +194,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): self.fst = final_graph.optimize() def delete_word(self, word: str): - """ Capitalizes word for `cased` input""" + """Capitalizes word for `cased` input""" delete_graph = pynutil.delete(word).optimize() if self.input_case == INPUT_CASED: if len(word) > 0: diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index af96ee002..66281d225 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -28,10 +28,10 @@ class DateFst(GraphFst): """ - Finite state transducer for classifying date, + Finite state transducer for classifying date, e.g. primero de enero -> date { day: "1" month: "enero" } e.g. uno de enero -> date { day: "1" month: "enero" } - + Args: cardinal: CardinalFst input_case: accepting either "lower_cased" or "cased" input. diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index 2b1949041..8bfa560d2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -38,7 +38,7 @@ def get_quantity( e.g. one million -> integer_part: "1" quantity: "million" e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_million: cardinal FST input_case: accepting either "lower_cased" or "cased" input. @@ -87,7 +87,7 @@ class DecimalFst(GraphFst): This decimal rule assumes that decimals can be pronounced as: (a cardinal) + ('coma' or 'punto') plus (any sequence of cardinals <1000, including 'zero') - Also writes large numbers in shortened form, e.g. + Also writes large numbers in shortened form, e.g. e.g. uno coma dos seis millón -> decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } e.g. dos millones -> decimal { negative: "false" integer_part: "2" quantity: "millones" } e.g. 
mil ochocientos veinticuatro millones -> decimal { negative: "false" integer_part: "1824" quantity: "millones" } diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 3bc6a8b6d..a7d767119 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -36,7 +36,7 @@ class ElectronicFst(GraphFst): and URLS (which get converted to a "protocol" field). e.g. c d f uno arroba a b c punto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } } e.g. doble ve doble ve doble ve a b c punto e d u -> tokens { electronic { protocol: "www.abc.edu" } } - + Args: input_case: accepting either "lower_cased" or "cased" input. """ @@ -136,7 +136,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): delete_extra_space + symbols + delete_extra_space - + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) + + ( + domain + | pynini.closure( + accepted_username + delete_extra_space, + ) + + accepted_username + ) ) protocol_default = ( diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index a2b55026e..ae5d13fa9 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -23,18 +23,18 @@ class FractionFst(GraphFst): """ Finite state transducer for classifying fractions - e.g. dos quintos -> fraction { numerator: "2" denominator: "5" } - This class converts fractions with a denominator up to (and including) + e.g. dos quintos -> fraction { numerator: "2" denominator: "5" } + This class converts fractions with a denominator up to (and including) "1/999". - + Fractions with 4 as their denominator, read as "cuarto(s)", are not converted because "room" is also "cuarto", which could cause issues like "quiero reservar un cuarto" -> quiero reservar 1/2". - + Fractions without a numerator are not converted either to prevent issues like: "estaba medio dormido" -> "estaba 1/2 dormido" - + Args: cardinal: CardinalFst ordinal: OrdinalFst diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index 9d231bc25..bdefdcf71 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -32,7 +32,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for classifying measure - e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" } + e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" } Args: cardinal: CardinalFst diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index d03640742..7cdcfacc7 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -31,7 +31,7 @@ class OrdinalFst(GraphFst): vigésimo primero -> ordinal { integer: "21" morphosyntactic_features: "o" } This class converts ordinal up to "millesímo" (one thousandth) exclusive. 
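The morphosyntactic_features tagging shown above is driven by suffix filters (cf. the accept_o_endings / accept_a_endings graphs further down): the input is intersected with "anything ending in -o" or "-a" before the feature is attached. A toy version of the masculine branch, with an illustrative alphabet and word list rather than the real ordinal graph:

import pynini
from pynini.lib import pynutil

letters = pynini.union(*"abcdefghijklmnopqrstuvwxyz")
ends_in_o = pynini.closure(letters) + pynini.accep("o")
ordinals = pynini.string_map([("segundo", "2"), ("segunda", "2")])
# Only inputs ending in -o survive the composition; then tag the feature.
masculine = pynini.compose(ends_in_o, ordinals) + pynutil.insert(' morphosyntactic_features: "o"')
assert pynini.shortestpath("segundo" @ masculine).string() == '2 morphosyntactic_features: "o"'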
- Cardinals below ten are not converted (in order to avoid + Cardinals below ten are not converted (in order to avoid e.g. "primero hice ..." -> "1.º hice...", "segunda guerra mundial" -> "2.ª guerra mundial" and any other odd conversions.) @@ -62,7 +62,13 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): full_graph_ties = graph_ties | (graph_ties + pynini.cross(" ", "y") + graph_digit) - ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,) + ordinal_graph_union = pynini.union( + graph_digit, + graph_teens, + graph_twenties, + full_graph_ties, + graph_hundreds, + ) accept_o_endings = NEMO_SIGMA + pynini.accep("o") accept_a_endings = NEMO_SIGMA + pynini.accep("a") diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 1c0be2037..8c73ca434 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -27,7 +27,7 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. uno dos tres uno dos tres cinco seis siete ocho -> { number_part: "123-123-5678" }. If 10 digits are spoken, they are grouped as 3+3+4 (eg. 123-456-7890). If 9 digits are spoken, they are grouped as 3+3+3 (eg. 123-456-789). @@ -37,7 +37,7 @@ class TelephoneFst(GraphFst): "twelve thirty four" = "1234". (we ignore more complicated cases such as "three hundred and two" or "three nines"). - + Args: input_case: accepting either "lower_cased" or "cased" input. """ @@ -110,7 +110,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): # Denormalized phone numbers are grouped in sets of 3 or 4 digits group_of_two = pynini.union(doubled_digit, digit_twice, double_digits) - group_of_three = pynini.union(tripled_digit, single_digits + pynutil.delete(" ") + group_of_two,) + group_of_three = pynini.union( + tripled_digit, + single_digits + pynutil.delete(" ") + group_of_two, + ) group_of_four = pynini.union( group_of_two + pynutil.delete(" ") + group_of_two, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index 9d55f35a3..f33c7c1b1 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -45,21 +45,21 @@ class TimeFst(GraphFst): e.g. cuarto para las dos -> time { minutes: "45" hours: "la 1" } Note that times on the hour (e.g. "las dos" i.e. "two o'clock") do not get - converted into a time format. This is to avoid converting phrases that are + converted into a time format. This is to avoid converting phrases that are not part of a time phrase (e.g. "las dos personas" i.e. "the two people") e.g. las dos -> tokens { name: "las" } tokens { name: "dos" } - However, if a time on the hour is followed by a suffix (indicating 'a.m.' + However, if a time on the hour is followed by a suffix (indicating 'a.m.' or 'p.m.'), it will be converted. e.g. las dos pe eme -> time { hours: "las 2" minutes: "00" suffix: "p.m." } - - In the same way, times without a preceding article are not converted. This is + + In the same way, times without a preceding article are not converted. This is to avoid converting ranges or complex fractions e.g. 
dos y media -> tokens { name: "dos" } tokens { name: "y" } tokens { name: "media" } - However, if a time without an article is followed by a suffix (indicating 'a.m.' + However, if a time without an article is followed by a suffix (indicating 'a.m.' or 'p.m.'), it will be converted. e.g. dos y media p m -> time { hours: "2" minutes: "30" suffix: "p.m." } - Note that although the TimeFst verbalizer can accept 'zone' (timezone) fields, + Note that although the TimeFst verbalizer can accept 'zone' (timezone) fields, so far the rules have not been added to the TimeFst tagger to process timezones (to keep the rules simple, and because timezones are not very often specified in Spanish.) diff --git a/nemo_text_processing/inverse_text_normalization/es/utils.py b/nemo_text_processing/inverse_text_normalization/es/utils.py index f6e06f793..bedda6391 100644 --- a/nemo_text_processing/inverse_text_normalization/es/utils.py +++ b/nemo_text_processing/inverse_text_normalization/es/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py index 58aa190ba..8364c250b 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py @@ -22,7 +22,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for verbalizing telephone, e.g. telephone { number_part: "123-123-5678" } - -> 123-123-5678 + -> 123-123-5678 """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py index 6b22d6f73..5c45ff66f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/es_en/utils.py b/nemo_text_processing/inverse_text_normalization/es_en/utils.py index f6e06f793..bedda6391 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/utils.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py index 3323f173b..e46b6db56 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py index 333460eb0..ea1fcf8ea 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py @@ -35,9 +35,9 @@ def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike': In cases where original orthography is current, or string is mixture of two orthographies, will render invalid form that will not pass through CardinalFst e.g. deux-mille cent-vingt-trois -> "deux##vingt-trois" ('#' is not accepted in cardinal FST and will fail to convert.) - e.g. deux + e.g. deux - Args: + Args: cardinal: cardinal FST """ @@ -90,13 +90,13 @@ def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike': class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. mois vingt-trois -> cardinal { negative: "-" integer: "23"} + e.g. mois vingt-trois -> cardinal { negative: "-" integer: "23"} This class converts cardinals up to (but not including) "un-quatrillion", i.e up to "one septillion" in English (10^{24}). - Cardinals below nine are not converted (in order to avoid + Cardinals below nine are not converted (in order to avoid "j'ai un pomme." --> "j'ai 1 pomme" and any other odd conversions.) This transducer accomodates both traditional hyphenation of numbers ('-' for most numbers <100) - and current hyphenation (all elements of number are hyphenated), prioritizing the latter. + and current hyphenation (all elements of number are hyphenated), prioritizing the latter. e.g cent cinquante et un -> cardinal { integer: "151"} cent-cinquante-et-un -> cardinal { integer: "151"} This is done through a context dependent rewrite that attempts to map old spelling to new. 
@@ -248,18 +248,13 @@ def __init__(self): self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py index 06807f6a3..68d35741c 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py @@ -46,7 +46,9 @@ def __init__(self, cardinal: GraphFst): day_graph = self.cardinal | pynini.cross("premier", "1") # Premier is only ordinal used for dates day_graph = pynutil.insert("day: \"") + day_graph + pynutil.insert("\"") optional_graph_year = pynini.closure( - delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), 0, 1, + delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), + 0, + 1, ) graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py index 7994b719d..9f6341cf4 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py @@ -31,9 +31,9 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_thousand: 'pynini.Fst e.g. one million -> integer_part: "1" quantity: "million" e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" - Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions + Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions - Args: + Args: decimal: decimal FST cardinal_up_to_million: cardinal FST """ @@ -79,7 +79,7 @@ class DecimalFst(GraphFst): This decimal rule assumes that decimals can be pronounced as: (a cardinal) + ('virgule') plus (any sequence of cardinals <1 million, including 'zero') - Also writes large numbers in shortened form, e.g. + Also writes large numbers in shortened form, e.g. e.g. un virgule deux-six-million -> decimal { negative: "false" integer_part: "1" fractional_part: "26" quantity: "million" } e.g. deux-million -> decimal { negative: "false" integer_part: "2" quantity: "millions" } e.g. 
moins cent-vingt-quatre-millions -> decimal { negative: "true" integer_part: "124" quantity: "millions" } diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py index ca089455a..94b87bfd5 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py @@ -30,7 +30,7 @@ class FractionFst(GraphFst): e.g. demi -> tokens { fraction { numerator: "1" denominator: "2" } } e.g. un et demi -> tokens { fraction { integer_part: "1" numerator: "1" denominator: "2" } } e.g. trois et deux centième -> tokens { fraction { integer_part: "3" numerator: "2" denominator: "100" } } - + Args: cardinal: OrdinalFst """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py index 03976e9e9..629fc0e26 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py @@ -24,7 +24,7 @@ class OrdinalFst(GraphFst): Finite state transducer for classifying ordinal vingt-deuxième -> ordinal { integer: "22" morphosyntactic_features: "e" } - Also notes specific nouns that have unique normalization conventions. + Also notes specific nouns that have unique normalization conventions. For instance, 'siècles' are rendered in roman numerals when given an ordinal adjective. e.g. dix-neuvième siècle -> XIXe diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py index b157960c0..c532cfd06 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py @@ -27,7 +27,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for classifying telephone numbers. Assumes conventional grouping for Metropolitan France (and overseas departments) - (two number sequences are grouped as individual cardinals) or digit by digit (chiffre-par-chiffre) e.g. + (two number sequences are grouped as individual cardinals) or digit by digit (chiffre-par-chiffre) e.g. "zero un quatre-vingt-deux zero deux vingt-deux cinquante" -> { number_part: "01 42 02 22 50" } "zero un quatre deux zero deux deux deux cinq zero" -> { number_part: "01 42 02 22 50" } diff --git a/nemo_text_processing/inverse_text_normalization/fr/utils.py b/nemo_text_processing/inverse_text_normalization/fr/utils.py index f6e06f793..bedda6391 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/utils.py +++ b/nemo_text_processing/inverse_text_normalization/fr/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py index c1a55401e..3e654b859 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py @@ -26,8 +26,8 @@ class NumberParser(GraphFst): """ - Finite state transducer for parsing strings of digis. 
Breaks up digit strings into groups of three for
-    strings of digits of four or more (inclusive). Groupings are separated by non-breaking space.
+    Finite state transducer for parsing strings of digits. Breaks up digit strings into groups of three for
+    strings of digits of four or more (inclusive). Groupings are separated by non-breaking space.
     e.g. '1000' -> '1 000'
     e.g. '1000,33333' -> '1 000,333 33
     """

@@ -46,7 +46,7 @@ def __init__(self):
         super().__init__(name="decimal", kind="verbalize")

         # Need parser to group digits by threes
-        exactly_three_digits = NEMO_DIGIT ** 3
+        exactly_three_digits = NEMO_DIGIT**3
         at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

         space_every_three_integer = (
diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
index 77dd6323f..3179af643 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
@@ -61,12 +61,12 @@ def __init__(self):
         graph_roman_hundreds = pynini.string_file(get_abs_path("data/roman/hundreds_large.tsv")).invert()
         graph_roman_zero_digit = pynutil.delete("0")

-        graph_roman_hundreds = NEMO_DIGIT ** 3 @ (
+        graph_roman_hundreds = NEMO_DIGIT**3 @ (
             graph_roman_hundreds
             + pynini.union(graph_roman_ties, graph_roman_zero_digit)
             + pynini.union(graph_roman_digits, graph_roman_zero_digit)
         )
-        graph_roman_ties = NEMO_DIGIT ** 2 @ (
+        graph_roman_ties = NEMO_DIGIT**2 @ (
             graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit)
         )
         graph_roman_digits = NEMO_DIGIT @ graph_roman_digits
diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py
index d937c04d7..5dd5e175c 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py
@@ -22,7 +22,7 @@ class TelephoneFst(GraphFst):
     """
     Finite state transducer for verbalizing telephone, e.g.
     telephone { number_part: "02 33 43 53 22" }
-    -> 02 33 43 53 22
+    -> 02 33 43 53 22
     """

     def __init__(self):
diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py
index 52af95d09..99f5b99e8 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py
@@ -29,7 +29,7 @@ class TimeFst(GraphFst):
     Finite state transducer for verbalizing time, e.g.
     time { hours: "8" minutes: "30" suffix: "du matin"} -> 8 h 30
     time { hours: "8" minutes: "30" } -> 8 h 30
-    time { hours: "8" minutes: "30" suffix: "du soir"} -> 20 h 30
+    time { hours: "8" minutes: "30" suffix: "du soir"} -> 20 h 30
     """

     def __init__(self):
diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py
index c0bf305da..677386d28 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py
@@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst):
     """
-    Finite state transducer that verbalizes an entire sentence, e.g.
+    Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/hy/utils.py b/nemo_text_processing/inverse_text_normalization/hy/utils.py index f7179e35b..1f1349115 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/utils.py +++ b/nemo_text_processing/inverse_text_normalization/hy/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py index e912ff60b..b0d4e52cc 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py @@ -37,7 +37,12 @@ def __init__(self): convert_one = pynini.cross("[BOS]1", "[BOS]1-ին") convert_rest = pynutil.insert("-րդ", weight=0.01) - suffix = pynini.cdrewrite(convert_rest | convert_one, "", "[EOS]", NEMO_SIGMA,) + suffix = pynini.cdrewrite( + convert_rest | convert_one, + "", + "[EOS]", + NEMO_SIGMA, + ) graph = graph @ suffix delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py index c265f7ef9..15d17f81d 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py @@ -23,7 +23,7 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. 二十三 -> cardinal { integer: "23" } + e.g. 二十三 -> cardinal { integer: "23" } e.g. 
にじゅうさん -> cardinal { integer: "23" }
     """

@@ -39,7 +39,10 @@ def __init__(self):
         hundred = pynutil.delete("百") | pynutil.delete("ひゃく") | pynutil.delete("びゃく") | pynutil.delete("ぴゃく")

         hundred_alt = (
-            pynini.cross("百", "1") | pynini.cross("ひゃく", "1") | pynini.cross("びゃく", "1") | pynini.cross("ぴゃく", "1")
+            pynini.cross("百", "1")
+            | pynini.cross("ひゃく", "1")
+            | pynini.cross("びゃく", "1")
+            | pynini.cross("ぴゃく", "1")
         )
         graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0"))
         graph_hundred_component += pynini.union(
diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py
index 52d72be58..0e30449e8 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py
@@ -22,7 +22,7 @@ class DateFst(GraphFst):
     """
-    Finite state transducer for classifying date, e.g.,
+    Finite state transducer for classifying date, e.g.,
     一日 -> 1日 date { day: "1" }
     五から九日 -> (5~9日) date { day: "5~9" }
     一月 -> 1月 date { month: "1" }
diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py
index 7ec070457..6e070231c 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py
@@ -31,8 +31,8 @@ def get_quantity(decimal):
 class DecimalFst(GraphFst):
     """
     Finite state transducer for classifying decimal
-    e.g. 一点五 -> decimnl { integer_part: "1" fractional_part: "5" }
-    e.g. 一点五万 -> decimal { integer_part: "1" fractional_part: "5" quantity: "万" }
+    e.g. 一点五 -> decimal { integer_part: "1" fractional_part: "5" }
+    e.g.
一点五万 -> decimal { integer_part: "1" fractional_part: "5" quantity: "万" } """ def __init__(self, cardinal: GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py index 458448fb4..bc4c8f60c 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): """ Fitite state transducer for classifying fractions - e.g., + e.g., 四分の三 -> fraction { denominator: "4" numerator: "3" } 一と四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } 一荷四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } @@ -36,7 +36,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): cardinal = cardinal.just_cardinals decimal = decimal.just_decimal - fraction_word = pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + fraction_word = ( + pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + ) integer_word = pynini.accep("と") | pynini.accep("荷") optional_sign = ( pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("マイナス", "-")) + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py index d478e5f4c..8f474cbb5 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): """ Fitite state transducer for classifying fractions - e.g., + e.g., 四分の三 -> fraction { denominator: "4" numerator: "3" } 一と四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } 一荷四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } @@ -36,7 +36,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): cardinal = cardinal.just_cardinals decimal = decimal.just_decimal - fraction_word = pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + fraction_word = ( + pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + ) inetegr_word = pynutil.delete("と") | pynutil.delete("荷") optional_sign = ( pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("マイナス", "-")) + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py index ad20ab82f..1f48bc273 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py @@ -22,7 +22,7 @@ class OrdinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. 第二十三 -> cardinal { morphsyntactic_feature: "第" integer: "23" } + e.g. 第二十三 -> cardinal { morphsyntactic_feature: "第" integer: "23" } e.g. 
百番目 -> cardinal { integer: "100" morphsyntactic_feature:"番目" } """ diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py index 742be01bb..26e053334 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py @@ -22,17 +22,19 @@ class PreProcessorFst(GraphFst): ''' - Preprocessing of TN: - 1. interjections removal such as '啊, 呃' - 2. fullwidth -> halfwidth char conversion - 好啊 -> 好 - 呃对 -> 对 - : -> : - ; -> ; + Preprocessing of TN: + 1. interjections removal such as '啊, 呃' + 2. fullwidth -> halfwidth char conversion + 好啊 -> 好 + 呃对 -> 对 + : -> : + ; -> ; ''' def __init__( - self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True, + self, + remove_interjections: bool = True, + fullwidth_to_halfwidth: bool = True, ): super().__init__(name="PreProcessor", kind="processor") diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py index 8477dfaa5..20ff3f34a 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py @@ -38,14 +38,18 @@ def __init__(self): minutes_seconds = pynini.string_file(get_abs_path("data/time_minutes_seconds.tsv")) hour_component = ( - pynutil.insert("hours: \"") + ((hours + pynutil.delete("時")) | pynini.accep("正午")) + pynutil.insert("\"") + pynutil.insert("hours: \"") + + ((hours + pynutil.delete("時")) | pynini.accep("正午")) + + pynutil.insert("\"") ) minute_component = ( pynutil.insert("minutes: \"") + ((minutes_seconds + pynutil.delete("分")) | pynini.accep("半")) + pynutil.insert("\"") ) - second_component = pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"") + second_component = ( + pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"") + ) graph_regular = ( pynini.closure(hour_component + insert_space + minute_component + insert_space + second_component) diff --git a/nemo_text_processing/inverse_text_normalization/ja/utils.py b/nemo_text_processing/inverse_text_normalization/ja/utils.py index bb0c588c2..28f7b70d8 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ja/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py index 60bdff8a1..62d41cb65 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py @@ -52,7 +52,7 @@ def __init__(self): + pynutil.delete("\"") ) - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py index cea461463..b765b338f 100644 --- 
a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py @@ -21,7 +21,7 @@ class DateFst(GraphFst): """ - Finite state transducer for verbalizing date, e.g., + Finite state transducer for verbalizing date, e.g., date { day: "1" } -> 1日 date { day: "5~9" } -> 5~9日 date { month: "1" } -> 1月 diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py index 7c37886f8..028864ee9 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self): """ Fitite state transducer for classifying fractions - e.g., + e.g., fraction { denominator: "4" numerator: "3" } -> 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py index cae890be5..2269f9999 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self): """ Fitite state transducer for classifying fractions - e.g., + e.g., fraction { denominator: "4" numerator: "3" } -> 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py index 4bafef0bd..8b196dcaf 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py index e78dba58c..103cfb7a8 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py @@ -29,14 +29,18 @@ class PostProcessor(GraphFst): ''' - Postprocessing of TN, now contains: - 1. punctuation removal - 2. letter case conversion - 3. oov tagger + Postprocessing of TN, now contains: + 1. punctuation removal + 2. letter case conversion + 3. 
oov tagger ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py index 386f1d4a1..8e95e14cf 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py @@ -22,11 +22,11 @@ class TimeFst(GraphFst): """ Finite state transducer for verbalizing time, e.g., - time { hours: "1" minutes: "0" } -> 1時30分 -> + time { hours: "1" minutes: "0" } -> 1時30分 -> time { hours: "5" minutes: "20" suffix: "過ぎ" } -> 5時20分 time { hours: "8" minutes: "半" suffix: "頃" } -> 8時半頃 - time { hours: "10" minutes: "25" suffix: "前" } -> 10時5分前 - time { hours: "正午" minutes: "1" suffix: "前" } -> 正午1分前 + time { hours: "10" minutes: "25" suffix: "前" } -> 10時5分前 + time { hours: "正午" minutes: "1" suffix: "前" } -> 正午1分前 time { hours: "正午" minutes: "10" suffix: "過ぎ" } -> 正午10分過ぎ """ @@ -40,12 +40,18 @@ def __init__(self): hours_component |= hours_component_alt minutes_component = ( - pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("分") + pynutil.delete("\"") + pynutil.delete("minutes: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.insert("分") + + pynutil.delete("\"") ) minutes_component_alt = pynutil.delete("minutes: \"") + pynini.accep("半") + pynutil.delete("\"") minutes_component |= minutes_component_alt second_component = ( - pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("秒") + pynutil.delete("\"") + pynutil.delete("seconds: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.insert("秒") + + pynutil.delete("\"") ) suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py index 8f68abe65..7624d5f1b 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py @@ -26,9 +26,7 @@ class VerbalizeFinalFst(GraphFst): - """ - - """ + """ """ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) @@ -49,7 +47,12 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() if far_file: diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py index debe75196..1c21ce8d3 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py +++ 
b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py @@ -21,7 +21,7 @@ class WhiteListFst(GraphFst): ''' - tokens { whitelist: "ATM" } -> A T M + tokens { whitelist: "ATM" } -> A T M ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py index d7c2cc874..621ae003e 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py @@ -21,7 +21,7 @@ class WordFst(GraphFst): ''' - tokens { char: "一" } -> 一 + tokens { char: "一" } -> 一 ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py index 27d0a35c5..8aa218a9a 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py @@ -91,7 +91,11 @@ def __init__(self): graph_arabs + delete_space + graph_crores + delete_space + graph_lakhs + delete_space + graph_thousands ) - graph = pynini.union(graph_higher_powers + delete_space + graph_hundreds, graph_hundred_unique, graph_zero,) + graph = pynini.union( + graph_higher_powers + delete_space + graph_hundreds, + graph_hundred_unique, + graph_zero, + ) graph = graph @ pynini.union( pynutil.delete(pynini.closure("०")) + pynini.difference(NEMO_DIGIT, "०") + pynini.closure(NEMO_DIGIT), "०" diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py index 96e8fb08d..15a75affc 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py @@ -46,7 +46,11 @@ def __init__(self, cardinal: GraphFst): + pynutil.add_weight(year_graph, -YEAR_WEIGHT) + pynutil.insert("\"") ) - optional_graph_year = pynini.closure(graph_year, 0, 1,) + optional_graph_year = pynini.closure( + graph_year, + 0, + 1, + ) graph_ad_bc = pynutil.insert("text: \"") + prefixes + delete_space + pynutil.insert("\"") graph_mdy = month_graph + ( diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py index 9434f77fe..92af8c7c3 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py @@ -51,12 +51,12 @@ def get_quantity(decimal, cardinal_fst): class DecimalFst(GraphFst): """ - Finite state transducer for classifying cardinals - e.g. तेहतीस पूर्णांक तीन -> decimal { integer_part: "३३" fractional_part: "३" } - e.g. उणे तेहतीस पूर्णांक तीन लाख -> decimal { negative: "true" integer_part: "३३" fractional_part: "३" quantity: "लाख" } + Finite state transducer for classifying cardinals + e.g. तेहतीस पूर्णांक तीन -> decimal { integer_part: "३३" fractional_part: "३" } + e.g. 
उणे तेहतीस पूर्णांक तीन लाख -> decimal { negative: "true" integer_part: "३३" fractional_part: "३" quantity: "लाख" } - Args: - cardinal: CardinalFst + Args: + cardinal: CardinalFst """ def __init__(self, cardinal: GraphFst): @@ -65,7 +65,9 @@ def __init__(self, cardinal: GraphFst): graph_digits = pynini.string_file(get_abs_path("data/numbers/digits.tsv")).invert() decimal_word = pynini.cross("पूर्णांक", "") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) graph_integer = ( pynutil.insert("integer_part: \"") diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py index c4b311e4b..b6e1080da 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py @@ -23,14 +23,14 @@ class TimeFst(GraphFst): """ - Finite state transducer for classifying time - e.g. साडे चार -> time { hours: "४" minutes: "३०" } - e.g. सव्वा बारा -> time { hours: "१२" minutes: "१५" } - e.g. पावणे दहा -> time { hours: "९" minutes: "४५" } - e.g. अकराला पाच मिनिटे -> time { hours: "१०" minutes: "५५" } - e.g. अकरा वाजून दोन मिनिटे -> time { hours: "११" minutes: "२" } - e.g. अडीच -> time { hours: "२" minutes: "३०" } - """ + Finite state transducer for classifying time + e.g. साडे चार -> time { hours: "४" minutes: "३०" } + e.g. सव्वा बारा -> time { hours: "१२" minutes: "१५" } + e.g. पावणे दहा -> time { hours: "९" minutes: "४५" } + e.g. अकराला पाच मिनिटे -> time { hours: "१०" minutes: "५५" } + e.g. अकरा वाजून दोन मिनिटे -> time { hours: "११" minutes: "२" } + e.g. अडीच -> time { hours: "२" minutes: "३०" } + """ def __init__(self): super().__init__(name="time", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/mr/utils.py b/nemo_text_processing/inverse_text_normalization/mr/utils.py index f7179e35b..1f1349115 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/utils.py +++ b/nemo_text_processing/inverse_text_normalization/mr/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py index 7cc99b311..15fcf6e45 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py @@ -21,11 +21,11 @@ class TimeFst(GraphFst): """ - Finite state transducer for verbalizing time, e.g. - e.g. time { hours: "४" minutes: "३०" } -> ०४:३० - e.g. time { hours: "११" minutes: "३०" } -> ११:३० - e.g. time { hours: "८" minutes: "१५" } -> ०८:१५ - """ + Finite state transducer for verbalizing time, e.g. + e.g. time { hours: "४" minutes: "३०" } -> ०४:३० + e.g. time { hours: "११" minutes: "३०" } -> ११:३० + e.g. 
time { hours: "८" minutes: "१५" } -> ०८:१५ + """ def __init__(self): super().__init__(name="time", kind="verbalize") diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py index 8eeea3876..59b30ae9e 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py @@ -171,9 +171,9 @@ def __init__(self, use_strict_e=False): ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) graph_hundred_component_prefix_e = graph_hundred_component_prefix_e.optimize() - graph_hundred_component_no_prefix = pynini.union(graph_hundreds + graph_e + graph_ties_component,) @ ( - pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT) - ) + graph_hundred_component_no_prefix = pynini.union( + graph_hundreds + graph_e + graph_ties_component, + ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) graph_hundred_component_no_prefix = graph_hundred_component_no_prefix.optimize() graph_mil_prefix_e = pynini.union( @@ -350,18 +350,13 @@ def __init__(self, use_strict_e=False): self.graph_no_exception = graph # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py index 20bab26f2..cfb6add51 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py @@ -21,7 +21,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "тысяча один" -> cardinal { integer: "1 001" } Args: diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py index bfa68b8a6..53a325c15 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py index d08b39589..d352284be 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py @@ -20,7 +20,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals. Numbers below ten are not converted. + Finite state transducer for classifying cardinals. Numbers below ten are not converted. Allows both compound numeral strings or separated by whitespace. e.g. minus tjugoen -> cardinal { negative: "-" integer: "21" } } diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py index 5bb6c63bc..5d9308958 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py @@ -31,7 +31,8 @@ class DateFst(GraphFst): """ def __init__( - self, tn_date_tagger: GraphFst, + self, + tn_date_tagger: GraphFst, ): super().__init__(name="date", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py index e39a9017a..97bd36582 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py @@ -49,7 +49,15 @@ def __init__(self, itn_cardinal_tagger: GraphFst, tn_decimal_tagger: GraphFst): self.final_graph_wo_sign = final_graph_wo_sign self.final_graph_wo_negative = ( - final_graph_wo_sign | get_quantity(final_graph_wo_sign, None, hundreds_no_one, None, False, True,) + final_graph_wo_sign + | get_quantity( + final_graph_wo_sign, + None, + hundreds_no_one, + None, + False, + True, + ) ).optimize() optional_minus_graph = pynini.closure(pynini.cross("minus ", "negative: \"true\" "), 0, 1) diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py index c1c2bc2a3..484efff78 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py @@ -22,7 +22,7 @@ class ElectronicFst(GraphFst): """ Finite state transducer for classifying electronic: email addresses, etc. e.g. c d f ett at a b c punkt e d u -> tokens { name: "cdf1.abc.edu" } - + Args: tn_electronic_tagger: TN eletronic tagger tn_electronic_verbalizer: TN eletronic verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py index 2ba361280..df56d8d7f 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py @@ -25,7 +25,7 @@ class FractionFst(GraphFst): e.g. halv -> tokens { name: "1/2" } e.g. ett och en halv -> tokens { name: "1 1/2" } e.g. 
tre och fyra femtedelar -> tokens { name: "3 4/5" } - + Args: itn_cardinal_tagger: ITN cardinal tagger tn_fraction_verbalizer: TN fraction verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py index 7c319e0f3..74369e70f 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py @@ -20,9 +20,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. noll åtta sjuhundraåttionio femtiotvå tjugofem -> tokens { name: "08-789 52 25" } - + Args: tn_cardinal_tagger: TN Cardinal Tagger """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py index cf8fdc202..311c14c36 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py @@ -62,7 +62,7 @@ class TimeFst(GraphFst): e.g. klockan tretton tio -> time { hours: "kl. 13" minutes: "10" } e.g. kvart i tolv -> time { minutes: "45" hours: "11" } e.g. kvart över tolv -> time { minutes: "15" hours: "12" } - + Args: tn_cardinal_tagger: TN cardinal verbalizer """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/utils.py b/nemo_text_processing/inverse_text_normalization/sv/utils.py index 0a7f1ff2d..e645db2dd 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/utils.py +++ b/nemo_text_processing/inverse_text_normalization/sv/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py index 272f047e1..643017c47 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "klockan" } tokens { name: "är" } tokens { time { hours: "12" minutes: "30" } } -> klockan är 12:30 """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py index 016df4f1d..155513937 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py @@ -134,7 +134,8 @@ def __init__(self): ) graph = graph @ pynini.union( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0", + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + "0", ) # don't convert cardinals from zero to nine inclusive @@ -145,7 +146,9 @@ def __init__(self): self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE, 0, 1, + pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE, + 0, + 1, ) final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"') diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py index b0cd8561a..21576efd5 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py @@ -59,7 +59,10 @@ def _get_year_graph(): def _get_digits_graph(): zero = pynini.cross((pynini.union("linh", "lẻ")), "0") four = pynini.cross("tư", "4") - graph = pynini.union(zero + delete_space + (graph_digit | four), graph_zero + delete_space + graph_digit,) + graph = pynini.union( + zero + delete_space + (graph_digit | four), + graph_zero + delete_space + graph_digit, + ) graph.optimize() return graph diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py index 033f3d86e..60c550228 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py @@ -123,10 +123,12 @@ def __init__(self, cardinal: GraphFst): final_graph = optional_graph_negative + final_graph_wo_sign self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( - final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, + final_graph_wo_sign, + cardinal.graph_hundred_component_at_least_one_none_zero_digit, ) final_graph |= optional_graph_negative + get_quantity( - final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, + final_graph_wo_sign, + cardinal.graph_hundred_component_at_least_one_none_zero_digit, ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py index 30d262722..2ad4d5bbf 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py @@ -70,7 +70,9 @@ def __init__(self): ) optional_zone = pynini.closure(zone, 0, 1) optional_second = pynini.closure( - delete_space + pynutil.insert(":") + (second @ 
add_leading_zero_to_double_digit), 0, 1, + delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit), + 0, + 1, ) graph_h = hour + pynutil.insert("h") diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index de1a7a28c..9c0199b13 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -86,7 +86,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.capitalize(), + ], # first letter capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -100,7 +103,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): print(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + [ + [written, spoken_no_space], + [written_capitalized, spoken_no_space.upper()], + ] ) else: additional_labels.extend( diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index c99ae25d2..0715a3988 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -25,7 +25,7 @@ def __init__(self): Fitite state transducer for classifying cardinals (e.g., 负五十 -> cardinal { negative: "-" integer: "50" }) This class converts cardinals up to hundred millions (i.e., (10**10)) Single unit digits are not converted (e.g., 五 -> 五) - Numbers less than 20 are not converted. + Numbers less than 20 are not converted. 
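
The "smaller numbers are not converted" behaviour described in these cardinal docstrings is implemented with the projection-and-subtraction idiom visible in the vi tagger hunk above (pynini.project(graph, "input") - graph_exception). A minimal, self-contained sketch of that idiom, not part of this patch; the toy spoken/written pairs below stand in for the real data/*.tsv files:

import pynini

# toy cardinal transducer; the real taggers load these pairs from data/*.tsv
graph = pynini.string_map([("一", "1"), ("二", "2"), ("二十一", "21")])
# spoken forms that should be left unconverted (here: the single digits)
exceptions = pynini.union("一", "二").optimize()
# accept only inputs outside the exception list, then apply the full graph
filtered = (pynini.project(graph, "input") - exceptions) @ graph
# "二十一" still rewrites to "21", while "一" is simply not accepted by filtered
print(pynini.shortestpath("二十一" @ filtered).string())  # -> 21
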
二十 (2 characters/logograms) is kept as it is but 二十一 (3 characters/logograms) would become 21 """ super().__init__(name="cardinal", kind="classify") @@ -110,7 +110,12 @@ def __init__(self): + graph_hundreds_complex ) | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) - | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) + | ( + graph_hundreds_complex + + delete_ten_thousands + + pynini.cross(pynini.closure("零"), "000") + + graph_digits + ) ) graph_millions = ( pynutil.add_weight(graph_millions_simple, -1.0) | graph_millions_complex | pynutil.insert("0000000") diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py index 331f0b7ff..108c222fd 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py @@ -61,7 +61,9 @@ def __init__(self): # graph_date = graph_year | graph_month | graph_day # grammar for optional prefix ad or bc - graph_bc_prefix = pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1) + graph_bc_prefix = ( + pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1) + ) graph_bc = pynutil.delete(graph_bc_prefix) graph_ad_prefix = ( diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py index c4911e832..49fd428c1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): Finite state transducer for classifying fraction e.g. 二分之一 -> tokens { fraction { denominator: "2" numerator: "1"} } e.g. 五又二分之一 -> tokens { fraction { integer_part: "1" denominator: "2" numerator: "1" } } - + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index e660b6015..477a82f5d 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -57,7 +57,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # yuan major plus minor major_symbol = pynini.accep("块") | pynini.cross("塊", "块") - tencent = pynini.accep("毛") | pynini.accep("角",) + tencent = pynini.accep("毛") | pynini.accep( + "角", + ) cent = pynini.accep("分") graph_kuai = ( graph_integer_component diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 96266df25..d183ad1ad 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -39,7 +39,7 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. 
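
"FAR" in these deployment docstrings is OpenFst's Finite State Archive, a container of named compiled grammars consumed by the tooling under NeMo/tools/text_processing_deployment. A rough sketch of the export step, assuming pynini's export helper; the file name and rule name here are illustrative only, not this patch's actual export code:

import pynini
from pynini.export import export

# stand-in for a compiled ClassifyFst.fst
fst = pynini.string_map([("二十一", "21")]).optimize()
exporter = export.Exporter("zh_itn.far")  # one .far can hold many named rules
exporter["tokenize_and_classify"] = fst   # deployment code looks rules up by name
exporter.close()
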
More details to deployment at NeMo/tools/text_processing_deployment. Args: @@ -48,7 +48,11 @@ class ClassifyFst(GraphFst): """ def __init__( - self, input_case: str, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False, + self, + input_case: str, + cache_dir: str = None, + whitelist: str = None, + overwrite_cache: bool = False, ): super().__init__(name="tokenize_and_classify", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py index 92336fe0f..8db669ff6 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/utils.py @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]): Args x: integer - Returns: spoken representation + Returns: spoken representation """ if isinstance(x, int): x = str(x) @@ -41,7 +41,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py index 31d5880dc..f33987173 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__(name="cardinal", kind="verbalize") # group numbers by three - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) suffix = pynini.union( diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index 28e2d5ff1..b36e44dfa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__(name="decimal", kind="verbalize") # group numbers by three - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # insert a "," for every three numbers before decimal point diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py index b379c4d94..5368e2c42 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -26,7 +26,7 @@ class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. 
""" diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py index 849cc690d..5538d8ed6 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/text_normalization/ar/taggers/cardinal.py index 9a8ba7cd4..a6ab7aca3 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/cardinal.py @@ -21,7 +21,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "9837" -> cardinal { integer: "تسعة اَلاف وثمان مئة وسبعة وثلاثون" } Args: diff --git a/nemo_text_processing/text_normalization/ar/taggers/decimal.py b/nemo_text_processing/text_normalization/ar/taggers/decimal.py index f276155e9..72d2dc47b 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/decimal.py @@ -21,8 +21,8 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - 321.7 --> ثلاث مئة وواحد وعشرون وسبعة من عشرة + Finite state transducer for classifying decimal, e.g. + 321.7 --> ثلاث مئة وواحد وعشرون وسبعة من عشرة -321.7 -> decimal { negative: "true" integer_part: "321" fractional_part: ".7" } cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/ar/taggers/fraction.py b/nemo_text_processing/text_normalization/ar/taggers/fraction.py index aad046011..1ef390506 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ar/taggers/fraction.py @@ -26,7 +26,7 @@ class FractionFst(GraphFst): tokens { fraction { integer_part: "واحد" numerator: "واحد" denominator: "نص" } } Args: - cardinal: cardinal fst + cardinal: cardinal fst """ def __init__(self, cardinal): diff --git a/nemo_text_processing/text_normalization/ar/taggers/measure.py b/nemo_text_processing/text_normalization/ar/taggers/measure.py index 707b40998..ce22f3d76 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/measure.py +++ b/nemo_text_processing/text_normalization/ar/taggers/measure.py @@ -55,7 +55,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( @@ -76,15 +78,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) subgraph_cardinal = ( - (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst - + insert_space - + pynini.closure(pynutil.delete(" "), 0, 1) - + unit_plural - | unit_plural - + pynini.closure(pynutil.delete(" "), 0, 1) - + insert_space - + (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst - ) + 
optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1") + ) @ cardinal.fst + insert_space + pynini.closure( + pynutil.delete(" "), 0, 1 + ) + unit_plural | unit_plural + pynini.closure( + pynutil.delete(" "), 0, 1 + ) + insert_space + ( + optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1") + ) @ cardinal.fst subgraph_cardinal |= ( (optional_graph_negative + pynini.accep("1")) @ cardinal.fst diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py index 0df176491..925fa348e 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/text_normalization/ar/taggers/money.py @@ -36,7 +36,7 @@ class MoneyFst(GraphFst): "$1,99" -> money { integer_part: "سبعة" currency_maj: "دولار" fractional_part: "تسعة وتسعون" currency_min: "سنت" preserve_order: true} "$0,10" -> money { fractional_part: "عشرة" currency_min: "بنسات" preserve_order: true } "$9" -> money { integer_part: "تسعة" currency_maj: "دولار" preserve_order: true} - + Args: cardinal: CardinalFst deterministic: if True will provide a single transduction option, @@ -142,7 +142,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) graph_with_no_minor_curr = integer_plus_maj - graph_with_no_minor_curr |= pynutil.add_weight(integer_plus_maj, weight=0.0001,) + graph_with_no_minor_curr |= pynutil.add_weight( + integer_plus_maj, + weight=0.0001, + ) graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order diff --git a/nemo_text_processing/text_normalization/ar/utils.py b/nemo_text_processing/text_normalization/ar/utils.py index fac39551c..1ad8f9927 100644 --- a/nemo_text_processing/text_normalization/ar/utils.py +++ b/nemo_text_processing/text_normalization/ar/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/measure.py b/nemo_text_processing/text_normalization/ar/verbalizers/measure.py index aaca02de0..b762eaa3b 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/measure.py @@ -27,7 +27,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for verbalizing measure, e.g. 
measure { cardinal { integer: "20" } units: "%" } -> "عشرون في المائة" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py index 8388f8e84..4145c2330 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py @@ -31,7 +31,7 @@ class VerbalizeFinalFst(GraphFst): """ Finite state transducer that verbalizes an entire sentence - + Args: deterministic: if True will provide a single transduction option, for False multiple options (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 01a85ec10..040a9e74c 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -50,7 +50,7 @@ def _load_kaggle_text_norm_file(file_path: str, to_lower: bool) -> List[Instance """ https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish Loads text file in the Kaggle Google text normalization file format: \t\t<`self` if trivial class or normalized text> - E.g. + E.g. PLAIN Brillantaisia PLAIN is PLAIN a @@ -66,7 +66,7 @@ def _load_kaggle_text_norm_file(file_path: str, to_lower: bool) -> List[Instance Args: file_path: file path to text file - Returns: flat list of instances + Returns: flat list of instances """ res = [] with open(file_path, 'r') as fp: @@ -91,7 +91,7 @@ def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file, to_ """ Load given list of text files using the `load_func` function. - Args: + Args: file_paths: list of file paths load_func: loading function @@ -119,7 +119,7 @@ def clean_generic(text: str) -> str: def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = None, verbose: bool = True) -> float: """ - Evaluates accuracy given predictions and labels. + Evaluates accuracy given predictions and labels. Args: preds: predictions @@ -250,7 +250,7 @@ def load_file(file_path: str) -> List[str]: """ Loads given text file with separate lines into list of string. - Args: + Args: file_path: file path Returns: flat list of string @@ -269,7 +269,7 @@ def write_file(file_path: str, data: List[str]): Args: file_path: file path data: list of string - + """ with open(file_path, 'w') as fp: for line in data: diff --git a/nemo_text_processing/text_normalization/de/taggers/cardinal.py b/nemo_text_processing/text_normalization/de/taggers/cardinal.py index bb14d2c95..902a62b3f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/cardinal.py @@ -66,7 +66,7 @@ def get_ties_digit(digit_path: str, tie_path: str) -> 'pynini.FstLike': class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. 
"101" -> cardinal { integer: "ein hundert und zehn" } Args: @@ -166,7 +166,7 @@ def thousand(): self.graph = ( ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1") @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) @@ -181,7 +181,7 @@ def thousand(): self.graph_hundred_component_at_least_one_none_zero_digit = ( ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1") @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 3 + @ NEMO_DIGIT**3 @ hundred_non_zero() ) | pynini.cross("1", "eins") diff --git a/nemo_text_processing/text_normalization/de/taggers/date.py b/nemo_text_processing/text_normalization/de/taggers/date.py index 673bd8868..8c13882d2 100644 --- a/nemo_text_processing/text_normalization/de/taggers/date.py +++ b/nemo_text_processing/text_normalization/de/taggers/date.py @@ -42,7 +42,7 @@ def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike': cardinal: cardinal GraphFst """ - year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT ** 2) @ cardinal.graph + year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT**2) @ cardinal.graph graph_two_digit = delete_leading_zero @ cardinal.two_digit_non_zero hundred = pynutil.insert("hundert") @@ -64,7 +64,7 @@ def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike': class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. "01.04.2010" -> date { day: "erster" month: "april" year: "zwei tausend zehn" preserve_order: true } "1994" -> date { year: "neunzehn vier und neuzig" } "1900" -> date { year: "neunzehn hundert" } diff --git a/nemo_text_processing/text_normalization/de/taggers/decimal.py b/nemo_text_processing/text_normalization/de/taggers/decimal.py index 6381d942b..8d1540110 100644 --- a/nemo_text_processing/text_normalization/de/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/de/taggers/decimal.py @@ -27,7 +27,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL e.g. 1 million -> integer_part: "eine" quantity: "million" e.g. 1.4 million -> integer_part: "eins" fractional_part: "vier" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -48,7 +48,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. 
-11,4006 billion -> decimal { negative: "true" integer_part: "elf" fractional_part: "vier null null sechs" quantity: "billion" preserve_order: true } 1 billion -> decimal { integer_part: "eins" quantity: "billion" preserve_order: true } Args: diff --git a/nemo_text_processing/text_normalization/de/taggers/measure.py b/nemo_text_processing/text_normalization/de/taggers/measure.py index 122ff8a67..a46822a0f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/measure.py +++ b/nemo_text_processing/text_normalization/de/taggers/measure.py @@ -82,7 +82,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/de/taggers/ordinal.py b/nemo_text_processing/text_normalization/de/taggers/ordinal.py index f446099df..a99e4e4a8 100644 --- a/nemo_text_processing/text_normalization/de/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "2." -> ordinal { integer: "zwei" } } "2tes" -> ordinal { integer: "zwei" } } diff --git a/nemo_text_processing/text_normalization/de/taggers/telephone.py b/nemo_text_processing/text_normalization/de/taggers/telephone.py index f40173b0f..97482a236 100644 --- a/nemo_text_processing/text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/de/taggers/telephone.py @@ -21,9 +21,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone, which includes country code, number part and extension + Finite state transducer for classifying telephone, which includes country code, number part and extension - E.g + E.g "+49 1234-1233" -> telephone { country_code: "plus neun und vierzig" number_part: "eins zwei drei vier eins zwei drei drei" preserve_order: true } "(012) 1234-1233" -> telephone { country_code: "null eins zwei" number_part: "eins zwei drei vier eins zwei drei drei" preserve_order: true } (0**) @@ -45,7 +45,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): numbers_with_single_digits = pynini.closure(graph_digit + insert_space) + graph_digit - two_digit_and_zero = (NEMO_DIGIT ** 2 @ cardinal.two_digit_non_zero) | graph_zero + two_digit_and_zero = (NEMO_DIGIT**2 @ cardinal.two_digit_non_zero) | graph_zero # def add_space_after_two_digit(): # return pynini.closure(two_digit_and_zero + insert_space) + ( # two_digit_and_zero diff --git a/nemo_text_processing/text_normalization/de/taggers/time.py b/nemo_text_processing/text_normalization/de/taggers/time.py index 371ad16ac..2fe74f5ba 100644 --- a/nemo_text_processing/text_normalization/de/taggers/time.py +++ b/nemo_text_processing/text_normalization/de/taggers/time.py @@ -65,7 +65,9 @@ def __init__(self, deterministic: bool = True): + pynutil.insert('"') ) final_time_zone_optional = pynini.closure( - pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), 0, 1, + pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), + 0, + 1, ) # Accepts the following formats: 02:30 Uhr, 02.30 Uhr, 2:30 Uhr, 2.30 Uhr diff --git 
a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py index e6590536f..646d7a6b7 100644 --- a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py @@ -70,7 +70,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -92,7 +93,10 @@ def __init__( self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -104,7 +108,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/de/utils.py b/nemo_text_processing/text_normalization/de/utils.py index d2dc9ce80..0b364938b 100644 --- a/nemo_text_processing/text_normalization/de/utils.py +++ b/nemo_text_processing/text_normalization/de/utils.py @@ -24,7 +24,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ abs_path = os.path.dirname(os.path.abspath(__file__)) + os.sep + rel_path diff --git a/nemo_text_processing/text_normalization/de/verbalizers/decimal.py b/nemo_text_processing/text_normalization/de/verbalizers/decimal.py index 915d5ab67..b544a2d6c 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/decimal.py @@ -26,8 +26,8 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "elf" fractional_part: "vier null sechs" quantity: "billionen" } -> minus elf komma vier null sechs billionen + Finite state transducer for classifying decimal, e.g. 
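The `far_file` logic in these tokenize_and_classify constructors compiles the grammar once and caches it as an OpenFst FAR archive, keyed by name. A sketch of the round trip, assuming pynini's `Far` accepts item assignment in write mode (the `/tmp/demo.far` path and the toy grammar are placeholders):

import pynini

graph = pynini.cross("2", "two").optimize()

# Write the compiled grammar under a named key ...
far = pynini.Far("/tmp/demo.far", mode="w")
far["tokenize_and_classify"] = graph
far.close()

# ... and load it back by key instead of recompiling, which is what the
# cache_dir / overwrite_cache branch above does.
cached = pynini.Far("/tmp/demo.far", mode="r")["tokenize_and_classify"]
print(pynini.shortestpath("2" @ cached).string())  # -> "two"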
+ decimal { negative: "true" integer_part: "elf" fractional_part: "vier null sechs" quantity: "billionen" } -> minus elf komma vier null sechs billionen decimal { integer_part: "eins" quantity: "billion" } -> eins billion """ diff --git a/nemo_text_processing/text_normalization/de/verbalizers/measure.py b/nemo_text_processing/text_normalization/de/verbalizers/measure.py index 41f7fb89c..675659044 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/measure.py @@ -28,7 +28,7 @@ class MeasureFst(GraphFst): Finite state transducer for verbalizing measure, e.g. measure { cardinal { integer: "zwei" units: "unzen" } } -> "zwei unzen" measure { cardinal { integer_part: "zwei" quantity: "millionen" units: "unzen" } } -> "zwei millionen unzen" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py index f8d5f6967..d4ea8eb09 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py @@ -43,7 +43,10 @@ def __init__(self, deterministic: bool = True): self.ordinal_stem = graph_digit | graph_ties | graph_thousands suffix = pynini.cdrewrite( - pynini.closure(self.ordinal_stem, 0, 1) + convert_rest, "", "[EOS]", NEMO_SIGMA, + pynini.closure(self.ordinal_stem, 0, 1) + convert_rest, + "", + "[EOS]", + NEMO_SIGMA, ).optimize() self.graph = pynini.compose(graph, suffix) self.suffix = suffix diff --git a/nemo_text_processing/text_normalization/de/verbalizers/telephone.py b/nemo_text_processing/text_normalization/de/verbalizers/telephone.py index 7a50e785f..5bae8fe2d 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/telephone.py @@ -21,7 +21,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for verbalizing telephone, e.g. 
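The ordinal verbalizers above attach suffixes with a single `pynini.cdrewrite` anchored at `"[EOS]"`. A reduced, English-flavored sketch of that pattern (toy lowercase alphabet and mappings, with a weighted fallback added for a deterministic shortest path; this is not the German rule set itself):

import pynini
from pynini.lib import pynutil

SIGMA = pynini.closure(pynini.union(*"abcdefghijklmnopqrstuvwxyz"))

# At end of string only: rewrite "ty" -> "tieth"; otherwise fall back to
# appending "th". The penalty makes the fallback lose whenever the
# rewrite applies.
suffix = pynini.cdrewrite(
    pynini.cross("ty", "tieth") | pynutil.add_weight(pynutil.insert("th"), 1.0),
    "",
    "[EOS]",
    SIGMA,
).optimize()

print(pynini.shortestpath("twenty" @ suffix).string())  # -> "twentieth"
print(pynini.shortestpath("seven" @ suffix).string())   # -> "seventh"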
- telephone { country_code: "plus neun und vierzig" number_part: "null eins eins eins null null null" } + telephone { country_code: "plus neun und vierzig" number_part: "null eins eins eins null null null" } -> "plus neun und vierzig null eins eins eins null null null" Args: diff --git a/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py index f4e19ea0f..6cda902f8 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py @@ -31,7 +31,7 @@ class VerbalizeFinalFst(GraphFst): """ Finite state transducer that verbalizes an entire sentence - + Args: deterministic: if True will provide a single transduction option, for False multiple options (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/en/clean_eval_data.py b/nemo_text_processing/text_normalization/en/clean_eval_data.py index a7dc24310..9d0aaed6b 100644 --- a/nemo_text_processing/text_normalization/en/clean_eval_data.py +++ b/nemo_text_processing/text_normalization/en/clean_eval_data.py @@ -67,7 +67,7 @@ def process(self, instance: Instance) -> Instance: Args: processes given instance with process function - + Returns: processed instance if instance belongs to expected class type or original instance """ if instance.token_type != self.class_type: diff --git a/nemo_text_processing/text_normalization/en/graph_utils.py b/nemo_text_processing/text_normalization/en/graph_utils.py index 161e5d97e..668e1fb7c 100644 --- a/nemo_text_processing/text_normalization/en/graph_utils.py +++ b/nemo_text_processing/text_normalization/en/graph_utils.py @@ -103,14 +103,36 @@ suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "w", + "x", + "y", + "z", ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, + suppletive, + plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), + NEMO_SIGMA, ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -125,7 +147,9 @@ def capitalized_input_graph( - graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, + graph: "pynini.FstLike", + original_graph_weight: float = None, + capitalized_graph_weight: float = None, ) -> "pynini.FstLike": """ Allow graph input to be capitalized, e.g. 
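`plurals._priority_union` in the graph_utils hunk above layers plural rules so that more specific transducers (suppletives, "-ies", "-es") shadow the generic append-"s" fallback. A self-contained sketch of the underlying construction, with a hypothetical `priority_union` helper and a two-entry toy lexicon:

import pynini
from pynini.lib import pynutil

SIGMA = pynini.closure(pynini.union(*"abcdefghijklmnopqrstuvwxyz"))

def priority_union(first, fallback):
    # Use `first` wherever it accepts the input; route everything else
    # through `fallback`.
    handled = pynini.project(first, "input").optimize()
    return pynini.union(first, pynini.difference(SIGMA, handled) @ fallback)

irregular = pynini.string_map([("child", "children"), ("foot", "feet")])
regular = SIGMA + pynutil.insert("s")  # default rule: append "s"

plural = priority_union(irregular, regular).optimize()
print(pynini.shortestpath("foot" @ plural).string())  # -> "feet"
print(pynini.shortestpath("cat" @ plural).string())   # -> "cats"

Because the fallback is restricted to the complement of the first transducer's domain, the result never produces both "feet" and "foots" for the same input.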
for ITN) @@ -209,7 +233,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.capitalize(), + ], # first letter capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -223,7 +250,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): logger.debug(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + [ + [written, spoken_no_space], + [written_capitalized, spoken_no_space.upper()], + ] ) else: additional_labels.extend( diff --git a/nemo_text_processing/text_normalization/en/taggers/cardinal.py b/nemo_text_processing/text_normalization/en/taggers/cardinal.py index 616e018e3..5e2a8535c 100644 --- a/nemo_text_processing/text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/cardinal.py @@ -30,7 +30,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. -23 -> cardinal { negative: "true" integer: "twenty three" } Args: @@ -83,7 +83,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): graph = ( pynini.closure(NEMO_DIGIT, 1, 3) - + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3) | pynini.closure(NEMO_DIGIT ** 3)) + + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT**3) | pynini.closure(NEMO_DIGIT**3)) ) @ graph self.graph = graph @@ -118,7 +118,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) final_graph |= pynini.compose(final_graph, one_to_a_replacement_graph.optimize() + NEMO_SIGMA).optimize() # remove commas for 4 digits numbers - four_digit_comma_graph = (NEMO_DIGIT - "0") + pynutil.delete(",") + NEMO_DIGIT ** 3 + four_digit_comma_graph = (NEMO_DIGIT - "0") + pynutil.delete(",") + NEMO_DIGIT**3 final_graph |= pynini.compose(four_digit_comma_graph.optimize(), final_graph).optimize() self.final_graph = final_graph diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index c5e3dd418..52225f0ba 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -43,7 +43,7 @@ def get_ties_graph(deterministic: bool = True): """ - Returns two digit transducer, e.g. + Returns two digit transducer, e.g. 03 -> o three 12 -> twelve 20 -> twenty @@ -119,18 +119,18 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True): Transducer for year, only from 1000 - 2999 e.g. 1290 -> twelve ninety 2000 - 2009 will be verbalized as two thousand. - + Transducer for 3 digit year, e.g. 
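The cardinal hunk above strips thousands separators by accepting one to three leading digits followed by comma-delimited blocks of exactly three, deleting the commas before the plain cardinal grammar applies. A minimal sketch of that grouping idiom (`DIGIT` and `grouped` are illustrative names):

import pynini
from pynini.lib import pynutil

DIGIT = pynini.union(*"0123456789")

# 1-3 leading digits, then any number of ",ddd" blocks with the comma
# deleted, so "12,345,678" is validated and rewritten in one pass.
grouped = DIGIT ** (1, 3) + pynini.closure(pynutil.delete(",") + DIGIT ** 3)

print(pynini.shortestpath("12,345,678" @ grouped).string())  # -> "12345678"

Requiring exactly three digits per block means malformed inputs such as "1,23" simply fail to compose rather than being silently misread.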
123-> one twenty three - + Transducer for year with suffix 123 A.D., 4200 B.C """ graph = get_four_digit_year_graph(deterministic) - graph = (pynini.union("1", "2") + (NEMO_DIGIT ** 3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph + graph = (pynini.union("1", "2") + (NEMO_DIGIT**3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph graph |= _get_two_digit_year_with_s_graph() - three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph + three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT**2) @ cardinal_graph year_with_suffix = ( (get_four_digit_year_graph(deterministic=True) | three_digit_year) + delete_space + insert_space + year_suffix ) @@ -159,7 +159,7 @@ def _get_financial_period_graph(): class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. jan. 5, 2012 -> date { month: "january" day: "five" year: "twenty twelve" preserve_order: true } jan. 5 -> date { month: "january" day: "five" preserve_order: true } 5 january 2012 -> date { day: "five" month: "january" year: "twenty twelve" preserve_order: true } @@ -270,7 +270,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False): ) graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year - day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph + day_ex_month = (NEMO_DIGIT**2 - pynini.project(month_numbers_graph, "input")) @ day_graph for x in ["-", "/", "."]: delete_sep = pynutil.delete(x) graph_dmy |= ( diff --git a/nemo_text_processing/text_normalization/en/taggers/decimal.py b/nemo_text_processing/text_normalization/en/taggers/decimal.py index df9a3bddb..f68d7da4b 100644 --- a/nemo_text_processing/text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/en/taggers/decimal.py @@ -31,7 +31,7 @@ def get_quantity( e.g. 1 million -> integer_part: "one" quantity: "million" e.g. 1.5 million -> integer_part: "one" fractional_part: "five" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -63,7 +63,7 @@ def get_quantity( class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. 
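The year graphs above read a four-digit year as two two-digit groups. A toy sketch of that split, with a deliberately tiny `two_digit` lexicon standing in for the real cardinal graph:

import pynini
from pynini.lib import pynutil

DIGIT = pynini.union(*"0123456789")
insert_space = pynutil.insert(" ")

# Stand-in for cardinal.two_digit_non_zero; only what the demo needs.
two_digit = pynini.string_map([("19", "nineteen"), ("84", "eighty four")])

# "1984" -> "nineteen eighty four": each half is composed with the
# two-digit reader, with a space inserted between the halves.
year = (DIGIT ** 2) @ two_digit + insert_space + (DIGIT ** 2) @ two_digit

print(pynini.shortestpath("1984" @ year).string())  # -> "nineteen eighty four"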
-12.5006 billion -> decimal { negative: "true" integer_part: "12" fractional_part: "five o o six" quantity: "billion" } 1 billion -> decimal { integer_part: "one" quantity: "billion" } diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index 3262c7485..874d2e437 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -49,9 +49,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): else: numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ") - cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), + MIN_NEG_WEIGHT, + ) - cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), + MIN_NEG_WEIGHT, + ) accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") @@ -59,10 +65,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" ) - dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT,) + dict_words = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/words.tsv")), + MIN_NEG_WEIGHT, + ) dict_words_without_delimiter = dict_words + pynini.closure( - pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1, + pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), + 1, ) dict_words_graph = dict_words_without_delimiter | dict_words diff --git a/nemo_text_processing/text_normalization/en/taggers/measure.py b/nemo_text_processing/text_normalization/en/taggers/measure.py index fc61620ce..e8d92e1da 100644 --- a/nemo_text_processing/text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/text_normalization/en/taggers/measure.py @@ -53,7 +53,11 @@ class MeasureFst(GraphFst): """ def __init__( - self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, + self, + cardinal: GraphFst, + decimal: GraphFst, + fraction: GraphFst, + deterministic: bool = True, ): super().__init__(name="measure", kind="classify", deterministic=deterministic) cardinal_graph = cardinal.graph_with_and | self.get_range(cardinal.graph_with_and) @@ -63,7 +67,8 @@ def __init__( graph_unit |= pynini.string_file(get_abs_path("data/measure/unit_alternatives.tsv")) graph_unit |= pynini.compose( - pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), graph_unit, + pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), + graph_unit, ).optimize() graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL) @@ -76,7 +81,9 @@ def __init__( ) optional_graph_unit2 = pynini.closure( - delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1, + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, + 0, + 1, ) unit_plural = ( @@ -250,11 +257,12 @@ def get_address_graph(self, cardinal): ordinal_verbalizer = OrdinalVerbalizer().graph ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph 
ordinal_num = pynini.compose( - pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'), ordinal_verbalizer, + pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'), + ordinal_verbalizer, ) address_num = NEMO_DIGIT ** (1, 2) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit - address_num += insert_space + NEMO_DIGIT ** 2 @ ( + address_num += insert_space + NEMO_DIGIT**2 @ ( pynini.closure(pynini.cross("0", "zero "), 0, 1) + cardinal.graph_hundred_component_at_least_one_none_zero_digit ) @@ -292,8 +300,12 @@ def get_address_graph(self, cardinal): state = pynini.invert(state_graph) state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1) - zip_code = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph) - zip_code = pynini.closure(pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, 0, 1,) + zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph) + zip_code = pynini.closure( + pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, + 0, + 1, + ) address = address_num + direction + address_words + pynini.closure(city + state + zip_code, 0, 1) diff --git a/nemo_text_processing/text_normalization/en/taggers/money.py b/nemo_text_processing/text_normalization/en/taggers/money.py index ef38c56b5..0687b0c1a 100644 --- a/nemo_text_processing/text_normalization/en/taggers/money.py +++ b/nemo_text_processing/text_normalization/en/taggers/money.py @@ -112,7 +112,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular integer_plus_maj_with_comma = pynini.compose( - NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj, + NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), + integer_plus_maj, ) integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj) integer_plus_maj |= integer_plus_maj_with_comma @@ -189,7 +190,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = final_graph |= integer_graph_reordered | decimal_default_reordered # to handle "$2.00" cases final_graph |= pynini.compose( - NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered, + NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), + integer_graph_reordered, ) final_graph += graph_per_units.ques diff --git a/nemo_text_processing/text_normalization/en/taggers/ordinal.py b/nemo_text_processing/text_normalization/en/taggers/ordinal.py index 70ae2d70d..8687b493c 100644 --- a/nemo_text_processing/text_normalization/en/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal, e.g. 
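`pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)` in the address hunk below forces ZIP codes to be read digit by digit. A reduced sketch of the same composition, with the digit names spelled out inline rather than loaded from a TSV:

import pynini
from pynini.lib import pynutil

digit_names = pynini.string_map([
    ("0", "zero"), ("1", "one"), ("2", "two"), ("3", "three"), ("4", "four"),
    ("5", "five"), ("6", "six"), ("7", "seven"), ("8", "eight"), ("9", "nine"),
])
DIGIT = pynini.project(digit_names, "input")

# Read each digit separately, space-separated; composing with DIGIT ** 5
# restricts the reader to exactly five digits.
single_digits = digit_names + pynini.closure(pynutil.insert(" ") + digit_names)
zip_code = pynini.compose(DIGIT ** 5, single_digits)

print(pynini.shortestpath("90210" @ zip_code).string())
# -> "nine zero two one zero"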
13th -> ordinal { integer: "thirteen" } - + Args: cardinal: CardinalFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/taggers/range.py b/nemo_text_processing/text_normalization/en/taggers/range.py index 5e0d017d4..c989e99f5 100644 --- a/nemo_text_processing/text_normalization/en/taggers/range.py +++ b/nemo_text_processing/text_normalization/en/taggers/range.py @@ -22,7 +22,7 @@ class RangeFst(GraphFst): """ This class is a composite class of two other class instances - + Args: time: composed tagger and verbalizer date: composed tagger and verbalizer @@ -33,7 +33,12 @@ class RangeFst(GraphFst): """ def __init__( - self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False, + self, + time: GraphFst, + date: GraphFst, + cardinal: GraphFst, + deterministic: bool = True, + lm: bool = False, ): super().__init__(name="range", kind="classify", deterministic=deterministic) @@ -47,14 +52,14 @@ def __init__( cardinal = cardinal.graph_with_and # YEAR - date_year_four_digit = (NEMO_DIGIT ** 4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date - date_year_two_digit = (NEMO_DIGIT ** 2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date + date_year_four_digit = (NEMO_DIGIT**4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date + date_year_two_digit = (NEMO_DIGIT**2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date year_to_year_graph = ( date_year_four_digit + delete_space + pynini.cross("-", " to ") + delete_space - + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT ** 2 @ cardinal)) + + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT**2 @ cardinal)) ) mid_year_graph = pynini.accep("mid") + pynini.cross("-", " ") + (date_year_four_digit | date_year_two_digit) diff --git a/nemo_text_processing/text_normalization/en/taggers/serial.py b/nemo_text_processing/text_normalization/en/taggers/serial.py index e1a76dd63..f650c8ff3 100644 --- a/nemo_text_processing/text_normalization/en/taggers/serial.py +++ b/nemo_text_processing/text_normalization/en/taggers/serial.py @@ -31,7 +31,7 @@ class SerialFst(GraphFst): """ This class is a composite class of two other class instances - + Args: time: composed tagger and verbalizer date: composed tagger and verbalizer @@ -71,7 +71,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = num_graph |= pynini.compose(num_graph, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA) # also allow double digits to be pronounced as integer in serial number num_graph |= pynutil.add_weight( - NEMO_DIGIT ** 2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001 + NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001 ) # add space between letter and digit/symbol diff --git a/nemo_text_processing/text_normalization/en/taggers/telephone.py b/nemo_text_processing/text_normalization/en/taggers/telephone.py index 06d791264..aa9865928 100644 --- a/nemo_text_processing/text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/en/taggers/telephone.py @@ -30,11 +30,11 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone, and IP, and SSN which includes country code, number part and extension - country code optional: +*** + Finite state transducer for classifying telephone, and IP, and SSN which includes country code, number part and extension + country code optional: +*** number part: ***-***-****, or (***) 
***-**** extension optional: 1-9999 - E.g + E.g +1 123-123-5678-1 -> telephone { country_code: "one" number_part: "one two three, one two three, five six seven eight" extension: "one" } 1-800-GO-U-HAUL -> telephone { country_code: "one" number_part: "one, eight hundred GO U HAUL" } Args: diff --git a/nemo_text_processing/text_normalization/en/taggers/time.py b/nemo_text_processing/text_normalization/en/taggers/time.py index a66f18314..b9e4e824f 100644 --- a/nemo_text_processing/text_normalization/en/taggers/time.py +++ b/nemo_text_processing/text_normalization/en/taggers/time.py @@ -41,7 +41,7 @@ class TimeFst(GraphFst): 02:00 -> time { hours: "two" } 2:00 -> time { hours: "two" } 10:00:05 a.m. -> time { hours: "ten" minutes: "zero" seconds: "five" suffix: "a m" } - + Args: cardinal: CardinalFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py index 28614fad1..7a253cccc 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py @@ -78,7 +78,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", + cache_dir, + f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -107,7 +108,12 @@ def __init__( logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") start_time = time.time() - measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic,) + measure = MeasureFst( + cardinal=cardinal, + decimal=decimal, + fraction=fraction, + deterministic=deterministic, + ) measure_graph = measure.fst logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") @@ -157,7 +163,10 @@ def __init__( time_final = pynini.compose(time_graph, v_time_graph) date_final = pynini.compose(date_graph, v_date_graph) range_graph = RangeFst( - time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic, + time=time_final, + date=date_final, + cardinal=cardinal, + deterministic=deterministic, ).fst logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py index 95c22bcbe..5fc8bdbaf 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py @@ -65,7 +65,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. 
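`RangeFst` in the tokenize_and_classify hunk above is handed `time_final` and `date_final`, i.e. taggers pre-composed with their verbalizers, so the range logic can operate on final spoken strings ("two to three") rather than token markup. A toy sketch of that composition (the markup string and mappings are simplified placeholders, not the real grammars):

import pynini
from pynini.lib import pynutil

# Toy tagger: raw text to token markup.
tagger = pynini.cross("2:00", 'time { hours: "two" }')
# Toy verbalizer: strips the markup back off, leaving the spoken form.
verbalizer = (
    pynutil.delete('time { hours: "') + pynini.accep("two") + pynutil.delete('" }')
)

# Composing the two yields a single FST from written to spoken form.
time_final = pynini.compose(tagger, verbalizer)
print(pynini.shortestpath("2:00" @ time_final).string())  # -> "two"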
deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py index 110747cab..239984a80 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py @@ -65,7 +65,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/utils.py b/nemo_text_processing/text_normalization/en/utils.py index 31d9ec635..a2a765a06 100644 --- a/nemo_text_processing/text_normalization/en/utils.py +++ b/nemo_text_processing/text_normalization/en/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/text_normalization/en/verbalizers/measure.py b/nemo_text_processing/text_normalization/en/verbalizers/measure.py index ae5fa8800..c998a809f 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/measure.py @@ -30,7 +30,7 @@ class MeasureFst(GraphFst): measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" } -> minus twelve kilograms measure { decimal { integer_part: "twelve" fractional_part: "five" } units: "kilograms" } -> twelve point five kilograms tokens { measure { units: "covid" decimal { integer_part: "nineteen" fractional_part: "five" } } } -> covid nineteen point five - + Args: decimal: DecimalFst cardinal: CardinalFst diff --git a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py index 4ad7d1c85..dff205f8e 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py @@ -46,7 +46,10 @@ def __init__(self, deterministic: bool = True): convert_rest = pynutil.insert("th") suffix = pynini.cdrewrite( - graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest, "", "[EOS]", NEMO_SIGMA, + graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest, + "", + "[EOS]", + NEMO_SIGMA, ).optimize() self.graph = pynini.compose(graph, suffix) self.suffix = suffix diff --git a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py index b64abf6a2..33a472656 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py +++ 
b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py @@ -99,10 +99,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ punct_marks_all = PunctuationFst().punct_marks diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index 101185a90..946f4234e 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -107,7 +107,10 @@ def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike": """ fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA) fem_allign @= pynini.cdrewrite( - fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA, + fem_ones, + "", + pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), + NEMO_SIGMA, ) # If before a quote or EOS, we know it's the end of a string return fst @ fem_allign diff --git a/nemo_text_processing/text_normalization/es/taggers/cardinal.py b/nemo_text_processing/text_normalization/es/taggers/cardinal.py index 1b8f0a440..85402089f 100644 --- a/nemo_text_processing/text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/cardinal.py @@ -47,7 +47,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_string = pynini.closure( @@ -157,7 +157,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/es/taggers/date.py b/nemo_text_processing/text_normalization/es/taggers/date.py index ea7f15292..dd5cd7f0e 100644 --- a/nemo_text_processing/text_normalization/es/taggers/date.py +++ b/nemo_text_processing/text_normalization/es/taggers/date.py @@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool): dash = "-" day_optional = pynini.closure(pynini.cross(dash, NEMO_SPACE) + day, 0, 1) - graph_ymd = NEMO_DIGIT ** 4 @ year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional + graph_ymd = NEMO_DIGIT**4 @ year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional final_graph = graph_dmy + pynutil.insert(" preserve_order: true") final_graph |= graph_ymd diff --git a/nemo_text_processing/text_normalization/es/taggers/fraction.py b/nemo_text_processing/text_normalization/es/taggers/fraction.py index 1fb5b8118..7bbe86402 100644 --- 
a/nemo_text_processing/text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/es/taggers/fraction.py @@ -47,15 +47,50 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = ordinal_graph = ordinal.graph # 2-10 are all ordinals - three_to_ten = pynini.string_map(["2", "3", "4", "5", "6", "7", "8", "9", "10",]) + three_to_ten = pynini.string_map( + [ + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + ] + ) block_three_to_ten = pynutil.delete(three_to_ten) # To block cardinal productions if not deterministic: # Multiples of tens are sometimes rendered as ordinals - three_to_ten |= pynini.string_map(["20", "30", "40", "50", "60", "70", "80", "90",]) + three_to_ten |= pynini.string_map( + [ + "20", + "30", + "40", + "50", + "60", + "70", + "80", + "90", + ] + ) graph_three_to_ten = three_to_ten @ ordinal_graph graph_three_to_ten @= pynini.cdrewrite(ordinal_exceptions, "", "", NEMO_SIGMA) # Higher powers of tens (and multiples) are converted to ordinals. - hundreds = pynini.string_map(["100", "200", "300", "400", "500", "600", "700", "800", "900",]) + hundreds = pynini.string_map( + [ + "100", + "200", + "300", + "400", + "500", + "600", + "700", + "800", + "900", + ] + ) graph_hundreds = hundreds @ ordinal_graph multiples_of_thousand = ordinal.multiples_of_thousand # So we can have X milésimos @@ -68,7 +103,10 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = graph_higher_powers_of_ten += higher_powers_of_ten graph_higher_powers_of_ten = cardinal_graph @ graph_higher_powers_of_ten graph_higher_powers_of_ten @= pynini.cdrewrite( - pynutil.delete("un "), pynini.accep("[BOS]"), pynini.project(higher_powers_of_ten, "output"), NEMO_SIGMA, + pynutil.delete("un "), + pynini.accep("[BOS]"), + pynini.project(higher_powers_of_ten, "output"), + NEMO_SIGMA, ) # we drop 'un' from these ordinals (millionths, not one-millionths) graph_higher_powers_of_ten = multiples_of_thousand | graph_hundreds | graph_higher_powers_of_ten @@ -83,10 +121,16 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = # Blocking the digits and hundreds from Cardinal graph graph_fractions_cardinals = pynini.cdrewrite( - block_three_to_ten | block_higher_powers_of_ten, pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, + block_three_to_ten | block_higher_powers_of_ten, + pynini.accep("[BOS]"), + pynini.accep("[EOS]"), + NEMO_SIGMA, ) graph_fractions_cardinals @= NEMO_CHAR.plus @ pynini.cdrewrite( - pynutil.delete("0"), pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, + pynutil.delete("0"), + pynini.accep("[BOS]"), + pynini.accep("[EOS]"), + NEMO_SIGMA, ) # Empty characters become '0' for NEMO_CHAR fst, so need to block graph_fractions_cardinals @= cardinal_graph graph_fractions_cardinals += pynutil.insert( diff --git a/nemo_text_processing/text_normalization/es/taggers/measure.py b/nemo_text_processing/text_normalization/es/taggers/measure.py index a1933dbed..a63677c47 100644 --- a/nemo_text_processing/text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/text_normalization/es/taggers/measure.py @@ -79,7 +79,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) complex_unit_singular_graph = ( diff --git 
a/nemo_text_processing/text_normalization/es/taggers/ordinal.py b/nemo_text_processing/text_normalization/es/taggers/ordinal.py index 8af8773e5..d1822103a 100644 --- a/nemo_text_processing/text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/ordinal.py @@ -49,7 +49,7 @@ def get_one_to_one_thousand(cardinal: 'pynini.FstLike') -> 'pynini.FstLike': class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal - "21.º" -> ordinal { integer: "vigésimo primero" morphosyntactic_features: "gender_masc" } + "21.º" -> ordinal { integer: "vigésimo primero" morphosyntactic_features: "gender_masc" } This class converts ordinal up to the millionth (millonésimo) order (exclusive). This FST also records the ending of the ordinal (called "morphosyntactic_features"): diff --git a/nemo_text_processing/text_normalization/es/taggers/telephone.py b/nemo_text_processing/text_normalization/es/taggers/telephone.py index 83efc587c..1cc332f07 100644 --- a/nemo_text_processing/text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/es/taggers/telephone.py @@ -43,8 +43,8 @@ class TelephoneFst(GraphFst): (we ignore more complicated cases such as "doscientos y dos" or "tres nueves"). Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/es/taggers/time.py b/nemo_text_processing/text_normalization/es/taggers/time.py index 4a947dd31..de2752657 100644 --- a/nemo_text_processing/text_normalization/es/taggers/time.py +++ b/nemo_text_processing/text_normalization/es/taggers/time.py @@ -115,7 +115,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): time_zone_graph = time_zones + pynini.closure(utc_or_gmt_diff, 0, 1) final_time_zone_optional = pynini.closure( - delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), 0, 1, + delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), + 0, + 1, ) # 02.30 h diff --git a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py index 5aa66031a..165f5eeca 100644 --- a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py @@ -69,7 +69,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -86,10 +87,17 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + 
deterministic=deterministic, + ) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -101,7 +109,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py index 972100be8..968075e11 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py @@ -24,13 +24,13 @@ class CardinalFst(GraphFst): """ - Finite state transducer for verbalizing cardinals - e.g. cardinal { integer: "dos" } -> "dos" + Finite state transducer for verbalizing cardinals + e.g. cardinal { integer: "dos" } -> "dos" - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/decimals.py b/nemo_text_processing/text_normalization/es/verbalizers/decimals.py index 3a94899fc..4feedd37d 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/decimals.py @@ -32,14 +32,14 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones - decimal { integer_part: "un" quantity: "billón" } -> un billón + Finite state transducer for classifying decimal, e.g. 
+ decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones + decimal { integer_part: "un" quantity: "billón" } -> un billón Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py index 094098f2e..5d7afc1b7 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py @@ -33,15 +33,15 @@ class FractionFst(GraphFst): """ - Finite state transducer for verbalizing fraction - e.g. tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> - treinta y tres y cuatro quintos + Finite state transducer for verbalizing fraction + e.g. tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> + treinta y tres y cuatro quintos - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) @@ -141,7 +141,8 @@ def __init__(self, deterministic: bool = True): fraction_with_one_fem = numerator_one_fem + delete_space + insert_space fraction_with_one_fem += pynini.union( - denominator_singular_fem @ merge_stem, denominator_singular_fem @ merge_into_single_word, + denominator_singular_fem @ merge_stem, + denominator_singular_fem @ merge_into_single_word, ) # Both forms exists fraction_with_one_fem += pynutil.insert(" parte") fraction_with_one_fem @= pynini.cdrewrite( @@ -150,7 +151,8 @@ def __init__(self, deterministic: bool = True): fraction_default_fem = numerator_fem + delete_space + insert_space fraction_default_fem += pynini.union( - denominator_plural_fem @ merge_stem, denominator_plural_fem @ merge_into_single_word, + denominator_plural_fem @ merge_stem, + denominator_plural_fem @ merge_into_single_word, ) fraction_default_fem += pynutil.insert(" partes") diff --git a/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py index 4def8307a..039b00de5 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): Finite state transducer for verbalizing ordinals e.g. 
ordinal { integer: "tercer" } } -> "tercero" -> "tercera" - -> "tercer" + -> "tercer" Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/fr/taggers/ordinal.py b/nemo_text_processing/text_normalization/fr/taggers/ordinal.py index d3afb13da..73b42053c 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/fr/taggers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal - "2e" -> ordinal { integer: "deux" morphosyntactic_features: "ième" } + "2e" -> ordinal { integer: "deux" morphosyntactic_features: "ième" } This grammar covers from single digits to hundreds of billions ("milliardième" in French). This FST also records the ending of the ordinal (called "morphosyntactic_features"). Args: diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index de9a0b047..0b38aeebb 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -62,7 +62,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -79,7 +80,11 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) fraction_graph = self.fraction.fst word_graph = WordFst(deterministic=deterministic).fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) diff --git a/nemo_text_processing/text_normalization/fr/utils.py b/nemo_text_processing/text_normalization/fr/utils.py index 4f6882b51..7523e5762 100644 --- a/nemo_text_processing/text_normalization/fr/utils.py +++ b/nemo_text_processing/text_normalization/fr/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -34,7 +34,7 @@ def load_labels(abs_path): Args: abs_path: absolute path - + Returns dictionary of mappings """ label_tsv = open(abs_path) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py index a12dbf520..347922a1d 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py @@ -19,12 +19,12 @@ class CardinalFst(GraphFst): """ - Finite state transducer for verbalizing cardinals - e.g. 
cardinal { negative: "true" integer: "un milliard et un" } -> "moins un milliard et un" - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Finite state transducer for verbalizing cardinals + e.g. cardinal { negative: "true" integer: "un milliard et un" } -> "moins un milliard et un" + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py b/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py index af892e6ca..a720b405b 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py @@ -25,13 +25,13 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "onze" fractional_part: "quatre cent six" quantity: "millions" preserve_order: true } -> moins onze virgule quatre cent six millions - decimal { integer_part: "cent quatorze" quantity: "billions" preserve_order: true } -> cent quatorze billions + Finite state transducer for classifying decimal, e.g. + decimal { negative: "true" integer_part: "onze" fractional_part: "quatre cent six" quantity: "millions" preserve_order: true } -> moins onze virgule quatre cent six millions + decimal { integer_part: "cent quatorze" quantity: "billions" preserve_order: true } -> cent quatorze billions Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py b/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py index 7d2ecb395..9388cf343 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py @@ -26,13 +26,13 @@ class FractionFst(GraphFst): """ - Finite state transducer for verbalizing fraction - e.g. tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> - treinta y tres y cuatro quintos - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Finite state transducer for verbalizing fraction + e.g. 
tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> + treinta y tres y cuatro quintos + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, ordinal: GraphFst, deterministic: bool = True): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py index c20a3d27b..c9c5c3063 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py @@ -62,7 +62,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': fst: A pynini.FstLike object """ cardinal_separator = pynini.string_map([".", NEMO_SPACE]) - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string up_to_three_digits = up_to_three_digits - "000" - "00" - "0" @@ -246,7 +246,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ clean_output ) @@ -257,12 +257,12 @@ def __init__(self, deterministic: bool = True): zero_space + digit, ).optimize() self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ).optimize() self.four_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 3)) @ self.graph, zero_space + self.three_digits_read + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**3)) @ self.graph, zero_space + self.three_digits_read ).optimize() self.graph |= graph_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/date.py b/nemo_text_processing/text_normalization/hu/taggers/date.py index 0cfddd652..da410dc31 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/date.py +++ b/nemo_text_processing/text_normalization/hu/taggers/date.py @@ -41,7 +41,7 @@ def day_inflector(number, day): Args: number: the day number day: the day name - + Returns: a list of expanded forms, two per ending. """ @@ -71,7 +71,7 @@ def day_adj_endings(number, word, basic=True): 1-jei -> elsejei 2-i -> másodiki 2-ai -> másodikai - 4-i -> negyediki + 4-i -> negyediki 4-ei -> negyedikei This is based on other -i adjectives, because these forms are rare. """ @@ -103,7 +103,7 @@ def day_adj_endings(number, word, basic=True): class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. "2010. április 1." -> date { year: "kettőezer-tíz" month: "április" day: "elseje" preserve_order: true } "2010. ápr. 1." -> date { year: "kettőezer-tíz" month: "április" day: "elseje" preserve_order: true } "2010. IV. 1." 
-> date { year: "kettőezer-tíz" month: "április" day: "elseje" preserve_order: true } diff --git a/nemo_text_processing/text_normalization/hu/taggers/decimal.py b/nemo_text_processing/text_normalization/hu/taggers/decimal.py index a6f819d17..10ae4a8fe 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/decimal.py @@ -46,7 +46,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL e.g. 1 millió -> integer_part: "egy" quantity: "millió" e.g. 1,4 million -> integer_part: "egy" fractional_part: "négy" quantity: "millió" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -68,7 +68,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. -11,4006 milliárd -> decimal { negative: "true" integer_part: "tizenegy" fractional_part: "négyezer-hat tízezred" quantity: "milliárd" preserve_order: true } 1 milliárd -> decimal { integer_part: "egy" quantity: "milliárd" preserve_order: true } Args: @@ -101,7 +101,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ]: for modifier in ["", "tíz", "száz"]: decimal_number |= ( - (NEMO_DIGIT ** order + (NEMO_DIGIT - "0")) + (NEMO_DIGIT**order + (NEMO_DIGIT - "0")) @ pynini.cdrewrite(pynini.cross("0", ""), "[BOS]", "", NEMO_SIGMA) @ cardinal_graph + final_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/measure.py b/nemo_text_processing/text_normalization/hu/taggers/measure.py index 9e5f328fb..f2c3a2368 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hu/taggers/measure.py @@ -61,7 +61,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_singular_graph = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/ordinal.py b/nemo_text_processing/text_normalization/hu/taggers/ordinal.py index 634e006e6..a63a9f02a 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/ordinal.py @@ -25,7 +25,7 @@ class OrdinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "2." 
-> ordinal { integer: "második" } } Args: diff --git a/nemo_text_processing/text_normalization/hu/taggers/telephone.py b/nemo_text_processing/text_normalization/hu/taggers/telephone.py index 856353a30..b031ca5dc 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/taggers/telephone.py @@ -41,8 +41,8 @@ class TelephoneFst(GraphFst): https://en.wikipedia.org/wiki/Telephone_numbers_in_Hungary Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/hu/taggers/time.py b/nemo_text_processing/text_normalization/hu/taggers/time.py index 65dc26398..43e067fef 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/time.py +++ b/nemo_text_processing/text_normalization/hu/taggers/time.py @@ -71,7 +71,7 @@ class TimeFst(GraphFst): "09:00 óra" -> time { hours: "2" } "02:15:10 óra" -> time { hours: "2" minutes: "15" seconds: "10"} "negyed 2" -> time { minutes: "15" hours: "1" } - + Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) @@ -180,7 +180,11 @@ def hours_to_pairs(): final_time_zone = ( pynini.accep(" ") + pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") ) - final_time_zone_optional = pynini.closure(final_time_zone, 0, 1,) + final_time_zone_optional = pynini.closure( + final_time_zone, + 0, + 1, + ) # This might be better as just the inflected forms hour_only_delimited = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py index 60ed0ddc9..8c269bb00 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py @@ -69,7 +69,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -86,10 +87,17 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -101,7 +109,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = 
ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/hu/utils.py b/nemo_text_processing/text_normalization/hu/utils.py index 8a87a3166..a5fb4fc3c 100644 --- a/nemo_text_processing/text_normalization/hu/utils.py +++ b/nemo_text_processing/text_normalization/hu/utils.py @@ -49,7 +49,7 @@ def load_inflection(abs_path): Args: abs_path: absolute path - + Returns dictionary of mappings of word endings to lists of case endings. """ @@ -97,7 +97,7 @@ def inflect_abbreviation(abbr: str, word: str, singular_only=False): word: the base (nominative singular) form of the expansion of abbr singular_only: whether or not to add plural forms - + Returns a list of tuples containing the inflected abbreviation and its expansion. """ @@ -133,7 +133,7 @@ def naive_inflector(abbr: str, word: str, singular_only=False): word: the base (nominative singular) form of the expansion of abbr singular_only: whether or not to add plural forms - + Returns a list of tuples containing the inflected abbreviation and its expansion. """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/measure.py b/nemo_text_processing/text_normalization/hu/verbalizers/measure.py index 41f7fb89c..675659044 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/measure.py @@ -28,7 +28,7 @@ class MeasureFst(GraphFst): Finite state transducer for verbalizing measure, e.g. 
measure { cardinal { integer: "zwei" units: "unzen" } } -> "zwei unzen" measure { cardinal { integer_part: "zwei" quantity: "millionen" units: "unzen" } } -> "zwei millionen unzen" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py index f17f7c36a..b52e6efb7 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py @@ -34,7 +34,11 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) + optional_country_code = pynini.closure( + country_code + delete_space + insert_space, + 0, + 1, + ) number_part = ( pynutil.delete("number_part: \"") @@ -53,6 +57,8 @@ def __init__(self, deterministic: bool = True): 1, ) - graph = pynini.union(optional_country_code + number_part + optional_extension,) + graph = pynini.union( + optional_country_code + number_part + optional_extension, + ) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/utils.py b/nemo_text_processing/text_normalization/hy/utils.py index 7abe91e9e..26c9f5119 100644 --- a/nemo_text_processing/text_normalization/hy/utils.py +++ b/nemo_text_processing/text_normalization/hy/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -34,7 +34,7 @@ def load_labels(abs_path): Args: abs_path: absolute path - + Returns dictionary of mappings """ label_tsv = open(abs_path) diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py index 810b1af49..d5d56cf66 100644 --- a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py @@ -25,14 +25,14 @@ class VerbalizeFst(GraphFst): """ - Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. - More details to deployment at NeMo/tools/text_processing_deployment. + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. 
- Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ def __init__(self, deterministic=True): super().__init__(name="verbalize", kind="verbalize") diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py index aebadd456..e5afd807e 100644 --- a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py @@ -23,15 +23,15 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. - tokens { name: "Երևանում" } tokens { name: "ժամը" } tokens { time { hours: "տասներկուսն" minutes: "հիսունհինգ" } } tokens { name: "է" } tokens { name: ":" } -> Երևանում ժամը տասներկուսն անց հիսունհինգ է: + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "Երևանում" } tokens { name: "ժամը" } tokens { time { hours: "տասներկուսն" minutes: "հիսունհինգ" } } tokens { name: "է" } tokens { name: ":" } -> Երևանում ժամը տասներկուսն անց հիսունհինգ է: - Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. - overwrite_cache: set to True to overwrite .far files - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + """ def __init__(self, deterministic=True, cache_dir=None, overwrite_cache=False): super().__init__(name="verbalize_final", kind="verbalize") diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index ecb003775..1e16d6e36 100644 --- a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -48,7 +48,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_string = pynini.closure( @@ -162,7 +162,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/it/taggers/decimals.py b/nemo_text_processing/text_normalization/it/taggers/decimals.py index 4e32855ad..8f98d5a2b 100644 --- a/nemo_text_processing/text_normalization/it/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/it/taggers/decimals.py @@ -36,8 +36,7 @@ def get_quantity(decimal_graph: "pynini.FstLike", cardinal_graph: "pynini.FstLike") -> "pynini.FstLike": - """ - """ + """ """ numbers = pynini.closure(NEMO_DIGIT, 1, 6) @ cardinal_graph numbers = pynini.cdrewrite(pynutil.delete(cardinal_separator), "", "", NEMO_SIGMA) @ numbers diff --git a/nemo_text_processing/text_normalization/it/taggers/measure.py b/nemo_text_processing/text_normalization/it/taggers/measure.py index d3591089e..880be0aa7 100644 --- a/nemo_text_processing/text_normalization/it/taggers/measure.py +++ b/nemo_text_processing/text_normalization/it/taggers/measure.py @@ -45,7 +45,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for classifying measure, e.g. 
"2,4 g" -> measure { cardinal { integer_part: "due" fractional_part: "quattro" units: "grammi" preserve_order: true } } - + Args: cardinal: CardinalFst decimal: DecimalFst @@ -68,7 +68,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/it/taggers/money.py b/nemo_text_processing/text_normalization/it/taggers/money.py index e8f68c2ac..d92906f1d 100644 --- a/nemo_text_processing/text_normalization/it/taggers/money.py +++ b/nemo_text_processing/text_normalization/it/taggers/money.py @@ -40,7 +40,7 @@ class MoneyFst(GraphFst): "€1" -> money { currency_maj: "euro" integer_part: "un"} "€1,000" -> money { currency_maj: "euro" integer_part: "un" } "4,2 £" -> money { integer_part: "quattro" currency_maj: "sterline" fractional_part: "venti" currency_min: "penny" preserve_order: true } - + Args: cardinal: CardinalFst decimal: DecimalFst diff --git a/nemo_text_processing/text_normalization/it/taggers/time.py b/nemo_text_processing/text_normalization/it/taggers/time.py index 351b6f40c..97d952489 100644 --- a/nemo_text_processing/text_normalization/it/taggers/time.py +++ b/nemo_text_processing/text_normalization/it/taggers/time.py @@ -25,7 +25,7 @@ class TimeFst(GraphFst): 15:30:30 tokens { time { hours: "15" minutes: "30" seconds: "30" preserve_order: true } } -> quindici e mezza trenta secondi 12:15 tokens { time { hours: "12" minutes: "15" } } -> dodici e un quarto 03:38 tokens { time { hours: "3" minutes: "38" } } -> tre e trentotto minuti - + Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py index 3aebcca91..603d520b5 100644 --- a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py @@ -66,7 +66,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -88,10 +89,18 @@ def __init__( self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.measure = MeasureFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) measure_graph = self.measure.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.time = TimeFst(deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/it/utils.py b/nemo_text_processing/text_normalization/it/utils.py index 
eadec4d89..be8bdb5ad 100644 --- a/nemo_text_processing/text_normalization/it/utils.py +++ b/nemo_text_processing/text_normalization/it/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/text_normalization/it/verbalizers/decimal.py b/nemo_text_processing/text_normalization/it/verbalizers/decimal.py index 568361603..f257d7df4 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/decimal.py @@ -26,18 +26,18 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "venti" fractional_part: "trentaquattro" quantity: "miliardi" } -> + Finite state transducer for classifying decimal, e.g. + decimal { negative: "true" integer_part: "venti" fractional_part: "trentaquattro" quantity: "miliardi" } -> meno venti virgola trentaquattro - decimal { integer_part: "un milione" fractional_part: "zero zero zero" quantity: "milioni" preserve_order: true } --> + decimal { integer_part: "un milione" fractional_part: "zero zero zero" quantity: "milioni" preserve_order: true } --> un milione virgola zero zero zero decimal { integer_part: "due" quantity: "milioni" preserve_order: true } --> due milioni Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/it/verbalizers/measure.py b/nemo_text_processing/text_normalization/it/verbalizers/measure.py index 93fa50500..c7fe33969 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/measure.py @@ -27,7 +27,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for verbalizing measure, e.g. measure { cardinal { integer: "due" units: "grammi" } } -> "due grammi" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/it/verbalizers/money.py b/nemo_text_processing/text_normalization/it/verbalizers/money.py index ba9687bd5..f4b3fdef8 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/money.py @@ -40,7 +40,7 @@ class MoneyFst(GraphFst): Finite state transducer for verbalizing money, e.g. 
money { currency_maj: "euro" integer_part: "un"} -> "un euro" money { integer_part: "quattro" currency_maj: "sterline" fractional_part: "venti" currency_min: "penny" preserve_order: true } -> "quattro sterline venti penny" - + Args: decimal: GraphFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 6a61efd4e..8a60516cc 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -164,11 +164,16 @@ def normalize( text_with_span_tags_list[masked_idx_list[sem_tag_idx]] = "" else: non_deter_options = self.normalize_non_deterministic( - text=cur_semiotic_span, n_tagged=n_tagged, punct_post_process=punct_post_process, verbose=verbose, + text=cur_semiotic_span, + n_tagged=n_tagged, + punct_post_process=punct_post_process, + verbose=verbose, ) try: best_option, cer, _ = self.select_best_match( - normalized_texts=non_deter_options, pred_text=cur_pred_text, verbose=verbose, + normalized_texts=non_deter_options, + pred_text=cur_pred_text, + verbose=verbose, ) if cer_threshold > 0 and cer > cer_threshold: best_option = cur_deter_norm @@ -366,7 +371,11 @@ def get_verbalized_text(tagged_text): continue def select_best_match( - self, normalized_texts: List[str], pred_text: str, verbose: bool = False, remove_punct: bool = False, + self, + normalized_texts: List[str], + pred_text: str, + verbose: bool = False, + remove_punct: bool = False, ): """ Selects the best normalization option based on the lowest CER diff --git a/nemo_text_processing/text_normalization/ru/taggers/cardinal.py b/nemo_text_processing/text_normalization/ru/taggers/cardinal.py index d0bc8cc07..5e780969a 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ru/taggers/cardinal.py @@ -32,7 +32,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "1 001" -> cardinal { integer: "тысяча один" } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/date.py b/nemo_text_processing/text_normalization/ru/taggers/date.py index dd3872e2f..3ad16f999 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/date.py +++ b/nemo_text_processing/text_normalization/ru/taggers/date.py @@ -29,7 +29,7 @@ class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. 
"01.05" -> tokens { date { day: "первое мая" } } Args: @@ -78,7 +78,7 @@ def __init__(self, number_names: dict, deterministic: bool): month = ( pynutil.insert("month: \"") + (month_name | pynutil.add_weight(digit_month, 0.1)) + pynutil.insert("\"") ).optimize() - year = pynini.compose(((NEMO_DIGIT ** 4) | (NEMO_DIGIT ** 2)), numbers).optimize() + year = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)), numbers).optimize() year |= zero_digit # reduce year options diff --git a/nemo_text_processing/text_normalization/ru/taggers/decimals.py b/nemo_text_processing/text_normalization/ru/taggers/decimals.py index 29c208777..40ced8d52 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/ru/taggers/decimals.py @@ -50,7 +50,7 @@ def prepare_labels_for_insertion(file_path: str): class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. "1,08" -> tokens { decimal { integer_part: "одно целая" fractional_part: "восемь сотых} } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/ordinal.py b/nemo_text_processing/text_normalization/ru/taggers/ordinal.py index 09cd57d33..43277db46 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/ru/taggers/ordinal.py @@ -25,7 +25,7 @@ class OrdinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "2" -> ordinal { integer: "второе" } } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/telephone.py b/nemo_text_processing/text_normalization/ru/taggers/telephone.py index d2b3d508c..456bd6f1a 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ru/taggers/telephone.py @@ -21,9 +21,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone, which includes country code, number part and extension + Finite state transducer for classifying telephone, which includes country code, number part and extension - E.g + E.g "8-913-983-56-01" -> telephone { number_part: "восемь девятьсот тринадцать девятьсот восемьдесят три пятьдесят шесть ноль один" } Args: @@ -48,13 +48,13 @@ def __init__(self, number_names: dict, deterministic: bool = True): optional_country_code = pynini.closure(country_code + insert_space, 0, 1) number_part = ( - NEMO_DIGIT ** 3 @ number + NEMO_DIGIT**3 @ number + separator - + NEMO_DIGIT ** 3 @ number + + NEMO_DIGIT**3 @ number + separator - + NEMO_DIGIT ** 2 @ number + + NEMO_DIGIT**2 @ number + separator - + NEMO_DIGIT ** 2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) + + NEMO_DIGIT**2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) ) number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"") tagger_graph = (optional_country_code + number_part).optimize() diff --git a/nemo_text_processing/text_normalization/ru/taggers/time.py b/nemo_text_processing/text_normalization/ru/taggers/time.py index 4b3f40560..427aab00d 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/time.py +++ b/nemo_text_processing/text_normalization/ru/taggers/time.py @@ -24,7 +24,7 @@ class TimeFst(GraphFst): """ Finite state transducer for classifying time, e.g. 
"02:15" -> time { hours: "два часа пятнадцать минут" } - + Args: number_names: number_names for cardinal and ordinal numbers deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/ru/utils.py b/nemo_text_processing/text_normalization/ru/utils.py index 5f5c4bbfb..a55659868 100644 --- a/nemo_text_processing/text_normalization/ru/utils.py +++ b/nemo_text_processing/text_normalization/ru/utils.py @@ -24,7 +24,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ abs_path = os.path.dirname(os.path.abspath(__file__)) + os.sep + rel_path diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/measure.py b/nemo_text_processing/text_normalization/ru/verbalizers/measure.py index ad2e85bf5..001691518 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/measure.py @@ -28,7 +28,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for verbalizing measure, e.g. measure { cardinal { integer: "два килограма" } } -> "два килограма" - + Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py index 8d92e3efe..ceaf04d7d 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py @@ -29,7 +29,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now Args: diff --git a/nemo_text_processing/text_normalization/rw/__init__.py b/nemo_text_processing/text_normalization/rw/__init__.py index c921ca1b8..876f20b3f 100644 --- a/nemo_text_processing/text_normalization/rw/__init__.py +++ b/nemo_text_processing/text_normalization/rw/__init__.py @@ -16,4 +16,3 @@ from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.rw.verbalizers.verbalize_final import VerbalizeFinalFst - diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py index 46ab24f7c..450ab1d01 100644 --- a/nemo_text_processing/text_normalization/rw/graph_utils.py +++ b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -77,7 +77,7 @@ ).optimize() delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) -delete_space_or_punct = NEMO_PUNCT | delete_space +delete_space_or_punct = NEMO_PUNCT | delete_space delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") @@ -107,14 +107,36 @@ suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "w", + "x", + "y", + "z", ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, + suppletive, + plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), + NEMO_SIGMA, ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -129,7 +151,9 @@ def capitalized_input_graph( - graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, + graph: "pynini.FstLike", + original_graph_weight: float = None, + capitalized_graph_weight: float = None, ) -> "pynini.FstLike": """ Allow graph input to be capitalized, e.g. for ITN) @@ -204,7 +228,6 @@ def convert_space(fst) -> "pynini.FstLike": return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) - class GraphFst: """ Base class for all grammar fsts. diff --git a/nemo_text_processing/text_normalization/rw/taggers/__init__.py b/nemo_text_processing/text_normalization/rw/taggers/__init__.py index 96d45783e..9c4313114 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/rw/taggers/__init__.py @@ -12,4 +12,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License. 
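Note on the cardinal tagger diff that follows: rw/taggers/cardinal.py builds a spoken-to-written graph by mapping each scale word to its leading digits and inserting literal zero padding (THREE_ZEROS, FOUR_ZEROS, ...) whenever the lower scales are absent, then inverting the whole graph. A minimal pynini sketch of that idiom is below — this is not the PR's code; the word-to-digit mappings are illustrative assumptions standing in for the inventories loaded from data/cardinal/*.tsv:

    import pynini
    from pynini.lib import pynutil

    # Placeholder inventories; the PR loads the real ones with
    # pynini.string_file(get_abs_path("data/cardinal/...tsv")).
    digits = pynini.string_map([("rimwe", "1"), ("kabiri", "2"), ("gatatu", "3")])
    tens = pynini.string_map([("makumyabiri", "2"), ("mirongo_itatu", "3")])
    hundreds = pynini.string_map([("ijana", "1"), ("magana_abiri", "2")])

    # Tens: "makumyabiri kabiri" -> "22"; a bare tens word pads a trailing "0".
    graph_tens = tens + pynutil.delete(" ") + digits | tens + pynutil.insert("0")

    # Hundreds: pad "00" when nothing follows, "0" when only a unit follows.
    graph_hundreds = (
        hundreds + pynutil.delete(" ") + graph_tens
        | hundreds + pynutil.insert("00")
        | hundreds + pynutil.delete(" ") + pynutil.insert("0") + digits
    )

    graph = (graph_hundreds | graph_tens | digits).optimize()

    # Words -> digits; pynini.invert(graph) gives the digits -> words
    # direction that the tagger ultimately exposes.
    print(pynini.shortestpath("magana_abiri kabiri" @ graph).string())  # 202

On top of the inverted graph, the PR then composes the cdrewrite rules rewrite_na_fst and rewrite_n_fst, which splice the connectives " na " (before consonants) and " n'" (before vowels) between the scale words, followed by the space- and underscore-cleanup rewrites.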
diff --git a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py index c80097a8e..14da33500 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py @@ -16,7 +16,18 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst,NEMO_CHAR,insert_space,NEMO_DIGIT,NEMO_ALPHA,NEMO_CONSONANTS,NEMO_VOWELS,delete_extra_space,delete_space + +from nemo_text_processing.text_normalization.rw.graph_utils import ( + NEMO_ALPHA, + NEMO_CHAR, + NEMO_CONSONANTS, + NEMO_DIGIT, + NEMO_VOWELS, + GraphFst, + delete_extra_space, + delete_space, + insert_space, +) from nemo_text_processing.text_normalization.rw.utils import get_abs_path @@ -24,21 +35,29 @@ class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") vowels_or_space = NEMO_VOWELS | " " - rewrite_na_fst = pynini.cdrewrite(pynini.cross(" "," na "),vowels_or_space,NEMO_CONSONANTS,NEMO_CHAR.closure()) - rewrite_n_fst = pynini.cdrewrite(pynini.cross(" "," n'"),vowels_or_space,NEMO_VOWELS,NEMO_CHAR.closure()) - remove_underscore_fst = pynini.cdrewrite(pynini.cross("_"," "),pynini.union(NEMO_ALPHA),pynini.union(NEMO_ALPHA),NEMO_CHAR.closure()) - remove_extra_space_fst = pynini.cdrewrite(delete_extra_space,pynini.union(NEMO_ALPHA),pynini.union(NEMO_ALPHA),NEMO_CHAR.closure()) - remove_trailing_space_fst = pynini.cdrewrite(delete_space,pynini.union(NEMO_ALPHA).closure(),'[EOS]',NEMO_CHAR.closure()) - - rewrite_add_separator_fst = pynini.compose(rewrite_na_fst,rewrite_n_fst) - ten_thousand = pynini.string_map([("ibihumbi_icumi","10")]) - ten = pynini.string_map([("icumi","10")]) + rewrite_na_fst = pynini.cdrewrite( + pynini.cross(" ", " na "), vowels_or_space, NEMO_CONSONANTS, NEMO_CHAR.closure() + ) + rewrite_n_fst = pynini.cdrewrite(pynini.cross(" ", " n'"), vowels_or_space, NEMO_VOWELS, NEMO_CHAR.closure()) + remove_underscore_fst = pynini.cdrewrite( + pynini.cross("_", " "), pynini.union(NEMO_ALPHA), pynini.union(NEMO_ALPHA), NEMO_CHAR.closure() + ) + remove_extra_space_fst = pynini.cdrewrite( + delete_extra_space, pynini.union(NEMO_ALPHA), pynini.union(NEMO_ALPHA), NEMO_CHAR.closure() + ) + remove_trailing_space_fst = pynini.cdrewrite( + delete_space, pynini.union(NEMO_ALPHA).closure(), '[EOS]', NEMO_CHAR.closure() + ) + + rewrite_add_separator_fst = pynini.compose(rewrite_na_fst, rewrite_n_fst) + ten_thousand = pynini.string_map([("ibihumbi_icumi", "10")]) + ten = pynini.string_map([("icumi", "10")]) digits = pynini.string_file(get_abs_path("data/cardinal/digits.tsv")) - digits_for_thousands = pynini.string_file(get_abs_path("data/cardinal/digits_for_thousands.tsv")) - digits_millions_trillions= pynini.string_file(get_abs_path("data/cardinal/digits_millions_trillions.tsv")) + digits_for_thousands = pynini.string_file(get_abs_path("data/cardinal/digits_for_thousands.tsv")) + digits_millions_trillions = pynini.string_file(get_abs_path("data/cardinal/digits_millions_trillions.tsv")) tens = pynini.string_file(get_abs_path("data/cardinal/tens.tsv")) - tens_for_ends = pynini.string_map([("icumi","1")])|tens - tens_for_beginnings= pynini.string_map([("cumi","1")])|tens + tens_for_ends = pynini.string_map([("icumi", "1")]) | tens + tens_for_beginnings = pynini.string_map([("cumi", "1")]) | tens hundreds = pynini.string_file(get_abs_path("data/cardinal/hundreds.tsv")) thousands = 
pynini.string_file(get_abs_path("data/cardinal/thousands.tsv")) tens_of_thousands = pynini.string_file(get_abs_path("data/cardinal/tens_of_thousands.tsv")) @@ -59,113 +78,167 @@ def __init__(self): EIGHT_ZEROS = "00000000" NINE_ZEROS = "000000000" - zero = pynini.string_map([("zeru","0")]) - rewrite_remove_comma_fst = pynini.cdrewrite(pynini.cross(",",""),pynini.union(NEMO_DIGIT),pynini.union(NEMO_DIGIT),NEMO_CHAR.closure()) + zero = pynini.string_map([("zeru", "0")]) + rewrite_remove_comma_fst = pynini.cdrewrite( + pynini.cross(",", ""), pynini.union(NEMO_DIGIT), pynini.union(NEMO_DIGIT), NEMO_CHAR.closure() + ) single_digits_graph = pynini.invert(digits | zero) single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph) - remove_comma = rewrite_remove_comma_fst@single_digits_graph - - graph_tens_ends = tens_for_ends +pynutil.delete(" ")+ digits | tens_for_ends+pynutil.insert("0") - graph_tens_starts = tens_for_beginnings +pynutil.delete(" ")+ digits | tens_for_beginnings+pynutil.insert("0") - - graph_tens_for_thousands = tens_for_beginnings +pynutil.delete(" ")+ digits_for_thousands | tens_for_beginnings+pynutil.insert("0") - - graph_tens_for_millions_trillions = tens_for_beginnings +pynutil.delete(" ")+ digits_millions_trillions \ - | tens_for_beginnings+pynutil.insert("0") - graph_hundreds = hundreds+pynutil.delete(" ")+graph_tens_ends | hundreds+pynutil.insert("00") \ - | hundreds+pynutil.delete(" ")+pynutil.insert("0")+digits - graph_thousands = thousands+pynutil.delete(" ")+graph_hundreds | thousands+pynutil.insert(THREE_ZEROS) \ - | thousands+pynutil.delete(" ")+pynutil.insert("0")+graph_tens_ends \ - | thousands+pynutil.delete(" ")+pynutil.insert("00")+digits - - graph_ten_thousand_and_hundreds = ten_thousand +pynutil.insert(THREE_ZEROS) | ten_thousand +pynutil.delete(" ") + graph_hundreds \ - | ten_thousand+pynutil.delete(" ") +pynutil.insert("0")+graph_tens_ends \ - | ten_thousand+pynutil.delete(" ") +pynutil.insert("00")+digits - prefix_tens_of_thousands = tens_of_thousands+pynutil.delete(" ") + digits_for_thousands - graph_tens_of_thousands = pynutil.add_weight(graph_ten_thousand_and_hundreds, weight=-0.1) \ - | prefix_tens_of_thousands+ pynutil.delete(" ")+ graph_hundreds \ - | prefix_tens_of_thousands + pynutil.insert(THREE_ZEROS) \ - | prefix_tens_of_thousands+pynutil.delete(" ")+pynutil.insert("0")+graph_hundreds \ - | prefix_tens_of_thousands+pynutil.delete(" ")+pynutil.insert("0")+graph_tens_ends \ - | prefix_tens_of_thousands+pynutil.delete(" ")+pynutil.insert("00")+digits - - prefix_hundreds_of_thousands = hundreds_of_thousands+pynutil.delete(" ") + graph_tens_for_thousands - graph_hundreds_of_thousands = hundreds_of_thousands+pynutil.insert(FIVE_ZEROS) \ - | prefix_hundreds_of_thousands+pynutil.insert(THREE_ZEROS) \ - | prefix_hundreds_of_thousands+pynutil.delete(" ")+graph_hundreds \ - | pynutil.add_weight(prefix_hundreds_of_thousands+pynutil.delete(" ")+pynutil.insert("00")+digits,weight=-0.1) \ - | prefix_hundreds_of_thousands+pynutil.delete(" ")+pynutil.insert("0")+graph_tens_for_thousands - - graph_millions = millions +pynutil.delete(" ") + graph_hundreds_of_thousands | millions+pynutil.insert(SIX_ZEROS) \ - | millions+pynutil.delete(" ")+pynutil.insert("0")+graph_tens_of_thousands \ - | millions+pynutil.delete(" ")+pynutil.insert("00")+graph_thousands \ - | millions+pynutil.delete(" ")+pynutil.insert(THREE_ZEROS)+graph_hundreds \ - | millions+pynutil.delete(" ")+pynutil.insert(FOUR_ZEROS)+graph_tens_ends \ - | 
millions+pynutil.delete(" ")+pynutil.insert(FIVE_ZEROS)+digits - - prefix_tens_of_millions = tens_of_millions+pynutil.delete(" ") + digits_millions_trillions - graph_tens_of_millions = prefix_tens_of_millions +pynutil.delete(" ")+graph_hundreds_of_thousands \ - | prefix_tens_of_millions+pynutil.delete(" ")+pynutil.insert(SIX_ZEROS) \ - | prefix_tens_of_millions+pynutil.delete(" ") +pynutil.insert("0")+graph_tens_of_thousands \ - | prefix_tens_of_millions+pynutil.delete(" ")+pynutil.insert(THREE_ZEROS)+graph_hundreds \ - | prefix_tens_of_millions+pynutil.delete(" ")+pynutil.insert(FOUR_ZEROS)+graph_tens_ends \ - | tens_of_millions+pynutil.delete(" ")+pynutil.insert(FIVE_ZEROS)+graph_tens_ends \ - | prefix_tens_of_millions+pynutil.delete(" ")+pynutil.insert(FIVE_ZEROS)+digits - - prefix_hundreds_of_millions = hundreds_of_millions+pynutil.delete(" ") +graph_tens_for_millions_trillions - graph_hundreds_of_millions = prefix_hundreds_of_millions+pynutil.delete(" ")+graph_hundreds_of_thousands \ - | prefix_hundreds_of_millions+pynutil.insert(SIX_ZEROS) \ - | prefix_hundreds_of_millions+pynutil.delete(" ")+pynutil.insert("0")+graph_tens_of_thousands \ - | prefix_hundreds_of_millions+pynutil.delete(" ")+pynutil.insert("00")+graph_thousands \ - | prefix_hundreds_of_millions+pynutil.delete(" ")+pynutil.insert(THREE_ZEROS)+graph_hundreds \ - | prefix_hundreds_of_millions+pynutil.delete(" ")+pynutil.insert(FOUR_ZEROS)+graph_tens_ends - - graph_trillions = trillions+pynutil.delete(" ")+graph_hundreds_of_millions | trillions+pynutil.insert(NINE_ZEROS) \ - | trillions+pynutil.delete(" ")+pynutil.insert("0")+graph_tens_of_millions \ - | trillions+pynutil.delete(" ")+pynutil.insert("00")+graph_millions \ - | trillions+pynutil.delete(" ")+pynutil.insert(THREE_ZEROS)+graph_hundreds_of_thousands \ - | trillions+pynutil.delete(" ")+pynutil.insert(FOUR_ZEROS)+graph_tens_of_thousands \ - | trillions+pynutil.delete(" ")+pynutil.insert(FIVE_ZEROS)+graph_thousands\ - | trillions+pynutil.delete(" ")+pynutil.insert(SIX_ZEROS)+graph_hundreds \ - | trillions+pynutil.delete(" ")+pynutil.insert(SEVEN_ZEROS)+graph_tens_ends \ - | trillions+pynutil.delete(" ")+pynutil.insert(EIGHT_ZEROS)+digits - - prefix_tens_of_trillions = tens_of_trillions+pynutil.delete(" ") + digits_millions_trillions - graph_tens_of_trillions = prefix_tens_of_trillions+pynutil.delete(" ")+graph_hundreds_of_millions \ - | prefix_tens_of_trillions+pynutil.insert(NINE_ZEROS) \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert("0")+graph_tens_of_millions \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert("00")+graph_millions \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert(THREE_ZEROS)+graph_hundreds_of_thousands \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert(FOUR_ZEROS)+graph_tens_of_thousands \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert(FIVE_ZEROS)+graph_thousands \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert(SIX_ZEROS)+graph_hundreds \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert(SEVEN_ZEROS)+graph_tens_ends \ - | prefix_tens_of_trillions+pynutil.delete(" ")+pynutil.insert(EIGHT_ZEROS)+digits - - prefix_hundreds_of_trillions = hundreds_of_trillions+pynutil.delete(" ") +graph_tens_for_millions_trillions - graph_hundreds_of_trillions = prefix_hundreds_of_trillions+pynutil.delete(" ")+ graph_hundreds_of_millions \ - | prefix_hundreds_of_trillions+pynutil.insert(NINE_ZEROS) \ - | prefix_hundreds_of_trillions+pynutil.delete(" 
")+pynutil.insert("0")+graph_tens_of_millions \ - | prefix_hundreds_of_trillions+pynutil.delete(" ")+pynutil.insert("00")+graph_millions \ - | prefix_hundreds_of_trillions+pynutil.delete(" ")+pynutil.insert(THREE_ZEROS)+graph_hundreds_of_thousands \ - | prefix_hundreds_of_trillions+pynutil.delete(" ")+pynutil.insert(FOUR_ZEROS)+graph_tens_of_thousands \ - | prefix_hundreds_of_trillions+pynutil.delete(" ")+pynutil.insert(FIVE_ZEROS)+graph_thousands \ - | prefix_hundreds_of_trillions+pynutil.delete(" ")+pynutil.insert(SIX_ZEROS)+graph_hundreds \ - | prefix_hundreds_of_trillions+pynutil.delete(" ")+pynutil.insert(SEVEN_ZEROS)+graph_tens_ends - - graph_all = graph_hundreds_of_trillions | graph_tens_of_trillions | graph_trillions | graph_hundreds_of_millions | graph_tens_of_millions \ - | graph_millions | graph_hundreds_of_thousands | graph_tens_of_thousands \ - | graph_thousands | graph_hundreds | pynutil.add_weight(ten, weight=-0.1) \ - | graph_tens_starts | digits | pynini.cross("zeru","0") - - inverted_graph_all = pynini.compose(pynini.invert(graph_all),rewrite_add_separator_fst) - inverted_graph_all = pynini.compose(inverted_graph_all,remove_extra_space_fst) - inverted_graph_all = pynini.compose(inverted_graph_all,remove_trailing_space_fst) - inverted_graph_all = pynini.compose(inverted_graph_all,remove_underscore_fst) | pynutil.add_weight(remove_comma, 0.0001) + remove_comma = rewrite_remove_comma_fst @ single_digits_graph + + graph_tens_ends = tens_for_ends + pynutil.delete(" ") + digits | tens_for_ends + pynutil.insert("0") + graph_tens_starts = tens_for_beginnings + pynutil.delete(" ") + digits | tens_for_beginnings + pynutil.insert( + "0" + ) + + graph_tens_for_thousands = tens_for_beginnings + pynutil.delete( + " " + ) + digits_for_thousands | tens_for_beginnings + pynutil.insert("0") + + graph_tens_for_millions_trillions = tens_for_beginnings + pynutil.delete( + " " + ) + digits_millions_trillions | tens_for_beginnings + pynutil.insert("0") + graph_hundreds = ( + hundreds + pynutil.delete(" ") + graph_tens_ends + | hundreds + pynutil.insert("00") + | hundreds + pynutil.delete(" ") + pynutil.insert("0") + digits + ) + graph_thousands = ( + thousands + pynutil.delete(" ") + graph_hundreds + | thousands + pynutil.insert(THREE_ZEROS) + | thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends + | thousands + pynutil.delete(" ") + pynutil.insert("00") + digits + ) + + graph_ten_thousand_and_hundreds = ( + ten_thousand + pynutil.insert(THREE_ZEROS) + | ten_thousand + pynutil.delete(" ") + graph_hundreds + | ten_thousand + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends + | ten_thousand + pynutil.delete(" ") + pynutil.insert("00") + digits + ) + prefix_tens_of_thousands = tens_of_thousands + pynutil.delete(" ") + digits_for_thousands + graph_tens_of_thousands = ( + pynutil.add_weight(graph_ten_thousand_and_hundreds, weight=-0.1) + | prefix_tens_of_thousands + pynutil.delete(" ") + graph_hundreds + | prefix_tens_of_thousands + pynutil.insert(THREE_ZEROS) + | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_hundreds + | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends + | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("00") + digits + ) + + prefix_hundreds_of_thousands = hundreds_of_thousands + pynutil.delete(" ") + graph_tens_for_thousands + graph_hundreds_of_thousands = ( + hundreds_of_thousands + pynutil.insert(FIVE_ZEROS) + | prefix_hundreds_of_thousands + 
pynutil.insert(THREE_ZEROS) + | prefix_hundreds_of_thousands + pynutil.delete(" ") + graph_hundreds + | pynutil.add_weight( + prefix_hundreds_of_thousands + pynutil.delete(" ") + pynutil.insert("00") + digits, weight=-0.1 + ) + | prefix_hundreds_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_for_thousands + ) + + graph_millions = ( + millions + pynutil.delete(" ") + graph_hundreds_of_thousands + | millions + pynutil.insert(SIX_ZEROS) + | millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands + | millions + pynutil.delete(" ") + pynutil.insert("00") + graph_thousands + | millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds + | millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends + | millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + digits + ) + + prefix_tens_of_millions = tens_of_millions + pynutil.delete(" ") + digits_millions_trillions + graph_tens_of_millions = ( + prefix_tens_of_millions + pynutil.delete(" ") + graph_hundreds_of_thousands + | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands + | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds + | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends + | tens_of_millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_tens_ends + | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + digits + ) + + prefix_hundreds_of_millions = hundreds_of_millions + pynutil.delete(" ") + graph_tens_for_millions_trillions + graph_hundreds_of_millions = ( + prefix_hundreds_of_millions + pynutil.delete(" ") + graph_hundreds_of_thousands + | prefix_hundreds_of_millions + pynutil.insert(SIX_ZEROS) + | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands + | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert("00") + graph_thousands + | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds + | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends + ) + + graph_trillions = ( + trillions + pynutil.delete(" ") + graph_hundreds_of_millions + | trillions + pynutil.insert(NINE_ZEROS) + | trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions + | trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions + | trillions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds_of_thousands + | trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands + | trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands + | trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds + | trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends + | trillions + pynutil.delete(" ") + pynutil.insert(EIGHT_ZEROS) + digits + ) + + prefix_tens_of_trillions = tens_of_trillions + pynutil.delete(" ") + digits_millions_trillions + graph_tens_of_trillions = ( + prefix_tens_of_trillions + pynutil.delete(" ") + graph_hundreds_of_millions + | prefix_tens_of_trillions + pynutil.insert(NINE_ZEROS) + | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions + | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert("00") + 
graph_millions + | prefix_tens_of_trillions + + pynutil.delete(" ") + + pynutil.insert(THREE_ZEROS) + + graph_hundreds_of_thousands + | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands + | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands + | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds + | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends + | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(EIGHT_ZEROS) + digits + ) + + prefix_hundreds_of_trillions = hundreds_of_trillions + pynutil.delete(" ") + graph_tens_for_millions_trillions + graph_hundreds_of_trillions = ( + prefix_hundreds_of_trillions + pynutil.delete(" ") + graph_hundreds_of_millions + | prefix_hundreds_of_trillions + pynutil.insert(NINE_ZEROS) + | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions + | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions + | prefix_hundreds_of_trillions + + pynutil.delete(" ") + + pynutil.insert(THREE_ZEROS) + + graph_hundreds_of_thousands + | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands + | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands + | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds + | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends + ) + + graph_all = ( + graph_hundreds_of_trillions + | graph_tens_of_trillions + | graph_trillions + | graph_hundreds_of_millions + | graph_tens_of_millions + | graph_millions + | graph_hundreds_of_thousands + | graph_tens_of_thousands + | graph_thousands + | graph_hundreds + | pynutil.add_weight(ten, weight=-0.1) + | graph_tens_starts + | digits + | pynini.cross("zeru", "0") + ) + + inverted_graph_all = pynini.compose(pynini.invert(graph_all), rewrite_add_separator_fst) + inverted_graph_all = pynini.compose(inverted_graph_all, remove_extra_space_fst) + inverted_graph_all = pynini.compose(inverted_graph_all, remove_trailing_space_fst) + inverted_graph_all = pynini.compose(inverted_graph_all, remove_underscore_fst) | pynutil.add_weight( + remove_comma, 0.0001 + ) inverted_graph_all = inverted_graph_all.optimize() final_graph = pynutil.insert("integer: \"") + inverted_graph_all + pynutil.insert("\"") final_graph = self.add_tokens(final_graph) self.fst = final_graph - - diff --git a/nemo_text_processing/text_normalization/rw/taggers/time.py b/nemo_text_processing/text_normalization/rw/taggers/time.py index a07ae059e..0caf4f7d5 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/time.py +++ b/nemo_text_processing/text_normalization/rw/taggers/time.py @@ -14,21 +14,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst import pynini from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst from nemo_text_processing.text_normalization.rw.utils import get_abs_path class TimeFst(GraphFst): def __init__(self): super().__init__(name="time", kind="classify") - + hours = pynini.string_file(get_abs_path("data/time/hours.tsv")) - + minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv")) - - final_graph = pynutil.insert("hours:\"")+hours+pynutil.insert("\"")+pynutil.delete(":")+pynutil.insert(" minutes:\"")+minutes+pynutil.insert("\"") + + final_graph = ( + pynutil.insert("hours:\"") + + hours + + pynutil.insert("\"") + + pynutil.delete(":") + + pynutil.insert(" minutes:\"") + + minutes + + pynutil.insert("\"") + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py index e17841e10..01ec1e370 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py @@ -13,34 +13,40 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst -from nemo_text_processing.text_normalization.rw.taggers.time import TimeFst -from nemo_text_processing.text_normalization.rw.taggers.whitelist import WhiteListFst -from nemo_text_processing.text_normalization.rw.taggers.cardinal import CardinalFst - +import os -from nemo_text_processing.text_normalization.en.taggers.word import WordFst -from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst import pynini from pynini.lib import pynutil -import os + +from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.en.taggers.word import WordFst from nemo_text_processing.text_normalization.rw.graph_utils import ( GraphFst, delete_extra_space, delete_space, generator_main, ) +from nemo_text_processing.text_normalization.rw.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.rw.taggers.time import TimeFst +from nemo_text_processing.text_normalization.rw.taggers.whitelist import WhiteListFst + class ClassifyFst(GraphFst): - def __init__(self,input_case: str,cache_dir: str = None, overwrite_cache: bool = False,deterministic: bool = True,whitelist: str = None, -): - super().__init__(name='tokenize_and_classify',kind='classify',deterministic=deterministic) + def __init__( + self, + input_case: str, + cache_dir: str = None, + overwrite_cache: bool = False, + deterministic: bool = True, + whitelist: str = None, + ): + super().__init__(name='tokenize_and_classify', kind='classify', deterministic=deterministic) far_file = None if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) far_file = os.path.join(cache_dir, "rw_tn_tokenize_and_classify.far") if not overwrite_cache and far_file and os.path.exists(far_file): - print("FAR file: ",far_file) + print("FAR file: ", far_file) self.fst = pynini.Far(far_file, mode="r")["TOKENIZE_AND_CLASSIFY"] else: cardinal = CardinalFst() @@ -48,7 +54,7 @@ def 
__init__(self,input_case: str,cache_dir: str = None, overwrite_cache: bool = time_graph = TimeFst().fst punctuation = PunctuationFst() punct_graph = punctuation.fst - + word_graph = WordFst(punctuation=punctuation).fst whitelist_graph = WhiteListFst().fst @@ -56,17 +62,17 @@ def __init__(self,input_case: str,cache_dir: str = None, overwrite_cache: bool = pynutil.add_weight(time_graph, 1.05) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 1.50) - | pynutil.add_weight(whitelist_graph,1.01) + | pynutil.add_weight(whitelist_graph, 1.01) ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token+ pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) graph = delete_space + graph + delete_space self.fst = graph.optimize() if far_file: - generator_main(far_file, {"TOKENIZE_AND_CLASSIFY":self.fst}) + generator_main(far_file, {"TOKENIZE_AND_CLASSIFY": self.fst}) diff --git a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py index 288a1edda..382243d26 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py @@ -14,13 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst import pynini from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst from nemo_text_processing.text_normalization.rw.utils import get_abs_path +transliterations = pynini.string_file(get_abs_path("data/whitelist/kinya_transliterations.tsv")) -transliterations = pynini.string_file(get_abs_path("data/whitelist/kinya_transliterations.tsv")) class WhiteListFst(GraphFst): def __init__(self): diff --git a/nemo_text_processing/text_normalization/rw/utils.py b/nemo_text_processing/text_normalization/rw/utils.py index 148d2de51..460596bca 100644 --- a/nemo_text_processing/text_normalization/rw/utils.py +++ b/nemo_text_processing/text_normalization/rw/utils.py @@ -21,10 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path - - - diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py index 2931cfd9b..9c4313114 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py @@ -12,4 +12,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
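A note on the ClassifyFst weighting earlier in this hunk: pynini works over the tropical semiring, so when the classify union offers several analyses of the same span, the path with the lowest total weight survives shortest-path pruning. That is why the whitelist (1.01) outranks time (1.05), cardinal (1.1), and the word fallback (1.50). A small self-contained illustration of that ordering follows; the whitelist entry shown is hypothetical and not necessarily present in kinya_transliterations.tsv.

import pynini
from pynini.lib import pynutil

# Two competing analyses of the same token, weighted as in ClassifyFst.
whitelist = pynutil.add_weight(pynini.cross("km", "kilometero"), 1.01)
word = pynutil.add_weight(pynini.accep("km"), 1.50)
classify = pynini.union(whitelist, word)

# The lowest-weight path wins: the whitelist expansion beats the verbatim word.
print(pynini.shortestpath("km" @ classify).project("output").string())
# -> kilometero

The FAR cache follows the same pattern as the other languages: on the first build, generator_main writes the compiled graph to rw_tn_tokenize_and_classify.far, and later constructions load it with pynini.Far(far_file, mode="r")["TOKENIZE_AND_CLASSIFY"] instead of recompiling.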
diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/time.py b/nemo_text_processing/text_normalization/rw/verbalizers/time.py index 99bcd7808..50c0f71a2 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/time.py @@ -15,19 +15,28 @@ # limitations under the License. import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.rw.graph_utils import ( - GraphFst, - delete_space, - NEMO_CHAR -) + +from nemo_text_processing.text_normalization.rw.graph_utils import NEMO_CHAR, GraphFst, delete_space + class VerbalizeTimeFst(GraphFst): def __init__(self): - super().__init__(name="time",kind="verbalize") - hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR)+pynutil.delete("\"")+delete_space \ - +pynutil.delete("minutes:")+delete_space+pynutil.delete("\"") + pynini.closure(NEMO_CHAR)+pynutil.delete("\"")) + super().__init__(name="time", kind="verbalize") + hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR) + + pynutil.delete("\"") + + delete_space + + pynutil.delete("minutes:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR) + + pynutil.delete("\"") + ) - graph = hour + graph = hour delete_tokens = self.delete_tokens(graph) - + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py index 9d3e69cd9..267215145 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py @@ -13,21 +13,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst from nemo_text_processing.text_normalization.rw.verbalizers.time import VerbalizeTimeFst -from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst + class VerbalizeFst(GraphFst): - def __init__(self,deterministic: bool = True): - super().__init__(name="verbalize", kind="verbalize",deterministic=deterministic) + def __init__(self, deterministic: bool = True): + super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic) cardinal = CardinalFst() cardinal_graph = cardinal.fst time = VerbalizeTimeFst().fst - graph = ( - cardinal_graph - | time - ) + graph = cardinal_graph | time self.fst = graph - - diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py index 953bffdfe..1ac1adf01 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py @@ -13,24 +13,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
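Before the verbalize_final changes, a note on the two verbalizers above: together with the TimeFst tagger earlier in this patch, VerbalizeTimeFst completes the round trip. The tagger turns colon-separated input into a token string of the form hours:"..." minutes:"..." (wrapped in a time { ... } token), and the verbalizer deletes the field scaffolding, keeping only the spoken words with a single space between them. Below is a stripped-down sketch of that field deletion; the sample words are placeholders rather than entries from data/time/hours.tsv and data/time/minutes.tsv, and the real grammar additionally removes the token wrapper via delete_tokens.

import pynini
from pynini.lib import pynutil

# Any printable character except the quote, so a field body cannot
# swallow its own closing quote.
chars = pynini.union(*[chr(c) for c in range(32, 127) if chr(c) != '"']).closure()

strip_fields = (
    pynutil.delete('hours:') + pynutil.delete('"') + chars + pynutil.delete('"')
    + pynini.accep(' ')
    + pynutil.delete('minutes:') + pynutil.delete('"') + chars + pynutil.delete('"')
)

tagged = 'hours:"saa cyenda" minutes:"mirongo itatu"'
print(pynini.shortestpath(tagged @ strip_fields).project("output").string())
# -> saa cyenda mirongo itatu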
+import os + import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst + from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.rw.graph_utils import ( + NEMO_PUNCT, GraphFst, delete_extra_space, - delete_space_or_punct, delete_space, - NEMO_PUNCT, + delete_space_or_punct, generator_main, - delete_space ) -import os +from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst + class VerbalizeFinalFst(GraphFst): - def __init__(self, cache_dir: str = None, overwrite_cache: bool = False,deterministic: bool = True): - super().__init__(name="verbalize_final", kind="verbalize",deterministic=deterministic) + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) @@ -52,9 +54,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False,determin ) graph = delete_space + pynini.closure(graph + delete_space) + graph + delete_space - - self.fst = graph if far_file: - generator_main(far_file, {"ALL":self.fst,'REDUP': pynini.accep("REDUP")}) + generator_main(far_file, {"ALL": self.fst, 'REDUP': pynini.accep("REDUP")}) diff --git a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py index 021e652bd..750ff867b 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py @@ -69,7 +69,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_separator = NEMO_SPACE @@ -249,7 +249,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) @@ -276,30 +276,27 @@ def __init__(self, deterministic: bool = True): zero_space = zero + insert_space self.zero_space = zero_space self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + 
(NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, zero_space + digit + insert_space + digit, ) self.three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, zero_space + digit + insert_space + digit, ) self.two_or_three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, zero_space + single_digits_graph + pynini.closure(insert_space + digit, 0, 1), single_digits_graph + pynini.closure(insert_space + single_digits_graph, 3), @@ -307,7 +304,7 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), zero_space + single_digits_graph + pynini.closure(insert_space + single_digits_graph, 0, 1), @@ -316,9 +313,8 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_both = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), diff --git a/nemo_text_processing/text_normalization/sv/taggers/measure.py b/nemo_text_processing/text_normalization/sv/taggers/measure.py index e114e9e6d..4da3f81c2 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/measure.py +++ b/nemo_text_processing/text_normalization/sv/taggers/measure.py @@ -81,7 +81,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_graph_unit2 = pynini.closure( - delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1, + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py index 7cb62517f..0877ca08f 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py @@ -32,7 +32,7 @@ class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal - "21:a" -> ordinal { integer: "tjugoförsta" } + "21:a" -> ordinal { integer: "tjugoförsta" } Args: cardinal: CardinalFst deterministic: if True will provide a single transduction option, @@ -95,7 +95,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): hundreds |= pynini.cross("1", "ett hundra") hundreds |= digit + pynutil.insert(NEMO_SPACE) + pynutil.insert("hundra") - graph_hundreds = hundreds + pynini.union(graph_tens, 
(pynutil.delete("0") + graph_digit),) + graph_hundreds = hundreds + pynini.union( + graph_tens, + (pynutil.delete("0") + graph_digit), + ) if not deterministic: graph_hundreds |= hundreds + pynini.union( (graph_teens | pynutil.insert(NEMO_SPACE) + graph_teens), (pynini.cross("0", NEMO_SPACE) + graph_digit) @@ -179,7 +182,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/sv/taggers/telephone.py b/nemo_text_processing/text_normalization/sv/taggers/telephone.py index 4b37d28de..a03e0430b 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/taggers/telephone.py @@ -50,8 +50,8 @@ class TelephoneFst(GraphFst): https://codegolf.stackexchange.com/questions/195787/format-a-swedish-phone-number Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/sv/taggers/time.py b/nemo_text_processing/text_normalization/sv/taggers/time.py index 676e78592..cb5067058 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/text_normalization/sv/taggers/time.py @@ -106,7 +106,11 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"") final_suffix_optional = pynini.closure(ensure_space + final_suffix, 0, 1) final_time_zone = pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") - final_time_zone_optional = pynini.closure(NEMO_SPACE + final_time_zone, 0, 1,) + final_time_zone_optional = pynini.closure( + NEMO_SPACE + final_time_zone, + 0, + 1, + ) # 2:30 pm, 02:30, 2:00 graph_hm_kl = ( diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py b/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py index 404b42495..dd71814a1 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py @@ -25,14 +25,14 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones - decimal { integer_part: "un" quantity: "billón" } -> un billón + Finite state transducer for classifying decimal, e.g. 
+ decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones + decimal { integer_part: "un" quantity: "billón" } -> un billón Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py index af17c6d48..6656e3445 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py @@ -40,7 +40,11 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) + optional_country_code = pynini.closure( + country_code + delete_space + insert_space, + 0, + 1, + ) number_part = ( pynutil.delete("number_part: \"") diff --git a/nemo_text_processing/text_normalization/token_parser.py b/nemo_text_processing/text_normalization/token_parser.py index 638b71bbf..4adcd7d7f 100644 --- a/nemo_text_processing/text_normalization/token_parser.py +++ b/nemo_text_processing/text_normalization/token_parser.py @@ -34,7 +34,7 @@ def __call__(self, text): Args: text: text to be parsed - + """ self.text = text self.len_text = len(text) @@ -107,11 +107,11 @@ def parse_token_value(self) -> Union[str, dict]: def parse_char(self, exp) -> bool: """ - Parses character + Parses character Args: exp: character to read in - + Returns true if successful """ assert self.char == exp @@ -124,7 +124,7 @@ def parse_chars(self, exp) -> bool: Args: exp: characters to read in - + Returns true if successful """ ok = False @@ -181,8 +181,8 @@ def parse_ws(self): def read(self): """ - Reads in next char. - + Reads in next char. 
+ Returns true if not EOS """ if self.index < self.len_text - 1: # should be unique diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index 21437e82f..a0c3b587d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -35,7 +35,7 @@ def __init__(self, deterministic: bool = True): graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) - alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 + alls = NEMO_DIGIT**2 | NEMO_DIGIT**1 graph_all = ( (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit ) # graph_all when within a larger number e.g., 316-> 三百一十六 instead of 三百十六 @@ -46,7 +46,7 @@ def __init__(self, deterministic: bool = True): ) # graph_all when at the head of the larger numbere.g., 13万 -> 十三万 instead of 一十三万 graph_all_alt = alls @ graph_all_alt - hundreds = NEMO_DIGIT ** 3 + hundreds = NEMO_DIGIT**3 graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), @@ -56,7 +56,7 @@ def __init__(self, deterministic: bool = True): self.digit = graph_digit.optimize() self.all = graph_all.optimize() - thousands = NEMO_DIGIT ** 4 + thousands = NEMO_DIGIT**4 graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -64,7 +64,7 @@ def __init__(self, deterministic: bool = True): ) graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT ** 5 + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -73,8 +73,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - hundred_thousands = NEMO_DIGIT ** 6 - hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands = NEMO_DIGIT**6 + hundred_thousands_position = NEMO_DIGIT**2 hundred_thousands_position = hundred_thousands_position @ graph_all_alt graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT ** 7 - million_position = NEMO_DIGIT ** 3 + millions = NEMO_DIGIT**7 + million_position = NEMO_DIGIT**3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -95,8 +95,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT ** 8 - ten_million_position = NEMO_DIGIT ** 4 + ten_millions = NEMO_DIGIT**8 + ten_million_position = NEMO_DIGIT**4 ten_million_position = ten_million_position @ graph_thousand_component graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -106,7 +106,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = 
NEMO_DIGIT ** 9 + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -119,8 +119,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT ** 10 - thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions = NEMO_DIGIT**10 + thousand_millions_position = NEMO_DIGIT**2 thousand_millions_position = thousand_millions_position @ graph_all_alt graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -134,8 +134,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT ** 11 - ten_billions_position = NEMO_DIGIT ** 3 + ten_billions = NEMO_DIGIT**11 + ten_billions_position = NEMO_DIGIT**3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -149,8 +149,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ graph_ten_billions_component - hundred_billions = NEMO_DIGIT ** 12 - hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions = NEMO_DIGIT**12 + hundred_billions_position = NEMO_DIGIT**4 hundred_billions_position = hundred_billions_position @ graph_thousand_component graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), diff --git a/nemo_text_processing/text_normalization/zh/taggers/date.py b/nemo_text_processing/text_normalization/zh/taggers/date.py index 607b63511..f5ea122e7 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/text_normalization/zh/taggers/date.py @@ -32,7 +32,7 @@ class DateFst(GraphFst): 2002/02 -> is an error format according to the national standard 02/11 -> is an error format according to the national standard According to national standard, only when the year, month, and day are all exist, it is allowed to use symbols to separate them - + """ def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/text_normalization/zh/taggers/decimal.py b/nemo_text_processing/text_normalization/zh/taggers/decimal.py index d4afb3fd9..713fd4ab2 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/decimal.py @@ -64,7 +64,7 @@ class DecimalFst(GraphFst): 0.5 -> decimal { integer_part: "零" fractional_part: "五" } 0.5万 -> decimal { integer_part: "零" fractional_part: "五" quantity: "万" } -0.5万 -> decimal { negative: "负" integer_part: "零" fractional_part: "五" quantity: "万"} - + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/zh/taggers/fraction.py b/nemo_text_processing/text_normalization/zh/taggers/fraction.py index 3f9ce42c7..e3ad5b513 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/taggers/fraction.py @@ -30,7 +30,7 @@ class FractionFst(GraphFst): 100分之1 -> tokens { fraction { denominator: "一百" numerator: "一"} } 百分之1 -> tokens { fraction { denominator: "百" numerator: "一"} } 98% -> tokens { fraction { denominator: 
"百" numerator: "九十八"} } - + Args: cardinal: CardinalFst, decimal: DecimalFst """ diff --git a/nemo_text_processing/text_normalization/zh/taggers/measure.py b/nemo_text_processing/text_normalization/zh/taggers/measure.py index d7da8f524..1ec47aae9 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/measure.py +++ b/nemo_text_processing/text_normalization/zh/taggers/measure.py @@ -22,7 +22,7 @@ class MeasureFst(GraphFst): ''' - 1kg -> tokens { measure { cardinal { integer: "一" } units: "千克" } } + 1kg -> tokens { measure { cardinal { integer: "一" } units: "千克" } } ''' def __init__( diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index 82e1c174f..b283f3444 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -22,17 +22,19 @@ class PreProcessorFst(GraphFst): ''' - Preprocessing of TN: - 1. interjections removal such as '啊, 呃' - 2. fullwidth -> halfwidth char conversion - 好啊 -> 好 - 呃对 -> 对 - : -> : - ; -> ; + Preprocessing of TN: + 1. interjections removal such as '啊, 呃' + 2. fullwidth -> halfwidth char conversion + 好啊 -> 好 + 呃对 -> 对 + : -> : + ; -> ; ''' def __init__( - self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True, + self, + remove_interjections: bool = True, + fullwidth_to_halfwidth: bool = True, ): super().__init__(name="PreProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index d35ea178b..3a0b28aeb 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -35,9 +35,9 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. 
deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/zh/utils.py b/nemo_text_processing/text_normalization/zh/utils.py index 4d08f1deb..175aba206 100644 --- a/nemo_text_processing/text_normalization/zh/utils.py +++ b/nemo_text_processing/text_normalization/zh/utils.py @@ -28,7 +28,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -50,7 +50,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py index 00ba3b8ed..4487c6449 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py @@ -21,7 +21,7 @@ class MeasureFst(GraphFst): ''' - tokens { measure { cardinal: "一" } units: "千克" } } -> 一千克 + tokens { measure { cardinal: "一" } units: "千克" } } -> 一千克 ''' def __init__( diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py index 4bafef0bd..8b196dcaf 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index a63769787..dcdd73622 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -29,14 +29,18 @@ class PostProcessor(GraphFst): ''' - Postprocessing of TN, now contains: - 1. punctuation removal - 2. letter case conversion - 3. oov tagger + Postprocessing of TN, now contains: + 1. punctuation removal + 2. letter case conversion + 3. 
oov tagger ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py index 221fbcbc7..a927f4716 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py @@ -31,7 +31,7 @@ class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index b16625530..846254938 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -24,9 +24,7 @@ class VerbalizeFinalFst(GraphFst): - """ - - """ + """ """ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) @@ -44,6 +42,11 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py index 662cf9f28..0dc6cca68 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py @@ -21,7 +21,7 @@ class Whitelist(GraphFst): ''' - tokens { whitelist: "ATM" } -> A T M + tokens { whitelist: "ATM" } -> A T M ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/word.py b/nemo_text_processing/text_normalization/zh/verbalizers/word.py index f30f254c5..b481d78d5 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/word.py @@ -20,7 +20,7 @@ class WordFst(GraphFst): ''' - tokens { char: "你" } -> 你 + tokens { char: "你" } -> 你 ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/setup.py b/setup.py index 4667b49e8..e22afbab3 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,9 @@ elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( - os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', encoding='utf-8', + os.path.join(os.path.abspath(os.path.dirname(__file__)), 
'README.rst'), + 'r', + encoding='utf-8', ).read() long_description_content_type = "text/x-rst" @@ -125,7 +127,8 @@ def __call_checker(self, base_command, scope, check): command.extend(['--check', '--diff']) self.announce( - msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, + msg='Running command: %s' % str(' '.join(command)), + level=distutils_log.INFO, ) return_code = subprocess.call(command) @@ -133,10 +136,18 @@ def __call_checker(self, base_command, scope, check): return return_code def _isort(self, scope, check): - return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) + return self.__call_checker( + base_command=self.__ISORT_BASE.split(), + scope=scope, + check=check, + ) def _black(self, scope, check): - return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, check=check,) + return self.__call_checker( + base_command=self.__BLACK_BASE.split(), + scope=scope, + check=check, + ) def _pass(self): self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) diff --git a/tests/conftest.py b/tests/conftest.py index b2216e874..a26dab531 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,13 +56,15 @@ def pytest_addoption(parser): help="path to a directory with .far grammars for CPU TN/ITN tests, (DEFAULT: None, i.e. no cache)", ) parser.addoption( - '--run_audio_based', action='store_true', help="pass this argument to run audio-based TN tests", + '--run_audio_based', + action='store_true', + help="pass this argument to run audio-based TN tests", ) @pytest.fixture def device(request): - """ Simple fixture returning string denoting the device [CPU | GPU] """ + """Simple fixture returning string denoting the device [CPU | GPU]""" if request.config.getoption("--cpu"): return "CPU" else: @@ -104,7 +106,7 @@ def cleanup_local_folder(): @pytest.fixture def test_data_dir(): - """ Fixture returns test_data_dir. """ + """Fixture returns test_data_dir.""" # Test dir. test_data_dir_ = join(dirname(__file__), __TEST_DATA_SUBDIR) return test_data_dir_ @@ -148,10 +150,12 @@ def pytest_configure(config): If file absent or sizes not equal, function downloads the archive from github and unpacks it. """ config.addinivalue_line( - "markers", "run_only_on(device): runs the test only on a given device [CPU | GPU]", + "markers", + "run_only_on(device): runs the test only on a given device [CPU | GPU]", ) config.addinivalue_line( - "markers", "with_downloads: runs the test using data present in tests/.data", + "markers", + "with_downloads: runs the test using data present in tests/.data", ) # Test dir and archive filepath. 
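The tests/conftest.py changes in this hunk only reflow existing calls; the option/fixture/marker plumbing itself is unchanged. For readers unfamiliar with that plumbing, here is a self-contained sketch of the pattern (names mirror the real file, the logic is simplified): a command-line option feeds a fixture, and pytest_configure registers the custom marker that the rw tests added in this patch apply with @pytest.mark.run_only_on('CPU').

import pytest

def pytest_addoption(parser):
    # Command-line switch consumed by the fixture below.
    parser.addoption('--cpu', action='store_true', help="run TN/ITN tests on CPU only")

@pytest.fixture
def device(request):
    """Simple fixture returning string denoting the device [CPU | GPU]"""
    return "CPU" if request.config.getoption("--cpu") else "GPU"

def pytest_configure(config):
    # Registering the marker keeps pytest from warning about unknown marks.
    config.addinivalue_line(
        "markers",
        "run_only_on(device): runs the test only on a given device [CPU | GPU]",
    )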
test_dir = join(dirname(__file__), __TEST_DATA_SUBDIR) diff --git a/tests/nemo_text_processing/ar/test_money.py b/tests/nemo_text_processing/ar/test_money.py index 6fe36ba35..2aa49ba9a 100644 --- a/tests/nemo_text_processing/ar/test_money.py +++ b/tests/nemo_text_processing/ar/test_money.py @@ -49,6 +49,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_address.py b/tests/nemo_text_processing/en/test_address.py index c7a3523a0..ea8328d10 100644 --- a/tests/nemo_text_processing/en/test_address.py +++ b/tests/nemo_text_processing/en/test_address.py @@ -42,6 +42,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_cardinal.py b/tests/nemo_text_processing/en/test_cardinal.py index 1ee3a2a5b..f40e0d1f6 100644 --- a/tests/nemo_text_processing/en/test_cardinal.py +++ b/tests/nemo_text_processing/en/test_cardinal.py @@ -63,6 +63,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/en/test_decimal.py b/tests/nemo_text_processing/en/test_decimal.py index ff021f72a..ea20f18d6 100644 --- a/tests/nemo_text_processing/en/test_decimal.py +++ b/tests/nemo_text_processing/en/test_decimal.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_electronic.py b/tests/nemo_text_processing/en/test_electronic.py index e8640062c..4dfec585e 100644 --- a/tests/nemo_text_processing/en/test_electronic.py +++ b/tests/nemo_text_processing/en/test_electronic.py @@ -60,6 +60,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=100, punct_post_process=False, + test_input, + n_tagged=100, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_fraction.py b/tests/nemo_text_processing/en/test_fraction.py index 764205591..a6186aabb 100644 --- a/tests/nemo_text_processing/en/test_fraction.py +++ b/tests/nemo_text_processing/en/test_fraction.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_math.py b/tests/nemo_text_processing/en/test_math.py index 
e2ecdebb8..22859f596 100644 --- a/tests/nemo_text_processing/en/test_math.py +++ b/tests/nemo_text_processing/en/test_math.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_measure.py b/tests/nemo_text_processing/en/test_measure.py index b03b3ff53..6ea9a0eda 100644 --- a/tests/nemo_text_processing/en/test_measure.py +++ b/tests/nemo_text_processing/en/test_measure.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_money.py b/tests/nemo_text_processing/en/test_money.py index c81945ecd..103223d5e 100644 --- a/tests/nemo_text_processing/en/test_money.py +++ b/tests/nemo_text_processing/en/test_money.py @@ -63,6 +63,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_ordinal.py b/tests/nemo_text_processing/en/test_ordinal.py index 6f87a832d..dac56bf38 100644 --- a/tests/nemo_text_processing/en/test_ordinal.py +++ b/tests/nemo_text_processing/en/test_ordinal.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_punctuation.py b/tests/nemo_text_processing/en/test_punctuation.py index 75ff2e73c..761b3c9f4 100644 --- a/tests/nemo_text_processing/en/test_punctuation.py +++ b/tests/nemo_text_processing/en/test_punctuation.py @@ -22,7 +22,11 @@ class TestPunctuation: normalizer_en = Normalizer( - input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True, + input_case='cased', + lang='en', + cache_dir=CACHE_DIR, + overwrite_cache=False, + post_process=True, ) # address is tagged by the measure class diff --git a/tests/nemo_text_processing/en/test_range.py b/tests/nemo_text_processing/en/test_range.py index ac93613be..64b47d898 100644 --- a/tests/nemo_text_processing/en/test_range.py +++ b/tests/nemo_text_processing/en/test_range.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_roman.py b/tests/nemo_text_processing/en/test_roman.py index dc9468fb3..3ef655c65 100644 --- a/tests/nemo_text_processing/en/test_roman.py +++ b/tests/nemo_text_processing/en/test_roman.py @@ -40,6 +40,8 @@ def test_norm(self, test_input, expected): if 
self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_serial.py b/tests/nemo_text_processing/en/test_serial.py index aab870abf..2a27b1f54 100644 --- a/tests/nemo_text_processing/en/test_serial.py +++ b/tests/nemo_text_processing/en/test_serial.py @@ -38,6 +38,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=-1, punct_post_process=False, + test_input, + n_tagged=-1, + punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/en/test_special_text.py b/tests/nemo_text_processing/en/test_special_text.py index a461fe703..73be5d382 100644 --- a/tests/nemo_text_processing/en/test_special_text.py +++ b/tests/nemo_text_processing/en/test_special_text.py @@ -41,6 +41,8 @@ def test_norm(self, test_input, expected): # Audio-based normalization will output only options without digits if self.normalizer_with_audio_en and sum([1 for ch in expected if ch.isdigit()]) == 0: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=True, + test_input, + n_tagged=30, + punct_post_process=True, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/es/test_ordinal.py b/tests/nemo_text_processing/es/test_ordinal.py index e2cd7d4a2..1a48d6da8 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -62,6 +62,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, n_tagged=500, punct_post_process=False, + test_input, + n_tagged=500, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/rw/test_cardinal.py b/tests/nemo_text_processing/rw/test_cardinal.py index d1d290cb4..eed4be57a 100644 --- a/tests/nemo_text_processing/rw/test_cardinal.py +++ b/tests/nemo_text_processing/rw/test_cardinal.py @@ -24,11 +24,9 @@ class TestCardinal: - normalizer_rw = Normalizer( input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True ) - @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @@ -37,5 +35,3 @@ def test_norm(self, test_input, expected): pred = self.normalizer_rw.normalize(test_input, verbose=False, punct_post_process=False) assert pred == expected, f"input: {test_input}" print(pred) - - \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/test_time.py b/tests/nemo_text_processing/rw/test_time.py index ff49a3dc8..a8ada8f73 100644 --- a/tests/nemo_text_processing/rw/test_time.py +++ b/tests/nemo_text_processing/rw/test_time.py @@ -24,7 +24,6 @@ class TestTime: - normalizer_rw = Normalizer(input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_time.txt')) @@ -33,5 +32,3 @@ class TestTime: def test_norm(self, test_input, expected): pred = self.normalizer_rw.normalize(test_input, verbose=False) assert pred == expected - - \ No newline at end of 
file diff --git a/tests/nemo_text_processing/rw/test_whitelist.py b/tests/nemo_text_processing/rw/test_whitelist.py index b5850ab6a..3726dbaff 100644 --- a/tests/nemo_text_processing/rw/test_whitelist.py +++ b/tests/nemo_text_processing/rw/test_whitelist.py @@ -24,9 +24,8 @@ class TestWhitelist: - - normalizer_rw = Normalizer(input_case='cased',lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) + normalizer_rw = Normalizer(input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_whitelist.txt')) @pytest.mark.run_only_on('CPU') @@ -34,4 +33,3 @@ class TestWhitelist: def test_norm(self, test_input, expected): pred = self.normalizer_rw.normalize(test_input, verbose=False) assert pred == expected - diff --git a/tests/nemo_text_processing/rw/test_word.py b/tests/nemo_text_processing/rw/test_word.py index 06fff29b1..10f2e1883 100644 --- a/tests/nemo_text_processing/rw/test_word.py +++ b/tests/nemo_text_processing/rw/test_word.py @@ -24,9 +24,8 @@ class TestWord: - - normalizer_rw = Normalizer(input_case='cased',lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) + normalizer_rw = Normalizer(input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @@ -34,4 +33,3 @@ class TestWord: def test_norm(self, test_input, expected): pred = self.normalizer_rw.normalize(test_input, verbose=False) assert pred == expected - diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 0cbd53349..f20660502 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -87,7 +87,25 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy', 'mr', 'ja','rw'], + choices=[ + "en", + "de", + "es", + "pt", + "ru", + 'fr', + 'hu', + 'sv', + 'vi', + 'zh', + 'ar', + 'it', + 'es_en', + 'hy', + 'mr', + 'ja', + 'rw', + ], type=str, default='en', ) @@ -275,9 +293,7 @@ def parse_args(): from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) - from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import ( - VerbalizeFst as TNVerbalizeFst, - ) + from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, From 26a7131fad6307fbc2edb724e2b68eb51b393ae5 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Tue, 17 Sep 2024 00:08:01 +0200 Subject: [PATCH 7/7] Disabled Black during formatting. 
Signed-off-by: kurt0cougar
---
 nemo_text_processing/hybrid/utils.py | 15 +----
 .../ar/taggers/cardinal.py | 4 +-
 .../ar/taggers/decimal.py | 4 +-
 .../en/taggers/cardinal.py | 5 +-
 .../en/taggers/date.py | 6 +-
 .../en/taggers/decimal.py | 4 +-
 .../en/taggers/electronic.py | 8 +--
 .../en/taggers/measure.py | 4 +-
 .../en/taggers/money.py | 2 +-
 .../en/taggers/telephone.py | 14 ++---
 .../en/taggers/time.py | 22 +-------
 .../es/taggers/cardinal.py | 9 ++-
 .../es/taggers/electronic.py | 8 +--
 .../es/taggers/ordinal.py | 8 +--
 .../es/taggers/telephone.py | 5 +-
 .../fr/taggers/cardinal.py | 9 ++-
 .../fr/taggers/date.py | 4 +-
 .../fr/verbalizers/decimal.py | 2 +-
 .../fr/verbalizers/ordinal.py | 4 +-
 .../hy/verbalizers/ordinal.py | 7 +--
 .../ja/taggers/cardinal.py | 5 +-
 .../ja/taggers/fraction.py | 4 +-
 .../ja/taggers/fraction_old.py | 4 +-
 .../ja/taggers/preprocessor.py | 4 +-
 .../ja/taggers/time.py | 8 +--
 .../ja/verbalizers/cardinal.py | 2 +-
 .../ja/verbalizers/postprocessor.py | 6 +-
 .../ja/verbalizers/time.py | 10 +---
 .../ja/verbalizers/verbalize_final.py | 7 +--
 .../mr/taggers/cardinal.py | 6 +-
 .../mr/taggers/date.py | 6 +-
 .../mr/taggers/decimal.py | 4 +-
 .../pt/taggers/cardinal.py | 15 +++--
 .../sv/taggers/date.py | 3 +-
 .../sv/taggers/decimal.py | 10 +---
 .../vi/taggers/cardinal.py | 7 +--
 .../vi/taggers/date.py | 5 +-
 .../vi/taggers/decimal.py | 6 +-
 .../vi/verbalizers/time.py | 4 +-
 .../zh/graph_utils.py | 10 +---
 .../zh/taggers/cardinal.py | 7 +--
 .../zh/taggers/date.py | 4 +-
 .../zh/taggers/money.py | 4 +-
 .../zh/taggers/tokenize_and_classify.py | 6 +-
 .../zh/verbalizers/cardinal.py | 2 +-
 .../zh/verbalizers/decimal.py | 2 +-
 .../text_normalization/ar/taggers/measure.py | 21 ++++---
 .../text_normalization/ar/taggers/money.py | 5 +-
 .../text_normalization/de/taggers/cardinal.py | 4 +-
 .../text_normalization/de/taggers/date.py | 2 +-
 .../text_normalization/de/taggers/measure.py | 4 +-
 .../de/taggers/telephone.py | 2 +-
 .../text_normalization/de/taggers/time.py | 4 +-
 .../de/taggers/tokenize_and_classify.py | 14 +----
 .../de/verbalizers/ordinal.py | 5 +-
 .../text_normalization/en/graph_utils.py | 40 ++-----
 .../text_normalization/en/taggers/cardinal.py | 4 +-
 .../text_normalization/en/taggers/date.py | 6 +-
 .../en/taggers/electronic.py | 18 ++----
 .../text_normalization/en/taggers/measure.py | 26 +++------
 .../text_normalization/en/taggers/money.py | 6 +-
 .../text_normalization/en/taggers/range.py | 13 ++---
 .../text_normalization/en/taggers/serial.py | 2 +-
 .../en/taggers/tokenize_and_classify.py | 15 +----
 .../en/verbalizers/ordinal.py | 5 +-
 .../text_normalization/es/graph_utils.py | 5 +-
 .../text_normalization/es/taggers/cardinal.py | 4 +-
 .../text_normalization/es/taggers/date.py | 2 +-
 .../text_normalization/es/taggers/fraction.py | 56 ++-----------------
 .../text_normalization/es/taggers/measure.py | 4 +-
 .../text_normalization/es/taggers/time.py | 4 +-
 .../es/taggers/tokenize_and_classify.py | 20 ++-----
 .../es/verbalizers/fraction.py | 6 +-
 .../fr/taggers/tokenize_and_classify.py | 9 +--
 .../text_normalization/hu/taggers/cardinal.py | 10 ++--
 .../text_normalization/hu/taggers/decimal.py | 2 +-
 .../text_normalization/hu/taggers/measure.py | 4 +-
 .../text_normalization/hu/taggers/time.py | 6 +-
 .../hu/taggers/tokenize_and_classify.py | 20 ++-----
 .../hu/verbalizers/telephone.py | 10 +---
 .../text_normalization/it/taggers/cardinal.py | 4 +-
 .../text_normalization/it/taggers/measure.py | 4 +-
 .../it/taggers/tokenize_and_classify.py | 15 +----
 .../normalize_with_audio.py | 15 +----
 .../text_normalization/ru/taggers/date.py | 2 +-
 .../ru/taggers/telephone.py | 8 +--
 .../text_normalization/rw/graph_utils.py | 32 ++---------
 .../text_normalization/rw/taggers/cardinal.py | 1 -
 .../rw/verbalizers/verbalize_final.py | 9 +--
 .../text_normalization/sv/taggers/cardinal.py | 28 ++++----
 .../text_normalization/sv/taggers/measure.py | 4 +-
 .../text_normalization/sv/taggers/ordinal.py | 7 +--
 .../text_normalization/sv/taggers/time.py | 6 +-
 .../sv/verbalizers/telephone.py | 6 +-
 .../text_normalization/zh/taggers/cardinal.py | 34 +++++------
 .../zh/taggers/preprocessor.py | 4 +-
 .../zh/verbalizers/postprocessor.py | 6 +-
 .../zh/verbalizers/verbalize_final.py | 7 +--
 setup.py | 19 ++-----
 tests/conftest.py | 10 +---
 tests/nemo_text_processing/ar/test_money.py | 4 +-
 tests/nemo_text_processing/en/test_address.py | 4 +-
 .../nemo_text_processing/en/test_cardinal.py | 4 +-
 tests/nemo_text_processing/en/test_decimal.py | 4 +-
 .../en/test_electronic.py | 4 +-
 .../nemo_text_processing/en/test_fraction.py | 4 +-
 tests/nemo_text_processing/en/test_math.py | 4 +-
 tests/nemo_text_processing/en/test_measure.py | 4 +-
 tests/nemo_text_processing/en/test_money.py | 4 +-
 tests/nemo_text_processing/en/test_ordinal.py | 4 +-
 .../en/test_punctuation.py | 6 +-
 tests/nemo_text_processing/en/test_range.py | 4 +-
 tests/nemo_text_processing/en/test_roman.py | 4 +-
 tests/nemo_text_processing/en/test_serial.py | 4 +-
 .../en/test_special_text.py | 4 +-
 tests/nemo_text_processing/es/test_ordinal.py | 4 +-
 116 files changed, 251 insertions(+), 689 deletions(-)

diff --git a/nemo_text_processing/hybrid/utils.py b/nemo_text_processing/hybrid/utils.py
index 82c96aa6f..d634f5a09 100644
--- a/nemo_text_processing/hybrid/utils.py
+++ b/nemo_text_processing/hybrid/utils.py
@@ -515,11 +515,7 @@ def _relax_diff(text):
     return acceptable


-def get_labels(
-    targets: List[str],
-    norm_texts_weights: List[Tuple[str, str]],
-    lang="en",
-) -> List[List[str]]:
+def get_labels(targets: List[str], norm_texts_weights: List[Tuple[str, str]], lang="en",) -> List[List[str]]:
     """
     Assign labels to generated normalization options (1 - for ground truth, 0 - other options)
     Args:
@@ -609,14 +605,7 @@ def print_df(df):
     prints data frame
     """
     with pd.option_context(
-        "display.max_rows",
-        None,
-        "display.max_columns",
-        None,
-        "display.width",
-        1000,
-        "display.max_colwidth",
-        400,
+        "display.max_rows", None, "display.max_columns", None, "display.width", 1000, "display.max_colwidth", 400,
     ):
         print(df)

diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py
index 2c58df6a9..47febc4ac 100644
--- a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py
@@ -33,9 +33,7 @@ def __init__(self, tn_cardinal):
         self.graph = pynini.invert(tn_cardinal.cardinal_numbers).optimize()

         optional_minus_graph = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE,
-            0,
-            1,
+            pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, 0, 1,
         )

         final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')

diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py
index 3b22ece05..f0d641d14 100644
--- a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py
@@ -36,9 +36,7 @@ def __init__(self, tn_decimal):
         super().__init__(name="decimal", kind="classify")

         optional_graph_negative = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space,
-            0,
-            1,
+            pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, 0, 1,
         )

         graph_fractional_part = pynini.invert(tn_decimal.graph_fractional).optimize()

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
index 5eea89af1..fa5df3367 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
@@ -207,10 +207,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
             + graph_in_thousands
         )

-        graph = pynini.union(
-            (graph_int | graph_ind) + delete_space + graph_hundreds,
-            graph_zero,
-        )
+        graph = pynini.union((graph_int | graph_ind) + delete_space + graph_hundreds, graph_zero,)

         graph = graph @ pynini.union(
             pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
index b1ace40ce..5be9240d7 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
@@ -165,11 +165,7 @@ def __init__(self, ordinal: GraphFst, input_case: str):
             + pynutil.add_weight(year_graph, -YEAR_WEIGHT)
             + pynutil.insert("\"")
         )
-        optional_graph_year = pynini.closure(
-            graph_year,
-            0,
-            1,
-        )
+        optional_graph_year = pynini.closure(graph_year, 0, 1,)
         graph_mdy = month_graph + (
             (delete_extra_space + day_graph) | graph_year | (delete_extra_space + day_graph + graph_year)
         )

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py
index 6e5de2418..1d730ec30 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py
@@ -97,9 +97,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
         point = pynutil.delete("point")

         optional_graph_negative = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space,
-            0,
-            1,
+            pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1,
         )

         graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"")

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py
index 0a41b4702..a2373d9d7 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py
@@ -106,13 +106,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
             delete_extra_space
             + url_symbols
             + delete_extra_space
-            + (
-                domain
-                | pynini.closure(
-                    accepted_username + delete_extra_space,
-                )
-                + accepted_username
-            )
+            + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username)
         )

         protocol_default = (

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py
index 69eeaa56e..2d9d5e02c 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py
@@ -58,9 +58,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU
         graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize()

         optional_graph_negative = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space,
-            0,
-            1,
+            pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1,
         )

         unit_singular = convert_space(graph_unit_singular)

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py
index 2c5d5ad78..2a1e32a49 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py
@@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU
         # "one fifty" -> "one hundred fifty"
         with_hundred = pynini.compose(
             pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA,
-            pynini.compose(cardinal_graph, NEMO_DIGIT**3),
+            pynini.compose(cardinal_graph, NEMO_DIGIT ** 3),
         )
         cardinal_graph |= with_hundred
         graph_decimal_final = decimal.final_graph_wo_negative

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py
index 9a106ca78..06d749e39 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py
@@ -40,7 +40,7 @@ def get_serial_number(cardinal):
     """
    digit = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT)
-    two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT**2), 0.002)
+    two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT ** 2), 0.002)
    character = digit | two_digit | NEMO_ALPHA
    sequence = (NEMO_LOWER_NOT_A | digit) + pynini.closure(pynutil.delete(" ") + character, 2)
    sequence |= character + pynini.closure(pynutil.delete(" ") + (digit | NEMO_ALPHA), 2)
@@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
         triple_digit.invert()

         # to handle cases like "one twenty three"
-        two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT**2)
+        two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
         double_digit_to_digit = (
             pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
         )
@@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):

         number_part = pynini.compose(
             single_double_or_triple_digit,
-            NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**4,
+            NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4,
         ).optimize()
         number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"")

@@ -156,16 +156,16 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
         graph = optional_country_code + number_part

         # credit card number
-        space_four_digits = insert_space + NEMO_DIGIT**4
+        space_four_digits = insert_space + NEMO_DIGIT ** 4
         space_five_digits = space_four_digits + NEMO_DIGIT
         space_six_digits = space_five_digits + NEMO_DIGIT
         credit_card_graph = pynini.compose(
             single_double_or_triple_digit,
-            NEMO_DIGIT**4 + (space_six_digits | (space_four_digits**2)) + space_four_digits,
+            NEMO_DIGIT ** 4 + (space_six_digits | (space_four_digits ** 2)) + space_four_digits,
         ).optimize()
         credit_card_graph |= pynini.compose(
-            single_double_or_triple_digit, NEMO_DIGIT**4 + space_six_digits + space_five_digits
+            single_double_or_triple_digit, NEMO_DIGIT ** 4 + space_six_digits + space_five_digits
         ).optimize()

         graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"")
@@ -173,7 +173,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
         # SSN
         ssn_graph = pynini.compose(
             single_double_or_triple_digit,
-            NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**2 + pynutil.insert("-") + NEMO_DIGIT**4,
+            NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4,
         ).optimize()

         graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"")

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py
index 46dc71bc8..53d3dd931 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py
@@ -71,32 +71,14 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
         graph_minute_double = pynini.union(*labels_minute_double) @ cardinal

         graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")
-        oclock = pynini.cross(
-            pynini.union(
-                "o' clock",
-                "o clock",
-                "o'clock",
-                "oclock",
-                "hundred hours",
-            ),
-            "",
-        )
+        oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock", "hundred hours",), "",)

         if input_case == INPUT_CASED:
             minute_to_graph = capitalized_input_graph(minute_to_graph)
             graph_minute_single = capitalized_input_graph(graph_minute_single)
             graph_minute_double = capitalized_input_graph(graph_minute_double)
             graph_minute_verbose |= pynini.cross("Half", "30") | pynini.cross("Quarter", "15")
-            oclock |= pynini.cross(
-                pynini.union(
-                    "O' clock",
-                    "O clock",
-                    "O'clock",
-                    "Oclock",
-                    "Hundred hours",
-                ),
-                "",
-            )
+            oclock |= pynini.cross(pynini.union("O' clock", "O clock", "O'clock", "Oclock", "Hundred hours",), "",)

         final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
         graph_minute = (

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py
index d3082509a..3e164bcc9 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py
@@ -160,13 +160,18 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
         self.graph_no_exception = graph.optimize()

         # save self.numbers_up_to_thousand for use in DecimalFst
-        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
+        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3)
         numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize()
         self.numbers_up_to_thousand = numbers_up_to_thousand.optimize()

         # save self.numbers_up_to_million for use in DecimalFst
         digits_up_to_million = (
-            NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6)
+            NEMO_DIGIT
+            | (NEMO_DIGIT ** 2)
+            | (NEMO_DIGIT ** 3)
+            | (NEMO_DIGIT ** 4)
+            | (NEMO_DIGIT ** 5)
+            | (NEMO_DIGIT ** 6)
         )
         numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
         self.numbers_up_to_million = numbers_up_to_million.optimize()

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py
index a7d767119..50a5e07f7 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py
@@ -136,13 +136,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
             delete_extra_space
             + symbols
             + delete_extra_space
-            + (
-                domain
-                | pynini.closure(
-                    accepted_username + delete_extra_space,
-                )
-                + accepted_username
-            )
+            + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username)
         )

         protocol_default = (

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py
index 7cdcfacc7..d97cc752a 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py
@@ -62,13 +62,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):

         full_graph_ties = graph_ties | (graph_ties + pynini.cross(" ", "y") + graph_digit)

-        ordinal_graph_union = pynini.union(
-            graph_digit,
-            graph_teens,
-            graph_twenties,
-            full_graph_ties,
-            graph_hundreds,
-        )
+        ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,)

         accept_o_endings = NEMO_SIGMA + pynini.accep("o")
         accept_a_endings = NEMO_SIGMA + pynini.accep("a")

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py
index 8c73ca434..2086d643c 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py
@@ -110,10 +110,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):

         # Denormalized phone numbers are grouped in sets of 3 or 4 digits
         group_of_two = pynini.union(doubled_digit, digit_twice, double_digits)
-        group_of_three = pynini.union(
-            tripled_digit,
-            single_digits + pynutil.delete(" ") + group_of_two,
-        )
+        group_of_three = pynini.union(tripled_digit, single_digits + pynutil.delete(" ") + group_of_two,)

         group_of_four = pynini.union(
             group_of_two + pynutil.delete(" ") + group_of_two,

diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py
index ea1fcf8ea..d827a63e2 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py
@@ -248,13 +248,18 @@ def __init__(self):
         self.graph_no_exception = graph.optimize()

         # save self.numbers_up_to_thousand for use in DecimalFst
-        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
+        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3)
         numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
         self.numbers_up_to_thousand = numbers_up_to_thousand

         # save self.numbers_up_to_million for use in DecimalFst
         digits_up_to_million = (
-            NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6)
+            NEMO_DIGIT
+            | (NEMO_DIGIT ** 2)
+            | (NEMO_DIGIT ** 3)
+            | (NEMO_DIGIT ** 4)
+            | (NEMO_DIGIT ** 5)
+            | (NEMO_DIGIT ** 6)
         )
         numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
         self.numbers_up_to_million = numbers_up_to_million

diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py
index 68d35741c..06807f6a3 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py
@@ -46,9 +46,7 @@ def __init__(self, cardinal: GraphFst):
         day_graph = self.cardinal | pynini.cross("premier", "1")  # Premier is only ordinal used for dates
         day_graph = pynutil.insert("day: \"") + day_graph + pynutil.insert("\"")
         optional_graph_year = pynini.closure(
-            delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""),
-            0,
-            1,
+            delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), 0, 1,
         )
         graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year

diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py
index 3e654b859..ce0bdf8c4 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py
@@ -46,7 +46,7 @@ def __init__(self):
         super().__init__(name="decimal", kind="verbalize")

         # Need parser to group digits by threes
-        exactly_three_digits = NEMO_DIGIT**3
+        exactly_three_digits = NEMO_DIGIT ** 3
         at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

         space_every_three_integer = (

diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
index 3179af643..77dd6323f 100644
--- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
+++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
@@ -61,12 +61,12 @@ def __init__(self):
         graph_roman_hundreds = pynini.string_file(get_abs_path("data/roman/hundreds_large.tsv")).invert()
         graph_roman_zero_digit = pynutil.delete("0")

-        graph_roman_hundreds = NEMO_DIGIT**3 @ (
+        graph_roman_hundreds = NEMO_DIGIT ** 3 @ (
             graph_roman_hundreds
             + pynini.union(graph_roman_ties, graph_roman_zero_digit)
             + pynini.union(graph_roman_digits, graph_roman_zero_digit)
         )
-        graph_roman_ties = NEMO_DIGIT**2 @ (
+        graph_roman_ties = NEMO_DIGIT ** 2 @ (
             graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit)
         )
         graph_roman_digits = NEMO_DIGIT @ graph_roman_digits

diff --git a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py
index b0d4e52cc..e912ff60b 100644
--- a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py
+++ b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py
@@ -37,12 +37,7 @@ def __init__(self):
         convert_one = pynini.cross("[BOS]1", "[BOS]1-ին")
         convert_rest = pynutil.insert("-րդ", weight=0.01)

-        suffix = pynini.cdrewrite(
-            convert_rest | convert_one,
-            "",
-            "[EOS]",
-            NEMO_SIGMA,
-        )
+        suffix = pynini.cdrewrite(convert_rest | convert_one, "", "[EOS]", NEMO_SIGMA,)
         graph = graph @ suffix
         delete_tokens = self.delete_tokens(graph)
         self.fst = delete_tokens.optimize()

diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py
index 15d17f81d..fa6bebd87 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py
@@ -39,10 +39,7 @@ def __init__(self):

         hundred = pynutil.delete("百") | pynutil.delete("ひゃく") | pynutil.delete("びゃく") | pynutil.delete("ぴゃく")
         hundred_alt = (
-            pynini.cross("百", "1")
-            | pynini.cross("ひゃく", "1")
-            | pynini.cross("びゃく", "1")
-            | pynini.cross("ぴゃく", "1")
+            pynini.cross("百", "1") | pynini.cross("ひゃく", "1") | pynini.cross("びゃく", "1") | pynini.cross("ぴゃく", "1")
         )
         graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0"))
         graph_hundred_component += pynini.union(

diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py
index bc4c8f60c..bf3b60630 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py
@@ -36,9 +36,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         cardinal = cardinal.just_cardinals
         decimal = decimal.just_decimal

-        fraction_word = (
-            pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の")
-        )
+        fraction_word = pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の")
         integer_word = pynini.accep("と") | pynini.accep("荷")
         optional_sign = (
             pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("マイナス", "-")) + pynutil.insert("\"")

diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py
index 8f474cbb5..5ef844495 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py
@@ -36,9 +36,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         cardinal = cardinal.just_cardinals
         decimal = decimal.just_decimal

-        fraction_word = (
-            pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の")
-        )
+        fraction_word = pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の")
         inetegr_word = pynutil.delete("と") | pynutil.delete("荷")
         optional_sign = (
             pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("マイナス", "-")) + pynutil.insert("\"")

diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py
index 26e053334..8fca40fdd 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py
@@ -32,9 +32,7 @@ class PreProcessorFst(GraphFst):
     '''

     def __init__(
-        self,
-        remove_interjections: bool = True,
-        fullwidth_to_halfwidth: bool = True,
+        self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True,
     ):
         super().__init__(name="PreProcessor", kind="processor")

diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py
index 20ff3f34a..8477dfaa5 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py
@@ -38,18 +38,14 @@ def __init__(self):
         minutes_seconds = pynini.string_file(get_abs_path("data/time_minutes_seconds.tsv"))

         hour_component = (
-            pynutil.insert("hours: \"")
-            + ((hours + pynutil.delete("時")) | pynini.accep("正午"))
-            + pynutil.insert("\"")
+            pynutil.insert("hours: \"") + ((hours + pynutil.delete("時")) | pynini.accep("正午")) + pynutil.insert("\"")
         )
         minute_component = (
             pynutil.insert("minutes: \"")
             + ((minutes_seconds + pynutil.delete("分")) | pynini.accep("半"))
             + pynutil.insert("\"")
         )
-        second_component = (
-            pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"")
-        )
+        second_component = pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"")

         graph_regular = (
             pynini.closure(hour_component + insert_space + minute_component + insert_space + second_component)

diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py
index 62d41cb65..60bdff8a1 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py
@@ -52,7 +52,7 @@ def __init__(self):
             + pynutil.delete("\"")
         )

-        exactly_three_digits = NEMO_DIGIT**3
+        exactly_three_digits = NEMO_DIGIT ** 3
         at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

         group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure()

diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py
index 103cfb7a8..7bbc16516 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py
@@ -36,11 +36,7 @@ class PostProcessor(GraphFst):
     '''

     def __init__(
-        self,
-        remove_puncts: bool = False,
-        to_upper: bool = False,
-        to_lower: bool = False,
-        tag_oov: bool = False,
+        self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False,
     ):
         super().__init__(name="PostProcessor", kind="processor")

diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py
index 8e95e14cf..798cd001d 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py
@@ -40,18 +40,12 @@ def __init__(self):
         hours_component |= hours_component_alt

         minutes_component = (
-            pynutil.delete("minutes: \"")
-            + pynini.closure(NEMO_NOT_QUOTE)
-            + pynutil.insert("分")
-            + pynutil.delete("\"")
+            pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("分") + pynutil.delete("\"")
         )
         minutes_component_alt = pynutil.delete("minutes: \"") + pynini.accep("半") + pynutil.delete("\"")
         minutes_component |= minutes_component_alt

         second_component = (
-            pynutil.delete("seconds: \"")
-            + pynini.closure(NEMO_NOT_QUOTE)
-            + pynutil.insert("秒")
-            + pynutil.delete("\"")
+            pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("秒") + pynutil.delete("\"")
         )
         suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")

diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py
index 7624d5f1b..980e41816 100644
--- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py
+++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py
@@ -47,12 +47,7 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_
         )
         verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space)

-        postprocessor = PostProcessor(
-            remove_puncts=False,
-            to_upper=False,
-            to_lower=False,
-            tag_oov=False,
-        )
+        postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,)

         self.fst = (verbalizer @ postprocessor.fst).optimize()
         if far_file:

diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py
index 8aa218a9a..27d0a35c5 100644
--- a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py
@@ -91,11 +91,7 @@ def __init__(self):
             graph_arabs + delete_space + graph_crores + delete_space + graph_lakhs + delete_space + graph_thousands
         )

-        graph = pynini.union(
-            graph_higher_powers + delete_space + graph_hundreds,
-            graph_hundred_unique,
-            graph_zero,
-        )
+        graph = pynini.union(graph_higher_powers + delete_space + graph_hundreds, graph_hundred_unique, graph_zero,)

         graph = graph @ pynini.union(
             pynutil.delete(pynini.closure("०")) + pynini.difference(NEMO_DIGIT, "०") + pynini.closure(NEMO_DIGIT), "०"

diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py
index 15a75affc..96e8fb08d 100644
--- a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py
@@ -46,11 +46,7 @@ def __init__(self, cardinal: GraphFst):
             + pynutil.add_weight(year_graph, -YEAR_WEIGHT)
             + pynutil.insert("\"")
         )
-        optional_graph_year = pynini.closure(
-            graph_year,
-            0,
-            1,
-        )
+        optional_graph_year = pynini.closure(graph_year, 0, 1,)
         graph_ad_bc = pynutil.insert("text: \"") + prefixes + delete_space + pynutil.insert("\"")

         graph_mdy = month_graph + (

diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py
index 92af8c7c3..8882b860c 100644
--- a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py
@@ -65,9 +65,7 @@ def __init__(self, cardinal: GraphFst):
         graph_digits = pynini.string_file(get_abs_path("data/numbers/digits.tsv")).invert()
         decimal_word = pynini.cross("पूर्णांक", "")
         optional_graph_negative = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space,
-            0,
-            1,
+            pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1,
         )
         graph_integer = (
             pynutil.insert("integer_part: \"")

diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py
index 59b30ae9e..8eeea3876 100644
--- a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py
@@ -171,9 +171,9 @@ def __init__(self, use_strict_e=False):
         ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
         graph_hundred_component_prefix_e = graph_hundred_component_prefix_e.optimize()

-        graph_hundred_component_no_prefix = pynini.union(
-            graph_hundreds + graph_e + graph_ties_component,
-        ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
+        graph_hundred_component_no_prefix = pynini.union(graph_hundreds + graph_e + graph_ties_component,) @ (
+            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
+        )
         graph_hundred_component_no_prefix = graph_hundred_component_no_prefix.optimize()

         graph_mil_prefix_e = pynini.union(
@@ -350,13 +350,18 @@ def __init__(self, use_strict_e=False):
         self.graph_no_exception = graph

         # save self.numbers_up_to_thousand for use in DecimalFst
-        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
+        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3)
         numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
         self.numbers_up_to_thousand = numbers_up_to_thousand

         # save self.numbers_up_to_million for use in DecimalFst
         digits_up_to_million = (
-            NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6)
+            NEMO_DIGIT
+            | (NEMO_DIGIT ** 2)
+            | (NEMO_DIGIT ** 3)
+            | (NEMO_DIGIT ** 4)
+            | (NEMO_DIGIT ** 5)
+            | (NEMO_DIGIT ** 6)
         )
         numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
         self.numbers_up_to_million = numbers_up_to_million

diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py
index 5d9308958..5bb6c63bc 100644
--- a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py
@@ -31,8 +31,7 @@ class DateFst(GraphFst):
     """

     def __init__(
-        self,
-        tn_date_tagger: GraphFst,
+        self, tn_date_tagger: GraphFst,
     ):
         super().__init__(name="date", kind="classify")

diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py
index 97bd36582..e39a9017a 100644
--- a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py
@@ -49,15 +49,7 @@ def __init__(self, itn_cardinal_tagger: GraphFst, tn_decimal_tagger: GraphFst):
         self.final_graph_wo_sign = final_graph_wo_sign

         self.final_graph_wo_negative = (
-            final_graph_wo_sign
-            | get_quantity(
-                final_graph_wo_sign,
-                None,
-                hundreds_no_one,
-                None,
-                False,
-                True,
-            )
+            final_graph_wo_sign | get_quantity(final_graph_wo_sign, None, hundreds_no_one, None, False, True,)
         ).optimize()

         optional_minus_graph = pynini.closure(pynini.cross("minus ", "negative: \"true\" "), 0, 1)

diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py
index 155513937..016df4f1d 100644
--- a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py
@@ -134,8 +134,7 @@ def __init__(self):
         )

         graph = graph @ pynini.union(
-            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
-            "0",
+            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0",
         )

         # don't convert cardinals from zero to nine inclusive
@@ -146,9 +145,7 @@ def __init__(self):
         self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

         optional_minus_graph = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE,
-            0,
-            1,
+            pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE, 0, 1,
         )

         final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')

diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py
index 21576efd5..b0cd8561a 100644
--- a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py
@@ -59,10 +59,7 @@ def _get_year_graph():
         def _get_digits_graph():
             zero = pynini.cross((pynini.union("linh", "lẻ")), "0")
             four = pynini.cross("tư", "4")
-            graph = pynini.union(
-                zero + delete_space + (graph_digit | four),
-                graph_zero + delete_space + graph_digit,
-            )
+            graph = pynini.union(zero + delete_space + (graph_digit | four), graph_zero + delete_space + graph_digit,)
             graph.optimize()
             return graph

diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py
index 60c550228..033f3d86e 100644
--- a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py
@@ -123,12 +123,10 @@ def __init__(self, cardinal: GraphFst):
         final_graph = optional_graph_negative + final_graph_wo_sign

         self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
-            final_graph_wo_sign,
-            cardinal.graph_hundred_component_at_least_one_none_zero_digit,
+            final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit,
         )
         final_graph |= optional_graph_negative + get_quantity(
-            final_graph_wo_sign,
-            cardinal.graph_hundred_component_at_least_one_none_zero_digit,
+            final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit,
         )
         final_graph = self.add_tokens(final_graph)
         self.fst = final_graph.optimize()

diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py
index 2ad4d5bbf..30d262722 100644
--- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py
@@ -70,9 +70,7 @@ def __init__(self):
         )
         optional_zone = pynini.closure(zone, 0, 1)
         optional_second = pynini.closure(
-            delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit),
-            0,
-            1,
+            delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit), 0, 1,
         )

         graph_h = hour + pynutil.insert("h")

diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py
index 9c0199b13..de1a7a28c 100644
--- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py
+++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py
@@ -86,10 +86,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
                 written_capitalized = written[0].upper() + written[1:]
                 additional_labels.extend(
                     [
-                        [
-                            written_capitalized,
-                            spoken.capitalize(),
-                        ],  # first letter capitalized
+                        [written_capitalized, spoken.capitalize(),],  # first letter capitalized
                         [
                             written_capitalized,
                             spoken.upper().replace(" AND ", " and "),
@@ -103,10 +100,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
                     print(f"This is weight {weight}")
                     if len(weight) == 0:
                         additional_labels.extend(
-                            [
-                                [written, spoken_no_space],
-                                [written_capitalized, spoken_no_space.upper()],
-                            ]
+                            [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
                         )
                     else:
                         additional_labels.extend(

diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py
index 0715a3988..f3b30238c 100644
--- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py
@@ -110,12 +110,7 @@ def __init__(self):
                 + graph_hundreds_complex
             )
             | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
-            | (
-                graph_hundreds_complex
-                + delete_ten_thousands
-                + pynini.cross(pynini.closure("零"), "000")
-                + graph_digits
-            )
+            | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
         )
         graph_millions = (
             pynutil.add_weight(graph_millions_simple, -1.0) | graph_millions_complex | pynutil.insert("0000000")

diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py
index 108c222fd..331f0b7ff 100644
--- a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py
@@ -61,9 +61,7 @@ def __init__(self):
         # graph_date = graph_year | graph_month | graph_day

         # grammar for optional prefix ad or bc
-        graph_bc_prefix = (
-            pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1)
-        )
+        graph_bc_prefix = pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1)
         graph_bc = pynutil.delete(graph_bc_prefix)

         graph_ad_prefix = (

diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py
index 477a82f5d..e660b6015 100644
--- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py
+++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py
@@ -57,9 +57,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):

         # yuan major plus minor
         major_symbol = pynini.accep("块") | pynini.cross("塊", "块")
-        tencent = pynini.accep("毛") | pynini.accep(
-            "角",
-        )
+        tencent = pynini.accep("毛") | pynini.accep("角",)
         cent = pynini.accep("分")
         graph_kuai = (
             graph_integer_component

diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py
index d183ad1ad..4c69b697c 100644
--- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py
@@ -48,11 +48,7 @@ class ClassifyFst(GraphFst):
     """

     def __init__(
-        self,
-        input_case: str,
-        cache_dir: str = None,
-        whitelist: str = None,
-        overwrite_cache: bool = False,
+        self, input_case: str, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False,
     ):
         super().__init__(name="tokenize_and_classify", kind="classify")

diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py
index f33987173..31d5880dc 100644
--- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py
@@ -28,7 +28,7 @@ def __init__(self):
         super().__init__(name="cardinal", kind="verbalize")

         # group numbers by three
-        exactly_three_digits = NEMO_DIGIT**3
+        exactly_three_digits = NEMO_DIGIT ** 3
         at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

         suffix = pynini.union(

diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py
index b36e44dfa..28e2d5ff1 100644
--- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py
+++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py
@@ -28,7 +28,7 @@ def __init__(self):
         super().__init__(name="decimal", kind="verbalize")

         # group numbers by three
-        exactly_three_digits = NEMO_DIGIT**3
+        exactly_three_digits = NEMO_DIGIT ** 3
         at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

         # insert a "," for every three numbers before decimal point

diff --git a/nemo_text_processing/text_normalization/ar/taggers/measure.py b/nemo_text_processing/text_normalization/ar/taggers/measure.py
index ce22f3d76..707b40998 100644
--- a/nemo_text_processing/text_normalization/ar/taggers/measure.py
+++ b/nemo_text_processing/text_normalization/ar/taggers/measure.py
@@ -55,9 +55,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de
         )

         optional_unit_denominator = pynini.closure(
-            pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator,
-            0,
-            1,
+            pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1,
         )

         unit_plural = (
@@ -78,14 +76,15 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de
         )

         subgraph_cardinal = (
-            optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")
-        ) @ cardinal.fst + insert_space + pynini.closure(
-            pynutil.delete(" "), 0, 1
-        ) + unit_plural | unit_plural + pynini.closure(
-            pynutil.delete(" "), 0, 1
-        ) + insert_space + (
-            optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")
-        ) @ cardinal.fst
+            (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst
+            + insert_space
+            + pynini.closure(pynutil.delete(" "), 0, 1)
+            + unit_plural
+            | unit_plural
+            + pynini.closure(pynutil.delete(" "), 0, 1)
+            + insert_space
+            + (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst
+        )

         subgraph_cardinal |= (
             (optional_graph_negative + pynini.accep("1")) @ cardinal.fst

diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py
index 925fa348e..5098989c6 100644
--- a/nemo_text_processing/text_normalization/ar/taggers/money.py
+++ b/nemo_text_processing/text_normalization/ar/taggers/money.py
@@ -142,10 +142,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         )

         graph_with_no_minor_curr = integer_plus_maj
-        graph_with_no_minor_curr |= pynutil.add_weight(
-            integer_plus_maj,
-            weight=0.0001,
-        )
+        graph_with_no_minor_curr |= pynutil.add_weight(integer_plus_maj, weight=0.0001,)

         graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order

diff --git a/nemo_text_processing/text_normalization/de/taggers/cardinal.py b/nemo_text_processing/text_normalization/de/taggers/cardinal.py
index 902a62b3f..a8ef5af17 100644
--- a/nemo_text_processing/text_normalization/de/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/de/taggers/cardinal.py
@@ -166,7 +166,7 @@ def thousand():
         self.graph = (
             ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1")
             @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA)
-            @ NEMO_DIGIT**24
+            @ NEMO_DIGIT ** 24
             @ graph
             @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA)
             @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA)
@@ -181,7 +181,7 @@ def thousand():
         self.graph_hundred_component_at_least_one_none_zero_digit = (
             ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1")
             @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA)
-            @ NEMO_DIGIT**3
+            @ NEMO_DIGIT ** 3
             @ hundred_non_zero()
         ) | pynini.cross("1", "eins")

diff --git a/nemo_text_processing/text_normalization/de/taggers/date.py b/nemo_text_processing/text_normalization/de/taggers/date.py
index 8c13882d2..21b32eb2b 100644
--- a/nemo_text_processing/text_normalization/de/taggers/date.py
+++ b/nemo_text_processing/text_normalization/de/taggers/date.py
@@ -42,7 +42,7 @@ def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike':
         cardinal: cardinal GraphFst
     """

-    year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT**2) @ cardinal.graph
+    year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT ** 2) @ cardinal.graph

    graph_two_digit = delete_leading_zero @ cardinal.two_digit_non_zero
    hundred = pynutil.insert("hundert")

diff --git a/nemo_text_processing/text_normalization/de/taggers/measure.py b/nemo_text_processing/text_normalization/de/taggers/measure.py
index a46822a0f..122ff8a67 100644
--- a/nemo_text_processing/text_normalization/de/taggers/measure.py
+++ b/nemo_text_processing/text_normalization/de/taggers/measure.py
@@ -82,9 +82,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de
         )

         optional_unit_denominator = pynini.closure(
-            pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator,
-            0,
-            1,
+            pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1,
         )

         unit_plural = (

diff --git a/nemo_text_processing/text_normalization/de/taggers/telephone.py b/nemo_text_processing/text_normalization/de/taggers/telephone.py
index 97482a236..90af2f07e 100644
--- a/nemo_text_processing/text_normalization/de/taggers/telephone.py
+++ b/nemo_text_processing/text_normalization/de/taggers/telephone.py
@@ -45,7 +45,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):

         numbers_with_single_digits = pynini.closure(graph_digit + insert_space) + graph_digit

-        two_digit_and_zero = (NEMO_DIGIT**2 @ cardinal.two_digit_non_zero) | graph_zero
+        two_digit_and_zero = (NEMO_DIGIT ** 2 @ cardinal.two_digit_non_zero) | graph_zero
         # def add_space_after_two_digit():
         #     return pynini.closure(two_digit_and_zero + insert_space) + (
         #         two_digit_and_zero

diff --git a/nemo_text_processing/text_normalization/de/taggers/time.py b/nemo_text_processing/text_normalization/de/taggers/time.py
index 2fe74f5ba..371ad16ac 100644
--- a/nemo_text_processing/text_normalization/de/taggers/time.py
+++ b/nemo_text_processing/text_normalization/de/taggers/time.py
@@ -65,9 +65,7 @@ def __init__(self, deterministic: bool = True):
             + pynutil.insert('"')
         )
         final_time_zone_optional = pynini.closure(
-            pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'),
-            0,
-            1,
+            pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), 0, 1,
         )

         # Accepts the following formats: 02:30 Uhr, 02.30 Uhr, 2:30 Uhr, 2.30 Uhr

diff --git a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py
index 646d7a6b7..e6590536f 100644
--- a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py
@@ -70,8 +70,7 @@ def __init__(
             os.makedirs(cache_dir, exist_ok=True)
             whitelist_file = os.path.basename(whitelist) if whitelist else ""
             far_file = os.path.join(
-                cache_dir,
-                f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far",
+                cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far",
             )
         if not overwrite_cache and far_file and os.path.exists(far_file):
             self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
@@ -93,10 +92,7 @@ def __init__(
             self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic)
             fraction_graph = self.fraction.fst
             self.measure = MeasureFst(
-                cardinal=self.cardinal,
-                decimal=self.decimal,
-                fraction=self.fraction,
-                deterministic=deterministic,
+                cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic,
             )
             measure_graph = self.measure.fst
             self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
@@ -108,11 +104,7 @@ def __init__(
             telephone_graph = self.telephone.fst
             self.electronic = ElectronicFst(deterministic=deterministic)
             electronic_graph = self.electronic.fst
-            self.money = MoneyFst(
-                cardinal=self.cardinal,
-                decimal=self.decimal,
-                deterministic=deterministic,
-            )
+            self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
             money_graph = self.money.fst
             self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
             whitelist_graph = self.whitelist.fst

diff --git a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py
index d4ea8eb09..f8d5f6967 100644
--- a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py
+++ b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py
@@ -43,10 +43,7 @@ def __init__(self, deterministic: bool = True):
         self.ordinal_stem = graph_digit | graph_ties | graph_thousands

         suffix = pynini.cdrewrite(
-            pynini.closure(self.ordinal_stem, 0, 1) + convert_rest,
-            "",
-            "[EOS]",
-            NEMO_SIGMA,
+            pynini.closure(self.ordinal_stem, 0, 1) + convert_rest, "", "[EOS]", NEMO_SIGMA,
         ).optimize()
         self.graph = pynini.compose(graph, suffix)
         self.suffix = suffix

diff --git a/nemo_text_processing/text_normalization/en/graph_utils.py b/nemo_text_processing/text_normalization/en/graph_utils.py
index 668e1fb7c..161e5d97e 100644
--- a/nemo_text_processing/text_normalization/en/graph_utils.py
+++ b/nemo_text_processing/text_normalization/en/graph_utils.py
@@ -103,36 +103,14 @@
 suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
 # _v = pynini.union("a", "e", "i", "o", "u")
 _c = pynini.union(
-    "b",
-    "c",
-    "d",
-    "f",
-    "g",
-    "h",
-    "j",
-    "k",
-    "l",
-    "m",
-    "n",
-    "p",
-    "q",
-    "r",
-    "s",
-    "t",
-    "v",
-    "w",
-    "x",
-    "y",
-    "z",
+    "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z",
 )
 _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
 _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
 _s = NEMO_SIGMA + pynutil.insert("s")

 graph_plural = plurals._priority_union(
-    suppletive,
-    plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA),
-    NEMO_SIGMA,
+    suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA,
 ).optimize()

 SINGULAR_TO_PLURAL = graph_plural
@@ -147,9 +125,7 @@


 def capitalized_input_graph(
-    graph: "pynini.FstLike",
-    original_graph_weight: float = None,
-    capitalized_graph_weight: float = None,
+    graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None,
 ) -> "pynini.FstLike":
     """
     Allow graph input to be capitalized, e.g. for ITN)
@@ -233,10 +209,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
                 written_capitalized = written[0].upper() + written[1:]
                 additional_labels.extend(
                     [
-                        [
-                            written_capitalized,
-                            spoken.capitalize(),
-                        ],  # first letter capitalized
+                        [written_capitalized, spoken.capitalize(),],  # first letter capitalized
                         [
                             written_capitalized,
                             spoken.upper().replace(" AND ", " and "),
@@ -250,10 +223,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
                     logger.debug(f"This is weight {weight}")
                     if len(weight) == 0:
                         additional_labels.extend(
-                            [
-                                [written, spoken_no_space],
-                                [written_capitalized, spoken_no_space.upper()],
-                            ]
+                            [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
                         )
                     else:
                         additional_labels.extend(

diff --git a/nemo_text_processing/text_normalization/en/taggers/cardinal.py b/nemo_text_processing/text_normalization/en/taggers/cardinal.py
index 5e2a8535c..6ec0ac9dd 100644
--- a/nemo_text_processing/text_normalization/en/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/en/taggers/cardinal.py
@@ -83,7 +83,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False):

         graph = (
             pynini.closure(NEMO_DIGIT, 1, 3)
-            + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT**3) | pynini.closure(NEMO_DIGIT**3))
+            + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3) | pynini.closure(NEMO_DIGIT ** 3))
         ) @ graph

         self.graph = graph
@@ -118,7 +118,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
         )
         final_graph |= pynini.compose(final_graph, one_to_a_replacement_graph.optimize() + NEMO_SIGMA).optimize()
         # remove commas for 4 digits numbers
-        four_digit_comma_graph = (NEMO_DIGIT - "0") + pynutil.delete(",") + NEMO_DIGIT**3
+        four_digit_comma_graph = (NEMO_DIGIT - "0") + pynutil.delete(",") + NEMO_DIGIT ** 3
         final_graph |= pynini.compose(four_digit_comma_graph.optimize(), final_graph).optimize()

         self.final_graph = final_graph

diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py
index 52225f0ba..869716ef9 100644
--- a/nemo_text_processing/text_normalization/en/taggers/date.py
+++ b/nemo_text_processing/text_normalization/en/taggers/date.py
@@ -126,11 +126,11 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True):
         123 A.D., 4200 B.C
     """
    graph = get_four_digit_year_graph(deterministic)
-    graph = (pynini.union("1", "2") + (NEMO_DIGIT**3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph
+    graph = (pynini.union("1", "2") + (NEMO_DIGIT ** 3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph

    graph |= _get_two_digit_year_with_s_graph()

-    three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT**2) @ cardinal_graph
+    three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
    year_with_suffix = (
        (get_four_digit_year_graph(deterministic=True) | three_digit_year) + delete_space + insert_space + year_suffix
    )
@@ -270,7 +270,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
         )
         graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
-        day_ex_month = (NEMO_DIGIT**2 - pynini.project(month_numbers_graph, "input")) @ day_graph
+        day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
         for x in ["-", "/", "."]:
             delete_sep = pynutil.delete(x)
             graph_dmy |= (

diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py
index 874d2e437..3262c7485 100644
--- a/nemo_text_processing/text_normalization/en/taggers/electronic.py
+++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py
@@ -49,15 +49,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         else:
             numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ")

-        cc_cues = pynutil.add_weight(
-            pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")),
-            MIN_NEG_WEIGHT,
-        )
+        cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,)

-        cc_cues = pynutil.add_weight(
-            pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")),
-            MIN_NEG_WEIGHT,
-        )
+        cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,)

         accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")

@@ -65,14 +59,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
             pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
         )

-        dict_words = pynutil.add_weight(
-            pynini.string_file(get_abs_path("data/electronic/words.tsv")),
-            MIN_NEG_WEIGHT,
-        )
+        dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT,)

         dict_words_without_delimiter = dict_words + pynini.closure(
-            pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT),
-            1,
+            pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1,
         )
         dict_words_graph = dict_words_without_delimiter | dict_words

diff --git a/nemo_text_processing/text_normalization/en/taggers/measure.py b/nemo_text_processing/text_normalization/en/taggers/measure.py
index e8d92e1da..fc61620ce 100644
--- a/nemo_text_processing/text_normalization/en/taggers/measure.py
+++ b/nemo_text_processing/text_normalization/en/taggers/measure.py
@@ -53,11 +53,7 @@ class MeasureFst(GraphFst):
     """

     def __init__(
-        self,
-        cardinal: GraphFst,
-        decimal: GraphFst,
-        fraction: GraphFst,
-        deterministic: bool = True,
+        self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True,
     ):
         super().__init__(name="measure", kind="classify", deterministic=deterministic)
         cardinal_graph = cardinal.graph_with_and | self.get_range(cardinal.graph_with_and)
@@ -67,8 +63,7 @@ def __init__(
         graph_unit |= pynini.string_file(get_abs_path("data/measure/unit_alternatives.tsv"))

         graph_unit |= pynini.compose(
-            pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER),
-            graph_unit,
+            pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), graph_unit,
         ).optimize()

         graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL)
@@ -81,9 +76,7 @@ def __init__(
         )

         optional_graph_unit2 = pynini.closure(
-            delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2,
-            0,
-            1,
+            delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1,
         )

         unit_plural = (
@@ -257,12 +250,11 @@ def get_address_graph(self, cardinal):
         ordinal_verbalizer = OrdinalVerbalizer().graph
         ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
         ordinal_num = pynini.compose(
-            pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'),
-            ordinal_verbalizer,
+            pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'), ordinal_verbalizer,
         )

         address_num = NEMO_DIGIT ** (1, 2) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
-        address_num += insert_space + NEMO_DIGIT**2 @ (
+        address_num += insert_space + NEMO_DIGIT ** 2 @ (
             pynini.closure(pynini.cross("0", "zero "), 0, 1)
             + cardinal.graph_hundred_component_at_least_one_none_zero_digit
         )
@@ -300,12 +292,8 @@ def get_address_graph(self, cardinal):
         state = pynini.invert(state_graph)
         state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1)

-        zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
-        zip_code = pynini.closure(
-            pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code,
-            0,
-            1,
-        )
+        zip_code = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph)
+        zip_code = pynini.closure(pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, 0, 1,)

         address = address_num + direction + address_words + pynini.closure(city + state + zip_code, 0, 1)

diff --git a/nemo_text_processing/text_normalization/en/taggers/money.py b/nemo_text_processing/text_normalization/en/taggers/money.py
index 0687b0c1a..ef38c56b5 100644
--- a/nemo_text_processing/text_normalization/en/taggers/money.py
+++ b/nemo_text_processing/text_normalization/en/taggers/money.py
@@ -112,8 +112,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = 
         integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular

         integer_plus_maj_with_comma = pynini.compose(
-            NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
-            integer_plus_maj,
+            NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj,
         )
         integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
         integer_plus_maj |= integer_plus_maj_with_comma
@@ -190,8 +189,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = 
         final_graph |= integer_graph_reordered | decimal_default_reordered
         # to handle "$2.00" cases
         final_graph |= pynini.compose(
-            NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1),
-            integer_graph_reordered,
+            NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered,
         )
         final_graph += graph_per_units.ques

diff --git a/nemo_text_processing/text_normalization/en/taggers/range.py b/nemo_text_processing/text_normalization/en/taggers/range.py
index c989e99f5..9d57a9fb9 100644
--- a/nemo_text_processing/text_normalization/en/taggers/range.py
+++ b/nemo_text_processing/text_normalization/en/taggers/range.py
@@ -33,12 +33,7 @@ class RangeFst(GraphFst):
     """

     def __init__(
-        self,
-        time: GraphFst,
-        date: GraphFst,
-        cardinal: GraphFst,
-        deterministic: bool = True,
-        lm: bool = False,
+        self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False,
     ):
         super().__init__(name="range", kind="classify", deterministic=deterministic)
@@ -52,14 +47,14 @@ def __init__(
         cardinal = cardinal.graph_with_and

         # YEAR
-        date_year_four_digit = (NEMO_DIGIT**4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
-        date_year_two_digit = (NEMO_DIGIT**2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
+        date_year_four_digit = (NEMO_DIGIT ** 4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
+        date_year_two_digit = (NEMO_DIGIT ** 2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
         year_to_year_graph = (
             date_year_four_digit
             + delete_space
             + pynini.cross("-", " to ")
             + delete_space
-            + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT**2 @ cardinal))
+            + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT ** 2 @ cardinal))
         )

         mid_year_graph = pynini.accep("mid") + pynini.cross("-", " ") + (date_year_four_digit | date_year_two_digit)

diff --git a/nemo_text_processing/text_normalization/en/taggers/serial.py b/nemo_text_processing/text_normalization/en/taggers/serial.py
index f650c8ff3..913c09285 100644
--- a/nemo_text_processing/text_normalization/en/taggers/serial.py
+++ b/nemo_text_processing/text_normalization/en/taggers/serial.py
@@ -71,7 +71,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = 
         num_graph |= pynini.compose(num_graph, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA)
         # also allow double digits to be pronounced as integer in serial number
         num_graph |= pynutil.add_weight(
-            NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001
+            NEMO_DIGIT ** 2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001
         )

         # add space between letter and digit/symbol

diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py
index 7a253cccc..28614fad1 100644
--- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py
@@ -78,8 +78,7 @@ def __init__(
             os.makedirs(cache_dir, exist_ok=True)
             whitelist_file = os.path.basename(whitelist) if whitelist else ""
             far_file = os.path.join(
-                cache_dir,
-                f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far",
+                cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far",
             )
         if not overwrite_cache and far_file and os.path.exists(far_file):
             self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
@@ -108,12 +107,7 @@ def __init__(
             logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes")

             start_time = time.time()
-            measure = MeasureFst(
-                cardinal=cardinal,
-                decimal=decimal,
-                fraction=fraction,
-                deterministic=deterministic,
-            )
+            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic,)
             measure_graph = measure.fst
             logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes")
@@ -163,10 +157,7 @@ def __init__(
             time_final = pynini.compose(time_graph, v_time_graph)
             date_final = pynini.compose(date_graph, v_date_graph)
             range_graph = RangeFst(
-                time=time_final,
-                date=date_final,
-                cardinal=cardinal,
-                deterministic=deterministic,
+                time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic,
             ).fst
             logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes")

diff --git a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py
index dff205f8e..4ad7d1c85 100644
--- a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py
+++ b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py
@@ -46,10 +46,7 @@ def __init__(self, deterministic: bool = True):
         convert_rest = pynutil.insert("th")

         suffix = pynini.cdrewrite(
-            graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest,
-            "",
-            "[EOS]",
-            NEMO_SIGMA,
+            graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest, "", "[EOS]", NEMO_SIGMA,
         ).optimize()
         self.graph = pynini.compose(graph, suffix)
         self.suffix = suffix

diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py
index 946f4234e..101185a90 100644
--- a/nemo_text_processing/text_normalization/es/graph_utils.py
+++ b/nemo_text_processing/text_normalization/es/graph_utils.py
@@ -107,10 +107,7 @@ def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
    fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA)
    fem_allign @= pynini.cdrewrite(
-        fem_ones,
-        "",
-        pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')),
-        NEMO_SIGMA,
+        fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA,
    )  # If before a quote or EOS, we know it's the end of a string

    return fst @ fem_allign

diff --git a/nemo_text_processing/text_normalization/es/taggers/cardinal.py b/nemo_text_processing/text_normalization/es/taggers/cardinal.py
index 85402089f..1b8f0a440 100644
--- a/nemo_text_processing/text_normalization/es/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/es/taggers/cardinal.py
@@ -47,7 +47,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike':
     Returns:
         fst: A pynini.FstLike object
     """
-    exactly_three_digits = NEMO_DIGIT**3  # for blocks of three
+    exactly_three_digits = NEMO_DIGIT ** 3  # for blocks of three
    up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)  # for start of string

    cardinal_string = pynini.closure(
@@ -157,7 +157,7 @@ def __init__(self, deterministic: bool = True):
         self.graph = (
             ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0))
             @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA)
-            @ NEMO_DIGIT**24
+            @ NEMO_DIGIT ** 24
             @ graph
             @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA)
             @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA)

diff --git a/nemo_text_processing/text_normalization/es/taggers/date.py b/nemo_text_processing/text_normalization/es/taggers/date.py
index dd5cd7f0e..ea7f15292 100644
--- a/nemo_text_processing/text_normalization/es/taggers/date.py
+++ b/nemo_text_processing/text_normalization/es/taggers/date.py
@@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool):
         dash = "-"
         day_optional = pynini.closure(pynini.cross(dash, NEMO_SPACE) + day, 0, 1)
-        graph_ymd = NEMO_DIGIT**4 @
year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional + graph_ymd = NEMO_DIGIT ** 4 @ year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional final_graph = graph_dmy + pynutil.insert(" preserve_order: true") final_graph |= graph_ymd diff --git a/nemo_text_processing/text_normalization/es/taggers/fraction.py b/nemo_text_processing/text_normalization/es/taggers/fraction.py index 7bbe86402..1fb5b8118 100644 --- a/nemo_text_processing/text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/es/taggers/fraction.py @@ -47,50 +47,15 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = ordinal_graph = ordinal.graph # 2-10 are all ordinals - three_to_ten = pynini.string_map( - [ - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - ] - ) + three_to_ten = pynini.string_map(["2", "3", "4", "5", "6", "7", "8", "9", "10",]) block_three_to_ten = pynutil.delete(three_to_ten) # To block cardinal productions if not deterministic: # Multiples of tens are sometimes rendered as ordinals - three_to_ten |= pynini.string_map( - [ - "20", - "30", - "40", - "50", - "60", - "70", - "80", - "90", - ] - ) + three_to_ten |= pynini.string_map(["20", "30", "40", "50", "60", "70", "80", "90",]) graph_three_to_ten = three_to_ten @ ordinal_graph graph_three_to_ten @= pynini.cdrewrite(ordinal_exceptions, "", "", NEMO_SIGMA) # Higher powers of tens (and multiples) are converted to ordinals. - hundreds = pynini.string_map( - [ - "100", - "200", - "300", - "400", - "500", - "600", - "700", - "800", - "900", - ] - ) + hundreds = pynini.string_map(["100", "200", "300", "400", "500", "600", "700", "800", "900",]) graph_hundreds = hundreds @ ordinal_graph multiples_of_thousand = ordinal.multiples_of_thousand # So we can have X milésimos @@ -103,10 +68,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = graph_higher_powers_of_ten += higher_powers_of_ten graph_higher_powers_of_ten = cardinal_graph @ graph_higher_powers_of_ten graph_higher_powers_of_ten @= pynini.cdrewrite( - pynutil.delete("un "), - pynini.accep("[BOS]"), - pynini.project(higher_powers_of_ten, "output"), - NEMO_SIGMA, + pynutil.delete("un "), pynini.accep("[BOS]"), pynini.project(higher_powers_of_ten, "output"), NEMO_SIGMA, ) # we drop 'un' from these ordinals (millionths, not one-millionths) graph_higher_powers_of_ten = multiples_of_thousand | graph_hundreds | graph_higher_powers_of_ten @@ -121,16 +83,10 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = # Blocking the digits and hundreds from Cardinal graph graph_fractions_cardinals = pynini.cdrewrite( - block_three_to_ten | block_higher_powers_of_ten, - pynini.accep("[BOS]"), - pynini.accep("[EOS]"), - NEMO_SIGMA, + block_three_to_ten | block_higher_powers_of_ten, pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, ) graph_fractions_cardinals @= NEMO_CHAR.plus @ pynini.cdrewrite( - pynutil.delete("0"), - pynini.accep("[BOS]"), - pynini.accep("[EOS]"), - NEMO_SIGMA, + pynutil.delete("0"), pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, ) # Empty characters become '0' for NEMO_CHAR fst, so need to block graph_fractions_cardinals @= cardinal_graph graph_fractions_cardinals += pynutil.insert( diff --git a/nemo_text_processing/text_normalization/es/taggers/measure.py b/nemo_text_processing/text_normalization/es/taggers/measure.py index a63677c47..a1933dbed 100644 --- a/nemo_text_processing/text_normalization/es/taggers/measure.py 
+++ b/nemo_text_processing/text_normalization/es/taggers/measure.py @@ -79,9 +79,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, - 0, - 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, ) complex_unit_singular_graph = ( diff --git a/nemo_text_processing/text_normalization/es/taggers/time.py b/nemo_text_processing/text_normalization/es/taggers/time.py index de2752657..4a947dd31 100644 --- a/nemo_text_processing/text_normalization/es/taggers/time.py +++ b/nemo_text_processing/text_normalization/es/taggers/time.py @@ -115,9 +115,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): time_zone_graph = time_zones + pynini.closure(utc_or_gmt_diff, 0, 1) final_time_zone_optional = pynini.closure( - delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), - 0, - 1, + delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), 0, 1, ) # 02.30 h diff --git a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py index 165f5eeca..5aa66031a 100644 --- a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py @@ -69,8 +69,7 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, - f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -87,17 +86,10 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst( - cardinal=self.cardinal, - ordinal=self.ordinal, - deterministic=deterministic, - ) + self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, - decimal=self.decimal, - fraction=self.fraction, - deterministic=deterministic, + cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -109,11 +101,7 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst( - cardinal=self.cardinal, - decimal=self.decimal, - deterministic=deterministic, - ) + self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py index 5d7afc1b7..3758c1bd5 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py +++ 
b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py @@ -141,8 +141,7 @@ def __init__(self, deterministic: bool = True): fraction_with_one_fem = numerator_one_fem + delete_space + insert_space fraction_with_one_fem += pynini.union( - denominator_singular_fem @ merge_stem, - denominator_singular_fem @ merge_into_single_word, + denominator_singular_fem @ merge_stem, denominator_singular_fem @ merge_into_single_word, ) # Both forms exist fraction_with_one_fem += pynutil.insert(" parte") fraction_with_one_fem @= pynini.cdrewrite( @@ -151,8 +150,7 @@ def __init__(self, deterministic: bool = True): fraction_default_fem = numerator_fem + delete_space + insert_space fraction_default_fem += pynini.union( - denominator_plural_fem @ merge_stem, - denominator_plural_fem @ merge_into_single_word, + denominator_plural_fem @ merge_stem, denominator_plural_fem @ merge_into_single_word, ) fraction_default_fem += pynutil.insert(" partes") diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index 0b38aeebb..de9a0b047 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -62,8 +62,7 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, - f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -80,11 +79,7 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst( - cardinal=self.cardinal, - ordinal=self.ordinal, - deterministic=deterministic, - ) + self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) fraction_graph = self.fraction.fst word_graph = WordFst(deterministic=deterministic).fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) diff --git a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py index c9c5c3063..c20a3d27b 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py @@ -62,7 +62,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': fst: A pynini.FstLike object """ cardinal_separator = pynini.string_map([".", NEMO_SPACE]) - exactly_three_digits = NEMO_DIGIT**3 # for blocks of three + exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string up_to_three_digits = up_to_three_digits - "000" - "00" - "0" @@ -246,7 +246,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT**24 + @ NEMO_DIGIT ** 24 @ graph @ clean_output ) @@ -257,12 +257,12 @@ def __init__(self, deterministic: bool = True): zero_space + digit, ).optimize() self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") +
(NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, - zero_space + ((NEMO_DIGIT**2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, + zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), zero_space + zero_space + digit, ).optimize() self.four_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**3)) @ self.graph, zero_space + self.three_digits_read + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 3)) @ self.graph, zero_space + self.three_digits_read ).optimize() self.graph |= graph_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/decimal.py b/nemo_text_processing/text_normalization/hu/taggers/decimal.py index 10ae4a8fe..5026caec3 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/decimal.py @@ -101,7 +101,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ]: for modifier in ["", "tíz", "száz"]: decimal_number |= ( - (NEMO_DIGIT**order + (NEMO_DIGIT - "0")) + (NEMO_DIGIT ** order + (NEMO_DIGIT - "0")) @ pynini.cdrewrite(pynini.cross("0", ""), "[BOS]", "", NEMO_SIGMA) @ cardinal_graph + final_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/measure.py b/nemo_text_processing/text_normalization/hu/taggers/measure.py index f2c3a2368..9e5f328fb 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hu/taggers/measure.py @@ -61,9 +61,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, - 0, - 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, ) unit_singular_graph = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/time.py b/nemo_text_processing/text_normalization/hu/taggers/time.py index 43e067fef..ae1592f74 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/time.py +++ b/nemo_text_processing/text_normalization/hu/taggers/time.py @@ -180,11 +180,7 @@ def hours_to_pairs(): final_time_zone = ( pynini.accep(" ") + pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") ) - final_time_zone_optional = pynini.closure( - final_time_zone, - 0, - 1, - ) + final_time_zone_optional = pynini.closure(final_time_zone, 0, 1,) # This might be better as just the inflected forms hour_only_delimited = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py index 8c269bb00..60ed0ddc9 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py @@ -69,8 +69,7 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, - f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -87,17 +86,10 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = 
FractionFst( - cardinal=self.cardinal, - ordinal=self.ordinal, - deterministic=deterministic, - ) + self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, - decimal=self.decimal, - fraction=self.fraction, - deterministic=deterministic, + cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -109,11 +101,7 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst( - cardinal=self.cardinal, - decimal=self.decimal, - deterministic=deterministic, - ) + self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py index b52e6efb7..f17f7c36a 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py @@ -34,11 +34,7 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure( - country_code + delete_space + insert_space, - 0, - 1, - ) + optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) number_part = ( pynutil.delete("number_part: \"") @@ -57,8 +53,6 @@ def __init__(self, deterministic: bool = True): 1, ) - graph = pynini.union( - optional_country_code + number_part + optional_extension, - ) + graph = pynini.union(optional_country_code + number_part + optional_extension,) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index 1e16d6e36..ecb003775 100644 --- a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -48,7 +48,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT**3 # for blocks of three + exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_string = pynini.closure( @@ -162,7 +162,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT**24 + @ NEMO_DIGIT ** 24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/it/taggers/measure.py b/nemo_text_processing/text_normalization/it/taggers/measure.py index 880be0aa7..40144cd61 100644 --- a/nemo_text_processing/text_normalization/it/taggers/measure.py +++ 
b/nemo_text_processing/text_normalization/it/taggers/measure.py @@ -68,9 +68,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, - 0, - 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py index 603d520b5..3aebcca91 100644 --- a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py @@ -66,8 +66,7 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, - f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -89,18 +88,10 @@ def __init__( self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.measure = MeasureFst( - cardinal=self.cardinal, - decimal=self.decimal, - deterministic=deterministic, - ) + self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) measure_graph = self.measure.fst - self.money = MoneyFst( - cardinal=self.cardinal, - decimal=self.decimal, - deterministic=deterministic, - ) + self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) money_graph = self.money.fst self.time = TimeFst(deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 8a60516cc..6a61efd4e 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -164,16 +164,11 @@ def normalize( text_with_span_tags_list[masked_idx_list[sem_tag_idx]] = "" else: non_deter_options = self.normalize_non_deterministic( - text=cur_semiotic_span, - n_tagged=n_tagged, - punct_post_process=punct_post_process, - verbose=verbose, + text=cur_semiotic_span, n_tagged=n_tagged, punct_post_process=punct_post_process, verbose=verbose, ) try: best_option, cer, _ = self.select_best_match( - normalized_texts=non_deter_options, - pred_text=cur_pred_text, - verbose=verbose, + normalized_texts=non_deter_options, pred_text=cur_pred_text, verbose=verbose, ) if cer_threshold > 0 and cer > cer_threshold: best_option = cur_deter_norm @@ -371,11 +366,7 @@ def get_verbalized_text(tagged_text): continue def select_best_match( - self, - normalized_texts: List[str], - pred_text: str, - verbose: bool = False, - remove_punct: bool = False, + self, normalized_texts: List[str], pred_text: str, verbose: bool = False, remove_punct: bool = False, ): """ Selects the best normalization option based on the lowest CER diff --git a/nemo_text_processing/text_normalization/ru/taggers/date.py b/nemo_text_processing/text_normalization/ru/taggers/date.py index 3ad16f999..2dc87ee06 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/date.py +++ b/nemo_text_processing/text_normalization/ru/taggers/date.py @@ -78,7 +78,7 @@ 
def __init__(self, number_names: dict, deterministic: bool): month = ( pynutil.insert("month: \"") + (month_name | pynutil.add_weight(digit_month, 0.1)) + pynutil.insert("\"") ).optimize() - year = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)), numbers).optimize() + year = pynini.compose(((NEMO_DIGIT ** 4) | (NEMO_DIGIT ** 2)), numbers).optimize() year |= zero_digit # reduce year options diff --git a/nemo_text_processing/text_normalization/ru/taggers/telephone.py b/nemo_text_processing/text_normalization/ru/taggers/telephone.py index 456bd6f1a..4fbfbf06a 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ru/taggers/telephone.py @@ -48,13 +48,13 @@ def __init__(self, number_names: dict, deterministic: bool = True): optional_country_code = pynini.closure(country_code + insert_space, 0, 1) number_part = ( - NEMO_DIGIT**3 @ number + NEMO_DIGIT ** 3 @ number + separator - + NEMO_DIGIT**3 @ number + + NEMO_DIGIT ** 3 @ number + separator - + NEMO_DIGIT**2 @ number + + NEMO_DIGIT ** 2 @ number + separator - + NEMO_DIGIT**2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) + + NEMO_DIGIT ** 2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) ) number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"") tagger_graph = (optional_country_code + number_part).optimize() diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py index 450ab1d01..ce75cd17e 100644 --- a/nemo_text_processing/text_normalization/rw/graph_utils.py +++ b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -25,7 +25,7 @@ from pynini.export import export from pynini.lib import byte, pynutil, utf8 -from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels +from nemo_text_processing.text_normalization.en.utils import get_abs_path from nemo_text_processing.utils.logging import logger NEMO_CHAR = utf8.VALID_UTF8_CHAR @@ -107,36 +107,14 @@ suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", - "c", - "d", - "f", - "g", - "h", - "j", - "k", - "l", - "m", - "n", - "p", - "q", - "r", - "s", - "t", - "v", - "w", - "x", - "y", - "z", + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, - plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), - NEMO_SIGMA, + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -151,9 +129,7 @@ def capitalized_input_graph( - graph: "pynini.FstLike", - original_graph_weight: float = None, - capitalized_graph_weight: float = None, + graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, ) -> "pynini.FstLike": """ Allow graph input to be capitalized, e.g. 
for ITN) diff --git a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py index 14da33500..958a95234 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py @@ -73,7 +73,6 @@ def __init__(self): FOUR_ZEROS = "0000" FIVE_ZEROS = "00000" SIX_ZEROS = "000000" - SIX_ZEROS = "000000" SEVEN_ZEROS = "0000000" EIGHT_ZEROS = "00000000" NINE_ZEROS = "000000000" diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py index 1ac1adf01..d39210ff5 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py @@ -19,14 +19,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst -from nemo_text_processing.text_normalization.rw.graph_utils import ( - NEMO_PUNCT, - GraphFst, - delete_extra_space, - delete_space, - delete_space_or_punct, - generator_main, -) +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst diff --git a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py index 750ff867b..021e652bd 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py @@ -69,7 +69,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT**3 # for blocks of three + exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_separator = NEMO_SPACE @@ -249,7 +249,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT**24 + @ NEMO_DIGIT ** 24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) @@ -276,27 +276,30 @@ def __init__(self, deterministic: bool = True): zero_space = zero + insert_space self.zero_space = zero_space self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - zero_space + ((NEMO_DIGIT**2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, - zero_space + ((NEMO_DIGIT**2) @ graph_tens), + zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, 
zero_space + digit + insert_space + digit, ) self.three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, zero_space + digit + insert_space + digit, ) self.two_or_three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, zero_space + single_digits_graph + pynini.closure(insert_space + digit, 0, 1), single_digits_graph + pynini.closure(insert_space + single_digits_graph, 3), @@ -304,7 +307,7 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), zero_space + single_digits_graph + pynini.closure(insert_space + single_digits_graph, 0, 1), @@ -313,8 +316,9 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_both = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), diff --git a/nemo_text_processing/text_normalization/sv/taggers/measure.py b/nemo_text_processing/text_normalization/sv/taggers/measure.py index 4da3f81c2..e114e9e6d 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/measure.py +++ b/nemo_text_processing/text_normalization/sv/taggers/measure.py @@ -81,9 +81,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_graph_unit2 = pynini.closure( - delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, - 0, - 1, + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py index 0877ca08f..25dfb6e9b 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py @@ -95,10 +95,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): hundreds |= pynini.cross("1", "ett hundra") hundreds |= digit + pynutil.insert(NEMO_SPACE) + pynutil.insert("hundra") - graph_hundreds = hundreds + pynini.union( - graph_tens, - (pynutil.delete("0") + graph_digit), - ) + graph_hundreds = hundreds + pynini.union(graph_tens, (pynutil.delete("0") + graph_digit),) if not deterministic: graph_hundreds |= hundreds + pynini.union( (graph_teens | pynutil.insert(NEMO_SPACE) + graph_teens), (pynini.cross("0", NEMO_SPACE) + graph_digit) @@ -182,7 +179,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = 
True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT**24 + @ NEMO_DIGIT ** 24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/sv/taggers/time.py b/nemo_text_processing/text_normalization/sv/taggers/time.py index cb5067058..676e78592 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/text_normalization/sv/taggers/time.py @@ -106,11 +106,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"") final_suffix_optional = pynini.closure(ensure_space + final_suffix, 0, 1) final_time_zone = pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") - final_time_zone_optional = pynini.closure( - NEMO_SPACE + final_time_zone, - 0, - 1, - ) + final_time_zone_optional = pynini.closure(NEMO_SPACE + final_time_zone, 0, 1,) # 2:30 pm, 02:30, 2:00 graph_hm_kl = ( diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py index 6656e3445..af17c6d48 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py @@ -40,11 +40,7 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure( - country_code + delete_space + insert_space, - 0, - 1, - ) + optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) number_part = ( pynutil.delete("number_part: \"") diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index a0c3b587d..21437e82f 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -35,7 +35,7 @@ def __init__(self, deterministic: bool = True): graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) - alls = NEMO_DIGIT**2 | NEMO_DIGIT**1 + alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 graph_all = ( (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit ) # graph_all when within a larger number e.g., 316 -> 三百一十六 instead of 三百十六 @@ -46,7 +46,7 @@ def __init__(self, deterministic: bool = True): ) # graph_all when at the head of the larger number e.g., 13万 -> 十三万 instead of 一十三万 graph_all_alt = alls @ graph_all_alt - hundreds = NEMO_DIGIT**3 + hundreds = NEMO_DIGIT ** 3 graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), @@ -56,7 +56,7 @@ def __init__(self, deterministic: bool = True): self.digit = graph_digit.optimize() self.all = graph_all.optimize() - thousands = NEMO_DIGIT**4 + thousands = NEMO_DIGIT ** 4 graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -64,7 +64,7 @@ def __init__(self, deterministic: bool =
True): ) graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT**5 + ten_thousands = NEMO_DIGIT ** 5 graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -73,8 +73,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - hundred_thousands = NEMO_DIGIT**6 - hundred_thousands_position = NEMO_DIGIT**2 + hundred_thousands = NEMO_DIGIT ** 6 + hundred_thousands_position = NEMO_DIGIT ** 2 hundred_thousands_position = hundred_thousands_position @ graph_all_alt graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT**7 - million_position = NEMO_DIGIT**3 + millions = NEMO_DIGIT ** 7 + million_position = NEMO_DIGIT ** 3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -95,8 +95,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT**8 - ten_million_position = NEMO_DIGIT**4 + ten_millions = NEMO_DIGIT ** 8 + ten_million_position = NEMO_DIGIT ** 4 ten_million_position = ten_million_position @ graph_thousand_component graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -106,7 +106,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = NEMO_DIGIT**9 + hundred_millions = NEMO_DIGIT ** 9 graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -119,8 +119,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT**10 - thousand_millions_position = NEMO_DIGIT**2 + thousand_millions = NEMO_DIGIT ** 10 + thousand_millions_position = NEMO_DIGIT ** 2 thousand_millions_position = thousand_millions_position @ graph_all_alt graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -134,8 +134,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT**11 - ten_billions_position = NEMO_DIGIT**3 + ten_billions = NEMO_DIGIT ** 11 + ten_billions_position = NEMO_DIGIT ** 3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -149,8 +149,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ graph_ten_billions_component - hundred_billions = NEMO_DIGIT**12 - hundred_billions_position = NEMO_DIGIT**4 + hundred_billions = NEMO_DIGIT ** 12 + hundred_billions_position = NEMO_DIGIT ** 4 hundred_billions_position = hundred_billions_position @ graph_thousand_component graph_hundred_billions_component = (hundred_billions_position + 
pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index b283f3444..5cd95e58c 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -32,9 +32,7 @@ class PreProcessorFst(GraphFst): ''' def __init__( - self, - remove_interjections: bool = True, - fullwidth_to_halfwidth: bool = True, + self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True, ): super().__init__(name="PreProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index dcdd73622..dab0cea0f 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -36,11 +36,7 @@ class PostProcessor(GraphFst): ''' def __init__( - self, - remove_puncts: bool = False, - to_upper: bool = False, - to_lower: bool = False, - tag_oov: bool = False, + self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index 846254938..4592d7841 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -42,11 +42,6 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor( - remove_puncts=False, - to_upper=False, - to_lower=False, - tag_oov=False, - ) + postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) self.fst = (verbalizer @ postprocessor.fst).optimize() diff --git a/setup.py b/setup.py index e22afbab3..4667b49e8 100644 --- a/setup.py +++ b/setup.py @@ -52,9 +52,7 @@ elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( - os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), - 'r', - encoding='utf-8', + os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', encoding='utf-8', ).read() long_description_content_type = "text/x-rst" @@ -127,8 +125,7 @@ def __call_checker(self, base_command, scope, check): command.extend(['--check', '--diff']) self.announce( - msg='Running command: %s' % str(' '.join(command)), - level=distutils_log.INFO, + msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, ) return_code = subprocess.call(command) @@ -136,18 +133,10 @@ def __call_checker(self, base_command, scope, check): return return_code def _isort(self, scope, check): - return self.__call_checker( - base_command=self.__ISORT_BASE.split(), - scope=scope, - check=check, - ) + return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) def _black(self, scope, check): - return self.__call_checker( - base_command=self.__BLACK_BASE.split(), - scope=scope, - check=check, - ) + return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, 
check=check,) def _pass(self): self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) diff --git a/tests/conftest.py b/tests/conftest.py index a26dab531..8db3b106c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,9 +56,7 @@ def pytest_addoption(parser): help="path to a directory with .far grammars for CPU TN/ITN tests, (DEFAULT: None, i.e. no cache)", ) parser.addoption( - '--run_audio_based', - action='store_true', - help="pass this argument to run audio-based TN tests", + '--run_audio_based', action='store_true', help="pass this argument to run audio-based TN tests", ) @@ -150,12 +148,10 @@ def pytest_configure(config): If file absent or sizes not equal, function downloads the archive from github and unpacks it. """ config.addinivalue_line( - "markers", - "run_only_on(device): runs the test only on a given device [CPU | GPU]", + "markers", "run_only_on(device): runs the test only on a given device [CPU | GPU]", ) config.addinivalue_line( - "markers", - "with_downloads: runs the test using data present in tests/.data", + "markers", "with_downloads: runs the test using data present in tests/.data", ) # Test dir and archive filepath. test_dir = join(dirname(__file__), __TEST_DATA_SUBDIR) diff --git a/tests/nemo_text_processing/ar/test_money.py b/tests/nemo_text_processing/ar/test_money.py index 2aa49ba9a..6fe36ba35 100644 --- a/tests/nemo_text_processing/ar/test_money.py +++ b/tests/nemo_text_processing/ar/test_money.py @@ -49,8 +49,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_address.py b/tests/nemo_text_processing/en/test_address.py index ea8328d10..c7a3523a0 100644 --- a/tests/nemo_text_processing/en/test_address.py +++ b/tests/nemo_text_processing/en/test_address.py @@ -42,8 +42,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_cardinal.py b/tests/nemo_text_processing/en/test_cardinal.py index f40e0d1f6..1ee3a2a5b 100644 --- a/tests/nemo_text_processing/en/test_cardinal.py +++ b/tests/nemo_text_processing/en/test_cardinal.py @@ -63,8 +63,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/en/test_decimal.py b/tests/nemo_text_processing/en/test_decimal.py index ea20f18d6..ff021f72a 100644 --- a/tests/nemo_text_processing/en/test_decimal.py +++ b/tests/nemo_text_processing/en/test_decimal.py @@ -61,8 +61,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_electronic.py 
b/tests/nemo_text_processing/en/test_electronic.py index 4dfec585e..e8640062c 100644 --- a/tests/nemo_text_processing/en/test_electronic.py +++ b/tests/nemo_text_processing/en/test_electronic.py @@ -60,8 +60,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=100, - punct_post_process=False, + test_input, n_tagged=100, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_fraction.py b/tests/nemo_text_processing/en/test_fraction.py index a6186aabb..764205591 100644 --- a/tests/nemo_text_processing/en/test_fraction.py +++ b/tests/nemo_text_processing/en/test_fraction.py @@ -39,8 +39,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_math.py b/tests/nemo_text_processing/en/test_math.py index 22859f596..e2ecdebb8 100644 --- a/tests/nemo_text_processing/en/test_math.py +++ b/tests/nemo_text_processing/en/test_math.py @@ -39,8 +39,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_measure.py b/tests/nemo_text_processing/en/test_measure.py index 6ea9a0eda..b03b3ff53 100644 --- a/tests/nemo_text_processing/en/test_measure.py +++ b/tests/nemo_text_processing/en/test_measure.py @@ -61,8 +61,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_money.py b/tests/nemo_text_processing/en/test_money.py index 103223d5e..c81945ecd 100644 --- a/tests/nemo_text_processing/en/test_money.py +++ b/tests/nemo_text_processing/en/test_money.py @@ -63,8 +63,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_ordinal.py b/tests/nemo_text_processing/en/test_ordinal.py index dac56bf38..6f87a832d 100644 --- a/tests/nemo_text_processing/en/test_ordinal.py +++ b/tests/nemo_text_processing/en/test_ordinal.py @@ -61,8 +61,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_punctuation.py b/tests/nemo_text_processing/en/test_punctuation.py index 761b3c9f4..75ff2e73c 100644 --- a/tests/nemo_text_processing/en/test_punctuation.py +++ 
b/tests/nemo_text_processing/en/test_punctuation.py @@ -22,11 +22,7 @@ class TestPunctuation: normalizer_en = Normalizer( - input_case='cased', - lang='en', - cache_dir=CACHE_DIR, - overwrite_cache=False, - post_process=True, + input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True, ) # address is tagged by the measure class diff --git a/tests/nemo_text_processing/en/test_range.py b/tests/nemo_text_processing/en/test_range.py index 64b47d898..ac93613be 100644 --- a/tests/nemo_text_processing/en/test_range.py +++ b/tests/nemo_text_processing/en/test_range.py @@ -39,8 +39,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_roman.py b/tests/nemo_text_processing/en/test_roman.py index 3ef655c65..dc9468fb3 100644 --- a/tests/nemo_text_processing/en/test_roman.py +++ b/tests/nemo_text_processing/en/test_roman.py @@ -40,8 +40,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=False, + test_input, n_tagged=30, punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_serial.py b/tests/nemo_text_processing/en/test_serial.py index 2a27b1f54..aab870abf 100644 --- a/tests/nemo_text_processing/en/test_serial.py +++ b/tests/nemo_text_processing/en/test_serial.py @@ -38,8 +38,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=-1, - punct_post_process=False, + test_input, n_tagged=-1, punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/en/test_special_text.py b/tests/nemo_text_processing/en/test_special_text.py index 73be5d382..a461fe703 100644 --- a/tests/nemo_text_processing/en/test_special_text.py +++ b/tests/nemo_text_processing/en/test_special_text.py @@ -41,8 +41,6 @@ def test_norm(self, test_input, expected): # Audio-based normalization will output only options without digits if self.normalizer_with_audio_en and sum([1 for ch in expected if ch.isdigit()]) == 0: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, - n_tagged=30, - punct_post_process=True, + test_input, n_tagged=30, punct_post_process=True, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/es/test_ordinal.py b/tests/nemo_text_processing/es/test_ordinal.py index 1a48d6da8..e2cd7d4a2 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -62,8 +62,6 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, - n_tagged=500, - punct_post_process=False, + test_input, n_tagged=500, punct_post_process=False, ) assert expected in pred_non_deterministic
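A note on the hunks above: they are black formatting changes only. `NEMO_DIGIT**3` and `NEMO_DIGIT ** 3` parse to the same Python expression, and collapsing a multi-line call such as `pynini.closure(expr, 0, 1,)` onto one line leaves the compiled grammar untouched. Below is a minimal sketch of why, not part of the patch, assuming pynini is installed; `byte.DIGIT` stands in here for `NEMO_DIGIT`, which wraps the same digit acceptor:

    import pynini
    from pynini.lib import byte

    digit = byte.DIGIT  # stand-in for NEMO_DIGIT

    # The two spellings the diff toggles between are one and the same
    # expression, so they compile to equivalent FSTs.
    three_a = (digit**3).optimize()
    three_b = (digit ** 3).optimize()
    assert pynini.equivalent(three_a, three_b)

    # digit**3 accepts exactly three digits: composing an accepted string
    # yields a non-empty FST, composing a rejected one yields an empty FST.
    assert (pynini.accep("123") @ three_a).num_states() > 0
    assert (pynini.accep("12") @ three_a).num_states() == 0

    # The collapsed calls are equally unchanged in meaning:
    # pynini.closure(x, 0, 1) means "x is optional" before and after.
    optional_comma = pynini.closure(pynini.accep(","), 0, 1)
    assert (pynini.accep("") @ optional_comma).num_states() > 0

Since the reformatted grammars compile to identical machines, the existing test suites exercised in the hunks above (cardinal, money, measure, range, and so on) are expected to pass unchanged.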