From 81d0e8362ef04242973ccc4ee19017eaf384de98 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Mon, 2 Sep 2024 06:28:51 +0200 Subject: [PATCH] Cardinals up to a hundred trillions, timeFST and transliteration (moving constants to data files). Signed-off-by: kurt0cougar --- .../text_normalization/normalize.py | 3 + .../text_normalization/rw/__init__.py | 1 + .../text_normalization/rw/data/__init__.py | 1 + .../rw/data/cardinal/__init__.py | 15 + .../rw/data/cardinal/digits.tsv | 9 + .../rw/data/cardinal/digits_for_thousands.tsv | 10 + .../cardinal/digits_millions_trillions.tsv | 10 + .../rw/data/cardinal/hundreds.tsv | 9 + .../rw/data/cardinal/hundreds_of_millions.tsv | 9 + .../data/cardinal/hundreds_of_thousands.tsv | 9 + .../data/cardinal/hundreds_of_trillions.tsv | 9 + .../rw/data/cardinal/millions.tsv | 9 + .../rw/data/cardinal/tens.tsv | 9 + .../rw/data/cardinal/tens_of_millions.tsv | 9 + .../rw/data/cardinal/tens_of_thousands.tsv | 9 + .../rw/data/cardinal/tens_of_trillions.tsv | 9 + .../rw/data/cardinal/thousands.tsv | 10 + .../rw/data/cardinal/trillions.tsv | 9 + .../rw/data/time/__init__.py | 15 + .../text_normalization/rw/data/time/hours.tsv | 12 + .../rw/data/time/minutes.tsv | 60 ++++ .../rw/data/whitelist/__init__.py | 1 + .../text_normalization/rw/graph_utils.py | 311 ++++++++++++++++++ .../text_normalization/rw/taggers/__init__.py | 1 + .../text_normalization/rw/taggers/cardinal.py | 196 ++--------- .../text_normalization/rw/taggers/time.py | 82 +---- .../rw/taggers/tokenize_and_classify.py | 5 +- .../rw/taggers/whitelist.py | 3 +- .../rw/verbalizers/__init__.py | 1 + .../text_normalization/rw/verbalizers/time.py | 7 +- .../rw/verbalizers/verbalize.py | 3 +- .../rw/verbalizers/verbalize_final.py | 30 +- .../pynini_export.py | 7 +- 33 files changed, 611 insertions(+), 272 deletions(-) create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py create mode 100644 
nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/time/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/time/hours.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/time/minutes.tsv create mode 100644 nemo_text_processing/text_normalization/rw/graph_utils.py diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 14093dadf..c6d19f82f 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -165,6 +165,9 @@ def __init__( elif lang == 'hy': from 
nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.hy.verbalizers.verbalize_final import VerbalizeFinalFst + elif lang == 'rw': + from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.rw.verbalizers.verbalize_final import VerbalizeFinalFst else: raise NotImplementedError(f"Language {lang} has not been supported yet.") diff --git a/nemo_text_processing/text_normalization/rw/__init__.py b/nemo_text_processing/text_normalization/rw/__init__.py index b136ce06b..c921ca1b8 100644 --- a/nemo_text_processing/text_normalization/rw/__init__.py +++ b/nemo_text_processing/text_normalization/rw/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/rw/data/__init__.py b/nemo_text_processing/text_normalization/rw/data/__init__.py index 9fb50331b..9c4313114 100644 --- a/nemo_text_processing/text_normalization/rw/data/__init__.py +++ b/nemo_text_processing/text_normalization/rw/data/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py b/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv new file mode 100644 index 000000000..bf85b743b --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv @@ -0,0 +1,9 @@ +rimwe 1 +kabiri 2 +gatatu 3 +kane 4 +gatanu 5 +gatandatu 6 +karindwi 7 +umunani 8 +icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv new file mode 100644 index 000000000..ee31aadee --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv @@ -0,0 +1,10 @@ + 0 +kimwe 1 +bibiri 2 +bitatu 3 +bine 4 +bitanu 5 +bitandatu 6 +birindwi 7 +umunani 8 +icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv new file mode 100644 index 000000000..126ad90a3 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv @@ -0,0 +1,10 @@ + 0 +imwe 1 +ebyiri 2 +eshatu 3 +enye 4 +eshanu 5 +esheshatu 6 +zirindwi 7 +umunani 8 +icyenda 9 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv new file mode 100644 index 000000000..a46623cc1 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv @@ -0,0 +1,9 @@ +ijana 1 +magana_abiri 2 +magana_atatu 3 +magana_ane 4 +magana_atanu 5 +magana_atandatu 6 +magana_arindwi 7 +magana_inani 8 +magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv new file mode 100644 index 
000000000..6e38c3ceb --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv @@ -0,0 +1,9 @@ +miliyoni_ijana 1 +miliyoni_magana_abiri 2 +miliyoni_magana_atatu 3 +miliyoni_magana_ane 4 +miliyoni_magana_atanu 5 +miliyoni_magana_atandatu 6 +miliyoni_magana_arindwi 7 +miliyoni_magana_inani 8 +miliyoni_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv new file mode 100644 index 000000000..a73477c14 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv @@ -0,0 +1,9 @@ +ibihumbi_ijana 1 +ibihumbi_magana_abiri 2 +ibihumbi_magana_atatu 3 +ibihumbi_magana_ane 4 +ibihumbi_magana_atanu 5 +ibihumbi_magana_atandatu 6 +ibihumbi_magana_arindwi 7 +ibihumbi_magana_inani 8 +ibihumbi_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv new file mode 100644 index 000000000..00fc01aa4 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni_ijana 1 +tiriyoni_magana_abiri 2 +tiriyoni_magana_atatu 3 +tiriyoni_magana_ane 4 +tiriyoni_magana_atanu 5 +tiriyoni_magana_atandatu 6 +tiriyoni_magana_arindwi 7 +tiriyoni_magana_inani 8 +tiriyoni_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv new file mode 100644 index 000000000..fded5ed55 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv @@ -0,0 +1,9 @@ +miliyoni 1 +miliyoni_ebyiri 2 +miliyoni_eshatu 3 +miliyoni_enye 4 +miliyoni_eshanu 5 +miliyoni_esheshatu 6 +miliyoni_zirindwi 7 +miliyoni_umunani 8 +miliyoni_icyenda 9 diff --git 
a/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv new file mode 100644 index 000000000..6e63c3875 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv @@ -0,0 +1,9 @@ + 0 +makumyabiri 2 +mirongo_itatu 3 +mirongo_ine 4 +mirongo_itanu 5 +mirongo_itandatu 6 +mirongo_irindwi 7 +mirongo_inani 8 +mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv new file mode 100644 index 000000000..36f077d00 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv @@ -0,0 +1,9 @@ +miliyoni_cumi 1 +miliyoni_makumyabiri 2 +miliyoni_mirongo_itatu 3 +miliyoni_mirongo_ine 4 +miliyoni_mirongo_itanu 5 +miliyoni_mirongo_itandatu 6 +miliyoni_mirongo_irindwi 7 +miliyoni_mirongo_inani 8 +miliyoni_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv new file mode 100644 index 000000000..f230751bf --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv @@ -0,0 +1,9 @@ +ibihumbi_cumi 1 +ibihumbi_makumyabiri 2 +ibihumbi_mirongo_itatu 3 +ibihumbi_mirongo_ine 4 +ibihumbi_mirongo_itanu 5 +ibihumbi_mirongo_itandatu 6 +ibihumbi_mirongo_irindwi 7 +ibihumbi_mirongo_inani 8 +ibihumbi_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv new file mode 100644 index 000000000..3cf483594 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni_icumi 1 +tiriyoni_makumyabiri 2 +tiriyoni_mirongo_itatu 3 +tiriyoni_mirongo_ine 4 
+tiriyoni_mirongo_itanu 5 +tiriyoni_mirongo_itandatu 6 +tiriyoni_mirongo_irindwi 7 +tiriyoni_mirongo_inani 8 +tiriyoni_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv new file mode 100644 index 000000000..39d262443 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv @@ -0,0 +1,10 @@ +igihumbi 1 +ibihumbi_bibiri 2 +ibihumbi_bitatu 3 +ibihumbi_bine 4 +ibihumbi_bitanu 5 +ibihumbi_bitandatu 6 +ibihumbi_birindwi 7 +ibihumbi_umunani 8 +ibihumbi_icyenda 9 + diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv new file mode 100644 index 000000000..8098158df --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni 1 +tiriyoni_ebyiri 2 +tiriyoni_eshatu 3 +tiriyoni_enye 4 +tiriyoni_eshanu 5 +tiriyoni_esheshatu 6 +tiriyoni_zirindwi 7 +tiriyoni_umunani 8 +tiriyoni_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/time/__init__.py b/nemo_text_processing/text_normalization/rw/data/time/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/data/time/hours.tsv b/nemo_text_processing/text_normalization/rw/data/time/hours.tsv new file mode 100644 index 000000000..fae6f0898 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/hours.tsv @@ -0,0 +1,12 @@ +1 saa saba +2 saa munani +3 saa cyenda +4 saa cumi +5 saa cumi n'imwe +6 saa cumi n'ebyiri +7 saa moya +8 saa mbiri +9 saa tatu +10 saa ine +11 saa tanu +12 saa sita \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv b/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv new file mode 100644 index 000000000..c30327106 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv @@ -0,0 +1,60 @@ +00 +01 n'umunota umwe +02 n'iminota ibiri +03 n'iminota itatu +04 n'iminota ine +05 n'iminota itanu +06 n'iminota itandatu +07 n'iminota irindwi +08 n'iminota umunani +09 n'iminota icyenda +10 n'iminota icumi +11 n'iminota cumi n'umwe +12 n'iminota cumi n'ibiri +13 n'iminota cumi n'itatu +14 n'iminota cumi n'ine +15 n'iminota cumi n'itanu +16 n'iminota cumi n'itandatu +17 n'iminota cumi n'irindwi +18 n'iminota cumi n'umunani +19 n'iminota cumi n'icyenda +20 n'iminota makumyabiri +21 n'iminota makumyabiri na rimwe +22 n'iminota makumyabiri n'ibiri +23 n'iminota makumyabiri n'itatu +24 n'iminota makumyabiri n'ine +25 n'iminota makumyabiri n'itanu +26 n'iminota makumyabiri n'itandatu +27 n'iminota makumyabiri n'irindwi +28 n'iminota makumyabiri n'umunani +29 n'iminota makumyabiri n'icyenda +30 
n'iminota mirongo itatu +31 n'iminota mirongo itatu n'umwe +32 n'iminota mirongo itatu n'ibiri +33 n'iminota mirongo itatu n'itatu +34 n'iminota mirongo itatu n'ine +35 n'iminota mirongo itatu n'itanu +36 n'iminota mirongo itatu n'itandatu +37 n'iminota mirongo itatu n'irindwi +38 n'iminota mirongo itatu n'umunani +39 n'iminota mirongo itatu n'icyenda +40 n'iminota mirongo ine +41 n'iminota mirongo ine n'umwe +42 n'iminota mirongo ine n'ibiri +43 n'iminota mirongo ine n'itatu +44 n'iminota mirongo ine n'ine +45 n'iminota mirongo ine n'itanu +46 n'iminota mirongo ine n'itandatu +47 n'iminota mirongo ine n'irindwi +48 n'iminota mirongo ine n'umunani +49 n'iminota mirongo ine n'icyenda +50 n'iminota mirongo itanu +51 n'iminota mirongo itanu n'umwe +52 n'iminota mirongo itanu n'ibiri +53 n'iminota mirongo itanu n'itatu +54 n'iminota mirongo itanu n'ine +55 n'iminota mirongo itanu n'itanu +56 n'iminota mirongo itanu n'itandatu +57 n'iminota mirongo itanu n'irindwi +58 n'iminota mirongo itanu n'umunani +59 n'iminota mirongo itanu n'icyenda \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py index 9fb50331b..9c4313114 100644 --- a/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py +++ b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py new file mode 100644 index 000000000..3744580d5 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -0,0 +1,311 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels +from nemo_text_processing.utils.logging import logger + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_VOWELS = pynini.union(*"aeiouAEIOU").optimize() +NEMO_CONSONANTS = pynini.union(*"BCDFGHJKLMNPQRSTVWXYZbcdfghjklmnpqrstvwxyz").optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() 
+NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) +NEMO_LOWER_NOT_A = pynini.union( + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", +).optimize() + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_space_or_punct = NEMO_PUNCT | delete_space +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"')) +) + + +# Common string literals; expand as you see fit. +username_string = "username" +double_quotes = '"' +domain_string = "domain" +protocol_string = "protocol" +slash = "/" +double_slash = "//" +triple_slash = "///" +file = "file" +period = "." 
+at = "@" +colon = ":" +https = "https" +http = "http" +www = "www" + + +suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) +# _v = pynini.union("a", "e", "i", "o", "u") +_c = pynini.union( + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", +) +_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") +_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") +_s = NEMO_SIGMA + pynutil.insert("s") + +graph_plural = plurals._priority_union( + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, +).optimize() + +SINGULAR_TO_PLURAL = graph_plural +PLURAL_TO_SINGULAR = pynini.invert(graph_plural) +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("minus", "Minus").optimize() + + +def capitalized_input_graph( + graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, +) -> "pynini.FstLike": + """ + Allow graph input to be capitalized, e.g. for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
+ + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logger.info(f"Created {file_name}") + + +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + +def convert_space(fst) -> "pynini.FstLike": + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logger.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ 
pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/rw/taggers/__init__.py b/nemo_text_processing/text_normalization/rw/taggers/__init__.py index 90380542f..96d45783e 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/rw/taggers/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py index 68abc5fbd..c80097a8e 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,189 +16,40 @@ import pynini from pynini.lib import pynutil -import string -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst,NEMO_CHAR,insert_space +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst,NEMO_CHAR,insert_space,NEMO_DIGIT,NEMO_ALPHA,NEMO_CONSONANTS,NEMO_VOWELS,delete_extra_space,delete_space from nemo_text_processing.text_normalization.rw.utils import get_abs_path -def apply_fst(text, fst): - try: - print(pynini.shortestpath(text @ fst).string()) - print(len(pynini.shortestpath(text @ fst).string())) - - except pynini.FstOpError: - print(f"Error: no valid output with given'input: '{text}'") class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - alphabet = string.ascii_letters - rewrite_na_fst = pynini.cdrewrite(pynini.cross(" "," na "),pynini.union(*"aeiouAEIOU "),pynini.union(*"BCDFGHJKLMNPQRSTVWXYZbcdfghjklmnpqrstvwxyz"),NEMO_CHAR.closure()) - rewrite_n_fst = pynini.cdrewrite(pynini.cross(" "," n'"),pynini.union(*"aeiouAEIOU "),pynini.union(*"aeiouAEIOU"),NEMO_CHAR.closure()) - remove_underscore_fst = pynini.cdrewrite(pynini.cross("_"," "),pynini.union(*alphabet),pynini.union(*alphabet),NEMO_CHAR.closure()) - remove_extra_space_fst = pynini.cdrewrite(pynini.cross(" "," "),pynini.union(*alphabet),pynini.union(*alphabet),NEMO_CHAR.closure()) - remove_trailing_space_fst = pynini.cdrewrite(pynini.cross(pynini.accep(' ').closure(),''),pynini.union(*alphabet).closure(),'[EOS]',NEMO_CHAR.closure()) + vowels_or_space = NEMO_VOWELS | " " + rewrite_na_fst = pynini.cdrewrite(pynini.cross(" "," na "),vowels_or_space,NEMO_CONSONANTS,NEMO_CHAR.closure()) + rewrite_n_fst = pynini.cdrewrite(pynini.cross(" "," n'"),vowels_or_space,NEMO_VOWELS,NEMO_CHAR.closure()) + remove_underscore_fst = pynini.cdrewrite(pynini.cross("_"," "),pynini.union(NEMO_ALPHA),pynini.union(NEMO_ALPHA),NEMO_CHAR.closure()) + remove_extra_space_fst = 
pynini.cdrewrite(delete_extra_space,pynini.union(NEMO_ALPHA),pynini.union(NEMO_ALPHA),NEMO_CHAR.closure()) + remove_trailing_space_fst = pynini.cdrewrite(delete_space,pynini.union(NEMO_ALPHA).closure(),'[EOS]',NEMO_CHAR.closure()) rewrite_add_separator_fst = pynini.compose(rewrite_na_fst,rewrite_n_fst) ten_thousand = pynini.string_map([("ibihumbi_icumi","10")]) ten = pynini.string_map([("icumi","10")]) - digits = pynini.string_map([ - ("rimwe","1"), - ("kabiri","2"), - ("gatatu","3"), - ("kane","4"), - ("gatanu","5"), - ("gatandatu","6"), - ("karindwi","7"), - ("umunani","8"), - ("icyenda","9"), - ]) - digits_for_thousands = pynini.string_map([ - ("","0"), - ("kimwe","1"), - ("bibiri","2"), - ("bitatu","3"), - ("bine","4"), - ("bitanu","5"), - ("bitandatu","6"), - ("birindwi","7"), - ("umunani","8"), - ("icyenda","9") - ]) - digits_millions_trillions= pynini.string_map([ - ("","0"), - ("imwe","1"), - ("ebyiri","2"), - ("eshatu","3"), - ("enye","4"), - ("eshanu","5"), - ("esheshatu","6"), - ("zirindwi","7"), - ("umunani","8"), - ("icyenda","9") - ]) - tens = pynini.string_map([ - (" ","0"), - ("makumyabiri","2"), - ("mirongo_itatu","3"), - ("mirongo_ine","4"), - ("mirongo_itanu","5"), - ("mirongo_itandatu","6"), - ("mirongo_irindwi","7"), - ("mirongo_inani","8"), - ("mirongo_icyenda","9") - ]) + digits = pynini.string_file(get_abs_path("data/cardinal/digits.tsv")) + digits_for_thousands = pynini.string_file(get_abs_path("data/cardinal/digits_for_thousands.tsv")) + digits_millions_trillions= pynini.string_file(get_abs_path("data/cardinal/digits_millions_trillions.tsv")) + tens = pynini.string_file(get_abs_path("data/cardinal/tens.tsv")) tens_for_ends = pynini.string_map([("icumi","1")])|tens tens_for_beginnings= pynini.string_map([("cumi","1")])|tens - hundreds = pynini.string_map([ - ("ijana","1"), - ("magana_abiri","2"), - ("magana_atatu","3"), - ("magana_ane","4"), - ("magana_atanu","5"), - ("magana_atandatu","6"), - ("magana_arindwi","7"), - ("magana_inani","8"), 
- ("magana_cyenda","9") - ]) - thousands = pynini.string_map([ - ("igihumbi","1"), - ("ibihumbi_bibiri","2"), - ("ibihumbi_bitatu","3"), - ("ibihumbi_bine","4"), - ("ibihumbi_bitanu","5"), - ("ibihumbi_bitandatu","6"), - ("ibihumbi_birindwi","7"), - ("ibihumbi_umunani","8"), - ("ibihumbi_icyenda","9") - ]) - tens_of_thousands = pynini.string_map([ - ("ibihumbi_cumi","1"), - ("ibihumbi_makumyabiri","2"), - ("ibihumbi_mirongo_itatu","3"), - ("ibihumbi_mirongo_ine","4"), - ("ibihumbi_mirongo_itanu","5"), - ("ibihumbi_mirongo_itandatatu","6"), - ("ibihumbi_mirongo_irindwi","7"), - ("ibihumbi_mirongo_inani","8"), - ("ibihumbi_mirongo_icyenda","9") - ]) - hundreds_of_thousands = pynini.string_map([ - ("ibihumbi_ijana","1"), - ("ibihumbi_magana_abiri","2"), - ("ibihumbi_magana_atatu","3"), - ("ibihumbi_magana_ane","4"), - ("ibihumbi_magana_atanu","5"), - ("ibihumbi_magana_atandatu","6"), - ("ibihumbi_magana_arindwi","7"), - ("ibihumbi_magana_inani","8"), - ("ibihumbi_magana_cyenda","9") - ]) - millions = pynini.string_map([ - ("miliyoni","1"), - ("miliyoni_ebyiri","2"), - ("miliyoni_eshatu","3"), - ("miliyoni_enye","4"), - ("miliyoni_eshanu","5"), - ("miliyoni_esheshatu","6"), - ("miliyoni_zirindwi","7"), - ("miliyoni_umunani","8"), - ("miliyoni_icyenda","9") - ]) - tens_of_millions = pynini.string_map([ - ("miliyoni_cumi","1"), - ("miliyoni_makumyabiri","2"), - ("miliyoni_mirongo_itatu","3"), - ("miliyoni_mirongo_ine","4"), - ("miliyoni_mirongo_itanu","5"), - ("miliyoni_mirongo_itandatatu","6"), - ("miliyoni_mirongo_irindwi","7"), - ("miliyoni_mirongo_inani","8"), - ("miliyoni_mirongo_icyenda","9") - ]) - hundreds_of_millions = pynini.string_map([ - ("miliyoni_ijana","1"), - ("miliyoni_magana_abiri","2"), - ("miliyoni_magana_atatu","3"), - ("miliyoni_magana_ane","4"), - ("miliyoni_magana_atanu","5"), - ("miliyoni_magana_atandatu","6"), - ("miliyoni_magana_arindwi","7"), - ("miliyoni_magana_inani","8"), - ("miliyoni_magana_cyenda","9") - ]) - trillions = 
pynini.string_map([ - ("tiriyoni","1"), - ("tiriyoni_ebyiri","2"), - ("tiriyoni_eshatu","3"), - ("tiriyoni_enye","4"), - ("tiriyoni_eshanu","5"), - ("tiriyoni_esheshatu","6"), - ("tiriyoni_zirindwi","7"), - ("tiriyoni_umunani","8"), - ("tiriyoni_icyenda","9") - ]) - tens_of_trillions = pynini.string_map([ - ("tiriyoni_icumi","1"), - ("tiriyoni_makumyabiri","2"), - ("tiriyoni_mirongo_itatu","3"), - ("tiriyoni_mirongo_ine","4"), - ("tiriyoni_mirongo_itanu","5"), - ("tiriyoni_mirongo_itandatatu","6"), - ("tiriyoni_mirongo_irindwi","7"), - ("tiriyoni_mirongo_inani","8"), - ("tiriyoni_mirongo_icyenda","9") - ]) - hundreds_of_trillions = pynini.string_map([ - ("tiriyoni_ijana","1"), - ("tiriyoni_magana_abiri","2"), - ("tiriyoni_magana_atatu","3"), - ("tiriyoni_magana_ane","4"), - ("tiriyoni_magana_atanu","5"), - ("tiriyoni_magana_atandatu","6"), - ("tiriyoni_magana_arindwi","7"), - ("tiriyoni_magana_inani","8"), - ("tiriyoni_magana_cyenda","9") - ]) + hundreds = pynini.string_file(get_abs_path("data/cardinal/hundreds.tsv")) + thousands = pynini.string_file(get_abs_path("data/cardinal/thousands.tsv")) + tens_of_thousands = pynini.string_file(get_abs_path("data/cardinal/tens_of_thousands.tsv")) + hundreds_of_thousands = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_thousands.tsv")) + millions = pynini.string_file(get_abs_path("data/cardinal/millions.tsv")) + tens_of_millions = pynini.string_file(get_abs_path("data/cardinal/tens_of_millions.tsv")) + hundreds_of_millions = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_millions.tsv")) + trillions = pynini.string_file(get_abs_path("data/cardinal/trillions.tsv")) + tens_of_trillions = pynini.string_file(get_abs_path("data/cardinal/tens_of_trillions.tsv")) + hundreds_of_trillions = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_trillions.tsv")) + THREE_ZEROS = "000" FOUR_ZEROS = "0000" FIVE_ZEROS = "00000" @@ -208,7 +60,7 @@ def __init__(self): NINE_ZEROS = "000000000" zero = 
pynini.string_map([("zeru","0")]) - rewrite_remove_comma_fst = pynini.cdrewrite(pynini.cross(",",""),pynini.union(*"0123456789"),pynini.union(*"0123456789"),NEMO_CHAR.closure()) + rewrite_remove_comma_fst = pynini.cdrewrite(pynini.cross(",",""),pynini.union(NEMO_DIGIT),pynini.union(NEMO_DIGIT),NEMO_CHAR.closure()) single_digits_graph = pynini.invert(digits | zero) single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph) remove_comma = rewrite_remove_comma_fst@single_digits_graph diff --git a/nemo_text_processing/text_normalization/rw/taggers/time.py b/nemo_text_processing/text_normalization/rw/taggers/time.py index 6b2a0d531..a07ae059e 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/time.py +++ b/nemo_text_processing/text_normalization/rw/taggers/time.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,92 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst import pynini from pynini.lib import pynutil +from nemo_text_processing.text_normalization.rw.utils import get_abs_path class TimeFst(GraphFst): def __init__(self): super().__init__(name="time", kind="classify") - hours = pynini.string_map([ - ('1', 'saa saba'), - ('2', 'saa munani'), - ('3', 'saa cyenda'), - ('4', 'saa cumi'), - ('5', "saa cumi n'imwe"), - ('6', "saa cumi n'ebyiri"), - ('7', 'saa moya'), - ('8', 'saa mbiri'), - ('9', 'saa tatu'), - ('10', 'saa ine'), - ('11', 'saa tanu'), - ('12', 'saa sita'), - ]) + hours = pynini.string_file(get_abs_path("data/time/hours.tsv")) - minutes = pynini.string_map([ - ('00', ' '), - ('01', " n'umunota umwe") , - ('02', " n'iminota ibiri") , - ('03', " n'iminota itatu") , - ('04', " n'iminota ine") , - ('05', " n'iminota itanu") , - ('06', " n'iminota itandatu") , - ('07', " n'iminota irindwi") , - ('08', " n'iminota umunani") , - ('09', " n'iminota icyenda") , - ('10', " n'iminota icumi") , - ('11', " n'iminota cumi n'umwe") , - ('12', " n'iminota cumi n'ibiri") , - ('13', " n'iminota cumi n'itatu") , - ('14', " n'iminota cumi n'ine") , - ('15', " n'iminota cumi n'itanu") , - ('16', " n'iminota cumi n'itandatu") , - ('17', " n'iminota cumi n'irindwi") , - ('18', " n'iminota cumi n'umunani") , - ('19', " n'iminota cumi n'icyenda") , - ('20', " n'iminota makumyabiri") , - ('21', " n'iminota makumyabiri na rimwe") , - ('22', " n'iminota makumyabiri n'ibiri") , - ('23', " n'iminota makumyabiri n'itatu") , - ('24', " n'iminota makumyabiri n'ine") , - ('25', " n'iminota makumyabiri n'itanu") , - ('26', " n'iminota makumyabiri n'itandatu") , - ('27', " n'iminota makumyabiri n'irindwi") , - ('28', " n'iminota makumyabiri n'umunani") , - ('29', " n'iminota makumyabiri n'icyenda") , - ('30', " n'iminota mirongo itatu") , - ('31', " n'iminota mirongo itatu n'umwe") , - ('32', " 
n'iminota mirongo itatu n'ibiri") , - ('33', " n'iminota mirongo itatu n'itatu") , - ('34', " n'iminota mirongo itatu n'ine") , - ('35', " n'iminota mirongo itatu n'itanu") , - ('36', " n'iminota mirongo itatu n'itandatu") , - ('37', " n'iminota mirongo itatu n'irindwi") , - ('38', " n'iminota mirongo itatu n'umunani") , - ('39', " n'iminota mirongo itatu n'icyenda") , - ('40', " n'iminota mirongo ine") , - ('41', " n'iminota mirongo ine n'umwe") , - ('42', " n'iminota mirongo ine n'ibiri") , - ('43', " n'iminota mirongo ine n'itatu") , - ('44', " n'iminota mirongo ine n'ine") , - ('45', " n'iminota mirongo ine n'itanu") , - ('46', " n'iminota mirongo ine n'itandatu") , - ('47', " n'iminota mirongo ine n'irindwi") , - ('48', " n'iminota mirongo ine n'umunani") , - ('49', " n'iminota mirongo ine n'icyenda") , - ('50', " n'iminota mirongo itanu") , - ('51', " n'iminota mirongo itanu n'umwe") , - ('52', " n'iminota mirongo itanu n'ibiri") , - ('53', " n'iminota mirongo itanu n'itatu") , - ('54', " n'iminota mirongo itanu n'ine") , - ('55', " n'iminota mirongo itanu n'itanu") , - ('56', " n'iminota mirongo itanu n'itandatu") , - ('57', " n'iminota mirongo itanu n'irindwi") , - ('58', " n'iminota mirongo itanu n'umunani") , - ('59', " n'iminota mirongo itanu n'icyenda") , - ]) + minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv")) final_graph = pynutil.insert("hours:\"")+hours+pynutil.insert("\"")+pynutil.delete(":")+pynutil.insert(" minutes:\"")+minutes+pynutil.insert("\"") final_graph = self.add_tokens(final_graph) diff --git a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py index 3a034af13..e17841e10 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst from nemo_text_processing.text_normalization.rw.taggers.time import TimeFst from nemo_text_processing.text_normalization.rw.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.rw.taggers.cardinal import CardinalFst @@ -23,7 +24,7 @@ import pynini from pynini.lib import pynutil import os -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.rw.graph_utils import ( GraphFst, delete_extra_space, delete_space, diff --git a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py index 0355d9741..288a1edda 100644 --- a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst import pynini from pynini.lib import pynutil from nemo_text_processing.text_normalization.rw.utils import get_abs_path diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py index 26cff59aa..2931cfd9b 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/time.py b/nemo_text_processing/text_normalization/rw/verbalizers/time.py index 90d1c17e4..99bcd7808 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/time.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +15,7 @@ # limitations under the License. 
import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.rw.graph_utils import ( GraphFst, delete_space, NEMO_CHAR @@ -23,8 +24,8 @@ class VerbalizeTimeFst(GraphFst): def __init__(self): super().__init__(name="time",kind="verbalize") - hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")+delete_space \ - +pynutil.delete("minutes:")+delete_space+pynutil.delete("\"") + pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")) + hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR)+pynutil.delete("\"")+delete_space \ + +pynutil.delete("minutes:")+delete_space+pynutil.delete("\"") + pynini.closure(NEMO_CHAR)+pynutil.delete("\"")) graph = hour delete_tokens = self.delete_tokens(graph) diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py index 94bf7a038..9d3e69cd9 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst from nemo_text_processing.text_normalization.rw.verbalizers.time import VerbalizeTimeFst from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py index e191fbf32..953bffdfe 100644 --- a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,11 +17,14 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.rw.graph_utils import ( GraphFst, delete_extra_space, + delete_space_or_punct, delete_space, + NEMO_PUNCT, generator_main, + delete_space ) import os @@ -34,20 +38,20 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False,determin if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["verbalize"] else: - verbalize = VerbalizeFst().fst - word = WordFst().fst - + verbalize = VerbalizeFst(deterministic=deterministic).fst + word = WordFst(deterministic=deterministic).fst types = verbalize | word graph = ( - pynutil.delete("tokens") - + delete_space - + pynutil.delete("{") - + delete_space 
- + types - + delete_space - + pynutil.delete("}") - ) - graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_space) + graph + delete_space + self.fst = graph diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 596723091..0cbd53349 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -1,4 +1,5 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA # Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +21,7 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import generator_main +from nemo_text_processing.text_normalization.rw.graph_utils import generator_main # This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files # tokenize_and_classify.far and verbalize.far for production purposes @@ -270,7 +271,7 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst - elif args.language == 'rw': + elif args.language == 'rw': from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, )