Unioning the lower and upper Roman graphs into one

Signed-off-by: Simon Zuberek <[email protected]>
NVIDIA · Aug 30, 2024 · db53aea · db53aea
1 parent f661c67
commit db53aea
Show file tree

Hide file tree

Showing 2 changed files with 84 additions and 50 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -14,8 +14,8 @@ pipeline {
     AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
     DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
     EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
-    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-29-24-0'
-    ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-29-24-0'
+    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
+    ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
     FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
     HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
     PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py
@@ -15,29 +15,52 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    NEMO_SIGMA,
+    NEMO_SPACE,
+)
 from nemo_text_processing.text_normalization.es import LOCALIZATION
 from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels
 
-digits = pynini.project(pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input")
-tens = pynini.project(pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input")
-teens = pynini.project(pynini.string_file(get_abs_path("data/numbers/teen.tsv")), "input")
-twenties = pynini.project(pynini.string_file(get_abs_path("data/numbers/twenties.tsv")), "input")
-hundreds = pynini.project(pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")), "input")
-
-accents = pynini.string_map([("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u")])
-
-if LOCALIZATION == "am":  # Setting localization for central and northern america formatting
+digits = pynini.project(
+    pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input"
+)
+tens = pynini.project(
+    pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input"
+)
+teens = pynini.project(
+    pynini.string_file(get_abs_path("data/numbers/teen.tsv")), "input"
+)
+twenties = pynini.project(
+    pynini.string_file(get_abs_path("data/numbers/twenties.tsv")), "input"
+)
+hundreds = pynini.project(
+    pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")), "input"
+)
+
+accents = pynini.string_map(
+    [("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u")]
+)
+
+if (
+    LOCALIZATION == "am"
+):  # Set localization for Central and North American formatting
     cardinal_separator = pynini.string_map([",", NEMO_SPACE])
     decimal_separator = pynini.accep(".")
 else:
     cardinal_separator = pynini.string_map([".", NEMO_SPACE])
     decimal_separator = pynini.accep(",")
 
 ones = pynini.union("un", "ún")
-fem_ones = pynini.union(pynini.cross("un", "una"), pynini.cross("ún", "una"), pynini.cross("uno", "una"))
-one_to_one_hundred = pynini.union(digits, "uno", tens, teens, twenties, tens + pynini.accep(" y ") + digits)
-fem_hundreds = hundreds @ pynini.cdrewrite(pynini.cross("ientos", "ientas"), "", "", NEMO_SIGMA)
+fem_ones = pynini.union(
+    pynini.cross("un", "una"), pynini.cross("ún", "una"), pynini.cross("uno", "una")
+)
+one_to_one_hundred = pynini.union(
+    digits, "uno", tens, teens, twenties, tens + pynini.accep(" y ") + digits
+)
+fem_hundreds = hundreds @ pynini.cdrewrite(
+    pynini.cross("ientos", "ientas"), "", "", NEMO_SIGMA
+)
 
 
 ES_MINUS = pynini.union("menos", "Menos", "MENOS").optimize()
@@ -78,12 +101,16 @@ def shift_cardinal_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
         + pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1)
         + pynini.union(pynini.accep("[EOS]"), pynini.accep('"'), decimal_separator)
     )
-    before_double_digits = pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1) + pynini.union(
-        pynini.accep("[EOS]"), pynini.accep('"')
-    )
+    before_double_digits = pynini.closure(
+        NEMO_SPACE + one_to_one_hundred, 0, 1
+    ) + pynini.union(pynini.accep("[EOS]"), pynini.accep('"'))
 
-    fem_allign = pynini.cdrewrite(fem_hundreds, "", before_mil, NEMO_SIGMA)  # doscientas mil dosciento
-    fem_allign @= pynini.cdrewrite(fem_hundreds, "", before_double_digits, NEMO_SIGMA)  # doscientas mil doscienta
+    fem_allign = pynini.cdrewrite(
+        fem_hundreds, "", before_mil, NEMO_SIGMA
+    )  # doscientas mil dosciento
+    fem_allign @= pynini.cdrewrite(
+        fem_hundreds, "", before_double_digits, NEMO_SIGMA
+    )  # doscientas mil doscienta
 
     fem_allign @= pynini.cdrewrite(
         fem_ones, "", pynini.union("[EOS]", '"', decimal_separator), NEMO_SIGMA
@@ -107,7 +134,10 @@ def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
     fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA)
     fem_allign @= pynini.cdrewrite(
-        fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA,
+        fem_ones,
+        "",
+        pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')),
+        NEMO_SIGMA,
     )  # If before a quote or EOS, we know it's the end of a string
 
     return fst @ fem_allign
@@ -169,46 +199,50 @@ def _load_roman(file: str, upper_casing: bool):
     # A split between all upper-case and all lower-case Roman numerals is introduced in order to preserve orthographic accuracy,
     # and to prevent cases in which certain proper nouns (e.g. Li, Xi, Yi) are transduced to Roman numerals.
 
-    digit = _load_roman("data/roman/digit.tsv", False)
+    digit_lower = _load_roman("data/roman/digit.tsv", False)
     digit_upper = _load_roman("data/roman/digit.tsv", True)
-    ties = _load_roman("data/roman/ties.tsv", False)
+    ties_lower = _load_roman("data/roman/ties.tsv", False)
     ties_upper = _load_roman("data/roman/ties.tsv", True)
-    hundreds = _load_roman("data/roman/hundreds.tsv", False)
+    hundreds_lower = _load_roman("data/roman/hundreds.tsv", False)
     hundreds_upper = _load_roman("data/roman/hundreds.tsv", True)
-    thousands = _load_roman("data/roman/thousands.tsv", False)
+    thousands_lower = _load_roman("data/roman/thousands.tsv", False)
     thousands_upper = _load_roman("data/roman/thousands.tsv", True)
 
     graph = (
-        digit
-        | ties + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
+        (digit_upper | digit_lower)
         | (
-            hundreds
-            + (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            (ties_upper + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01)))
+            | (
+                ties_lower
+                + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
         )
         | (
-            thousands
-            + (hundreds | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
-        )
-    ).optimize()
-
-    graph_upper = (
-        digit_upper
-        | ties_upper + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
-        | (
-            hundreds_upper
-            + (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            (
+                hundreds_upper
+                + (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
+            | (
+                hundreds_lower
+                + (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
         )
         | (
-            thousands_upper
-            + (hundreds_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            (
+                thousands_upper
+                + (hundreds_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
+            | (
+                thousands_lower
+                + (hundreds_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
         )
     ).optimize()
 
-    graph_all_romans = graph | graph_upper
-    return graph_all_romans @ fst
+    return graph @ fst