ES TN Fix for Issue #166 (#224)

* ES TN Fix for Issue #166
  Signed-off-by: Simon Zuberek <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Updates the cache
  Signed-off-by: Simon Zuberek <[email protected]>
* Unioning the lower and upper Roman graphs into one
  Signed-off-by: Simon Zuberek <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Removes all upper-case Roman numerals from data files
  Signed-off-by: Simon Zuberek <[email protected]>
---------
Signed-off-by: Simon Zuberek <[email protected]>
Co-authored-by: Simon Zuberek <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
NVIDIA · Sep 3, 2024 · 92bdf93 · 92bdf93
1 parent 2cb0275
commit 92bdf93
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 43 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -14,8 +14,8 @@ pipeline {
     AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
     DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
     EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
-    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
-    ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
+    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
+    ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
     FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
     HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
     PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

diff --git a/nemo_text_processing/text_normalization/es/data/ordinals/roman_exceptions.tsv b/nemo_text_processing/text_normalization/es/data/ordinals/roman_exceptions.tsv
@@ -1,23 +1,10 @@
 di
-Di
-DI
 mi
-Mi
-MI
 vi
-Vi
-VI
-I
 i
-V
 v
-X
 x
-L
 l
-C
 c
-D
 d
-M
 m
diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py
@@ -44,7 +44,7 @@
 ES_PLUS = pynini.union("más", "Más", "MÁS").optimize()
 
 
-def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+def strip_accent(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
     Converts all accented vowels to non-accented equivalents
 
@@ -54,7 +54,7 @@ def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike':
     return fst @ pynini.cdrewrite(accents, "", "", NEMO_SIGMA)
 
 
-def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+def shift_cardinal_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
     Applies gender conversion rules to a cardinal string. These include: rendering all masculine forms of "uno" (including apocopated forms) as "una" and
     converting all gendered numbers in the hundreds series (200,300,400...) to the feminine equivalent (e.g. "doscientos" -> "doscientas"). Conversion only applies
@@ -76,23 +76,23 @@ def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
         + (pynini.accep("mil") | pynini.accep("milésimo"))
         + pynini.closure(NEMO_SPACE + hundreds, 0, 1)
         + pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1)
-        + pynini.union(pynini.accep("[EOS]"), pynini.accep("\""), decimal_separator)
+        + pynini.union(pynini.accep("[EOS]"), pynini.accep('"'), decimal_separator)
     )
     before_double_digits = pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1) + pynini.union(
-        pynini.accep("[EOS]"), pynini.accep("\"")
+        pynini.accep("[EOS]"), pynini.accep('"')
     )
 
     fem_allign = pynini.cdrewrite(fem_hundreds, "", before_mil, NEMO_SIGMA)  # doscientas mil dosciento
     fem_allign @= pynini.cdrewrite(fem_hundreds, "", before_double_digits, NEMO_SIGMA)  # doscientas mil doscienta
 
     fem_allign @= pynini.cdrewrite(
-        fem_ones, "", pynini.union("[EOS]", "\"", decimal_separator), NEMO_SIGMA
+        fem_ones, "", pynini.union("[EOS]", '"', decimal_separator), NEMO_SIGMA
     )  # If before a quote or EOS, we know it's the end of a string
 
     return fst @ fem_allign
 
 
-def shift_number_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
     Performs gender conversion on all verbalized numbers in output. All values in the hundreds series (200,300,400) are changed to
     feminine gender (e.g. "doscientos" -> "doscientas") and all forms of "uno" (including apocopated forms) are converted to "una".
@@ -107,13 +107,13 @@ def shift_number_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
     """
     fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA)
     fem_allign @= pynini.cdrewrite(
-        fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep("\"")), NEMO_SIGMA
+        fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA,
     )  # If before a quote or EOS, we know it's the end of a string
 
     return fst @ fem_allign
 
 
-def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+def strip_cardinal_apocope(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
     Reverts apocope on cardinal strings in line with formation rules. e.g. "un" -> "uno". Due to cardinal formation rules, this in effect only
     affects strings where the final value is a variation of "un".
@@ -126,11 +126,11 @@ def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
     """
     # Since cardinals use apocope by default for large values (e.g. "millón"), this only needs to act on the last instance of one
     strip = pynini.cross("un", "uno") | pynini.cross("ún", "uno")
-    strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), NEMO_SIGMA)
+    strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", '"'), NEMO_SIGMA)
     return fst @ strip
 
 
-def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+def add_cardinal_apocope_fem(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
     Adds apocope on cardinal strings in line with stressing rules. e.g. "una" -> "un". This only occurs when "una" precedes a stressed "a" sound in formal speech. This is not predictable
     with text string, so is included for non-deterministic cases.
@@ -143,11 +143,11 @@ def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
     """
     # Since the stress trigger follows the cardinal string and only affects the preceding sound, this only needs to act on the last instance of one
     strip = pynini.cross("una", "un") | pynini.cross("veintiuna", "veintiún")
-    strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), NEMO_SIGMA)
+    strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", '"'), NEMO_SIGMA)
     return fst @ strip
 
 
-def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+def roman_to_int(fst: "pynini.FstLike") -> "pynini.FstLike":
     """
     Alters given fst to convert Roman integers (lower and upper cased) into Arabic numerals. Valid for values up to 1000.
     e.g.
@@ -158,29 +158,57 @@ def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
         fst: Any fst. Composes fst onto Roman conversion outputs.
     """
 
-    def _load_roman(file: str):
+    def _load_roman(file: str, upper_casing: bool):
         roman = load_labels(get_abs_path(file))
-        roman_numerals = [(x, y) for x, y in roman] + [(x.upper(), y) for x, y in roman]
+        if upper_casing:
+            roman_numerals = [(x.upper(), y) for x, y in roman]
+        else:
+            roman_numerals = [(x, y) for x, y in roman]
         return pynini.string_map(roman_numerals)
 
-    digit = _load_roman("data/roman/digit.tsv")
-    ties = _load_roman("data/roman/ties.tsv")
-    hundreds = _load_roman("data/roman/hundreds.tsv")
-    thousands = _load_roman("data/roman/thousands.tsv")
+    # Upper-case and lower-case Roman numerals are split into separate graphs in order to preserve orthographic accuracy
+    # and to prevent certain proper nouns (e.g. Li, Xi, Yi) from being transduced as Roman numerals.
+
+    digit_lower = _load_roman("data/roman/digit.tsv", False)
+    digit_upper = _load_roman("data/roman/digit.tsv", True)
+    ties_lower = _load_roman("data/roman/ties.tsv", False)
+    ties_upper = _load_roman("data/roman/ties.tsv", True)
+    hundreds_lower = _load_roman("data/roman/hundreds.tsv", False)
+    hundreds_upper = _load_roman("data/roman/hundreds.tsv", True)
+    thousands_lower = _load_roman("data/roman/thousands.tsv", False)
+    thousands_upper = _load_roman("data/roman/thousands.tsv", True)
 
     graph = (
-        digit
-        | ties + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
+        (digit_upper | digit_lower)
+        | (
+            (ties_upper + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01)))
+            | (ties_lower + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01)))
+        )
         | (
-            hundreds
-            + (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            (
+                hundreds_upper
+                + (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
+            | (
+                hundreds_lower
+                + (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
         )
         | (
-            thousands
-            + (hundreds | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
-            + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            (
+                thousands_upper
+                + (hundreds_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
+            | (
+                thousands_lower
+                + (hundreds_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+                + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+            )
         )
     ).optimize()
 

diff --git a/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt
@@ -114,4 +114,7 @@
 1ra~primera
 maría vii~maría séptima~maría séptimo
 todo mi reconocimiento~todo mi reconocimiento
-V~V
+V~V
+El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico.~El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico.
+Xi Jinping es el actual presidente de China.~Xi Jinping es el actual presidente de China.
+Matías fue el XI apóstol.~Matías fue el undécimo apóstol.