Skip to content

Commit

Permalink
Unioning the lower and upper Roman graphs into one
Browse files Browse the repository at this point in the history
Signed-off-by: Simon Zuberek <[email protected]>
  • Loading branch information
Simon Zuberek committed Aug 30, 2024
1 parent f661c67 commit db53aea
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 50 deletions.
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ pipeline {
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-29-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-29-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
130 changes: 82 additions & 48 deletions nemo_text_processing/text_normalization/es/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,52 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_SIGMA,
NEMO_SPACE,
)
from nemo_text_processing.text_normalization.es import LOCALIZATION
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

digits = pynini.project(pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input")
tens = pynini.project(pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input")
teens = pynini.project(pynini.string_file(get_abs_path("data/numbers/teen.tsv")), "input")
twenties = pynini.project(pynini.string_file(get_abs_path("data/numbers/twenties.tsv")), "input")
hundreds = pynini.project(pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")), "input")

accents = pynini.string_map([("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u")])

if LOCALIZATION == "am": # Setting localization for Central and North American formatting
# Acceptors for the basic number vocabulary: each TSV is loaded as a transducer
# with pynini.string_file and then projected onto its input side, so only the
# first column of each file is kept.
# NOTE(review): the input column presumably holds the spelled-out Spanish forms
# (these acceptors are unioned with "uno" and " y " elsewhere in this module) —
# confirm against the data files.
digits = pynini.project(
pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input"
)
tens = pynini.project(
pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input"
)
teens = pynini.project(
pynini.string_file(get_abs_path("data/numbers/teen.tsv")), "input"
)
twenties = pynini.project(
pynini.string_file(get_abs_path("data/numbers/twenties.tsv")), "input"
)
hundreds = pynini.project(
pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")), "input"
)

# Transducer mapping each acute-accented vowel to its unaccented counterpart.
accents = pynini.string_map(
[("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u")]
)

if (
LOCALIZATION == "am"
): # Setting localization for Central and North American formatting
cardinal_separator = pynini.string_map([",", NEMO_SPACE])
decimal_separator = pynini.accep(".")
else:
cardinal_separator = pynini.string_map([".", NEMO_SPACE])
decimal_separator = pynini.accep(",")

ones = pynini.union("un", "ún")
fem_ones = pynini.union(pynini.cross("un", "una"), pynini.cross("ún", "una"), pynini.cross("uno", "una"))
one_to_one_hundred = pynini.union(digits, "uno", tens, teens, twenties, tens + pynini.accep(" y ") + digits)
fem_hundreds = hundreds @ pynini.cdrewrite(pynini.cross("ientos", "ientas"), "", "", NEMO_SIGMA)
fem_ones = pynini.union(
pynini.cross("un", "una"), pynini.cross("ún", "una"), pynini.cross("uno", "una")
)
one_to_one_hundred = pynini.union(
digits, "uno", tens, teens, twenties, tens + pynini.accep(" y ") + digits
)
fem_hundreds = hundreds @ pynini.cdrewrite(
pynini.cross("ientos", "ientas"), "", "", NEMO_SIGMA
)


ES_MINUS = pynini.union("menos", "Menos", "MENOS").optimize()
Expand Down Expand Up @@ -78,12 +101,16 @@ def shift_cardinal_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
+ pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1)
+ pynini.union(pynini.accep("[EOS]"), pynini.accep('"'), decimal_separator)
)
before_double_digits = pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1) + pynini.union(
pynini.accep("[EOS]"), pynini.accep('"')
)
before_double_digits = pynini.closure(
NEMO_SPACE + one_to_one_hundred, 0, 1
) + pynini.union(pynini.accep("[EOS]"), pynini.accep('"'))

fem_allign = pynini.cdrewrite(fem_hundreds, "", before_mil, NEMO_SIGMA) # doscientas mil dosciento
fem_allign @= pynini.cdrewrite(fem_hundreds, "", before_double_digits, NEMO_SIGMA) # doscientas mil doscienta
fem_allign = pynini.cdrewrite(
fem_hundreds, "", before_mil, NEMO_SIGMA
) # doscientas mil dosciento
fem_allign @= pynini.cdrewrite(
fem_hundreds, "", before_double_digits, NEMO_SIGMA
) # doscientas mil doscienta

fem_allign @= pynini.cdrewrite(
fem_ones, "", pynini.union("[EOS]", '"', decimal_separator), NEMO_SIGMA
Expand All @@ -107,7 +134,10 @@ def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA)
fem_allign @= pynini.cdrewrite(
fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA,
fem_ones,
"",
pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')),
NEMO_SIGMA,
) # If before a quote or EOS, we know it's the end of a string

return fst @ fem_allign
Expand Down Expand Up @@ -169,46 +199,50 @@ def _load_roman(file: str, upper_casing: bool):
# Upper-case and lower-case Roman numerals are loaded as separate graphs in order to preserve orthographic accuracy
# and to prevent certain proper nouns (e.g. Li, Xi, Yi) from being transduced to Roman numerals.

digit = _load_roman("data/roman/digit.tsv", False)
digit_lower = _load_roman("data/roman/digit.tsv", False)
digit_upper = _load_roman("data/roman/digit.tsv", True)
ties = _load_roman("data/roman/ties.tsv", False)
ties_lower = _load_roman("data/roman/ties.tsv", False)
ties_upper = _load_roman("data/roman/ties.tsv", True)
hundreds = _load_roman("data/roman/hundreds.tsv", False)
hundreds_lower = _load_roman("data/roman/hundreds.tsv", False)
hundreds_upper = _load_roman("data/roman/hundreds.tsv", True)
thousands = _load_roman("data/roman/thousands.tsv", False)
thousands_lower = _load_roman("data/roman/thousands.tsv", False)
thousands_upper = _load_roman("data/roman/thousands.tsv", True)

graph = (
digit
| ties + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(digit_upper | digit_lower)
| (
hundreds
+ (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(ties_upper + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01)))
| (
ties_lower
+ (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
)
| (
thousands
+ (hundreds | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
).optimize()

graph_upper = (
digit_upper
| ties_upper + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
| (
hundreds_upper
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
(
hundreds_upper
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
| (
hundreds_lower
+ (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
)
| (
thousands_upper
+ (hundreds_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
(
thousands_upper
+ (hundreds_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
| (
thousands_lower
+ (hundreds_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
)
).optimize()

graph_all_romans = graph | graph_upper
return graph_all_romans @ fst
return graph @ fst

0 comments on commit db53aea

Please sign in to comment.