Skip to content

Commit

Permalink
ES TN Fix for Issue #166 (#224)
Browse files Browse the repository at this point in the history
* ES TN Fix for Issue #166

Signed-off-by: Simon Zuberek <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Updates the cache

Signed-off-by: Simon Zuberek <[email protected]>

* Unioning the lower and upper Roman graphs into one

Signed-off-by: Simon Zuberek <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Removes all upper-case Roman numerals from data files

Signed-off-by: Simon Zuberek <[email protected]>

---------

Signed-off-by: Simon Zuberek <[email protected]>
Co-authored-by: Simon Zuberek <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 3, 2024
1 parent 2cb0275 commit 92bdf93
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 43 deletions.
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ pipeline {
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,10 @@
di
Di
DI
mi
Mi
MI
vi
Vi
VI
I
i
V
v
X
x
L
l
C
c
D
d
M
m
82 changes: 55 additions & 27 deletions nemo_text_processing/text_normalization/es/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
ES_PLUS = pynini.union("más", "Más", "MÁS").optimize()


def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def strip_accent(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Converts all accented vowels to non-accented equivalents
Expand All @@ -54,7 +54,7 @@ def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike':
return fst @ pynini.cdrewrite(accents, "", "", NEMO_SIGMA)


def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def shift_cardinal_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Applies gender conversion rules to a cardinal string. These include: rendering all masculine forms of "uno" (including apocopated forms) as "una" and
converting all gendered numbers in the hundreds series (200,300,400...) to their feminine equivalents (e.g. "doscientos" -> "doscientas"). Conversion only applies
Expand All @@ -76,23 +76,23 @@ def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+ (pynini.accep("mil") | pynini.accep("milésimo"))
+ pynini.closure(NEMO_SPACE + hundreds, 0, 1)
+ pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1)
+ pynini.union(pynini.accep("[EOS]"), pynini.accep("\""), decimal_separator)
+ pynini.union(pynini.accep("[EOS]"), pynini.accep('"'), decimal_separator)
)
before_double_digits = pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1) + pynini.union(
pynini.accep("[EOS]"), pynini.accep("\"")
pynini.accep("[EOS]"), pynini.accep('"')
)

fem_allign = pynini.cdrewrite(fem_hundreds, "", before_mil, NEMO_SIGMA) # doscientas mil dosciento
fem_allign @= pynini.cdrewrite(fem_hundreds, "", before_double_digits, NEMO_SIGMA) # doscientas mil doscienta

fem_allign @= pynini.cdrewrite(
fem_ones, "", pynini.union("[EOS]", "\"", decimal_separator), NEMO_SIGMA
fem_ones, "", pynini.union("[EOS]", '"', decimal_separator), NEMO_SIGMA
) # If before a quote or EOS, we know it's the end of a string

return fst @ fem_allign


def shift_number_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Performs gender conversion on all verbalized numbers in output. All values in the hundreds series (200,300,400) are changed to
feminine gender (e.g. "doscientos" -> "doscientas") and all forms of "uno" (including apocopated forms) are converted to "una".
Expand All @@ -107,13 +107,13 @@ def shift_number_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
"""
fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA)
fem_allign @= pynini.cdrewrite(
fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep("\"")), NEMO_SIGMA
fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA,
) # If before a quote or EOS, we know it's the end of a string

return fst @ fem_allign


def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def strip_cardinal_apocope(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Reverts apocope on cardinal strings in line with formation rules. e.g. "un" -> "uno". Due to cardinal formation rules, this in effect only
affects strings where the final value is a variation of "un".
Expand All @@ -126,11 +126,11 @@ def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
"""
# Since cardinals use apocope by default for large values (e.g. "millón"), this only needs to act on the last instance of one
strip = pynini.cross("un", "uno") | pynini.cross("ún", "uno")
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), NEMO_SIGMA)
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", '"'), NEMO_SIGMA)
return fst @ strip


def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def add_cardinal_apocope_fem(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Adds apocope on cardinal strings in line with stressing rules, e.g. "una" -> "un". This only occurs when "una" precedes a stressed "a" sound in formal speech. This cannot be predicted
from the text string alone, so it is included for non-deterministic cases.
Expand All @@ -143,11 +143,11 @@ def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
"""
# Since the stress trigger follows the cardinal string and only affects the preceding sound, this only needs to act on the last instance of one
strip = pynini.cross("una", "un") | pynini.cross("veintiuna", "veintiún")
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), NEMO_SIGMA)
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", '"'), NEMO_SIGMA)
return fst @ strip


def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def roman_to_int(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Alters given fst to convert Roman integers (lower and upper cased) into Arabic numerals. Valid for values up to 1000.
e.g.
Expand All @@ -158,29 +158,57 @@ def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
fst: Any fst. Composes fst onto Roman conversion outputs.
"""

def _load_roman(file: str):
def _load_roman(file: str, upper_casing: bool):
roman = load_labels(get_abs_path(file))
roman_numerals = [(x, y) for x, y in roman] + [(x.upper(), y) for x, y in roman]
if upper_casing:
roman_numerals = [(x.upper(), y) for x, y in roman]
else:
roman_numerals = [(x, y) for x, y in roman]
return pynini.string_map(roman_numerals)

digit = _load_roman("data/roman/digit.tsv")
ties = _load_roman("data/roman/ties.tsv")
hundreds = _load_roman("data/roman/hundreds.tsv")
thousands = _load_roman("data/roman/thousands.tsv")
# Upper-case and lower-case Roman numerals are handled by separate graphs in order to preserve orthographic accuracy
# and to prevent mixed-case proper nouns (e.g. Li, Xi, Yi) from being misread as Roman numerals.

digit_lower = _load_roman("data/roman/digit.tsv", False)
digit_upper = _load_roman("data/roman/digit.tsv", True)
ties_lower = _load_roman("data/roman/ties.tsv", False)
ties_upper = _load_roman("data/roman/ties.tsv", True)
hundreds_lower = _load_roman("data/roman/hundreds.tsv", False)
hundreds_upper = _load_roman("data/roman/hundreds.tsv", True)
thousands_lower = _load_roman("data/roman/thousands.tsv", False)
thousands_upper = _load_roman("data/roman/thousands.tsv", True)

graph = (
digit
| ties + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(digit_upper | digit_lower)
| (
(ties_upper + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01)))
| (ties_lower + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01)))
)
| (
hundreds
+ (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(
hundreds_upper
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
| (
hundreds_lower
+ (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
)
| (
thousands
+ (hundreds | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(
thousands_upper
+ (hundreds_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
| (
thousands_lower
+ (hundreds_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
)
).optimize()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,7 @@
1ra~primera
maría vii~maría séptima~maría séptimo
todo mi reconocimiento~todo mi reconocimiento
V~V
V~V
El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico.~El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico.
Xi Jinping es el actual presidente de China.~Xi Jinping es el actual presidente de China.
Matías fue el XI apóstol.~Matías fue el undécimo apóstol.

0 comments on commit 92bdf93

Please sign in to comment.