Merge branch 'main' into hi_itn

Signed-off-by: tarushi2k2 <[email protected]>
NVIDIA · Oct 22, 2024 · 8c677dc · 8c677dc
2 parents f80f86a + a3fc6f5
commit 8c677dc
Show file tree

Hide file tree

Showing 298 changed files with 6,274 additions and 692 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -12,9 +12,9 @@ pipeline {
   environment {
 
     AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
-    DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
-    EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
-    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
+    DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-14-24-0'
+    EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-04-24-0'
+    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-25-24-0'
     ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
     FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
     HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
@@ -26,8 +26,9 @@ pipeline {
     IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
     HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
-    JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-15-24-0'
+    JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
     HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-29-24-0'
+
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {

diff --git a/nemo_text_processing/fst_alignment/alignment.py b/nemo_text_processing/fst_alignment/alignment.py
@@ -200,7 +200,7 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: st
         alignment: alignment generated by FST with shortestpath, is longer than original string since including eps transitions
         start: inclusive start position in input string
         end: exclusive end position in input string
-        mode: grammar type for either tn or itn 
+        mode: grammar type for either tn or itn
 
     Returns:
         output_og_start_index: inclusive start position in output string

diff --git a/nemo_text_processing/hybrid/mlm_scorer.py b/nemo_text_processing/hybrid/mlm_scorer.py
@@ -93,7 +93,7 @@ def score_sentence(self, sentence: str):
 
     def __mask_text__(self, idx: int, tokens: List[str]):
         """
-        replaces string at index idx in list `tokens` with a masked token and returns the modified list. 
+        replaces string at index idx in list `tokens` with a masked token and returns the modified list.
         """
         masked = tokens.copy()
         masked[idx] = self.MASK_LABEL

diff --git a/nemo_text_processing/hybrid/model_utils.py b/nemo_text_processing/hybrid/model_utils.py
@@ -74,7 +74,7 @@ def get_masked_score(text, model, do_lower=True):
 
 def _get_ambiguous_positions(sentences: List[str]):
     """returns None or index list of ambiguous semiotic tokens for list of sentences.
-    E.g. if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only 
+    E.g. if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only
     the first semiotic span <street>/<saint> is ambiguous."""
     l_sets = [set([x]) for x in re.findall(r"<\s.+?\s>", sentences[0])]
     for sentence in sentences[1:]:

diff --git a/nemo_text_processing/hybrid/utils.py b/nemo_text_processing/hybrid/utils.py
@@ -390,8 +390,8 @@ def clean_post_norm(
 
 def clean_libri_tts(target: str):
     """
-	Replace abbreviations in LibriTTS dataset
-	"""
+    Replace abbreviations in LibriTTS dataset
+    """
 
     # Normalized text in LibriTTS by Google which contains abbreviations from `libri_sometimes_converts_abbrs` sometimes wasn't converted.
     libri_sometimes_converts_abbrs = {"St.": "saint", "Rev.": "reverend"}
@@ -641,15 +641,15 @@ def get_diff(a: str, b: str):
 
 def diff_pred_gt(pred: str, gt: str):
     """returns list of different substrings between prediction and gt
-    relies on that prediction uses '< '  ' >'  
+    relies on that prediction uses '< '  ' >'
 
     Args:
         pred (str): prediction
         gt (str): ground truth
 
     Returns:
         list of Tuple(pred start and end, gt start and end) subsections
-    
+
     e.g. pred="< Edward third >., king Our own . loss had been < two thousand two hundred >"
          gt  ="Edward III., king Our own loss had been twenty two hundred"
          --> [([0, 16], [0, 10]),      ([32, 34], [26, 26]),      ([48, 76], [40, 58])]

diff --git a/nemo_text_processing/hybrid/wfst_lm_rescoring.py b/nemo_text_processing/hybrid/wfst_lm_rescoring.py
@@ -73,7 +73,7 @@ def threshold_weights(norm_texts_weights, delta: float = 0.2):
     delta: delta to add to minimum weight in options to compose upper limit for threshold
 
     returns:
-        filter list of same format as input 
+        filter list of same format as input
     """
     # threshold value is factor applied to lowest/first weight of all normalization options for every input
     res = []

diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py
@@ -22,8 +22,8 @@ class CardinalFst(GraphFst):
     """
     Finite state transducer for classifying cardinals
         e.g. سالب تسعة وتسعون  -> cardinal { integer: "99" negative: "-" } }
-    Numbers below thirteen are not converted. 
-    Args: 
+    Numbers below thirteen are not converted.
+    Args:
         tn_cardinal: cardinal FST for TN
     """
 

diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py
@@ -29,7 +29,7 @@ class FractionFst(GraphFst):
     """
     Finite state transducer for classifying fraction
         e.g. واحد و نصف -> tokens { integer_part: "1" numerator: "1" denominator: "2" }
-    
+
     Args:
         tn_cardinal: TN cardinal tagger
 

diff --git a/nemo_text_processing/inverse_text_normalization/ar/utils.py b/nemo_text_processing/inverse_text_normalization/ar/utils.py
@@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]):
     Args
         x: integer
 
-    Returns: spoken representation 
+    Returns: spoken representation
     """
     if isinstance(x, int):
         x = str(x)
@@ -41,7 +41,7 @@ def get_abs_path(rel_path):
 
     Args:
         rel_path: relative path to this file
-        
+
     Returns absolute path
     """
     return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py
@@ -21,7 +21,7 @@
 
 class VerbalizeFinalFst(GraphFst):
     """
-    Finite state transducer that verbalizes an entire sentence, e.g. 
+    Finite state transducer that verbalizes an entire sentence, e.g.
     tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
     """
 

diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py
@@ -20,7 +20,7 @@
 
 class CardinalFst(GraphFst):
     """
-    Finite state transducer for classifying cardinals. Numbers below ten are not converted. 
+    Finite state transducer for classifying cardinals. Numbers below ten are not converted.
     Allows both compound numeral strings or separated by whitespace.
     "und" (en: "and") can be inserted between "hundert" and following number or "tausend" and following single or double digit number.
 
@@ -32,7 +32,7 @@ class CardinalFst(GraphFst):
         e.g. ein tausend -> cardinal { integer: "1000" } }
         e.g. eintausend -> cardinal { integer: "1000" } }
         e.g. ein tausend zwanzig -> cardinal { integer: "1020" } }
-    
+
     Args:
         tn_cardinal_tagger: TN cardinal tagger
     """

diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py
@@ -22,7 +22,7 @@ class ElectronicFst(GraphFst):
     """
     Finite state transducer for classifying electronic: email addresses, etc.
         e.g. c d f eins at a b c punkt e d u -> tokens { name: "cdf1.abc.edu" }
-    
+
     Args:
         tn_electronic_tagger: TN electronic tagger
         tn_electronic_verbalizer: TN electronic verbalizer

diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py
@@ -29,7 +29,7 @@ class FractionFst(GraphFst):
         e.g. ein halb -> tokens { name: "1/2" }
         e.g. ein ein halb -> tokens { name: "1 1/2" }
         e.g. drei zwei ein hundertstel -> tokens { name: "3 2/100" }
-    
+
     Args:
         itn_cardinal_tagger: ITN cardinal tagger
         tn_fraction_verbalizer: TN fraction verbalizer

diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py
@@ -20,9 +20,9 @@
 
 class TelephoneFst(GraphFst):
     """
-    Finite state transducer for classifying telephone numbers, e.g. 
+    Finite state transducer for classifying telephone numbers, e.g.
         null vier eins eins eins zwei drei vier eins zwei drei vier -> tokens { name: "(0411) 1234-1234" }
-    
+
     Args:
         tn_cardinal_tagger: TN Cardinal Tagger
     """

diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/time.py b/nemo_text_processing/inverse_text_normalization/de/taggers/time.py
@@ -31,7 +31,7 @@ class TimeFst(GraphFst):
         e.g. drei vor zwölf -> time { minutes: "57" hours: "11" }
         e.g. drei nach zwölf -> time { minutes: "3" hours: "12" }
         e.g. drei uhr zehn minuten zehn sekunden -> time { hours: "3" minutes: "10" seconds: "10" }
-    
+
     Args:
         tn_time_verbalizer: TN time verbalizer
     """

diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py
@@ -23,7 +23,7 @@ class TimeFst(GraphFst):
     Finite state transducer for verbalizing time, e.g.
         time { hours: "8" minutes: "30" zone: "e s t" } -> 08:30 Uhr est
         time { hours: "8" } -> 8 Uhr
-        time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr 
+        time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr
     """
 
     def __init__(self, deterministic: bool = True):

diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py
@@ -22,7 +22,7 @@
 
 class VerbalizeFinalFst(GraphFst):
     """
-    Finite state transducer that verbalizes an entire sentence, e.g. 
+    Finite state transducer that verbalizes an entire sentence, e.g.
     tokens { name: "jetzt" } tokens { name: "ist" } tokens { time { hours: "12" minutes: "30" } } -> jetzt ist 12:30 Uhr
     """
 

diff --git a/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py
@@ -67,7 +67,7 @@ def process(self, instance: Instance) -> Instance:
 
         Args:
             processes given instance with process function
-            
+
         Returns: processed instance if instance belongs to expected class type or original instance
         """
         if instance.token_type != self.class_type:

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
@@ -243,7 +243,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
         self.fst = final_graph.optimize()
 
     def delete_word(self, word: str):
-        """ Capitalizes word for `cased` input"""
+        """Capitalizes word for `cased` input"""
         delete_graph = pynutil.delete(word).optimize()
         if self.input_case == INPUT_CASED:
             if len(word) > 0:

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
@@ -137,7 +137,7 @@ def _get_thousands_graph():
 
 class DateFst(GraphFst):
     """
-    Finite state transducer for classifying date, 
+    Finite state transducer for classifying date,
         e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true }
         e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true }
         e.g. twenty twenty -> date { year: "2020" preserve_order: true }

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py
@@ -41,7 +41,7 @@ def get_quantity(
     e.g. one million -> integer_part: "1" quantity: "million"
     e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"
 
-    Args: 
+    Args:
         decimal: decimal FST
         cardinal_up_to_hundred: cardinal FST
         input_case: accepting either "lower_cased" or "cased" input.

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py
@@ -61,7 +61,7 @@ def get_serial_number(cardinal):
 
 class TelephoneFst(GraphFst):
     """
-    Finite state transducer for classifying telephone numbers, e.g. 
+    Finite state transducer for classifying telephone numbers, e.g.
         one two three one two three five six seven eight -> { number_part: "123-123-5678" }
 
     This class also support card number and IP format.

diff --git a/nemo_text_processing/inverse_text_normalization/en/utils.py b/nemo_text_processing/inverse_text_normalization/en/utils.py
@@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]):
     Args
         x: integer
 
-    Returns: spoken representation 
+    Returns: spoken representation
     """
     if isinstance(x, int):
         x = str(x)
@@ -41,7 +41,7 @@ def get_abs_path(rel_path):
 
     Args:
         rel_path: relative path to this file
-        
+
     Returns absolute path
     """
     return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path

diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py
@@ -18,7 +18,7 @@
 
 class FractionFst(GraphFst):
     """
-    Finite state transducer for verbalizing fraction, 
+    Finite state transducer for verbalizing fraction,
     """
 
     def __init__(self):

diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py
@@ -23,7 +23,7 @@ class TelephoneFst(GraphFst):
     """
     Finite state transducer for verbalizing telephone, e.g.
         telephone { number_part: "123-123-5678" }
-        -> 123-123-5678 
+        -> 123-123-5678
     """
 
     def __init__(self):

diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py
@@ -23,7 +23,7 @@
 
 class VerbalizeFinalFst(GraphFst):
     """
-    Finite state transducer that verbalizes an entire sentence, e.g. 
+    Finite state transducer that verbalizes an entire sentence, e.g.
     tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
     """
 

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py
@@ -31,10 +31,10 @@
 class CardinalFst(GraphFst):
     """
     Finite state transducer for classifying cardinals
-        e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"} 
+        e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"}
     This class converts cardinals up to (but not including) "un cuatrillón",
     i.e up to "one septillion" in English (10^{24}).
-    Cardinals below ten are not converted (in order to avoid 
+    Cardinals below ten are not converted (in order to avoid
     "vivo en una casa" --> "vivo en 1 casa" and any other odd conversions.)
 
     Although technically Spanish grammar requires that "y" only comes after
@@ -199,7 +199,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
         self.fst = final_graph.optimize()
 
     def delete_word(self, word: str):
-        """ Capitalizes word for `cased` input"""
+        """Capitalizes word for `cased` input"""
         delete_graph = pynutil.delete(word).optimize()
         if self.input_case == INPUT_CASED:
             if len(word) > 0:

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py
@@ -28,10 +28,10 @@
 
 class DateFst(GraphFst):
     """
-    Finite state transducer for classifying date, 
+    Finite state transducer for classifying date,
         e.g. primero de enero -> date { day: "1" month: "enero" }
         e.g. uno de enero -> date { day: "1" month: "enero" }
-    
+
     Args:
         cardinal: CardinalFst
         input_case: accepting either "lower_cased" or "cased" input.

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py
@@ -38,7 +38,7 @@ def get_quantity(
     e.g. one million -> integer_part: "1" quantity: "million"
     e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"
 
-    Args: 
+    Args:
         decimal: decimal FST
         cardinal_up_to_million: cardinal FST
         input_case: accepting either "lower_cased" or "cased" input.
@@ -87,7 +87,7 @@ class DecimalFst(GraphFst):
         This decimal rule assumes that decimals can be pronounced as:
         (a cardinal) + ('coma' or 'punto') plus (any sequence of cardinals <1000, including 'zero')
 
-        Also writes large numbers in shortened form, e.g. 
+        Also writes large numbers in shortened form, e.g.
             e.g. uno coma dos seis millón -> decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" }
             e.g. dos millones -> decimal { negative: "false" integer_part: "2" quantity: "millones" }
             e.g. mil ochocientos veinticuatro millones -> decimal { negative: "false" integer_part: "1824" quantity: "millones" }

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py
@@ -36,7 +36,7 @@ class ElectronicFst(GraphFst):
     and URLS (which get converted to a "protocol" field).
         e.g. c d f uno arroba a b c punto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
         e.g. doble ve doble ve doble ve a b c punto e d u -> tokens { electronic { protocol: "www.abc.edu" } }
-    
+
     Args:
         input_case: accepting either "lower_cased" or "cased" input.
     """

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py
@@ -23,18 +23,18 @@
 class FractionFst(GraphFst):
     """
     Finite state transducer for classifying fractions
-        e.g. dos quintos -> fraction { numerator: "2" denominator: "5" } 
-    This class converts fractions with a denominator up to (and including) 
+        e.g. dos quintos -> fraction { numerator: "2" denominator: "5" }
+    This class converts fractions with a denominator up to (and including)
     "1/999".
-    
+
     Fractions with 4 as their denominator, read as "cuarto(s)", are not
     converted because "room" is also "cuarto", which could cause issues like
         "quiero reservar un cuarto" -> "quiero reservar 1/4".
-    
+
     Fractions without a numerator are not converted either to prevent issues
     like:
         "estaba medio dormido" -> "estaba 1/2 dormido"
-        
+
     Args:
         cardinal: CardinalFst
         ordinal: OrdinalFst

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py
@@ -32,7 +32,7 @@
 class MeasureFst(GraphFst):
     """
     Finite state transducer for classifying measure
-        e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" } 
+        e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" }
 
     Args:
         cardinal: CardinalFst