From 1bc493082c25a9704e5ee6e9c81d773306c562c8 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Tue, 17 Sep 2024 18:01:11 +0200 Subject: [PATCH] Cardinals up to a hundred trillions, timeFST and transliteration (#209) * Cardinals up to a hundred trillions, timeFST and transliteration Signed-off-by: kurt0cougar * Cardinals up to a hundred trillions, timeFST and transliteration Signed-off-by: kurt0cougar * Cardinals up to a hundred trillions, timeFST and transliteration (moving constants to data files). Signed-off-by: kurt0cougar * Update test_cases_word.txt Signed-off-by: kurt0cougar * Update graph_utils.py Signed-off-by: kurt0cougar * Cardinals up to a hundred trillions, timeFST and transliteration - reformatteda Signed-off-by: kurt0cougar * Disabled Black during formatting. Signed-off-by: kurt0cougar --------- Signed-off-by: kurt0cougar --- .../fst_alignment/alignment.py | 2 +- nemo_text_processing/hybrid/mlm_scorer.py | 2 +- nemo_text_processing/hybrid/model_utils.py | 2 +- nemo_text_processing/hybrid/utils.py | 8 +- .../hybrid/wfst_lm_rescoring.py | 2 +- .../ar/taggers/cardinal.py | 4 +- .../ar/taggers/fraction.py | 2 +- .../inverse_text_normalization/ar/utils.py | 4 +- .../ar/verbalizers/verbalize_final.py | 2 +- .../de/taggers/cardinal.py | 4 +- .../de/taggers/electronic.py | 2 +- .../de/taggers/fraction.py | 2 +- .../de/taggers/telephone.py | 4 +- .../de/taggers/time.py | 2 +- .../de/verbalizers/time.py | 2 +- .../de/verbalizers/verbalize_final.py | 2 +- .../en/clean_eval_data.py | 2 +- .../en/taggers/cardinal.py | 2 +- .../en/taggers/date.py | 2 +- .../en/taggers/decimal.py | 2 +- .../en/taggers/telephone.py | 2 +- .../inverse_text_normalization/en/utils.py | 4 +- .../en/verbalizers/fraction.py | 2 +- .../en/verbalizers/telephone.py | 2 +- .../en/verbalizers/verbalize_final.py | 2 +- .../es/taggers/cardinal.py | 6 +- .../es/taggers/date.py | 4 +- .../es/taggers/decimal.py | 4 +- .../es/taggers/electronic.py | 2 +- .../es/taggers/fraction.py | 10 +- 
.../es/taggers/measure.py | 2 +- .../es/taggers/ordinal.py | 2 +- .../es/taggers/telephone.py | 4 +- .../es/taggers/time.py | 12 +- .../inverse_text_normalization/es/utils.py | 2 +- .../es/verbalizers/telephone.py | 2 +- .../es/verbalizers/verbalize_final.py | 2 +- .../inverse_text_normalization/es_en/utils.py | 2 +- .../es_en/verbalizers/verbalize_final.py | 2 +- .../fr/taggers/cardinal.py | 10 +- .../fr/taggers/decimal.py | 6 +- .../fr/taggers/fraction.py | 2 +- .../fr/taggers/ordinal.py | 2 +- .../fr/taggers/telephone.py | 2 +- .../inverse_text_normalization/fr/utils.py | 2 +- .../fr/verbalizers/decimal.py | 4 +- .../fr/verbalizers/telephone.py | 2 +- .../fr/verbalizers/time.py | 2 +- .../fr/verbalizers/verbalize_final.py | 2 +- .../inverse_text_normalization/hy/utils.py | 4 +- .../ja/taggers/cardinal.py | 2 +- .../ja/taggers/date.py | 2 +- .../ja/taggers/decimal.py | 4 +- .../ja/taggers/fraction.py | 2 +- .../ja/taggers/fraction_old.py | 2 +- .../ja/taggers/ordinal.py | 2 +- .../ja/taggers/preprocessor.py | 14 +- .../inverse_text_normalization/ja/utils.py | 2 +- .../ja/verbalizers/date.py | 2 +- .../ja/verbalizers/fraction.py | 2 +- .../ja/verbalizers/fraction_old.py | 2 +- .../ja/verbalizers/post_processing.py | 6 +- .../ja/verbalizers/postprocessor.py | 8 +- .../ja/verbalizers/time.py | 6 +- .../ja/verbalizers/verbalize_final.py | 4 +- .../ja/verbalizers/whitelist.py | 2 +- .../ja/verbalizers/word.py | 2 +- .../mr/taggers/decimal.py | 10 +- .../mr/taggers/time.py | 16 +- .../inverse_text_normalization/mr/utils.py | 4 +- .../mr/verbalizers/time.py | 10 +- .../ru/taggers/cardinal.py | 2 +- .../ru/verbalizers/verbalize_final.py | 2 +- .../sv/taggers/cardinal.py | 2 +- .../sv/taggers/electronic.py | 2 +- .../sv/taggers/fraction.py | 2 +- .../sv/taggers/telephone.py | 4 +- .../sv/taggers/time.py | 2 +- .../inverse_text_normalization/sv/utils.py | 2 +- .../sv/verbalizers/verbalize_final.py | 2 +- .../zh/taggers/cardinal.py | 2 +- .../zh/taggers/fraction.py | 2 +- 
.../zh/taggers/tokenize_and_classify.py | 2 +- .../inverse_text_normalization/zh/utils.py | 4 +- .../zh/verbalizers/verbalize.py | 2 +- .../zh/verbalizers/verbalize_final.py | 2 +- .../text_normalization/ar/taggers/cardinal.py | 2 +- .../text_normalization/ar/taggers/decimal.py | 4 +- .../text_normalization/ar/taggers/fraction.py | 2 +- .../text_normalization/ar/taggers/money.py | 2 +- .../text_normalization/ar/utils.py | 4 +- .../ar/verbalizers/measure.py | 2 +- .../ar/verbalizers/verbalize_final.py | 2 +- .../text_normalization/data_loader_utils.py | 12 +- .../text_normalization/de/taggers/cardinal.py | 2 +- .../text_normalization/de/taggers/date.py | 2 +- .../text_normalization/de/taggers/decimal.py | 4 +- .../text_normalization/de/taggers/ordinal.py | 2 +- .../de/taggers/telephone.py | 4 +- .../text_normalization/de/utils.py | 2 +- .../de/verbalizers/decimal.py | 4 +- .../de/verbalizers/measure.py | 2 +- .../de/verbalizers/telephone.py | 2 +- .../de/verbalizers/verbalize_final.py | 2 +- .../text_normalization/en/clean_eval_data.py | 2 +- .../text_normalization/en/taggers/cardinal.py | 2 +- .../text_normalization/en/taggers/date.py | 8 +- .../text_normalization/en/taggers/decimal.py | 4 +- .../text_normalization/en/taggers/ordinal.py | 2 +- .../text_normalization/en/taggers/range.py | 2 +- .../text_normalization/en/taggers/serial.py | 2 +- .../en/taggers/telephone.py | 6 +- .../text_normalization/en/taggers/time.py | 2 +- .../en/taggers/tokenize_and_classify_lm.py | 2 +- .../tokenize_and_classify_with_audio.py | 2 +- .../text_normalization/en/utils.py | 4 +- .../en/verbalizers/measure.py | 2 +- .../en/verbalizers/post_processing.py | 6 +- .../text_normalization/es/taggers/ordinal.py | 2 +- .../es/taggers/telephone.py | 4 +- .../es/verbalizers/cardinal.py | 12 +- .../es/verbalizers/decimals.py | 12 +- .../es/verbalizers/fraction.py | 14 +- .../es/verbalizers/ordinal.py | 2 +- .../text_normalization/fr/taggers/ordinal.py | 2 +- .../text_normalization/fr/utils.py | 
4 +- .../fr/verbalizers/cardinal.py | 12 +- .../fr/verbalizers/decimals.py | 12 +- .../fr/verbalizers/fraction.py | 14 +- .../text_normalization/hu/taggers/date.py | 6 +- .../text_normalization/hu/taggers/decimal.py | 4 +- .../text_normalization/hu/taggers/ordinal.py | 2 +- .../hu/taggers/telephone.py | 4 +- .../text_normalization/hu/taggers/time.py | 2 +- .../text_normalization/hu/utils.py | 6 +- .../hu/verbalizers/measure.py | 2 +- .../text_normalization/hy/utils.py | 4 +- .../hy/verbalizers/verbalize.py | 14 +- .../hy/verbalizers/verbalize_final.py | 16 +- .../text_normalization/it/taggers/decimals.py | 3 +- .../text_normalization/it/taggers/measure.py | 2 +- .../text_normalization/it/taggers/money.py | 2 +- .../text_normalization/it/taggers/time.py | 2 +- .../text_normalization/it/utils.py | 2 +- .../it/verbalizers/decimal.py | 12 +- .../it/verbalizers/measure.py | 2 +- .../it/verbalizers/money.py | 2 +- .../text_normalization/normalize.py | 3 + .../text_normalization/ru/taggers/cardinal.py | 2 +- .../text_normalization/ru/taggers/date.py | 2 +- .../text_normalization/ru/taggers/decimals.py | 2 +- .../text_normalization/ru/taggers/ordinal.py | 2 +- .../ru/taggers/telephone.py | 4 +- .../text_normalization/ru/taggers/time.py | 2 +- .../text_normalization/ru/utils.py | 2 +- .../ru/verbalizers/measure.py | 2 +- .../ru/verbalizers/verbalize_final.py | 2 +- .../text_normalization/rw/__init__.py | 18 ++ .../text_normalization/rw/data/__init__.py | 15 + .../rw/data/cardinal/__init__.py | 15 + .../rw/data/cardinal/digits.tsv | 9 + .../rw/data/cardinal/digits_for_thousands.tsv | 10 + .../cardinal/digits_millions_trillions.tsv | 10 + .../rw/data/cardinal/hundreds.tsv | 9 + .../rw/data/cardinal/hundreds_of_millions.tsv | 9 + .../data/cardinal/hundreds_of_thousands.tsv | 9 + .../data/cardinal/hundreds_of_trillions.tsv | 9 + .../rw/data/cardinal/millions.tsv | 9 + .../rw/data/cardinal/tens.tsv | 9 + .../rw/data/cardinal/tens_of_millions.tsv | 9 + 
.../rw/data/cardinal/tens_of_thousands.tsv | 9 + .../rw/data/cardinal/tens_of_trillions.tsv | 9 + .../rw/data/cardinal/thousands.tsv | 10 + .../rw/data/cardinal/trillions.tsv | 9 + .../rw/data/time/__init__.py | 15 + .../text_normalization/rw/data/time/hours.tsv | 12 + .../rw/data/time/minutes.tsv | 60 ++++ .../rw/data/whitelist/__init__.py | 15 + .../data/whitelist/kinya_transliterations.tsv | 175 +++++++++++ .../text_normalization/rw/graph_utils.py | 273 ++++++++++++++++++ .../text_normalization/rw/taggers/__init__.py | 15 + .../text_normalization/rw/taggers/cardinal.py | 243 ++++++++++++++++ .../text_normalization/rw/taggers/time.py | 43 +++ .../rw/taggers/tokenize_and_classify.py | 78 +++++ .../rw/taggers/whitelist.py | 32 ++ .../text_normalization/rw/utils.py | 27 ++ .../rw/verbalizers/__init__.py | 15 + .../text_normalization/rw/verbalizers/time.py | 42 +++ .../rw/verbalizers/verbalize.py | 29 ++ .../rw/verbalizers/verbalize_final.py | 53 ++++ .../text_normalization/sv/taggers/ordinal.py | 2 +- .../sv/taggers/telephone.py | 4 +- .../sv/verbalizers/decimals.py | 12 +- .../text_normalization/token_parser.py | 12 +- .../text_normalization/zh/taggers/date.py | 2 +- .../text_normalization/zh/taggers/decimal.py | 2 +- .../text_normalization/zh/taggers/fraction.py | 2 +- .../text_normalization/zh/taggers/measure.py | 2 +- .../zh/taggers/preprocessor.py | 14 +- .../zh/taggers/tokenize_and_classify.py | 4 +- .../text_normalization/zh/utils.py | 4 +- .../zh/verbalizers/measure.py | 2 +- .../zh/verbalizers/post_processing.py | 6 +- .../zh/verbalizers/postprocessor.py | 8 +- .../zh/verbalizers/verbalize.py | 2 +- .../zh/verbalizers/verbalize_final.py | 4 +- .../zh/verbalizers/whitelist.py | 2 +- .../text_normalization/zh/verbalizers/word.py | 2 +- tests/conftest.py | 4 +- tests/nemo_text_processing/rw/__init__.py | 15 + .../test_cases_cardinal.txt | 57 ++++ .../test_cases_time.txt | 14 + .../test_cases_whitelist.txt | 3 + .../test_cases_word.txt | 26 ++ 
.../nemo_text_processing/rw/test_cardinal.py | 37 +++ .../rw/test_sparrowhawk_normalization.sh | 60 ++++ tests/nemo_text_processing/rw/test_time.py | 34 +++ .../nemo_text_processing/rw/test_whitelist.py | 35 +++ tests/nemo_text_processing/rw/test_word.py | 35 +++ .../pynini_export.py | 30 +- 220 files changed, 1991 insertions(+), 349 deletions(-) create mode 100644 nemo_text_processing/text_normalization/rw/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv create mode 100644 
nemo_text_processing/text_normalization/rw/data/time/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/time/hours.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/time/minutes.tsv create mode 100644 nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv create mode 100644 nemo_text_processing/text_normalization/rw/graph_utils.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/cardinal.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/time.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/text_normalization/rw/taggers/whitelist.py create mode 100644 nemo_text_processing/text_normalization/rw/utils.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/__init__.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/time.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py create mode 100644 nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py create mode 100644 tests/nemo_text_processing/rw/__init__.py create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt create mode 100644 tests/nemo_text_processing/rw/test_cardinal.py create mode 100644 tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh create mode 100644 tests/nemo_text_processing/rw/test_time.py create 
mode 100644 tests/nemo_text_processing/rw/test_whitelist.py create mode 100644 tests/nemo_text_processing/rw/test_word.py diff --git a/nemo_text_processing/fst_alignment/alignment.py b/nemo_text_processing/fst_alignment/alignment.py index 3100cf49f..5e76f66eb 100644 --- a/nemo_text_processing/fst_alignment/alignment.py +++ b/nemo_text_processing/fst_alignment/alignment.py @@ -200,7 +200,7 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: st alignment: alignment generated by FST with shortestpath, is longer than original string since including eps transitions start: inclusive start position in input string end: exclusive end position in input string - mode: grammar type for either tn or itn + mode: grammar type for either tn or itn Returns: output_og_start_index: inclusive start position in output string diff --git a/nemo_text_processing/hybrid/mlm_scorer.py b/nemo_text_processing/hybrid/mlm_scorer.py index 2986f3562..b2c94598e 100644 --- a/nemo_text_processing/hybrid/mlm_scorer.py +++ b/nemo_text_processing/hybrid/mlm_scorer.py @@ -93,7 +93,7 @@ def score_sentence(self, sentence: str): def __mask_text__(self, idx: int, tokens: List[str]): """ - replaces string at index idx in list `tokens` with a masked token and returns the modified list. + replaces string at index idx in list `tokens` with a masked token and returns the modified list. """ masked = tokens.copy() masked[idx] = self.MASK_LABEL diff --git a/nemo_text_processing/hybrid/model_utils.py b/nemo_text_processing/hybrid/model_utils.py index 7b2f8e960..b81d59b2a 100644 --- a/nemo_text_processing/hybrid/model_utils.py +++ b/nemo_text_processing/hybrid/model_utils.py @@ -74,7 +74,7 @@ def get_masked_score(text, model, do_lower=True): def _get_ambiguous_positions(sentences: List[str]): """returns None or index list of ambigous semiotic tokens for list of sentences. - E.g. if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only + E.g. 
if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only the first semiotic span / is ambiguous.""" l_sets = [set([x]) for x in re.findall(r"<\s.+?\s>", sentences[0])] for sentence in sentences[1:]: diff --git a/nemo_text_processing/hybrid/utils.py b/nemo_text_processing/hybrid/utils.py index ced823510..d634f5a09 100644 --- a/nemo_text_processing/hybrid/utils.py +++ b/nemo_text_processing/hybrid/utils.py @@ -390,8 +390,8 @@ def clean_post_norm( def clean_libri_tts(target: str): """ - Replace abbreviations in LibriTTS dataset - """ + Replace abbreviations in LibriTTS dataset + """ # Normalized text in LibriTTS by Google which contains abbreviations from `libri_sometimes_converts_abbrs` sometimes wasn't converted. libri_sometimes_converts_abbrs = {"St.": "saint", "Rev.": "reverend"} @@ -641,7 +641,7 @@ def get_diff(a: str, b: str): def diff_pred_gt(pred: str, gt: str): """returns list of different substrings between prediction and gt - relies on that prediction uses '< ' ' >' + relies on that prediction uses '< ' ' >' Args: pred (str): prediction @@ -649,7 +649,7 @@ def diff_pred_gt(pred: str, gt: str): Returns: list of Tuple(pred start and end, gt start and end) subsections - + e.g. pred="< Edward third >., king Our own . 
loss had been < two thousand two hundred >" gt ="Edward III., king Our own loss had been twenty two hundred" --> [([0, 16], [0, 10]), ([32, 34], [26, 26]), ([48, 76], [40, 58])] diff --git a/nemo_text_processing/hybrid/wfst_lm_rescoring.py b/nemo_text_processing/hybrid/wfst_lm_rescoring.py index 86f375058..7f001e6a2 100644 --- a/nemo_text_processing/hybrid/wfst_lm_rescoring.py +++ b/nemo_text_processing/hybrid/wfst_lm_rescoring.py @@ -73,7 +73,7 @@ def threshold_weights(norm_texts_weights, delta: float = 0.2): delta: delta to add to minimum weight in options to compose upper limit for threshhold returns: - filter list of same format as input + filter list of same format as input """ # threshold value is factor applied to lowest/first weight of all normalization options for every input res = [] diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py index 40ee1acf0..47febc4ac 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py @@ -22,8 +22,8 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals e.g. سالب تسعة وتسعون -> cardinal { integer: "99" negative: "-" } } - Numbers below thirteen are not converted. - Args: + Numbers below thirteen are not converted. + Args: tn_cardinal: cardinal FST for TN """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py index beefe52ee..db14cc9a8 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py @@ -29,7 +29,7 @@ class FractionFst(GraphFst): """ Finite state transducer for classifying fraction e.g. 
واحد و نصف -> tokens { integer_part: "1" numerator: "1" denominator: "2" } - + Args: tn_cardinal: TN cardinal tagger diff --git a/nemo_text_processing/inverse_text_normalization/ar/utils.py b/nemo_text_processing/inverse_text_normalization/ar/utils.py index ca6210150..67594bf55 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ar/utils.py @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]): Args x: integer - Returns: spoken representation + Returns: spoken representation """ if isinstance(x, int): x = str(x) @@ -41,7 +41,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py index 326d49df8..7f557096b 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py @@ -21,7 +21,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py index 0670090b8..46fdca4e3 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py @@ -20,7 +20,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals. Numbers below ten are not converted. 
+ Finite state transducer for classifying cardinals. Numbers below ten are not converted. Allows both compound numeral strings or separated by whitespace. "und" (en: "and") can be inserted between "hundert" and following number or "tausend" and following single or double digit number. @@ -32,7 +32,7 @@ class CardinalFst(GraphFst): e.g. ein tausend -> cardinal { integer: "1000" } } e.g. eintausend -> cardinal { integer: "1000" } } e.g. ein tausend zwanzig -> cardinal { integer: "1020" } } - + Args: tn_cardinal_tagger: TN cardinal tagger """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py index 38ca80ca5..dc9f96bd1 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py @@ -22,7 +22,7 @@ class ElectronicFst(GraphFst): """ Finite state transducer for classifying electronic: email addresses, etc. e.g. c d f eins at a b c punkt e d u -> tokens { name: "cdf1.abc.edu" } - + Args: tn_electronic_tagger: TN eletronic tagger tn_electronic_verbalizer: TN eletronic verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py index 14e06a5be..960c9ffa9 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py @@ -29,7 +29,7 @@ class FractionFst(GraphFst): e.g. ein halb -> tokens { name: "1/2" } e.g. ein ein halb -> tokens { name: "1 1/2" } e.g. 
drei zwei ein hundertstel -> tokens { name: "3 2/100" } - + Args: itn_cardinal_tagger: ITN cardinal tagger tn_fraction_verbalizer: TN fraction verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py index 22474376f..dd7f79878 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py @@ -20,9 +20,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. null vier eins eins eins zwei drei vier eins zwei drei vier -> tokens { name: "(0411) 1234-1234" } - + Args: tn_cardinal_tagger: TN Cardinal Tagger """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/time.py b/nemo_text_processing/inverse_text_normalization/de/taggers/time.py index 571edd724..db2edb66b 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/time.py @@ -31,7 +31,7 @@ class TimeFst(GraphFst): e.g. drei vor zwölf -> time { minutes: "57" hours: "11" } e.g. drei nach zwölf -> time { minutes: "3" hours: "12" } e.g. drei uhr zehn minuten zehn sekunden -> time { hours: "3" hours: "10" sekunden: "10"} - + Args: tn_time_verbalizer: TN time verbalizer """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py index 3031ac2b4..ac67928ce 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py @@ -23,7 +23,7 @@ class TimeFst(GraphFst): Finite state transducer for verbalizing time, e.g. 
time { hours: "8" minutes: "30" zone: "e s t" } -> 08:30 Uhr est time { hours: "8" } -> 8 Uhr - time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr + time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py index ab2576934..beb9b1e7c 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "jetzt" } tokens { name: "ist" } tokens { time { hours: "12" minutes: "30" } } -> jetzt ist 12:30 Uhr """ diff --git a/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py index ab2969f98..e9dd16034 100644 --- a/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py @@ -67,7 +67,7 @@ def process(self, instance: Instance) -> Instance: Args: processes given instance with process function - + Returns: processed instance if instance belongs to expected class type or original instance """ if instance.token_type != self.class_type: diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py index 36f424208..fa5df3367 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py @@ -243,7 +243,7 @@ def __init__(self, input_case: str = 
INPUT_LOWER_CASED): self.fst = final_graph.optimize() def delete_word(self, word: str): - """ Capitalizes word for `cased` input""" + """Capitalizes word for `cased` input""" delete_graph = pynutil.delete(word).optimize() if self.input_case == INPUT_CASED: if len(word) > 0: diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py index 8d8a4f444..5be9240d7 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py @@ -137,7 +137,7 @@ def _get_thousands_graph(): class DateFst(GraphFst): """ - Finite state transducer for classifying date, + Finite state transducer for classifying date, e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true } e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true } e.g. twenty twenty -> date { year: "2012" preserve_order: true } diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py index 2c6ee7a62..1d730ec30 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py @@ -41,7 +41,7 @@ def get_quantity( e.g. one million -> integer_part: "1" quantity: "million" e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST input_case: accepting either "lower_cased" or "cased" input. 
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py index dba4c0201..06d749e39 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py @@ -61,7 +61,7 @@ def get_serial_number(cardinal): class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. one two three one two three five six seven eight -> { number_part: "123-123-5678" } This class also support card number and IP format. diff --git a/nemo_text_processing/inverse_text_normalization/en/utils.py b/nemo_text_processing/inverse_text_normalization/en/utils.py index 00b6a636f..cd54850ce 100644 --- a/nemo_text_processing/inverse_text_normalization/en/utils.py +++ b/nemo_text_processing/inverse_text_normalization/en/utils.py @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]): Args x: integer - Returns: spoken representation + Returns: spoken representation """ if isinstance(x, int): x = str(x) @@ -41,7 +41,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py index ca2bdcee2..780185325 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/fraction.py @@ -18,7 +18,7 @@ class FractionFst(GraphFst): """ - Finite state transducer for verbalizing fraction, + Finite state transducer for verbalizing fraction, """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py 
b/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py index e8d622e3c..141e41fe5 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py @@ -23,7 +23,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for verbalizing telephone, e.g. telephone { number_part: "123-123-5678" } - -> 123-123-5678 + -> 123-123-5678 """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py index 467329001..86c1b575b 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py @@ -23,7 +23,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 2f62d589d..3e164bcc9 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -31,10 +31,10 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"} + e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"} This class converts cardinals up to (but not including) "un cuatrillón", i.e up to "one septillion" in English (10^{24}). 
- Cardinals below ten are not converted (in order to avoid + Cardinals below ten are not converted (in order to avoid "vivo en una casa" --> "vivo en 1 casa" and any other odd conversions.) Although technically Spanish grammar requires that "y" only comes after @@ -199,7 +199,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): self.fst = final_graph.optimize() def delete_word(self, word: str): - """ Capitalizes word for `cased` input""" + """Capitalizes word for `cased` input""" delete_graph = pynutil.delete(word).optimize() if self.input_case == INPUT_CASED: if len(word) > 0: diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index af96ee002..66281d225 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -28,10 +28,10 @@ class DateFst(GraphFst): """ - Finite state transducer for classifying date, + Finite state transducer for classifying date, e.g. primero de enero -> date { day: "1" month: "enero" } e.g. uno de enero -> date { day: "1" month: "enero" } - + Args: cardinal: CardinalFst input_case: accepting either "lower_cased" or "cased" input. diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index 2b1949041..8bfa560d2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -38,7 +38,7 @@ def get_quantity( e.g. one million -> integer_part: "1" quantity: "million" e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_million: cardinal FST input_case: accepting either "lower_cased" or "cased" input. 
@@ -87,7 +87,7 @@ class DecimalFst(GraphFst): This decimal rule assumes that decimals can be pronounced as: (a cardinal) + ('coma' or 'punto') plus (any sequence of cardinals <1000, including 'zero') - Also writes large numbers in shortened form, e.g. + Also writes large numbers in shortened form, e.g. e.g. uno coma dos seis millón -> decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } e.g. dos millones -> decimal { negative: "false" integer_part: "2" quantity: "millones" } e.g. mil ochocientos veinticuatro millones -> decimal { negative: "false" integer_part: "1824" quantity: "millones" } diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 3bc6a8b6d..50a5e07f7 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -36,7 +36,7 @@ class ElectronicFst(GraphFst): and URLS (which get converted to a "protocol" field). e.g. c d f uno arroba a b c punto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } } e.g. doble ve doble ve doble ve a b c punto e d u -> tokens { electronic { protocol: "www.abc.edu" } } - + Args: input_case: accepting either "lower_cased" or "cased" input. """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index a2b55026e..ae5d13fa9 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -23,18 +23,18 @@ class FractionFst(GraphFst): """ Finite state transducer for classifying fractions - e.g. dos quintos -> fraction { numerator: "2" denominator: "5" } - This class converts fractions with a denominator up to (and including) + e.g. 
dos quintos -> fraction { numerator: "2" denominator: "5" } + This class converts fractions with a denominator up to (and including) "1/999". - + Fractions with 4 as their denominator, read as "cuarto(s)", are not converted because "room" is also "cuarto", which could cause issues like "quiero reservar un cuarto" -> quiero reservar 1/2". - + Fractions without a numerator are not converted either to prevent issues like: "estaba medio dormido" -> "estaba 1/2 dormido" - + Args: cardinal: CardinalFst ordinal: OrdinalFst diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index 9d231bc25..bdefdcf71 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -32,7 +32,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for classifying measure - e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" } + e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" } Args: cardinal: CardinalFst diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index d03640742..d97cc752a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -31,7 +31,7 @@ class OrdinalFst(GraphFst): vigésimo primero -> ordinal { integer: "21" morphosyntactic_features: "o" } This class converts ordinal up to "millesímo" (one thousandth) exclusive. - Cardinals below ten are not converted (in order to avoid + Cardinals below ten are not converted (in order to avoid e.g. "primero hice ..." -> "1.º hice...", "segunda guerra mundial" -> "2.ª guerra mundial" and any other odd conversions.) 
diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 1c0be2037..2086d643c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -27,7 +27,7 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. uno dos tres uno dos tres cinco seis siete ocho -> { number_part: "123-123-5678" }. If 10 digits are spoken, they are grouped as 3+3+4 (eg. 123-456-7890). If 9 digits are spoken, they are grouped as 3+3+3 (eg. 123-456-789). @@ -37,7 +37,7 @@ class TelephoneFst(GraphFst): "twelve thirty four" = "1234". (we ignore more complicated cases such as "three hundred and two" or "three nines"). - + Args: input_case: accepting either "lower_cased" or "cased" input. """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index 9d55f35a3..f33c7c1b1 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -45,21 +45,21 @@ class TimeFst(GraphFst): e.g. cuarto para las dos -> time { minutes: "45" hours: "la 1" } Note that times on the hour (e.g. "las dos" i.e. "two o'clock") do not get - converted into a time format. This is to avoid converting phrases that are + converted into a time format. This is to avoid converting phrases that are not part of a time phrase (e.g. "las dos personas" i.e. "the two people") e.g. las dos -> tokens { name: "las" } tokens { name: "dos" } - However, if a time on the hour is followed by a suffix (indicating 'a.m.' + However, if a time on the hour is followed by a suffix (indicating 'a.m.' or 'p.m.'), it will be converted. e.g. 
las dos pe eme -> time { hours: "las 2" minutes: "00" suffix: "p.m." } - - In the same way, times without a preceding article are not converted. This is + + In the same way, times without a preceding article are not converted. This is to avoid converting ranges or complex fractions e.g. dos y media -> tokens { name: "dos" } tokens { name: "y" } tokens { name: "media" } - However, if a time without an article is followed by a suffix (indicating 'a.m.' + However, if a time without an article is followed by a suffix (indicating 'a.m.' or 'p.m.'), it will be converted. e.g. dos y media p m -> time { hours: "2" minutes: "30" suffix: "p.m." } - Note that although the TimeFst verbalizer can accept 'zone' (timezone) fields, + Note that although the TimeFst verbalizer can accept 'zone' (timezone) fields, so far the rules have not been added to the TimeFst tagger to process timezones (to keep the rules simple, and because timezones are not very often specified in Spanish.) diff --git a/nemo_text_processing/inverse_text_normalization/es/utils.py b/nemo_text_processing/inverse_text_normalization/es/utils.py index f6e06f793..bedda6391 100644 --- a/nemo_text_processing/inverse_text_normalization/es/utils.py +++ b/nemo_text_processing/inverse_text_normalization/es/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py index 58aa190ba..8364c250b 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py @@ -22,7 +22,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for verbalizing telephone, e.g. 
telephone { number_part: "123-123-5678" } - -> 123-123-5678 + -> 123-123-5678 """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py index 6b22d6f73..5c45ff66f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/es_en/utils.py b/nemo_text_processing/inverse_text_normalization/es_en/utils.py index f6e06f793..bedda6391 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/utils.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py index 3323f173b..e46b6db56 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py index 333460eb0..d827a63e2 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py @@ -35,9 +35,9 @@ def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike': In cases where original orthography is current, or string is mixture of two orthographies, will render invalid form that will not pass through CardinalFst e.g. deux-mille cent-vingt-trois -> "deux##vingt-trois" ('#' is not accepted in cardinal FST and will fail to convert.) - e.g. deux + e.g. deux - Args: + Args: cardinal: cardinal FST """ @@ -90,13 +90,13 @@ def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike': class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. mois vingt-trois -> cardinal { negative: "-" integer: "23"} + e.g. mois vingt-trois -> cardinal { negative: "-" integer: "23"} This class converts cardinals up to (but not including) "un-quatrillion", i.e up to "one septillion" in English (10^{24}). - Cardinals below nine are not converted (in order to avoid + Cardinals below nine are not converted (in order to avoid "j'ai un pomme." --> "j'ai 1 pomme" and any other odd conversions.) This transducer accomodates both traditional hyphenation of numbers ('-' for most numbers <100) - and current hyphenation (all elements of number are hyphenated), prioritizing the latter. + and current hyphenation (all elements of number are hyphenated), prioritizing the latter. e.g cent cinquante et un -> cardinal { integer: "151"} cent-cinquante-et-un -> cardinal { integer: "151"} This is done through a context dependent rewrite that attempts to map old spelling to new. 
diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py index 7994b719d..9f6341cf4 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py @@ -31,9 +31,9 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_thousand: 'pynini.Fst e.g. one million -> integer_part: "1" quantity: "million" e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" - Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions + Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions - Args: + Args: decimal: decimal FST cardinal_up_to_million: cardinal FST """ @@ -79,7 +79,7 @@ class DecimalFst(GraphFst): This decimal rule assumes that decimals can be pronounced as: (a cardinal) + ('virgule') plus (any sequence of cardinals <1 million, including 'zero') - Also writes large numbers in shortened form, e.g. + Also writes large numbers in shortened form, e.g. e.g. un virgule deux-six-million -> decimal { negative: "false" integer_part: "1" fractional_part: "26" quantity: "million" } e.g. deux-million -> decimal { negative: "false" integer_part: "2" quantity: "millions" } e.g. moins cent-vingt-quatre-millions -> decimal { negative: "true" integer_part: "124" quantity: "millions" } diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py index ca089455a..94b87bfd5 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py @@ -30,7 +30,7 @@ class FractionFst(GraphFst): e.g. demi -> tokens { fraction { numerator: "1" denominator: "2" } } e.g. 
un et demi -> tokens { fraction { integer_part: "1" numerator: "1" denominator: "2" } } e.g. trois et deux centième -> tokens { fraction { integer_part: "3" numerator: "2" denominator: "100" } } - + Args: cardinal: OrdinalFst """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py index 03976e9e9..629fc0e26 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py @@ -24,7 +24,7 @@ class OrdinalFst(GraphFst): Finite state transducer for classifying ordinal vingt-deuxième -> ordinal { integer: "22" morphosyntactic_features: "e" } - Also notes specific nouns that have unique normalization conventions. + Also notes specific nouns that have unique normalization conventions. For instance, 'siècles' are rendered in roman numerals when given an ordinal adjective. e.g. dix-neuvième siècle -> XIXe diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py index b157960c0..c532cfd06 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py @@ -27,7 +27,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for classifying telephone numbers. Assumes conventional grouping for Metropolitan France (and overseas departments) - (two number sequences are grouped as individual cardinals) or digit by digit (chiffre-par-chiffre) e.g. + (two number sequences are grouped as individual cardinals) or digit by digit (chiffre-par-chiffre) e.g. 
"zero un quatre-vingt-deux zero deux vingt-deux cinquante" -> { number_part: "01 42 02 22 50" } "zero un quatre deux zero deux deux deux cinq zero" -> { number_part: "01 42 02 22 50" } diff --git a/nemo_text_processing/inverse_text_normalization/fr/utils.py b/nemo_text_processing/inverse_text_normalization/fr/utils.py index f6e06f793..bedda6391 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/utils.py +++ b/nemo_text_processing/inverse_text_normalization/fr/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py index c1a55401e..ce0bdf8c4 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py @@ -26,8 +26,8 @@ class NumberParser(GraphFst): """ - Finite state transducer for parsing strings of digis. Breaks up digit strings into groups of three for - strings of digits of four or more (inclusive). Groupings are separated by non-breaking space. + Finite state transducer for parsing strings of digis. Breaks up digit strings into groups of three for + strings of digits of four or more (inclusive). Groupings are separated by non-breaking space. e.g. '1000' -> '1 000' e.g. 
'1000,33333' -> '1 000,333 33 """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py index d937c04d7..5dd5e175c 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py @@ -22,7 +22,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for verbalizing telephone, e.g. telephone { number_part: "02 33 43 53 22" } - -> 02 33 43 53 22 + -> 02 33 43 53 22 """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py index 52af95d09..99f5b99e8 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py @@ -29,7 +29,7 @@ class TimeFst(GraphFst): Finite state transducer for verbalizing time, e.g. time { hours: "8" minutes: "30" suffix: "du matin"} -> 8 h 30 time { hours: "8" minutes: "30" } -> 8 h 30 - time { hours: "8" minutes: "30" suffix: "du soir"} -> 20 h 30 + time { hours: "8" minutes: "30" suffix: "du soir"} -> 20 h 30 """ def __init__(self): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py index c0bf305da..677386d28 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/hy/utils.py b/nemo_text_processing/inverse_text_normalization/hy/utils.py index f7179e35b..1f1349115 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/utils.py +++ b/nemo_text_processing/inverse_text_normalization/hy/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py index c265f7ef9..fa6bebd87 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py @@ -23,7 +23,7 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. 二十三 -> cardinal { integer: "23" } + e.g. 二十三 -> cardinal { integer: "23" } e.g. 
にじゅうさん -> cardinal { integer: "23" } """ diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py index 52d72be58..0e30449e8 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/date.py @@ -22,7 +22,7 @@ class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g., + Finite state transducer for classifying date, e.g., 一日 -> 1日 date { day: "1" } 五から九日 -> (5~9日) date { day: "5~9" } 一月 -> 1月 date { month: "1" } diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py index 7ec070457..6e070231c 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/decimal.py @@ -31,8 +31,8 @@ def get_quantity(decimal): class DecimalFst(GraphFst): """ Finite state transducer for classifying decimal - e.g. 一点五 -> decimnl { integer_part: "1" fractional_part: "5" } - e.g. 一点五万 -> decimal { integer_part: "1" fractional_part: "5" quantity: "万" } + e.g. 一点五 -> decimnl { integer_part: "1" fractional_part: "5" } + e.g. 
一点五万 -> decimal { integer_part: "1" fractional_part: "5" quantity: "万" } """ def __init__(self, cardinal: GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py index 458448fb4..bf3b60630 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): """ Fitite state transducer for classifying fractions - e.g., + e.g., 四分の三 -> fraction { denominator: "4" numerator: "3" } 一と四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } 一荷四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py index d478e5f4c..5ef844495 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction_old.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): """ Fitite state transducer for classifying fractions - e.g., + e.g., 四分の三 -> fraction { denominator: "4" numerator: "3" } 一と四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } 一荷四分の三 -> fraction { integer: "1" denominator: "4" numerator: "3" } diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py index ad20ab82f..1f48bc273 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/ordinal.py @@ -22,7 +22,7 @@ class OrdinalFst(GraphFst): """ Finite state transducer for classifying cardinals - 
e.g. 第二十三 -> cardinal { morphsyntactic_feature: "第" integer: "23" } + e.g. 第二十三 -> cardinal { morphsyntactic_feature: "第" integer: "23" } e.g. 百番目 -> cardinal { integer: "100" morphsyntactic_feature:"番目" } """ diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py index 742be01bb..8fca40fdd 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py @@ -22,13 +22,13 @@ class PreProcessorFst(GraphFst): ''' - Preprocessing of TN: - 1. interjections removal such as '啊, 呃' - 2. fullwidth -> halfwidth char conversion - 好啊 -> 好 - 呃对 -> 对 - : -> : - ; -> ; + Preprocessing of TN: + 1. interjections removal such as '啊, 呃' + 2. fullwidth -> halfwidth char conversion + 好啊 -> 好 + 呃对 -> 对 + : -> : + ; -> ; ''' def __init__( diff --git a/nemo_text_processing/inverse_text_normalization/ja/utils.py b/nemo_text_processing/inverse_text_normalization/ja/utils.py index bb0c588c2..28f7b70d8 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ja/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py index cea461463..b765b338f 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/date.py @@ -21,7 +21,7 @@ class DateFst(GraphFst): """ - Finite state transducer for verbalizing date, e.g., + Finite state transducer for verbalizing date, e.g., date { day: "1" } -> 1日 date { day: "5~9" } -> 5~9日 date { month: "1" } 
-> 1月 diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py index 7c37886f8..028864ee9 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self): """ Fitite state transducer for classifying fractions - e.g., + e.g., fraction { denominator: "4" numerator: "3" } -> 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py index cae890be5..2269f9999 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/fraction_old.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): def __init__(self): """ Fitite state transducer for classifying fractions - e.g., + e.g., fraction { denominator: "4" numerator: "3" } -> 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 fraction { integer: "1" denominator: "4" numerator: "3" } -> 1 3/4 diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py index 4bafef0bd..8b196dcaf 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. 
- {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py index e78dba58c..7bbc16516 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py @@ -29,10 +29,10 @@ class PostProcessor(GraphFst): ''' - Postprocessing of TN, now contains: - 1. punctuation removal - 2. letter case conversion - 3. oov tagger + Postprocessing of TN, now contains: + 1. punctuation removal + 2. letter case conversion + 3. 
oov tagger ''' def __init__( diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py index 386f1d4a1..798cd001d 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py @@ -22,11 +22,11 @@ class TimeFst(GraphFst): """ Finite state transducer for verbalizing time, e.g., - time { hours: "1" minutes: "0" } -> 1時30分 -> + time { hours: "1" minutes: "0" } -> 1時30分 -> time { hours: "5" minutes: "20" suffix: "過ぎ" } -> 5時20分 time { hours: "8" minutes: "半" suffix: "頃" } -> 8時半頃 - time { hours: "10" minutes: "25" suffix: "前" } -> 10時5分前 - time { hours: "正午" minutes: "1" suffix: "前" } -> 正午1分前 + time { hours: "10" minutes: "25" suffix: "前" } -> 10時5分前 + time { hours: "正午" minutes: "1" suffix: "前" } -> 正午1分前 time { hours: "正午" minutes: "10" suffix: "過ぎ" } -> 正午10分過ぎ """ diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py index 8f68abe65..980e41816 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py @@ -26,9 +26,7 @@ class VerbalizeFinalFst(GraphFst): - """ - - """ + """ """ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py index debe75196..1c21ce8d3 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/whitelist.py @@ 
-21,7 +21,7 @@ class WhiteListFst(GraphFst): ''' - tokens { whitelist: "ATM" } -> A T M + tokens { whitelist: "ATM" } -> A T M ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py index d7c2cc874..621ae003e 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py @@ -21,7 +21,7 @@ class WordFst(GraphFst): ''' - tokens { char: "一" } -> 一 + tokens { char: "一" } -> 一 ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py index 9434f77fe..8882b860c 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py @@ -51,12 +51,12 @@ def get_quantity(decimal, cardinal_fst): class DecimalFst(GraphFst): """ - Finite state transducer for classifying cardinals - e.g. तेहतीस पूर्णांक तीन -> decimal { integer_part: "३३" fractional_part: "३" } - e.g. उणे तेहतीस पूर्णांक तीन लाख -> decimal { negative: "true" integer_part: "३३" fractional_part: "३" quantity: "लाख" } + Finite state transducer for classifying cardinals + e.g. तेहतीस पूर्णांक तीन -> decimal { integer_part: "३३" fractional_part: "३" } + e.g. 
उणे तेहतीस पूर्णांक तीन लाख -> decimal { negative: "true" integer_part: "३३" fractional_part: "३" quantity: "लाख" } - Args: - cardinal: CardinalFst + Args: + cardinal: CardinalFst """ def __init__(self, cardinal: GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py index c4b311e4b..b6e1080da 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py @@ -23,14 +23,14 @@ class TimeFst(GraphFst): """ - Finite state transducer for classifying time - e.g. साडे चार -> time { hours: "४" minutes: "३०" } - e.g. सव्वा बारा -> time { hours: "१२" minutes: "१५" } - e.g. पावणे दहा -> time { hours: "९" minutes: "४५" } - e.g. अकराला पाच मिनिटे -> time { hours: "१०" minutes: "५५" } - e.g. अकरा वाजून दोन मिनिटे -> time { hours: "११" minutes: "२" } - e.g. अडीच -> time { hours: "२" minutes: "३०" } - """ + Finite state transducer for classifying time + e.g. साडे चार -> time { hours: "४" minutes: "३०" } + e.g. सव्वा बारा -> time { hours: "१२" minutes: "१५" } + e.g. पावणे दहा -> time { hours: "९" minutes: "४५" } + e.g. अकराला पाच मिनिटे -> time { hours: "१०" minutes: "५५" } + e.g. अकरा वाजून दोन मिनिटे -> time { hours: "११" minutes: "२" } + e.g. 
अडीच -> time { hours: "२" minutes: "३०" } + """ def __init__(self): super().__init__(name="time", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/mr/utils.py b/nemo_text_processing/inverse_text_normalization/mr/utils.py index f7179e35b..1f1349115 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/utils.py +++ b/nemo_text_processing/inverse_text_normalization/mr/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py index 7cc99b311..15fcf6e45 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py @@ -21,11 +21,11 @@ class TimeFst(GraphFst): """ - Finite state transducer for verbalizing time, e.g. - e.g. time { hours: "४" minutes: "३०" } -> ०४:३० - e.g. time { hours: "११" minutes: "३०" } -> ११:३० - e.g. time { hours: "८" minutes: "१५" } -> ०८:१५ - """ + Finite state transducer for verbalizing time, e.g. + e.g. time { hours: "४" minutes: "३०" } -> ०४:३० + e.g. time { hours: "११" minutes: "३०" } -> ११:३० + e.g. 
time { hours: "८" minutes: "१५" } -> ०८:१५ + """ def __init__(self): super().__init__(name="time", kind="verbalize") diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py index 20bab26f2..cfb6add51 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py @@ -21,7 +21,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "тысяча один" -> cardinal { integer: "1 001" } Args: diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py index bfa68b8a6..53a325c15 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py index d08b39589..d352284be 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py @@ -20,7 +20,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals. Numbers below ten are not converted. + Finite state transducer for classifying cardinals. Numbers below ten are not converted. 
Allows both compound numeral strings or separated by whitespace. e.g. minus tjugoen -> cardinal { negative: "-" integer: "21" } } diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py index c1c2bc2a3..484efff78 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py @@ -22,7 +22,7 @@ class ElectronicFst(GraphFst): """ Finite state transducer for classifying electronic: email addresses, etc. e.g. c d f ett at a b c punkt e d u -> tokens { name: "cdf1.abc.edu" } - + Args: tn_electronic_tagger: TN eletronic tagger tn_electronic_verbalizer: TN eletronic verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py index 2ba361280..df56d8d7f 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py @@ -25,7 +25,7 @@ class FractionFst(GraphFst): e.g. halv -> tokens { name: "1/2" } e.g. ett och en halv -> tokens { name: "1 1/2" } e.g. tre och fyra femtedelar -> tokens { name: "3 4/5" } - + Args: itn_cardinal_tagger: ITN cardinal tagger tn_fraction_verbalizer: TN fraction verbalizer diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py index 7c319e0f3..74369e70f 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py @@ -20,9 +20,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone numbers, e.g. + Finite state transducer for classifying telephone numbers, e.g. 
noll åtta sjuhundraåttionio femtiotvå tjugofem -> tokens { name: "08-789 52 25" } - + Args: tn_cardinal_tagger: TN Cardinal Tagger """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py index cf8fdc202..311c14c36 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py @@ -62,7 +62,7 @@ class TimeFst(GraphFst): e.g. klockan tretton tio -> time { hours: "kl. 13" minutes: "10" } e.g. kvart i tolv -> time { minutes: "45" hours: "11" } e.g. kvart över tolv -> time { minutes: "15" hours: "12" } - + Args: tn_cardinal_tagger: TN cardinal verbalizer """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/utils.py b/nemo_text_processing/inverse_text_normalization/sv/utils.py index 0a7f1ff2d..e645db2dd 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/utils.py +++ b/nemo_text_processing/inverse_text_normalization/sv/utils.py @@ -21,7 +21,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py index 272f047e1..643017c47 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "klockan" } tokens { name: "är" } tokens { time { hours: "12" minutes: "30" } } -> klockan är 12:30 """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index c99ae25d2..f3b30238c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -25,7 +25,7 @@ def __init__(self): Fitite state transducer for classifying cardinals (e.g., 负五十 -> cardinal { negative: "-" integer: "50" }) This class converts cardinals up to hundred millions (i.e., (10**10)) Single unit digits are not converted (e.g., 五 -> 五) - Numbers less than 20 are not converted. + Numbers less than 20 are not converted. 二十 (2 characters/logograms) is kept as it is but 二十一 (3 characters/logograms) would become 21 """ super().__init__(name="cardinal", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py index c4911e832..49fd428c1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): Finite state transducer for classifying fraction e.g. 二分之一 -> tokens { fraction { denominator: "2" numerator: "1"} } e.g. 
五又二分之一 -> tokens { fraction { integer_part: "1" denominator: "2" numerator: "1" } } - + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 96266df25..4c69b697c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -39,7 +39,7 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. Args: diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py index 92336fe0f..8db669ff6 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/utils.py @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]): Args x: integer - Returns: spoken representation + Returns: spoken representation """ if isinstance(x, int): x = str(x) @@ -41,7 +41,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py index b379c4d94..5368e2c42 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py +++ 
b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -26,7 +26,7 @@ class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py index 849cc690d..5538d8ed6 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py @@ -22,7 +22,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ diff --git a/nemo_text_processing/text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/text_normalization/ar/taggers/cardinal.py index 9a8ba7cd4..a6ab7aca3 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/cardinal.py @@ -21,7 +21,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. 
"9837" -> cardinal { integer: "تسعة اَلاف وثمان مئة وسبعة وثلاثون" } Args: diff --git a/nemo_text_processing/text_normalization/ar/taggers/decimal.py b/nemo_text_processing/text_normalization/ar/taggers/decimal.py index f276155e9..72d2dc47b 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/decimal.py @@ -21,8 +21,8 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - 321.7 --> ثلاث مئة وواحد وعشرون وسبعة من عشرة + Finite state transducer for classifying decimal, e.g. + 321.7 --> ثلاث مئة وواحد وعشرون وسبعة من عشرة -321.7 -> decimal { negative: "true" integer_part: "321" fractional_part: ".7" } cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/ar/taggers/fraction.py b/nemo_text_processing/text_normalization/ar/taggers/fraction.py index aad046011..1ef390506 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ar/taggers/fraction.py @@ -26,7 +26,7 @@ class FractionFst(GraphFst): tokens { fraction { integer_part: "واحد" numerator: "واحد" denominator: "نص" } } Args: - cardinal: cardinal fst + cardinal: cardinal fst """ def __init__(self, cardinal): diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py index 0df176491..5098989c6 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/text_normalization/ar/taggers/money.py @@ -36,7 +36,7 @@ class MoneyFst(GraphFst): "$1,99" -> money { integer_part: "سبعة" currency_maj: "دولار" fractional_part: "تسعة وتسعون" currency_min: "سنت" preserve_order: true} "$0,10" -> money { fractional_part: "عشرة" currency_min: "بنسات" preserve_order: true } "$9" -> money { integer_part: "تسعة" currency_maj: "دولار" preserve_order: true} - + Args: cardinal: CardinalFst deterministic: if True will provide a 
single transduction option, diff --git a/nemo_text_processing/text_normalization/ar/utils.py b/nemo_text_processing/text_normalization/ar/utils.py index fac39551c..1ad8f9927 100644 --- a/nemo_text_processing/text_normalization/ar/utils.py +++ b/nemo_text_processing/text_normalization/ar/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/measure.py b/nemo_text_processing/text_normalization/ar/verbalizers/measure.py index aaca02de0..b762eaa3b 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/measure.py @@ -27,7 +27,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for verbalizing measure, e.g. 
measure { cardinal { integer: "20" } units: "%" } -> "عشرون في المائة" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py index 8388f8e84..4145c2330 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py @@ -31,7 +31,7 @@ class VerbalizeFinalFst(GraphFst): """ Finite state transducer that verbalizes an entire sentence - + Args: deterministic: if True will provide a single transduction option, for False multiple options (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 01a85ec10..040a9e74c 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -50,7 +50,7 @@ def _load_kaggle_text_norm_file(file_path: str, to_lower: bool) -> List[Instance """ https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish Loads text file in the Kaggle Google text normalization file format: \t\t<`self` if trivial class or normalized text> - E.g. + E.g. PLAIN Brillantaisia PLAIN is PLAIN a @@ -66,7 +66,7 @@ def _load_kaggle_text_norm_file(file_path: str, to_lower: bool) -> List[Instance Args: file_path: file path to text file - Returns: flat list of instances + Returns: flat list of instances """ res = [] with open(file_path, 'r') as fp: @@ -91,7 +91,7 @@ def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file, to_ """ Load given list of text files using the `load_func` function. 
- Args: + Args: file_paths: list of file paths load_func: loading function @@ -119,7 +119,7 @@ def clean_generic(text: str) -> str: def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = None, verbose: bool = True) -> float: """ - Evaluates accuracy given predictions and labels. + Evaluates accuracy given predictions and labels. Args: preds: predictions @@ -250,7 +250,7 @@ def load_file(file_path: str) -> List[str]: """ Loads given text file with separate lines into list of string. - Args: + Args: file_path: file path Returns: flat list of string @@ -269,7 +269,7 @@ def write_file(file_path: str, data: List[str]): Args: file_path: file path data: list of string - + """ with open(file_path, 'w') as fp: for line in data: diff --git a/nemo_text_processing/text_normalization/de/taggers/cardinal.py b/nemo_text_processing/text_normalization/de/taggers/cardinal.py index bb14d2c95..a8ef5af17 100644 --- a/nemo_text_processing/text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/cardinal.py @@ -66,7 +66,7 @@ def get_ties_digit(digit_path: str, tie_path: str) -> 'pynini.FstLike': class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "101" -> cardinal { integer: "ein hundert und zehn" } Args: diff --git a/nemo_text_processing/text_normalization/de/taggers/date.py b/nemo_text_processing/text_normalization/de/taggers/date.py index 673bd8868..21b32eb2b 100644 --- a/nemo_text_processing/text_normalization/de/taggers/date.py +++ b/nemo_text_processing/text_normalization/de/taggers/date.py @@ -64,7 +64,7 @@ def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike': class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. 
"01.04.2010" -> date { day: "erster" month: "april" year: "zwei tausend zehn" preserve_order: true } "1994" -> date { year: "neunzehn vier und neuzig" } "1900" -> date { year: "neunzehn hundert" } diff --git a/nemo_text_processing/text_normalization/de/taggers/decimal.py b/nemo_text_processing/text_normalization/de/taggers/decimal.py index 6381d942b..8d1540110 100644 --- a/nemo_text_processing/text_normalization/de/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/de/taggers/decimal.py @@ -27,7 +27,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL e.g. 1 million -> integer_part: "eine" quantity: "million" e.g. 1.4 million -> integer_part: "eins" fractional_part: "vier" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -48,7 +48,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. -11,4006 billion -> decimal { negative: "true" integer_part: "elf" fractional_part: "vier null null sechs" quantity: "billion" preserve_order: true } 1 billion -> decimal { integer_part: "eins" quantity: "billion" preserve_order: true } Args: diff --git a/nemo_text_processing/text_normalization/de/taggers/ordinal.py b/nemo_text_processing/text_normalization/de/taggers/ordinal.py index f446099df..a99e4e4a8 100644 --- a/nemo_text_processing/text_normalization/de/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "2." 
-> ordinal { integer: "zwei" } } "2tes" -> ordinal { integer: "zwei" } } diff --git a/nemo_text_processing/text_normalization/de/taggers/telephone.py b/nemo_text_processing/text_normalization/de/taggers/telephone.py index f40173b0f..90af2f07e 100644 --- a/nemo_text_processing/text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/de/taggers/telephone.py @@ -21,9 +21,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone, which includes country code, number part and extension + Finite state transducer for classifying telephone, which includes country code, number part and extension - E.g + E.g "+49 1234-1233" -> telephone { country_code: "plus neun und vierzig" number_part: "eins zwei drei vier eins zwei drei drei" preserve_order: true } "(012) 1234-1233" -> telephone { country_code: "null eins zwei" number_part: "eins zwei drei vier eins zwei drei drei" preserve_order: true } (0**) diff --git a/nemo_text_processing/text_normalization/de/utils.py b/nemo_text_processing/text_normalization/de/utils.py index d2dc9ce80..0b364938b 100644 --- a/nemo_text_processing/text_normalization/de/utils.py +++ b/nemo_text_processing/text_normalization/de/utils.py @@ -24,7 +24,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ abs_path = os.path.dirname(os.path.abspath(__file__)) + os.sep + rel_path diff --git a/nemo_text_processing/text_normalization/de/verbalizers/decimal.py b/nemo_text_processing/text_normalization/de/verbalizers/decimal.py index 915d5ab67..b544a2d6c 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/decimal.py @@ -26,8 +26,8 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. 
- decimal { negative: "true" integer_part: "elf" fractional_part: "vier null sechs" quantity: "billionen" } -> minus elf komma vier null sechs billionen + Finite state transducer for classifying decimal, e.g. + decimal { negative: "true" integer_part: "elf" fractional_part: "vier null sechs" quantity: "billionen" } -> minus elf komma vier null sechs billionen decimal { integer_part: "eins" quantity: "billion" } -> eins billion """ diff --git a/nemo_text_processing/text_normalization/de/verbalizers/measure.py b/nemo_text_processing/text_normalization/de/verbalizers/measure.py index 41f7fb89c..675659044 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/measure.py @@ -28,7 +28,7 @@ class MeasureFst(GraphFst): Finite state transducer for verbalizing measure, e.g. measure { cardinal { integer: "zwei" units: "unzen" } } -> "zwei unzen" measure { cardinal { integer_part: "zwei" quantity: "millionen" units: "unzen" } } -> "zwei millionen unzen" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/de/verbalizers/telephone.py b/nemo_text_processing/text_normalization/de/verbalizers/telephone.py index 7a50e785f..5bae8fe2d 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/telephone.py @@ -21,7 +21,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for verbalizing telephone, e.g. 
- telephone { country_code: "plus neun und vierzig" number_part: "null eins eins eins null null null" } + telephone { country_code: "plus neun und vierzig" number_part: "null eins eins eins null null null" } -> "plus neun und vierzig null eins eins eins null null null" Args: diff --git a/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py index f4e19ea0f..6cda902f8 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py @@ -31,7 +31,7 @@ class VerbalizeFinalFst(GraphFst): """ Finite state transducer that verbalizes an entire sentence - + Args: deterministic: if True will provide a single transduction option, for False multiple options (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/en/clean_eval_data.py b/nemo_text_processing/text_normalization/en/clean_eval_data.py index a7dc24310..9d0aaed6b 100644 --- a/nemo_text_processing/text_normalization/en/clean_eval_data.py +++ b/nemo_text_processing/text_normalization/en/clean_eval_data.py @@ -67,7 +67,7 @@ def process(self, instance: Instance) -> Instance: Args: processes given instance with process function - + Returns: processed instance if instance belongs to expected class type or original instance """ if instance.token_type != self.class_type: diff --git a/nemo_text_processing/text_normalization/en/taggers/cardinal.py b/nemo_text_processing/text_normalization/en/taggers/cardinal.py index 616e018e3..6ec0ac9dd 100644 --- a/nemo_text_processing/text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/cardinal.py @@ -30,7 +30,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. 
-23 -> cardinal { negative: "true" integer: "twenty three" } } Args: diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index c5e3dd418..869716ef9 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -43,7 +43,7 @@ def get_ties_graph(deterministic: bool = True): """ - Returns two digit transducer, e.g. + Returns two digit transducer, e.g. 03 -> o three 12 -> thirteen 20 -> twenty @@ -119,9 +119,9 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True): Transducer for year, only from 1000 - 2999 e.g. 1290 -> twelve nineteen 2000 - 2009 will be verbalized as two thousand. - + Transducer for 3 digit year, e.g. 123-> one twenty three - + Transducer for year with suffix 123 A.D., 4200 B.C """ @@ -159,7 +159,7 @@ def _get_financial_period_graph(): class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. jan. 5, 2012 -> date { month: "january" day: "five" year: "twenty twelve" preserve_order: true } jan. 5 -> date { month: "january" day: "five" preserve_order: true } 5 january 2012 -> date { day: "five" month: "january" year: "twenty twelve" preserve_order: true } diff --git a/nemo_text_processing/text_normalization/en/taggers/decimal.py b/nemo_text_processing/text_normalization/en/taggers/decimal.py index df9a3bddb..f68d7da4b 100644 --- a/nemo_text_processing/text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/en/taggers/decimal.py @@ -31,7 +31,7 @@ def get_quantity( e.g. 1 million -> integer_part: "one" quantity: "million" e.g. 
1.5 million -> integer_part: "one" fractional_part: "five" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -63,7 +63,7 @@ def get_quantity( class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. -12.5006 billion -> decimal { negative: "true" integer_part: "12" fractional_part: "five o o six" quantity: "billion" } 1 billion -> decimal { integer_part: "one" quantity: "billion" } diff --git a/nemo_text_processing/text_normalization/en/taggers/ordinal.py b/nemo_text_processing/text_normalization/en/taggers/ordinal.py index 70ae2d70d..8687b493c 100644 --- a/nemo_text_processing/text_normalization/en/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal, e.g. 13th -> ordinal { integer: "thirteen" } - + Args: cardinal: CardinalFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/taggers/range.py b/nemo_text_processing/text_normalization/en/taggers/range.py index 5e0d017d4..9d57a9fb9 100644 --- a/nemo_text_processing/text_normalization/en/taggers/range.py +++ b/nemo_text_processing/text_normalization/en/taggers/range.py @@ -22,7 +22,7 @@ class RangeFst(GraphFst): """ This class is a composite class of two other class instances - + Args: time: composed tagger and verbalizer date: composed tagger and verbalizer diff --git a/nemo_text_processing/text_normalization/en/taggers/serial.py b/nemo_text_processing/text_normalization/en/taggers/serial.py index e1a76dd63..913c09285 100644 --- a/nemo_text_processing/text_normalization/en/taggers/serial.py +++ b/nemo_text_processing/text_normalization/en/taggers/serial.py @@ -31,7 +31,7 @@ class SerialFst(GraphFst): """ This class is a composite class of two other class instances - + Args: time: 
composed tagger and verbalizer date: composed tagger and verbalizer diff --git a/nemo_text_processing/text_normalization/en/taggers/telephone.py b/nemo_text_processing/text_normalization/en/taggers/telephone.py index 06d791264..aa9865928 100644 --- a/nemo_text_processing/text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/en/taggers/telephone.py @@ -30,11 +30,11 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone, and IP, and SSN which includes country code, number part and extension - country code optional: +*** + Finite state transducer for classifying telephone, and IP, and SSN which includes country code, number part and extension + country code optional: +*** number part: ***-***-****, or (***) ***-**** extension optional: 1-9999 - E.g + E.g +1 123-123-5678-1 -> telephone { country_code: "one" number_part: "one two three, one two three, five six seven eight" extension: "one" } 1-800-GO-U-HAUL -> telephone { country_code: "one" number_part: "one, eight hundred GO U HAUL" } Args: diff --git a/nemo_text_processing/text_normalization/en/taggers/time.py b/nemo_text_processing/text_normalization/en/taggers/time.py index a66f18314..b9e4e824f 100644 --- a/nemo_text_processing/text_normalization/en/taggers/time.py +++ b/nemo_text_processing/text_normalization/en/taggers/time.py @@ -41,7 +41,7 @@ class TimeFst(GraphFst): 02:00 -> time { hours: "two" } 2:00 -> time { hours: "two" } 10:00:05 a.m. 
-> time { hours: "ten" minutes: "zero" seconds: "five" suffix: "a m" } - + Args: cardinal: CardinalFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py index 95c22bcbe..5fc8bdbaf 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py @@ -65,7 +65,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py index 110747cab..239984a80 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py @@ -65,7 +65,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. 
deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/en/utils.py b/nemo_text_processing/text_normalization/en/utils.py index 31d9ec635..a2a765a06 100644 --- a/nemo_text_processing/text_normalization/en/utils.py +++ b/nemo_text_processing/text_normalization/en/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -44,7 +44,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/text_normalization/en/verbalizers/measure.py b/nemo_text_processing/text_normalization/en/verbalizers/measure.py index ae5fa8800..c998a809f 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/measure.py @@ -30,7 +30,7 @@ class MeasureFst(GraphFst): measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" } -> minus twelve kilograms measure { decimal { integer_part: "twelve" fractional_part: "five" } units: "kilograms" } -> twelve point five kilograms tokens { measure { units: "covid" decimal { integer_part: "nineteen" fractional_part: "five" } } } -> covid nineteen point five - + Args: decimal: DecimalFst cardinal: CardinalFst diff --git a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py index b64abf6a2..33a472656 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py @@ -99,10 +99,10 @@ 
def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ punct_marks_all = PunctuationFst().punct_marks diff --git a/nemo_text_processing/text_normalization/es/taggers/ordinal.py b/nemo_text_processing/text_normalization/es/taggers/ordinal.py index 8af8773e5..d1822103a 100644 --- a/nemo_text_processing/text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/ordinal.py @@ -49,7 +49,7 @@ def get_one_to_one_thousand(cardinal: 'pynini.FstLike') -> 'pynini.FstLike': class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal - "21.º" -> ordinal { integer: "vigésimo primero" morphosyntactic_features: "gender_masc" } + "21.º" -> ordinal { integer: "vigésimo primero" morphosyntactic_features: "gender_masc" } This class converts ordinal up to the millionth (millonésimo) order (exclusive). This FST also records the ending of the ordinal (called "morphosyntactic_features"): diff --git a/nemo_text_processing/text_normalization/es/taggers/telephone.py b/nemo_text_processing/text_normalization/es/taggers/telephone.py index 83efc587c..1cc332f07 100644 --- a/nemo_text_processing/text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/es/taggers/telephone.py @@ -43,8 +43,8 @@ class TelephoneFst(GraphFst): (we ignore more complicated cases such as "doscientos y dos" or "tres nueves"). 
Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py index 972100be8..968075e11 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py @@ -24,13 +24,13 @@ class CardinalFst(GraphFst): """ - Finite state transducer for verbalizing cardinals - e.g. cardinal { integer: "dos" } -> "dos" + Finite state transducer for verbalizing cardinals + e.g. cardinal { integer: "dos" } -> "dos" - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/decimals.py b/nemo_text_processing/text_normalization/es/verbalizers/decimals.py index 3a94899fc..4feedd37d 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/decimals.py @@ -32,14 +32,14 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. 
- decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones - decimal { integer_part: "un" quantity: "billón" } -> un billón + Finite state transducer for classifying decimal, e.g. + decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones + decimal { integer_part: "un" quantity: "billón" } -> un billón Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py index 094098f2e..3758c1bd5 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py @@ -33,15 +33,15 @@ class FractionFst(GraphFst): """ - Finite state transducer for verbalizing fraction - e.g. tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> - treinta y tres y cuatro quintos + Finite state transducer for verbalizing fraction + e.g. 
tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> + treinta y tres y cuatro quintos - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py index 4def8307a..039b00de5 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): Finite state transducer for verbalizing ordinals e.g. ordinal { integer: "tercer" } } -> "tercero" -> "tercera" - -> "tercer" + -> "tercer" Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/fr/taggers/ordinal.py b/nemo_text_processing/text_normalization/fr/taggers/ordinal.py index d3afb13da..73b42053c 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/fr/taggers/ordinal.py @@ -23,7 +23,7 @@ class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal - "2e" -> ordinal { integer: "deux" morphosyntactic_features: "ième" } + "2e" -> ordinal { integer: "deux" morphosyntactic_features: "ième" } This grammar covers from single digits to hundreds of billions ("milliardième" in French). This FST also records the ending of the ordinal (called "morphosyntactic_features"). 
Args: diff --git a/nemo_text_processing/text_normalization/fr/utils.py b/nemo_text_processing/text_normalization/fr/utils.py index 4f6882b51..7523e5762 100644 --- a/nemo_text_processing/text_normalization/fr/utils.py +++ b/nemo_text_processing/text_normalization/fr/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -34,7 +34,7 @@ def load_labels(abs_path): Args: abs_path: absolute path - + Returns dictionary of mappings """ label_tsv = open(abs_path) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py index a12dbf520..347922a1d 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py @@ -19,12 +19,12 @@ class CardinalFst(GraphFst): """ - Finite state transducer for verbalizing cardinals - e.g. cardinal { negative: "true" integer: "un milliard et un" } -> "moins un milliard et un" - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Finite state transducer for verbalizing cardinals + e.g. 
cardinal { negative: "true" integer: "un milliard et un" } -> "moins un milliard et un" + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py b/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py index af892e6ca..a720b405b 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py @@ -25,13 +25,13 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "onze" fractional_part: "quatre cent six" quantity: "millions" preserve_order: true } -> moins onze virgule quatre cent six millions - decimal { integer_part: "cent quatorze" quantity: "billions" preserve_order: true } -> cent quatorze billions + Finite state transducer for classifying decimal, e.g. 
+ decimal { negative: "true" integer_part: "onze" fractional_part: "quatre cent six" quantity: "millions" preserve_order: true } -> moins onze virgule quatre cent six millions + decimal { integer_part: "cent quatorze" quantity: "billions" preserve_order: true } -> cent quatorze billions Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py b/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py index 7d2ecb395..9388cf343 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py @@ -26,13 +26,13 @@ class FractionFst(GraphFst): """ - Finite state transducer for verbalizing fraction - e.g. tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> - treinta y tres y cuatro quintos - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + Finite state transducer for verbalizing fraction + e.g. 
tokens { fraction { integer: "treinta y tres" numerator: "cuatro" denominator: "quinto" } } -> + treinta y tres y cuatro quintos + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, ordinal: GraphFst, deterministic: bool = True): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/hu/taggers/date.py b/nemo_text_processing/text_normalization/hu/taggers/date.py index 0cfddd652..da410dc31 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/date.py +++ b/nemo_text_processing/text_normalization/hu/taggers/date.py @@ -41,7 +41,7 @@ def day_inflector(number, day): Args: number: the day number day: the day name - + Returns: a list of expanded forms, two per ending. """ @@ -71,7 +71,7 @@ def day_adj_endings(number, word, basic=True): 1-jei -> elsejei 2-i -> másodiki 2-ai -> másodikai - 4-i -> negyediki + 4-i -> negyediki 4-ei -> negyedikei This is based on other -i adjectives, because these forms are rare. """ @@ -103,7 +103,7 @@ def day_adj_endings(number, word, basic=True): class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. "2010. április 1." -> date { year: "kettőezer-tíz" month: "április" day: "elseje" preserve_order: true } "2010. ápr. 1." -> date { year: "kettőezer-tíz" month: "április" day: "elseje" preserve_order: true } "2010. IV. 1." 
-> date { year: "kettőezer-tíz" month: "április" day: "elseje" preserve_order: true } diff --git a/nemo_text_processing/text_normalization/hu/taggers/decimal.py b/nemo_text_processing/text_normalization/hu/taggers/decimal.py index a6f819d17..5026caec3 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/decimal.py @@ -46,7 +46,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL e.g. 1 millió -> integer_part: "egy" quantity: "millió" e.g. 1,4 million -> integer_part: "egy" fractional_part: "négy" quantity: "millió" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -68,7 +68,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. -11,4006 milliárd -> decimal { negative: "true" integer_part: "tizenegy" fractional_part: "négyezer-hat tízezred" quantity: "milliárd" preserve_order: true } 1 milliárd -> decimal { integer_part: "egy" quantity: "milliárd" preserve_order: true } Args: diff --git a/nemo_text_processing/text_normalization/hu/taggers/ordinal.py b/nemo_text_processing/text_normalization/hu/taggers/ordinal.py index 634e006e6..a63a9f02a 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/ordinal.py @@ -25,7 +25,7 @@ class OrdinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "2." 
-> ordinal { integer: "második" } } Args: diff --git a/nemo_text_processing/text_normalization/hu/taggers/telephone.py b/nemo_text_processing/text_normalization/hu/taggers/telephone.py index 856353a30..b031ca5dc 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/taggers/telephone.py @@ -41,8 +41,8 @@ class TelephoneFst(GraphFst): https://en.wikipedia.org/wiki/Telephone_numbers_in_Hungary Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/hu/taggers/time.py b/nemo_text_processing/text_normalization/hu/taggers/time.py index 65dc26398..ae1592f74 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/time.py +++ b/nemo_text_processing/text_normalization/hu/taggers/time.py @@ -71,7 +71,7 @@ class TimeFst(GraphFst): "09:00 óra" -> time { hours: "2" } "02:15:10 óra" -> time { hours: "2" minutes: "15" seconds: "10"} "negyed 2" -> time { minutes: "15" hours: "1" } - + Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/hu/utils.py b/nemo_text_processing/text_normalization/hu/utils.py index 8a87a3166..a5fb4fc3c 100644 --- a/nemo_text_processing/text_normalization/hu/utils.py +++ b/nemo_text_processing/text_normalization/hu/utils.py @@ -49,7 +49,7 @@ def load_inflection(abs_path): Args: abs_path: absolute path - + Returns dictionary of mappings of word endings to lists of case endings. 
""" @@ -97,7 +97,7 @@ def inflect_abbreviation(abbr: str, word: str, singular_only=False): word: the base (nominative singular) form of the expansion of abbr singular_only: whether or not to add plural forms - + Returns a list of tuples containing the inflected abbreviation and its expansion. """ @@ -133,7 +133,7 @@ def naive_inflector(abbr: str, word: str, singular_only=False): word: the base (nominative singular) form of the expansion of abbr singular_only: whether or not to add plural forms - + Returns a list of tuples containing the inflected abbreviation and its expansion. """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/measure.py b/nemo_text_processing/text_normalization/hu/verbalizers/measure.py index 41f7fb89c..675659044 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/measure.py @@ -28,7 +28,7 @@ class MeasureFst(GraphFst): Finite state transducer for verbalizing measure, e.g. 
measure { cardinal { integer: "zwei" units: "unzen" } } -> "zwei unzen" measure { cardinal { integer_part: "zwei" quantity: "millionen" units: "unzen" } } -> "zwei millionen unzen" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/hy/utils.py b/nemo_text_processing/text_normalization/hy/utils.py index 7abe91e9e..26c9f5119 100644 --- a/nemo_text_processing/text_normalization/hy/utils.py +++ b/nemo_text_processing/text_normalization/hy/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -34,7 +34,7 @@ def load_labels(abs_path): Args: abs_path: absolute path - + Returns dictionary of mappings """ label_tsv = open(abs_path) diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py index 810b1af49..d5d56cf66 100644 --- a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py @@ -25,14 +25,14 @@ class VerbalizeFst(GraphFst): """ - Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. - More details to deployment at NeMo/tools/text_processing_deployment. + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. 
- Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ def __init__(self, deterministic=True): super().__init__(name="verbalize", kind="verbalize") diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py index aebadd456..e5afd807e 100644 --- a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py @@ -23,15 +23,15 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. - tokens { name: "Երևանում" } tokens { name: "ժամը" } tokens { time { hours: "տասներկուսն" minutes: "հիսունհինգ" } } tokens { name: "է" } tokens { name: ":" } -> Երևանում ժամը տասներկուսն անց հիսունհինգ է: + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "Երևանում" } tokens { name: "ժամը" } tokens { time { hours: "տասներկուսն" minutes: "հիսունհինգ" } } tokens { name: "է" } tokens { name: ":" } -> Երևանում ժամը տասներկուսն անց հիսունհինգ է: - Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. - overwrite_cache: set to True to overwrite .far files - """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + """ def __init__(self, deterministic=True, cache_dir=None, overwrite_cache=False): super().__init__(name="verbalize_final", kind="verbalize") diff --git a/nemo_text_processing/text_normalization/it/taggers/decimals.py b/nemo_text_processing/text_normalization/it/taggers/decimals.py index 4e32855ad..8f98d5a2b 100644 --- a/nemo_text_processing/text_normalization/it/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/it/taggers/decimals.py @@ -36,8 +36,7 @@ def get_quantity(decimal_graph: "pynini.FstLike", cardinal_graph: "pynini.FstLike") -> "pynini.FstLike": - """ - """ + """ """ numbers = pynini.closure(NEMO_DIGIT, 1, 6) @ cardinal_graph numbers = pynini.cdrewrite(pynutil.delete(cardinal_separator), "", "", NEMO_SIGMA) @ numbers diff --git a/nemo_text_processing/text_normalization/it/taggers/measure.py b/nemo_text_processing/text_normalization/it/taggers/measure.py index d3591089e..40144cd61 100644 --- a/nemo_text_processing/text_normalization/it/taggers/measure.py +++ b/nemo_text_processing/text_normalization/it/taggers/measure.py @@ -45,7 +45,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for classifying measure, e.g. 
"2,4 g" -> measure { cardinal { integer_part: "due" fractional_part: "quattro" units: "grammi" preserve_order: true } } - + Args: cardinal: CardinalFst decimal: DecimalFst diff --git a/nemo_text_processing/text_normalization/it/taggers/money.py b/nemo_text_processing/text_normalization/it/taggers/money.py index e8f68c2ac..d92906f1d 100644 --- a/nemo_text_processing/text_normalization/it/taggers/money.py +++ b/nemo_text_processing/text_normalization/it/taggers/money.py @@ -40,7 +40,7 @@ class MoneyFst(GraphFst): "€1" -> money { currency_maj: "euro" integer_part: "un"} "€1,000" -> money { currency_maj: "euro" integer_part: "un" } "4,2 £" -> money { integer_part: "quattro" currency_maj: "sterline" fractional_part: "venti" currency_min: "penny" preserve_order: true } - + Args: cardinal: CardinalFst decimal: DecimalFst diff --git a/nemo_text_processing/text_normalization/it/taggers/time.py b/nemo_text_processing/text_normalization/it/taggers/time.py index 351b6f40c..97d952489 100644 --- a/nemo_text_processing/text_normalization/it/taggers/time.py +++ b/nemo_text_processing/text_normalization/it/taggers/time.py @@ -25,7 +25,7 @@ class TimeFst(GraphFst): 15:30:30 tokens { time { hours: "15" minutes: "30" seconds: "30" preserve_order: true } } -> quindici e mezza trenta secondi 12:15 tokens { time { hours: "12" minutes: "15" } } -> dodici e un quarto 03:38 tokens { time { hours: "3" minutes: "38" } } -> tre e trentotto minuti - + Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/it/utils.py b/nemo_text_processing/text_normalization/it/utils.py index eadec4d89..be8bdb5ad 100644 --- a/nemo_text_processing/text_normalization/it/utils.py +++ b/nemo_text_processing/text_normalization/it/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return 
os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/text_normalization/it/verbalizers/decimal.py b/nemo_text_processing/text_normalization/it/verbalizers/decimal.py index 568361603..f257d7df4 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/decimal.py @@ -26,18 +26,18 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "venti" fractional_part: "trentaquattro" quantity: "miliardi" } -> + Finite state transducer for classifying decimal, e.g. + decimal { negative: "true" integer_part: "venti" fractional_part: "trentaquattro" quantity: "miliardi" } -> meno venti virgola trentaquattro - decimal { integer_part: "un milione" fractional_part: "zero zero zero" quantity: "milioni" preserve_order: true } --> + decimal { integer_part: "un milione" fractional_part: "zero zero zero" quantity: "milioni" preserve_order: true } --> un milione virgola zero zero zero decimal { integer_part: "due" quantity: "milioni" preserve_order: true } --> due milioni Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/it/verbalizers/measure.py b/nemo_text_processing/text_normalization/it/verbalizers/measure.py index 93fa50500..c7fe33969 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/measure.py @@ -27,7 +27,7 @@ class MeasureFst(GraphFst): """ Finite 
state transducer for verbalizing measure, e.g. measure { cardinal { integer: "due" units: "grammi" } } -> "due grammi" - + Args: decimal: decimal GraphFst cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/it/verbalizers/money.py b/nemo_text_processing/text_normalization/it/verbalizers/money.py index ba9687bd5..f4b3fdef8 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/money.py @@ -40,7 +40,7 @@ class MoneyFst(GraphFst): Finite state transducer for verbalizing money, e.g. money { currency_maj: "euro" integer_part: "un"} -> "un euro" money { integer_part: "quattro" currency_maj: "sterline" fractional_part: "venti" currency_min: "penny" preserve_order: true } -> "quattro sterline venti penny" - + Args: decimal: GraphFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 14093dadf..c6d19f82f 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -165,6 +165,9 @@ def __init__( elif lang == 'hy': from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.hy.verbalizers.verbalize_final import VerbalizeFinalFst + elif lang == 'rw': + from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.rw.verbalizers.verbalize_final import VerbalizeFinalFst else: raise NotImplementedError(f"Language {lang} has not been supported yet.") diff --git a/nemo_text_processing/text_normalization/ru/taggers/cardinal.py b/nemo_text_processing/text_normalization/ru/taggers/cardinal.py index d0bc8cc07..5e780969a 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/cardinal.py +++ 
b/nemo_text_processing/text_normalization/ru/taggers/cardinal.py @@ -32,7 +32,7 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. "1 001" -> cardinal { integer: "тысяча один" } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/date.py b/nemo_text_processing/text_normalization/ru/taggers/date.py index dd3872e2f..2dc87ee06 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/date.py +++ b/nemo_text_processing/text_normalization/ru/taggers/date.py @@ -29,7 +29,7 @@ class DateFst(GraphFst): """ - Finite state transducer for classifying date, e.g. + Finite state transducer for classifying date, e.g. "01.05" -> tokens { date { day: "первое мая" } } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/decimals.py b/nemo_text_processing/text_normalization/ru/taggers/decimals.py index 29c208777..40ced8d52 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/ru/taggers/decimals.py @@ -50,7 +50,7 @@ def prepare_labels_for_insertion(file_path: str): class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. "1,08" -> tokens { decimal { integer_part: "одно целая" fractional_part: "восемь сотых} } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/ordinal.py b/nemo_text_processing/text_normalization/ru/taggers/ordinal.py index 09cd57d33..43277db46 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/ru/taggers/ordinal.py @@ -25,7 +25,7 @@ class OrdinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. + Finite state transducer for classifying cardinals, e.g. 
"2" -> ordinal { integer: "второе" } } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/telephone.py b/nemo_text_processing/text_normalization/ru/taggers/telephone.py index d2b3d508c..4fbfbf06a 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ru/taggers/telephone.py @@ -21,9 +21,9 @@ class TelephoneFst(GraphFst): """ - Finite state transducer for classifying telephone, which includes country code, number part and extension + Finite state transducer for classifying telephone, which includes country code, number part and extension - E.g + E.g "8-913-983-56-01" -> telephone { number_part: "восемь девятьсот тринадцать девятьсот восемьдесят три пятьдесят шесть ноль один" } Args: diff --git a/nemo_text_processing/text_normalization/ru/taggers/time.py b/nemo_text_processing/text_normalization/ru/taggers/time.py index 4b3f40560..427aab00d 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/time.py +++ b/nemo_text_processing/text_normalization/ru/taggers/time.py @@ -24,7 +24,7 @@ class TimeFst(GraphFst): """ Finite state transducer for classifying time, e.g. 
"02:15" -> time { hours: "два часа пятнадцать минут" } - + Args: number_names: number_names for cardinal and ordinal numbers deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/ru/utils.py b/nemo_text_processing/text_normalization/ru/utils.py index 5f5c4bbfb..a55659868 100644 --- a/nemo_text_processing/text_normalization/ru/utils.py +++ b/nemo_text_processing/text_normalization/ru/utils.py @@ -24,7 +24,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ abs_path = os.path.dirname(os.path.abspath(__file__)) + os.sep + rel_path diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/measure.py b/nemo_text_processing/text_normalization/ru/verbalizers/measure.py index ad2e85bf5..001691518 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/measure.py @@ -28,7 +28,7 @@ class MeasureFst(GraphFst): """ Finite state transducer for verbalizing measure, e.g. measure { cardinal { integer: "два килограма" } } -> "два килограма" - + Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py index 8d92e3efe..ceaf04d7d 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py @@ -29,7 +29,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now Args: diff --git a/nemo_text_processing/text_normalization/rw/__init__.py b/nemo_text_processing/text_normalization/rw/__init__.py new file mode 100644 index 000000000..876f20b3f --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.rw.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/rw/data/__init__.py b/nemo_text_processing/text_normalization/rw/data/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py b/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv new file mode 100644 index 000000000..bf85b743b --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits.tsv @@ -0,0 +1,9 @@ +rimwe 1 +kabiri 2 +gatatu 3 +kane 4 +gatanu 5 +gatandatu 6 +karindwi 7 +umunani 8 +icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv new file mode 100644 index 000000000..ee31aadee --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_for_thousands.tsv @@ -0,0 +1,10 @@ + 0 +kimwe 1 +bibiri 2 +bitatu 3 +bine 4 +bitanu 5 +bitandatu 6 +birindwi 7 +umunani 8 +icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv new file mode 100644 index 000000000..126ad90a3 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/digits_millions_trillions.tsv @@ -0,0 +1,10 @@ + 0 +imwe 1 +ebyiri 2 +eshatu 3 +enye 4 +eshanu 5 +esheshatu 6 +zirindwi 7 +umunani 8 +icyenda 9 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv new file mode 100644 index 000000000..a46623cc1 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds.tsv @@ -0,0 +1,9 @@ +ijana 1 +magana_abiri 2 +magana_atatu 3 +magana_ane 4 +magana_atanu 5 +magana_atandatu 6 +magana_arindwi 7 +magana_inani 8 +magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv new file mode 100644 index 
000000000..6e38c3ceb --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_millions.tsv @@ -0,0 +1,9 @@ +miliyoni_ijana 1 +miliyoni_magana_abiri 2 +miliyoni_magana_atatu 3 +miliyoni_magana_ane 4 +miliyoni_magana_atanu 5 +miliyoni_magana_atandatu 6 +miliyoni_magana_arindwi 7 +miliyoni_magana_inani 8 +miliyoni_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv new file mode 100644 index 000000000..a73477c14 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_thousands.tsv @@ -0,0 +1,9 @@ +ibihumbi_ijana 1 +ibihumbi_magana_abiri 2 +ibihumbi_magana_atatu 3 +ibihumbi_magana_ane 4 +ibihumbi_magana_atanu 5 +ibihumbi_magana_atandatu 6 +ibihumbi_magana_arindwi 7 +ibihumbi_magana_inani 8 +ibihumbi_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv new file mode 100644 index 000000000..00fc01aa4 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/hundreds_of_trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni_ijana 1 +tiriyoni_magana_abiri 2 +tiriyoni_magana_atatu 3 +tiriyoni_magana_ane 4 +tiriyoni_magana_atanu 5 +tiriyoni_magana_atandatu 6 +tiriyoni_magana_arindwi 7 +tiriyoni_magana_inani 8 +tiriyoni_magana_cyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv new file mode 100644 index 000000000..fded5ed55 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/millions.tsv @@ -0,0 +1,9 @@ +miliyoni 1 +miliyoni_ebyiri 2 +miliyoni_eshatu 3 +miliyoni_enye 4 +miliyoni_eshanu 5 +miliyoni_esheshatu 6 +miliyoni_zirindwi 7 +miliyoni_umunani 8 +miliyoni_icyenda 9 diff --git 
a/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv new file mode 100644 index 000000000..6e63c3875 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens.tsv @@ -0,0 +1,9 @@ + 0 +makumyabiri 2 +mirongo_itatu 3 +mirongo_ine 4 +mirongo_itanu 5 +mirongo_itandatu 6 +mirongo_irindwi 7 +mirongo_inani 8 +mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv new file mode 100644 index 000000000..36f077d00 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_millions.tsv @@ -0,0 +1,9 @@ +miliyoni_cumi 1 +miliyoni_makumyabiri 2 +miliyoni_mirongo_itatu 3 +miliyoni_mirongo_ine 4 +miliyoni_mirongo_itanu 5 +miliyoni_mirongo_itandatu 6 +miliyoni_mirongo_irindwi 7 +miliyoni_mirongo_inani 8 +miliyoni_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv new file mode 100644 index 000000000..f230751bf --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_thousands.tsv @@ -0,0 +1,9 @@ +ibihumbi_cumi 1 +ibihumbi_makumyabiri 2 +ibihumbi_mirongo_itatu 3 +ibihumbi_mirongo_ine 4 +ibihumbi_mirongo_itanu 5 +ibihumbi_mirongo_itandatu 6 +ibihumbi_mirongo_irindwi 7 +ibihumbi_mirongo_inani 8 +ibihumbi_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv new file mode 100644 index 000000000..3cf483594 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/tens_of_trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni_icumi 1 +tiriyoni_makumyabiri 2 +tiriyoni_mirongo_itatu 3 +tiriyoni_mirongo_ine 4 
+tiriyoni_mirongo_itanu 5 +tiriyoni_mirongo_itandatu 6 +tiriyoni_mirongo_irindwi 7 +tiriyoni_mirongo_inani 8 +tiriyoni_mirongo_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv new file mode 100644 index 000000000..39d262443 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/thousands.tsv @@ -0,0 +1,10 @@ +igihumbi 1 +ibihumbi_bibiri 2 +ibihumbi_bitatu 3 +ibihumbi_bine 4 +ibihumbi_bitanu 5 +ibihumbi_bitandatu 6 +ibihumbi_birindwi 7 +ibihumbi_umunani 8 +ibihumbi_icyenda 9 + diff --git a/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv b/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv new file mode 100644 index 000000000..8098158df --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/cardinal/trillions.tsv @@ -0,0 +1,9 @@ +tiriyoni 1 +tiriyoni_ebyiri 2 +tiriyoni_eshatu 3 +tiriyoni_enye 4 +tiriyoni_eshanu 5 +tiriyoni_esheshatu 6 +tiriyoni_zirindwi 7 +tiriyoni_umunani 8 +tiriyoni_icyenda 9 diff --git a/nemo_text_processing/text_normalization/rw/data/time/__init__.py b/nemo_text_processing/text_normalization/rw/data/time/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/data/time/hours.tsv b/nemo_text_processing/text_normalization/rw/data/time/hours.tsv new file mode 100644 index 000000000..fae6f0898 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/hours.tsv @@ -0,0 +1,12 @@ +1 saa saba +2 saa munani +3 saa cyenda +4 saa cumi +5 saa cumi n'imwe +6 saa cumi n'ebyiri +7 saa moya +8 saa mbiri +9 saa tatu +10 saa ine +11 saa tanu +12 saa sita \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv b/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv new file mode 100644 index 000000000..c30327106 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/time/minutes.tsv @@ -0,0 +1,60 @@ +00 +01 n'umunota umwe +02 n'iminota ibiri +03 n'iminota itatu +04 n'iminota ine +05 n'iminota itanu +06 n'iminota itandatu +07 n'iminota irindwi +08 n'iminota umunani +09 n'iminota icyenda +10 n'iminota icumi +11 n'iminota cumi n'umwe +12 n'iminota cumi n'ibiri +13 n'iminota cumi n'itatu +14 n'iminota cumi n'ine +15 n'iminota cumi n'itanu +16 n'iminota cumi n'itandatu +17 n'iminota cumi n'irindwi +18 n'iminota cumi n'umunani +19 n'iminota cumi n'icyenda +20 n'iminota makumyabiri +21 n'iminota makumyabiri n'umwe +22 n'iminota makumyabiri n'ibiri +23 n'iminota makumyabiri n'itatu +24 n'iminota makumyabiri n'ine +25 n'iminota makumyabiri n'itanu +26 n'iminota makumyabiri n'itandatu +27 n'iminota makumyabiri n'irindwi +28 n'iminota makumyabiri n'umunani +29 n'iminota makumyabiri n'icyenda +30 
n'iminota mirongo itatu +31 n'iminota mirongo itatu n'umwe +32 n'iminota mirongo itatu n'ibiri +33 n'iminota mirongo itatu n'itatu +34 n'iminota mirongo itatu n'ine +35 n'iminota mirongo itatu n'itanu +36 n'iminota mirongo itatu n'itandatu +37 n'iminota mirongo itatu n'irindwi +38 n'iminota mirongo itatu n'umunani +39 n'iminota mirongo itatu n'icyenda +40 n'iminota mirongo ine +41 n'iminota mirongo ine n'umwe +42 n'iminota mirongo ine n'ibiri +43 n'iminota mirongo ine n'itatu +44 n'iminota mirongo ine n'ine +45 n'iminota mirongo ine n'itanu +46 n'iminota mirongo ine n'itandatu +47 n'iminota mirongo ine n'irindwi +48 n'iminota mirongo ine n'umunani +49 n'iminota mirongo ine n'icyenda +50 n'iminota mirongo itanu +51 n'iminota mirongo itanu n'umwe +52 n'iminota mirongo itanu n'ibiri +53 n'iminota mirongo itanu n'itatu +54 n'iminota mirongo itanu n'ine +55 n'iminota mirongo itanu n'itanu +56 n'iminota mirongo itanu n'itandatu +57 n'iminota mirongo itanu n'irindwi +58 n'iminota mirongo itanu n'umunani +59 n'iminota mirongo itanu n'icyenda \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/whitelist/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv b/nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv new file mode 100644 index 000000000..e550214cd --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/data/whitelist/kinya_transliterations.tsv @@ -0,0 +1,175 @@ +Chris kiris +Hipkins Hipikineze +Dexamethasone dekisametazone +corticosteroid koritikositeroyide +immunosuppressant iminosupuresa +CDC sidisi +RBC arabisi +RISA risa +minisante minisante +sars sarisi +pfizer pifiza +BionTech biyoniteki +dollar dorari +ADN ade eni +BBC bibisi +Victoria vikitoria +Espagne esipanye +Nouvelle-Zélande nuveli zerandi +lopinavir lopinaviri +rotinavir rotinaviri +HIV heci ayivi +seychelles seyisheli +maroc maroki +sputnik siputinike +la crosse la korosi +paul pawulo +www wawawa +gov govu +rw rwa +http hecititipi +Berlin iberile +Remdesivir remudesivire +coranavirus koronavirusi +covid kovide +quarantine karantine +oms o e mesi +basketball basiketibalu +football futibolu +cholera kolera +radio radiyo +television televiziyo +service serivise +prof purofu +royal ruyolo +college koreji +health ubuzima +SARS-CoV-kabiri sarisi-kov-kabiri +recovery rekoveri +Dr dogiteri +protein puroteyine +spike sipiyike +victoria vigitoriya +technique tekinike +cell selile +electro erekitoro +sanitizer sanitayiza +Orthocoronavirinae oritocoronavirinaye +coronavirinae coronavirinaye +nidovirales nidoviralesi +Covs covuse +antibody antibodi +Hydroxychloroquine hidurokulorokine +company kompani +oxygen ogisijeni +Carolina karolina +jonathan jonatani +hyper hiperi +micro mikoro +microscope mikorosikope +microchip mikorocipu +glycoproteine gilicopuroteyine +sport siporo +lockdown lokidawuno +email imeli +japan japani +science siyansi +pubmed pubimedi +koica koyika +jica jika +DNA diyeniyi +RNA araneyi +wuhan 
wuhani +huanan hwanani +thermoregulation terimoregulashiyoni +alveolar aliviyola +hypoxemia hipokisemiya +PCR pisiyara +rapid-test rapidi-tesite +sepsis sepusisi +septique seputike +pulmonary pirimonari +extra egisitura +Real riyo +Time tayimu +Polymerase porimerase +poly pori +Chain ceyini +Reaction reyakishoni +hypoxic hipokisike +ICU ayisiyu +ambulance amburansi +antibiotic antibiyotike +vaccine vagisine +MEDAIR medayire +guardian garidiyani +covax covagise +paris parisi +transplant turansipulanti +laboratoire laboratuwari +Tedros tewodurosi +Ghebreyesus gebureyesusi +polybasic poribazike +china chinwa +RT-PCR arati-pisiyara +UNICEF yunicefu +HCR hashiseyeri +UNESCO yunesico +UN oni +World woridi +bank banki +FMI efu emi +new-york nuyoriki +times tayimuze +MERS merise +electron erekituronu +RDB aradibi +Platelet-fibrin puratele-fibirini +arterial ariteriyo +coagulopathie kowagulopati +RBD arabidi +RDF aradiyefu +module modile +Oxford ogisiforudu +AstraZeneca asutarazeneka +Astra-Zeneca asutarazeneka +astra asutara +zeneca zeneka +chlorine kulorakine +acide aside +peroxyacetic perukisiyatike +chlorhexidine kulorekidine +chloroform kuloroforume +disinfectant dezenkifekita +carbon kariboni +Hopkins hopikinze +communist komunisite +Tanzania tanzaniya +Africa afurika +VOA vi o aye +Jean yohana +Marie mariya +Vianney viyane +chimiotherapie kimyoterapi +sinopharm sinofarume +bus busi +ventilator ventirata +ventilators ventirataze +mRNA emu araneyi +Favipiravir favipiravire +command komandi +center santire +app apu +phone fone +telephone terefone +clinical kiliniko +clinique kilinike +lymphocytes lemfosite +twitter tuwita +youtube yutubi +facebook fasibuki +google gugoli +com komu +Antibodies antibodize +COVID-CUMI kovide-cumi +COVID-CUMI-N'ICYENDA kovide-cumi-n'icyenda diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py new file mode 100644 index 000000000..ce75cd17e --- /dev/null +++ 
b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -0,0 +1,273 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path +from nemo_text_processing.utils.logging import logger + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_VOWELS = pynini.union(*"aeiouAEIOU").optimize() +NEMO_CONSONANTS = pynini.union(*"BCDFGHJKLMNPQRSTVWXYZbcdfghjklmnpqrstvwxyz").optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, 
string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) +NEMO_LOWER_NOT_A = pynini.union( + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", +).optimize() + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_space_or_punct = NEMO_PUNCT | delete_space +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"')) +) + + +# Common string literals; expand as you see fit. +username_string = "username" +double_quotes = '"' +domain_string = "domain" +protocol_string = "protocol" +slash = "/" +double_slash = "//" +triple_slash = "///" +file = "file" +period = "." 
+at = "@" +colon = ":" +https = "https" +http = "http" +www = "www" + + +suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) +# _v = pynini.union("a", "e", "i", "o", "u") +_c = pynini.union( + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", +) +_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") +_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") +_s = NEMO_SIGMA + pynutil.insert("s") + +graph_plural = plurals._priority_union( + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, +).optimize() + +SINGULAR_TO_PLURAL = graph_plural +PLURAL_TO_SINGULAR = pynini.invert(graph_plural) +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("minus", "Minus").optimize() + + +def capitalized_input_graph( + graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, +) -> "pynini.FstLike": + """ + Allow graph input to be capitalized, e.g. for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
+ + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logger.info(f"Created {file_name}") + + +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + +def convert_space(fst) -> "pynini.FstLike": + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +class GraphFst: + """ + Base class for all grammar fsts. 
+ + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/rw/taggers/__init__.py b/nemo_text_processing/text_normalization/rw/taggers/__init__.py new file mode 100644 index 000000000..9c4313114 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/taggers/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/rw/taggers/cardinal.py b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py new file mode 100644 index 000000000..958a95234 --- /dev/null +++ b/nemo_text_processing/text_normalization/rw/taggers/cardinal.py @@ -0,0 +1,243 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.rw.graph_utils import (
    NEMO_ALPHA,
    NEMO_CHAR,
    NEMO_CONSONANTS,
    NEMO_DIGIT,
    NEMO_VOWELS,
    GraphFst,
    delete_extra_space,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.rw.utils import get_abs_path


class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals of the `rw` grammar, producing
        cardinal { integer: "NUMBER SPELLED OUT IN WORDS" }

    How the graph is assembled (all of it visible in ``__init__``):
        1. Word -> digit graphs are built per magnitude from the tables under
           ``data/cardinal/*.tsv`` plus a few inline entries ("icumi", "cumi",
           "ibihumbi_icumi", "zeru").
        2. The union of all magnitudes (``graph_all``) is inverted, so the final
           graph reads a digit string and writes words.
        3. Context-dependent rewrites post-process the words: an inner space becomes
           " na " before a consonant and " n'" before a vowel, then spaces are
           cleaned up and "_" is replaced by a space.
        4. As a weighted alternative, a digit string (commas between digits removed)
           is read out digit by digit.

    NOTE(review): the graphs named ``*_trillions`` are padded with nine zeros, i.e.
    they appear to start at 10**9 if every .tsv entry maps to a single digit -
    confirm against the data files.
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")
        vowels_or_space = NEMO_VOWELS | " "
        # Space -> " na " when preceded by a vowel/space and followed by a consonant.
        rewrite_na_fst = pynini.cdrewrite(
            pynini.cross(" ", " na "), vowels_or_space, NEMO_CONSONANTS, NEMO_CHAR.closure()
        )
        # Space -> " n'" when preceded by a vowel/space and followed by a vowel.
        rewrite_n_fst = pynini.cdrewrite(pynini.cross(" ", " n'"), vowels_or_space, NEMO_VOWELS, NEMO_CHAR.closure())
        # "_" between two letters -> space (multi-word table keys use "_", e.g. "ibihumbi_icumi").
        remove_underscore_fst = pynini.cdrewrite(
            pynini.cross("_", " "), pynini.union(NEMO_ALPHA), pynini.union(NEMO_ALPHA), NEMO_CHAR.closure()
        )
        # Applies delete_extra_space between two letters (helper is defined in graph_utils, not visible here).
        remove_extra_space_fst = pynini.cdrewrite(
            delete_extra_space, pynini.union(NEMO_ALPHA), pynini.union(NEMO_ALPHA), NEMO_CHAR.closure()
        )
        # Applies delete_space right before end-of-string.
        remove_trailing_space_fst = pynini.cdrewrite(
            delete_space, pynini.union(NEMO_ALPHA).closure(), '[EOS]', NEMO_CHAR.closure()
        )

        # The " na " rule runs first, then the " n'" rule.
        rewrite_add_separator_fst = pynini.compose(rewrite_na_fst, rewrite_n_fst)
        ten_thousand = pynini.string_map([("ibihumbi_icumi", "10")])
        ten = pynini.string_map([("icumi", "10")])
        digits = pynini.string_file(get_abs_path("data/cardinal/digits.tsv"))
        digits_for_thousands = pynini.string_file(get_abs_path("data/cardinal/digits_for_thousands.tsv"))
        digits_millions_trillions = pynini.string_file(get_abs_path("data/cardinal/digits_millions_trillions.tsv"))
        tens = pynini.string_file(get_abs_path("data/cardinal/tens.tsv"))
        # Two spellings of the leading "1" of a two-digit number, depending on position in the phrase.
        tens_for_ends = pynini.string_map([("icumi", "1")]) | tens
        tens_for_beginnings = pynini.string_map([("cumi", "1")]) | tens
        hundreds = pynini.string_file(get_abs_path("data/cardinal/hundreds.tsv"))
        thousands = pynini.string_file(get_abs_path("data/cardinal/thousands.tsv"))
        tens_of_thousands = pynini.string_file(get_abs_path("data/cardinal/tens_of_thousands.tsv"))
        hundreds_of_thousands = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_thousands.tsv"))
        millions = pynini.string_file(get_abs_path("data/cardinal/millions.tsv"))
        tens_of_millions = pynini.string_file(get_abs_path("data/cardinal/tens_of_millions.tsv"))
        hundreds_of_millions = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_millions.tsv"))
        trillions = pynini.string_file(get_abs_path("data/cardinal/trillions.tsv"))
        tens_of_trillions = pynini.string_file(get_abs_path("data/cardinal/tens_of_trillions.tsv"))
        hundreds_of_trillions = pynini.string_file(get_abs_path("data/cardinal/hundreds_of_trillions.tsv"))

        # Zero padding inserted when lower-order positions are not spoken.
        THREE_ZEROS = "000"
        FOUR_ZEROS = "0000"
        FIVE_ZEROS = "00000"
        SIX_ZEROS = "000000"
        SEVEN_ZEROS = "0000000"
        EIGHT_ZEROS = "00000000"
        NINE_ZEROS = "000000000"

        zero = pynini.string_map([("zeru", "0")])
        # Drops a comma that sits between two digits.
        rewrite_remove_comma_fst = pynini.cdrewrite(
            pynini.cross(",", ""), pynini.union(NEMO_DIGIT), pynini.union(NEMO_DIGIT), NEMO_CHAR.closure()
        )
        # Digit-by-digit reading: one or more digits, each verbalized, joined by insert_space.
        single_digits_graph = pynini.invert(digits | zero)
        single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph)
        remove_comma = rewrite_remove_comma_fst @ single_digits_graph

        # Two-digit numbers: either "tens digit" or a bare tens word padded with "0".
        graph_tens_ends = tens_for_ends + pynutil.delete(" ") + digits | tens_for_ends + pynutil.insert("0")
        graph_tens_starts = tens_for_beginnings + pynutil.delete(" ") + digits | tens_for_beginnings + pynutil.insert(
            "0"
        )

        graph_tens_for_thousands = tens_for_beginnings + pynutil.delete(
            " "
        ) + digits_for_thousands | tens_for_beginnings + pynutil.insert("0")

        graph_tens_for_millions_trillions = tens_for_beginnings + pynutil.delete(
            " "
        ) + digits_millions_trillions | tens_for_beginnings + pynutil.insert("0")
        # From here on every magnitude follows the same scheme: the leading word(s) followed by
        # each possible lower-order remainder, zero-padded to the magnitude's width.
        graph_hundreds = (
            hundreds + pynutil.delete(" ") + graph_tens_ends
            | hundreds + pynutil.insert("00")
            | hundreds + pynutil.delete(" ") + pynutil.insert("0") + digits
        )
        graph_thousands = (
            thousands + pynutil.delete(" ") + graph_hundreds
            | thousands + pynutil.insert(THREE_ZEROS)
            | thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends
            | thousands + pynutil.delete(" ") + pynutil.insert("00") + digits
        )

        # "ibihumbi_icumi" (-> "10") with its remainders; given a bonus weight below.
        graph_ten_thousand_and_hundreds = (
            ten_thousand + pynutil.insert(THREE_ZEROS)
            | ten_thousand + pynutil.delete(" ") + graph_hundreds
            | ten_thousand + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends
            | ten_thousand + pynutil.delete(" ") + pynutil.insert("00") + digits
        )
        prefix_tens_of_thousands = tens_of_thousands + pynutil.delete(" ") + digits_for_thousands
        # NOTE(review): the branch `insert("0") + graph_hundreds` below appears to emit one digit
        # more than its sibling branches - confirm against the .tsv tables.
        graph_tens_of_thousands = (
            pynutil.add_weight(graph_ten_thousand_and_hundreds, weight=-0.1)
            | prefix_tens_of_thousands + pynutil.delete(" ") + graph_hundreds
            | prefix_tens_of_thousands + pynutil.insert(THREE_ZEROS)
            | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_hundreds
            | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_ends
            | prefix_tens_of_thousands + pynutil.delete(" ") + pynutil.insert("00") + digits
        )

        prefix_hundreds_of_thousands = hundreds_of_thousands + pynutil.delete(" ") + graph_tens_for_thousands
        graph_hundreds_of_thousands = (
            hundreds_of_thousands + pynutil.insert(FIVE_ZEROS)
            | prefix_hundreds_of_thousands + pynutil.insert(THREE_ZEROS)
            | prefix_hundreds_of_thousands + pynutil.delete(" ") + graph_hundreds
            | pynutil.add_weight(
                prefix_hundreds_of_thousands + pynutil.delete(" ") + pynutil.insert("00") + digits, weight=-0.1
            )
            | prefix_hundreds_of_thousands + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_for_thousands
        )

        graph_millions = (
            millions + pynutil.delete(" ") + graph_hundreds_of_thousands
            | millions + pynutil.insert(SIX_ZEROS)
            | millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands
            | millions + pynutil.delete(" ") + pynutil.insert("00") + graph_thousands
            | millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds
            | millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends
            | millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + digits
        )

        prefix_tens_of_millions = tens_of_millions + pynutil.delete(" ") + digits_millions_trillions
        # NOTE(review): unlike the other round-number branches in this method, the SIX_ZEROS branch
        # here deletes a space before inserting the zeros, and there is no
        # `insert("00") + graph_thousands` branch as in graph_millions - confirm both are intended.
        graph_tens_of_millions = (
            prefix_tens_of_millions + pynutil.delete(" ") + graph_hundreds_of_thousands
            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS)
            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands
            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds
            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends
            | tens_of_millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_tens_ends
            | prefix_tens_of_millions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + digits
        )

        prefix_hundreds_of_millions = hundreds_of_millions + pynutil.delete(" ") + graph_tens_for_millions_trillions
        graph_hundreds_of_millions = (
            prefix_hundreds_of_millions + pynutil.delete(" ") + graph_hundreds_of_thousands
            | prefix_hundreds_of_millions + pynutil.insert(SIX_ZEROS)
            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_thousands
            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert("00") + graph_thousands
            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds
            | prefix_hundreds_of_millions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_ends
        )

        graph_trillions = (
            trillions + pynutil.delete(" ") + graph_hundreds_of_millions
            | trillions + pynutil.insert(NINE_ZEROS)
            | trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions
            | trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions
            | trillions + pynutil.delete(" ") + pynutil.insert(THREE_ZEROS) + graph_hundreds_of_thousands
            | trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands
            | trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands
            | trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds
            | trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends
            | trillions + pynutil.delete(" ") + pynutil.insert(EIGHT_ZEROS) + digits
        )

        prefix_tens_of_trillions = tens_of_trillions + pynutil.delete(" ") + digits_millions_trillions
        graph_tens_of_trillions = (
            prefix_tens_of_trillions + pynutil.delete(" ") + graph_hundreds_of_millions
            | prefix_tens_of_trillions + pynutil.insert(NINE_ZEROS)
            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions
            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions
            | prefix_tens_of_trillions
            + pynutil.delete(" ")
            + pynutil.insert(THREE_ZEROS)
            + graph_hundreds_of_thousands
            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands
            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands
            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds
            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends
            | prefix_tens_of_trillions + pynutil.delete(" ") + pynutil.insert(EIGHT_ZEROS) + digits
        )

        prefix_hundreds_of_trillions = hundreds_of_trillions + pynutil.delete(" ") + graph_tens_for_millions_trillions
        graph_hundreds_of_trillions = (
            prefix_hundreds_of_trillions + pynutil.delete(" ") + graph_hundreds_of_millions
            | prefix_hundreds_of_trillions + pynutil.insert(NINE_ZEROS)
            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert("0") + graph_tens_of_millions
            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert("00") + graph_millions
            | prefix_hundreds_of_trillions
            + pynutil.delete(" ")
            + pynutil.insert(THREE_ZEROS)
            + graph_hundreds_of_thousands
            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(FOUR_ZEROS) + graph_tens_of_thousands
            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(FIVE_ZEROS) + graph_thousands
            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(SIX_ZEROS) + graph_hundreds
            | prefix_hundreds_of_trillions + pynutil.delete(" ") + pynutil.insert(SEVEN_ZEROS) + graph_tens_ends
        )

        # Union of every magnitude, still in the words -> digits direction.
        graph_all = (
            graph_hundreds_of_trillions
            | graph_tens_of_trillions
            | graph_trillions
            | graph_hundreds_of_millions
            | graph_tens_of_millions
            | graph_millions
            | graph_hundreds_of_thousands
            | graph_tens_of_thousands
            | graph_thousands
            | graph_hundreds
            | pynutil.add_weight(ten, weight=-0.1)
            | graph_tens_starts
            | digits
            | pynini.cross("zeru", "0")
        )

        # Flip to digits -> words, then run the textual clean-up rewrites in sequence.
        inverted_graph_all = pynini.compose(pynini.invert(graph_all), rewrite_add_separator_fst)
        inverted_graph_all = pynini.compose(inverted_graph_all, remove_extra_space_fst)
        inverted_graph_all = pynini.compose(inverted_graph_all, remove_trailing_space_fst)
        # The digit-by-digit reading is added with a small positive weight (0.0001); assuming the
        # default tropical semiring this makes it the less preferred path - TODO confirm.
        inverted_graph_all = pynini.compose(inverted_graph_all, remove_underscore_fst) | pynutil.add_weight(
            remove_comma, 0.0001
        )

        inverted_graph_all = inverted_graph_all.optimize()
        final_graph = pynutil.insert("integer: \"") + inverted_graph_all + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph


# diff --git a/nemo_text_processing/text_normalization/rw/taggers/time.py b/nemo_text_processing/text_normalization/rw/taggers/time.py
# new file mode 100644
# index 000000000..0caf4f7d5
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/taggers/time.py
# @@ -0,0 +1,43 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, DIGITAL UMUGANDA
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst
from nemo_text_processing.text_normalization.rw.utils import get_abs_path


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time written as HOURS:MINUTES, producing
        time { hours:"..." minutes:"..." }

    The accepted hour / minute spellings and their readings are taken from
    data/time/hours.tsv and data/time/minutes.tsv; the ":" separator is dropped.
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        hour_readings = pynini.string_file(get_abs_path("data/time/hours.tsv"))
        minute_readings = pynini.string_file(get_abs_path("data/time/minutes.tsv"))

        # Each field is the table lookup wrapped in its quoted tag.
        hours_field = pynutil.insert("hours:\"") + hour_readings + pynutil.insert("\"")
        minutes_field = pynutil.insert(" minutes:\"") + minute_readings + pynutil.insert("\"")

        tagged = self.add_tokens(hours_field + pynutil.delete(":") + minutes_field)
        self.fst = tagged.optimize()


# diff --git a/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py
# new file mode 100644
# index 000000000..01ec1e370
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/taggers/tokenize_and_classify.py
# @@ -0,0 +1,78 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, DIGITAL UMUGANDA
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.word import WordFst
from nemo_text_processing.text_normalization.rw.graph_utils import (
    GraphFst,
    delete_extra_space,
    delete_space,
    generator_main,
)
from nemo_text_processing.text_normalization.rw.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.rw.taggers.time import TimeFst
from nemo_text_processing.text_normalization.rw.taggers.whitelist import WhiteListFst


class ClassifyFst(GraphFst):
    """
    Top-level tagger of the `rw` grammar. It unions the time, cardinal, word and
    whitelist taggers (each with its own weight), wraps every match in
    ``tokens { ... }`` and allows punctuation tokens around each classified token.

    Args:
        input_case: accepted for interface compatibility; not read by this constructor
        cache_dir: directory for the compiled .far file; ``None`` or the string "None" disables caching
        overwrite_cache: if True, rebuild the grammar even when a cached .far file exists
        deterministic: forwarded to the GraphFst base class
        whitelist: accepted for interface compatibility; not read by this constructor
    """

    def __init__(
        self,
        input_case: str,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        deterministic: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name='tokenize_and_classify', kind='classify', deterministic=deterministic)

        # Resolve the cache location; the literal string "None" counts as "no cache".
        far_file = None
        if not (cache_dir is None or cache_dir == "None"):
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "rw_tn_tokenize_and_classify.far")

        # Fast path: reuse a previously exported grammar.
        if far_file and not overwrite_cache and os.path.exists(far_file):
            print("FAR file: ", far_file)
            self.fst = pynini.Far(far_file, mode="r")["TOKENIZE_AND_CLASSIFY"]
            return

        cardinal_tagger = CardinalFst()
        time_tagger = TimeFst()
        punctuation = PunctuationFst()
        word_tagger = WordFst(punctuation=punctuation)
        whitelist_tagger = WhiteListFst()

        weighted_time = pynutil.add_weight(time_tagger.fst, 1.05)
        weighted_cardinal = pynutil.add_weight(cardinal_tagger.fst, 1.1)
        weighted_word = pynutil.add_weight(word_tagger.fst, 1.50)
        weighted_whitelist = pynutil.add_weight(whitelist_tagger.fst, 1.01)
        classify = weighted_time | weighted_cardinal | weighted_word | weighted_whitelist

        open_token = "tokens { "
        close_token = " }"
        punct = pynutil.insert(open_token) + pynutil.add_weight(punctuation.fst, weight=1.1) + pynutil.insert(close_token)
        token = pynutil.insert(open_token) + classify + pynutil.insert(close_token)

        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        sentence = delete_space + sentence + delete_space
        self.fst = sentence.optimize()

        # Export only when a cache location was configured.
        if far_file:
            generator_main(far_file, {"TOKENIZE_AND_CLASSIFY": self.fst})


# diff --git a/nemo_text_processing/text_normalization/rw/taggers/whitelist.py b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py
# new file mode 100644
# index 000000000..382243d26
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/taggers/whitelist.py
# @@ -0,0 +1,32 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, DIGITAL UMUGANDA
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst
from nemo_text_processing.text_normalization.rw.utils import get_abs_path

# Loaded once at import time and shared by every WhiteListFst instance.
transliterations = pynini.string_file(get_abs_path("data/whitelist/kinya_transliterations.tsv"))


class WhiteListFst(GraphFst):
    """
    Finite state transducer that rewrites entries of
    data/whitelist/kinya_transliterations.tsv and tags the result as
        name: "..."

    The output is not wrapped with ``add_tokens``; it is emitted as a bare
    ``name`` field.
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")

        open_tag = pynutil.insert("name: \"")
        close_tag = pynutil.insert("\"")
        self.fst = (open_tag + transliterations + close_tag).optimize()


# diff --git a/nemo_text_processing/text_normalization/rw/utils.py b/nemo_text_processing/text_normalization/rw/utils.py
# new file mode 100644
# index 000000000..460596bca
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/utils.py
# @@ -0,0 +1,27 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os


def get_abs_path(rel_path):
    """
    Resolve a path given relative to the directory of this module.

    Args:
        rel_path: path relative to the directory containing this file

    Returns the directory of this file, a "/" and ``rel_path`` joined into one string
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return "/".join((module_dir, rel_path))


# diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py
# new file mode 100644
# index 000000000..9c4313114
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/verbalizers/__init__.py
# @@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, DIGITAL UMUGANDA
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/time.py b/nemo_text_processing/text_normalization/rw/verbalizers/time.py
# new file mode 100644
# index 000000000..50c0f71a2
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/verbalizers/time.py
# @@ -0,0 +1,42 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, DIGITAL UMUGANDA
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.rw.graph_utils import NEMO_CHAR, GraphFst, delete_space


class VerbalizeTimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time: strips the tag syntax from
        time { hours:"..." minutes:"..." }
    and emits the quoted hours value immediately followed by the quoted minutes value.
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")

        def quoted_value():
            # Drops the surrounding double quotes and passes the content through.
            return pynutil.delete("\"") + pynini.closure(NEMO_CHAR) + pynutil.delete("\"")

        hours_part = pynutil.delete("hours:") + delete_space + quoted_value()
        minutes_part = pynutil.delete("minutes:") + delete_space + quoted_value()

        untagged = self.delete_tokens(hours_part + delete_space + minutes_part)
        self.fst = untagged.optimize()


# diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py
# new file mode 100644
# index 000000000..267215145
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize.py
# @@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, DIGITAL UMUGANDA
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst
from nemo_text_processing.text_normalization.rw.verbalizers.time import VerbalizeTimeFst


class VerbalizeFst(GraphFst):
    """
    Union of the verbalizers available for the `rw` grammar: the cardinal
    verbalizer imported from the `en` grammar and the `rw` time verbalizer.

    Args:
        deterministic: forwarded to the GraphFst base class
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
        cardinal_verbalizer = CardinalFst()
        time_verbalizer = VerbalizeTimeFst()
        self.fst = cardinal_verbalizer.fst | time_verbalizer.fst


# diff --git a/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py
# new file mode 100644
# index 000000000..d39210ff5
# --- /dev/null
# +++ b/nemo_text_processing/text_normalization/rw/verbalizers/verbalize_final.py
# @@ -0,0 +1,53 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, DIGITAL UMUGANDA
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
from nemo_text_processing.text_normalization.rw.graph_utils import GraphFst, delete_space, generator_main
from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst


class VerbalizeFinalFst(GraphFst):
    """
    Final verbalizer of the `rw` grammar: removes the ``tokens { ... }`` wrapper around
    every token and verbalizes its content with either VerbalizeFst or the `en` WordFst.

    Args:
        cache_dir: directory for the compiled .far file; ``None`` or the string "None" disables caching
        overwrite_cache: if True, rebuild the grammar even when a cached .far file exists
        deterministic: forwarded to the base class and to the sub-verbalizers
    """

    # Key under which the grammar is stored in (and read back from) the cached FAR.
    # The original code exported under "ALL" but tried to load "verbalize", so a
    # cached file could never be read back.
    _FAR_KEY = "ALL"

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True):
        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "rw_tn_verbalizer.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Must match the key used by generator_main() below.
            self.fst = pynini.Far(far_file, mode="r")[self._FAR_KEY]
        else:
            verbalize = VerbalizeFst(deterministic=deterministic).fst
            word = WordFst(deterministic=deterministic).fst
            types = verbalize | word
            # One token: `tokens { CONTENT }` with flexible spacing, reduced to the verbalized CONTENT.
            graph = (
                pynutil.delete("tokens")
                + delete_space
                + pynutil.delete("{")
                + delete_space
                + types
                + delete_space
                + pynutil.delete("}")
            )
            # One or more tokens, with surrounding and separating spaces removed by delete_space.
            graph = delete_space + pynini.closure(graph + delete_space) + graph + delete_space

            self.fst = graph

            if far_file:
                generator_main(far_file, {self._FAR_KEY: self.fst, 'REDUP': pynini.accep("REDUP")})


# diff --git a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py
# index 7cb62517f..25dfb6e9b 100644
# --- a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py
# +++ b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py
# @@ -32,7 +32,7 @@ class OrdinalFst(GraphFst):
#  """
#  Finite state transducer for classifying ordinal
# - "21:a" -> ordinal { integer: "tjugoförsta" }
# + "21:a" -> ordinal { integer: "tjugoförsta" }
#  Args:
#  cardinal: CardinalFst
#  deterministic: if True will provide a single transduction option,
# diff --git
a/nemo_text_processing/text_normalization/sv/taggers/telephone.py b/nemo_text_processing/text_normalization/sv/taggers/telephone.py index 4b37d28de..a03e0430b 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/taggers/telephone.py @@ -50,8 +50,8 @@ class TelephoneFst(GraphFst): https://codegolf.stackexchange.com/questions/195787/format-a-swedish-phone-number Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py b/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py index 404b42495..dd71814a1 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py @@ -25,14 +25,14 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones - decimal { integer_part: "un" quantity: "billón" } -> un billón + Finite state transducer for classifying decimal, e.g. 
+ decimal { negative: "true" integer_part: "dos" fractional_part: "cuatro cero" quantity: "billones" } -> menos dos coma quatro cero billones + decimal { integer_part: "un" quantity: "billón" } -> un billón Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ def __init__(self, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/token_parser.py b/nemo_text_processing/text_normalization/token_parser.py index 638b71bbf..4adcd7d7f 100644 --- a/nemo_text_processing/text_normalization/token_parser.py +++ b/nemo_text_processing/text_normalization/token_parser.py @@ -34,7 +34,7 @@ def __call__(self, text): Args: text: text to be parsed - + """ self.text = text self.len_text = len(text) @@ -107,11 +107,11 @@ def parse_token_value(self) -> Union[str, dict]: def parse_char(self, exp) -> bool: """ - Parses character + Parses character Args: exp: character to read in - + Returns true if successful """ assert self.char == exp @@ -124,7 +124,7 @@ def parse_chars(self, exp) -> bool: Args: exp: characters to read in - + Returns true if successful """ ok = False @@ -181,8 +181,8 @@ def parse_ws(self): def read(self): """ - Reads in next char. - + Reads in next char. 
+ Returns true if not EOS """ if self.index < self.len_text - 1: # should be unique diff --git a/nemo_text_processing/text_normalization/zh/taggers/date.py b/nemo_text_processing/text_normalization/zh/taggers/date.py index 607b63511..f5ea122e7 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/text_normalization/zh/taggers/date.py @@ -32,7 +32,7 @@ class DateFst(GraphFst): 2002/02 -> is an error format according to the national standard 02/11 -> is an error format according to the national standard According to national standard, only when the year, month, and day are all exist, it is allowed to use symbols to separate them - + """ def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/text_normalization/zh/taggers/decimal.py b/nemo_text_processing/text_normalization/zh/taggers/decimal.py index d4afb3fd9..713fd4ab2 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/decimal.py @@ -64,7 +64,7 @@ class DecimalFst(GraphFst): 0.5 -> decimal { integer_part: "零" fractional_part: "五" } 0.5万 -> decimal { integer_part: "零" fractional_part: "五" quantity: "万" } -0.5万 -> decimal { negative: "负" integer_part: "零" fractional_part: "五" quantity: "万"} - + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/zh/taggers/fraction.py b/nemo_text_processing/text_normalization/zh/taggers/fraction.py index 3f9ce42c7..e3ad5b513 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/taggers/fraction.py @@ -30,7 +30,7 @@ class FractionFst(GraphFst): 100分之1 -> tokens { fraction { denominator: "一百" numerator: "一"} } 百分之1 -> tokens { fraction { denominator: "百" numerator: "一"} } 98% -> tokens { fraction { denominator: "百" numerator: "九十八"} } - + Args: cardinal: CardinalFst, decimal: DecimalFst """ diff --git 
a/nemo_text_processing/text_normalization/zh/taggers/measure.py b/nemo_text_processing/text_normalization/zh/taggers/measure.py index d7da8f524..1ec47aae9 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/measure.py +++ b/nemo_text_processing/text_normalization/zh/taggers/measure.py @@ -22,7 +22,7 @@ class MeasureFst(GraphFst): ''' - 1kg -> tokens { measure { cardinal { integer: "一" } units: "千克" } } + 1kg -> tokens { measure { cardinal { integer: "一" } units: "千克" } } ''' def __init__( diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index 82e1c174f..5cd95e58c 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -22,13 +22,13 @@ class PreProcessorFst(GraphFst): ''' - Preprocessing of TN: - 1. interjections removal such as '啊, 呃' - 2. fullwidth -> halfwidth char conversion - 好啊 -> 好 - 呃对 -> 对 - ： -> : - ； -> ; + Preprocessing of TN: + 1. interjections removal such as '啊, 呃' + 2. fullwidth -> halfwidth char conversion + 好啊 -> 好 + 呃对 -> 对 + ： -> : + ； -> ; ''' def __init__( diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index d35ea178b..3a0b28aeb 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -35,9 +35,9 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/zh/utils.py b/nemo_text_processing/text_normalization/zh/utils.py index 4d08f1deb..175aba206 100644 --- a/nemo_text_processing/text_normalization/zh/utils.py +++ b/nemo_text_processing/text_normalization/zh/utils.py @@ -28,7 +28,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -50,7 +50,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py index 00ba3b8ed..4487c6449 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py @@ -21,7 +21,7 @@ class MeasureFst(GraphFst): ''' - tokens { measure { cardinal: "一" } units: "千克" } } -> 一千克 + tokens { measure { cardinal: "一" } units: "千克" } } -> 一千克 ''' def __init__( diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py index 4bafef0bd..8b196dcaf 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. 
+ Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index a63769787..dab0cea0f 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -29,10 +29,10 @@ class PostProcessor(GraphFst): ''' - Postprocessing of TN, now contains: - 1. punctuation removal - 2. letter case conversion - 3. oov tagger + Postprocessing of TN, now contains: + 1. punctuation removal + 2. letter case conversion + 3. oov tagger ''' def __init__( diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py index 221fbcbc7..a927f4716 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py @@ -31,7 +31,7 @@ class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment.
Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index b16625530..4592d7841 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -24,9 +24,7 @@ class VerbalizeFinalFst(GraphFst): - """ - - """ + """ """ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py index 662cf9f28..0dc6cca68 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py @@ -21,7 +21,7 @@ class Whitelist(GraphFst): ''' - tokens { whitelist: "ATM" } -> A T M + tokens { whitelist: "ATM" } -> A T M ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/word.py b/nemo_text_processing/text_normalization/zh/verbalizers/word.py index f30f254c5..b481d78d5 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/word.py @@ -20,7 +20,7 @@ class WordFst(GraphFst): ''' - tokens { char: "你" } -> 你 + tokens { char: "你" } -> 你 ''' def __init__(self, deterministic: bool = True, lm: bool = False): diff --git a/tests/conftest.py b/tests/conftest.py index b2216e874..8db3b106c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -62,7 +62,7 @@ def pytest_addoption(parser): @pytest.fixture def device(request): - """ Simple fixture returning string denoting the device [CPU | GPU] 
""" + """Simple fixture returning string denoting the device [CPU | GPU]""" if request.config.getoption("--cpu"): return "CPU" else: @@ -104,7 +104,7 @@ def cleanup_local_folder(): @pytest.fixture def test_data_dir(): - """ Fixture returns test_data_dir. """ + """Fixture returns test_data_dir.""" # Test dir. test_data_dir_ = join(dirname(__file__), __TEST_DATA_SUBDIR) return test_data_dir_ diff --git a/tests/nemo_text_processing/rw/__init__.py b/tests/nemo_text_processing/rw/__init__.py new file mode 100644 index 000000000..4f53d71f2 --- /dev/null +++ b/tests/nemo_text_processing/rw/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..baca7cbe4 --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,57 @@ +1~rimwe +2~kabiri +3~gatatu +4~kane +5~gatanu +6~gatandatu +7~karindwi +8~umunani +9~icyenda +10~icumi +002~zeru zeru kabiri +11~cumi na rimwe +12~cumi na kabiri +13~cumi na gatatu +2,3,4,5~kabiri gatatu kane gatanu +14~cumi na kane +15~cumi na gatanu +16~cumi na gatandatu +17~cumi na karindwi +18~cumi n'umunani +19~cumi n'icyenda +20~makumyabiri +89~mirongo inani n'icyenda +123~ijana na makumyabiri na gatatu +730~magana arindwi na mirongo itatu +100~ijana +2004~ibihumbi bibiri na kane +9041~ibihumbi icyenda na mirongo ine na rimwe +5324~ibihumbi bitanu na magana atatu na makumyabiri na kane +8567~ibihumbi umunani na magana atanu na mirongo itandatu na karindwi +10000~ibihumbi icumi +14000~ibihumbi cumi na bine +24404~ibihumbi makumyabiri na bine na magana ane na kane +9000~ibihumbi icyenda +9700~ibihumbi icyenda na magana arindwi +250~magana abiri na mirongo itanu +367~magana atatu na mirongo itandatu na karindwi +90104~ibihumbi mirongo icyenda n'ijana na kane +111001~ibihumbi ijana na cumi na kimwe na rimwe +10999~ibihumbi icumi na magana cyenda na mirongo icyenda n'icyenda +100000~ibihumbi ijana +200000~ibihumbi magana abiri +101000~ibihumbi ijana na kimwe +130000~ibihumbi ijana na mirongo itatu +531000~ibihumbi magana atanu na mirongo itatu na kimwe +2200345~miliyoni ebyiri n'ibihumbi magana abiri na magana atatu na mirongo ine na gatanu +7000000~miliyoni zirindwi +9101100~miliyoni icyenda n'ibihumbi ijana na kimwe n'ijana +19034004~miliyoni cumi n'icyenda n'ibihumbi mirongo itatu na bine na kane +29000000~miliyoni makumyabiri n'icyenda +40000000~miliyoni mirongo ine +400000000~miliyoni magana ane +100000001~miliyoni ijana na rimwe 
+340000000~miliyoni magana atatu na mirongo ine +783100000~miliyoni magana arindwi na mirongo inani n'eshatu n'ibihumbi ijana +340010010~miliyoni magana atatu na mirongo ine n'ibihumbi icumi n'icumi +9374514510~tiriyoni icyenda na miliyoni magana atatu na mirongo irindwi n'enye n'ibihumbi magana atanu na cumi na bine na magana atanu n'icumi diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..4a4ec27bc --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_time.txt @@ -0,0 +1,14 @@ +1:00~saa saba +2:01~saa munani n'umunota umwe +3:30~saa cyenda n'iminota mirongo itatu +4:21~saa cumi n'iminota makumyabiri na rimwe +5:12~saa cumi n'imwe n'iminota cumi n'ibiri +6:49~saa cumi n'ebyiri n'iminota mirongo ine n'icyenda +7:05~saa moya n'iminota itanu +8:23~saa mbiri n'iminota makumyabiri n'itatu +9:47~saa tatu n'iminota mirongo ine n'irindwi +10:56~saa ine n'iminota mirongo itanu n'itandatu +11:00~saa tanu +12:09~saa sita n'iminota icyenda +1:59~saa saba n'iminota mirongo itanu n'icyenda +12:31~saa sita n'iminota mirongo itatu n'umwe \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..b9b597932 --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,3 @@ +www~wawawa +maroc~maroki +television~televiziyo \ No newline at end of file diff --git a/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..1c97057aa --- /dev/null +++ b/tests/nemo_text_processing/rw/data_text_normalization/test_cases_word.txt @@ -0,0 +1,26 @@ +~ + ~ + no~no 
+x ~x +X!~X! +—~— +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aalem~aalem +a'ali~a'ali +aaliyan's~aaliyan's +mother-in-law~mother-in-law +1~rimwe +mar~mar +umwangavu~umwangavu diff --git a/tests/nemo_text_processing/rw/test_cardinal.py b/tests/nemo_text_processing/rw/test_cardinal.py new file mode 100644 index 000000000..eed4be57a --- /dev/null +++ b/tests/nemo_text_processing/rw/test_cardinal.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, DIGITAL UMUGANDA +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestCardinal:
+    # Built once at class-definition time and shared by every parameterized case below.
+    normalizer_rw = Normalizer(
+        input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True
+    )
+
+    @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        pred = self.normalizer_rw.normalize(test_input, verbose=False, punct_post_process=False)
+        assert pred == expected, f"input: {test_input}"
+        print(pred)
diff --git a/tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh
new file mode 100644
index 000000000..c67b247e6
--- /dev/null
+++ b/tests/nemo_text_processing/rw/test_sparrowhawk_normalization.sh
@@ -0,0 +1,60 @@
+#! /bin/bash
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+TEST_DIR=${2:-"/workspace/tests/rw"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd "${GRAMMARS_DIR}"
+
+  # read test file; `|| [ -n "$testcase" ]` keeps a last line that has no trailing newline
+  while read -r testcase || [ -n "$testcase" ]; do
+    IFS='~' read -r written spoken <<< "$testcase"
+    # replace non breaking space with breaking space
+    # Use below if postprocessor is not used. Comment if it is used
+    denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+    # Use below if postprocessor is used. Comment if it is not used
+    #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+
+    # trim white space
+    spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+    denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+
+    # input expected actual
+    assertEquals "$written" "$spoken" "$denorm_pred"
+  done < "$input"
+}
+
+
+
+testTNCardinal() {
+  input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt
+  runtest $input
+}
+
+
+testTNTime() {
+  input=$TEST_DIR/data_text_normalization/test_cases_time.txt
+  runtest $input
+}
+
+
+testTNWhitelist() {
+  input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt
+  runtest $input
+}
+
+testTNWord() {
+  input=$TEST_DIR/data_text_normalization/test_cases_word.txt
+  runtest $input
+}
+
+
+
+
+
+# Remove all command-line arguments
+shift $#
+
+# Load shUnit2
+. /workspace/shunit2/shunit2
diff --git a/tests/nemo_text_processing/rw/test_time.py b/tests/nemo_text_processing/rw/test_time.py
new file mode 100644
index 000000000..a8ada8f73
--- /dev/null
+++ b/tests/nemo_text_processing/rw/test_time.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024, DIGITAL UMUGANDA
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestTime:
+    # Built once at class-definition time and shared by every parameterized case below.
+    normalizer_rw = Normalizer(input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_time.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        pred = self.normalizer_rw.normalize(test_input, verbose=False)
+        assert pred == expected, f"input: {test_input}"
diff --git a/tests/nemo_text_processing/rw/test_whitelist.py b/tests/nemo_text_processing/rw/test_whitelist.py
new file mode 100644
index 000000000..3726dbaff
--- /dev/null
+++ b/tests/nemo_text_processing/rw/test_whitelist.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024, DIGITAL UMUGANDA
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestWhitelist:
+    # Built once at class-definition time and shared by every parameterized case below.
+    normalizer_rw = Normalizer(input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_whitelist.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        pred = self.normalizer_rw.normalize(test_input, verbose=False)
+        assert pred == expected, f"input: {test_input}"
diff --git a/tests/nemo_text_processing/rw/test_word.py b/tests/nemo_text_processing/rw/test_word.py
new file mode 100644
index 000000000..10f2e1883
--- /dev/null
+++ b/tests/nemo_text_processing/rw/test_word.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024, DIGITAL UMUGANDA
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestWord:
+    # Built once at class-definition time and shared by every parameterized case below.
+    normalizer_rw = Normalizer(input_case='cased', lang='rw', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('rw/data_text_normalization/test_cases_word.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        pred = self.normalizer_rw.normalize(test_input, verbose=False)
+        assert pred == expected, f"input: {test_input}"
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index d6ceb84f2..f20660502 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2024, DIGITAL UMUGANDA
 # Copyright 2015 and onwards Google, Inc.
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +21,7 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import generator_main +from nemo_text_processing.text_normalization.rw.graph_utils import generator_main # This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files # tokenize_and_classify.far and verbalize.far for production purposes @@ -86,7 +87,25 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy', 'mr', 'ja'], + choices=[ + "en", + "de", + "es", + "pt", + "ru", + 'fr', + 'hu', + 'sv', + 'vi', + 'zh', + 'ar', + 'it', + 'es_en', + 'hy', + 'mr', + 'ja', + 'rw', + ], type=str, default='en', ) @@ -270,6 +289,11 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'rw': + from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir,