Add whitelist and word support to Hindi inverse text normalization, with test cases

Signed-off-by: Tarushi V <[email protected]>
NVIDIA · Nov 6, 2024 · f4d88c4
1 parent 34e1087
commit f4d88c4
Show file tree

Hide file tree

Showing 11 changed files with 122 additions and 9 deletions.
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv
@@ -0,0 +1,13 @@
+१/४	पाव
+१/२	आधा
+३/४	पौन
+१:३०	डेढ़ बजे
+२:३०	ढाई बजे
+१.५	डेढ़
+२.५	ढाई
+कु.	कुमारी
+स्मि.	श्रीमती
+श्री.	श्री
+श्री.	श्रीमान
+मा.	मास्टर
+डॉ.	डॉक्टर
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py
@@ -34,8 +34,8 @@
 from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
-from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
 
 
 class ClassifyFst(GraphFst):
@@ -83,7 +83,7 @@ def __init__(
             money = MoneyFst(cardinal, decimal)
             money_graph = money.fst
             punct_graph = PunctuationFst().fst
-            # whitelist_graph = WhiteListFst(input_file=whitelist).fst
+            whitelist_graph = WhiteListFst().fst
             word_graph = WordFst().fst
 
             classify = (
@@ -96,7 +96,7 @@ def __init__(
                 | pynutil.add_weight(measure_graph, 1.1)
                 | pynutil.add_weight(money_graph, 1.1)
                 | pynutil.add_weight(word_graph, 100)
-                # |  pynutil.add_weight(whitelist_graph, 1.01)
+                | pynutil.add_weight(whitelist_graph, 1.01)
             )
 
             punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")

diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py
@@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None):
         super().__init__(name="whitelist", kind="classify")
 
         if input_file is None:
-            input_file = get_abs_path("data/whitelist.tsv")
+            input_file = get_abs_path("data/whitelist/whitelist.tsv")
 
         if not os.path.exists(input_file):
             raise ValueError(f"Whitelist file {input_file} not found")

diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py
@@ -23,6 +23,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
+from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst
 
 
 class VerbalizeFst(GraphFst):
@@ -45,11 +46,13 @@ def __init__(self):
         measure_graph = MeasureFst(cardinal, decimal).fst
         money_graph = MoneyFst(cardinal, decimal).fst
 
+        word_graph = WordFst().fst
         whitelist_graph = WhiteListFst().fst
 
         graph = (
             cardinal_graph
             | whitelist_graph
+            | word_graph
             | ordinal_graph
             | decimal_graph
             | fraction_graph

diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt
@@ -0,0 +1,12 @@
+डेढ़ बजे~१:३०
+ढाई बजे~२:३०
+मास्टर निखिल तनिष~मा. निखिल तनिष
+पाव~१/४
+श्रीमती ज्योत्सना~स्मि. ज्योत्सना
+डॉक्टर~डॉ.
+आधा कप चाय~१/२ कप चाय
+श्रीमान भारत कुमार~श्री. भारत कुमार
+डॉक्टर प्रशांत~डॉ. प्रशांत
+डेढ़~१.५
+कुमारी~कु.
+ढाई~२.५
diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt
@@ -0,0 +1,15 @@
+नींद~नींद
+याहू!~याहू!
+-~-
+आआआ~आआआ
+आकाशगंगा~आकाशगंगा
+लटरपटर~लटरपटर
+कच्चा-पक्का~कच्चा-पक्का
+गुब्बारा~गुब्बारा
+चिट्ठी~चिट्ठी
+ढूंढना~ढूंढना
+लोहे का!~लोहे का!
+टाटा~टाटा
+~
+झ~झ
+संगीत~संगीत
diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh
@@ -63,6 +63,15 @@ testITNMoney() {
   runtest $input
 }
 
+testITNWord() {
+  input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt
+  runtest $input
+}
+
+testITNWhiteList() {
+  input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt
+  runtest $input
+}
 
 # Load shUnit2
 . $PROJECT_DIR/../shunit2/shunit2
diff --git a/tests/nemo_text_processing/hi/test_whitelist.py b/tests/nemo_text_processing/hi/test_whitelist.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
+
+from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file
+
+
+class TestWhiteList:
+    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/hi/test_word.py b/tests/nemo_text_processing/hi/test_word.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
+
+from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file
+
+
+class TestWord:
+    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected