diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py index 96429a5cb..e0c3f9894 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/text_normalization/hi/taggers/time.py @@ -24,8 +24,9 @@ class TimeFst(GraphFst): """ Finite state transducer for classifying time, e.g. - १२:३० -> time { hours: "बारह" minutes: "तीस" } + १२:३०:३० -> time { hours: "बारह" minutes: "तीस" seconds: "तीस" } १:४० -> time { hours: "एक" minutes: "चालीस" } + १:०० -> time { hours: "एक" } Args: time: GraphFst @@ -47,14 +48,11 @@ def __init__(self): #hour minute graph_hm = self.hours + delete_colon + insert_space + self.minutes + + #hour + graph_h = self.hours + delete_colon + pynutil.delete("००") - final_graph = graph_hms | graph_hm + final_graph = graph_hms | graph_hm | graph_h final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() - - -#input_text = "१२:१०:१०" -input_text = "७:४०" -output = apply_fst(input_text, TimeFst().fst) -print(output) \ No newline at end of file + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/time.py b/nemo_text_processing/text_normalization/hi/verbalizers/time.py new file mode 100644 index 000000000..319cea632 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/verbalizers/time.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
from nemo_text_processing.text_normalization.hi.utils import get_abs_path, apply_fst


class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time, e.g.
        time { hours: "बारह" minutes: "दस" seconds: "दस" } -> बारह बजकर दस मिनट दस सेकंड
        time { hours: "सात" minutes: "चालीस" } -> सात बजकर चालीस मिनट
        time { hours: "दस" } -> दस बजे

    The constructor takes no arguments. After construction:
        graph: union of the three accepted layouts (hours+minutes+seconds,
            hours+minutes, hours only) before the token wrapper is removed
        fst: the optimized graph with the token wrapper removed
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")

        def quoted_field(label):
            # Strips `<label>: "` and the closing quote, keeps the non-empty
            # quoted value, and appends a single space after it.
            return (
                pynutil.delete(label + ": \"")
                + pynini.closure(NEMO_NOT_QUOTE, 1)
                + pynutil.delete("\"")
                + insert_space
            )

        hour = quoted_field("hours")
        minute = quoted_field("minutes")
        second = quoted_field("seconds")

        # Connective / unit words that exist only in the spoken form.
        insert_bajkar = pynutil.insert("बजकर")
        insert_minute = pynutil.insert("मिनट")
        insert_second = pynutil.insert("सेकंड")
        insert_baje = pynutil.insert("बजे")

        # NOTE(review): delete_space is imported from graph_utils; presumably it
        # consumes the whitespace separating serialized fields -- confirm there.

        # hours + minutes + seconds -> "<h> बजकर <m> मिनट <s> सेकंड"
        graph_hms = (
            hour
            + delete_space
            + insert_bajkar
            + insert_space
            + minute
            + delete_space
            + insert_minute
            + insert_space
            + second
            + delete_space
            + insert_second
        )

        # hours + minutes -> "<h> बजकर <m> मिनट"
        graph_hm = hour + delete_space + insert_bajkar + insert_space + minute + delete_space + insert_minute

        # hours only -> "<h> बजे"
        graph_h = hour + delete_space + insert_baje

        self.graph = graph_hms | graph_hm | graph_h

        # Remove the token wrapper around the fields and optimize the result.
        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()
a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..1b4a04f70 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt @@ -0,0 +1,18 @@ +१२:१०:१०~बारह बजकर दस मिनट दस सेकंड +५:१२:०१~पाँच बजकर बारह मिनट एक सेकंड +३:१८:४३~तीन बजकर अठारह मिनट तैंतालीस सेकंड +२:१६~दो बजकर सोलह मिनट +७:२१~सात बजकर इक्कीस मिनट +११:५७~ग्यारह बजकर सत्तावन मिनट +८:००~आठ बजे +४:००~चार बजे +९:००~नौ बजे +सुबह के ५:२०:१२~सुबह के पाँच बजकर बीस मिनट बारह सेकंड +सुबह के ६:३९~सुबह के छः बजकर उनतालीस मिनट +सुबह के २:००~सुबह के दो बजे +दोपहर के ३:५९:३६~दोपहर के तीन बजकर उनसठ मिनट छत्तीस सेकंड +दोपहर के १:३६~दोपहर के एक बजकर छत्तीस मिनट +दोपहर के ३:००~दोपहर के तीन बजे +रात के १०:४८:५०~रात के दस बजकर अड़तालीस मिनट पचास सेकंड +रात के ११:५०~रात के ग्यारह बजकर पचास मिनट +रात के ८:००~रात के आठ बजे diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 8ff360890..3a5a815e6 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -82,10 +82,10 @@ testTNFraction() { # runtest $input #} -#testTNTime() { -# input=$PROJECT_DIR/en/data_text_normalization/test_cases_time.txt -# runtest $input -#} +testTNTime() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_time.txt + runtest $input +} #testTNMeasure() { # input=$PROJECT_DIR/en/data_text_normalization/test_cases_measure.txt diff --git a/tests/nemo_text_processing/hi/test_time.py b/tests/nemo_text_processing/hi/test_time.py new file mode 100644 index 000000000..1a3b6535d --- /dev/null +++ b/tests/nemo_text_processing/hi/test_time.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
from parameterized import parameterized

from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file


class TestTime:
    """Checks Hindi time normalization against the cases in test_cases_time.txt."""

    # Built once at class-definition time and shared by every generated case.
    normalizer = Normalizer(input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_time.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize one written input and compare it with the expected spoken form.

        Surrounding whitespace is ignored on both sides of the comparison.
        """
        actual = self.normalizer.normalize(test_input, verbose=False).strip()
        wanted = expected.strip()
        assert actual == wanted