Skip to content

Commit

Permalink
Hindi TN support Cardinal,Decimal,Fraction,Date,Time
Browse files Browse the repository at this point in the history
Signed-off-by: Ankit Narwade <[email protected]>
  • Loading branch information
ankitnv committed Sep 2, 2024
1 parent 0212f36 commit 637b345
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 30 deletions.
46 changes: 20 additions & 26 deletions nemo_text_processing/text_normalization/hi/taggers/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,19 @@

import pynini
from pynini.lib import pynutil, rewrite

from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space
from nemo_text_processing.text_normalization.hi.utils import apply_fst, get_abs_path
from nemo_text_processing.text_normalization.hi.utils import get_abs_path, apply_fst

hours_graph = pynini.string_file(get_abs_path("data/time/hours.tsv"))
minutes_graph = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
seconds_graph = pynini.string_file(get_abs_path("data/time/seconds.tsv"))


class TimeFst(GraphFst):
"""
Finite state transducer for classifying time, e.g.
१२:३० -> time { hours: "बारह" minutes: "तीस" }
१२:३०:३० -> time { hours: "बारह" minutes: "तीस" seconds: "तीस" }
१:४० -> time { hours: "एक" minutes: "चालीस" }
१:०० -> time { hours: "एक" }
Args:
time: GraphFst
Expand All @@ -39,26 +38,21 @@ def __init__(self):
super().__init__(name="time", kind="classify")

delete_colon = pynutil.delete(":")

self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ")
self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ")
self.seconds = pynutil.insert("seconds: \"") + seconds_graph + pynutil.insert("\" ")

# hour minute seconds
graph_hms = (
self.hours + delete_colon + insert_space + self.minutes + delete_colon + insert_space + self.seconds
)

# hour minute
graph_hm = self.hours + delete_colon + insert_space + self.minutes

final_graph = graph_hms | graph_hm


self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ")
self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ")
self.seconds = pynutil.insert("seconds: \"") + seconds_graph + pynutil.insert("\" ")

#hour minute seconds
graph_hms = self.hours + delete_colon + insert_space + self.minutes + delete_colon + insert_space + self.seconds

#hour minute
graph_hm = self.hours + delete_colon + insert_space + self.minutes

#hour
graph_h = self.hours + delete_colon + pynutil.delete("००")

final_graph = graph_hms | graph_hm | graph_h

final_graph = self.add_tokens(final_graph)
self.fst = final_graph.optimize()


# input_text = "१२:१०:१०"
input_text = "७:४०"
output = apply_fst(input_text, TimeFst().fst)
print(output)
self.fst = final_graph.optimize()
60 changes: 60 additions & 0 deletions nemo_text_processing/text_normalization/hi/verbalizers/time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil
from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
from nemo_text_processing.text_normalization.hi.utils import get_abs_path, apply_fst

class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time, e.g.
        time { hours: "बारह" minutes: "दस" seconds: "दस" } -> बारह बजकर दस मिनट दस सेकंड
        time { hours: "सात" minutes: "चालीस" } -> सात बजकर चालीस मिनट
        time { hours: "दस" } -> दस बजे

    The constructor takes no arguments. Three layouts are accepted:
    hours + minutes + seconds, hours + minutes, and hours only. Their union
    is stored un-wrapped on ``self.graph``; ``self.fst`` is that union passed
    through ``self.delete_tokens`` and optimized.
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")

        def spoken_field(label):
            # Deletes the `label: "` prefix and the closing quote, keeps the
            # non-empty quoted text, then applies `insert_space`.
            opening = pynutil.delete(label + ": \"")
            closing = pynutil.delete("\"")
            return opening + pynini.closure(NEMO_NOT_QUOTE, 1) + closing + insert_space

        hours_part = spoken_field("hours")
        minutes_part = spoken_field("minutes")
        seconds_part = spoken_field("seconds")

        # Words inserted into the output that are not present in the tagged input.
        bajkar_word = pynutil.insert("बजकर")
        baje_word = pynutil.insert("बजे")
        minute_word = pynutil.insert("मिनट")
        second_word = pynutil.insert("सेकंड")

        # hour minute
        graph_hm = (
            hours_part
            + delete_space
            + bajkar_word
            + insert_space
            + minutes_part
            + delete_space
            + minute_word
        )

        # hour minute second: the hour-minute layout followed by the seconds field
        graph_hms = graph_hm + insert_space + seconds_part + delete_space + second_word

        # hour
        graph_h = hours_part + delete_space + baje_word

        self.graph = graph_hms | graph_hm | graph_h

        self.fst = self.delete_tokens(self.graph).optimize()
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
१२:१०:१०~बारह बजकर दस मिनट दस सेकंड
५:१२:०१~पाँच बजकर बारह मिनट एक सेकंड
३:१८:४३~तीन बजकर अठारह मिनट तैंतालीस सेकंड
२:१६~दो बजकर सोलह मिनट
७:२१~सात बजकर इक्कीस मिनट
११:५७~ग्यारह बजकर सत्तावन मिनट
८:००~आठ बजे
४:००~चार बजे
९:००~नौ बजे
सुबह के ५:२०:१२~सुबह के पाँच बजकर बीस मिनट बारह सेकंड
सुबह के ६:३९~सुबह के छः बजकर उनतालीस मिनट
सुबह के २:००~सुबह के दो बजे
दोपहर के ३:५९:३६~दोपहर के तीन बजकर उनसठ मिनट छत्तीस सेकंड
दोपहर के १:३६~दोपहर के एक बजकर छत्तीस मिनट
दोपहर के ३:००~दोपहर के तीन बजे
रात के १०:४८:५०~रात के दस बजकर अड़तालीस मिनट पचास सेकंड
रात के ११:५०~रात के ग्यारह बजकर पचास मिनट
रात के ८:००~रात के आठ बजे
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ testTNFraction() {
# runtest $input
#}

#testTNTime() {
# input=$PROJECT_DIR/en/data_text_normalization/test_cases_time.txt
# runtest $input
#}
# Runs the Hindi time text-normalization test cases through runtest.
testTNTime() {
  # Quote the path so a PROJECT_DIR containing whitespace or glob characters
  # reaches runtest as a single, unexpanded argument.
  input="$PROJECT_DIR/hi/data_text_normalization/test_cases_time.txt"
  runtest "$input"
}

#testTNMeasure() {
# input=$PROJECT_DIR/en/data_text_normalization/test_cases_measure.txt
Expand Down
31 changes: 31 additions & 0 deletions tests/nemo_text_processing/hi/test_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
from parameterized import parameterized
from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file


class TestTime:
    """Checks Hindi time normalization against hi/data_text_normalization/test_cases_time.txt."""

    # Created once when the class body is evaluated and reused by every generated case.
    normalizer = Normalizer(input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_time.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        # Surrounding whitespace is ignored on both sides of the comparison.
        wanted = expected.strip()
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized.strip() == wanted

0 comments on commit 637b345

Please sign in to comment.