Skip to content

Commit

Permalink
Addition of whitelist and word
Browse files Browse the repository at this point in the history
Signed-off-by: Tarushi V <[email protected]>
  • Loading branch information
tarushi2k2 committed Nov 6, 2024
1 parent 34e1087 commit f4d88c4
Show file tree
Hide file tree
Showing 11 changed files with 122 additions and 9 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
१/४ पाव
१/२ आधा
३/४ पौन
१:३० डेढ़ बजे
२:३० ढाई बजे
१.५ डेढ़
२.५ ढाई
कु. कुमारी
स्मि. श्रीमती
श्री. श्री
श्री. श्रीमान
मा. मास्टर
डॉ. डॉक्टर

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst


class ClassifyFst(GraphFst):
Expand Down Expand Up @@ -83,7 +83,7 @@ def __init__(
money = MoneyFst(cardinal, decimal)
money_graph = money.fst
punct_graph = PunctuationFst().fst
# whitelist_graph = WhiteListFst(input_file=whitelist).fst
whitelist_graph = WhiteListFst().fst
word_graph = WordFst().fst

classify = (
Expand All @@ -96,7 +96,7 @@ def __init__(
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
# | pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(whitelist_graph, 1.01)
)

punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None):
super().__init__(name="whitelist", kind="classify")

if input_file is None:
input_file = get_abs_path("data/whitelist.tsv")
input_file = get_abs_path("data/whitelist/whitelist.tsv")

if not os.path.exists(input_file):
raise ValueError(f"Whitelist file {input_file} not found")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst


class VerbalizeFst(GraphFst):
Expand All @@ -45,11 +46,13 @@ def __init__(self):
measure_graph = MeasureFst(cardinal, decimal).fst
money_graph = MoneyFst(cardinal, decimal).fst

word_graph = WordFst().fst
whitelist_graph = WhiteListFst().fst

graph = (
cardinal_graph
| whitelist_graph
| word_graph
| ordinal_graph
| decimal_graph
| fraction_graph
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
डेढ़ बजे~१:३०
ढाई बजे~२:३०
मास्टर निखिल तनिष~मा. निखिल तनिष
पाव~१/४
श्रीमती ज्योत्सना~स्मि. ज्योत्सना
डॉक्टर~डॉ.
आधा कप चाय~१/२ कप चाय
श्रीमान भारत कुमार~श्री. भारत कुमार
डॉक्टर प्रशांत~डॉ. प्रशांत
डेढ़~१.५
कुमारी~कु.
ढाई~२.५
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
नींद~नींद
याहू!~याहू!
-~-
आआआ~आआआ
आकाशगंगा~आकाशगंगा
लटरपटर~लटरपटर
कच्चा-पक्का~कच्चा-पक्का
गुब्बारा~गुब्बारा
चिट्ठी~चिट्ठी
ढूंढना~ढूंढना
लोहे का!~लोहे का!
टाटा~टाटा
~
झ~झ
संगीत~संगीत
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ testITNMoney() {
runtest $input
}

# Runs `runtest` on the Hindi inverse-text-normalization word test cases
# (hi/data_inverse_text_normalization/test_cases_word.txt under $PROJECT_DIR).
testITNWord() {
  # Quote both expansions so a PROJECT_DIR containing whitespace or glob
  # characters still reaches runtest as exactly one argument.
  input="$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt"
  runtest "$input"
}

# Runs `runtest` on the Hindi inverse-text-normalization whitelist test cases
# (hi/data_inverse_text_normalization/test_cases_whitelist.txt under $PROJECT_DIR).
testITNWhiteList() {
  # Quote both expansions so a PROJECT_DIR containing whitespace or glob
  # characters still reaches runtest as exactly one argument.
  input="$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt"
  runtest "$input"
}

# Load shUnit2
. $PROJECT_DIR/../shunit2/shunit2
33 changes: 33 additions & 0 deletions tests/nemo_text_processing/hi/test_whitelist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pytest
from parameterized import parameterized

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file


class TestWhiteList:
    """Hindi inverse-text-normalization tests driven by the whitelist test-case file."""

    # Created once when the class body executes and shared by every generated case.
    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and require an exact match with ``expected``."""
        normalizer = self.inverse_normalizer
        assert normalizer.inverse_normalize(test_input, verbose=False) == expected
33 changes: 33 additions & 0 deletions tests/nemo_text_processing/hi/test_word.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pytest
from parameterized import parameterized

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file


class TestWord:
    """Hindi inverse-text-normalization tests driven by the plain-word test-case file."""

    # Created once when the class body executes and shared by every generated case.
    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and require an exact match with ``expected``."""
        normalizer = self.inverse_normalizer
        assert normalizer.inverse_normalize(test_input, verbose=False) == expected

0 comments on commit f4d88c4

Please sign in to comment.