diff --git a/Jenkinsfile b/Jenkinsfile index 689c2933d..2a52d09ee 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-29-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } @@ -319,11 +319,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' } } - // stage('L0: ZH ITN grammars') { - // steps { - // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="二零零二年一月二十八日 " --cache_dir ${ZH_TN_CACHE}' - // } - // } + stage('L0: ZH ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="二零零二年一月二十八日 " --cache_dir ${ZH_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index d47cb2140..4618678c0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -106,6 +106,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'zh': # Mandarin + from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) self.tagger = ClassifyFst( 
cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -150,7 +155,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'ar', 'es_en', 'zh'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/zh/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/__init__.py new file mode 100644 index 000000000..ab4301382 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/day.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/day.tsv new file mode 100644 index 000000000..fd3e3ddab --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/day.tsv @@ -0,0 +1,74 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +壹 1 +貳 2 +參 3 +肆 4 +伍 5 +陸 6 +柒 7 +捌 8 +玖 9 +幺 1 +两 2 +兩 2 +拾 10 +拾壹 11 +拾貳 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陸 16 +拾柒 17 +拾捌 18 +拾玖 19 +貳拾 20 +貳拾壹 21 +貳拾貳 22 +貳拾叁 23 +貳拾肆 24 +貳拾伍 25 +貳拾陸 26 +貳拾柒 27 +貳拾捌 28 +貳拾玖 29 +叁拾 30 +叁拾壹 31 +壹 1 +拾壹 11 +贰拾壹 21 +贰 2 +陆 6 +拾贰 12 +拾陆 16 +贰拾贰 22 +贰拾陆 26 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/months.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/months.tsv new file mode 100644 index 000000000..5b2f33539 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/months.tsv @@ -0,0 +1,49 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +一十 10 +零一 1 +零二 2 +零三 3 +零四 4 +零五 5 +零六 6 +零七 7 +零八 8 +零九 9 +壹 1 +贰 2 +叁 3 +肆 4 +伍 5 +陆 6 +柒 7 +捌 8 +玖 9 +拾 10 +拾壹 11 +拾贰 12 +壹拾 10 +零壹 1 +零贰 2 +零叁 3 +零肆 4 +零伍 5 +零陆 6 +零柒 7 +零捌 8 +零玖 9 +貳 2 +零貳 2 +陸 6 +零陸 6 +拾貳 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv new file mode 100644 index 000000000..9761245b8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv @@ -0,0 +1,75 @@ +美元 US$ +欧元 € +歐元 € +英镑 £ +英鎊 £ +加拿大元 CAD$ +加拿大币 CAD$ +加拿大幣 CAD$ +加元 CAD$ +加币 CAD$ +加幣 CAD$ +瑞士法郎 Fr +法郎 ₣ +圆 ¥ +圓 ¥ +瑞典克朗 Kr +墨西哥比索 NXN$ +新西兰元 NZD$ +新西蘭元 NZD$ +新加坡币 SGD$ +新加坡幣 SGD$ +新加坡元 SGD$ +港元 HKD$ +港币 HKD$ +港幣 HKD$ +挪威克朗 NOKkr +韩元 ₩ +韓元 ₩ +韩币 ₩ +韓幣 ₩ +土耳其里拉 TRY₺ +印度卢布 ₹ +印度盧布 ₹ +印度卢比 ₹ +印度盧比 ₹ +俄罗斯卢布 ₽ +俄羅斯盧布 ₽ +俄罗斯卢比 ₽ +俄羅斯盧比 ₽ +巴西雷亚尔 BRLR$ +巴西雷亞爾 BRLR$ +南非兰特 R +南非蘭特 R +丹麦克朗 DKKkr +丹麥克朗 DKKkr +波兰兹罗提 zł +波蘭兹儸提 zł +新台币 TWDNT$ +新臺幣 TWDNT$ +泰铢 ฿ +泰銖 ฿ +马来西亚林吉特 RM +馬來西亞林吉特 RM +印尼盾 Rp +匈牙利福林 Ft +捷克克朗 Kč +以色列新谢克尔 ₪ +以色列新謝克爾 ₪ +智利披索 CLP$ +菲律宾披索 ₱ +菲律賓披索 ₱ +阿联酋迪拉姆 د.إ +阿聯酋迪拉姆 د.إ +哥伦比亚披索 COL$ +哥倫比亞披索 COL$ +马来西亚令吉 RM +馬來西亞令吉 RM +罗马尼亚列伊 L +羅馬尼亞列伊 L +日元 JPY¥ +日圆 JPY¥ +日圓 JPY¥ +人民币 ¥ +人民幣 ¥ +元 ¥ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor.tsv new file mode 100644 index 000000000..d0451613a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor.tsv @@ -0,0 +1,9 @@ +美分 US$ +欧分 € +便士 £ +加拿大分 CAD$ +生丁 ₣ +瑞典欧尔 KrOre +分 NXN$ +新西兰仙 NZD$ +挪威欧尔 NOKOre diff --git 
a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent.tsv new file mode 100644 index 000000000..dd65818fd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent.tsv @@ -0,0 +1 @@ +分 ¥ diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent.tsv new file mode 100644 index 000000000..2f0e91476 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent.tsv @@ -0,0 +1,2 @@ +毛 ¥ +角 ¥ diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv new file mode 100644 index 000000000..d6bb500ae --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv @@ -0,0 +1,22 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +壹 1 +贰 2 +叁 3 +肆 4 +伍 5 +陆 6 +柒 7 +捌 8 +玖 9 +貳 2 +陸 6 +两 2 +兩 2 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/ties-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/ties-nano.tsv new file mode 100644 index 000000000..d4ed9d9ef --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/ties-nano.tsv @@ -0,0 +1,18 @@ +二十 2 +三十 3 +四十 4 +五十 5 +六十 6 +七十 7 +八十 8 +九十 9 +贰拾 2 +叁拾 3 +肆拾 4 +伍拾 5 +陆拾 6 +柒拾 7 +捌拾 8 +玖拾 9 +貳拾 2 +陸拾 6 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero.tsv new file mode 100644 index 000000000..4b4120706 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero.tsv @@ -0,0 +1 @@ +零 0 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours.tsv new file mode 100644 index 000000000..82a20bfea --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours.tsv @@ -0,0 +1,55 @@ +一 01 +二 02 +三 03 +四 04 +五 05 +六 06 +七 07 +八 08 +九 09 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +壹 01 +貳 02 +參 03 +肆 04 +伍 05 +陸 06 +柒 07 +捌 08 +玖 09 +拾 10 +拾壹 11 +拾貳 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陸 16 +拾柒 17 +拾捌 18 +拾玖 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +兩 02 +两 02 +贰 02 +陆 06 +拾贰 12 +拾陆 16 +贰拾贰 22 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv new file mode 100644 index 000000000..7fd465fa1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv @@ -0,0 +1,61 @@ +一 1 +二 2 +两 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +三十二 32 +三十三 33 +三十四 34 +三十五 35 +三十六 36 +三十七 37 +三十八 38 +三十九 39 +四十 40 +四十一 41 +四十二 42 +四十三 43 +四十四 44 +四十五 45 +四十六 46 +四十七 47 +四十八 48 +四十九 49 +五十 50 +五十一 51 +五十二 52 +五十三 53 +五十四 54 +五十五 55 +五十六 56 +五十七 57 +五十八 58 +五十九 59 +六十 60 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes.tsv 
b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes.tsv new file mode 100644 index 000000000..081e2226b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes.tsv @@ -0,0 +1,164 @@ +一 01 +二 02 +三 03 +四 04 +五 05 +六 06 +七 07 +八 08 +九 09 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +三十二 32 +三十三 33 +三十四 34 +三十五 35 +三十六 36 +三十七 37 +三十八 38 +三十九 39 +四十 40 +四十一 41 +四十二 42 +四十三 43 +四十四 44 +四十五 45 +四十六 46 +四十七 47 +四十八 48 +四十九 49 +五十 50 +五十一 51 +五十二 52 +五十三 53 +五十四 54 +五十五 55 +五十六 56 +五十七 57 +五十八 58 +五十九 59 +六十 60 +一十 10 +零一 01 +零二 02 +零三 03 +零四 04 +零五 05 +零六 06 +零七 07 +零八 08 +零九 09 +壹 01 +贰 02 +叁 03 +肆 04 +伍 05 +陆 06 +柒 07 +捌 08 +玖 09 +拾 10 +拾壹 11 +拾贰 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陆 16 +拾柒 17 +拾捌 18 +拾玖 19 +贰拾 20 +贰拾壹 21 +贰拾贰 22 +贰拾叁 23 +贰拾肆 24 +贰拾伍 25 +贰拾陆 26 +贰拾柒 27 +贰拾捌 28 +贰拾玖 29 +叁拾 30 +叁拾壹 31 +叁拾贰 32 +叁拾叁 33 +叁拾肆 34 +叁拾伍 35 +叁拾陆 36 +叁拾柒 37 +叁拾捌 38 +叁拾玖 39 +肆拾 40 +肆拾壹 41 +肆拾贰 42 +肆拾叁 43 +肆拾肆 44 +肆拾伍 45 +肆拾陆 46 +肆拾柒 47 +肆拾捌 48 +肆拾玖 49 +伍拾 50 +伍拾壹 51 +伍拾贰 52 +伍拾叁 53 +伍拾肆 54 +伍拾伍 55 +伍拾陆 56 +伍拾柒 57 +伍拾捌 58 +伍拾玖 59 +陆拾 60 +壹拾 10 +零壹 01 +零贰 02 +零叁 03 +零肆 04 +零伍 05 +零陆 06 +零柒 07 +零捌 08 +零玖 09 +貳 02 +零貳 02 +陸 06 +零陸 06 +拾貳 12 +貳拾貳 22 +貳拾 20 +貳拾壹 21 +貳拾叁 23 +貳拾肆 24 +貳拾伍 25 +貳拾陸 26 +貳拾柒 27 +貳拾捌 28 +貳拾玖 29 +叁拾貳 32 +肆拾貳 42 +伍拾貳 52 +拾陸 16 +叁拾陸 36 +肆拾陸 46 +伍拾陸 56 +陸拾 60 +零 00 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_quarters.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_quarters.tsv new file mode 100644 index 000000000..099f1fedc --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_quarters.tsv @@ -0,0 +1,9 @@ +一 1 +二 2 +三 3 +四 4 +壹 1 +贰 2 +叁 3 +肆 4 +貳 2 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_seconds.tsv
b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_seconds.tsv new file mode 100644 index 000000000..fa4fc9dd8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_seconds.tsv @@ -0,0 +1,144 @@ +一 01 +二 02 +三 03 +四 04 +五 05 +六 06 +七 07 +八 08 +九 09 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +三十二 32 +三十三 33 +三十四 34 +三十五 35 +三十六 36 +三十七 37 +三十八 38 +三十九 39 +四十 40 +四十一 41 +四十二 42 +四十三 43 +四十四 44 +四十五 45 +四十六 46 +四十七 47 +四十八 48 +四十九 49 +五十 50 +五十一 51 +五十二 52 +五十三 53 +五十四 54 +五十五 55 +五十六 56 +五十七 57 +五十八 58 +五十九 59 +六十 60 +壹 01 +貳 02 +叁 03 +肆 04 +伍 05 +陆 06 +柒 07 +捌 08 +玖 09 +拾 10 +拾壹 11 +拾贰 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陆 16 +拾柒 17 +拾捌 18 +拾玖 19 +贰拾 20 +贰拾壹 21 +贰拾贰 22 +贰拾叁 23 +贰拾肆 24 +贰拾伍 25 +贰拾陆 26 +贰拾柒 27 +贰拾捌 28 +贰拾玖 29 +叁拾 30 +叁拾壹 31 +叁拾贰 32 +叁拾叁 33 +叁拾肆 34 +叁拾伍 35 +叁拾陆 36 +叁拾柒 37 +叁拾捌 38 +叁拾玖 39 +肆拾 40 +肆拾壹 41 +肆拾贰 42 +肆拾叁 43 +肆拾肆 44 +肆拾伍 45 +肆拾陆 46 +肆拾柒 47 +肆拾捌 48 +肆拾玖 49 +伍拾 50 +伍拾壹 51 +伍拾贰 52 +伍拾叁 53 +伍拾肆 54 +伍拾伍 55 +伍拾陆 56 +伍拾柒 57 +伍拾捌 58 +伍拾玖 59 +陆拾 60 +貳 02 +陸 06 +兩 02 +两 02 +拾貳 12 +拾陸 16 +貳拾 20 +貳拾壹 21 +貳拾貳 22 +貳拾叁 23 +貳拾肆 24 +貳拾伍 25 +貳拾陸 26 +貳拾柒 27 +貳拾捌 28 +貳拾玖 29 +叁拾貳 32 +叁拾陸 36 +肆拾貳 42 +肆拾陸 46 +伍拾貳 52 +伍拾陸 56 +陸拾 60 +零 00 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/whitelist.tsv new file mode 100644 index 000000000..33cd63758 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/whitelist.tsv @@ -0,0 +1,21 @@ +人力资源 HR +自动取款机 ATM +人力资源 HR +首席执行官 CEO +美国研究生入学考试 GRE +研究生管理专业入学考试 GMAT +全球定位系统 GPS +刷卡机 POS机 +数位多功能光碟 DVD +镭射唱片 CD +通用串行总线 USB +统一资源定位符 URL +虚拟专用网络 VPN +网络互联协议 IP +脱氧核糖核酸 DNA +核糖核酸 RNA +平均学分绩点 GPA +发光二极管 LED +可移植文档格式 PDF +社会性网络服务 SNS +博士 PhD diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py
b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py new file mode 100644 index 000000000..13e8ab6d0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -0,0 +1,182 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +NEMO_CHAR = utf8.VALID_UTF8_CHAR +NEMO_DIGIT = byte.DIGIT +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"')) 
+) + +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + print(f"Created {file_name}") + + +def convert_space(fst) -> "pynini.FstLike": + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + print(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = pynini.Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @
pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py new file mode 100644 index 000000000..b29fc5fb3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -0,0 +1,365 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst +from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path +from pynini.lib import pynutil + + +class CardinalFst(GraphFst): + def __init__(self): + """ + Fitite state transducer for classifying cardinals (e.g., 负五十 -> cardinal { negative: "-" integer: "50" }) + This class converts cardinals up to hundred millions (i.e., (10**10)) + Single unit digits are not converted (e.g., 五 -> 五) + Numbers less than 20 are not converted. + 二十 (2 characters/logograms) is kept as it is but 二十一 (3 characters/logograms) would become 21 + """ + super().__init__(name="cardinal", kind="classify") + + # number of digits to be processed + delete_hundreds = pynutil.delete("百") | pynutil.delete("佰") + delete_thousands = pynutil.delete("千") | pynutil.delete("仟") + closure_ten_thousands = pynini.accep("萬") | pynini.accep("万") + delete_ten_thousands = pynutil.delete("萬") | pynutil.delete("万") + closure_hundred_millions = pynini.accep("亿") | pynini.accep("億") + delete_hundred_millions = pynutil.delete("亿") | pynutil.delete("億") + + # data imported + zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) + ties = pynini.string_file(get_abs_path("data/numbers/ties-nano.tsv")) + + # grammar for digits + graph_digits = digits | pynutil.insert("0") + + # grammar for teens + ten = pynini.string_map([("十", "1"), ("拾", "1"), ("壹拾", "1"), ("一十", "1")]) + graph_teens = ten + graph_digits + + # grammar for tens, not the output for Cardinal grammar but for pure Arabic digits (used in other grammars) + graph_tens = (ties + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) + graph_all = graph_tens | graph_teens | pynutil.insert("00") + + # grammar for hundreds 百 + graph_hundreds_complex = ( + (graph_digits + delete_hundreds + graph_all) + | (graph_digits + 
delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits) + | (graph_digits + delete_hundreds + graph_teens) + ) + graph_hundreds = graph_hundreds_complex + graph_hundreds = graph_hundreds | pynutil.insert("000") + + # grammar for thousands 千 + graph_thousands_complex = ( + (graph_digits + delete_thousands + graph_hundreds_complex) + | (graph_digits + delete_thousands + pynini.cross(pynini.closure("零"), "0") + graph_all) + | (graph_digits + delete_thousands + pynini.cross(pynini.closure("零"), "00") + graph_digits) + ) + graph_thousands = graph_thousands_complex | pynutil.insert("000") + + # grammar for ten thousands 万 + graph_ten_thousands_simple = graph_digits + closure_ten_thousands + graph_ten_thousands_complex = ( + (graph_digits + delete_ten_thousands + graph_thousands_complex) + | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "0") + graph_hundreds_complex) + | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) + | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) + ) + graph_ten_thousands = graph_ten_thousands_simple | graph_ten_thousands_complex | pynutil.insert("00000") + + # grammmar for hundred thousands 十万 + graph_hundred_thousands_simple = graph_all + closure_ten_thousands + graph_hundred_thousands_complex = ( + (graph_all + delete_ten_thousands + graph_thousands_complex) + | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "0") + graph_hundreds_complex) + | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) + | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) + ) + graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert( + "000000" + ) + + # grammar for millions 百万 + graph_millions_simple = graph_hundreds_complex + closure_ten_thousands + graph_millions_complex = ( + 
(graph_hundreds_complex + delete_ten_thousands + graph_thousands_complex) + | ( + graph_hundreds_complex + + delete_ten_thousands + + pynini.cross(pynini.closure("零"), "0") + + graph_hundreds_complex + ) + | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) + | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) + ) + graph_millions = ( + pynutil.add_weight(graph_millions_simple, -1.0) | graph_millions_complex | pynutil.insert("0000000") + ) + + # grammar for ten millions 千万 + graph_ten_millions_simple = graph_thousands_complex + closure_ten_thousands + graph_ten_millions_complex = ( + (graph_thousands_complex + delete_ten_thousands + graph_thousands_complex) + | ( + graph_thousands_complex + + delete_ten_thousands + + pynini.cross(pynini.closure("零"), "0") + + graph_hundreds_complex + ) + | (graph_thousands_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) + | ( + graph_thousands_complex + + delete_ten_thousands + + pynini.cross(pynini.closure("零"), "000") + + graph_digits + ) + ) + graph_ten_millions = pynutil.add_weight(graph_ten_millions_simple, -1.0) | graph_ten_millions_complex + graph_ten_millions = graph_ten_millions | pynutil.insert("00000000") + + # grammar for hundred millions 亿 + graph_hundred_millions_simple = graph_digits + closure_hundred_millions + graph_hundred_millions_complex = ( + (graph_digits + delete_hundred_millions + graph_ten_millions_complex) + | ( + graph_digits + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0") + + graph_millions_complex + ) + | ( + graph_digits + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00") + + graph_hundred_thousands_complex + ) + | ( + graph_digits + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "000") + + graph_ten_thousands_complex + ) + | ( + graph_digits + + delete_hundred_millions + + 
pynini.cross(pynini.closure("零"), "0000") + + graph_thousands_complex + ) + | ( + graph_digits + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00000") + + graph_hundreds_complex + ) + | (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all) + | (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) + ) + graph_hundred_millions = ( + graph_hundred_millions_simple | graph_hundred_millions_complex | pynutil.insert("000000000") + ) + + # grammar for billions 十亿 + graph_billions_simple = graph_all + closure_hundred_millions + graph_billions_complex = ( + (graph_all + delete_hundred_millions + graph_ten_millions_complex) + | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0") + graph_millions_complex) + | ( + graph_all + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00") + + graph_hundred_thousands_complex + ) + | ( + graph_all + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "000") + + graph_ten_thousands_complex + ) + | ( + graph_all + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0000") + + graph_thousands_complex + ) + | ( + graph_all + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00000") + + graph_hundreds_complex + ) + | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all) + | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) + ) + graph_billions = graph_billions_simple | graph_billions_complex | pynutil.insert("0000000000") + + # grammar for ten billions 百亿 + graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions + graph_ten_billions_complex = ( + (graph_hundreds_complex + delete_hundred_millions + graph_ten_millions_complex) + | ( + graph_hundreds_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0") + + 
graph_millions_complex + ) + | ( + graph_hundreds_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00") + + graph_hundred_thousands_complex + ) + | ( + graph_hundreds_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "000") + + graph_ten_thousands_complex + ) + | ( + graph_hundreds_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0000") + + graph_thousands_complex + ) + | ( + graph_hundreds_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00000") + + graph_hundreds_complex + ) + | ( + graph_hundreds_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "000000") + + graph_all + ) + | ( + graph_hundreds_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0000000") + + graph_digits + ) + ) + graph_ten_billions = graph_ten_billions_simple | graph_ten_billions_complex | pynutil.insert("00000000000") + + # grammar for hundred billions 千亿 + graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions + graph_hundred_billions_complex = ( + (graph_thousands_complex + delete_hundred_millions + graph_ten_millions_complex) + | ( + graph_thousands_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0") + + graph_millions_complex + ) + | ( + graph_thousands_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00") + + graph_hundred_thousands_complex + ) + | ( + graph_thousands_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "000") + + graph_ten_thousands_complex + ) + | ( + graph_thousands_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0000") + + graph_thousands_complex + ) + | ( + graph_thousands_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "00000") + + graph_hundreds_complex + ) + | ( + graph_thousands_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "000000") + 
+ graph_all + ) + | ( + graph_thousands_complex + + delete_hundred_millions + + pynini.cross(pynini.closure("零"), "0000000") + + graph_digits + ) + ) + graph_hundred_billions = graph_hundred_billions_simple | graph_hundred_billions_complex + + # combining grammar; output for cardinal grammar + graph = pynini.union( + graph_hundred_billions, + graph_ten_billions, + graph_billions, + graph_hundred_millions, + graph_ten_millions, + graph_millions, + graph_hundred_thousands, + graph_ten_thousands, + graph_thousands, + graph_hundreds, + graph_all, + graph_teens, + graph_digits, + zero, + ) + + # combining grammar; output consists only arabic numbers + graph_just_cardinals = pynini.union( + graph_hundred_billions_complex, + graph_ten_billions_complex, + graph_billions_complex, + graph_hundred_millions_complex, + graph_ten_millions_complex, + graph_millions_complex, + graph_hundred_thousands_complex, + graph_ten_thousands_complex, + graph_thousands_complex, + graph_hundreds_complex, + graph_all, + graph_teens, + graph_digits, + zero, + ) + + # delete unnecessary leading zero + delete_leading_zeros = pynutil.delete(pynini.closure("0")) + stop_at_non_zero = pynini.difference(NEMO_DIGIT, "0") + rest_of_cardinal = pynini.closure(NEMO_DIGIT) | pynini.closure(NEMO_SIGMA) + + # output for cardinal grammar without leading zero + clean_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal + clean_cardinal = clean_cardinal | "0" + graph = graph @ clean_cardinal # output for regular cardinals + self.for_ordinals = graph # used for ordinal grammars + + # output for pure arabic number without leading zero + clean_just_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal + clean_just_cardinal = clean_just_cardinal | "0" + graph_just_cardinals = graph_just_cardinals @ clean_just_cardinal # output for other grammars + self.just_cardinals = graph_just_cardinals # used for other grammars + + # final grammar for cardinal output; tokenization + 
optional_minus_graph = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("负", '"-"'))) | ( + pynini.closure(pynutil.insert("negative: ") + pynini.cross("負", '"-"')) + ) + final_graph = optional_minus_graph + pynutil.insert('integer: "') + graph + pynutil.insert('"') + final_graph = self.add_tokens(final_graph) + self.fst = final_graph diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py new file mode 100644 index 000000000..55e77aeba --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py @@ -0,0 +1,89 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path +from pynini.lib import pynutil + + +class DateFst(GraphFst): + def __init__(self): + """ + Finite state transducer for classifying date + 1798年五月三十日 -> date { year: "1798" month: "5" day: "30" } + 五月三十日 -> date { month: "5" day: "30" } + 一六七二年 -> date { year: "1672" } + """ + super().__init__(name="date", kind="classify") + + digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) # imported for year-component + months = pynini.string_file(get_abs_path("data/date/months.tsv")) # imported for month-component + days = pynini.string_file(get_abs_path("data/date/day.tsv")) # imported for day-component + + # grammar for year + graph_year = ( + pynini.closure(digits) + + pynini.closure(pynini.cross("零", "0")) + + pynini.closure(digits) + + pynini.closure(pynini.cross("零", "0")) + + pynutil.delete("年") + ) + graph_year = pynutil.insert('year: "') + graph_year + pynutil.insert('"') + + # grammar for month + graph_month = pynutil.insert('month: "') + months + pynutil.delete("月") + pynutil.insert('"') + + # grammar for day + graph_day_suffix = pynini.accep("日") | pynini.accep("号") | pynini.accep("號") + graph_delete_day_suffix = pynutil.delete(graph_day_suffix) + graph_day = pynutil.insert('day: "') + days + graph_delete_day_suffix + pynutil.insert('"') + + # grammar for combinations of year+month, month+day, and year+month+day + graph_ymd = graph_year + pynutil.insert(" ") + graph_month + pynutil.insert(" ") + graph_day + graph_ym = graph_year + pynutil.insert(" ") + graph_month + graph_md = graph_month + pynutil.insert(" ") + graph_day + + # final grammar for standard date + graph_date = graph_ymd | graph_ym | graph_md | graph_year | graph_month | graph_day + # graph_date = graph_year | graph_month | graph_day + + # grammar for optional prefix ad or bc + graph_bc_prefix = 
pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1) + graph_bc = pynutil.delete(graph_bc_prefix) + + graph_ad_prefix = ( + pynini.closure("公元", 0, 1) + | pynini.closure("公元后", 0, 1) + pynini.closure("紀元", 0, 1) + | pynini.closure("纪元", 0, 1) + | pynini.closure("西元", 0, 1) + ) + graph_ad = pynutil.delete(graph_ad_prefix) + + graph_suffix_bc = ( + graph_bc + graph_date + pynutil.insert(' era: "') + pynutil.insert("B.C.") + pynutil.insert('"') + ) + graph_suffix_ad = ( + graph_ad + graph_date + pynutil.insert(' era: "') + pynutil.insert("A.D.") + pynutil.insert('"') + ) + + graph_era = graph_suffix_bc | graph_suffix_ad + + # grammar for standard date and with era + graph_date_final = graph_era | graph_date + + # graph_date_final = graph_date + + final_graph = self.add_tokens(graph_date_final) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py new file mode 100644 index 000000000..f334f2675 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path +from pynini.lib import pynutil + + +def get_quantity(decimal, cardinal): + suffix = pynini.union( + "万", + "十万", + "百万", + "千万", + "亿", + "十亿", + "百亿", + "千亿", + "萬", + "十萬", + "百萬", + "千萬", + "億", + "十億", + "百億", + "千億", + "拾萬", + "佰萬", + "仟萬", + "拾億", + "佰億", + "仟億", + "拾万", + "佰万", + "仟万", + "仟亿", + "佰亿", + "仟亿", + ) + numbers = cardinal + res = ( + pynutil.insert('integer_part: "') + + numbers + + pynutil.insert('"') + + pynutil.insert(' quantity: "') + + suffix + + pynutil.insert('"') + ) + res = res | decimal + pynutil.insert(' quantity: "') + suffix + pynutil.insert('"') + + return res + + +class DecimalFst(GraphFst): + def __init__(self, cardinal: GraphFst): + super().__init__(name="decimal", kind="classify") + + cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) + cardinal_before_decimal = cardinal.just_cardinals | (pynini.closure(pynini.cross("零", "0"), 0, 1)) + + delete_decimal = pynutil.delete("点") | pynutil.delete( + "點" + ) # delete decimal character, 'point' in english in 'one point two for 1.2' + + # grammar for integer part + graph_integer = ( + pynutil.insert('integer_part: "') + + (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 0, 1))) + + pynutil.insert('" ') + ) # tokenization on just numbers + graph_integer_or_none = graph_integer | pynutil.insert('integer_part: "0" ', weight=0.01) # integer or zero + + # grammar for fractional part + delete_zero = pynini.closure(pynini.cross("零", "0")) + graph_string_of_cardinals = cardinal_after_decimal + graph_string_of_cardinals = pynini.closure( + (pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1 + ) + graph_fractional = pynini.closure( + pynutil.insert('fractional_part: "') + graph_string_of_cardinals + 
pynutil.insert('"'), 1 + ) + + # grammar for decimal: integer+delete character+part after decimal point + graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1) + + # New Grammar added for Money + self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity( + graph_decimal_no_sign, cardinal.just_cardinals + ) + + graph_negative = pynini.cross("负", 'negative: "-" ') | pynini.cross("負", 'negative: "-" ') + graph_negative = pynini.closure(graph_negative, 0, 1) # captures only one "负" + + graph_decimal = graph_negative + graph_decimal_no_sign + graph_decimal = graph_decimal | (graph_negative + get_quantity(graph_decimal_no_sign, cardinal_before_decimal)) + self.final_graph_wo_negative = graph_decimal + + final_graph = self.add_tokens(graph_decimal) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py new file mode 100644 index 000000000..33fcd20a9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py @@ -0,0 +1,46 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst +from pynini.lib import pynutil + + +class FractionFst(GraphFst): + """ + Finite state transducer for classifying fraction + e.g. 二分之一 -> tokens { fraction { denominator: "2" numerator: "1"} } + e.g. 五又二分之一 -> tokens { fraction { integer_part: "1" denominator: "2" numerator: "1" } } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="fraction", kind="classify") + + graph_cardinal = cardinal.just_cardinals + integer_component = pynutil.insert('integer_part: "') + graph_cardinal + pynutil.insert('"') + denominator_component = ( + pynutil.insert('denominator: "') + graph_cardinal + pynutil.delete("分之") + pynutil.insert('"') + ) + numerator_component = pynutil.insert('numerator: "') + graph_cardinal + pynutil.insert('"') + + graph_only_fraction = denominator_component + pynutil.insert(" ") + numerator_component + graph_fraction_with_int = integer_component + pynutil.delete("又") + pynutil.insert(" ") + graph_only_fraction + + graph_fraction = graph_only_fraction | graph_fraction_with_int + + final_graph = self.add_tokens(graph_fraction) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py new file mode 100644 index 000000000..d0a24ab3b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path +from pynini.lib import pynutil + + +class MoneyFst(GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst): + super().__init__(name="money", kind="classify") + + # imports + major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) + minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) + digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) + graph_cardinal = cardinal.for_ordinals + graph_decimal = decimal.final_graph_wo_negative # + + # add leding zero to the number: 1 -> 01 + add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) # + graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit # + + # regular number and yuan part + graph_integer_component = pynutil.insert('integer_part: "') + graph_cardinal + pynutil.insert('"') + graph_fractional_component = ( + pynutil.insert('fractional_part: "') + graph_fractional_values + pynutil.insert('"') + ) + + # regular symbol part + graph_major_currency = pynutil.insert('currency: "') + major_currency + pynutil.insert('"') + graph_minor_currency = pynutil.insert('currency: "') + minor_currency + pynutil.insert('"') + + # regular combine number and symbol part + graph_only_major = graph_integer_component + pynutil.insert(" ") + graph_major_currency + graph_only_minor = 
graph_fractional_component + pynutil.insert(" ") + graph_minor_currency + graph_money = graph_only_major + pynutil.insert(" ") + graph_fractional_component + + # regular large money with decimals + graph_large_money = graph_decimal + pynutil.insert(" ") + graph_major_currency + + # final graph for regular currency + graph_regular_money = graph_only_major | graph_only_minor | graph_money | graph_large_money + + # yuan major plus minor + major_symbol = pynini.accep("块") | pynini.cross("塊", "块") + tencent = pynini.accep("毛") | pynini.accep("角",) + cent = pynini.accep("分") + graph_kuai = ( + graph_integer_component + + pynutil.insert(" ") + + pynutil.insert('currency_major: "') + + pynini.closure(major_symbol, 1, 1) + + pynutil.insert('"') + ) + graph_mao = ( + graph_integer_component + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + ) + graph_fen = ( + graph_integer_component + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + + graph_digits = pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + graph_kuaimao = ( + graph_kuai + + pynutil.insert(" ") + + graph_digits + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + ) + graph_kuaifen = ( + graph_kuai + + pynutil.insert(" ") + + graph_digits + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + graph_maofen = ( + pynutil.insert('fractional_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('fraction_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_min: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + 
) + + graph_kuaimaofen = ( + graph_kuai + + pynutil.insert(" ") + + pynutil.insert('fractional_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('fraction_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_min: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + + graph_mandarin = ( + graph_kuai | graph_mao | graph_fen | graph_kuaimao | graph_kuaifen | graph_maofen | graph_kuaimaofen + ) + + # combing both + graph_final = graph_regular_money | graph_mandarin + final = self.add_tokens(graph_final) + self.fst = final.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py new file mode 100644 index 000000000..47ffbdd36 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst +from pynini.lib import pynutil + + +class OrdinalFst(GraphFst): + def __init__(self, cardinal: GraphFst): + super().__init__(name="ordinal", kind="classify") + + graph_cardinals = cardinal.for_ordinals + mandarin_morpheme = pynini.accep("第") + graph_ordinal = mandarin_morpheme + graph_cardinals + graph_ordinal_final = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"') + graph_ordinal_final = self.add_tokens(graph_ordinal_final) + self.fst = graph_ordinal_final.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py new file mode 100644 index 000000000..4ca8eab9b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst +from pynini.lib import pynutil + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. 
a, -> tokens { name: "a" } tokens { name: "," } + """ + + def __init__(self): + super().__init__(name="punctuation", kind="classify") + + s = "!#$%&'()*+,-./:;<=>?@^_`{|}~。,;:《》“”·~【】!?、‘’.<>-——_" + punct = pynini.union(*s) + + graph = pynutil.insert('name: "') + punct + pynutil.insert('"') + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py new file mode 100644 index 000000000..9a3aca388 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path +from pynini.lib import pynutil + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time + e.g., 五点十分 -> time { hours: "05" minutes: "10" } + e.g., 五时十五分 -> time { hours: "05" minutes: "15" } + e.g., 十五点十分 -> time { hours: "15" minutes: "10" } + e.g., 十五点十分二十秒 -> time { hours: "15" minutes: "10" seconds: "20 } + e.g., 两点一刻 -> time { hours: "2" minutes: "1刻" } + e.g., 五点 -> time { hours: "5点" } + e.g., 五小时 -> time { hours: "5小时" } + e.g., 五分 -> time { minutess: "5分" } + e.g., 五分钟 -> time { seconds: "5分钟" } + e.g., 五秒 -> time { seconds: "5秒" } + e.g., 五秒钟 -> time { seconds: "5秒钟" } + """ + + def __init__(self): + super().__init__(name="time", kind="classify") + + hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv")) + minutes = pynini.string_file(get_abs_path("data/time/time_minutes.tsv")) + seconds = pynini.string_file(get_abs_path("data/time/time_seconds.tsv")) + quarters = pynini.string_file(get_abs_path("data/time/time_quarters.tsv")) + for_mandarin = pynini.string_file(get_abs_path("data/time/time_mandarin.tsv")) + + graph_delete_hours = pynutil.delete("点") | pynutil.delete("點") | pynutil.delete("时") | pynutil.delete("時") + graph_hours = hours + graph_delete_hours + graph_hours_component = pynutil.insert('hours: "') + graph_hours + pynutil.insert('"') + + graph_minutes = pynutil.delete("分") + graph_minutes = minutes + graph_minutes + graph_minutes_component = pynutil.insert('minutes: "') + graph_minutes + pynutil.insert('"') + + graph_seconds = pynutil.delete("秒") + graph_seconds = seconds + graph_seconds + graph_seconds_component = pynutil.insert('seconds: "') + graph_seconds + pynutil.insert('"') + + graph_time_standard = (graph_hours_component + pynutil.insert(" ") + graph_minutes_component) | ( + graph_hours_component + + 
pynutil.insert(" ") + + graph_minutes_component + + pynutil.insert(" ") + + graph_seconds_component + ) + + quarter_mandarin = ( + quarters + pynini.accep("刻") | pynini.cross("刻鈡", "刻钟") | pynini.accep("刻钟") | pynini.accep("半") + ) + hour_mandarin = ( + pynini.accep("点") + | pynini.accep("时") + | pynini.cross("點", "点") + | pynini.cross("時", "时") + | pynini.accep("小时") + | pynini.cross("小時", "小时") + | pynini.cross("個點", "个点") + | pynini.accep("个点") + | pynini.accep("个钟头") + | pynini.cross("個鐘頭", "个钟头") + | pynini.accep("个小时") + | pynini.cross("個小時", "个小时") + ) + minute_mandarin = pynini.accep("分") | pynini.cross("分鐘", "分钟") | pynini.accep("分钟") + second_mandarin = pynini.accep("秒") | pynini.cross("秒鐘", "秒钟") | pynini.accep("秒钟") + + hours_only = for_mandarin + hour_mandarin + minutes_only = for_mandarin + minute_mandarin + seconds_only = for_mandarin + second_mandarin + + graph_mandarin_hour = pynutil.insert('hours: "') + hours_only + pynutil.insert('"') + graph_mandarin_minute = pynutil.insert('minutes: "') + minutes_only + pynutil.insert('"') + graph_mandarin_second = pynutil.insert('seconds: "') + seconds_only + pynutil.insert('"') + graph_mandarin_quarter = pynutil.insert('minutes: "') + quarter_mandarin + pynutil.insert('"') + graph_mandarins = ( + graph_mandarin_hour + | graph_mandarin_minute + | graph_mandarin_second + | graph_mandarin_quarter + | (graph_mandarin_hour + pynutil.insert(" ") + graph_mandarin_quarter) + ) + + final_graph = graph_time_standard | graph_mandarins + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..a46563170 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -0,0 +1,106 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.word import WordFst +from pynini.lib import pynutil + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. 
More details on deployment at
+ pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = delete_space + graph + delete_space + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py new file mode 100644 index 000000000..8e0cbd328 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import INPUT_LOWER_CASED, GraphFst +from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path +from pynini.lib import pynutil + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + e.g. 贵宾 -> tokens { name: "VIP" } + 美国研究生入学考试 -> { name: "GRE" } + 人力资源 -> { name: "HR" } + 工商管理学硕士 -> { name: "MBA" } + This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv". 
+ """ + + def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None): + super().__init__(name="whitelist", kind="classify") + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")) + graph = (pynutil.insert('name: "')) + (whitelist) + pynutil.insert('"') + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py new file mode 100644 index 000000000..3e129fb98 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst +from pynini.lib import pynutil + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. 
sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert('name: "') + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert('"') + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py new file mode 100644 index 000000000..d63a1b2f7 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/utils.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import List, Union + +import inflect + +_inflect = inflect.engine() + + +def num_to_word(x: Union[str, int]): + """ + converts integer to spoken representation + + Args + x: integer + + Returns: spoken representation + """ + if isinstance(x, int): + x = str(x) + x = _inflect.number_to_words(str(x)).replace("-", " ").replace(",", "") + return x + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path + + +def get_various_formats(text: str) -> List[str]: + """ + Return various formats for text, e.g., all caps, the first letter upper cased, space separated, etc. 
+ """ + result = [] + if len(text) == 0: + return [] + + for t in [text, " ".join(list(text))]: + result.append(t) + result.append(t.upper()) + result.append(t.capitalize()) + return result diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py new file mode 100644 index 000000000..3eec1a88b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class CardinalFst(GraphFst): + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + + # group numbers by three + exactly_three_digits = NEMO_DIGIT ** 3 + at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) + + suffix = pynini.union( + "千", + "仟", + "万", + "十万", + "百万", + "千万", + "亿", + "十亿", + "百亿", + "千亿", + "萬", + "十萬", + "百萬", + "千萬", + "億", + "十億", + "百億", + "千億", + "拾萬", + "佰萬", + "仟萬", + "拾億", + "佰億", + "仟億", + "拾万", + "佰万", + "仟万", + "仟亿", + "佰亿", + "仟亿", + ) + + # inserting a "," between every 3 numbers + group_by_threes = ( + at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + ) + pynini.closure(suffix) + + # remove the negative attribute and leaves the sign if occurs + optional_sign = pynini.closure( + pynutil.delete("negative: ") + + delete_space + + pynutil.delete('"') + + pynini.accep("-") + + pynutil.delete('"') + + delete_space + ) + + # remove integer aspect + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 0, 1) + + pynini.closure(NEMO_SIGMA) + + pynutil.delete('"') + ) + graph = graph @ group_by_threes + + graph = optional_sign + graph + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py new file mode 100644 index 000000000..2b979e6b8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, e.g. + date { year: "1798" month: "5" day: "30" } -> 1798年5月30日 + date { year: "1798" month: "5" } -> 1798年5月 + date { month: "5" day: "30" } -> 5月30日 + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + + # removing tokenization for year, month and day + year = ( + pynutil.delete("year:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + delete_space + + pynutil.delete('"') + ) + month = ( + pynutil.delete("month:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + delete_space + + pynutil.delete('"') + ) + day = ( + pynutil.delete("day:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + delete_space + + pynutil.delete('"') + ) + era = pynutil.delete("era:") + bc = era + delete_space + pynutil.delete('"') + pynini.cross("A.D.", "公元") + pynutil.delete('"') + ad = era + delete_space + pynutil.delete('"') + pynini.cross("B.C.", "公元前") + pynutil.delete('"') + + # combining above 3 for variations + graph_ymd = ( + year + + pynutil.insert("年") + + delete_space + + month + + pynutil.insert("月") + + delete_space + + day + + pynutil.insert("日") + ) + graph_ym 
= year + pynutil.insert("年") + delete_space + month + pynutil.insert("月") + graph_md = month + pynutil.insert("月") + delete_space + day + pynutil.insert("日") + graph_year = year + pynutil.insert("年") + graph_month = month + pynutil.insert("月") + graph_day = day + pynutil.insert("日") + graph_era = bc | ad + + optional_era = pynini.closure(graph_era) + + final_graph = ( + optional_era + delete_space + (graph_ymd | graph_ym | graph_md | graph_year | graph_month | graph_day) + ) + + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py new file mode 100644 index 000000000..ea8fa4ab0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -0,0 +1,91 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pynini
+from nemo_text_processing.inverse_text_normalization.zh.graph_utils import (
+    NEMO_DIGIT,
+    NEMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class DecimalFst(GraphFst):
+    def __init__(self):
+        super().__init__(name="decimal", kind="verbalize")
+
+        # group numbers by three
+        exactly_three_digits = NEMO_DIGIT ** 3
+        at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)
+
+        # insert a "," for every three numbers before decimal point
+        space_every_three_integer = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure()
+        # insert a "," for every three numbers after decimal point
+        space_every_three_decimal = (
+            pynini.accep(".") + (exactly_three_digits + pynutil.insert(",")).closure() + at_most_three_digits
+        )
+
+        # combine both
+        group_by_threes = space_every_three_integer | space_every_three_decimal
+        self.group_by_threes = group_by_threes
+
+        # removing tokenizations, 'negative: '
+        optional_sign = pynini.closure(
+            pynutil.delete("negative: ")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.accep("-")
+            + pynutil.delete('"')
+            + delete_space
+        )
+
+        # removing tokenizations, 'integer_part:'
+        integer = (
+            pynutil.delete("integer_part:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        integer = integer @ group_by_threes
+        optional_integer = pynini.closure(integer + delete_space, 0, 1)
+
+        # removing tokenizations, 'fractional_part:'
+        fractional = (
+            pynutil.insert(".")
+            + pynutil.delete("fractional_part:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
+
+        # removing tokenization, 'quantity:'
+        quantity = (
+            pynutil.delete("quantity:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        optional_quantity = pynini.closure(quantity + 
delete_space) + + # combining graphs removing tokenizations *3 + graph = (optional_integer + optional_fractional + optional_quantity).optimize() + graph = optional_sign + graph # add optional sign for negative number + self.numebrs = graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py new file mode 100644 index 000000000..d5ea2ced1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space +from pynini.lib import pynutil + + +class FractionFst(GraphFst): + """ + Finite state transducer for verbalizing fraction + e.g. tokens { fraction { denominator: "2" numerator: "1"} } -> 1/2 + e.g. 
tokens { fraction { integer_part: "1" denominator: "2" numerator: "1" } } -> 1又1/2 + """ + + def __init__(self): + super().__init__(name="fraction", kind="verbalize") + + integer_part = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT) + + pynutil.insert("又") + + pynutil.delete('"') + ) + denominator_part = ( + pynutil.delete("denominator:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT) + + pynutil.delete('"') + ) + numerator_part = ( + pynutil.delete("numerator:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT) + + pynutil.insert("/") + + pynutil.delete('"') + ) + + graph_with_integer = integer_part + delete_space + numerator_part + delete_space + denominator_part + graph_no_integer = numerator_part + delete_space + denominator_part + + final_graph = graph_with_integer | graph_no_integer + + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py new file mode 100644 index 000000000..2fd3919a4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class MoneyFst(GraphFst): + def __init__(self): + super().__init__(name="money", kind="verbalize") + + currency_unit = pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + number_unit = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + fraction_unit = pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + decimal_unit = ( + pynutil.insert(".") + + pynutil.delete('fractional_part: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + + delete_space + + pynutil.delete('quantity: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + # regular money part + graph_money_regular = ( + currency_unit + delete_space + number_unit + delete_space + pynutil.insert(".") + fraction_unit + ) + graph_only_major_regular = currency_unit + delete_space + number_unit + graph_only_minor_regular = currency_unit + delete_space + pynutil.insert("0.") + fraction_unit + graph_large_money = currency_unit + delete_space + number_unit + delete_space + decimal_unit + + graph_regular = graph_money_regular | graph_only_major_regular | graph_only_minor_regular | graph_large_money + + major_symbol = pynini.accep("块") + minor_symbol = pynini.accep("毛") | pynini.accep("角") + lesser_symbol = pynini.accep("分") + + major_currency = pynutil.delete('currency_major: "') + major_symbol + pynutil.delete('"') + minor_currency = pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') + lesser_currency = pynutil.delete('currency_min:"') + lesser_symbol + pynutil.delete('"') + + graph_kuai = number_unit + delete_space + major_currency + graph_mao = number_unit + delete_space + minor_currency + graph_fen = number_unit + delete_space + lesser_currency + 
+ graph_kuaimao = graph_kuai + delete_space + fraction_unit + delete_space + minor_currency + graph_kuaifen = graph_kuai + delete_space + fraction_unit + delete_space + lesser_currency + graph_maofen = ( + pynutil.delete('fractional_part: "') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + + delete_space + + pynutil.delete('currency_minor: "') + + minor_symbol + + pynutil.delete('"') + + delete_space + + pynutil.delete('fraction_part: "') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + + delete_space + + pynutil.delete('currency_min: "') + + lesser_symbol + + pynutil.delete('"') + ) + + graph_all = graph_kuai + delete_space + graph_maofen + + graph_mandarin = ( + (graph_kuai | graph_mao | graph_fen) | graph_kuaimao | graph_kuaifen | graph_maofen | graph_all + ) + + graph_verbalizer = graph_regular | pynutil.add_weight(graph_mandarin, -2.0) + + delete_tokens = self.delete_tokens(graph_verbalizer) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py new file mode 100644 index 000000000..93f2a678d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class OrdinalFst(GraphFst): + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + graph_integer = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete('"') + + pynini.accep("第") + + pynini.closure(NEMO_DIGIT) + + pynini.closure(NEMO_SIGMA) + + pynutil.delete('"') + ) + + delete_tokens = self.delete_tokens(graph_integer) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py new file mode 100644 index 000000000..4560fdf62 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -0,0 +1,120 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space +from pynini.lib import pynutil + + +class TimeFst(GraphFst): + """ + Finite state transcucer for verbalizing time, e.g., + time { hours: "12" minutes: "30" } -> 12:30 + time { hours: "01" minutes: "30" } -> 01:30 + time { hours: "1" minutes: "30" seconds: "05" } -> 01:30:05 + time { hours: "1" minutes: "1刻" } -> 1点1刻 + time { hours: "一点" } -> 1点 + time { hours: "一小时" } -> 1小时 + time { hours: "一个钟头" } -> 1个钟头 + time { minutes: "一分" } -> 1分 + time { minutes: "一分钟" } -> 1分钟 + time { seconds: "一秒" } -> 1秒 + time { seconds: "一秒钟" } -> 1秒钟 + time { hours: "五点" minutes: "一刻" } -> 5点1刻 + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + # add_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) + token_hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1, 2) + + pynutil.delete('"') + ) + token_minute = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1, 2) + + pynutil.delete('"') + ) + token_second = ( + pynutil.delete("seconds:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1, 2) + + pynutil.delete('"') + ) + + add_colon = pynutil.insert(":") + graph_regular_time = (token_hour + delete_space + add_colon + token_minute) | ( + token_hour + delete_space + add_colon + token_minute + delete_space + add_colon + token_second + ) + + hours = ( + pynini.accep("点") + | pynini.accep("小时") + | pynini.accep("时") + | pynini.accep("个钟头") + | pynini.accep("个点") + | pynini.accep("个小时") + ) + hour_mandarin = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete('"') + + (pynini.closure(NEMO_DIGIT) + pynini.closure(hours, 1)) + + pynutil.delete('"') + ) + minutes = pynini.accep("分") | pynini.accep("分钟") | pynini.accep("半") + minute_mandarin = ( + 
pynutil.delete("minutes:") + + delete_space + + pynutil.delete('"') + + (((pynini.closure(NEMO_DIGIT) + pynini.closure(minutes, 1))) | pynini.closure(minutes, 1)) + + pynutil.delete('"') + ) + seconds = pynini.accep("秒") | pynini.accep("秒钟") + second_mandarin = ( + pynutil.delete("seconds:") + + delete_space + + pynutil.delete('"') + + (pynini.closure(NEMO_DIGIT) + pynini.closure(seconds, 1)) + + pynutil.delete('"') + ) + quarters = pynini.accep("刻") | pynini.accep("刻钟") + quarter_mandarin = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete('"') + + (pynini.closure(NEMO_DIGIT) + pynini.closure(quarters, 1)) + + pynutil.delete('"') + ) + + graph_mandarin_time = ( + hour_mandarin + | minute_mandarin + | second_mandarin + | quarter_mandarin + | (hour_mandarin + delete_space + quarter_mandarin) + | (hour_mandarin + delete_space + minute_mandarin) + | (hour_mandarin + delete_space + minute_mandarin + delete_space + second_mandarin) + | (minute_mandarin + delete_space + second_mandarin) + ) + + final_graph = graph_regular_time | graph_mandarin_time + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py new file mode 100644 index 000000000..b379c4d94 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.cardinal import CardinalFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.date import DateFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.decimal import DecimalFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.fraction import FractionFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.money import MoneyFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.time import TimeFst
+from nemo_text_processing.inverse_text_normalization.zh.verbalizers.whitelist import WhiteListFst
+
+
+class VerbalizeFst(GraphFst):
+    """
+    Composes other verbalizer grammars.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details to deployment at NeMo/tools/text_processing_deployment.
+ """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + ordinal_graph = OrdinalFst().fst + decimal = DecimalFst() + decimal_graph = decimal.fst + fraction = FractionFst() + fraction_graph = fraction.fst + money = MoneyFst() + money_graph = money.fst + time_graph = TimeFst().fst + date_graph = DateFst().fst + whitelist_graph = WhiteListFst().fst + + graph = ( + time_graph + | date_graph + | money_graph + | fraction_graph + | ordinal_graph + | decimal_graph + | cardinal_graph + | whitelist_graph + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py new file mode 100644 index 000000000..e21b1d332 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py @@ -0,0 +1,43 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.word import WordFst +from pynini.lib import pynutil + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + + def __init__(self): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py new file mode 100644 index 000000000..994935b2b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py @@ -0,0 +1,42 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_CHAR, + NEMO_SIGMA, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for verbalizing whitelist + e.g. tokens { name: "USB" } -> USB + """ + + def __init__(self): + super().__init__(name="whitelist", kind="verbalize") + graph = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete('"') + ) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py new file mode 100644 index 000000000..5888e2d8c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py @@ -0,0 +1,37 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_CHAR, + NEMO_SIGMA, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. 
tokens { name: "sleep" } -> sleep
+    """
+
+    def __init__(self):
+        super().__init__(name="word", kind="verbalize")
+        chars = pynini.closure(NEMO_CHAR - " ", 1)
+        char = pynutil.delete("name:") + delete_space + pynutil.delete('"') + chars + pynutil.delete('"')
+        graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
+
+        self.fst = graph.optimize()
diff --git a/nemo_text_processing/text_normalization/zh/graph_utils.py b/nemo_text_processing/text_normalization/zh/graph_utils.py
index fd609ab5e..37cb14d45 100644
--- a/nemo_text_processing/text_normalization/zh/graph_utils.py
+++ b/nemo_text_processing/text_normalization/zh/graph_utils.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# graph_utils.py is kept here because importing from the en folders causes import errors: the data file names have to match those in the en folder
 import logging
 import os
 import string
diff --git a/nemo_text_processing/text_normalization/zh/utils.py b/nemo_text_processing/text_normalization/zh/utils.py
index b3d03c602..2d78a8ea0 100644
--- a/nemo_text_processing/text_normalization/zh/utils.py
+++ b/nemo_text_processing/text_normalization/zh/utils.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# utils.py is kept here because importing from the en folders causes import errors: the data file names have to match those in the en folder
 
 import csv
 import os
diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt
new file mode 100644
index 000000000..02d3dcbcf
--- /dev/null
+++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt
@@ -0,0 +1,130 @@
+一百~100
+一百零一~101
+一百一十一~111
+两百~200
+九百~900
+九百五十~950
+九百五十一~951
+一千~1,000
+一千零一~1,001
+一千一百~1,100
+一千一百零一~1,101
+一千零五十~1,050
+一千一百一十~1,110
+一千一百十~1,110
+一千一百一十一~1,111
+两千~2,000
+九千九百九十九~9,999
+一万一千~11,000
+一万一千一百~11,100
+一万一千一百一十~11,110
+一万一千一百一十一~11,111
+一万零一百~10,100
+一万零一百五十~10,150
+一万零一百五十一~10,151
+一万零一~10,001
+一万零五十~10,050
+一万零五十一~10,051
+一万~1万
+两万~2万
+三万~3万
+四万~4万
+五万~5万
+六万~6万
+七万~7万
+八万~8万
+九万~9万
+十万~10万
+十萬~10萬
+九十万~90万
+九十一万~91万
+九十万五千八百二十五~905,825
+九十一万五千八百二十五~915,825
+十一万~11万
+十万一千一百一十一~101,111
+十万一千一百~101,100
+十万一千~101,000
+十万零一百~100,100
+十万零十~100,010
+十万零一~100,001
+一百万~100万
+一百一十万~110万
+一百一十一万~111万
+两百万~200万
+两百一十万~210万
+两百零一万~201万
+一百一十九万~119万
+一百一十九万九千~1,199,000
+一百一十九万九千九百~1,199,900
+一百一十九万九千九百九十~1,199,990
+一百一十九万九千九百九十九~1,199,999
+一百一十九万零九~1,190,009
+一百一十九万零九百九十一~1,190,991
+一千万~1,000万
+一千一百万~1,100万
+一千一百一十万~1,110万
+一千一百一十一万~1,111万
+一千一百一十一万九千~11,119,000
+一千一百一十一万九千一百~11,119,100
+一千一百一十一万九千一百二十~11,119,120
+一千一百一十一万九千一百二十一~11,119,121
+一千一百一十一万零一~11,110,001
+一千一百一十一万零一十~11,110,010
+一千一百一十一万零一百~11,110,100
+一千零一十万零一百~10,100,100
+一千零一十一万零一百~10,110,100
+一千零一万零一百~10,010,100
+一億~1億
+一億一千萬~110,000,000
+一億一千一百萬~111,000,000
+一億一千一百一十萬~111,100,000
+一億一千一百一十一萬~111,110,000
+一億零一百萬~101,000,000
+一億零一百一十萬~101,100,000
+一億零一百一十一萬~101,110,000
+一億零一十萬~100,100,000
+一億零一十一萬~100,110,000
+一億零一萬~100,010,000
+一億零一萬一千~100,011,000
+一億零一萬一千一百~100,011,100
+一億零一萬一千一百一~100,011,101
+一億零一萬一千一百一十一~100,011,111
+一億零一萬一千一百零五~100,011,105
+一億零一萬一千零五~100,011,005 +十億~10億 +十一億~11億 +十一億九千萬~1,190,000,000 +十一億九千一百萬~1,191,000,000 +十一億九千一百一十萬~1,191,100,000 +十一億九千一百一十一萬~1,191,110,000 +十一億零一百一十萬~1,101,100,000 +十一億零一十萬~1,100,100,000 +十一億零一萬~1,100,010,000 +十一億零十萬~1,100,100,000 +十一億零九千~1,100,009,000 +十一億零九百~1,100,000,900 +十一億零九十~1,100,000,090 +十一億零九~1,100,000,009 +一百億~100億 +一百一十億~110億 +一百一十一億~111億 +一百一十一億九千萬~11,190,000,000 +一百一十一億九千九百萬~11,199,000,000 +一百一十一億九千九百一十萬~11,199,100,000 +一百一十一億九千九百一十一萬~11,199,110,000 +一百一十一億九千九百一十一萬九千~11,199,119,000 +一百一十一億九千九百一十一萬九千九百一十一~11,199,119,911 +一百零一億~101億 +一百零一億零九百萬~10,109,000,000 +一百零一億零九十萬~10,100,900,000 +一百零一億零九萬~10,100,090,000 +一百零一億零九萬零一百~10,100,090,100 +一千億~1,000億 +一千一百億~1,100億 +一千零五十億~1,050億 +一千零五億~1,005億 +一千億九千萬~100,090,000,000 +一千億零九百萬~100,009,000,000 +一千億零九十萬~100,000,900,000 +一千億零九萬~100,000,090,000 +一千億零九十萬零五百~100,000,900,500 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..5404b8ec3 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,31 @@ +一七九八年五月三十日~1798年5月30日 +五月三十日~5月30日 +一七九八年五月~1798年5月 +八月~8月 +一七九八年~1798年 +十九日~19日 +一九九四年一月二日~1994年1月2日 +一九九五年二月三日~1995年2月3日 +二零零零年三月五日~2000年3月5日 +二零零一年四月六日~2001年4月6日 +公元一七九八年五月三十日~公元1798年5月30日 +公元一八三五年~公元1835年 +公元一八三四年八月~公元1834年8月 +公元一九九四年一月二日~公元1994年1月2日 +公元一九九五年二月三日~公元1995年2月3日 +公元二零零零年三月五日~公元2000年3月5日 +公元二零零一年四月六日~公元2001年4月6日 +公元前一七九八年~公元前1798年 +公元前二八零九年~公元前2809年 +公元前一九九四年一月二日~公元前1994年1月2日 +公元前一九九五年二月三日~公元前1995年2月3日 +公元前二零零零年三月五日~公元前2000年3月5日 +公元前二零零一年四月六日~公元前2001年4月6日 +纪元前一九三四年一月二日~公元前1934年1月2日 +纪元前一九九八年三月三日~公元前1998年3月3日 +纪元前二零零零年三月五日~公元前2000年3月5日 +纪元前二零零一年四月六日~公元前2001年4月6日 +纪元一二三四年一月二日~公元1234年1月2日 +纪元二零五六年二月三日~公元2056年2月3日 +纪元二零零零年三月五日~公元2000年3月5日 +纪元二零零一年四月六日~公元2001年4月6日 \ No newline at end of file diff --git 
a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..a73dc302e --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,42 @@ +一点零~1.0 +十五点零~15.0 +一百点零~100.0 +一百零一点五~101.5 +一点零五六~1.056 +一点零零五六~1.0056 +一点零零零五六~1.00056 +两百点一~200.1 +三千点五~3,000.5 +四万点六~40,000.6 +一點零零五~1.005 +九十九點零零零五~99.0005 +一百點五七三五~100.5735 +一千五百点零一~1,500.01 +负五万点二四五~-50,000.245 +负十五万点三七九~-150,000.379 +负一点一~-1.1 +负十点五~-10.5 +負十點五~-10.5 +負九十九點九五~-99.95 +負一百五十點一二~-150.12 +負一千五百零九點五一~-1,509.51 +負五萬點三~-50,000.3 +負五點零一~-5.01 +負十點零零一~-10.001 +負十點零零零三~-10.0003 +負一百點零零零零四~-100.00004 +一点一二三四五六七八九~1.123456789 +负五点一零二~-5.102 +负三点一二零三~-3.1203 +负十点一二三零五~-10.12305 +伍拾壹点肆~51.4 +壹佰点叁肆~100.34 +贰拾点伍陆~20.56 +柒拾捌点玖~78.9 +负叁拾壹点肆~-31.4 +负壹佰点叁肆~-100.34 +负贰拾点伍陆~-20.56 +负柒拾点玖~-70.9 +負贰拾点叁肆~-20.34 +負玖点玖~-9.9 +負壹佰贰拾点叁肆~-120.34 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..473f1dfb9 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_fraction.txt @@ -0,0 +1,20 @@ +五分之一~1/5 +二分之一~1/2 +三分之一~1/3 +十分之一~1/10 +一百分之一~1/100 +一千分之一~1/1000 +五分之二~2/5 +三分之二~2/3 +十分之五~5/10 +一千分之五~5/1000 +三又五分之一~3又1/5 +一又二分之一~1又1/2 +一又三分之一~1又1/3 +三又十分之一~3又1/10 +五十又一百分之一~50又1/100 +三又一千分之五~3又5/1000 +六又十分之五~6又5/10 +八又七分之五~8又5/7 +九又四分之三~9又3/4 +五分之四~4/5 diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..2504e7d44 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,49 @@ 
+一千美元~US$1000 +五千美元~US$5000 +一万美元~US$1万 +一点五万美元~US$1.5万 +五十万美元~US$50万 +一百万美元~US$100万 +一千万美元~US$1000万 +一千元~¥1000 +五千元~¥5000 +一万元~¥1万 +一千五万元~¥1005万 +五十万元~¥50万 +一百万元~¥100万 +一千万元~¥1000万 +一千欧元~€1000 +五千欧元~€5000 +一万欧元~€1万 +一点五万欧元~€1.5万 +五十万欧元~€50万 +一百万欧元~€100万 +一千万欧元~€1000万 +一千英镑~£1000 +五千英镑~£5000 +一万英镑~£1万 +一点五万英镑~£1.5万 +五十万英镑~£50万 +一百万英镑~£100万 +一千万英镑~£1000万 +一千韩元~₩1000 +五千韩元~₩5000 +一万韩元~₩1万 +一点五万韩元~₩1.5万 +五十万韩元~₩50万 +一百万韩元~₩100万 +一千万韩元~₩1000万 +一千印度卢布~₹1000 +五千印度卢布~₹5000 +一万印度卢布~₹1万 +一点五万印度卢布~₹1.5万 +五十万印度卢布~₹50万 +一百万印度卢布~₹100万 +一千万印度卢布~₹1000万 +一千日元~JPY¥1000 +五千日元~JPY¥5000 +一万日元~JPY¥1万 +一点五万日元~JPY¥1.5万 +五十万日元~JPY¥50万 +一百万日元~JPY¥100万 +一千万日元~JPY¥1000万 diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..828ec6203 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,57 @@ +第一百~第100 +第五百~第500 +第兩萬一千一百一十一~第21111 +第一百~第100 +第二百~第200 +第兩千~第2000 +第两万~第2万 +第十万~第10万 +第一百万~第100万 +第一千万~第1000万 +第一亿~第1亿 +第一百零一~第101 +第十亿~第10亿 +第五十万~第50万 +第一百一十一~第111 +第十万一千一百一十一~第101111 +第十万一千一百~第101100 +第十万一千~第101000 +第十万零一百~第100100 +第十万零十~第100010 +第十万零一~第100001 +第一百万~第100万 +第一百一十万~第110万 +第一百一十一万~第111万 +第两百万~第200万 +第两百一十万~第210万 +第两百零一万~第201万 +第一百一十九万~第119万 +第一百一十九万九千~第1199000 +第一百一十九万九千九百~第1199900 +第一百一十九万九千九百九十~第1199990 +第一百一十九万九千九百九十九~第1199999 +第一百一十九万零九~第1190009 +第一百一十九万零九十~第1190090 +第一百一十九万零九十一~第1190091 +第一百一十九万零九百九十一~第1190991 +第一千万~第1000万 +第一千一百万~第1100万 +第一千一百一十万~第1110万 +第一千一百一十一万~第1111万 +第一千一百一十一万九千~第11119000 +第一千一百一十一万九千一百~第11119100 +第一千一百一十一万九千一百二十~第11119120 +第一千一百一十一万九千一百二十一~第11119121 +第一千一百一十一万零一~第11110001 +第一千一百一十一万零一十~第11110010 +第一千一百一十一万零一百~第11110100 +第一千零一十万零一百~第10100100 +第一千零一十一万零一百~第10110100 +第一千零一万零一百~第10010100 +第一億~第1億 +第一億一千萬~第110000000 +第一億一千一百萬~第111000000 +第一億一千一百一十萬~第111100000 +第一億一千一百一十一萬~第111110000 +第一億零一百萬~第101000000 +第一億零一百一十萬~第101100000 \ No 
newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..01b2a5d15 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,23 @@ +五点五分~05:05 +五点一刻~5点1刻 +两点二刻~2点2刻 +三点三刻~3点3刻 +六点~6点 +五点五分~05:05 +五点半~5点半 +五点一刻~5点1刻 +两点三刻~2点3刻 +三点三刻~3点3刻 +五点五分~05:05 +两点一刻~2点1刻 +三点二刻~3点2刻 +四点~4点 +一点五分十秒~01:05:10 +十三点五分十秒~13:05:10 +十点~10点 +五分钟~5分钟 +五秒钟~5秒钟 +十三点五分~13:05 +十三点零五分~13:05 +五点二十五分~05:25 +十一点三十四分~11:34 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/test_cardinal.py b/tests/nemo_text_processing/zh/test_cardinal.py new file mode 100644 index 000000000..ebd00b16a --- /dev/null +++ b/tests/nemo_text_processing/zh/test_cardinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_date.py b/tests/nemo_text_processing/zh/test_date.py index d8079e3a6..01d3e038b 100644 --- a/tests/nemo_text_processing/zh/test_date.py +++ b/tests/nemo_text_processing/zh/test_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. 
import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized @@ -28,3 +29,12 @@ class TestDate: def test_norm_date(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_decimal.py b/tests/nemo_text_processing/zh/test_decimal.py new file mode 100644 index 000000000..92af62a30 --- /dev/null +++ b/tests/nemo_text_processing/zh/test_decimal.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_fraction.py b/tests/nemo_text_processing/zh/test_fraction.py index 03b508b21..264d64d13 100644 --- a/tests/nemo_text_processing/zh/test_fraction.py +++ b/tests/nemo_text_processing/zh/test_fraction.py @@ -13,6 +13,7 @@ # limitations under the License. import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized @@ -28,3 +29,12 @@ class TestFraction: def test_norm_fraction(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_money.py b/tests/nemo_text_processing/zh/test_money.py index d06a2b812..3d50ce5fa 100644 --- a/tests/nemo_text_processing/zh/test_money.py +++ b/tests/nemo_text_processing/zh/test_money.py @@ -13,6 +13,7 
@@ # limitations under the License. import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized @@ -28,3 +29,12 @@ class TestMoney: def test_norm_money(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_ordinal.py b/tests/nemo_text_processing/zh/test_ordinal.py new file mode 100644 index 000000000..9775d5522 --- /dev/null +++ b/tests/nemo_text_processing/zh/test_ordinal.py @@ -0,0 +1,30 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import pytest
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from parameterized import parameterized
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestOrdinal:
+    inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_ordinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh
new file mode 100644
index 000000000..4ca12af7f
--- /dev/null
+++ b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh
@@ -0,0 +1,84 @@
+#! /bin/sh
+
+PROJECT_DIR=/workspace/tests
+
+runtest () {
+  input=$1
+  cd /workspace/sparrowhawk/documentation/grammars
+
+  # read test file
+  while read testcase; do
+    IFS='~' read spoken written <<< $testcase
+    denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1)
+
+    # trim white space
+    written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+    denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+
+    # input expected actual
+    assertEquals "$spoken" "$written" "$denorm_pred"
+  done < "$input"
+}
+
+testITNCardinal() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_cardinal.txt
+  runtest $input
+}
+
+testITNDate() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_date.txt
+  runtest $input
+}
+
+testITNDecimal() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_decimal.txt
+  runtest $input
+}
+
+testITNOrdinal() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_ordinal.txt
+  runtest $input
+}
+
+testITNFraction() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_fraction.txt
+  runtest $input
+}
+
+testITNTime() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_time.txt
+  runtest $input
+}
+
+testITNMeasure() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_measure.txt
+  runtest $input
+}
+
+testITNMoney() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_money.txt
+  runtest $input
+}
+
+testITNWhitelist() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_whitelist.txt
+  runtest $input
+}
+
+testITNTelephone() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_telephone.txt
+  runtest $input
+}
+
+testITNElectronic() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_electronic.txt
+  runtest $input
+}
+
+testITNWord() {
+  input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_word.txt
+  runtest $input
+}
+
+# Load shUnit2
+. $PROJECT_DIR/../shunit2/shunit2
diff --git a/tests/nemo_text_processing/zh/test_time.py b/tests/nemo_text_processing/zh/test_time.py
index d36737afb..80d79a78c 100644
--- a/tests/nemo_text_processing/zh/test_time.py
+++ b/tests/nemo_text_processing/zh/test_time.py
@@ -13,6 +13,7 @@
 # limitations under the License.
import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized @@ -28,3 +29,12 @@ class TestTime: def test_norm_time(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected