Skip to content

Commit

Permalink
Merge branch 'main' into ZH_TN
Browse files Browse the repository at this point in the history
Signed-off-by: Buyuan(Alex) Cui <[email protected]>
  • Loading branch information
BuyuanCui authored Jun 23, 2023
2 parents 69ac7c2 + 87e09f2 commit 05e51d4
Show file tree
Hide file tree
Showing 247 changed files with 488,065 additions and 310 deletions.
1 change: 1 addition & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ Add a one line overview of what this PR aims to accomplish.
- [ ] New Feature
- [ ] Bugfix
- [ ] Documentation
- [ ] Test

If you haven't finished some of the above items, you can still open a "Draft" PR.
111 changes: 89 additions & 22 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,19 @@ pipeline {
}
environment {

AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-18-23-1'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-16-23-1'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-27-23-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-14-23-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-1'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

}
stages {
Expand Down Expand Up @@ -65,6 +67,7 @@ pipeline {
}
failFast true
parallel {

stage('L0: En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir ${EN_TN_CACHE}'
Expand Down Expand Up @@ -130,6 +133,25 @@ pipeline {
}
}

// Code-switched Spanish/English grammar stage.
// Runs only on the 'main' branch or on change requests targeting 'main'.
// Nested stages run in parallel; failFast aborts the remaining ones on the first failure.
stage('L0: Create Codeswitched ES/EN TN/ITN Grammars') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {

// Only an ITN stage is defined in this group (no TN counterpart is visible here).
// Invokes inverse_normalize.py for --lang=es_en on a short sample phrase with
// --cache_dir pointing at ES_EN_TN_CACHE; CUDA_VISIBLE_DEVICES="" hides GPUs from the process.
// NOTE(review): presumably this run is what populates the grammar cache — confirm.
stage('L0: ES/EN ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=es_en --text="ciento uno " --cache_dir ${ES_EN_TN_CACHE}'
}
}

}
}

stage('L0: Create AR TN/ITN Grammars') {
when {
anyOf {
Expand Down Expand Up @@ -175,6 +197,27 @@ pipeline {

}
}
// Hungarian grammar stage.
// Runs only on the 'main' branch or on change requests targeting 'main'.
// Nested stages run in parallel; failFast aborts the remaining ones on the first failure.
stage('L0: Create HU TN/ITN Grammars') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
// Invokes normalize.py for --lang=hu on the sample input "100" with
// --cache_dir pointing at HU_TN_CACHE; CUDA_VISIBLE_DEVICES="" hides GPUs from the process.
stage('L0: HU TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}'
}
}
// The HU ITN stage below is disabled (commented out); only TN runs for Hungarian.
// stage('L0: HU ITN grammars') {
// steps {
// sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hu --text="száz " --cache_dir ${HU_TN_CACHE}'
// }
// }
}
}
stage('L0: Create VI TN/ITN Grammars') {
when {
anyOf {
Expand Down Expand Up @@ -255,11 +298,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=sv --text="100" --cache_dir ${SV_TN_CACHE}'
}
}
// stage('L0: SV ITN grammars') {
// steps {
// sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=sv --text="hundra " --cache_dir ${SV_TN_CACHE}'
// }
// }
// stage('L0: SV ITN grammars') {
// steps {
// sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=sv --text="hundra " --cache_dir ${SV_TN_CACHE}'
// }
// }
}
}
stage('L0: Create ZH TN/ITN Grammars') {
Expand Down Expand Up @@ -309,6 +352,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/es/ -m "not pleasefixme" --cpu --tn_cache_dir ${ES_TN_CACHE}'
}
}
// Runs pytest over tests/nemo_text_processing/es_en/, deselecting tests marked
// "pleasefixme", with --cpu and --tn_cache_dir set to ES_EN_TN_CACHE
// (the same cache directory the ES/EN L0 grammar stage writes with --cache_dir).
stage('L1: Run all Codeswitched ES/EN TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/es_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${ES_EN_TN_CACHE}'
}
}
stage('L1: Run all AR TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ar/ -m "not pleasefixme" --cpu --tn_cache_dir ${AR_TN_CACHE}'
Expand All @@ -334,11 +382,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ru/ -m "not pleasefixme" --cpu --tn_cache_dir ${RU_TN_CACHE}'
}
}
stage('L1: Run all SV TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/sv/ -m "not pleasefixme" --cpu --tn_cache_dir ${SV_TN_CACHE}'
}
}
// stage('L1: Run all SV TN/ITN tests (restore grammars from cache)') {
// steps {
// sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/sv/ -m "not pleasefixme" --cpu --tn_cache_dir ${SV_TN_CACHE}'
// }
// }
stage('L1: Run all ZH TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/zh/ -m "not pleasefixme" --cpu --tn_cache_dir ${ZH_TN_CACHE}'
Expand Down Expand Up @@ -375,7 +423,26 @@ pipeline {
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \
rm -rf $OUTPUT_DIR'
rm -rf $DENORM_OUTPUT_DIR'
}
}


// English TN alignment regression check. The single shell command:
//   1. creates a timestamped output directory under TestData/text_norm,
//   2. runs fst_alignment/alignment.py with --grammar=tn on a fixed sample sentence,
//      loading the FAR file from EN_TN_CACHE, and tees stdout+stderr into pred.txt,
//   3. compares pred.txt byte-for-byte against alignment_gold.txt (cmp --silent),
//   4. exits 1 if any earlier step in the &&-chain failed, otherwise removes the output directory.
// NOTE(review): because of the `| tee`, the pipeline status is normally tee's, so a Python
// failure would surface only through the cmp mismatch — confirm the shell options Jenkins uses.
// NOTE(review): on failure the output directory is left behind (exit happens before rm).
stage('L2: Eng alignment TN') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
cd nemo_text_processing/fst_alignment && python alignment.py --text="2615 Forest Av, 90501 CA, Santa Clara. 10kg, 12/16/2018" --grammar=tn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_tn_True_deterministic_cased__tokenize.far 2>&1 | tee $NORM_OUTPUT_DIR/pred.txt && \
cmp --silent $NORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_norm/ci/alignment_gold.txt || exit 1 && \
rm -rf $NORM_OUTPUT_DIR'
}
}

// English ITN alignment regression check. The single shell command:
//   1. creates a timestamped output directory under TestData/text_denorm,
//   2. runs fst_alignment/alignment.py with --grammar=itn on a fixed spoken-form sentence,
//      loading en_itn_lower_cased.far from EN_TN_CACHE, and tees stdout+stderr into pred.txt,
//   3. compares pred.txt byte-for-byte against the text_denorm alignment_gold.txt (cmp --silent),
//   4. exits 1 if any earlier step in the &&-chain failed, otherwise removes the output directory.
// NOTE(review): because of the `| tee`, the pipeline status is normally tee's, so a Python
// failure would surface only through the cmp mismatch — confirm the shell options Jenkins uses.
stage('L2: Eng alignment ITN') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
cd nemo_text_processing/fst_alignment && python alignment.py --text="one million twenty three thousand two hundred eleven ten kilograms one hundred twenty three dollars and twenty five cents" --grammar=itn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_itn_lower_cased.far 2>&1 | tee $DENORM_OUTPUT_DIR/pred.txt && \
cmp --silent $DENORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_denorm/ci/alignment_gold.txt || exit 1 && \
rm -rf $DENORM_OUTPUT_DIR'
}
}

Expand Down
9 changes: 6 additions & 3 deletions nemo_text_processing/fst_alignment/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@


import logging
import string
from argparse import ArgumentParser
from typing import List

Expand Down Expand Up @@ -99,6 +100,7 @@ def parse_args():
# U+23B5 (BOTTOM SQUARE BRACKET).
# NOTE(review): presumably used as a visible stand-in for a space in alignment output — confirm.
WHITE_SPACE = "\u23B5"
# Values of the `mode` argument that indexed_map_to_output compares against.
ITN_MODE = "itn"
TN_MODE = "tn"
# Output symbols that may be absorbed when an aligned span is extended to the left/right:
# ASCII letters, ASCII digits, and the characters $ \ : + - =
# The backslash is written as "\\" because "\:" is not a valid Python escape sequence
# (it triggers DeprecationWarning/SyntaxWarning); the resulting list is unchanged.
tn_itn_symbols = list(string.ascii_letters + string.digits) + list("$\\:+-=")


def get_word_segments(text: str) -> List[List[int]]:
Expand Down Expand Up @@ -210,26 +212,27 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: st
aligned_end = _get_aligned_index(alignment, end - 1) # inclusive

logging.debug(f"0: |{list(map(remove, [x[0] for x in alignment[aligned_start:aligned_end+1]]))}|")
logging.debug(f"1: |{aligned_start}:{aligned_end+1}|")

# extend aligned_start to left

while (
aligned_start - 1 > 0
and alignment[aligned_start - 1][0] == EPS
and (alignment[aligned_start - 1][1].isalnum() or alignment[aligned_start - 1][1] == EPS)
and (alignment[aligned_start - 1][1] in tn_itn_symbols or alignment[aligned_start - 1][1] == EPS)
):
aligned_start -= 1

while (
aligned_end + 1 < len(alignment)
and alignment[aligned_end + 1][0] == EPS
and (alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS)
and (alignment[aligned_end + 1][1] in tn_itn_symbols or alignment[aligned_end + 1][1] == EPS)
):
aligned_end += 1

if mode == TN_MODE:
while (aligned_end + 1) < len(alignment) and (
alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS
alignment[aligned_end + 1][1] in tn_itn_symbols or alignment[aligned_end + 1][1] == EPS
):
aligned_end += 1

Expand Down
13 changes: 13 additions & 0 deletions nemo_text_processing/g2p/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions nemo_text_processing/g2p/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Loading

0 comments on commit 05e51d4

Please sign in to comment.