Skip to content

Commit

Permalink
IT TN Fixes for Issue #166 (#221)
Browse files Browse the repository at this point in the history
* Fixes the tagger

Signed-off-by: Simon Zuberek <[email protected]>

* Updates ELECTRONIC tests

Signed-off-by: Simon Zuberek <[email protected]>

* Fixes test cases

Signed-off-by: Simon Zuberek <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Passes Sparrowhawk and normalization.py but not pytest

Signed-off-by: Simon Zuberek <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Updates test cases and cache

Signed-off-by: Simon Zuberek <[email protected]>

* Removes the commented out lines

Signed-off-by: Simon Zuberek <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Simon Zuberek <[email protected]>
Co-authored-by: Simon Zuberek <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Aug 26, 2024
1 parent 23a63ce commit 2cb0275
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pipeline {
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0'
-IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
+IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-15-24-0'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
.us punto US
.de punto DE
.it punto IT
-.jpg punto jpeg
+.jpg punto jpeg
+.edu punto edu
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def __init__(self, deterministic: bool = True):
+ protocol
+ pynutil.insert(double_quotes)
)

url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)

graph = url | domain_graph | email | tag
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_DIGIT,
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
Expand Down Expand Up @@ -109,28 +111,19 @@ def __init__(
)

punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
punct = pynini.closure(
pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
| (pynutil.insert(" ") + punct),
1,
)

token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
token_plus_punct = (
pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
)

graph = token_plus_punct + pynini.closure(
(
pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
| (pynutil.insert(" ") + punct + pynutil.insert(" "))
)
+ token_plus_punct
)
graph = token_plus_punct + pynini.closure((delete_extra_space).ques + token_plus_punct)

graph = delete_space + graph + delete_space
graph |= punct

self.fst = graph.optimize()
no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()

if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
Trovateci su mail.university.edu.~Trovateci su m a i l punto u n i v e r s i t y punto edu .
Trovateci su mail.nasa.gov.~Trovateci su m a i l punto n a s a punto gov .
Trovateci su mail.nasa.mx.~Trovateci su m a i l punto n a s a punto m x .
https://www.nvidia.com~h t t p s due punti slash slash w w w punto nvidia punto com
https://www.nvidia.com/abc/ciao.html~h t t p s due punti slash slash w w w punto nvidia punto com slash a b c slash c i a o punto html
https://www.nvidia.it/abc/12df/like.py~h t t p s due punti slash slash w w w punto nvidia punto IT slash a b c slash uno due d f slash l i k e punto python
Expand All @@ -6,4 +9,5 @@ abc.de!f@23d_f.sd.us~a b c punto d e punto esclamativo f chiocciola due tre d tr
www.nvidia.file&TN.com~w w w punto nvidia punto f i l e e commerciale T N punto com
@jensen~chiocciola j e n s e n
@jensen.me~chiocciola j e n s e n punto m e
-@wezyr1986~chiocciola w e z y r uno nove otto sei
+@wezyr1986~chiocciola w e z y r uno nove otto sei
+visitate il sito https://www.nvidia.com.~visitate il sito h t t p s due punti slash slash w w w punto nvidia punto com .

0 comments on commit 2cb0275

Please sign in to comment.