Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cardinals up to a hundred trillions, timeFST and transliteration #209

Merged
merged 7 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nemo_text_processing/fst_alignment/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: st
alignment: alignment generated by FST with shortestpath, is longer than original string since including eps transitions
start: inclusive start position in input string
end: exclusive end position in input string
mode: grammar type for either tn or itn
mode: grammar type for either tn or itn

Returns:
output_og_start_index: inclusive start position in output string
Expand Down
2 changes: 1 addition & 1 deletion nemo_text_processing/hybrid/mlm_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def score_sentence(self, sentence: str):

def __mask_text__(self, idx: int, tokens: List[str]):
"""
replaces string at index idx in list `tokens` with a masked token and returns the modified list.
replaces string at index idx in list `tokens` with a masked token and returns the modified list.
"""
masked = tokens.copy()
masked[idx] = self.MASK_LABEL
Expand Down
2 changes: 1 addition & 1 deletion nemo_text_processing/hybrid/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def get_masked_score(text, model, do_lower=True):

def _get_ambiguous_positions(sentences: List[str]):
"""returns None or index list of ambiguous semiotic tokens for list of sentences.
E.g. if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only
E.g. if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only
the first semiotic span <street>/<saint> is ambiguous."""
l_sets = [set([x]) for x in re.findall(r"<\s.+?\s>", sentences[0])]
for sentence in sentences[1:]:
Expand Down
8 changes: 4 additions & 4 deletions nemo_text_processing/hybrid/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,8 @@ def clean_post_norm(

def clean_libri_tts(target: str):
"""
Replace abbreviations in LibriTTS dataset
"""
Replace abbreviations in LibriTTS dataset
"""

# The Google-normalized LibriTTS text sometimes leaves abbreviations from `libri_sometimes_converts_abbrs` unconverted.
libri_sometimes_converts_abbrs = {"St.": "saint", "Rev.": "reverend"}
Expand Down Expand Up @@ -641,15 +641,15 @@ def get_diff(a: str, b: str):

def diff_pred_gt(pred: str, gt: str):
"""returns list of different substrings between prediction and gt
relies on that prediction uses '< ' ' >'
relies on that prediction uses '< ' ' >'

Args:
pred (str): prediction
gt (str): ground truth

Returns:
list of Tuple(pred start and end, gt start and end) subsections

e.g. pred="< Edward third >., king Our own . loss had been < two thousand two hundred >"
gt ="Edward III., king Our own loss had been twenty two hundred"
--> [([0, 16], [0, 10]), ([32, 34], [26, 26]), ([48, 76], [40, 58])]
Expand Down
2 changes: 1 addition & 1 deletion nemo_text_processing/hybrid/wfst_lm_rescoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def threshold_weights(norm_texts_weights, delta: float = 0.2):
delta: delta to add to minimum weight in options to compose upper limit for threshold

returns:
filter list of same format as input
filter list of same format as input
"""
# threshold value is factor applied to lowest/first weight of all normalization options for every input
res = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class CardinalFst(GraphFst):
"""
Finite state transducer for classifying cardinals
e.g. سالب تسعة وتسعون -> cardinal { integer: "99" negative: "-" }
Numbers below thirteen are not converted.
Args:
Numbers below thirteen are not converted.
Args:
tn_cardinal: cardinal FST for TN
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class FractionFst(GraphFst):
"""
Finite state transducer for classifying fraction
e.g. واحد و نصف -> tokens { integer_part: "1" numerator: "1" denominator: "2" }

Args:
tn_cardinal: TN cardinal tagger

Expand Down
4 changes: 2 additions & 2 deletions nemo_text_processing/inverse_text_normalization/ar/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]):
Args:
x: integer

Returns: spoken representation
Returns: spoken representation
"""
if isinstance(x, int):
x = str(x)
Expand All @@ -41,7 +41,7 @@ def get_abs_path(rel_path):

Args:
rel_path: relative path to this file

Returns absolute path
"""
return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

class CardinalFst(GraphFst):
"""
Finite state transducer for classifying cardinals. Numbers below ten are not converted.
Finite state transducer for classifying cardinals. Numbers below ten are not converted.
Allows both compound numeral strings or separated by whitespace.
"und" (en: "and") can be inserted between "hundert" and following number or "tausend" and following single or double digit number.

Expand All @@ -32,7 +32,7 @@ class CardinalFst(GraphFst):
e.g. ein tausend -> cardinal { integer: "1000" }
e.g. eintausend -> cardinal { integer: "1000" }
e.g. ein tausend zwanzig -> cardinal { integer: "1020" }

Args:
tn_cardinal_tagger: TN cardinal tagger
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class ElectronicFst(GraphFst):
"""
Finite state transducer for classifying electronic: email addresses, etc.
e.g. c d f eins at a b c punkt e d u -> tokens { name: "cdf1.abc.edu" }

Args:
tn_electronic_tagger: TN electronic tagger
tn_electronic_verbalizer: TN electronic verbalizer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class FractionFst(GraphFst):
e.g. ein halb -> tokens { name: "1/2" }
e.g. ein ein halb -> tokens { name: "1 1/2" }
e.g. drei zwei ein hundertstel -> tokens { name: "3 2/100" }

Args:
itn_cardinal_tagger: ITN cardinal tagger
tn_fraction_verbalizer: TN fraction verbalizer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@

class TelephoneFst(GraphFst):
"""
Finite state transducer for classifying telephone numbers, e.g.
Finite state transducer for classifying telephone numbers, e.g.
null vier eins eins eins zwei drei vier eins zwei drei vier -> tokens { name: "(0411) 1234-1234" }

Args:
tn_cardinal_tagger: TN Cardinal Tagger
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class TimeFst(GraphFst):
e.g. drei vor zwölf -> time { minutes: "57" hours: "11" }
e.g. drei nach zwölf -> time { minutes: "3" hours: "12" }
e.g. drei uhr zehn minuten zehn sekunden -> time { hours: "3" minutes: "10" seconds: "10" }

Args:
tn_time_verbalizer: TN time verbalizer
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class TimeFst(GraphFst):
Finite state transducer for verbalizing time, e.g.
time { hours: "8" minutes: "30" zone: "e s t" } -> 08:30 Uhr est
time { hours: "8" } -> 8 Uhr
time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr
time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr
"""

def __init__(self, deterministic: bool = True):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "jetzt" } tokens { name: "ist" } tokens { time { hours: "12" minutes: "30" } } -> jetzt ist 12:30 Uhr
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def process(self, instance: Instance) -> Instance:

Args:
processes given instance with process function

Returns: processed instance if instance belongs to expected class type or original instance
"""
if instance.token_type != self.class_type:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
self.fst = final_graph.optimize()

def delete_word(self, word: str):
""" Capitalizes word for `cased` input"""
"""Capitalizes word for `cased` input"""
delete_graph = pynutil.delete(word).optimize()
if self.input_case == INPUT_CASED:
if len(word) > 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def _get_thousands_graph():

class DateFst(GraphFst):
"""
Finite state transducer for classifying date,
Finite state transducer for classifying date,
e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true }
e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true }
e.g. twenty twenty -> date { year: "2020" preserve_order: true }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def get_quantity(
e.g. one million -> integer_part: "1" quantity: "million"
e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"

Args:
Args:
decimal: decimal FST
cardinal_up_to_hundred: cardinal FST
input_case: accepting either "lower_cased" or "cased" input.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def get_serial_number(cardinal):

class TelephoneFst(GraphFst):
"""
Finite state transducer for classifying telephone numbers, e.g.
Finite state transducer for classifying telephone numbers, e.g.
one two three one two three five six seven eight -> { number_part: "123-123-5678" }

This class also supports card number and IP address formats.
Expand Down
4 changes: 2 additions & 2 deletions nemo_text_processing/inverse_text_normalization/en/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def num_to_word(x: Union[str, int]):
Args:
x: integer

Returns: spoken representation
Returns: spoken representation
"""
if isinstance(x, int):
x = str(x)
Expand All @@ -41,7 +41,7 @@ def get_abs_path(rel_path):

Args:
rel_path: relative path to this file

Returns absolute path
"""
return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

class FractionFst(GraphFst):
"""
Finite state transducer for verbalizing fraction,
Finite state transducer for verbalizing fraction,
"""

def __init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class TelephoneFst(GraphFst):
"""
Finite state transducer for verbalizing telephone, e.g.
telephone { number_part: "123-123-5678" }
-> 123-123-5678
-> 123-123-5678
"""

def __init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
class CardinalFst(GraphFst):
"""
Finite state transducer for classifying cardinals
e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"}
e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"}
This class converts cardinals up to (but not including) "un cuatrillón",
i.e. up to "one septillion" in English (10^{24}).
Cardinals below ten are not converted (in order to avoid
Cardinals below ten are not converted (in order to avoid
"vivo en una casa" --> "vivo en 1 casa" and any other odd conversions.)

Although technically Spanish grammar requires that "y" only comes after
Expand Down Expand Up @@ -199,7 +199,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
self.fst = final_graph.optimize()

def delete_word(self, word: str):
""" Capitalizes word for `cased` input"""
"""Capitalizes word for `cased` input"""
delete_graph = pynutil.delete(word).optimize()
if self.input_case == INPUT_CASED:
if len(word) > 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@

class DateFst(GraphFst):
"""
Finite state transducer for classifying date,
Finite state transducer for classifying date,
e.g. primero de enero -> date { day: "1" month: "enero" }
e.g. uno de enero -> date { day: "1" month: "enero" }

Args:
cardinal: CardinalFst
input_case: accepting either "lower_cased" or "cased" input.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def get_quantity(
e.g. one million -> integer_part: "1" quantity: "million"
e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"

Args:
Args:
decimal: decimal FST
cardinal_up_to_million: cardinal FST
input_case: accepting either "lower_cased" or "cased" input.
Expand Down Expand Up @@ -87,7 +87,7 @@ class DecimalFst(GraphFst):
This decimal rule assumes that decimals can be pronounced as:
(a cardinal) + ('coma' or 'punto') plus (any sequence of cardinals <1000, including 'zero')

Also writes large numbers in shortened form, e.g.
Also writes large numbers in shortened form, e.g.
e.g. uno coma dos seis millón -> decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" }
e.g. dos millones -> decimal { negative: "false" integer_part: "2" quantity: "millones" }
e.g. mil ochocientos veinticuatro millones -> decimal { negative: "false" integer_part: "1824" quantity: "millones" }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class ElectronicFst(GraphFst):
and URLS (which get converted to a "protocol" field).
e.g. c d f uno arroba a b c punto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
e.g. doble ve doble ve doble ve a b c punto e d u -> tokens { electronic { protocol: "www.abc.edu" } }

Args:
input_case: accepting either "lower_cased" or "cased" input.
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@
class FractionFst(GraphFst):
"""
Finite state transducer for classifying fractions
e.g. dos quintos -> fraction { numerator: "2" denominator: "5" }
This class converts fractions with a denominator up to (and including)
e.g. dos quintos -> fraction { numerator: "2" denominator: "5" }
This class converts fractions with a denominator up to (and including)
"1/999".

Fractions with 4 as their denominator, read as "cuarto(s)", are not
converted because "room" is also "cuarto", which could cause issues like
"quiero reservar un cuarto" -> "quiero reservar 1/4".

Fractions without a numerator are not converted either to prevent issues
like:
"estaba medio dormido" -> "estaba 1/2 dormido"

Args:
cardinal: CardinalFst
ordinal: OrdinalFst
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
class MeasureFst(GraphFst):
"""
Finite state transducer for classifying measure
e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" }
e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" }

Args:
cardinal: CardinalFst
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class OrdinalFst(GraphFst):
vigésimo primero -> ordinal { integer: "21" morphosyntactic_features: "o" }
This class converts ordinals up to "milésimo" (one thousandth) exclusive.

Cardinals below ten are not converted (in order to avoid
Cardinals below ten are not converted (in order to avoid
e.g. "primero hice ..." -> "1.º hice...", "segunda guerra mundial" -> "2.ª guerra mundial"
and any other odd conversions.)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

class TelephoneFst(GraphFst):
"""
Finite state transducer for classifying telephone numbers, e.g.
Finite state transducer for classifying telephone numbers, e.g.
uno dos tres uno dos tres cinco seis siete ocho -> { number_part: "123-123-5678" }.
If 10 digits are spoken, they are grouped as 3+3+4 (eg. 123-456-7890).
If 9 digits are spoken, they are grouped as 3+3+3 (eg. 123-456-789).
Expand All @@ -37,7 +37,7 @@ class TelephoneFst(GraphFst):
"twelve thirty four" = "1234".

(we ignore more complicated cases such as "three hundred and two" or "three nines").

Args:
input_case: accepting either "lower_cased" or "cased" input.
"""
Expand Down
Loading
Loading