Profanity filtering for ITN - EN #86

Closed · wants to merge 20 commits into main from add-profanity-filtering

Changes from all commits (20 commits):
a8078de
Zh itn (#74)
BuyuanCui Jun 30, 2023
29a6272
Add profanity filtering for english ITN
gayu-thri Jun 30, 2023
d65ff7d
Add copyrights
gayu-thri Jun 30, 2023
d70a4ec
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 30, 2023
252bb6d
Add filter_profanity attr to InverseNormalizer
gayu-thri Jun 30, 2023
2e71ebb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 30, 2023
a8a7826
Different fst names with/without pf
gayu-thri Jun 30, 2023
0cfb3a8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 30, 2023
62efdd6
Rm written form in TSV and use fst operations to get it
gayu-thri Jun 30, 2023
1d5a362
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 30, 2023
f9e5bde
user configurable input file for profane words
gayu-thri Jul 3, 2023
b0a6a98
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 3, 2023
b3375c2
Merge branch 'main' into add-profanity-filtering
gayu-thri Jul 3, 2023
e6548dd
Fix error in CodeQL
gayu-thri Jul 25, 2023
a46ea2d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 25, 2023
bf4e9a1
Resolve PR comments
gayu-thri Aug 7, 2023
3c79f42
disable filtering profanity by default
gayu-thri Aug 7, 2023
1ce954c
Remove raising explicit ValueError when custom list is not passed
gayu-thri Aug 7, 2023
9c710c4
Set filer_profanity to True in profane test
gayu-thri Aug 7, 2023
9880c05
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 7, 2023
Files changed

nemo_text_processing/inverse_text_normalization/en/taggers/profane.py (new file)
@@ -0,0 +1,65 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
    INPUT_CASED,
    INPUT_LOWER_CASED,
    NEMO_ALPHA,
    NEMO_DIGIT,
    NEMO_SPACE,
    GraphFst,
    capitalized_input_graph,
)
from pynini.lib import pynutil


class ProfaneFst(GraphFst):
    """
    Finite state transducer for classifying profane words,
        e.g. bitch -> profane { filtered: "b****" }

    This class has the highest priority among all classifier grammars.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        input_file: path to a file with profane words to be redacted with the "*" symbol (each line of the file: spoken_form\n),
            e.g. nemo_text_processing/inverse_text_normalization/en/data/swear_sequences.tsv
    """

    def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None):
        super().__init__(name="profane", kind="classify")
        # Profane grammar
        if input_file is None:
            input_file = "data/swear_sequences.tsv"

        profane_graph = pynini.string_file(get_abs_path(input_file))

        # Keep the first alphanumeric character and map every following
        # character (letters, digits, and spaces) to "*".
        bowdlerize = (
            (NEMO_ALPHA | NEMO_DIGIT) + pynini.closure(pynini.cross(NEMO_SPACE | NEMO_ALPHA | NEMO_DIGIT, "*"), 1)
        ).optimize()

        profane_graph = (profane_graph @ bowdlerize).optimize()

        if input_case == INPUT_CASED:
            profane_graph = capitalized_input_graph(profane_graph)

        # Token insertion
        final_profane_graph = pynutil.insert('filtered: "') + profane_graph + pynutil.insert('"')

        # Inserts the profane tag
        final_profane_graph = self.add_tokens(final_profane_graph)
        self.fst = final_profane_graph.optimize()
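
For orientation, the snippet below is a minimal sketch (not part of this PR) of exercising the tagger directly; it assumes nemo_text_processing and pynini are installed so the packaged data/swear_sequences.tsv resolves.

# --- illustrative sketch, not part of the PR diff ---
# Compose a spoken string with the classifier FST and read off the single
# best (lowest-weight) path to see the tagged, redacted token.
import pynini
from nemo_text_processing.inverse_text_normalization.en.taggers.profane import ProfaneFst

tagger = ProfaneFst()
best_path = pynini.shortestpath("bitch" @ tagger.fst)
print(best_path.string())  # expected: profane { filtered: "b****" }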
nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py

@@ -24,6 +24,7 @@
from nemo_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.en.taggers.profane import ProfaneFst
from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.en.taggers.time import TimeFst
@@ -50,6 +51,8 @@ class ClassifyFst(GraphFst):
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
        filter_profanity: set to True to enable profanity filtering
        profane_words: path to a file with profane words for redacting with "*" symbol
    """

    def __init__(
@@ -58,15 +61,23 @@
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
        filter_profanity: bool = False,
        profane_words: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None

        if filter_profanity:
            fst_name = "tokenize_and_classify_with_profane_filtering"
        else:
            fst_name = "tokenize_and_classify_without_profane_filtering"

        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, f"en_itn_{input_case}.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")[fst_name]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(f"Creating ClassifyFst grammars.")
@@ -88,6 +99,7 @@
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst(input_case=input_case).fst
            telephone_graph = TelephoneFst(cardinal, input_case=input_case).fst
            profane_graph = ProfaneFst(input_case=input_case, input_file=profane_words).fst

            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
@@ -102,6 +114,9 @@
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )
            # Attempts to filter profane words only if `filter_profanity` field is set to True
            if filter_profanity:
                classify = (pynutil.add_weight(profane_graph, 0.0001) | classify).optimize()

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
@@ -115,5 +130,5 @@
            self.fst = graph.optimize()

        if far_file:
            generator_main(far_file, {fst_name: self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
nemo_text_processing/inverse_text_normalization/en/verbalizers/profane.py (new file)

@@ -0,0 +1,39 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
from pynini.lib import pynutil


class ProfaneFst(GraphFst):
    """
    Finite state transducer for verbalizing profane words,
        e.g. bitch -> profane { filtered: "b****" } -> b****
    """

    def __init__(self):
        super().__init__(name="profane", kind="verbalize")
        graph = (
            pynutil.delete("filtered:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
            + delete_space
        )

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
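
A quick sketch (again not from the PR) of the verbalizer's effect on one tagged token; the exact spacing of the input string assumes the serialization shown in the docstring above.

# --- illustrative sketch, not part of the PR diff ---
import pynini
from nemo_text_processing.inverse_text_normalization.en.verbalizers.profane import ProfaneFst

verbalizer = ProfaneFst()
tagged = 'profane { filtered: "b****" }'
print(pynini.shortestpath(tagged @ verbalizer.fst).string())  # expected: b****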
nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize.py

@@ -20,6 +20,7 @@
from nemo_text_processing.inverse_text_normalization.en.verbalizers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.profane import ProfaneFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.whitelist import WhiteListFst
@@ -47,6 +48,8 @@ def __init__(self):
        whitelist_graph = WhiteListFst().fst
        telephone_graph = TelephoneFst().fst
        electronic_graph = ElectronicFst().fst
        profane_graph = ProfaneFst().fst

        graph = (
            time_graph
            | date_graph
@@ -58,5 +61,6 @@
            | whitelist_graph
            | telephone_graph
            | electronic_graph
            | profane_graph
        )
        self.fst = graph
146 changes: 91 additions & 55 deletions nemo_text_processing/inverse_text_normalization/inverse_normalize.py
@@ -38,6 +38,9 @@ class InverseNormalizer(Normalizer):
        overwrite_cache: set to True to overwrite .far files
        max_number_of_permutations_per_split: a maximum number
            of permutations which can be generated from input sequence of tokens.
        filter_profanity: set to True to enable profanity filtering
        profane_words: path to a file with profane words to be redacted with "*" symbol. (each line of the file: spoken_form\n),
            e.g. nemo_text_processing/inverse_text_normalization/en/data/swear_sequences.tsv
    """

    def __init__(
@@ -48,73 +51,106 @@ def __init__(
        cache_dir: str = None,
        overwrite_cache: bool = False,
        max_number_of_permutations_per_split: int = 729,
        filter_profanity: bool = False,
        profane_words: str = None,
    ):

        assert input_case in ["lower_cased", "cased"]

        self.tagger = None

        if lang == 'en':  # English
            from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
            from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import (
                VerbalizeFinalFst,
            )

            self.tagger = ClassifyFst(
                cache_dir=cache_dir,
                whitelist=whitelist,
                overwrite_cache=overwrite_cache,
                input_case=input_case,
                filter_profanity=filter_profanity,
                profane_words=profane_words,
            )
        else:
            if lang == 'es':  # Spanish (Espanol)
                from nemo_text_processing.inverse_text_normalization.es.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.es.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )

            elif lang == 'pt':  # Portuguese (Português)
                from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )

            elif lang == 'ru':  # Russian (Russkiy Yazyk)
                from nemo_text_processing.inverse_text_normalization.ru.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.ru.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )

            elif lang == 'de':  # German (Deutsch)
                from nemo_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.de.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )
            elif lang == 'fr':  # French (Français)
                from nemo_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )
            elif lang == 'sv':  # Swedish (Svenska)
                from nemo_text_processing.inverse_text_normalization.sv.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.sv.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )
            elif lang == 'vi':  # Vietnamese (Tiếng Việt)
                from nemo_text_processing.inverse_text_normalization.vi.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.vi.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )
            elif lang == 'ar':  # Arabic
                from nemo_text_processing.inverse_text_normalization.ar.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )
            elif lang == 'es_en':  # Spanish-English (code-switched)
                from nemo_text_processing.inverse_text_normalization.es_en.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )
            elif lang == 'zh':  # Mandarin
                from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import (
                    ClassifyFst,
                )
                from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import (
                    VerbalizeFinalFst,
                )

            self.tagger = ClassifyFst(
                cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case
            )

[Check failure · Code scanning / CodeQL: Potentially uninitialized local variable. "Local variable 'ClassifyFst' may be used before it is initialized."]

        self.verbalizer = VerbalizeFinalFst()
        self.parser = TokenParser()
        self.lang = lang
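
Taken together, the new flags surface at the top level as sketched below (a usage sketch, not part of the diff; the expected output is one of the pairs asserted in the test data that follows):

# --- illustrative sketch, not part of the PR diff ---
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

# filter_profanity defaults to False, so redaction is strictly opt-in.
inverse_normalizer = InverseNormalizer(lang="en", filter_profanity=True)
print(inverse_normalizer.inverse_normalize("holy shit", verbose=False))  # -> holy s***

# A custom word list can be passed instead of the packaged TSV:
# InverseNormalizer(lang="en", filter_profanity=True, profane_words="/path/to/my_words.tsv")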
New test-case data for profanity filtering (each line: spoken input~expected redacted output):
@@ -0,0 +1,21 @@
fuck off~f*** off
dont give a fuck about your problems~dont give a f*** about your problems
holy shit~holy s***
he was pissing us off~he was p****** us off
the movie was bad ass~the movie was bad a**
son of a bitch~son of a b****
two girls one cup~t****************
zoophilia~z********
get out of the way fuckin bastard~get out of the way f***** b******
belief that the white power is inherently superior is bad~belief that the w********** is inherently superior is bad
two g one c~t**********
not give a fuck~not give a f***
oh shit~oh s***
rapist~r*****
sexting~s******
assholes~a*******
bullshit~b*******
fucker~f*****
piece of shit~p************
2g1c~2***
2 g 1 c~2******
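
These pairs are in the spoken~written shape NeMo's ITN tests consume; a hypothetical harness (the file name and helper below are assumptions, modeled on the repo's existing parametrized tests) would look like:

# --- hypothetical test harness, not part of the PR diff ---
import pytest
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

inverse_normalizer = InverseNormalizer(lang="en", filter_profanity=True)

def load_cases(path):
    # Each nonempty line is "spoken~expected"; split once on the separator.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n").split("~", 1) for line in f if line.strip()]

@pytest.mark.parametrize("spoken, expected", load_cases("test_cases_profane.txt"))
def test_profane_redaction(spoken, expected):
    assert inverse_normalizer.inverse_normalize(spoken, verbose=False) == expected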