NVIDIA · gayu-thri · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023
diff --git a/nemo_text_processing/inverse_text_normalization/en/data/swear_sequences.tsv b/nemo_text_processing/inverse_text_normalization/en/data/swear_sequences.tsv
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/profane.py b/nemo_text_processing/inverse_text_normalization/en/taggers/profane.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    INPUT_CASED,
+    INPUT_LOWER_CASED,
+    NEMO_ALPHA,
+    NEMO_DIGIT,
+    NEMO_SPACE,
+    GraphFst,
+    capitalized_input_graph,
+)
+from pynini.lib import pynutil
+
+
+class ProfaneFst(GraphFst):
+    """
+    Finite state transducer for classifying profane words
+        e.g. bitch -> profane { filtered: "b****" }
+
+    The first character of every listed word/phrase is kept and each following
+    letter, digit or space is replaced with "*", so the redacted form has the
+    same length as the spoken form.
+
+    This class has highest priority among all classifier grammars
+
+    Args:
+        input_case: accepting either "lower_cased" or "cased" input.
+        input_file: path to a file with profane words to be redacted with "*" symbol. (each line of the file: spoken_form\n)
+            e.g. nemo_text_processing/inverse_text_normalization/en/data/swear_sequences.tsv
+            The path is first resolved with `get_abs_path` (the original behaviour); if that
+            location does not exist but the path exists exactly as given, the path is used as is.
+            Defaults to the bundled "data/swear_sequences.tsv" when None.
+    """
+
+    def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None):
+        # Imported locally because this module has no file-level `import os`.
+        import os
+
+        super().__init__(name="profane", kind="classify")
+
+        # Profane Grammar
+        if input_file is None:
+            input_file = "data/swear_sequences.tsv"
+
+        # Keep the original resolution first so every call that worked before
+        # still loads the same file.
+        # NOTE(review): `get_abs_path` is defined outside this file; presumably it
+        # re-roots the path under the `en` package directory - confirm.
+        word_list_path = get_abs_path(input_file)
+        if not os.path.exists(word_list_path) and os.path.exists(input_file):
+            # A caller-supplied path (absolute, or relative to the working
+            # directory) that only exists as given must not be re-rooted.
+            word_list_path = input_file
+
+        # If neither location exists, `pynini.string_file` fails on the
+        # `get_abs_path` result, same as before this change.
+        profane_graph = pynini.string_file(word_list_path)
+
+        # Keep the leading letter/digit, then turn every following letter, digit
+        # or space into "*" (at least one masked character is required).
+        bowdlerize = (
+            (NEMO_ALPHA | NEMO_DIGIT) + pynini.closure(pynini.cross(NEMO_SPACE | NEMO_ALPHA | NEMO_DIGIT, "*"), 1)
+        ).optimize()
+
+        # Entries whose text is not accepted by `bowdlerize` are dropped by the composition.
+        profane_graph = (profane_graph @ bowdlerize).optimize()
+
+        if input_case == INPUT_CASED:
+            # NOTE(review): helper from graph_utils; presumably also accepts
+            # capitalized variants of the listed words - confirm.
+            profane_graph = capitalized_input_graph(profane_graph)
+
+        # Token insertion
+        final_profane_graph = pynutil.insert('filtered: "') + profane_graph + pynutil.insert('"')
+
+        # Inserts the profane tag
+        final_profane_graph = self.add_tokens(final_profane_graph)
+        self.fst = final_profane_graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py
@@ -24,6 +24,7 @@
 from nemo_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst
 from nemo_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.profane import ProfaneFst
 from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst
 from nemo_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.en.taggers.time import TimeFst
@@ -50,6 +51,8 @@ class ClassifyFst(GraphFst):
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
         whitelist: path to a file with whitelist replacements
+        filter_profanity: set to True to enable profanity filtering
+        profane_words: path to a file with profane words for redacting with "*" symbol
     """
 
     def __init__(
@@ -58,15 +61,23 @@ def __init__(
         cache_dir: str = None,
         overwrite_cache: bool = False,
         whitelist: str = None,
+        filter_profanity: bool = False,
+        profane_words: str = None,
     ):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
+
+        if filter_profanity:
+            fst_name = "tokenize_and_classify_with_profane_filtering"
+        else:
+            fst_name = "tokenize_and_classify_without_profane_filtering"
+
         if cache_dir is not None and cache_dir != "None":
             os.makedirs(cache_dir, exist_ok=True)
             far_file = os.path.join(cache_dir, f"en_itn_{input_case}.far")
         if not overwrite_cache and far_file and os.path.exists(far_file):
-            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            self.fst = pynini.Far(far_file, mode="r")[fst_name]
             logging.info(f"ClassifyFst.fst was restored from {far_file}.")
         else:
             logging.info(f"Creating ClassifyFst grammars.")
@@ -88,6 +99,7 @@ def __init__(
             punct_graph = PunctuationFst().fst
             electronic_graph = ElectronicFst(input_case=input_case).fst
             telephone_graph = TelephoneFst(cardinal, input_case=input_case).fst
+            profane_graph = ProfaneFst(input_case=input_case, input_file=profane_words).fst
 
             classify = (
                 pynutil.add_weight(whitelist_graph, 1.01)
@@ -102,6 +114,9 @@ def __init__(
                 | pynutil.add_weight(electronic_graph, 1.1)
                 | pynutil.add_weight(word_graph, 100)
             )
+            # Attempts to filter profane words only if `filter_profanity` field is set to True
+            if filter_profanity:
+                classify = (pynutil.add_weight(profane_graph, 0.0001) | classify).optimize()
 
             punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
             token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
@@ -115,5 +130,5 @@ def __init__(
             self.fst = graph.optimize()
 
             if far_file:
-                generator_main(far_file, {"tokenize_and_classify": self.fst})
+                generator_main(far_file, {fst_name: self.fst})
                 logging.info(f"ClassifyFst grammars are saved to {far_file}.")
diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/profane.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/profane.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+from pynini.lib import pynutil
+
+
+class ProfaneFst(GraphFst):
+    """
+    Finite state transducer for verbalizing profane words: removes the
+    `filtered:` field name and the surrounding double quotes, leaving only
+    the already-redacted text.
+        e.g. bitch -> profane { filtered: "b****" } -> b****
+    """
+
+    def __init__(self):
+        super().__init__(name="profane", kind="verbalize")
+
+        # Pieces of the serialized field, in the order they appear.
+        field_name = pynutil.delete("filtered:")
+        quote = pynutil.delete('"')
+        # One or more NEMO_NOT_QUOTE symbols are passed through unchanged.
+        redacted_text = pynini.closure(NEMO_NOT_QUOTE, 1)
+
+        field = field_name + delete_space + quote + redacted_text + quote + delete_space
+
+        # NOTE(review): `delete_tokens` is inherited from GraphFst; presumably it
+        # strips the enclosing `profane { ... }` wrapper - confirm.
+        self.fst = self.delete_tokens(field).optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize.py
@@ -20,6 +20,7 @@
 from nemo_text_processing.inverse_text_normalization.en.verbalizers.measure import MeasureFst
 from nemo_text_processing.inverse_text_normalization.en.verbalizers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.en.verbalizers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.en.verbalizers.profane import ProfaneFst
 from nemo_text_processing.inverse_text_normalization.en.verbalizers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.en.verbalizers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.en.verbalizers.whitelist import WhiteListFst
@@ -47,6 +48,8 @@ def __init__(self):
         whitelist_graph = WhiteListFst().fst
         telephone_graph = TelephoneFst().fst
         electronic_graph = ElectronicFst().fst
+        profane_graph = ProfaneFst().fst
+
         graph = (
             time_graph
             | date_graph
@@ -58,5 +61,6 @@ def __init__(self):
             | whitelist_graph
             | telephone_graph
             | electronic_graph
+            | profane_graph
         )
         self.fst = graph
diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
@@ -38,6 +38,9 @@ class InverseNormalizer(Normalizer):
         overwrite_cache: set to True to overwrite .far files
         max_number_of_permutations_per_split: a maximum number
             of permutations which can be generated from input sequence of tokens.
+        filter_profanity: set to True to enable profanity filtering
+        profane_words: path to a file with profane words to be redacted with "*" symbol. (each line of the file: spoken_form\n)
+            e.g. nemo_text_processing/inverse_text_normalization/en/data/swear_sequences.tsv
     """
 
     def __init__(
@@ -48,73 +51,106 @@ def __init__(
         cache_dir: str = None,
         overwrite_cache: bool = False,
         max_number_of_permutations_per_split: int = 729,
+        filter_profanity: bool = False,
+        profane_words: str = None,
     ):
 
         assert input_case in ["lower_cased", "cased"]
 
+        self.tagger = None
+
         if lang == 'en':  # English
             from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import (
                 VerbalizeFinalFst,
             )
 
-        elif lang == 'es':  # Spanish (Espanol)
-            from nemo_text_processing.inverse_text_normalization.es.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.es.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-
-        elif lang == 'pt':  # Portuguese (Português)
-            from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-
-        elif lang == 'ru':  # Russian (Russkiy Yazyk)
-            from nemo_text_processing.inverse_text_normalization.ru.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.ru.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-
-        elif lang == 'de':  # German (Deutsch)
-            from nemo_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.de.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-        elif lang == 'fr':  # French (Français)
-            from nemo_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-        elif lang == 'sv':  # Swedish (Svenska)
-            from nemo_text_processing.inverse_text_normalization.sv.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.sv.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-        elif lang == 'vi':  # Vietnamese (Tiếng Việt)
-            from nemo_text_processing.inverse_text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.vi.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-        elif lang == 'ar':  # Arabic
-            from nemo_text_processing.inverse_text_normalization.ar.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
-            )
-        elif lang == 'es_en':  # Arabic
-            from nemo_text_processing.inverse_text_normalization.es_en.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
+            self.tagger = ClassifyFst(
+                cache_dir=cache_dir,
+                whitelist=whitelist,
+                overwrite_cache=overwrite_cache,
+                input_case=input_case,
+                filter_profanity=filter_profanity,
+                profane_words=profane_words,
             )
-        elif lang == 'zh':  # Mandarin
-            from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import (
-                VerbalizeFinalFst,
+        else:
+            if lang == 'es':  # Spanish (Espanol)
+                from nemo_text_processing.inverse_text_normalization.es.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.es.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+
+            elif lang == 'pt':  # Portuguese (Português)
+                from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+
+            elif lang == 'ru':  # Russian (Russkiy Yazyk)
+                from nemo_text_processing.inverse_text_normalization.ru.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.ru.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+
+            elif lang == 'de':  # German (Deutsch)
+                from nemo_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.de.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+            elif lang == 'fr':  # French (Français)
+                from nemo_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+            elif lang == 'sv':  # Swedish (Svenska)
+                from nemo_text_processing.inverse_text_normalization.sv.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.sv.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+            elif lang == 'vi':  # Vietnamese (Tiếng Việt)
+                from nemo_text_processing.inverse_text_normalization.vi.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.vi.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+            elif lang == 'ar':  # Arabic
+                from nemo_text_processing.inverse_text_normalization.ar.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+            elif lang == 'es_en':  # Spanish-English (code-switched)
+                from nemo_text_processing.inverse_text_normalization.es_en.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+            elif lang == 'zh':  # Mandarin
+                from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import (
+                    ClassifyFst,
+                )
+                from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import (
+                    VerbalizeFinalFst,
+                )
+
+            self.tagger = ClassifyFst(
+                cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case
             )
-
-        self.tagger = ClassifyFst(
-            cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case
-        )
         self.verbalizer = VerbalizeFinalFst()
         self.parser = TokenParser()
         self.lang = lang

diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_profane.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_profane.txt
@@ -0,0 +1,21 @@
+fuck off~f*** off
+dont give a fuck about your problems~dont give a f*** about your problems
+holy shit~holy s***
+he was pissing us off~he was p****** us off
+the movie was bad ass~the movie was bad a**
+son of a bitch~son of a b****
+two girls one cup~t****************
+zoophilia~z********
+get out of the way fuckin bastard~get out of the way f***** b******
+belief that the white power is inherently superior is bad~belief that the w********** is inherently superior is bad
+two g one c~t**********
+not give a fuck~not give a f***
+oh shit~oh s***
+rapist~r*****
+sexting~s******
+assholes~a*******
+bullshit~b*******
+fucker~f*****
+piece of shit~p************
+2g1c~2***
+2 g 1 c~2******