From cc712ffc70b8485ea8fbd25d06381a4fc2b9b906 Mon Sep 17 00:00:00 2001
From: Patrick Goldinger <patrick.goldinger@pm.me>
Date: Fri, 26 Feb 2021 02:18:04 +0100
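Subject: [PATCH] Add profanity word matching

cBpack() now takes an additional swear_path argument and reads a list
of swear words from that file (UTF-8, one entry per line; blank lines
and lines starting with "#" are skipped). Any word that contains one of
these entries as a substring is written to the combined list with f=0
instead of its computed frequency. dicttool.py passes
".srcin/swearWords.txt" as the list path.
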
---
 clb.py      | 15 +++++++++++++--
 dicttool.py |  2 +-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/clb.py b/clb.py
index 01777ef..e02a95a 100644
--- a/clb.py
+++ b/clb.py
@@ -26,7 +26,7 @@ def __header(lang_code):
     return "dictionary=main:{lang_code},locale={lang_code},description=Auto-generated dictionary for {lang_code},date={date},version=1\n" \
             .format(lang_code=lang_code,date=int(time.time()*1000))
 
-def cBpack(lang_code, src_path, dst_path):
+def cBpack(lang_code, src_path, swear_path, dst_path):
     """
     Works with a cBpack source and builds a combined list.
     Explaination of cBpack: https://github.com/LuminosoInsight/wordfreq/blob/7a742499a42a6539be772ab26b6460d7e160ae04/wordfreq/__init__.py#L37-L76
@@ -50,6 +50,13 @@ def cBpack(lang_code, src_path, dst_path):
                 sanitized_list.append(word_sanitized)
         if len(sanitized_list) > 0:
             data_sanitized.append(sanitized_list)
+    swear_words = []
+    with io.open(swear_path, encoding="utf-8") as f_swear:
+        for swear_word in f_swear.readlines():
+            swear_word = swear_word.strip()
+            if len(swear_word) > 0:
+                if swear_word[0] != "#":
+                    swear_words.append(swear_word)
     index = 0
     with io.open(dst_path, "w", encoding="utf-8") as f_dst:
         # Write header of combined list first
@@ -58,5 +65,9 @@ def cBpack(lang_code, src_path, dst_path):
         for innerlist in data_sanitized:
             freq = __freq_for_index(index, len_list)
             for word in innerlist:
-                f_dst.write(" word={},f={}\n".format(word, freq))
+                adjusted_freq = freq
+                for swear_word in swear_words:
+                    if swear_word in word:
+                        adjusted_freq = 0
+                f_dst.write(" word={},f={}\n".format(word, adjusted_freq))
             index += 1
diff --git a/dicttool.py b/dicttool.py
index cac4497..da39e35 100644
--- a/dicttool.py
+++ b/dicttool.py
@@ -42,7 +42,7 @@ def make(src_def):
     os.makedirs(".dicttool", exist_ok=True)
     clb_path = f".dicttool/combined-list-{lang_code}.txt"
     if src_type == "cBpack":
-        clb.cBpack(lang_code, src_path, clb_path)
+        clb.cBpack(lang_code, src_path, ".srcin/swearWords.txt", clb_path)
     else:
         print("    Error: Unsupported src_type provided. Skipping this entry...\n")
         return False