From cc712ffc70b8485ea8fbd25d06381a4fc2b9b906 Mon Sep 17 00:00:00 2001 From: Patrick Goldinger Date: Fri, 26 Feb 2021 02:18:04 +0100 Subject: [PATCH] Add profanity word matching --- clb.py | 15 +++++++++++++-- dicttool.py | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/clb.py b/clb.py index 01777ef..e02a95a 100644 --- a/clb.py +++ b/clb.py @@ -26,7 +26,7 @@ def __header(lang_code): return "dictionary=main:{lang_code},locale={lang_code},description=Auto-generated dictionary for {lang_code},date={date},version=1\n" \ .format(lang_code=lang_code,date=int(time.time()*1000)) -def cBpack(lang_code, src_path, dst_path): +def cBpack(lang_code, src_path, swear_path, dst_path): """ Works with a cBpack source and builds a combined list. Explaination of cBpack: https://github.com/LuminosoInsight/wordfreq/blob/7a742499a42a6539be772ab26b6460d7e160ae04/wordfreq/__init__.py#L37-L76 @@ -50,6 +50,13 @@ def cBpack(lang_code, src_path, dst_path): sanitized_list.append(word_sanitized) if len(sanitized_list) > 0: data_sanitized.append(sanitized_list) + swear_words = [] + with io.open(swear_path, encoding="utf-8") as f_swear: + for swear_word in f_swear.readlines(): + swear_word = swear_word.strip() + if len(swear_word) > 0: + if swear_word[0] != "#": + swear_words.append(swear_word) index = 0 with io.open(dst_path, "w", encoding="utf-8") as f_dst: # Write header of combined list first @@ -58,5 +65,9 @@ def cBpack(lang_code, src_path, dst_path): for innerlist in data_sanitized: freq = __freq_for_index(index, len_list) for word in innerlist: - f_dst.write(" word={},f={}\n".format(word, freq)) + adjusted_freq = freq + for swear_word in swear_words: + if swear_word in word: + adjusted_freq = 0 + f_dst.write(" word={},f={}\n".format(word, adjusted_freq)) index += 1 diff --git a/dicttool.py b/dicttool.py index cac4497..da39e35 100644 --- a/dicttool.py +++ b/dicttool.py @@ -42,7 +42,7 @@ def make(src_def): os.makedirs(".dicttool", exist_ok=True) clb_path = f".dicttool/combined-list-{lang_code}.txt" if src_type == "cBpack": - clb.cBpack(lang_code, src_path, clb_path) + clb.cBpack(lang_code, src_path, ".srcin/swearWords.txt", clb_path) else: print(" Error: Unsupported src_type provided. Skipping this entry...\n") return False