Skip to content

Commit

Permalink
Add profanity word matching
Browse files: browse the repository at this point in the history
  • Loading branch information
patrickgold committed Feb 26, 2021
1 parent 247aa37 commit cc712ff
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
15 changes: 13 additions & 2 deletions clb.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __header(lang_code):
return "dictionary=main:{lang_code},locale={lang_code},description=Auto-generated dictionary for {lang_code},date={date},version=1\n" \
.format(lang_code=lang_code,date=int(time.time()*1000))

def cBpack(lang_code, src_path, dst_path):
def cBpack(lang_code, src_path, swear_path, dst_path):
"""
Works with a cBpack source and builds a combined list.
Explanation of cBpack: https://github.com/LuminosoInsight/wordfreq/blob/7a742499a42a6539be772ab26b6460d7e160ae04/wordfreq/__init__.py#L37-L76
Expand All @@ -50,6 +50,13 @@ def cBpack(lang_code, src_path, dst_path):
sanitized_list.append(word_sanitized)
if len(sanitized_list) > 0:
data_sanitized.append(sanitized_list)
swear_words = []
with io.open(swear_path, encoding="utf-8") as f_swear:
for swear_word in f_swear.readlines():
swear_word = swear_word.strip()
if len(swear_word) > 0:
if swear_word[0] != "#":
swear_words.append(swear_word)
index = 0
with io.open(dst_path, "w", encoding="utf-8") as f_dst:
# Write header of combined list first
Expand All @@ -58,5 +65,9 @@ def cBpack(lang_code, src_path, dst_path):
for innerlist in data_sanitized:
freq = __freq_for_index(index, len_list)
for word in innerlist:
f_dst.write(" word={},f={}\n".format(word, freq))
adjusted_freq = freq
for swear_word in swear_words:
if swear_word in word:
adjusted_freq = 0
f_dst.write(" word={},f={}\n".format(word, adjusted_freq))
index += 1
2 changes: 1 addition & 1 deletion dicttool.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def make(src_def):
os.makedirs(".dicttool", exist_ok=True)
clb_path = f".dicttool/combined-list-{lang_code}.txt"
if src_type == "cBpack":
clb.cBpack(lang_code, src_path, clb_path)
clb.cBpack(lang_code, src_path, ".srcin/swearWords.txt", clb_path)
else:
print(" Error: Unsupported src_type provided. Skipping this entry...\n")
return False
Expand Down

0 comments on commit cc712ff

Please sign in to comment.