-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathclb.py
73 lines (67 loc) · 2.69 KB
/
clb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
# encoding: utf-8
import gzip
import msgpack
import io
import regex
import time
# Pattern for a valid word: from start to end, one or more units where each
# unit is a Unicode letter (\p{L}) optionally followed by combining marks
# (\p{M}), an apostrophe, or a hyphen. The \p{...} property classes are
# matched through the `regex` module that this file imports.
STR_VALIDATE_REGEX = r"^((\p{L}\p{M}*)|\'|\-)+$"
def __validate_str(str):
    """
    Tell whether ``str`` is a non-empty word matching STR_VALIDATE_REGEX.

    The whole string has to match the pattern. ``fullmatch`` is used instead
    of ``search`` because ``$`` alone would also accept a string that ends
    with a newline character. The empty string never matches, as the pattern
    requires at least one character.

    Returns True if the string is valid, False otherwise.
    """
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # that existing keyword callers keep working.
    return regex.fullmatch(STR_VALIDATE_REGEX, str) is not None
def __freq_for_index(index, len_list):
    """
    Maps a position within the list to a frequency value.

    The relative position x = index / len_list is passed through the curve
    f(x) = 1 - x^2. That result is multiplied by 240 and shifted up by 15,
    so for 0.0 <= x <= 1.0 the returned integer lies in the interval [15;255]
    expected by the combined list.
    """
    relative_pos = index / len_list
    curve = 1 - relative_pos ** 2
    return int(240 * curve + 15)
def __header(lang_code):
    """
    Builds the header line of a combined list for the given language code.

    The language code is inserted into the dictionary, locale and description
    fields; the date field is the current time in milliseconds. The returned
    line ends with a newline character.
    """
    timestamp_ms = int(time.time() * 1000)
    template = (
        "dictionary=main:{lang_code},locale={lang_code},"
        "description=Auto-generated dictionary for {lang_code},"
        "date={date},version=1\n"
    )
    return template.format(lang_code=lang_code, date=timestamp_ms)
def cBpack(lang_code, src_path, swear_path, dst_path):
    """
    Works with a cBpack source and builds a combined list.
    Explanation of cBpack: https://github.com/LuminosoInsight/wordfreq/blob/7a742499a42a6539be772ab26b6460d7e160ae04/wordfreq/__init__.py#L37-L76
    Frequency calculation adjusted to work with 15..255 format

    Args:
        lang_code: Language code that is written into the header line.
        src_path: Path of the gzip-compressed msgpack (cBpack) file to read.
        swear_path: Path of a UTF-8 text file with one swear word per line.
            Blank lines and lines starting with "#" are ignored.
        dst_path: Path of the UTF-8 combined list file to write.

    Raises:
        ValueError: If the first entry of the source is not a dict with
            format "cB" and version 1.
    """
    with gzip.open(src_path, "rb") as f_src:
        data_raw = msgpack.load(f_src, raw=False)

    header = data_raw[0]
    if (
        not isinstance(header, dict)
        or header.get("format") != "cB"
        or header.get("version") != 1
    ):
        raise ValueError("Unexpected header: {}".format(header))

    # Every entry after the header is a list of words. Strip each word, keep
    # only the valid ones and drop lists that end up empty, so that the
    # position of a list among the remaining ones determines its frequency.
    data_sanitized = []
    for innerlist in data_raw[1:]:
        stripped_words = (word.strip() for word in innerlist)
        sanitized_list = [
            word for word in stripped_words if __validate_str(word)
        ]
        if sanitized_list:
            data_sanitized.append(sanitized_list)

    # A set keeps the membership test done for every written word O(1).
    swear_words = set()
    with io.open(swear_path, encoding="utf-8") as f_swear:
        for line in f_swear:
            swear_word = line.strip()
            if swear_word and not swear_word.startswith("#"):
                swear_words.add(swear_word)

    len_list = len(data_sanitized)
    with io.open(dst_path, "w", encoding="utf-8") as f_dst:
        # Write header of combined list first
        f_dst.write(__header(lang_code))
        for index, innerlist in enumerate(data_sanitized):
            # All words of one list share the frequency of that list
            freq = __freq_for_index(index, len_list)
            for word in innerlist:
                # Swear words stay in the list but get a frequency of 0
                adjusted_freq = 0 if word in swear_words else freq
                f_dst.write(" word={},f={}\n".format(word, adjusted_freq))