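"""Decompounds run-together text into words.

WordDictBuilder collects word frequencies from a corpus of .txt files;
WordDecompounder segments text with a Viterbi search over those frequencies;
WordDecompounderWithoutPrioriProb relies only on word ranks via a Zipf cost model.
"""
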
import os
import pickle
import string
from collections import Counter
from math import log
from nltk import word_tokenize, sent_tokenize


class WordDictBuilder:
    """Builds a word-frequency dictionary from a collection of .txt files."""

    def __init__(self):
        # Punctuation and special symbols to exclude from the dictionary.
        self.__stop_words = string.punctuation + '«»→↑—✰⛭№•/\\'
        self.__word_dict = None

    def build_dict(self, collection):
        """Counts word frequencies over every .txt file in `collection`."""
        self.__word_dict = Counter()
        for f in os.listdir(collection):
            if f.endswith('.txt'):
                # A context manager ensures each file handle is closed.
                with open(os.path.join(collection, f), 'r', encoding='utf-8') as fp:
                    self.collect_dictionary(fp.read())

    def save_dictionary(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.__word_dict, f)

    def load_dictionary(self, filename):
        with open(filename, 'rb') as f:
            self.__word_dict = pickle.load(f)
        return self.__word_dict

    def collect_dictionary(self, text):
        """Adds the lowercased alphabetic tokens of `text` to the counter."""
        counter = Counter(
            word.lower()
            for sent in sent_tokenize(text)
            for word in word_tokenize(sent)
            # Keep purely alphabetic tokens that are not stop characters.
            if word not in self.__stop_words and word.isalpha()
        )
        self.__word_dict += counter

    def normalize_frequencies(self, alpha=1.0):
        """Returns add-alpha (Laplace) smoothed relative frequencies."""
        n = sum(self.__word_dict.values())
        # Smooth with the vocabulary size so the probabilities sum to one.
        vocab = len(self.__word_dict)
        return {word: (freq + alpha) / (n + alpha * vocab)
                for word, freq in self.__word_dict.items()}
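    # Example (illustrative numbers): with counts {'a': 3, 'b': 1}, n = 4 and
    # a vocabulary of 2, alpha = 1 gives P('a') = 4/6 and P('b') = 2/6,
    # which sum to 1 as a proper distribution.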


class WordDecompounder:
    """Segments run-together text by maximizing word-sequence probability."""

    def __init__(self, path_to_dictionary):
        word_dict_builder = WordDictBuilder()
        self.__dictionary = word_dict_builder.load_dictionary(path_to_dictionary)
        self.__total = sum(self.__dictionary.values())
        self.__max_word_length = max(map(len, self.__dictionary))

    def split(self, text):
        return self._viterbi_segment(text)

    def __word_probability(self, word):
        prob = self.__dictionary[word]
        # Raw integer counts are normalized by the corpus total; float values
        # in [0, 1] are assumed to be pre-normalized probabilities.
        total = 1 if isinstance(prob, float) and 0 <= prob <= 1 else self.__total
        return prob / total

    # Find the most probable word sequence with the Viterbi algorithm:
    # probs[i] holds the probability of the best segmentation of text[:i],
    # and lasts[i] holds the start index of the last word in it.
    def _viterbi_segment(self, text):
        probs, lasts = [1.0], [0]
        for i in range(1, len(text) + 1):
            prob_pos, pos = max(
                (probs[j] * self.__word_probability(text[j:i]), j)
                for j in range(max(0, i - self.__max_word_length), i)
            )
            probs.append(prob_pos)
            lasts.append(pos)
        # Backtrack through `lasts` to recover the word boundaries.
        words = []
        i = len(text)
        while i > 0:
            words.append(text[lasts[i]:i])
            i = lasts[i]
        words.reverse()
        return words, probs[-1]
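
    # Illustration of the recurrence (hypothetical 3-character input "abc"):
    #   probs[3] = max(probs[0] * P("abc"), probs[1] * P("bc"), probs[2] * P("c"))
    # and lasts[3] records the split point j that achieved the maximum.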


class WordDecompounderWithoutPrioriProb:
    # Build a cost dictionary, assuming Zipf's law and cost = -log(probability):
    # the word at frequency rank r over N words gets cost log(r * log(N)).
    def __init__(self, path_to_dictionary):
        with open(path_to_dictionary, 'rb') as f:
            counts = pickle.load(f)
        # Sort by descending count so that enumeration order matches the
        # Zipfian rank the cost formula assumes.
        self.__words = sorted(counts, key=counts.get, reverse=True)
        total = len(self.__words)
        self.__wordcost = {w: log((i + 1) * log(total)) for i, w in enumerate(self.__words)}
        self.__maxword = max(map(len, self.__words))
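
    # Cost sanity check (illustrative): with N = 10000 words, the top-ranked
    # word costs log(1 * log(10000)) ≈ 2.22, while the 1000th-ranked word
    # costs log(1000 * log(10000)) ≈ 9.13, so rarer words are penalized.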

    def split(self, text):
        # Find the best match for the first `pos` characters, assuming costs
        # have been built for the first pos - 1 characters.
        # Returns a pair (match_cost, match_length).
        def best_match(pos):
            candidates = enumerate(reversed(costs[max(0, pos - self.__maxword):pos]))
            return min((c + self.__wordcost.get(text[pos - k - 1:pos], float('inf')), k + 1)
                       for k, c in candidates)
        # Build the cost array left to right.
        costs, lengths = [0.0], [0]
        for i in range(1, len(text) + 1):
            cost, length = best_match(i)
            costs.append(cost)
            lengths.append(length)

        # Backtrack to recover the minimal-cost segmentation.
        out = []
        i = len(text)
        while i > 0:
            out.append(text[i - lengths[i]:i])
            i -= lengths[i]
        return " ".join(reversed(out))


if __name__ == "__main__":
    path_to_collection = 'resources/corpus'
    print('Building the dictionary..')
    wdb = WordDictBuilder()
    wdb.build_dict(path_to_collection)
    normalized_dict = wdb.normalize_frequencies()
    print(len(normalized_dict))
    for key, val in sorted(normalized_dict.items(), key=lambda x: -x[1]):
        print(key, '--', val)

    PATH_TO_DICTIONARY = 'resources/corpus/dictionary.pkl'
    wdb.save_dictionary(PATH_TO_DICTIONARY)

    print()
    print('Decompounding text into words with corpus probabilities..')
    decompounder = WordDecompounder(PATH_TO_DICTIONARY)
    print(decompounder.split('малышикарлсонкоторыйживётнакрыше'))
    print(decompounder.split('зубзолотой'))
    print(decompounder.split('огнибольшогогорода'))
    print(decompounder.split('сказкаонильсеидикихгусях'))
    print(decompounder.split('царьпетр'))

    print()
    print('Decompounding text into words with Zipf-derived costs..')
    decompounder = WordDecompounderWithoutPrioriProb(PATH_TO_DICTIONARY)
    print(decompounder.split('малышикарлсонкоторыйживётнакрыше'))
    print(decompounder.split('зубзолотой'))
    print(decompounder.split('огнибольшогогорода'))
    print(decompounder.split('сказкаонильсеидикихгусях'))
    print(decompounder.split('царьпетр'))
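
# Note on outputs: WordDecompounder.split returns a (words, probability) pair,
# while WordDecompounderWithoutPrioriProb.split returns one space-joined
# string, so the two demo blocks above print in different formats.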