forked from dtuggener/CharSplit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
char_split_train.py
65 lines (52 loc) · 2.07 KB
/
char_split_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Estimate compound word boundaries based on ngram probabilities of n characters at word endings
Take as input a file with one word (a noun) per line
"""
__author__ = '[email protected]'
import re
import sys
import pickle
from collections import defaultdict
# N-gram occurrence counts, keyed by the n-gram string; unseen keys start at 0
start_ngrams = defaultdict(int)  # n-grams taken from the beginning of a word
in_ngrams = defaultdict(int)     # n-grams taken from the interior of a word
end_ngrams = defaultdict(int)    # n-grams taken from the end of a word
all_ngrams = defaultdict(int)    # every counted n-gram, regardless of position

# Limits and progress state
max_len = 20         # Longest n-gram length that is counted
max_words = 10 ** 7  # Stop after this many input lines
c = 0                # Number of input lines processed so far
# Gather counts
# Reads the word list named by the first command-line argument (one word per
# line) and counts, for every n-gram length from 3 to max_len, how often each
# character n-gram occurs at the start, at the end and in the interior of a
# word. Every occurrence is also added to all_ngrams.
print('Words analyzed of max.', str(max_words))
# The context manager closes the input file even on the early break or an error
with open(sys.argv[1], 'r', encoding='utf-8') as word_file:
    for line in word_file:

        line = line.strip().lower()
        # Hyphenated word: keep only the part following the last hyphen
        # (rpartition returns the whole string when there is no hyphen)
        line = line.rpartition('-')[2]
        line_middle = line[1:-1]  # Word without its first and last character
        # NOTE(review): a blank line (or a word ending in '-') leaves line == '',
        # which is then counted as a start and end n-gram - confirm this is intended.

        # n always runs up to max_len regardless of the word length, so for
        # words shorter than n the slices below yield the whole word again
        # ("overcounting" long words). The original author noted "lower
        # performance" for the variant range(3, len(line)+1).
        for n in range(3, max_len+1):

            ngram = line[:n]  # n-gram at the start of the word
            start_ngrams[ngram] += 1
            all_ngrams[ngram] += 1

            ngram = line[-n:]  # n-gram at the end of the word
            end_ngrams[ngram] += 1
            all_ngrams[ngram] += 1

            # n-grams inside the word; the range is empty when the interior
            # is shorter than n, so every slice here has exactly n characters
            for m in range(len(line_middle) - n + 1):
                ngram = line_middle[m:m+n]
                in_ngrams[ngram] += 1
                all_ngrams[ngram] += 1

        # Progress indicator on stderr, overwritten in place every 10000 lines
        if c % 10000 == 0:
            sys.stderr.write('\r'+str(c))
            sys.stderr.flush()
        c += 1

        if c == max_words:
            break

sys.stderr.write('\n')
def _to_probabilities(position_counts):
    """Convert the counts of one word position into probabilities.

    Each n-gram's count is divided by its total count in all_ngrams;
    n-grams counted at most once in this position are left out.
    """
    return {
        ngram: count / all_ngrams[ngram]
        for ngram, count in position_counts.items()
        if count > 1
    }


print('Calculating ngrams probabilities')
start_ngrams = _to_probabilities(start_ngrams)
end_ngrams = _to_probabilities(end_ngrams)
in_ngrams = _to_probabilities(in_ngrams)

# Store the three probability tables in one pickle file in the working directory
probabilities_by_position = {
    'prefix': start_ngrams,
    'infix': in_ngrams,
    'suffix': end_ngrams,
}
with open('ngram_probs.pickle', 'wb') as f:
    pickle.dump(probabilities_by_position, f, protocol=pickle.HIGHEST_PROTOCOL)