# char_split.py (forked from dtuggener/CharSplit)
"""
Split German compound words
"""
__author__ = '[email protected], [email protected]'
import pickle
import re
import sys
import os
ngram_probs = None
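# A sketch of the model's shape, inferred from the lookups below: a dict with
# 'prefix', 'infix' and 'suffix' keys, each mapping character n-grams to float
# scores, e.g. {'suffix': {'bahn': 0.73, ...}, ...} (the value is illustrative).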


def init(file: str = 'ngram_probs.pickle'):
    """Load the n-gram probabilities from a pickle file.

    A bare file name is resolved relative to this module's directory.
    """
    if os.path.dirname(file) == '':
        file = os.path.join(os.path.dirname(__file__), file)
    global ngram_probs
    ngram_probs = None  # allows earlier garbage collection if this isn't the first init call
    with open(file, 'rb') as f:
        ngram_probs = pickle.load(f)
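
# A minimal usage sketch; the custom path below is hypothetical, and by default
# the model is read from 'ngram_probs.pickle' next to this module:
#
#     init()                                   # load the bundled default model
#     init('/data/models/ngram_probs.pickle')  # or point at any other pickle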


def split_compound(word: str):
    """
    Return a list of possible splits, best first.
    :param word: Word to be split
    :return: List of all splits as [score, first part, second part]
    """
    global ngram_probs
    if ngram_probs is None:
        init()
    word = word.lower()

    # If there is a hyphen in the word, return the part behind the last hyphen
    if '-' in word:
        return [[1., word.title(), re.sub('.*-', '', word.title())]]

    scores = []  # Score for each possible split position
    # Iterate through the characters, starting at the fourth character and going to the 3rd last
    for n in range(3, len(word) - 2):
        pre_slice = word[:n]

        # Cut off a trailing Fugen-S (linking element)
        if pre_slice.endswith('ts') or pre_slice.endswith('gs') or pre_slice.endswith('ks') \
                or pre_slice.endswith('hls') or pre_slice.endswith('ns'):
            if len(word[:n - 1]) > 2:
                pre_slice = word[:n - 1]

        # Start, in, and end probabilities
        pre_slice_prob = []
        in_slice_prob = []
        start_slice_prob = []

        # Extract all n-grams
        for k in range(len(word) + 1, 2, -1):

            # Probability of the first part, given by the probability of its ending n-gram
            if pre_slice_prob == [] and k <= len(pre_slice):
                end_ngram = pre_slice[-k:]  # Look backwards
                pre_slice_prob.append(ngram_probs['suffix'].get(end_ngram, -1))  # Punish unlikely pre_slice end_ngram

            # Probability of the n-gram occurring within a word; if high, a split here is unlikely
            in_ngram = word[n:n + k]
            in_slice_prob.append(ngram_probs['infix'].get(in_ngram, 1))  # Favor n-grams not occurring within words

            # Probability of the second part starting here
            if start_slice_prob == []:
                ngram = word[n:n + k]
                # Cut off a trailing Fugen-S
                if ngram.endswith('ts') or ngram.endswith('gs') or ngram.endswith('ks') \
                        or ngram.endswith('hls') or ngram.endswith('ns'):
                    if len(ngram[:-1]) > 2:
                        ngram = ngram[:-1]
                start_slice_prob.append(ngram_probs['prefix'].get(ngram, -1))

        if pre_slice_prob == [] or start_slice_prob == []:
            continue

        start_slice_prob = max(start_slice_prob)
        pre_slice_prob = max(pre_slice_prob)  # Highest, i.e. best, pre_slice
        in_slice_prob = min(in_slice_prob)  # Lowest, i.e. punish splitting of good in-grams
        score = start_slice_prob - in_slice_prob + pre_slice_prob
        scores.append([score, word[:n].title(), word[n:].title()])

    if scores == []:
        scores = [[0, word.title(), word.title()]]
    return sorted(scores, reverse=True)
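
# A sketch of the expected output shape (scores are illustrative, not actual
# model values); each candidate is [score, first part, second part]:
#
#     >>> split_compound('Autobahnraststätte')[:2]
#     [[0.79, 'Autobahn', 'Raststätte'], [0.57, 'Autobahnrast', 'Stätte']]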


def germanet_evaluation(file: str = 'split_compounds_from_GermaNet13.0.txt', print_errors: bool = False):
    """ Test on GermaNet compounds from http://www.sfs.uni-tuebingen.de/lsd/compounds.shtml """
    cases, correct = 0, 0
    with open(file, 'r') as f:
        for line in f.readlines()[2:]:  # Skip the two header lines
            cases += 1
            sys.stderr.write('\r' + str(cases))
            sys.stderr.flush()
            line = line.strip().split('\t')
            if not len(line) == 3:
                continue  # A few corrupted lines
            split_result = split_compound(line[0])
            if split_result != []:
                if split_result[0][2] == line[2]:
                    correct += 1
                elif print_errors:
                    print(line, split_result)
            if cases % 10000 == 0:
                print(' Accuracy (' + str(correct) + '/' + str(cases) + '): ', 100 * correct / cases)
    print(' Accuracy (' + str(correct) + '/' + str(cases) + '): ', 100 * correct / cases)
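
# Input format sketch (inferred from the parsing above): two header lines, then
# tab-separated rows whose first column is the compound and whose third column
# is the expected second part, e.g.
#
#     Autobahnraststätte<TAB>Autobahn<TAB>Raststätte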


if __name__ == '__main__':
    do_eval = False
    if do_eval:
        germanet_evaluation(print_errors=False)
    # Split the word given as the first command-line argument and print all
    # candidate splits, one per line
    for x in split_compound(sys.argv[1]):
        print('\t'.join([str(y) for y in x]))
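
# Example invocation (assumes ngram_probs.pickle sits next to this file):
#
#     $ python char_split.py Autobahnraststätte
#
# Prints one candidate per line as: score<TAB>first part<TAB>second part.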