-
Notifications
You must be signed in to change notification settings - Fork 1
/
spell_correct.py
67 lines (45 loc) · 1.91 KB
/
spell_correct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import numpy as np
INSERTION_COST = 0.2
def get_prefix_distance(input_string, candidate_string):
input_string = input_string.lower()
prefix_len = len(input_string)
candidate_string = candidate_string.lower()[:prefix_len]
candidate_len = len(candidate_string)
row_num = prefix_len + 1
col_num = candidate_len + 1
distance_matrix = np.zeros((row_num, col_num))
distance_matrix[0, 1:] = [x for x in range(1, col_num)]
distance_matrix[1:, 0] = [x for x in range(1, row_num)]
for i in range(1, row_num):
for j in range(1, col_num):
if input_string[i-1] == candidate_string[j-1]:
distance_matrix[i, j] = distance_matrix[i-1, j-1]
else:
distance_matrix[i, j] = min(distance_matrix[i-1,j-1],
distance_matrix[i-1,j],
distance_matrix[i,j-1]) + 1
return distance_matrix[-1,-1]
def get_insert_distance(input_string, candidate_string):
insert_ops = len(candidate_string) - len(input_string)
return INSERTION_COST*insert_ops
def get_edit_distance(input_string, candidate_string):
pdistance = get_prefix_distance(input_string, candidate_string)
idistance = get_insert_distance(input_string, candidate_string)
return pdistance + idistance
def get_candidates_with_distance(query, candidates):
rank = {}
for candidate in candidates:
edist = get_edit_distance(query, candidate)
# if edist > 3:
# continue
if rank.get(edist):
rank[edist].append(candidate)
else:
rank[edist] = [candidate]
return rank
def generate_rank(query, candidates):
result = get_candidates_with_distance(query, candidates)
ranked_candidates = []
for k in sorted(list(result.keys())):
ranked_candidates.extend(list(result[k]))
return ranked_candidates