forked from JimChengLin/JimCollection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SuffixTree.py
170 lines (138 loc) · 5.26 KB
/
SuffixTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
g_target = ''
class Node:
def __init__(self):
self.sub = {}
self.op = self.ed = None
self.link_to = None
@property
def is_root(self):
return self.op is None and self.ed is None
@property
def is_leaf(self):
return not self.is_root and self.link_to is None
@property
def is_inner(self):
return not self.is_root and self.link_to is not None
def __repr__(self):
if self.is_root:
return '<root>'
else:
op, ed = self.op, self.ed
if ed == ':ed':
ed = len(g_target)
return g_target[op:ed] + ' {}:{}'.format(op, ed)
def __lt__(self, other: 'Node'):
return str(self) < str(other)
class SuffixTree:
def __init__(self):
self.root = Node()
self.remainder = 0
self.cursor = 0
self.ac_node = self.root
self.ac_direction = 0
self.ac_offset = 0
def repr(self):
def print_tree(node: 'Node', level=0):
if level == 0:
prefix = ''
elif level == 1:
prefix = '-- '
else:
prefix = ' ' * (level - 1) + '-- '
print(prefix + str(node))
for child in sorted(node.sub.values()):
print_tree(child, level + 1)
print_tree(self.root)
print(self.__dict__)
def insert(self, char: str):
global g_target
g_target += char
self.remainder += 1
def case_1():
# 1.1. 无法坍缩, 建立新的叶节点
if char not in self.ac_node.sub:
leaf_node = Node()
leaf_node.op = self.cursor
leaf_node.ed = ':ed'
self.ac_node.sub[char] = leaf_node
self.remainder -= 1
# 1.2. 开始坍缩
else:
collapse_node = self.ac_node.sub[char]
self.ac_direction = collapse_node.op
self.ac_offset += 1
# 1. ac_node 是 root 的初始状态
if self.ac_node.is_root and self.ac_offset == 0:
case_1()
# 2. 已经坍缩
else:
collapse_node = self.ac_node.sub[g_target[self.ac_direction]]
# 2.1. 能否扩大坍缩? 可以
if char == g_target[collapse_node.op + self.ac_offset]:
# 2.1.1. 如果是 inner_node, 坍缩是否达已经到极限? 是
if collapse_node.is_inner \
and collapse_node.op + self.ac_offset == collapse_node.ed:
# 推移 ac_node
self.ac_node = collapse_node
self.ac_direction = collapse_node.op + self.ac_offset
self.ac_offset = 1
# 2.1.2. 一般情况
else:
self.ac_offset += 1
# 2.2. 无法继续坍缩, 炸开累积的后缀
else:
def split_grow():
# 原坍缩点成为 inner_node
collapse_node.ed = collapse_node.op + self.ac_offset
# 新节点继承 :ed
inherit_node = Node()
inherit_node.op = collapse_node.ed
inherit_node.ed = ':ed'
# 回连
collapse_node.sub[g_target[inherit_node.op]] = inherit_node
# 新节点记录 char
leaf_node = Node()
leaf_node.op = self.cursor
leaf_node.ed = ':ed'
collapse_node.sub[g_target[leaf_node.op]] = leaf_node
self.remainder -= 1
while self.remainder > 0:
# 2.2.1. 没有 suffix link 用于状态转移
if self.ac_node.link_to is None:
split_grow()
# 状态转移, 继续爆炸
self.ac_offset -= 1
self.ac_direction += 1
if self.ac_offset > 0:
next_collapse_node = self.ac_node.sub[g_target[self.ac_direction]]
collapse_node.link_to = next_collapse_node
collapse_node = next_collapse_node
# 累积后缀已炸完
else:
# 进入 case 1.
collapse_node.link_to = self.root
case_1()
# 2.2.2. 需要运用 suffix link
else:
split_grow()
# todo 跳转校正
# 状态转移
self.ac_node = self.ac_node.link_to
next_collapse_node = self.ac_node.sub[g_target[self.ac_direction]]
collapse_node.link_to = next_collapse_node
collapse_node = next_collapse_node
# suffix link 消耗完之后. 进入 case 2.2.1.
self.cursor += 1
if __name__ == '__main__':
t = SuffixTree()
t.insert('x')
t.insert('y')
t.insert('z')
t.insert('x')
t.insert('y')
t.insert('a')
t.insert('x')
t.insert('y')
t.insert('z')
t.insert('$')
t.repr()