-
Notifications
You must be signed in to change notification settings - Fork 267
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding suffix tree #323
base: main
Are you sure you want to change the base?
Adding suffix tree #323
Changes from 5 commits
4432547
598330b
391302d
7fd9da7
68ef229
57fd9f9
cac7126
b315c5e
e3586fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,273 @@ | ||
from pydatastructs.utils.misc_util import SuffixNode | ||
|
||
__all__ = [ | ||
'SuffixTree' | ||
] | ||
|
||
class SuffixTree():
    """
    Represents Suffix Tree.

    Given a single string the tree indexes that string; given a list of
    strings a generalized suffix tree over all of them is built.

    Parameters
    ==========

    input: str or list of str
        The text(s) to index.
        Optional, by default ''. In that case nothing is indexed until
        ``build`` is called.

    Examples
    ========

    >>> from pydatastructs.strings import SuffixTree as suffix
    >>> s = suffix('hello')
    >>> s.find('he')
    0
    >>> s.find_all('l')
    {2, 3}
    >>> s.find('f')
    -1
    >>> lt=["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"]
    >>> s1 = suffix(lt)
    >>> s1.lcs()
    'abeced'

    References
    ==========

    .. [1] https://en.wikipedia.org/wiki/Suffix_tree
    .. [2] https://en.wikipedia.org/wiki/Generalized_suffix_tree
    """

    def __new__(cls, input=''):
        obj = object.__new__(cls)
        # The root is its own parent and its own suffix link, so the
        # construction and query loops never have to special-case it.
        obj.root = SuffixNode()
        obj.root.depth = 0
        obj.root.idx = 0
        obj.root.parent = obj.root
        obj.root._add_suffix_link(obj.root)
        # An unbuilt tree indexes the empty text, so queries on it report
        # "not found" instead of failing on a missing attribute.
        obj.word = ''
        if input != '':
            obj.build(input)
        return obj

    @classmethod
    def methods(cls):
        """
        Returns the names of the public methods of this class.
        """
        return ['__new__', 'lcs', 'find', 'find_all']

    def _check_input(self, input):
        """
        Classifies the input as 'str' or 'list' (list of str).

        Raises
        ======

        ValueError
            If the input is neither a string nor a list containing
            only strings.
        """
        if isinstance(input, str):
            return 'str'
        elif isinstance(input, list):
            if all(isinstance(item, str) for item in input):
                return 'list'
        raise ValueError("String argument should be of type String or a list of strings")

    def build(self, x):
        """
        Builds the Suffix tree on the given input.

        Parameters
        ==========

        x: str or list of str

        Returns
        =======

        None

        Raises
        ======

        ValueError
            If x is neither a string nor a list of strings.
        """
        kind = self._check_input(x)
        if kind == 'str':
            # A terminal symbol is appended so that no suffix is a prefix
            # of another one, i.e. every suffix ends in its own leaf.
            x += next(self._terminalSymbolsGenerator())
            self._build(x)
        elif kind == 'list':
            self._build_generalized(x)

    def _build(self, x):
        """
        Stores the (already terminated) text x and builds the tree on it.
        """
        self.word = x
        self._build_McCreight(x)

    def _build_McCreight(self, x):
        """
        Inserts every suffix of x into the tree, longest first, reusing
        suffix links to avoid restarting each insertion from the root.

        In the loop, u is the current node and d the number of characters
        of the suffix x[i:] that have been matched so far.
        """
        u = self.root
        d = 0
        for i in range(len(x)):
            # Walk down as long as the suffix x[i:] keeps matching.
            while u.depth == d and u._has_transition(x[d + i]):
                u = u._get_transition_link(x[d + i])
                d = d + 1
                # Compare along the edge that was just entered.
                while d < u.depth and x[u.idx + d] == x[i + d]:
                    d = d + 1
            # The match stopped in the middle of an edge: split it there.
            if d < u.depth:
                u = self._create_node(x, u, d)
            self._create_leaf(x, i, u, d)
            if not u._get_suffix_link():
                self._compute_slink(x, u)
            # Continue with the next (one character shorter) suffix.
            u = u._get_suffix_link()
            d = d - 1
            if d < 0:
                d = 0

    def _create_node(self, x, u, d):
        """
        Splits the edge entering u at depth d and returns the new
        internal node placed between u's former parent and u.
        """
        i = u.idx
        p = u.parent
        v = SuffixNode(idx=i, depth=d)
        v._add_transition_link(u, x[i + d])
        u.parent = v
        p._add_transition_link(v, x[i + p.depth])
        v.parent = p
        return v

    def _create_leaf(self, x, i, u, d):
        """
        Attaches below u a new leaf representing the suffix x[i:] and
        returns it.
        """
        w = SuffixNode()
        w.idx = i
        w.depth = len(x) - i
        u._add_transition_link(w, x[i + d])
        w.parent = u
        return w

    def _compute_slink(self, x, u):
        """
        Computes and stores the suffix link of u.

        Starting from the suffix link of u's parent, descends until the
        depth u.depth - 1 is reached; if the descent overshoots, the edge
        is split at exactly that depth.
        """
        d = u.depth
        v = u.parent._get_suffix_link()
        while v.depth < d - 1:
            v = v._get_transition_link(x[u.idx + v.depth + 1])
        if v.depth > d - 1:
            v = self._create_node(x, v, d - 1)
        u._add_suffix_link(v)

    def _build_generalized(self, xs):
        """
        Builds a generalized suffix tree on the list of strings xs.

        The strings are concatenated, each one followed by its own
        terminal symbol, and a tree is built on the concatenation. Every
        node is then labelled with the indices of the strings it
        belongs to.
        """
        terminal_gen = self._terminalSymbolsGenerator()
        _xs = ''.join([x + next(terminal_gen) for x in xs])
        self.word = _xs
        self._generalized_word_starts(xs)
        self._build(_xs)
        self.root._traverse(self._label_generalized)

    def _label_generalized(self, node):
        """
        Stores in node.generalized_idxs the set of indices of the input
        strings having a suffix below node.

        NOTE(review): an internal node reads the labels of its children,
        so this presumably relies on ``_traverse`` visiting children
        before their parent - confirm against SuffixNode.
        """
        if node.is_leaf():
            x = {self._get_word_start_index(node.idx)}
        else:
            x = {n for ns in node.transition_links.values() for n in ns.generalized_idxs}
        node.generalized_idxs = x

    def _get_word_start_index(self, idx):
        """
        Returns the index (in the input list) of the string which
        contains position idx of the concatenated text.
        """
        i = 0
        for _idx in self.word_starts[1:]:
            if idx < _idx:
                return i
            else:
                i += 1
        return i

    def _word_end(self, idx):
        """
        Returns the position, in the concatenated text, of the terminal
        symbol closing the input string which contains position idx.
        """
        following = self._get_word_start_index(idx) + 1
        if following < len(self.word_starts):
            return self.word_starts[following] - 1
        return len(self.word) - 1

    def lcs(self, stringIdxs = -1):
        """
        Finds the Longest Common Substring of the strings selected by
        stringIdxs. If stringIdxs is not provided, the longest common
        substring of all the strings is returned.

        Only available on a tree built from a list of strings.

        Parameters
        ==========

        stringIdxs: list of int
            Indices (in the list used for building the tree) of the
            strings to consider.
            Optional, any value which is not a list (by default, -1)
            selects all the strings.

        Returns
        =======

        Longest Common Substring
        """
        if isinstance(stringIdxs, list):
            stringIdxs = set(stringIdxs)
        else:
            stringIdxs = set(range(len(self.word_starts)))
        deepestNode = self._find_lcs(self.root, stringIdxs)
        start = deepestNode.idx
        end = start + deepestNode.depth
        # When a single string is selected the deepest node is a leaf,
        # whose label runs up to the end of the concatenated text. Stop
        # at the terminal symbol of the string the match starts in.
        # Labels of internal nodes never reach a terminal symbol, so
        # they are not affected.
        return self.word[start:min(end, self._word_end(start))]

    def _find_lcs(self, node, stringIdxs):
        """
        Returns the deepest node in the subtree of node which belongs
        to all the strings in stringIdxs.
        """
        candidates = [self._find_lcs(child, stringIdxs)
                      for child in node.transition_links.values()
                      if child.generalized_idxs.issuperset(stringIdxs)]
        if not candidates:
            return node
        return max(candidates, key=lambda n: n.depth)

    def _generalized_word_starts(self, xs):
        """
        Stores in self.word_starts the offset of every string of xs
        inside the concatenated text.
        """
        self.word_starts = []
        offset = 0
        for x in xs:
            self.word_starts.append(offset)
            # One extra position for the terminal symbol.
            offset += len(x) + 1

    def _locate(self, y):
        """
        Returns the node at (or just below) the end of the path spelling
        y from the root, None if y is not a substring.
        """
        node = self.root
        while True:
            edge = self._edgeLabel(node, node.parent)
            if edge.startswith(y):
                return node
            if not y.startswith(edge):
                # y leaves the tree in the middle of this edge.
                return None
            # The whole edge matched and y is strictly longer than it.
            y = y[len(edge):]
            node = node._get_transition_link(y[0])
            if not node:
                return None

    def find(self, y):
        """
        Finds the starting position of the substring y in the string used for
        building the Suffix tree.

        Parameters
        ==========

        y: str

        Returns
        =======

        Index of the starting position of string y in the string used for building the Suffix tree
        -1 if y is not a substring.
        """
        node = self._locate(y)
        if node is None:
            return -1
        return node.idx

    def find_all(self, y):
        """
        Finds all the starting positions of the substring y in the string
        used for building the Suffix tree.

        Parameters
        ==========

        y: str

        Returns
        =======

        Set of Index of the starting positions of string y in the string used for building the Suffix tree
        {} if y is not a substring.
        """
        node = self._locate(y)
        if node is None:
            # NOTE: this is an empty dict, not an empty set. It is kept
            # because existing callers compare the result against {}.
            return {}
        return {leaf.idx for leaf in node._get_leaves()}

    def _edgeLabel(self, node, parent):
        """
        Returns the text on the edge going from parent to node.
        """
        return self.word[node.idx + parent.depth: node.idx + node.depth]

    def _terminalSymbolsGenerator(self):
        """
        Generates the symbols used for terminating the input strings.

        The symbols are taken from the Unicode Private Use Areas, so that
        they are not expected to occur in the input strings. A new symbol
        is produced on every request, hence every string of a generalized
        suffix tree gets a distinct terminal.

        Raises
        ======

        ValueError
            When more symbols are requested than are available.
        """
        private_use_ranges = (
            (0xE000, 0xF8FF),
            (0xF0000, 0xFFFFD),
            (0x100000, 0x10FFFD),
        )
        # The ranges are walked lazily: most callers only need the first
        # few symbols.
        for first, last in private_use_ranges:
            for code_point in range(first, last + 1):
                yield chr(code_point)
        raise ValueError("To many input strings.")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from pydatastructs import SuffixTree | ||
from pydatastructs.utils.raises_util import raises | ||
import random, string | ||
|
||
def test_suffixtree():
    """
    Checks substring search on a single string, the longest common
    substring of a list of strings, and rejection of invalid input.

    References
    ==========
    .. https://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm

    """
    s = SuffixTree("HelloworldHe")
    assert s.find("Hel") == 0
    assert s.find_all("He") == {0, 10}
    assert s.find("Win") == -1
    # Only emptiness is checked: comparing against the literal {} would
    # tie the test to the result being a dict rather than a set.
    assert len(s.find_all("go")) == 0

    f = ['integer', 'inteinteger', 'integralerint', 'iaingerntier', 'regetnerireg', 'reger']
    s = SuffixTree(f)
    assert s.lcs() == 'er'

    # Neither an int nor a tuple is an accepted input type.
    assert raises(ValueError, lambda: SuffixTree(123))
    res = (100, 1, 0)
    assert raises(ValueError, lambda: SuffixTree(res))
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
Set, | ||
CartesianTreeNode, | ||
RedBlackTreeNode, | ||
TrieNode | ||
TrieNode, | ||
SuffixNode | ||
) | ||
__all__.extend(misc_util.__all__) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops left that point
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You have come early ☺