Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Suffix Tree implementation using Ukkonen algorithm #524

Open
wants to merge 35 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a9a5ecd
Added Z-function implementation
CarolLuca Apr 1, 2023
3943502
Fixed error in testing Z-function algorithm
CarolLuca Apr 1, 2023
4c30d0a
Added two arguments to z_function
CarolLuca Apr 1, 2023
2baa77a
Small string mistake fixed
CarolLuca Apr 1, 2023
fdec7c9
Instance of ODA wrong initialized fixed
CarolLuca Apr 1, 2023
7f584a8
Reorganized the algorithm's structure
CarolLuca Apr 1, 2023
b5887c0
Added missing newline character
CarolLuca Apr 1, 2023
a241ff1
Corrected error in test_algo.py
CarolLuca Apr 1, 2023
48366ba
Treated the null tests
CarolLuca Apr 1, 2023
767b7e7
Deleted trailing white spaces
CarolLuca Apr 1, 2023
55a7ae2
Fixed L206 and L231
CarolLuca Apr 2, 2023
774b402
Suffix tree class using Ukkonen algo
CarolLuca Apr 3, 2023
b331389
Merge branch 'codezonediitj:main' into main
CarolLuca Apr 3, 2023
a1bef9a
Merge https://github.com/CarolLuca/pydatastructs
CarolLuca Apr 3, 2023
b1bc9a8
Updated the suffix tree imports
CarolLuca Apr 3, 2023
8659f84
Solved import issue
CarolLuca Apr 3, 2023
c0309f8
Solved reported issues + preferences
CarolLuca Apr 3, 2023
67313c3
Made __new__ method work
CarolLuca Apr 4, 2023
b3bf2de
Updated asserts and coding style
CarolLuca Apr 4, 2023
0ad5483
Redistributed the auxiliary classes and improved test code
CarolLuca Apr 5, 2023
719a095
Fixed typo
CarolLuca Apr 5, 2023
4e1247d
Added test for long string
CarolLuca Apr 7, 2023
dbfed79
Changed test file location
CarolLuca Apr 7, 2023
9349742
Fixed test code for Linux/MacOS
CarolLuca Apr 7, 2023
466b3ef
Switched to a common encoding for all platforms
CarolLuca Apr 8, 2023
9622c6d
Added tests for auxiliary classes
CarolLuca Apr 8, 2023
9af2a5d
Fixed coding style preferences
CarolLuca Apr 8, 2023
d3a8a04
Added more tests
CarolLuca Apr 8, 2023
f0b3d35
Modified requested changes
CarolLuca Apr 19, 2023
2b8770f
Minor modifications regarding __new__ method
CarolLuca Apr 19, 2023
75a12d4
Try again with __init__ method
CarolLuca Apr 19, 2023
65f87ce
Coding style
CarolLuca Apr 19, 2023
cf67130
Minor flaw in testing
CarolLuca Apr 19, 2023
77af09a
Eliminated __init__ method
CarolLuca Apr 19, 2023
b8c6b45
Added the last part of the documentation
CarolLuca Apr 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pydatastructs/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
__all__ = []

from . import (
suffix_tree,
trie,
algorithms
)

from .suffix_tree import (
SuffixTree
)

__all__.extend(suffix_tree.__all__)

from .trie import (
Trie
)
Expand Down
241 changes: 241 additions & 0 deletions pydatastructs/strings/suffix_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
from pydatastructs.utils.misc_util import (
SuffixTreeNode, SuffixTreeEdge, Suffix, Backend, raise_if_backend_is_not_python)

__all__ = [
'SuffixTree'
]


class SuffixTree(object):
    """
    Represents a suffix tree.

    The tree is built online, one character at a time, following
    Ukkonen's algorithm. Nodes are kept in a list (index 0 is the
    root) and edges in a dictionary keyed by
    ``(source_node_index, first character of the edge label)``.
    Edge labels are stored as index ranges into ``string``. No
    terminal symbol is appended to ``string``.

    Parameters
    ==========

    string
        Required, it represents the sequence of
        characters around which the construction
        of the suffix tree takes place.

    case_insensitive
        Optional, by default, False. When True the
        string is lower-cased before the tree is built
        and every queried substring is lower-cased too,
        so 'A' and 'a' are treated as equal.

    kwargs
        Accepted for signature compatibility; not used
        by this implementation.

    References
    ==========

    .. [1] https://en.wikipedia.org/wiki/Suffix_tree
    """
    @classmethod
    def methods(cls):
        """
        Returns the names of the methods that form the
        public interface of this class.
        """
        return ['__new__', '__repr__', 'find', 'has']

    def __new__(cls, string="", case_insensitive=False, **kwargs):
        obj = super().__new__(cls)
        # Normalise the case *before* measuring the text: lower-casing
        # can change the length of some Unicode strings, and every index
        # stored in the tree must refer to the text that is actually kept.
        if case_insensitive:
            string = string.lower()
        obj.string = string
        obj.case_insensitive = case_insensitive
        # Index of the last character; used as the end of every leaf edge.
        obj.N = len(string) - 1
        obj.nodes = [SuffixTreeNode()]
        obj.edges = {}
        # The active point starts as the empty (explicit) suffix at the root.
        obj.active = Suffix(0, 0, -1)
        for i in range(len(string)):
            obj._add_prefix(i)
        return obj

    def __repr__(self):
        """
        Returns a tab separated table with one row per edge, ordered
        by source node index. Edges whose source node index is -1
        are skipped.
        """
        rows = ["\tStart \tEnd \tSuf \tFirst \tLast \tString\n"]
        ordered = sorted(self.edges.values(),
                         key=lambda edge: edge.source_node_index)
        for edge in ordered:
            if edge.source_node_index == -1:
                continue
            # Never slice past the last character of the text.
            top = min(self.N, edge.last_char_index)
            rows.append("\t%s \t%s \t%s \t%s \t%s \t%s\n" % (
                edge.source_node_index, edge.dest_node_index,
                self.nodes[edge.dest_node_index].suffix_node,
                edge.first_char_index, edge.last_char_index,
                self.string[edge.first_char_index:top + 1]))
        return "".join(rows)

    def _add_prefix(self, last_char_index):
        """
        Extends the tree with the character at ``last_char_index``
        (one phase of Ukkonen's algorithm). Starting from the active
        point it adds a leaf for every suffix that does not yet
        continue with that character, following suffix links.

        Parameters
        ==========

        last_char_index
            The index of the last character to be added to the tree.

        Returns
        =======

        None
        """
        last_parent_node = -1
        while True:
            parent_node = self.active.source_node_index
            if self.active.explicit():
                if (self.active.source_node_index,
                        self.string[last_char_index]) in self.edges:
                    # prefix is already in tree
                    break
            else:
                e = self.edges[self.active.source_node_index,
                               self.string[self.active.first_char_index]]
                if (self.string[e.first_char_index + self.active.length + 1]
                        == self.string[last_char_index]):
                    # prefix is already in tree
                    break
                # The active point is in the middle of an edge: make it
                # an explicit node so that a leaf can hang from it.
                parent_node = self._split_edge(e, self.active)

            # New leaf whose label runs to the end of the string.
            self.nodes.append(SuffixTreeNode())
            e = SuffixTreeEdge(last_char_index, self.N,
                               parent_node, len(self.nodes) - 1)
            self._insert_edge(e)

            # Link the internal node visited in the previous iteration
            # to the one visited now.
            if last_parent_node > 0:
                self.nodes[last_parent_node].suffix_node = parent_node
            last_parent_node = parent_node

            # Move on to the next shorter suffix.
            if self.active.source_node_index == 0:
                self.active.first_char_index += 1
            else:
                self.active.source_node_index = \
                    self.nodes[self.active.source_node_index].suffix_node
            self._canonize_suffix(self.active)
        if last_parent_node > 0:
            self.nodes[last_parent_node].suffix_node = parent_node
        self.active.last_char_index += 1
        self._canonize_suffix(self.active)

    def _insert_edge(self, edge):
        """
        Stores an edge under the key formed by its source node index
        and the first character of its label.

        Parameters
        ==========

        edge
            The Edge object to be inserted.

        Returns
        =======

        None
        """
        self.edges[(edge.source_node_index,
                    self.string[edge.first_char_index])] = edge

    def _remove_edge(self, edge):
        """
        Removes the entry stored for the edge passed as parameter.
        The key is recomputed from the edge's current source node
        index and first character.

        Parameters
        ==========

        edge
            The edge to be removed.

        Returns
        =======

        None
        """
        self.edges.pop(
            (edge.source_node_index, self.string[edge.first_char_index]))

    def _split_edge(self, edge, suffix):
        """
        Splits an existing edge at the point described by ``suffix``:
        a new node is appended, a new edge leads from the original
        source node to it, and the original edge is shortened so that
        it starts at the new node.

        Parameters
        ==========

        edge
            The edge to be split.
        suffix
            The suffix whose length determines where the split happens.

        Returns
        =======

        int
            The index of the newly created node.
        """
        self.nodes.append(SuffixTreeNode())
        e = SuffixTreeEdge(edge.first_char_index,
                           edge.first_char_index + suffix.length,
                           suffix.source_node_index,
                           len(self.nodes) - 1)
        # The old edge must be removed before it is modified because its
        # dictionary key is derived from the fields changed below.
        self._remove_edge(edge)
        self._insert_edge(e)
        # Initial suffix link of the new node; _add_prefix may overwrite it.
        self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index
        edge.first_char_index += suffix.length + 1
        edge.source_node_index = e.dest_node_index
        self._insert_edge(edge)
        return e.dest_node_index

    def _canonize_suffix(self, suffix):
        """
        Canonizes the given suffix in place: while the suffix is not
        explicit and spans at least the whole edge leaving its source
        node, that edge is consumed and the source node is advanced.

        The walk is iterative so that its depth is not limited by the
        interpreter's recursion limit.

        Parameters
        ==========

        suffix
            The suffix to be canonized.

        Returns
        =======

        None
        """
        while not suffix.explicit():
            e = self.edges[suffix.source_node_index,
                           self.string[suffix.first_char_index]]
            if e.length > suffix.length:
                # The suffix ends inside this edge: already canonical.
                break
            suffix.first_char_index += e.length + 1
            suffix.source_node_index = e.dest_node_index

    # Public methods
    def find(self, substring):
        """
        Searches for the given substring by walking the tree from
        the root.

        Parameters
        ==========

        substring
            The substring to search for.

        Returns
        =======

        int
            An index in the stored string at which an occurrence of
            ``substring`` starts, or -1 when the substring is empty
            or does not occur.
        """
        if not substring:
            return -1
        if self.case_insensitive:
            substring = substring.lower()
        curr_node = 0
        i = 0
        while i < len(substring):
            edge = self.edges.get((curr_node, substring[i]))
            if edge is None:
                return -1
            # Compare only as much of the edge label as is left to match.
            ln = min(edge.length + 1, len(substring) - i)
            start = edge.first_char_index
            if substring[i:i + ln] != self.string[start:start + ln]:
                return -1
            i += edge.length + 1
            curr_node = edge.dest_node_index
        # The match ends ``ln`` characters into the last edge's label.
        return edge.first_char_index - len(substring) + ln

    def has(self, substring):
        """
        Checks if the given substring is present in the suffix tree using the
        find() method and returns True if present, False otherwise.

        Parameters
        ==========

        substring
            The substring to be searched for in the suffix tree.

        Returns
        =======

        bool
            True if the substring is present in the suffix tree, False otherwise
        """
        return self.find(substring) != -1
1 change: 1 addition & 0 deletions pydatastructs/strings/tests/long_string.txt

Large diffs are not rendered by default.

119 changes: 119 additions & 0 deletions pydatastructs/strings/tests/test_suffix_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from pydatastructs.strings.suffix_tree import SuffixTree
from pydatastructs.utils.misc_util import SuffixTreeNode, SuffixTreeEdge, Suffix


def test_suffix_tree():
    """Functional checks for find(), has(), repr() and methods()."""
    # An empty tree matches nothing, not even the empty substring.
    empty_tree = SuffixTree('')
    for missing in ('not there', ''):
        assert empty_tree.find(missing) == -1
        assert empty_tree.has(missing) is False

    present = ('a', 'aa', 'aaa')

    # Repeated characters with the default, case-sensitive matching.
    tree = SuffixTree("aaa")
    for needle in present:
        assert tree.find(needle) == 0
    assert tree.find('b') == -1
    for needle in present:
        assert tree.has(needle) is True
    # 'A' is absent because matching is case sensitive by default.
    for needle in ('aaaa', 'b', 'A'):
        assert tree.has(needle) is False
    assert tree.find('x') == -1

    # Same text, this time with case-insensitive matching requested.
    tree = SuffixTree("aaa", True)
    for needle in present:
        assert tree.find(needle) == 0
    assert tree.find('b') == -1
    for needle in present:
        assert tree.has(needle) is True
    for needle in ('aaaa', 'b'):
        assert tree.has(needle) is False
    # Upper-case queries now match the lower-case text.
    assert tree.has('A') is True
    assert tree.find('x') == -1

    # repr() renders one row per edge below a fixed header.
    expected_repr = "\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t2 \taaa\n"
    assert repr(tree) == expected_repr

    # methods() lists the public interface.
    assert tree.methods() == ['__new__', '__repr__', 'find', 'has']

def test_suffix_tree2():
    """Build a tree from the long fixture text and look up known words."""
    import os

    # Resolve the fixture next to this test module so that the test does
    # not depend on the directory pytest is started from.
    fixture = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           "long_string.txt")
    # The context manager closes the file even if building the tree fails.
    with open(fixture, encoding="iso-8859-1") as f:
        st = SuffixTree(f.read())
    assert (st.find('Ukkonen') == 1498)
    assert (st.find('Optimal') == 11131)
    # Matching is case sensitive by default.
    assert (st.has('ukkonen') is False)


def test_suffix_tree3():
    """Check the auxiliary classes: SuffixTreeNode, SuffixTreeEdge, Suffix."""
    # A fresh node has no suffix link yet.
    fresh_node = SuffixTreeNode()
    assert isinstance(fresh_node, SuffixTreeNode)
    assert fresh_node.suffix_node == -1
    assert repr(fresh_node) == "Node(suffix link: -1)"

    # An edge exposes its four constructor arguments plus a derived length.
    sample_edge = SuffixTreeEdge(0, 3, 1, 2)
    assert isinstance(sample_edge, SuffixTreeEdge)
    edge_fields = (sample_edge.first_char_index, sample_edge.last_char_index,
                   sample_edge.source_node_index, sample_edge.dest_node_index)
    assert edge_fields[0] == 0
    assert edge_fields[1] == 3
    assert edge_fields[2] == 1
    assert edge_fields[3] == 2
    assert sample_edge.length == 3
    assert repr(sample_edge) == "Edge(1, 2, 0, 3)"

    # A suffix covering at least one character is implicit, not explicit.
    sample_suffix = Suffix(1, 2, 3)
    assert isinstance(sample_suffix, Suffix)
    assert sample_suffix.source_node_index == 1
    assert sample_suffix.first_char_index == 2
    assert sample_suffix.last_char_index == 3
    assert sample_suffix.length == 1
    assert sample_suffix.explicit() is False
    assert sample_suffix.implicit() is True


def test_suffix_tree4():
    """Check that repr() skips edges whose source node index is -1."""
    # The source node index is stored exactly as given.
    for source_index, dest_index in ((-1, 1), (0, 1), (1, 2)):
        probe = SuffixTreeEdge(0, 5, source_index, dest_index)
        assert probe.source_node_index == source_index

    tree = SuffixTree("banana")

    # Overwrite the root's 'b' edge with one that has no valid source node.
    orphan_edge = SuffixTreeEdge(-1, 1, -1, 1)
    tree.edges[(0, "b")] = orphan_edge
    assert orphan_edge.source_node_index == -1

    # Only the two remaining root edges are rendered.
    expected_repr = "\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t2 \t-1 \t1 \t5 \tanana\n\t0 \t3 \t-1 \t2 \t5 \tnana\n"
    assert repr(tree) == expected_repr


if __name__ == '__main__':
    # Run every test in definition order when executed as a script.
    for run_test in (test_suffix_tree, test_suffix_tree2,
                     test_suffix_tree3, test_suffix_tree4):
        run_test()
Loading