Skip to content

Commit

Permalink
Represent symbols as tuple to improve performance
Browse files: browse the repository at this point in the history
  • Loading branch information
padix-key committed Jul 23, 2024
1 parent c495c68 commit 82eba31
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 25 deletions.
12 changes: 6 additions & 6 deletions src/biotite/sequence/align/kmeralphabet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,10 @@ class KmerAlphabet(Alphabet):
>>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
>>> print(base_alphabet.get_symbols())
['A', 'C', 'G', 'T']
('A', 'C', 'G', 'T')
>>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
>>> print(kmer_alphabet.get_symbols())
['AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
('AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT')
Encode and decode *k-mers*:
Expand Down Expand Up @@ -210,8 +210,8 @@ class KmerAlphabet(Alphabet):
Returns
-------
symbols : list
A list of all *k-mer* symbols, i.e. all possible
symbols : tuple
A tuple of all *k-mer* symbols, i.e. all possible
combinations of *k* symbols from its *base alphabet*.
Notes
Expand All @@ -224,9 +224,9 @@ class KmerAlphabet(Alphabet):
to be created first.
"""
if isinstance(self._base_alph, LetterAlphabet):
return ["".join(self.decode(code)) for code in range(len(self))]
return tuple(["".join(self.decode(code)) for code in range(len(self))])
else:
return [list(self.decode(code)) for code in range(len(self))]
return tuple([list(self.decode(code)) for code in range(len(self))])


def extends(self, alphabet):
Expand Down
2 changes: 1 addition & 1 deletion src/biotite/sequence/align/multiple.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
# Create new matrix with neutral gap symbol
gap_symbol = GapSymbol.instance()
new_alphabet = Alphabet(
matrix.get_alphabet1().get_symbols() + [gap_symbol]
matrix.get_alphabet1().get_symbols() + (gap_symbol,)
)
new_score_matrix = np.zeros(
(len(new_alphabet), len(new_alphabet)), dtype=np.int32
Expand Down
31 changes: 13 additions & 18 deletions src/biotite/sequence/alphabet.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
"common_alphabet",
]

import copy
import string
from numbers import Integral
import numpy as np
Expand Down Expand Up @@ -105,7 +104,7 @@ class Alphabet(object):
def __init__(self, symbols):
if len(symbols) == 0:
raise ValueError("Symbol list is empty")
self._symbols = copy.deepcopy(list(symbols))
self._symbols = tuple(symbols)
self._symbol_dict = {}
for i, symbol in enumerate(symbols):
self._symbol_dict[symbol] = i
Expand All @@ -120,10 +119,10 @@ def get_symbols(self):
Returns
-------
symbols : list
Copy of the internal list of symbols.
symbols : tuple
The symbols.
"""
return copy.deepcopy(self._symbols)
return self._symbols

def extends(self, alphabet):
"""
Expand Down Expand Up @@ -244,7 +243,7 @@ def is_letter_alphabet(self):
return False
if isinstance(symbol, str):
symbol = symbol.encode("ASCII")
if symbol not in LetterAlphabet.PRINATBLES:
if symbol not in LetterAlphabet.PRINTABLES:
return False
return True

Expand All @@ -261,7 +260,11 @@ def __contains__(self, symbol):
return symbol in self.get_symbols()

def __hash__(self):
return hash(tuple(self._symbols))
symbols = self.get_symbols()
if isinstance(symbols, tuple):
return hash(symbols)
else:
return hash(tuple(symbols))

def __eq__(self, item):
if item is self:
Expand Down Expand Up @@ -293,7 +296,7 @@ class LetterAlphabet(Alphabet):
in this list.
"""

PRINATBLES = (string.digits + string.ascii_letters + string.punctuation).encode(
PRINTABLES = (string.digits + string.ascii_letters + string.punctuation).encode(
"ASCII"
)

Expand All @@ -306,7 +309,7 @@ def __init__(self, symbols):
raise ValueError(f"Symbol '{symbol}' is not a single letter")
if isinstance(symbol, str):
symbol = symbol.encode("ASCII")
if symbol not in LetterAlphabet.PRINATBLES:
if symbol not in LetterAlphabet.PRINTABLES:
raise ValueError(
f"Symbol {repr(symbol)} is not printable or whitespace"
)
Expand All @@ -332,15 +335,7 @@ def extends(self, alphabet):
return super().extends(alphabet)

def get_symbols(self):
"""
Get the symbols in the alphabet.
Returns
-------
symbols : list
Copy of the internal list of symbols.
"""
return [symbol.decode("ASCII") for symbol in self._symbols_as_bytes()]
return tuple([symbol.decode("ASCII") for symbol in self._symbols_as_bytes()])

def encode(self, symbol):
if not isinstance(symbol, (str, bytes)) or len(symbol) > 1:
Expand Down

0 comments on commit 82eba31

Please sign in to comment.