Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 3di encoding to biotite.structure #665

Merged
merged 13 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,6 @@
"set_component",
"list_assemblies",
"get_assembly"

],
"CIF format" : [
"CIFFile",
Expand All @@ -402,5 +401,13 @@
"StringArrayEncoding",
"TypeCode"
]
},
"biotite.structure.alphabet" : {
"Structural alphabets": [
"I3DSequence"
],
"Conversion functions": [
"to_3di"
]
}
}
16 changes: 15 additions & 1 deletion doc/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,7 @@ @article{Steele2021
eprint = {2001.05304},
primaryclass = {cs},
doi = {10.48550/arXiv.2001.05304},
archiveprefix = {arxiv}
archiveprefix = {arXiv}
}

@article{Steinegger2017,
Expand Down Expand Up @@ -838,6 +838,20 @@ @article{VanHerk1992
doi = {10.1016/0167-8655(92)90069-C}
}

@article{VanKempen2024,
title = {Fast and Accurate Protein Structure Search with {{Foldseek}}},
author = {{van Kempen}, Michel and Kim, Stephanie S. and Tumescheit, Charlotte and Mirdita, Milot and Lee, Jeongjae and Gilchrist, Cameron L. M. and Söding, Johannes and Steinegger, Martin},
year = {2024},
month = feb,
journal = {Nature Biotechnology},
volume = {42},
number = {2},
pages = {243--246},
publisher = {Nature Publishing Group},
issn = {1546-1696},
doi = {10.1038/s41587-023-01773-0}
}

@article{Westbrook2015,
title = {The Chemical Component Dictionary: Complete Descriptions of Constituent Molecules in Experimentally Determined {{3D}} Macromolecules in the {{Protein Data Bank}}},
shorttitle = {The Chemical Component Dictionary},
Expand Down
52 changes: 34 additions & 18 deletions src/biotite/sequence/align/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

__all__ = ["SubstitutionMatrix"]
__name__ = "biotite.sequence.align"
__author__ = "Patrick Kunzmann"

import os
import functools
from pathlib import Path
import numpy as np
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence

__all__ = ["SubstitutionMatrix"]
# Directory of matrix files
_DB_DIR = Path(__file__).parent / "matrix_data"


class SubstitutionMatrix(object):
Expand Down Expand Up @@ -59,6 +62,10 @@ class SubstitutionMatrix(object):
- **RBLOSUM<n>_<BLOCKS>**
- **CorBLOSUM<n>_<BLOCKS>**

- Structural alphabet substitution matrices

- **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`

A list of all available matrix names is returned by
:meth:`list_db()`.

Expand Down Expand Up @@ -124,9 +131,6 @@ class SubstitutionMatrix(object):
>>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
"""

# Directory of matrix files
_db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")

def __init__(self, alphabet1, alphabet2, score_matrix):
self._alph1 = alphabet1
self._alph2 = alphabet2
Expand Down Expand Up @@ -350,7 +354,7 @@ def dict_from_db(matrix_name):
matrix_dict : dict
A dictionary representing the substitution matrix.
"""
filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat"
filename = _DB_DIR / f"{matrix_name}.mat"
with open(filename, "r") as f:
return SubstitutionMatrix.dict_from_str(f.read())

Expand All @@ -364,11 +368,10 @@ def list_db():
db_list : list
List of matrix names in the internal database.
"""
files = os.listdir(SubstitutionMatrix._db_dir)
# Remove '.mat' from files
return [file[:-4] for file in sorted(files)]
return [path.stem for path in _DB_DIR.glob("*.mat")]

@staticmethod
@functools.cache
def std_protein_matrix():
"""
Get the default :class:`SubstitutionMatrix` for protein sequence
Expand All @@ -379,9 +382,12 @@ def std_protein_matrix():
matrix : SubstitutionMatrix
Default matrix.
"""
return _matrix_blosum62
return SubstitutionMatrix(
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
)

@staticmethod
@functools.cache
def std_nucleotide_matrix():
"""
Get the default :class:`SubstitutionMatrix` for DNA sequence
Expand All @@ -392,13 +398,23 @@ def std_nucleotide_matrix():
matrix : SubstitutionMatrix
Default matrix.
"""
return _matrix_nuc
return SubstitutionMatrix(
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
)

@staticmethod
@functools.cache
def std_3di_matrix():
"""
Get the default :class:`SubstitutionMatrix` for 3Di sequence
alignments.

Returns
-------
matrix : SubstitutionMatrix
Default matrix.
"""
# Import inside function to avoid circular import
from biotite.structure.alphabet.i3d import I3DSequence

# Preformatted BLOSUM62 and NUC substitution matrix from NCBI
_matrix_blosum62 = SubstitutionMatrix(
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
)
_matrix_nuc = SubstitutionMatrix(
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
)
return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")
25 changes: 25 additions & 0 deletions src/biotite/sequence/align/matrix_data/3Di.mat
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# 3Di bit/2
# Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001
# Lambda (precomputed optional): 0.351568
A C D E F G H I K L M N P Q R S T V W Y X
A 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2 0
C -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9 0
D 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2 0
E 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3 0
F 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4 0
G -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2 0
H -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3 0
I -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8 0
K -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8 0
L -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9 0
M -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9 0
N -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5 0
P -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5 0
Q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5 0
R -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3 0
S -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9 0
T -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5 0
V -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11 0
W 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6 0
Y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 changes: 12 additions & 0 deletions src/biotite/structure/alphabet/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
A subpackage for converting structures to structural alphabet sequences.
"""

__name__ = "biotite.structure.alphabet"
__author__ = "Martin Larralde, Patrick Kunzmann"

from .i3d import *
Loading
Loading