diff --git a/doc/apidoc.json b/doc/apidoc.json index cce4cb16a..55371866d 100644 --- a/doc/apidoc.json +++ b/doc/apidoc.json @@ -376,7 +376,6 @@ "set_component", "list_assemblies", "get_assembly" - ], "CIF format" : [ "CIFFile", @@ -402,5 +401,13 @@ "StringArrayEncoding", "TypeCode" ] + }, + "biotite.structure.alphabet" : { + "Structural alphabets": [ + "I3DSequence" + ], + "Conversion Function": [ + "to_3di" + ] } } diff --git a/doc/references.bib b/doc/references.bib index 4a08c984c..c83f89f40 100644 --- a/doc/references.bib +++ b/doc/references.bib @@ -742,7 +742,7 @@ @article{Steele2021 eprint = {2001.05304}, primaryclass = {cs}, doi = {10.48550/arXiv.2001.05304}, - archiveprefix = {arxiv} + archiveprefix = {arXiv} } @article{Steinegger2017, @@ -838,6 +838,20 @@ @article{VanHerk1992 doi = {10.1016/0167-8655(92)90069-C} } +@article{VanKempen2024, + title = {Fast and Accurate Protein Structure Search with {{Foldseek}}}, + author = {{van Kempen}, Michel and Kim, Stephanie S. and Tumescheit, Charlotte and Mirdita, Milot and Lee, Jeongjae and Gilchrist, Cameron L. M. and Söding, Johannes and Steinegger, Martin}, + year = {2024}, + month = feb, + journal = {Nature Biotechnology}, + volume = {42}, + number = {2}, + pages = {243--246}, + publisher = {Nature Publishing Group}, + issn = {1546-1696}, + doi = {10.1038/s41587-023-01773-0} +} + @article{Westbrook2015, title = {The Chemical Component Dictionary: Complete Descriptions of Constituent Molecules in Experimentally Determined {{3D}} Macromolecules in the {{Protein Data Bank}}}, shorttitle = {The Chemical Component Dictionary}, diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 2a7d23437..7c80fa1fc 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -2,14 +2,17 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
+__all__ = ["SubstitutionMatrix"] __name__ = "biotite.sequence.align" __author__ = "Patrick Kunzmann" -import os +import functools +from pathlib import Path import numpy as np from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence -__all__ = ["SubstitutionMatrix"] +# Directory of matrix files +_DB_DIR = Path(__file__).parent / "matrix_data" class SubstitutionMatrix(object): @@ -59,6 +62,10 @@ class SubstitutionMatrix(object): - **RBLOSUM_** - **CorBLOSUM_** + - Structural alphabet substitution matrices + + - **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024` + A list of all available matrix names is returned by :meth:`list_db()`. @@ -124,9 +131,6 @@ class SubstitutionMatrix(object): >>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50") """ - # Directory of matrix files - _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data") - def __init__(self, alphabet1, alphabet2, score_matrix): self._alph1 = alphabet1 self._alph2 = alphabet2 @@ -350,7 +354,7 @@ def dict_from_db(matrix_name): matrix_dict : dict A dictionary representing the substitution matrix. """ - filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat" + filename = _DB_DIR / f"{matrix_name}.mat" with open(filename, "r") as f: return SubstitutionMatrix.dict_from_str(f.read()) @@ -364,11 +368,10 @@ def list_db(): db_list : list List of matrix names in the internal database. """ - files = os.listdir(SubstitutionMatrix._db_dir) - # Remove '.mat' from files - return [file[:-4] for file in sorted(files)] + return [path.stem for path in _DB_DIR.glob("*.mat")] @staticmethod + @functools.cache def std_protein_matrix(): """ Get the default :class:`SubstitutionMatrix` for protein sequence @@ -379,9 +382,12 @@ def std_protein_matrix(): matrix : SubstitutionMatrix Default matrix. 
""" - return _matrix_blosum62 + return SubstitutionMatrix( + ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62" + ) @staticmethod + @functools.cache def std_nucleotide_matrix(): """ Get the default :class:`SubstitutionMatrix` for DNA sequence @@ -392,13 +398,23 @@ def std_nucleotide_matrix(): matrix : SubstitutionMatrix Default matrix. """ - return _matrix_nuc + return SubstitutionMatrix( + NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC" + ) + + @staticmethod + @functools.cache + def std_3di_matrix(): + """ + Get the default :class:`SubstitutionMatrix` for 3Di sequence + alignments. + Returns + ------- + matrix : SubstitutionMatrix + Default matrix. + """ + # Import inside function to avoid circular import + from biotite.structure.alphabet.i3d import I3DSequence -# Preformatted BLOSUM62 and NUC substitution matrix from NCBI -_matrix_blosum62 = SubstitutionMatrix( - ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62" -) -_matrix_nuc = SubstitutionMatrix( - NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC" -) + return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di") diff --git a/src/biotite/sequence/align/matrix_data/3Di.mat b/src/biotite/sequence/align/matrix_data/3Di.mat new file mode 100644 index 000000000..93fe4e97b --- /dev/null +++ b/src/biotite/sequence/align/matrix_data/3Di.mat @@ -0,0 +1,25 @@ +# 3Di bit/2 +# Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001 +# Lambda (precomputed optional): 0.351568 + A C D E F G H I K L M N P Q R S T V W Y X +A 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2 0 +C -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9 0 +D 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2 0 +E 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 
-3 0 +F 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4 0 +G -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2 0 +H -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3 0 +I -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8 0 +K -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8 0 +L -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9 0 +M -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9 0 +N -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5 0 +P -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5 0 +Q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5 0 +R -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3 0 +S -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9 0 +T -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5 0 +V -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11 0 +W 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6 0 +Y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9 0 +X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ No newline at end of file diff --git a/src/biotite/structure/alphabet/__init__.py b/src/biotite/structure/alphabet/__init__.py new file mode 100644 index 000000000..1e1678822 --- /dev/null +++ b/src/biotite/structure/alphabet/__init__.py @@ -0,0 +1,12 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +A subpackage for converting structures to structural alphabet sequences. +""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde, Patrick Kunzmann" + +from .i3d import * diff --git a/src/biotite/structure/alphabet/encoder.py b/src/biotite/structure/alphabet/encoder.py new file mode 100644 index 000000000..9793a59f4 --- /dev/null +++ b/src/biotite/structure/alphabet/encoder.py @@ -0,0 +1,332 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. 
Please see 'LICENSE.rst' for further +# information. + +""" +Implementation of the encoder neural network adapted from ``foldseek``. +""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde" +__all__ = ["Encoder", "VirtualCenterEncoder", "PartnerIndexEncoder", "FeatureEncoder"] + +import abc +from importlib.resources import files as resource_files +import numpy +import numpy.ma +from biotite.structure.alphabet.layers import CentroidLayer, Model +from biotite.structure.alphabet.unkerasify import load_kerasify + + +class _BaseEncoder(abc.ABC): + @abc.abstractmethod + def encode(self, ca, cb, n, c): + """ + Encode the given atom coordinates to a different representation. + + Parameters + ---------- + ca, cb, n, c : ndarray, shape=(n, 3), dtype=float + The coordinates of the ``CA``, ``CB``, ``N`` and ``C`` atoms for each + residue. + *NaN* if missing, e.g. ``CB`` for glycine. + + Returns + ------- + encoded : MaskedArray, shape=(n, m), dtype=float + The encoded representation. + """ + raise NotImplementedError + + +class VirtualCenterEncoder(_BaseEncoder): + r""" + An encoder for converting a protein structure to a virtual center. + + For each residue, the coordinates of the virtual center are computed + from the coordinates of the ``CA``, ``CB`` and ``N`` atoms. The virtual center + :math:`V` is defined by the angle :math:`\theta = \angle V C_{\alpha} C_{\beta}`, + the dihedral angle :math:`\tau = \angle V C_{\alpha} C_{\beta} N` and the length + :math:`l = |V - C_{\alpha}|`. The default parameters used + in ``foldseek`` were selected after optimization on a validation set. + + Parameters + ---------- + distance_alpha_beta : float + The default distance between the ``CA`` and ``CB`` atoms to use when + reconstructing missing *Cβ* coordinates. + distance_alpha_v : float + The distance between the virtual center *V* and the ``CA`` atom, used to compute + the virtual center coordinates. 
+ theta : float + The angle θ between the virtual center *V*, the ``CA`` and ``CB`` atoms, used to + compute the virtual center coordinates. + tau : float + The dihedral angle τ between the virtual center *V* and the ``CA``, ``CB`` + and ``N`` atoms, used to compute the virtual center coordinates. + """ + + _DISTANCE_ALPHA_BETA = 1.5336 + + def __init__( + self, + *, + distance_alpha_beta=_DISTANCE_ALPHA_BETA, + distance_alpha_v=2.0, + theta=270.0, + tau=0.0, + ): + self.theta = theta + self.tau = tau + self.distance_alpha_v = distance_alpha_v + self.distance_alpha_beta = distance_alpha_beta + + @property + def theta(self): + return numpy.rad2deg(self._theta) + + @theta.setter + def theta(self, theta): + self._theta = numpy.deg2rad(theta) + self._cos_theta = numpy.cos(self._theta) + self._sin_theta = numpy.sin(self._theta) + + @property + def tau(self): + return numpy.rad2deg(self._tau) + + @tau.setter + def tau(self, tau): + self._tau = numpy.deg2rad(tau) + self._cos_tau = numpy.cos(self._tau) + self._sin_tau = numpy.sin(self._tau) + + def _compute_virtual_center(self, ca, cb, n): + assert ca.shape == n.shape + assert ca.shape == cb.shape + v = cb - ca + a = cb - ca + b = n - ca + # normal angle + k = _normalize(numpy.cross(a, b, axis=-1), inplace=True) + v = ( + v * self._cos_theta + + numpy.cross(k, v) * self._sin_theta + + k * (k * v).sum(axis=-1).reshape(-1, 1) * (1 - self._cos_theta) + ) + # dihedral angle + k = _normalize(n - ca, inplace=True) + v = ( + v * self._cos_tau + + numpy.cross(k, v) * self._sin_tau + + k * (k * v).sum(axis=-1).reshape(-1, 1) * (1 - self._cos_tau) + ) + # apply final vector to Cα + v *= self.distance_alpha_v + v += ca + return v + + def _approximate_cb_position(self, ca, n, c): + """ + Approximate the position of ``CB`` from the backbone atoms. 
+ """ + assert ca.shape == n.shape + assert ca.shape == c.shape + v1 = _normalize(c - ca, inplace=True) + v2 = _normalize(n - ca, inplace=True) + v3 = v1 / 3.0 + + b1 = numpy.add(v2, v3, out=v2) + b2 = numpy.cross(v1, b1, axis=-1) + u1 = _normalize(b1, inplace=True) + u2 = _normalize(b2, inplace=True) + + out = (numpy.sqrt(8) / 3.0) * ((-u1 / 2.0) - (u2 * numpy.sqrt(3) / 2.0)) - v3 + out *= self.distance_alpha_beta + out += ca + return out + + def _create_nan_mask(self, ca, n, c): + """ + Mask any row (i.e. residue) which contains at least one *NaN* value. + """ + mask_ca = numpy.isnan(ca).max(axis=1) + mask_n = numpy.isnan(n).max(axis=1) + mask_c = numpy.isnan(c).max(axis=1) + return (mask_ca | mask_n | mask_c).repeat(3).reshape(-1, 3) + + def encode(self, ca, cb, n, c): + ca = numpy.asarray(ca) + cb = numpy.asarray(cb) + n = numpy.asarray(n) + c = numpy.asarray(c) + + assert ca.shape == cb.shape + assert ca.shape == c.shape + assert ca.shape == n.shape + + # fix CB positions if needed + nan_indices = numpy.isnan(cb) + if numpy.any(nan_indices): + cb_approx = self._approximate_cb_position(ca, n, c) + # avoid writing to CB directly since it should be callee-save + cb_approx[~nan_indices] = cb[~nan_indices] + cb = cb_approx + # compute virtual center + vc = self._compute_virtual_center(ca, cb, n) + # mask residues without coordinates + return numpy.ma.masked_array( + vc, + mask=self._create_nan_mask(ca, n, c), + fill_value=numpy.nan, + ) + + +class PartnerIndexEncoder(_BaseEncoder): + """ + An encoder for converting a protein structure to partner indices. + + For each residue, the coordinates of the virtual center are computed from the + coordinates of the ``CA``, ``CB`` and ``N`` atoms. + A pairwise distance matrix is then created, and the index of the closest partner + residue is extracted for each position. 
+ """ + + def __init__(self): + self.vc_encoder = VirtualCenterEncoder() + + def _find_residue_partners( + self, + x, + ): + # compute pairwise squared distance matrix + r = numpy.sum(x * x, axis=-1).reshape(-1, 1) + r[0] = r[-1] = numpy.nan + D = r - 2 * numpy.ma.dot(x, x.T) + r.T + # avoid selecting residue itself as the best + D[numpy.diag_indices_from(D)] = numpy.inf + # get the closest non-masked residue + return numpy.nan_to_num(D, copy=False, nan=numpy.inf).argmin(axis=1) + + def encode(self, ca, cb, n, c): + # encode backbone atoms to virtual center + vc = self.vc_encoder.encode(ca, cb, n, c) + # find closest neighbor for each residue + return self._find_residue_partners(vc) + + +class FeatureEncoder(_BaseEncoder): + """ + An encoder for converting a protein structure to structural descriptors. + """ + + def __init__(self): + self.partner_index_encoder = PartnerIndexEncoder() + self.vc_encoder = self.partner_index_encoder.vc_encoder + + def _calc_conformation_descriptors(self, ca, partner_index, dtype=numpy.float32): + # build arrays of indices to use for vectorized angles + i = numpy.arange(1, ca.shape[-2] - 1) + j = partner_index[i] + # compute conformational descriptors + u1 = _normalize(ca[..., i, :] - ca[..., i - 1, :], inplace=True) + u2 = _normalize(ca[..., i + 1, :] - ca[..., i, :], inplace=True) + u3 = _normalize(ca[..., j, :] - ca[..., j - 1, :], inplace=True) + u4 = _normalize(ca[..., j + 1, :] - ca[..., j, :], inplace=True) + u5 = _normalize(ca[..., j, :] - ca[..., i, :], inplace=True) + desc = numpy.zeros((ca.shape[0], 10), dtype=dtype) + desc[i, 0] = numpy.sum(u1 * u2, axis=-1) + desc[i, 1] = numpy.sum(u3 * u4, axis=-1) + desc[i, 2] = numpy.sum(u1 * u5, axis=-1) + desc[i, 3] = numpy.sum(u3 * u5, axis=-1) + desc[i, 4] = numpy.sum(u1 * u4, axis=-1) + desc[i, 5] = numpy.sum(u2 * u3, axis=-1) + desc[i, 6] = numpy.sum(u1 * u3, axis=-1) + desc[i, 7] = numpy.linalg.norm(ca[i] - ca[j], axis=-1) + desc[i, 8] = numpy.clip(j - i, -4, 4) + desc[i, 9] = 
numpy.copysign(numpy.log(numpy.abs(j - i) + 1), j - i) + return desc + + def _create_descriptor_mask(self, mask, partner_index): + i = numpy.arange(1, mask.shape[0] - 1) + j = partner_index[i] + out = numpy.zeros((mask.shape[0], 10), dtype=numpy.bool_) + out[1:-1, :] |= ( + mask[i - 1] | mask[i] | mask[i + 1] | mask[j - 1] | mask[j] | mask[j + 1] + ).reshape(mask.shape[0] - 2, 1) + out[0] = out[-1] = True + return out + + def encode(self, ca, cb, n, c): + # encode backbone atoms to virtual center + vc = self.vc_encoder.encode(ca, cb, n, c) + # find closest neighbor for each residue + partner_index = self.partner_index_encoder._find_residue_partners(vc) + # build position features from residue angles + descriptors = self._calc_conformation_descriptors(ca, partner_index) + # create mask + mask = self._create_descriptor_mask(vc.mask[:, 0], partner_index) + return numpy.ma.masked_array( + descriptors, + mask=mask, + fill_value=numpy.nan, + ) + + +class Encoder(_BaseEncoder): + """ + An encoder for converting a protein structure to 3di states. 
+ """ + + _INVALID_STATE = 2 + _CENTROIDS = numpy.array( + [ + [-1.0729, -0.3600], + [-0.1356, -1.8914], + [0.4948, -0.4205], + [-0.9874, 0.8128], + [-1.6621, -0.4259], + [2.1394, 0.0486], + [1.5558, -0.1503], + [2.9179, 1.1437], + [-2.8814, 0.9956], + [-1.1400, -2.0068], + [3.2025, 1.7356], + [1.7769, -1.3037], + [0.6901, -1.2554], + [-1.1061, -1.3397], + [2.1495, -0.8030], + [2.3060, -1.4988], + [2.5522, 0.6046], + [0.7786, -2.1660], + [-2.3030, 0.3813], + [1.0290, 0.8772], + ] + ) + + def __init__(self): + self.feature_encoder = FeatureEncoder() + layers = load_kerasify( + resource_files(__package__).joinpath("encoder_weights_3di.kerasify") + ) + self.vae_encoder = Model(layers + (CentroidLayer(self._CENTROIDS),)) + + def encode( + self, + ca, + cb, + n, + c, + ): + descriptors = self.feature_encoder.encode(ca, cb, n, c) + states = self.vae_encoder(descriptors.data) + return numpy.ma.masked_array( + states, + mask=descriptors.mask[:, 0], + fill_value=self._INVALID_STATE, + ) + + +def _normalize(x, *, inplace=False): + norm = numpy.linalg.norm(x, axis=-1).reshape(*x.shape[:-1], 1) + return numpy.divide(x, norm, out=x if inplace else None, where=norm != 0) diff --git a/src/biotite/structure/alphabet/encoder_weights_3di.kerasify b/src/biotite/structure/alphabet/encoder_weights_3di.kerasify new file mode 100644 index 000000000..cfec8fbe4 Binary files /dev/null and b/src/biotite/structure/alphabet/encoder_weights_3di.kerasify differ diff --git a/src/biotite/structure/alphabet/i3d.py b/src/biotite/structure/alphabet/i3d.py new file mode 100644 index 000000000..1f295d1e1 --- /dev/null +++ b/src/biotite/structure/alphabet/i3d.py @@ -0,0 +1,131 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +NumPy port of the ``foldseek`` code for encoding structures to 3di. 
+""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde" +__all__ = ["I3DSequence", "to_3di"] + +from biotite.sequence.alphabet import LetterAlphabet +from biotite.sequence.sequence import Sequence +from biotite.structure.alphabet.encoder import Encoder +from biotite.structure.chains import get_chain_starts +from biotite.structure.util import coord_for_atom_name_per_residue + + +class I3DSequence(Sequence): + """ + Representation of a structure in the 3Di structural alphabet. + :footcite:`VanKempen2024` + + Parameters + ---------- + sequence : iterable object, optional + The 3Di sequence. + This may either be a list or a string. + May take upper or lower case letters. + By default the sequence is empty. + + See also + -------- + to_3di : Create 3Di sequences from a structure. + + References + ---------- + + .. footbibliography:: + + """ + + alphabet = LetterAlphabet( + [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + ] + ) + unknown_symbol = "D" + + def __init__(self, sequence=""): + if isinstance(sequence, str): + sequence = sequence.upper() + else: + sequence = [symbol.upper() for symbol in sequence] + seq_code = I3DSequence.alphabet.encode_multiple(sequence) + super().__init__() + self.code = seq_code + + def get_alphabet(self): + return I3DSequence.alphabet + + def __repr__(self): + return f'I3DSequence("{"".join(self.symbols)}")' + + +def to_3di(atoms): + """ + Encode each chain in the given structure to the 3Di structure alphabet. + :footcite:`VanKempen2024` + + Parameters + ---------- + atoms : AtomArray + The atom array to encode. + May contain multiple chains. + + Returns + ------- + sequences : list of Sequence, length=n + The encoded 3Di sequence for each peptide chain in the structure. + chain_start_indices : ndarray, shape=(n,), dtype=int + The atom index where each chain starts. + + References + ---------- + + .. 
footbibliography:: + + Examples + -------- + + >>> sequences, chain_starts = to_3di(atom_array) + >>> print(sequences[0]) + DQQVVCVVCPNVVNVDHGDD + """ + sequences = [] + chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) + for i in range(len(chain_start_indices) - 1): + start = chain_start_indices[i] + stop = chain_start_indices[i + 1] + chain = atoms[start:stop] + sequence = I3DSequence() + sequence.code = ( + Encoder() + .encode( + *coord_for_atom_name_per_residue(chain, ["CA", "CB", "N", "C"]), + ) + .filled() + ) + sequences.append(sequence) + return sequences, chain_start_indices[:-1] diff --git a/src/biotite/structure/alphabet/layers.py b/src/biotite/structure/alphabet/layers.py new file mode 100644 index 000000000..63279cd15 --- /dev/null +++ b/src/biotite/structure/alphabet/layers.py @@ -0,0 +1,86 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +Implementation of the neural network layers used in ``foldseek``. 
+""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde" +__all__ = ["Layer", "DenseLayer", "CentroidLayer", "Model"] + +import abc +import functools +import numpy + + +class Layer(abc.ABC): + @abc.abstractmethod + def __call__(self, x): + raise NotImplementedError + + +class DenseLayer(Layer): + def __init__(self, weights, biases=None, activation: bool = True): + self.activation = activation + self.weights = numpy.asarray(weights) + if biases is None: + self.biases = numpy.zeros(self.weights.shape[1]) + else: + self.biases = numpy.asarray(biases) + + def __call__(self, x): + x = numpy.asarray(x) + out = x @ self.weights + out += self.biases + + if self.activation: + return _relu(out, out=out) + else: + return out + + +class CentroidLayer(Layer): + def __init__(self, centroids) -> None: + self.centroids = numpy.asarray(centroids) + self.r2 = numpy.sum(self.centroids**2, axis=1).reshape(-1, 1).T + + def __call__(self, x): + # compute pairwise squared distance matrix + r1 = numpy.sum(x**2, axis=1).reshape(-1, 1) + D = r1 - 2 * x @ self.centroids.T + self.r2 + # find closest centroid + states = numpy.empty(D.shape[0], dtype=numpy.uint8) + D.argmin(axis=1, out=states) + return states + + +class Model: + def __init__(self, layers=()): + self.layers = list(layers) + + def __call__(self, x): + return functools.reduce(lambda x, f: f(x), self.layers, x) + + +def _relu( + x, + out=None, + *, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, +): + return numpy.maximum( + 0.0, + x, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) diff --git a/src/biotite/structure/alphabet/unkerasify.py b/src/biotite/structure/alphabet/unkerasify.py new file mode 100644 index 000000000..95e228af0 --- /dev/null +++ b/src/biotite/structure/alphabet/unkerasify.py @@ -0,0 +1,122 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. 
Please see 'LICENSE.rst' for further +# information. + +""" +Parser for extracting weights from Keras files. + +Adapted from `moof2k/kerasify <https://github.com/moof2k/kerasify>`_. +""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde" +__all__ = ["load_kerasify"] + +import enum +import functools +import itertools +import struct +import numpy as np +from biotite.structure.alphabet.layers import DenseLayer, Layer + + +class LayerType(enum.IntEnum): + DENSE = 1 + CONVOLUTION2D = 2 + FLATTEN = 3 + ELU = 4 + ACTIVATION = 5 + MAXPOOLING2D = 6 + LSTM = 7 + EMBEDDING = 8 + + +class ActivationType(enum.IntEnum): + LINEAR = 1 + RELU = 2 + SOFTPLUS = 3 + SIGMOID = 4 + TANH = 5 + HARD_SIGMOID = 6 + + +class KerasifyParser: + """An incomplete parser for model files serialized with `kerasify`. + + Notes + ----- + Only dense layers are supported, since the ``foldseek`` VQ-VAE model + is only using 3 dense layers. + """ + + def __init__(self, file) -> None: + self.file = file + self.buffer = bytearray(1024) + (self.n_layers,) = self._get("I") + + def read(self): + if self.n_layers == 0: + return None + + self.n_layers -= 1 + layer_type = LayerType(self._get("I")[0]) + if layer_type == LayerType.DENSE: + (w0,) = self._get("I") + (w1,) = self._get("I") + (b0,) = self._get("I") + weights = ( + np.frombuffer(self._read(f"={w0*w1}f"), dtype="f4") + .reshape(w0, w1) + .copy() + ) + biases = np.frombuffer(self._read(f"={b0}f"), dtype="f4").copy() + activation = ActivationType(self._get("I")[0]) + if activation not in (ActivationType.LINEAR, ActivationType.RELU): + raise NotImplementedError( + f"Unsupported activation type: {activation!r}" + ) + return DenseLayer(weights, biases, activation == ActivationType.RELU) + else: + raise NotImplementedError(f"Unsupported layer type: {layer_type!r}") + + def __iter__(self): + return self + + def __next__(self) -> Layer: + layer = self.read() + if layer is None: + raise StopIteration + return layer + + def _read(self, format: str) -> memoryview: + n = 
struct.calcsize(format) + if len(self.buffer) < n: + self.buffer.extend( + itertools.islice(itertools.repeat(0), n - len(self.buffer)) + ) + v = memoryview(self.buffer)[:n] + self.file.readinto(v) # type: ignore + return v + + def _get(self, format: str): + v = self._read(format) + return struct.unpack(format, v) + + +@functools.cache +def load_kerasify(file_path): + """ + Load the model layers from a ``.kerasify`` file. + + Parameters + ---------- + file_path : str + The path to the ``.kerasify`` file. + + Returns + ------- + layers : tuple of Layer + The model layers. + """ + with open(file_path, "rb") as file: + return tuple(KerasifyParser(file)) diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 4ff284168..f5661f58b 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -646,7 +646,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn): def _find_matches(query_arrays, reference_arrays): """ For each index in the `query_arrays` find the indices in the - `reference_arrays` where all query values the reference counterpart. + `reference_arrays` where all query values match the reference counterpart. If no match is found for a query, the corresponding index is -1. """ match_masks_for_all_columns = np.stack( diff --git a/src/biotite/structure/segments.py b/src/biotite/structure/segments.py index 5841346b3..f67c24d21 100644 --- a/src/biotite/structure/segments.py +++ b/src/biotite/structure/segments.py @@ -16,7 +16,7 @@ import numpy as np -def apply_segment_wise(starts, data, function, axis): +def apply_segment_wise(starts, data, function, axis=None): """ Generalized version of :func:`apply_residue_wise()` for residues and chains. 
@@ -36,7 +36,6 @@ def apply_segment_wise(starts, data, function, axis): value = function(segment) else: value = function(segment, axis=axis) - value = function(segment, axis=axis) # Identify the shape of the resulting array by evaluation # of the function return value for the first segment if processed_data is None: diff --git a/src/biotite/structure/util.py b/src/biotite/structure/util.py index cabbdc8f5..43816adfe 100644 --- a/src/biotite/structure/util.py +++ b/src/biotite/structure/util.py @@ -8,9 +8,18 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["vector_dot", "norm_vector", "distance", "matrix_rotate"] +__all__ = [ + "vector_dot", + "norm_vector", + "distance", + "matrix_rotate", + "coord_for_atom_name_per_residue", +] import numpy as np +from biotite.structure.atoms import AtomArray +from biotite.structure.error import BadStructureError +from biotite.structure.residues import get_residue_masks, get_residue_starts def vector_dot(v1, v2): @@ -94,3 +103,54 @@ def matrix_rotate(v, matrix): if orig_ndim > 2: v = v.reshape(*orig_shape) return v + + +def coord_for_atom_name_per_residue(atoms, atom_names): + """ + Get the coordinates of a specific atom for every residue. + + If a residue does not contain the specified atom, the coordinates are `NaN`. + If a residue contains multiple atoms with the specified name, an exception is + raised. + + Parameters + ---------- + atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) + The atom array or stack to get the residue-wise coordinates from. + atom_names : list of str, length=k + + Returns + ------- + coord: ndarray, shape=(k, m, r, 3) or shape=(k, r, 3) + The coordinates of the specified atom for each residue. 
+ """ + residue_starts = get_residue_starts(atoms) + all_residue_masks = get_residue_masks(atoms, residue_starts) + + if isinstance(atoms, AtomArray): + coord = np.full( + (len(atom_names), len(residue_starts), 3), + np.nan, + dtype=np.float32, + ) + else: + coord = np.full( + (len(atom_names), atoms.stack_depth(), len(residue_starts), 3), + np.nan, + dtype=np.float32, + ) + + for i, atom_name in enumerate(atom_names): + atom_mask_for_name = atoms.atom_name == atom_name + all_residue_masks_for_specified_atom = all_residue_masks & atom_mask_for_name + number_of_specified_atoms_per_residue = np.count_nonzero( + all_residue_masks_for_specified_atom, axis=-1 + ) + if np.any(number_of_specified_atoms_per_residue > 1): + raise BadStructureError(f"Multiple '{atom_name}' atoms per residue") + residues_with_specified_atom = number_of_specified_atoms_per_residue == 1 + coord[i, ..., residues_with_specified_atom, :] = atoms.coord[ + ..., atom_mask_for_name, : + ] + + return coord diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py index 570878945..ca10133bb 100644 --- a/tests/sequence/align/test_matrix.py +++ b/tests/sequence/align/test_matrix.py @@ -6,6 +6,7 @@ import pytest import biotite.sequence as seq import biotite.sequence.align as align +import biotite.structure.alphabet as strucalph @pytest.mark.parametrize( @@ -13,7 +14,7 @@ [ entry for entry in align.SubstitutionMatrix.list_db() - if entry not in ["NUC", "GONNET"] + if entry not in ["NUC", "GONNET", "3Di"] ], ) def test_matrices(db_entry): @@ -25,6 +26,35 @@ def test_matrices(db_entry): align.SubstitutionMatrix(alph1, alph2, db_entry) +@pytest.mark.parametrize( + "matrix_name, alphabet", + [ + ("3Di", strucalph.I3DSequence.alphabet), + ], +) +def test_structural_alphabet_matrices(matrix_name, alphabet): + """ + Test for exceptions when reading structural alphabet matrix files. 
+ """ + align.SubstitutionMatrix(alphabet, alphabet, matrix_name) + + +@pytest.mark.parametrize( + "method_name", + [ + "std_protein_matrix", + "std_nucleotide_matrix", + "std_3di_matrix", + ], +) +def test_default_matrices(method_name): + """ + Test for exceptions when using the static methods for getting default matrices. + """ + matrix = getattr(align.SubstitutionMatrix, method_name)() + assert isinstance(matrix, align.SubstitutionMatrix) + + def test_matrix_str(): """ Test conversion of substitution matrix to string via a small diff --git a/tests/structure/data/alphabet/README.rst b/tests/structure/data/alphabet/README.rst new file mode 100644 index 000000000..957108467 --- /dev/null +++ b/tests/structure/data/alphabet/README.rst @@ -0,0 +1,18 @@ +Structural alphabet sequences +============================== + +This directory contains structural alphabet sequences for the test structure files +from the `tests/structure/data/` directory, generated with the respective reference +implementation. + +3Di sequences +------------- + +The 3Di sequences in `i3d.fasta` were generated with `foldseek` according to +`these instructions `_: + +.. code-block:: console + + $ foldseek createdb --chain-name-mode 1 tests/structure/data/*.cif /tmp/biotite_3di + $ foldseek lndb /tmp/biotite_3di_h /tmp/biotite_3di_ss_h + $ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta \ No newline at end of file diff --git a/tests/structure/data/alphabet/i3d.fasta b/tests/structure/data/alphabet/i3d.fasta new file mode 100644 index 000000000..a931e9059 --- /dev/null +++ b/tests/structure/data/alphabet/i3d.fasta @@ -0,0 +1,216 @@ +>1aki_A THE STRUCTURE OF THE ORTHORHOMBIC FORM OF HEN EGG-WHITE LYSOZYME AT 1.5 ANGSTROMS RESOLUTION +DADDLQRVLVLCVVLPPACQVHDHSLLVSQQLCQPPVRAQADWDADPQQWIQGGSNRHIQQEDAPQPRGDNGNNPVVYHSVVSSDPRCPSVVSRVSVVCVPPVHVVVRVSSVVPPPPDPSVVSCPPHDD +>1crr_MODEL_21_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEAAEDFDQPQCVVLLVVCVAVNDRDPPPCLPDFDWDWHWDCLVHDRYTYTYGDHHPPDDDDDVNLVCLLPGFFYLQEYEQAVCVRVVVSVPVQVSSCVNVVHLDTQHAYEYEPPPDHPRNDDVVVVVVVVVVSVYHYWYYYSVPRPGSCVRVSSRVVSVVVD +>1crr_MODEL_22_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEEWDAQLCLVQLLVCVPVVDGDNCPPQADFDWRKDWDAAPNRIYIYGYGNGGDHPDDDPVSLVVLVVHFQYAQEYAQLVCVRVVVSQVVQVSSCVNVVHLQTLYAYEYEPPPDDRGPHPVVVCVVRCVVSPHYYWYYYSNPRPGSCVRVRSRVVSVVVD +>1crr_MODEL_23_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEAFDDPQCLVLLQVCVAVVDGDPDDALLDADWDWHWDQQDPGIYTYGYHDHGDHPPDDDVSLVVLQVGFFYELEAAQLPCVRVVVVQVVQPSSCVSVVHLDTLYAYEYENPPDDNGNNPPPPVCVRCVVSVHHYWYAYSNPRPGSCCRVSSRVVSVVPD +>1crr_MODEL_24_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEAEEDWAQFLCLVLLLVCVPVVDGDGCPDQADFDWRWHWDAAPHDIYIYTYGDGHDHDDDDVVSLVCLQVHFFYQLRAAQQPVVRVVRVVVVQVSSCVSVVHLQTQYEDEHEPPPDDDGNHDPPVQCCVQVVSVHHYWYAYSPPRRGSVCRVNVRVVSVVPD +>1crr_MODEL_25_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEAQEAWDLPQCRVLLLVCVAVNDRDPCPPLPDFDWRWDWDCLPPNIYIYGYGDGRDDDPDDPPSLVVLQVHQFYAQEAAQAPCVRVVVVVVVQPSSCVSVVHLDTQYEYEYEPPPDDRGRRDVVNVCVVVVVSPYHYWYYYSVPRRGSCCRVSVRCVSVVVD +>1crr_MODEL_26_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEQLEAFDPPLCLPLLQCCQPVNDRDPPPDAQDADWHWDWDAQPNPIYIYIYGSGGDDDDPDDVSLVCLQPHFNYAHEAAFAPCVRVVVVQVVQVSSCVSVVHQQTLHAYEYENPPDDDGNDDPVRVCVVCVVSVHYYWYAYSVPRRGSCVRVNSRVVSVVPD +>1crr_MODEL_27_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEEQEDWDQPLCLVQLVVCVPVPAGDPDPDQADADFDKDWDALDPRIYIYRYGSGGPHDDPPCVSLVCLQVHFFYAYEAEQLPVVRVVVCLPVVVCSCVSVVHLDTLYAHEYENPPDPDGNRDPPVVCCRQVVSPHHYFYAYSNVRHGSSVRVSSRCVSVVVD +>1crr_MODEL_28_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEAEEAWAAPLCPVLLVVCVPVVDRDDPDQLQDADWDWDFDPLVDDTYIYTYGDHRHDCPPPDVSLLSLQVHQQYALEAAQAPVVRVVRVVVVQPSSCVRVVHRDGQHAYEHEPPPDDDGPHDPVVQCCVQVVSVHYYYYAYSPPRRGSSVRVSVRVVSVVPD +>1crr_MODEL_29_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEEFAQPQCLVLLLCCVPVNDRDDRVQLQDFDKDWDWDQQPNPIYIYIYGNGRHDPDDDCVSLVCLQVHFNYALEAAQVDCVRVVVSVVVQVSSCVNVVHLDTQYEYEHENPPDDPGPRDPVNVCVPCVVSVYYYFYYYSVPRHGSCVRVNSSVVSVVVD +>1crr_MODEL_30_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEAWDQPQCPVLLQVCVPVVAGDPCPPLVDFDWGWDWDPQVDDIYTYTYGDDHDDPPDPVVSLVVLQGGFNYALEAAQQDVVRVVRVVVVQVSSCVSVVHLDTQYEYEYEDPVDDDGPHDPVRQCCVQVVSVHHYFYAYSNHTPRSCVRVNSRVVVVVVD +>1crr_MODEL_31_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEEFDLPLCQVLLLVCVAPNDRDPCDPQANWDWDWHWDPLPPDIYTYTYGYDGHDDDDDDVSLVVLQVGFQYEQEYAQLVCVRVVVCVPVVVSSCVNVVDLQTLYAYEHEDPPDDRGDHPVPVSCVVQVVSVHHYFYAYRNVGGRSSVRVSSRVVSVVPD +>1crr_MODEL_32_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEQLEAWDQPQCLVQLQVCQPVNDGDPPPPLQDFDWGKDWDDQPPDIYIYTYGDDHDDPPPRVVSLVCLQVGFNYAYEAEQVPCVRVVVVVCVQPSSCVNVVHQDTQHAYEYENCPDDHGNDDPVNVCVVCVVSVHYYFYYYSVVRHRSSVRVNSSCVSVVVD +>1crr_MODEL_33_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEQLEAFDQPQQPVLLLVCLVPNAGDPPPDLPDFDWDKDWDCLDNRIHIYTYGDGRQPPDDDDVSLVRLQGGFLYALRAAQAPVVRVVRVQCVQPSSCVNVVHLQTLYADEHEPPPDPDGPHPVVVVCVVQVVSPHYYYYYYRRPRRRSSVRVNSRVVSVVVD +>1crr_MODEL_34_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEEQEAWDPQLCLVQLQVCVVPNAGDNPPPLQDFDFGWDQDAFPNHTYTYGYGSGHDHDPDDPPSLQVLQVGFFYAQEAEFAPRVRVVRCVVRQPSSCVNVVHLDTQHAYEHENPPDDHGNDPVVVSCVVVVVSPYHYYYYYSHPRRCSCVRVRSRVVSVVVD +>1crr_MODEL_35_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEFEAAEAFDDPLQPPLLQVCQAPNARDPCPDLPDAFWGWHWDAQVNRIYTYTYGDGRDCPDDNVDNLVCLLGGFLYALRAEQQDVVRVVRVQCVQVSSCVSVVHNQGLYAYEHEPPPDPDGPHPVVNSVVRCVVSPHHYYYAYNNVGRRSCVSVNSSVVSVVVD +>1crr_MODEL_36_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEDFDLPQCLVQLLVCVVVVAGDDDDPQQDFDWRWDWDDQPPHTYIYTYGNGGDNPDDDDVSLVVLQVHFNYALEAAQQDVVRVVVVVVVQVSSCVSVVHLDTQYAYEHENPPDDPGNHDPVVVCVVCVVSVHYYFYYYSNPRGRSSVRVNSRVVSVVVD +>1crr_MODEL_37_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEAFDQPLCPVQLQVCVVVNDGDDDDQLQDADWGWDFDDQDPDTHIYTYGDGRDDPPPPDVSLVVLQVHQNYAHEAEFAPCVRVVVVLCVLVSSCVSVVPLQGLYAYEHEPPVDPDGNDDPVNVVVVCVVSPHDYFYAYSVHRPRSSVRVNVSVVSVVVD +>1crr_MODEL_38_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEAFDAPLCPPLLQVCVPVNDRDPPPPQPDFDWGKDWDALPPHIYIYTYGDGGDDPDDCVVSLVVLQVHFFYAHEAAQLDVVRVVRVVVVQVSSCVSVVHLQTLYEHEYENPPDDNGNHDPPVQCVVCVVSVHHYFYYYSVVRHGSNCRVSVRVVSVVVD +>1crr_MODEL_39_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEQLEAFDQPQCPVQLVVCLVPNAGDPDPPLPDFDWDKDFDQQVNRTYIYTYGDHRDDDPDDPPSLQVLQVGFQYAHEAAQVPVVRVVVVVVVLVVSCVNVVHLDGLYAYEHEPPPDDDGNRPVVVVCVVCVVSVHYYWYAYSNHRPGSCVRVSSRVVSVVVD +>1crr_MODEL_40_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEEFDAPLCVVQLVVCQPVNDRDPDDDLVDFDWAWHWDAQPPHIYIYIYGDHRDDDDDDDVSLVCLQVGFQYALEAAQAPCVSVVVSVVVLVSSCVNVVHLQTLYAYEHEPCPDPHGNCPPVNCCVSAVVSPHHYYYYYSVVRPCSCVRVNSRVVSVVVD +>1dix_A CRYSTAL STRUCTURE OF RNASE LE +DDDPQLAWWKKWKWWFVLQVDQAPDHWAFAVVDRADRFIATQAIAGHHLVQHGAWQQPPPPFDDCVVVVVCVVVCQHHHGGRHPPHYNCVVRLRRRCRTGVVSLCVQQVDPNSRVVLSSVVCVVQRLQVLCVVLVHHLPWFWAFPVSSQVSSCVVPVEGKDFAWDAGHVGFTATTMIIWMGGSNSPDTDHDSDDHDHDRDTIHTRHGD +>1f2n_A RICE YELLOW MOTTLE VIRUS +DDDFKDKDWDKAFQWFQKFALPAQFWDKAFPAQLSGVVSLLVQLQFFKKAWPKKKKWWAFDDDPPDFWKKKKAKALFQPDDTDSADVSSVPHHQIFMDTLHDANVCVCCNPPHCCPRIGMRIDDRDPGDIATHANDAPVVDDRVVRSRHTRIMMIMGINDNDNHIDGGTTMMITIMMMGGHGDDSVPGD +>1f2n_B RICE YELLOW MOTTLE VIRUS +DDDQKAKDKDKAFQWFQKFAQPALWWDKAFPAQLSGVVSLLVQLQFFKKAWDKKKKWKAFDDDPPDFWKKWKAKALFLVDDTDSADVSRVPHHQIFMGGRRFANVQVVCNPPRDPDSHGMRIDPRVPQDMATHANDAPNVDDRVVRSSHGRIMMTMGINHRDNHIDGGITMMITIIMMGGHGDDSVVGD +>1f2n_C RICE YELLOW MOTTLE VIRUS +DDDDDDDDDPDDDDDDPADPDDDPGDQKDKDWAKAFQWFQKFAQPALFWDKAFPAQLSGVVSVVNQLQFFKKAWDKKKKWKAFPDDPVDAWKKWKAKDLFQPDDTDSADVSSVVGHQIFMGGLRFANCCVVCNPPHCCVGIGMRIDDRVPGDMATHANDAPNPDPSVVSSSHTRIMMIMGINHRDNHMDGGTTMMITIIMMGGHGDDSVVGD +>1gya_MODEL_1_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFDEDPAEDAAQFKFKADFDPDDDDDFFAKKFKAFVVPRHGAFMTGHPPDGDHPDLCWDADDSGMIIGHGDDPVSWGWMWIFTGGPVPDSPDIHTYGYYYDDD +>1gya_MODEL_2_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEAEAEDAFPFKWKADDDPDFDDDFFWKKWKAFVPPRGTQFITADPPDGDHDDLCWYADNGQMIIHGGAHQVSWTWMKMWTGGNPPDTDDIGTYGGDYDDD +>1gya_MODEL_3_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE 
ADHESION DOMAIN OF HUMAN CD2 +DDDDQEDPAEDEAFDKDKADDDPDFDDDWFFKKWKAFVPPTGTQAITADPPDGPDPDLCWYADPRGMTIHHGDDVVSWTWMKIFTGTNVPHRPDIGTYGYDYDDD +>1gya_MODEL_4_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDDDEDAFQFKDKDAFDPDFDDDFFAKWFKAFPPRRHGCFMGDDPPDGRDPDLCWYADPRQMIMRGGHHQVSWGKMKIFTGGPVPDRPGIHTYTYHYDDD +>1gya_MODEL_5_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFFEDEAEDAAFFKDKADDDPAFDDDFAWKWFKAFDPVRGTQFMTADPPDGHDPDQCWYADRRQMIIHGTDGQVSFGWMWIFGGGHVPDRPDIHTYGYHYDDD +>1gya_MODEL_6_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFFEDAAEDAAQFKGKFDDDPDADDQFFQKKFKAQVPPGHTQAIGDDPPDGDDPDLCWYADPRHMIIHHGHHPVSWTWMKMFTGGPVPDSPDIGTYTYDHDDD +>1gya_MODEL_7_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDQEDPAEDAAFFKDKADDDPDFDDDFFFKKFKAFVVVRHGDFMTQPPPDTDDDDLCWYADRRRMIMGGTHGQVSWTWMKMFTGGNVPDRPDITTYTYGYDDD +>1gya_MODEL_8_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDDAEDAAFAKDKADDDQDADDDFFFKKFKAFVPPGHTQAITDDPPDGHDPDLCWYADRGRMTIHHGDHVVSWGWMKIFTGGNVPDRPDIGTYGHYYDDD +>1gya_MODEL_9_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDEAEDAAFFKGKAADDPDADDVWFFKKFKAQVVVRGGLFMTDDPPDGDAPDLLWHADNRNMTIHGGDDPVSWTWMKIFTGTPVPDRPDIGTYGYYHDDD +>1gya_MODEL_10_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDAADPAEDAAFFKDKFDFDPDADDDWFQKWFKFFPPVTGGAFMGDHPPDGDHPDLCWYADPRQMTMHHGDHPVSFGWMWIFTGGPVPDRPDIHTYGYYYDDD +>1gya_MODEL_11_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDDAEDAAQDKDKFDFDPDADDDFFFKKFKAQVPPGHTQFITDDPPDGDHPDLCWYADPRRMIMHGGDDQVSWGWMKIFTGTPVPDRPDIGTYTYYYDDD +>1gya_MODEL_12_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 
+DDDDFEDDAEDAFQGKDKAADDPDADDDFFFKKFKAQVVVRHTQAMGDDPPDGHAPDQCWHADRRQMITHGGDDPVSFTWMKIFTGGPVPDRPDIGTYTYDYDDD +>1gya_MODEL_13_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDQEDEAEDAQPWKDKAAFDPDFDDDFAFKWFKFFDPVRGGQFMGDDPPDGDDDDLCWYADRRQMIIHGGQRPVSFGWMKIFGGGNVPDRPDIGTYGGYYDDD +>1gya_MODEL_14_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDEWEDAFQFKTKAQFDPDDDDDFFFKKFKAFVPVRHGQFIGDPPPDTDDDDLCWYADPNNMIMRGTHHVVSFGWMWIFGGGNVPDSPDIGTYGGDYDDD +>1gya_MODEL_15_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDEAEDAAFFWDKAAFDPDFDDPWFFKKFKAFDPVGGTQAIGDDPPDGDAPDQCWHADRGRMTIHGTGGQVSWGWMKMFTGTPVPDRPDIGTYGYHYDDD +>1gya_MODEL_16_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFADDAEDAAQDKDKFDDDPPFDDDFFFKKFKAQVPVRHTDAIGDDPPDGDDPDQQWYADRRRMIMGGTHDPVSWTWMKMFTGGNVPDRPDIHTYGGDYDDD +>1gya_MODEL_17_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFFADEAEDAAQFKDKDDDDPDFDDDQWFWKFKAQVVVRHGQFIGDDPPDGDDPDLCWYADNGQMTMHGGGHQVNFTWMKIFTGGNVPDGPDIHTYTYDYDDD +>1gya_MODEL_18_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDPAEDAAQGKDKADDDPDADDDFFFKKWKAFVPPGGTQAITDPPDDGRAPDQCWHADRRRMIMGGGDDPVSFGWMKIFTGGNVPDRPDIGTYGYYYDDD +>1igy_A STRUCTURE OF IMMUNOGLOBULIN +DDKAKPAQEAEEAWQAKDKIKMFDPWFQFQAKWKWWDAPPDDIHTAAGGQWHGDPPHDPQWTWHDGTGMIMIIGRTADLVRFTKMWMWRDRDPPIDIHPIHGYQYDDDWDFWDKDWDWFDPVVLVVWKTKIKMKTAQTPDPPKDKAKDLADGHRDDQKDKDWDDQDSPSRGIIIMIMHMDTSCVSVVHFKIKTWMDDPVDPDIDIDIGTDDPD +>1igy_B STRUCTURE OF IMMUNOGLOBULIN 
+DDWAKDDAEEDEQFAKDKMKTFDDDDQQLQWKKWKWWAADVGDIGTAWIAGNPRPDIDGDDVQPPQKDWDADSVRGIIMIMHGGHHQVVFTWMWMAIDVPGPHIHDTDTYGHDPDDFWAWDKDWFAPQCDPCPPQKDKTKIKTWFGPDDDKDKDKPNVPDDDDKAWDDWDDDPRMTITMMMHIGGRPCPQPPWMWIWMAGVVVGDTDIHTRDHDDPDPPPPDDDDPPDKDKDKAFFALCQLQVQVHWTWIKMKIWAAAPAWPQQDKWKDFPNDIDDFPPQWPWFDCPPNHTMTMGGGTDHSVSLVVWTWIWIFTDTDPDPDRDTGIDTHDDDDWFFWPKDKADFDPVPDPDFKGKIKIKTWFTDDQNKDKFKAFLHDGDDDWDKDGWDQDDVRGTMIMIIDMDGNVSQVVFSKMKMKMADVVDDSRIDIDIDTD +>1igy_C STRUCTURE OF IMMUNOGLOBULIN +DAKAKPAQEAEEAWQAKDKIKMFGPWFQFQAKWKWWDAPPDDIHTAAGRQWHGDPPHDPQWTWHDGTGMIMIIGRTAALVRFTKMWMWRDRDPPIDIHPIHGYQYDDDWDFWDKDWDWFDPVVLVVWKTKIKMKTAQTPDPPKDKAKDLADGHRDDQKDKDWDDQDSPSRGIIIMIMGMDTNCVSVVHFKMKTWMDDDVDPDIDIDIGGNPPD +>1igy_D STRUCTURE OF IMMUNOGLOBULIN +DDWAKDDAEEDEQFAKDKMKTFDDDDQQLQWKKFKWWAADVGDIGTAWIAGRPRPDIDGDDVQPPQKDWDADSVRGIIMIMHGGHHQVVFTWMWMAIDVPGPHIHDTDTYGHDPDDFDAWDKDWFAPQCDDCPPQKDKTKIKTWFGPDDDKDKDKPNVPADPDKAWDDWDDDPRMIITMMMHIGGRPCPVPPWMWIWMAGVVVGDTDIHTRDHPDDDDDQPQDDDDQDKDKDKAFFALVQLQDQVHWTWTKMKIWAAAPAWDQQDKWKDFANDIDDFPPQWPWFDCPSNHTMTMGGGTDHSVSLVVWTWIWIFTDTDSDPDRDIGIDTHDDDDWFFWPKDKAFFDPVPDPDFKGKIKIKTWFTDDQSKDKFKAFLHDGDDDWDKDGWDQDDVGGTMIMIIDMDGPVSQVVFSKMKMKMADVVDDSRTDIDIDTD +>1l2y_MODEL_1_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVNVDHGDD +>1l2y_MODEL_2_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNPDHGDD +>1l2y_MODEL_3_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNVVVVDHGPD +>1l2y_MODEL_4_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVNPDDGDD +>1l2y_MODEL_5_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNVDHGDD +>1l2y_MODEL_6_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNVDHGPD +>1l2y_MODEL_7_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAPVNVDHGPD +>1l2y_MODEL_8_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCVNPVNPDDGPD +>1l2y_MODEL_9_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNPVNVDDGDD 
+>1l2y_MODEL_10_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNPVNPDDGDD +>1l2y_MODEL_11_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVVPNCVVVDHGDD +>1l2y_MODEL_12_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNPDDGPD +>1l2y_MODEL_13_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVHPPNPDDGDD +>1l2y_MODEL_14_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCCVCPNVVNPDHGDD +>1l2y_MODEL_15_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNPDDGDD +>1l2y_MODEL_16_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQCVCVVCPNVVVVDHGDD +>1l2y_MODEL_17_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DVQVVCVVCPAPVNVDHGDD +>1l2y_MODEL_18_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAVVNPDHGDD +>1l2y_MODEL_19_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDHGDD +>1l2y_MODEL_20_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVVVDHGPD +>1l2y_MODEL_21_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCVNCVVVDHGDD +>1l2y_MODEL_22_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNPDHGPD +>1l2y_MODEL_23_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVVVNCVVVDHGPD +>1l2y_MODEL_24_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNVVNVDDGDD +>1l2y_MODEL_25_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DVQVVCVVCVNCVVVDHGDD +>1l2y_MODEL_26_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAVVNVDHGPD +>1l2y_MODEL_27_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNVVNVDHGPD +>1l2y_MODEL_28_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNPDHGDD +>1l2y_MODEL_29_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNPVNVDDGDD +>1l2y_MODEL_30_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPHPVNPDDGDD +>1l2y_MODEL_31_A NMR Structure of Trp-Cage 
Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDHGDD +>1l2y_MODEL_32_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNPDDGDD +>1l2y_MODEL_33_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVNPDHGPD +>1l2y_MODEL_34_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNPPNPDDGDD +>1l2y_MODEL_35_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAVVNPDHGPD +>1l2y_MODEL_36_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVVVDHGDD +>1l2y_MODEL_37_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDHGPD +>1l2y_MODEL_38_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDDGDD +>1o1z_A Crystal structure of glycerophosphodiester phosphodiesterase (GDPD) (TM1621) from Thermotoga maritima at 1.60 A resolution +DDDDAQEAAEQALVVVDPGVFLVRQLVLVVLPHLGYEWEWDAALVGFTWTDPDQADCPPQNDGGGRNRHHPVRVCVSVVNGIDGPVVSPVRDDLSGAYEYEYPDLSNVVVVVVVCPVGPRYEYEYCPVVSCQVRVPPHAYEYEDDPPSCPDVVSNLVVCVVRVHQEYEYELVLVVDVVSLVSLQVSVVVNRAYEYDDDDDVVSCVVCVNSHRYYHYSNSNVVVVVD +>2axd_MODEL_1_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPPCCVVVPPVNVLLVVLLVLLLQQLCVVVVVPDGDPVVSQVVNPPVVSVSSVVSNVVSVVVSVVVPVPDPDDDDD +>2axd_MODEL_2_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPDDCVNVPVVVVVLVVLLVLLLLLLCVVVVVDDGDNVVSPVVDDPVCVVSSVVSNVVSVVCSVVVHDPDDPPDDD +>2axd_MODEL_3_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DVDVPCVDDVVVVLLVVLLVLLLLLLVVVVVVPPRPNVVSVVPRDPVPVVSNVVSNVVSVVVSVCVVPDDPPDDDD +>2axd_MODEL_4_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPDDCVVDPPPVVLLVQLLVQLLLVLCVVVVVDDGPPVVLVVSQDPVPVVSSVVSNVVSVVCSVCVVVPDCPPPDD +>2axd_MODEL_5_S solution structure of the theta subunit of escherichia coli DNA polymerase III in 
complex with the epsilon subunit +DVDDDCVVPVVNCVLVVLLVLLLLLLVVVCPVPDGDNVVSLVVDDPVPSVSNVVSNVVNVVLVVVVVNPPPPDDPD +>2axd_MODEL_6_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DDDDPVCDDVVVCLLVVLLVLLLLLLCVVVVVPPGPVCVSVVSQDPVPNVSNVVSNVVSVCVSVLVPGNDPNDDDD +>2axd_MODEL_7_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DQDPVCCDDPVNCLLVVLLVVLLLVLCCVPVVDDRPNVCNVVSDDPVCPVSNVVSNVVSVVVSVCVVVPPDDDDDD +>2axd_MODEL_8_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPPVVLVDDVCNVVLVVLLVLLLLVLCVVVPVPDGDPVVSVVVDDPVVVVSSVVSNVVSPVCSVCVPPPDPDDDDD +>2axd_MODEL_9_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DDDDVVVPDCVPVLLVVLLVLLLQVLCVVVVVDDGDPVVLVVSDDPVCVVSSVVSNVVNVVCVVVPVPVDDDPPDD +>2axd_MODEL_10_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DQCDPCSPPPVVVLLVVLLVLLLLLLCVVVVVPDRDNVVSLVVRDPVPNVSNVVSNVVSNVCSVVVCPDDDDDDDD +>2axd_MODEL_11_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPPCCVVDPPVVVLLVVLLVLLLVVLVCVCVVPDDDPVVSLVVDDVVPNVSSVVSNVVSVVCSVVVVVDPDDDDDD +>2axd_MODEL_12_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPDPCVVPDCNVCLLVVLLVLLLLQLCVVVVVDDGDPPVSQVPRDPVPPVSSVVSNVVSPVVSVVVVNVDDDPDDD +>2d0f_A Crystal Structure of Thermoactinomyces vulgaris R-47 Alpha-Amylase 1 (TVAI) Mutant D356N complexed with P2, a pullulan model oligosaccharide 
+DALDFEFDQLFWDDDCFFVFWPCLAAAQQDKIKGKIKGFPHRADWKWKWKAWPVVRDIDIWTWAWDQAFVQNGMTIIMTIDGGGQTKMFIKMWTGRVNQIWIQALLGIGRDDDPALTAIDHHPFDAFVCLLQFAEEEDLQQFEFQQDQVLAQDFQNDDDPPATAHEDEAPDDLCDDPRHHSFRYGDQGALNGVQVCVLLVCPLQNGAEYEYQDQADDRTSRQLQAQDLPWGHSSRPTVVSLLVSLVSSCDPPSHPGHAYEYAQNQFWHFCCHLLNNPPCPDPAAHLLRDCPHPSVQQFAAPPGPPRGQADVPPPRTTGGAPPDPPDVSLQQCFDDCNHSLNVQCAPPHNHAEYEYDLLQQRYHNNGRHPDPRSLVSQLVRLCRNCVSPVRRAYEYEDQEQCLVQAVNSRGHLEYAPNQQAQQLLLQDQLCAGQQLHHHHDFLVVSVSSNSVSCSSHHQSHQSNHEHENHEQATFQSCVSNVNDVLLLQLSLLDLLQARHHRYYHPPSQLSDGAHGPPRRSHRDDVVSSHCVRVSSVSSSLSSVVSSVDPQSSRFDKAWFDRDRVQRKTWIWGDHPPKIKIKIFGSAQAKDKDWTQSVSVVDDAQDKWAFSVVRDMFGHHPSTTIGIAGHSHMTITMD +>3o5r_A Complex of Fk506 with the Fk1 domain mutant A19T of FKBP51 +DVLVCQVVPFDQQDPVPPSQKTKDWDFAADDQDAADAFKKWWKWKWKDWPVGDTDDTPVVVVGTDIDGAPPPPAFQNVNRVSRRHHAFIKMKMWGFCNRHVAQVFDPPRGHGRITMIMIMHTHDIDHD +>5eil_A Computational design of a high-affinity metalloprotein homotrimer containing a metal chelating non-canonical amino acid +DDPLQVQLCCLLVQNPVSNVVSVVVPHDLCRADQQQDGSLLSNLQNLRLVSNVVSVVVPRQQCRQGNQQDGSLLNNLLVLNQNSNVVSLVSPHDQCRQTNQGDGSLLNNLLNLNQNNNVVSVVSPRDQCRATNVRDGSLRSNVVNVNVVSVVVSVVD +>5eil_B Computational design of a high-affinity metalloprotein homotrimer containing a metal chelating non-canonical amino acid +DDPLQVVLLVLLVQVPVVNVVSVVVPHDLCRADPQQDGSLLSNLQNLNLVSNVVSVVVPHQQCRQGNQQDGSLLNNLLNQNLNSNVVSVVSPHDQCRQGNQGDGSLLNNLLNLNQSVNVVSVVSPHAQCRATNVRDGSLNNNVVNVNVVNNVVSVPHD +>5eil_C Computational design of a high-affinity metalloprotein homotrimer containing a metal chelating non-canonical amino acid +DDPLQVQLLVLLVLPPVSNVVSVVVVHDLCRADPQQDGSLLSNLQNLNLVSNVVSVVVPRDQCGQGNQQDGSLLNNLLVQNQNSNVVSVVSPRDQCRQTNQGDGSLLNNLLNLNVVNNVVSVVSPHQQPRATNVRDGSLNSNVVNVNVVSNVVSVPVD +>5h73_A Crystal structure of human DHODH with 18F 
+DLLVCLQPPDLVVCVVVDFVVVSLVVVQVCLLVLVAPQDDDDQDVQQWDDAQNAIFSHQEEADEDSAAQLRRQVSRVSLPGREYEYHQAWAAADQFDDPPFKDDQVVQLKIFGQRGGHHNHLVRSLVSLVVCQVVQVVCRVVRYFYEYEYDYDQPDPDGLVRLLVCCQRCLLSGQEYEDEQAFQNRPPSVVCLAQVNVLVSVVSNVVSLVPDDVNSRHQYEYEEALPDDPVSLLSNLVSCVVRVRQAYEFHDFDQDQDPDRDGPCSPPGGGMAADVCQVSRLVVLLVNCVSNVLPHAYEGEGHDQALVSQLSSQLSAHPHYYYDSNCSNRNSVSSSRNSVRVVVVCVVVVHSHSVRNRNNVVD +>5ugo_A DNA polymerase beta nick complex with imidodiphosphate +DLPVLLLVQLQVVLLLCCQPVVNNVSNVLSNVLSVQVVPPPDNDDALVVSCVTDSRHDVSRVQRRVCNVPVDGPVVVVCCVDQLSVQRSVLCLQFPDGNVNSNVCVVVVQRDLVSCVVPLVPDDPLRNVSSVQVPLLPDKAFVVVVVVVVVVLQVQVCVVPVQKDKDWAEVVQLPDRIGSATEIEIEGQCFAPVHNNDPCSVVSSVVSCVVVQFFDFFSDDDRFKTWGWGFDDDDPPDDGRRIGIYMYGYAHVQQQLLQRVVRNADPVLVVVLQVLQVVLQWGGDSGAIFGADPVRDTDDGDDDDHNVVSCVSSVHGDDRSNVRPD +>5zng_A The crystal complex of immune receptor RGA5A_S of Pia from rice (Oryzae sativa) with rice blast (Magnaporthe oryzae) effector protein AVR1-CO39 +DVQAFFKKKWKKFAPQQDPVSLVVLQVLLCPFPFWDHWDQDDPNSGMITTIGGPGDVVVSQVRRCVVRVDMDTDDMDGD +>5zng_C The crystal complex of immune receptor RGA5A_S of Pia from rice (Oryzae sativa) with rice blast (Magnaporthe oryzae) effector protein AVR1-CO39 +DPAFKKKFKDDPHDTPDIDGDHAQDWDDDPNDIWGQHNQRAIPPQQDPVRIGMGMHTDPVHD +>7gsa_A PanDDA Analysis group deposition -- Crystal structure of PTP1B in complex with FMOPL000260a +DQVVVVLVVCVVVVCLVVVVVVLVVPADDADQVLCPDPVNPQQAPDPPFAFHPVFAQFQPDPPRRDHRWGWQQDVVLRWIAIFHAAGDPVCLLVVVSSCVVLQFQEEEEEEDCDAPRDGLHDPSADPDQPDWDADPVQQKIKGWDDWDDDDQWIKTWMWIARNVVRDIDIHIYIYGYPAHSDFGDPACVVVLVSLVVCVVVCRGPPVGGHHYYYYRSRAAVVLLSVLLVSLLSVLVPDPHNSPRDSSVSSSSSSVITPCHHPDSRSVSSSSNSNVVVCPVVPPD diff --git a/tests/structure/data/ids.txt b/tests/structure/data/ids.txt index ec9fcc5ab..e905de2f8 100644 --- a/tests/structure/data/ids.txt +++ b/tests/structure/data/ids.txt @@ -16,4 +16,4 @@ 5eil 4p5j 1crr -7gsa +7gsa \ No newline at end of file diff --git a/tests/structure/test_i3d.py b/tests/structure/test_i3d.py new file mode 100644 index 000000000..e08d94544 --- /dev/null +++ 
b/tests/structure/test_i3d.py @@ -0,0 +1,110 @@ +import re +from pathlib import Path +import numpy as np +import pytest +import biotite.sequence.io.fasta as fasta +import biotite.structure as struc +import biotite.structure.alphabet as strucalph +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +def _get_ref_3di_sequence(pdb_id, chain_id): + """ + Get the reference 3di sequence for the first model of the structure with the given + PDB ID and chain ID. + """ + ref_3di_file = fasta.FastaFile.read( + Path(data_dir("structure")) / "alphabet" / "i3d.fasta" + ) + for header, seq_string in ref_3di_file.items(): + # The first model of a structure is also the first sequence to appear + # and thus to be matched + if re.match(rf"^{pdb_id}(_MODEL_\d+)?_{chain_id}", header): + ref_3di_sequence = strucalph.I3DSequence(seq_string) + break + else: + raise ValueError( + f"Reference 3Di sequence not found for {pdb_id} chain {chain_id}" + ) + return ref_3di_sequence + + +@pytest.mark.parametrize( + "path", Path(data_dir("structure")).glob("*.bcif"), ids=lambda path: path.stem +) +def test_to_3di(path): + """ + Check if the 3di sequence of a chain is correctly generated, by comparing the result + to a reference sequence generated with *foldseek*. 
+ """ + if ( + path.stem + in [ + "1dix" # `get_chain_starts()` does not work properly here with `use_author_fields=True` + ] + ): + pytest.skip("Miscellaneous issues") + + pdbx_file = pdbx.BinaryCIFFile.read(path) + if np.any( + pdbx_file.block["atom_site"]["label_alt_id"].mask.array + == pdbx.MaskValue.PRESENT + ): + # There is some inconsistency in how foldseek and Biotite handle altloc IDs + # -> skip these cases for the sake of simplicity + pytest.skip("Structure contains altlocs") + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + if len(atoms) == 0: + pytest.skip("Structure contains no peptide chains") + test_3di, chain_starts = strucalph.to_3di(atoms) + + ref_3di = [ + _get_ref_3di_sequence(path.stem, chain_id) + for chain_id in atoms.chain_id[chain_starts] + ] + + for test, ref, chain_id in zip(test_3di, ref_3di, atoms.chain_id[chain_starts]): + assert str(test) == str(ref), f"3Di sequence of chain {chain_id} does not match" + + +def test_missing_residues(): + """ + Like `test_to_3di()`, but in some residues backbone atoms are missing. + Expect that these and adjacent residues get the unknown symbol in the + 3Di sequence. 
+ """ + PDB_ID = "1aki" + N_DELETIONS = 5 + MAX_MISMATCH_PERCENTAGE = 0.1 + UKNOWN_SYMBOL = strucalph.I3DSequence.unknown_symbol + + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif") + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + + rng = np.random.default_rng(1) + del_backbone_residue_ids = rng.choice( + np.unique(atoms.res_id), N_DELETIONS, replace=False + ) + atoms = atoms[ + ~np.isin(atoms.res_id, del_backbone_residue_ids) + | ~np.isin(atoms.atom_name, ("N", "CA", "CB", "C")) + ] + test_sequences, _ = strucalph.to_3di(atoms) + + ref_sequence = _get_ref_3di_sequence(PDB_ID, atoms.chain_id[0]) + for res_id in del_backbone_residue_ids: + seq_index = res_id - atoms.res_id[0] + # Convert the 3Di symbol for the residue and adjacent ones to the unknown symbol + start_index = max(0, seq_index - 1) + end_index = min(len(ref_sequence), seq_index + 1) + ref_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL + + assert len(test_sequences) == 1 + # 3Di sequences are quite complex, i.e. 
removing backbone atoms at some position + # might alter the symbols in remote positions + # -> Allow for mismatches + n_mismatches = np.count_nonzero(test_sequences[0].code != ref_sequence.code) + assert n_mismatches / len(ref_sequence) <= MAX_MISMATCH_PERCENTAGE diff --git a/tests/test_doctest.py b/tests/test_doctest.py index 4c112451d..ff225d3b5 100644 --- a/tests/test_doctest.py +++ b/tests/test_doctest.py @@ -68,6 +68,7 @@ "biotite.structure.io.mol", ["biotite.structure", "biotite.structure.info"] ), pytest.param("biotite.structure.info", ["biotite.structure"]), + pytest.param("biotite.structure.alphabet", ["biotite.structure"]), pytest.param( "biotite.database.entrez", [], diff --git a/tests/test_repr.py b/tests/test_repr.py index f8bf319c4..7097ba072 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -20,6 +20,7 @@ ) from biotite.sequence.align import Alignment, SubstitutionMatrix from biotite.structure import Atom +from biotite.structure.alphabet import I3DSequence __author__ = "Maximilian Greil" @@ -32,6 +33,7 @@ ProteinSequence("BIQTITE"), Alphabet(["X", "Y", "Z"]), GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]), + I3DSequence("ACDE"), LetterAlphabet(["X", "Y", "Z"]), Location(98, 178), Feature("CDS", [Location(98, 178)], qual={"gene": "test1"}),