From 46d2c6187c99ba6a973d770ffcd3348c3be10efa Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 19 Oct 2024 11:26:38 +0200 Subject: [PATCH 1/9] Organize structure tests into subdirectories --- tests/structure/alphabet/__init__.py | 0 tests/structure/{ => alphabet}/test_i3d.py | 0 tests/structure/{ => alphabet}/test_pb.py | 0 tests/structure/io/__init__.py | 0 tests/structure/{ => io}/test_gro.py | 0 tests/structure/{ => io}/test_mol.py | 0 tests/structure/{ => io}/test_pdb.py | 0 tests/structure/{ => io}/test_pdbqt.py | 0 tests/structure/{ => io}/test_pdbx.py | 0 tests/structure/{ => io}/test_trajectory.py | 0 10 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/structure/alphabet/__init__.py rename tests/structure/{ => alphabet}/test_i3d.py (100%) rename tests/structure/{ => alphabet}/test_pb.py (100%) create mode 100644 tests/structure/io/__init__.py rename tests/structure/{ => io}/test_gro.py (100%) rename tests/structure/{ => io}/test_mol.py (100%) rename tests/structure/{ => io}/test_pdb.py (100%) rename tests/structure/{ => io}/test_pdbqt.py (100%) rename tests/structure/{ => io}/test_pdbx.py (100%) rename tests/structure/{ => io}/test_trajectory.py (100%) diff --git a/tests/structure/alphabet/__init__.py b/tests/structure/alphabet/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/structure/test_i3d.py b/tests/structure/alphabet/test_i3d.py similarity index 100% rename from tests/structure/test_i3d.py rename to tests/structure/alphabet/test_i3d.py diff --git a/tests/structure/test_pb.py b/tests/structure/alphabet/test_pb.py similarity index 100% rename from tests/structure/test_pb.py rename to tests/structure/alphabet/test_pb.py diff --git a/tests/structure/io/__init__.py b/tests/structure/io/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/structure/test_gro.py b/tests/structure/io/test_gro.py similarity index 100% rename from tests/structure/test_gro.py rename to tests/structure/io/test_gro.py diff --git a/tests/structure/test_mol.py b/tests/structure/io/test_mol.py similarity index 100% rename from tests/structure/test_mol.py rename to tests/structure/io/test_mol.py diff --git a/tests/structure/test_pdb.py b/tests/structure/io/test_pdb.py similarity index 100% rename from tests/structure/test_pdb.py rename to tests/structure/io/test_pdb.py diff --git a/tests/structure/test_pdbqt.py b/tests/structure/io/test_pdbqt.py similarity index 100% rename from tests/structure/test_pdbqt.py rename to tests/structure/io/test_pdbqt.py diff --git a/tests/structure/test_pdbx.py b/tests/structure/io/test_pdbx.py similarity index 100% rename from tests/structure/test_pdbx.py rename to tests/structure/io/test_pdbx.py diff --git a/tests/structure/test_trajectory.py b/tests/structure/io/test_trajectory.py similarity index 100% rename from tests/structure/test_trajectory.py rename to tests/structure/io/test_trajectory.py From 06647dc6ac6b0c715221788c291082a761f6e02b Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 19 Oct 2024 11:33:14 +0200 Subject: [PATCH 2/9] Add benchmarks for structural alphabet conversion --- benchmarks/structure/benchmark_alphabet.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 benchmarks/structure/benchmark_alphabet.py diff --git a/benchmarks/structure/benchmark_alphabet.py b/benchmarks/structure/benchmark_alphabet.py new file mode 100644 index 000000000..54a9080ad --- /dev/null +++ b/benchmarks/structure/benchmark_alphabet.py @@ -0,0 +1,22 @@ +from pathlib import Path +import pytest +import biotite.structure.alphabet as strucalph +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + +PDB_ID = "1aki" + + +@pytest.fixture +def atoms(): + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif") + return pdbx.get_structure(pdbx_file, model=1, include_bonds=True) + + +@pytest.mark.benchmark +@pytest.mark.parametrize("method", [strucalph.to_3di, strucalph.to_protein_blocks]) +def benchmark_structural_alphabet_methods(method, atoms): + """ + Convert a structure to the given structural alphabet. + """ + method(atoms) From 394cac2f59054385e4532cb68f278ecf99bb1092 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 19 Oct 2024 11:38:19 +0200 Subject: [PATCH 3/9] Integrate structural alphabets into the docs --- doc/conf.py | 1 + .../scripts/structure/alphabet/README.rst | 2 + .../scripts/structure/alphabet/msa.py | 87 ++++++++++ .../structure/protein/pb_alignment.license | 26 --- .../scripts/structure/protein/pb_alignment.py | 158 ------------------ doc/index.rst | 2 +- 6 files changed, 91 insertions(+), 185 deletions(-) create mode 100644 doc/examples/scripts/structure/alphabet/README.rst create mode 100644 doc/examples/scripts/structure/alphabet/msa.py delete mode 100644 doc/examples/scripts/structure/protein/pb_alignment.license delete mode 100644 doc/examples/scripts/structure/protein/pb_alignment.py diff --git a/doc/conf.py b/doc/conf.py index 7f19bc67c..d9d511858 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -183,6 +183,7 @@ "examples/scripts/structure/molecule", "examples/scripts/structure/contacts", "examples/scripts/structure/modeling", + "examples/scripts/structure/alphabet", "examples/scripts/structure/misc", ] ), diff --git a/doc/examples/scripts/structure/alphabet/README.rst b/doc/examples/scripts/structure/alphabet/README.rst new file mode 100644 index 000000000..fd475ca52 --- /dev/null +++ b/doc/examples/scripts/structure/alphabet/README.rst @@ -0,0 +1,2 @@ +Structural alphabets +-------------------- \ No newline at end of file diff --git a/doc/examples/scripts/structure/alphabet/msa.py b/doc/examples/scripts/structure/alphabet/msa.py new file mode 100644 index 000000000..67f36ad4f --- /dev/null +++ b/doc/examples/scripts/structure/alphabet/msa.py @@ -0,0 +1,87 @@ +""" +Multiple Structural alignment of orthologous proteins +===================================================== + +In this example we perform a structural alignment of multiple lysozyme +variants from different organisms. +A feasible approach to perfrom such a multiple structure alignment is the +usage of a structural alphabet: +At first the structure is translated into a sequence that represents +the structure. +Then the sequences can be aligned with the standard sequence alignment +techniques, using the substitution matrix of the structural alphabet. + +In this example, the structural alphabet we will use is called +*Protein Blocks* (PBs) :footcite:`Brevern2000, Barnoud2017`: +There are 16 different PBs, represented by the symbols ``a`` to ``p``. +Each one depicts a different set of the backbone dihedral angles of a +peptide 5-mer. + +.. footbibliography:: +""" + +# Code source: Patrick Kunzmann +# License: BSD 3 clause + +import matplotlib.pyplot as plt +import biotite.database.rcsb as rcsb +import biotite.sequence.align as align +import biotite.sequence.graphics as graphics +import biotite.structure as struc +import biotite.structure.alphabet as strucalph +import biotite.structure.io.pdbx as pdbx + +PDB_IDS = ["1REX", "1AKI", "1DKJ", "1GD6"] + +# Create a PB sequence from each structure +structural_sequences = [] +organisms = [] +for pdb_id in PDB_IDS: + file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif")) + # Take only the first model into account + array = pdbx.get_structure(file, model=1) + # We are only interested in the first protein chain + array = array[struc.filter_amino_acids(array)] + array = array[array.chain_id == array.chain_id[0]] + pb_sequences, _ = strucalph.to_protein_blocks(array) + structural_sequences.append(pb_sequences[0].remove_undefined()) + + try: + organism_name = file.block["entity_src_nat"][ + "pdbx_organism_scientific" + ].as_item() + except KeyError: + organism_name = file.block["entity_src_gen"][ + "pdbx_gene_src_scientific_name" + ].as_item() + generic, specific = organism_name.split(" ") + abbreviation = f"{generic[0]}. {specific}" + organisms.append(abbreviation) + + +# Perform a multiple sequence alignment of the PB sequences +matrix = align.SubstitutionMatrix.std_protein_blocks_matrix() +alignment, order, _, _ = align.align_multiple( + structural_sequences, matrix, gap_penalty=(-500, -100), terminal_penalty=False +) + +# Visualize the alignment +# Order alignment according to guide tree +alignment = alignment[:, order.tolist()] +labels = [organisms[i] for i in order] +fig = plt.figure(figsize=(8.0, 4.0)) +ax = fig.add_subplot(111) +# The color scheme was generated with the 'Gecos' software +graphics.plot_alignment_type_based( + ax, + alignment, + labels=labels, + symbols_per_line=45, + spacing=2, + show_numbers=True, + color_scheme="flower", +) +# Organism names in italic +ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"}) +fig.tight_layout() +plt.show() diff --git a/doc/examples/scripts/structure/protein/pb_alignment.license b/doc/examples/scripts/structure/protein/pb_alignment.license deleted file mode 100644 index bd70fb8a2..000000000 --- a/doc/examples/scripts/structure/protein/pb_alignment.license +++ /dev/null @@ -1,26 +0,0 @@ -The protein block substitution matrix, the protein block reference angles -and the RSMDA calculation were taken from the PBxplore software, -governed by the following license: - - -The MIT License (MIT) - -Copyright (c) 2013 Poulain, A. G. de Brevern - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/doc/examples/scripts/structure/protein/pb_alignment.py b/doc/examples/scripts/structure/protein/pb_alignment.py deleted file mode 100644 index 9a3396ecf..000000000 --- a/doc/examples/scripts/structure/protein/pb_alignment.py +++ /dev/null @@ -1,158 +0,0 @@ -""" -Structural alignment of orthologous proteins using 'Protein Blocks' -=================================================================== - -In this example we perform a structural alignment of multiple lysozyme -variants from different organisms. -A feasible approach to perfrom such a multiple structure alignment is the -usage of a structural alphabet: -At first the structure is translated into a sequence that represents -the structure. -Then the sequences can be aligned with the standard sequence alignment -techniques, using the substitution matrix of the structural alphabet. - -In this example, the structural alphabet we will use is called -*protein blocks* (PBs) :footcite:`Brevern2000, Barnoud2017`: -There are 16 different PBs, represented by the symbols ``a`` to ``p``. -Each one depicts a different set of the backbone dihedral angles of a -peptide 5-mer. -To assign a PB to an amino acid, the 5-mer centered on the respective -residue is taken, its backbone dihedral angles are calculated and the -PB with the least deviation to this set of angles is chosen. - -.. footbibliography:: -""" - -# Code source: Patrick Kunzmann -# License: BSD 3 clause - -from tempfile import gettempdir -import matplotlib.pyplot as plt -import numpy as np -import biotite.database.rcsb as rcsb -import biotite.sequence as seq -import biotite.sequence.align as align -import biotite.sequence.graphics as graphics -import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx - -# PB alphabet -pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") - -# PB substitution matrix, adapted from PBxplore -matrix_str = """ - a b c d e f g h i j k l m n o p -a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 -b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 -c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 -d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 -e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 -f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 -g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 -h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 -i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 -j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 -k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382 -l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316 -m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155 -n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146 -o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58 -p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609 -""" - -# PB reference angles, adapted from PBxplore -ref_angles = np.array([ - [ 41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68], - [108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90], - [-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63], - [141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16], - [133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40], - [116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51], - [ 0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98], - [119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88], - [130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01], - [114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82], - [117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16], - [139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74], - [-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19], - [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], - [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], - [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], -]) # fmt: skip - - -# Fetch animal lysoyzme structures -lyso_files = rcsb.fetch( - ["1REX", "1AKI", "1DKJ", "1GD6"], format="bcif", target_path=gettempdir() -) -organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"] - -# Create a PB sequence from each structure -pb_seqs = [] -for file_name in lyso_files: - file = pdbx.BinaryCIFFile.read(file_name) - # Take only the first model into account - array = pdbx.get_structure(file, model=1) - # Remove everything but the first protein chain - array = array[struc.filter_amino_acids(array)] - array = array[array.chain_id == array.chain_id[0]] - - # Calculate backbone dihedral angles, - # as the PBs are determined from them - phi, psi, omega = struc.dihedral_backbone(array) - # A PB requires the 8 phi/psi angles of 5 amino acids, - # centered on the amino acid to calculate the PB for - # Hence, the PBs are not defined for the two amino acids - # at each terminus - pb_angles = np.full((len(phi) - 4, 8), np.nan) - pb_angles[:, 0] = psi[:-4] - pb_angles[:, 1] = phi[1:-3] - pb_angles[:, 2] = psi[1:-3] - pb_angles[:, 3] = phi[2:-2] - pb_angles[:, 4] = psi[2:-2] - pb_angles[:, 5] = phi[3:-1] - pb_angles[:, 6] = psi[3:-1] - pb_angles[:, 7] = phi[4:] - pb_angles = np.rad2deg(pb_angles) - - # Angle RMSD of all reference angles with all actual angles - rmsda = np.sum( - ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2, - axis=-1, - ) - # Chose PB, where the RMSDA to the reference angle is lowest - # Due to the definition of Biotite symbol codes - # the index of the chosen PB is directly the symbol code - pb_seq_code = np.argmin(rmsda, axis=0) - # Put the array of symbol codes into actual sequence objects - pb_sequence = seq.GeneralSequence(pb_alphabet) - pb_sequence.code = pb_seq_code - pb_seqs.append(pb_sequence) - -# Perfrom a multiple sequence alignment of the PB sequences -matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str) -matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict) -alignment, order, _, _ = align.align_multiple( - pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False -) - -# Visualize the alignment -# Order alignment according to guide tree -alignment = alignment[:, order.tolist()] -labels = [organisms[i] for i in order] -fig = plt.figure(figsize=(8.0, 4.0)) -ax = fig.add_subplot(111) -# The color scheme was generated with the 'Gecos' software -graphics.plot_alignment_type_based( - ax, - alignment, - labels=labels, - symbols_per_line=45, - spacing=2, - show_numbers=True, - color_scheme="flower", -) -# Organism names in italic -ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"}) -fig.tight_layout() -plt.show() diff --git a/doc/index.rst b/doc/index.rst index 9a09fbc89..06bcc6404 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -128,7 +128,7 @@ Biotite documentation

Analyze sequence data

Work with sequences of any kind: from the usual nucleotide and protein - sequences to your own sequence types created from a custom alphabet. + sequences to structural alphabets like *3Di*. Use the rapid and modular alignment tools to identify homologous regions or to map reads. Eventually, visualize your results in different *Matplotlib* based From 7170fca03286af403d505a73cd2976f2efddcdad Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 19 Oct 2024 12:20:55 +0200 Subject: [PATCH 4/9] Rename 'unknown' to 'undefined' symbol --- src/biotite/sequence/align/matrix.py | 14 +++++++------- src/biotite/structure/alphabet/i3d.py | 2 +- src/biotite/structure/alphabet/pb.py | 19 ++++++++++++++++++- tests/structure/alphabet/test_i3d.py | 4 ++-- tests/structure/alphabet/test_pb.py | 4 ++-- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 9e4324ac9..17dfc8a66 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -423,7 +423,7 @@ def std_3di_matrix(): @staticmethod @functools.cache - def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200): + def std_protein_blocks_matrix(undefined_match=200, undefined_mismatch=-200): """ Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences. @@ -431,7 +431,7 @@ def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200): Parameters ---------- - unknown_match, unknown_mismatch : int, optional + undefined_match, undefined_mismatch : int, optional The match and mismatch score for undefined symbols. The default values were chosen arbitrarily, but are in the order of magnitude of the other score values. @@ -450,15 +450,15 @@ def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200): from biotite.structure.alphabet.pb import ProteinBlocksSequence alphabet = ProteinBlocksSequence.alphabet - unknown_symbol = ProteinBlocksSequence.unknown_symbol + undefined_symbol = ProteinBlocksSequence.undefined_symbol matrix_dict = SubstitutionMatrix.dict_from_db("PB") # Add match/mismatch scores for undefined symbols residues for symbol in alphabet: - if symbol == unknown_symbol: + if symbol == undefined_symbol: continue - matrix_dict[symbol, unknown_symbol] = unkown_mismatch - matrix_dict[unknown_symbol, symbol] = unkown_mismatch - matrix_dict[unknown_symbol, unknown_symbol] = unknown_match + matrix_dict[symbol, undefined_symbol] = undefined_mismatch + matrix_dict[undefined_symbol, symbol] = undefined_mismatch + matrix_dict[undefined_symbol, undefined_symbol] = undefined_match return SubstitutionMatrix( alphabet, alphabet, diff --git a/src/biotite/structure/alphabet/i3d.py b/src/biotite/structure/alphabet/i3d.py index 1f295d1e1..5ecaacb46 100644 --- a/src/biotite/structure/alphabet/i3d.py +++ b/src/biotite/structure/alphabet/i3d.py @@ -65,7 +65,7 @@ class I3DSequence(Sequence): "Y", ] ) - unknown_symbol = "D" + undefined_symbol = "D" def __init__(self, sequence=""): if isinstance(sequence, str): diff --git a/src/biotite/structure/alphabet/pb.py b/src/biotite/structure/alphabet/pb.py index e2c527cca..43ed46a6a 100644 --- a/src/biotite/structure/alphabet/pb.py +++ b/src/biotite/structure/alphabet/pb.py @@ -64,11 +64,28 @@ class ProteinBlocksSequence(Sequence): """ alphabet = LetterAlphabet("abcdefghijklmnopZ") - unknown_symbol = "Z" + undefined_symbol = "Z" def get_alphabet(self): return ProteinBlocksSequence.alphabet + def remove_undefined(self): + """ + Remove undefined symbols from the sequence. + + Returns + ------- + filtered_sequence : ProteinBlocksSequence + The sequence without undefined symbols. + """ + undefined_code = ProteinBlocksSequence.alphabet.encode( + ProteinBlocksSequence.undefined_symbol + ) + filtered_code = self.code[self.code != undefined_code] + filtered_sequence = ProteinBlocksSequence() + filtered_sequence.code = filtered_code + return filtered_sequence + def to_protein_blocks(atoms): """ diff --git a/tests/structure/alphabet/test_i3d.py b/tests/structure/alphabet/test_i3d.py index 402554608..fc1419e2c 100644 --- a/tests/structure/alphabet/test_i3d.py +++ b/tests/structure/alphabet/test_i3d.py @@ -78,7 +78,7 @@ def test_missing_residues(): PDB_ID = "1aki" N_DELETIONS = 5 MAX_MISMATCH_PERCENTAGE = 0.1 - UKNOWN_SYMBOL = strucalph.I3DSequence.unknown_symbol + UNDEFINED_SYMBOL = strucalph.I3DSequence.undefined_symbol pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif") atoms = pdbx.get_structure(pdbx_file, model=1) @@ -102,7 +102,7 @@ def test_missing_residues(): # Convert the PDB symbol for residue and adjacent ones to 'Z' start_index = max(0, seq_index - 1) end_index = min(len(ref_sequence), seq_index + 1) - ref_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL + ref_sequence[start_index : end_index + 1] = UNDEFINED_SYMBOL assert len(test_sequences) == 1 # 3Di sequences are quite complex, i.e. removing backbone atoms at some position diff --git a/tests/structure/alphabet/test_pb.py b/tests/structure/alphabet/test_pb.py index 5ecb57348..cfc13788c 100644 --- a/tests/structure/alphabet/test_pb.py +++ b/tests/structure/alphabet/test_pb.py @@ -50,7 +50,7 @@ def test_missing_residues(reference_chain, reference_sequence): """ N_DELETIONS = 5 # The 'Z' symbol - UKNOWN_SYMBOL = strucalph.ProteinBlocksSequence.unknown_symbol + UNDEFINED_SYMBOL = strucalph.ProteinBlocksSequence.undefined_symbol # Randomly delete some backbone atoms rng = np.random.default_rng(1) @@ -68,7 +68,7 @@ def test_missing_residues(reference_chain, reference_sequence): # Convert the PB symbol for residue and adjacent ones to 'Z' start_index = max(0, seq_index - 2) end_index = min(len(reference_sequence), seq_index + 2) - reference_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL + reference_sequence[start_index : end_index + 1] = UNDEFINED_SYMBOL test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain) From ea2816f021547cef222eb55ae6a5270b89c96b0b Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 19 Oct 2024 13:26:18 +0200 Subject: [PATCH 5/9] Use lower case letters for 3Di to distinguish from amino acid sequences --- .../sequence/align/matrix_data/3Di.mat | 43 +++++++++---------- src/biotite/structure/alphabet/i3d.py | 35 +++------------ src/biotite/structure/alphabet/pb.py | 19 ++++++-- tests/structure/alphabet/test_pb.py | 4 +- 4 files changed, 43 insertions(+), 58 deletions(-) diff --git a/src/biotite/sequence/align/matrix_data/3Di.mat b/src/biotite/sequence/align/matrix_data/3Di.mat index 93fe4e97b..932d675f4 100644 --- a/src/biotite/sequence/align/matrix_data/3Di.mat +++ b/src/biotite/sequence/align/matrix_data/3Di.mat @@ -1,25 +1,24 @@ # 3Di bit/2 # Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001 # Lambda (precomputed optional): 0.351568 - A C D E F G H I K L M N P Q R S T V W Y X -A 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2 0 -C -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9 0 -D 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2 0 -E 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3 0 -F 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4 0 -G -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2 0 -H -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3 0 -I -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8 0 -K -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8 0 -L -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9 0 -M -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9 0 -N -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5 0 -P -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5 0 -Q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5 0 -R -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3 0 -S -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9 0 -T -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5 0 -V -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11 0 -W 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6 0 -Y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9 0 -X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ No newline at end of file + a c d e f g h i k l m n p q r s t v w y +a 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2 +c -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9 +d 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2 +e 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3 +f 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4 +g -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2 +h -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3 +i -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8 +k -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8 +l -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9 +m -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9 +n -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5 +p -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5 +q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5 +r -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3 +s -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9 +t -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5 +v -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11 +w 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6 +y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9 \ No newline at end of file diff --git a/src/biotite/structure/alphabet/i3d.py b/src/biotite/structure/alphabet/i3d.py index 5ecaacb46..b9bc76787 100644 --- a/src/biotite/structure/alphabet/i3d.py +++ b/src/biotite/structure/alphabet/i3d.py @@ -41,40 +41,15 @@ class I3DSequence(Sequence): """ - alphabet = LetterAlphabet( - [ - "A", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "K", - "L", - "M", - "N", - "P", - "Q", - "R", - "S", - "T", - "V", - "W", - "Y", - ] - ) - undefined_symbol = "D" + alphabet = LetterAlphabet("acdefghiklmnpqrstvwy") + undefined_symbol = "d" def __init__(self, sequence=""): if isinstance(sequence, str): - sequence = sequence.upper() + sequence = sequence.lower() else: sequence = [symbol.upper() for symbol in sequence] - seq_code = I3DSequence.alphabet.encode_multiple(sequence) - super().__init__() - self.code = seq_code + super().__init__(sequence) def get_alphabet(self): return I3DSequence.alphabet @@ -111,7 +86,7 @@ def to_3di(atoms): >>> sequences, chain_starts = to_3di(atom_array) >>> print(sequences[0]) - DQQVVCVVCPNVVNVDHGDD + dqqvvcvvcpnvvnvdhgdd """ sequences = [] chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) diff --git a/src/biotite/structure/alphabet/pb.py b/src/biotite/structure/alphabet/pb.py index 43ed46a6a..5255204a3 100644 --- a/src/biotite/structure/alphabet/pb.py +++ b/src/biotite/structure/alphabet/pb.py @@ -63,8 +63,15 @@ class ProteinBlocksSequence(Sequence): """ - alphabet = LetterAlphabet("abcdefghijklmnopZ") - undefined_symbol = "Z" + alphabet = LetterAlphabet("abcdefghijklmnopz") + undefined_symbol = "z" + + def __init__(self, sequence=""): + if isinstance(sequence, str): + sequence = sequence.lower() + else: + sequence = [symbol.upper() for symbol in sequence] + super().__init__(sequence) def get_alphabet(self): return ProteinBlocksSequence.alphabet @@ -116,7 +123,7 @@ def to_protein_blocks(atoms): >>> sequences, chain_starts = to_protein_blocks(atom_array) >>> print(sequences[0]) - ZZmmmmmnopjmnopacdZZ + zzmmmmmnopjmnopacdzz """ sequences = [] chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) @@ -129,6 +136,10 @@ def to_protein_blocks(atoms): def _to_protein_blocks(chain): + undefined_code = ProteinBlocksSequence.alphabet.encode( + ProteinBlocksSequence.undefined_symbol + ) + phi, psi, _ = dihedral_backbone(chain) pb_angles = np.full((len(phi), 8), np.nan) @@ -148,7 +159,7 @@ def _to_protein_blocks(chain): axis=-1, ) # Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol to unknown - pb_seq_code = np.full(len(pb_angles), ProteinBlocksSequence.alphabet.encode("Z")) + pb_seq_code = np.full(len(pb_angles), undefined_code, dtype=np.uint8) pb_available_mask = ~np.isnan(rmsda).any(axis=0) # Chose PB, where the RMSDA to the reference angle is lowest # Due to the definition of Biotite symbol codes diff --git a/tests/structure/alphabet/test_pb.py b/tests/structure/alphabet/test_pb.py index cfc13788c..521317cda 100644 --- a/tests/structure/alphabet/test_pb.py +++ b/tests/structure/alphabet/test_pb.py @@ -45,7 +45,7 @@ def test_to_protein_blocks(reference_chain, reference_sequence): def test_missing_residues(reference_chain, reference_sequence): """ Like, `test_to_protein_blocks()`, but in some residues backbone atoms are missing. - Expect that these and adjacent residues get the unknown symbol 'Z' in the + Expect that these and adjacent residues get the unknown symbol 'z' in the PB sequence. """ N_DELETIONS = 5 @@ -65,7 +65,7 @@ def test_missing_residues(reference_chain, reference_sequence): # Apply the same deletions to the reference sequence for res_id in del_backbone_residue_ids: seq_index = res_id - reference_chain.res_id[0] - # Convert the PB symbol for residue and adjacent ones to 'Z' + # Convert the PB symbol for residue and adjacent ones to 'z' start_index = max(0, seq_index - 2) end_index = min(len(reference_sequence), seq_index + 2) reference_sequence[start_index : end_index + 1] = UNDEFINED_SYMBOL From 29bad07d9d5270c8539decf20666f85fc363438b Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 19 Oct 2024 13:36:15 +0200 Subject: [PATCH 6/9] Add color scheme for 3Di, generated with 'gecos' --- .../scripts/sequence/misc/color_schemes.py | 26 ++++++--- .../graphics/color_schemes/3di_flower.json | 48 ++++++++++++++++ .../graphics/color_schemes/pb_flower.json | 3 +- src/biotite/sequence/graphics/colorschemes.py | 55 +++++++++++++++---- 4 files changed, 112 insertions(+), 20 deletions(-) create mode 100644 src/biotite/sequence/graphics/color_schemes/3di_flower.json diff --git a/doc/examples/scripts/sequence/misc/color_schemes.py b/doc/examples/scripts/sequence/misc/color_schemes.py index b84542932..11bc7ee93 100644 --- a/doc/examples/scripts/sequence/misc/color_schemes.py +++ b/doc/examples/scripts/sequence/misc/color_schemes.py @@ -2,7 +2,7 @@ Biotite color schemes ===================== -This script displays the available color schemes. +This script displays the available color schemes for the different built-in alphabets. """ # Code source: Patrick Kunzmann @@ -14,6 +14,7 @@ from matplotlib.patches import Rectangle import biotite.sequence as seq import biotite.sequence.graphics as graphics +import biotite.structure.alphabet as strucalph def plot_colors(ax, alphabet): @@ -48,25 +49,34 @@ def plot_colors(ax, alphabet): nuc_alphabet = seq.NucleotideSequence.alphabet_amb prot_alphabet = seq.ProteinSequence.alphabet -pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") +i3d_alphabet = strucalph.I3DSequence.alphabet +pb_alphabet = strucalph.ProteinBlocksSequence.alphabet -figure = plt.figure(figsize=(8.0, 5.0)) +figure = plt.figure(figsize=(8.0, 7.0)) gs = GridSpec( - 3, + 4, 1, height_ratios=[ len(graphics.list_color_scheme_names(alphabet)) - for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet) + for alphabet in (nuc_alphabet, prot_alphabet, i3d_alphabet, pb_alphabet) ], ) + ax = figure.add_subplot(gs[0, 0]) -ax.set_title("Nucleotide color schemes") +ax.set_title("Nucleotide") plot_colors(ax, nuc_alphabet) + ax = figure.add_subplot(gs[1, 0]) -ax.set_title("Protein color schemes") +ax.set_title("Protein") plot_colors(ax, prot_alphabet) + ax = figure.add_subplot(gs[2, 0]) -ax.set_title("Protein block color schemes") +ax.set_title("3Di") +plot_colors(ax, i3d_alphabet) + +ax = figure.add_subplot(gs[3, 0]) +ax.set_title("Protein Blocks") plot_colors(ax, pb_alphabet) + plt.tight_layout() plt.show() diff --git a/src/biotite/sequence/graphics/color_schemes/3di_flower.json b/src/biotite/sequence/graphics/color_schemes/3di_flower.json new file mode 100644 index 000000000..5e24f8fc8 --- /dev/null +++ b/src/biotite/sequence/graphics/color_schemes/3di_flower.json @@ -0,0 +1,48 @@ +{ + "comment": "Generated with 'gecos --matrix 3Di --name flower --lmin 60 --lmax 80 -f 3di_flower.json'", + "name": "flower", + "alphabet": [ + "a", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "w", + "y" + ], + "colors": { + "a": "#a189a1", + "c": "#ff5806", + "d": "#ab9a93", + "e": "#e754d5", + "f": "#8191b5", + "g": "#cbc7ae", + "h": "#dac1bc", + "i": "#5eaf6e", + "k": "#04c1fd", + "l": "#ff544b", + "m": "#07e560", + "n": "#f28d05", + "p": "#b68767", + "q": "#bc8277", + "r": "#eebe86", + "s": "#ffa103", + "t": "#a4c49a", + "v": "#ed6903", + "w": "#3a97d8", + "y": "#f7adfd" + } +} \ No newline at end of file diff --git a/src/biotite/sequence/graphics/color_schemes/pb_flower.json b/src/biotite/sequence/graphics/color_schemes/pb_flower.json index 76b367df8..63e698648 100644 --- a/src/biotite/sequence/graphics/color_schemes/pb_flower.json +++ b/src/biotite/sequence/graphics/color_schemes/pb_flower.json @@ -16,7 +16,8 @@ "m", "n", "o", - "p" + "p", + "z" ], "colors": { "a": "#31b5fc", diff --git a/src/biotite/sequence/graphics/colorschemes.py b/src/biotite/sequence/graphics/colorschemes.py index d38879c91..fa58ce803 100644 --- a/src/biotite/sequence/graphics/colorschemes.py +++ b/src/biotite/sequence/graphics/colorschemes.py @@ -94,27 +94,32 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"): >>> print(color_scheme) ['#3737f5', '#37f537', '#f5f537', '#f53737'] """ + # Try exact alphabet match first + for scheme in _color_schemes: + if scheme["name"] == name and scheme["alphabet"] == alphabet: + return _fit_color_scheme(alphabet, scheme, default) + # If no exact match was found, try to find a scheme for an alphabet + # that extends the given alphabet for scheme in _color_schemes: if scheme["name"] == name and scheme["alphabet"].extends(alphabet): - colors = scheme["colors"] - # Replace None values with default color - colors = [color if color is not None else default for color in colors] - # Only return colors that are in scope of this alphabet - # and not the extended alphabet - return colors[: len(alphabet)] + return _fit_color_scheme(alphabet, scheme, default) + raise ValueError(f"Unkown scheme '{name}' for given alphabet") -def list_color_scheme_names(alphabet): +def list_color_scheme_names(alphabet, strict=False): """ Get a list of available color scheme names for a given alphabet. Parameters ---------- alphabet : Alphabet - The alphbet to get the color scheme names for. - The alphabet of the scheme must equal or extend this parameter, - to be included in the list. + The alphabet to get the color scheme names for. + strict : bool, optional + If set to true, only schemes with an exact match to the given + alphabet are included in the list. + If set to false, schemes with an alphabet that extends the given + alphabet are also included. Returns ------- @@ -123,7 +128,9 @@ def list_color_scheme_names(alphabet): """ scheme_list = [] for scheme in _color_schemes: - if scheme["alphabet"].extends(alphabet): + if strict and scheme["alphabet"] == alphabet: + scheme_list.append(scheme["name"]) + if not strict and scheme["alphabet"].extends(alphabet): scheme_list.append(scheme["name"]) return scheme_list @@ -135,3 +142,29 @@ def list_color_scheme_names(alphabet): for file_name in glob.glob(_scheme_dir + os.sep + "*.json"): scheme = load_color_scheme(file_name) _color_schemes.append(scheme) + + +def _fit_color_scheme(alphabet, color_scheme, default_color): + """ + Fit a color scheme to the given alphabet. + + Parameters + ---------- + alphabet : Alphabet + The alphabet to get the color scheme for. + color_scheme : dict + The color scheme. + default_color : str or tuple + The default color. + + Returns + ------- + scheme : list of str + The colors from the scheme. + """ + colors = color_scheme["colors"] + # Replace None values with default color + colors = [color if color is not None else default_color for color in colors] + # Only return colors that are in scope of this alphabet + # and not the extended alphabet + return colors[: len(alphabet)] From 978c42476a9b8a0b8737f437f057684ca8891dd6 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Mon, 21 Oct 2024 10:43:23 +0200 Subject: [PATCH 7/9] Add tutorial for structural alphabets --- doc/tutorial/sequence/align_multiple.rst | 5 +- doc/tutorial/structure/alphabet.rst | 196 +++++++++++++++++++++++ doc/tutorial/structure/index.rst | 1 + 3 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 doc/tutorial/structure/alphabet.rst diff --git a/doc/tutorial/sequence/align_multiple.rst b/doc/tutorial/sequence/align_multiple.rst index 5eb30b457..85e66a524 100644 --- a/doc/tutorial/sequence/align_multiple.rst +++ b/doc/tutorial/sequence/align_multiple.rst @@ -2,13 +2,16 @@ Guide trees and multiple sequence alignments ============================================ + +.. currentmodule:: biotite.sequence.align + In :doc:`a previous chapter ` we have learned how to align *two* sequences with each other. However, for many use cases we require an alignment of more than two sequences. For example, one such use case is the analysis of homologous regions within a protein family. -Although the *dynamic programming* algorithm behind :doc:`align_optimal` can in +Although the *dynamic programming* algorithm behind :func:`align_optimal()` can in theory be extended to any number of sequences, the computation time scales linearly with the length *each* aligned sequence. Thus, the method becomes infeasible for already a few sequences. diff --git a/doc/tutorial/structure/alphabet.rst b/doc/tutorial/structure/alphabet.rst new file mode 100644 index 000000000..adc3ba8f9 --- /dev/null +++ b/doc/tutorial/structure/alphabet.rst @@ -0,0 +1,196 @@ +.. include:: /tutorial/preamble.rst + +Structural alphabets +==================== + +.. currentmodule:: biotite.structure.alphabet + +In the previous chapters we have seen the multitude of methods, than can be applied +to sequence and structure data. +*Structural alphabets* combine the best of both worlds: +A structural alphabet is a representation of protein or nucleic acid structures, +where each residue is encoded into a single character, based on the local geometry +or contact partners of that residue. +This way the high performance sequence-based methods, for example alignment searches, +can be applied to structure data. + +The :mod:`biotite.structure.alphabet` subpackage provides multiple structural alphabets, +but for the scope of this tutorial we will focus on the *3Di* alphabet, popularized by +the `Foldseek `_ software for fast protein +structure comparison. +Just keep in mind that in the following examples the underlying structural alphabet +can be substituted with minimal modifications. + +Converting structures to sequences +---------------------------------- +We start by getting the structure of our protein of interest. +In this case we will use ferredoxin from *E. coli* (PDB ID: ``2ZVS``). +After filtering out all non-amino acid residues, we create the *3Di* sequence for each +chain with :func:`to_3di()`. + +.. jupyter-execute:: + + import biotite.database.rcsb as rcsb + import biotite.structure as struc + import biotite.structure.alphabet as strucalph + import biotite.structure.io.pdbx as pdbx + + # You can actually directly read the downloaded PDBx file content + # without an intermediate file + pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("2ZVS", "bcif")) + ec_ferredoxin = pdbx.get_structure(pdbx_file, model=1) + # This structural alphabet expects a peptide chain + ec_ferredoxin = ec_ferredoxin[struc.filter_amino_acids(ec_ferredoxin)] + structural_sequences, chain_starts = strucalph.to_3di(ec_ferredoxin) + print(structural_sequences) + print(chain_starts) + +Note that :func:`to_3di()` returns not a single :class:`I3DSequence` sequence but a list +of sequences, one for each chain in the structure. +Accompanying the sequences, the function also returns the atom indices where each of the +chains starts. +As our structure contains only one chain, the desired 3Di sequence is the first and only +element in the list. + +.. jupyter-execute:: + + ec_ferredoxin_3di = structural_sequences[0] + print(ec_ferredoxin_3di) + +Each symbol in this rather cryptic sequence corresponds to one residue in the structure. +To get the corresponding residues as :class:`.AtomArray` objects we can use the +residue-level functionality of :mod:`biotite.structure`. +While the sequence is hardly human-readable, it true power lies in its ability to +be compared to *3Di* sequences from other proteins. + +Sequence alignments on structural alphabets +------------------------------------------- +As mentioned in the :doc:`sequence chapter <../sequence/encoding>`, the sequence +based methods in :mod:`biotite.sequence` generally do not care about the type of +sequence. +This means that we can use any method we have learned so far on structural sequences as +well. +For the scope of this tutorial we will merely use :func:`align_optimal()` to find +corresponding residues in two structures. +As the structure is generally much better conserved than its sequence, the alignment of +*3Di* sequences will even work on remote homologs with low amino acid sequence identity, +where a classical sequence alignment would fail. +To demonstrate this, we will compare the *E. coli* ferredoxin with the remotely similar +ferredoxin from the thermophilic archaeon *S. tokodaii*. + +.. jupyter-execute:: + + pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("1XER", "bcif")) + st_ferredoxin = pdbx.get_structure(pdbx_file, model=1) + st_ferredoxin = st_ferredoxin[struc.filter_amino_acids(st_ferredoxin)] + st_ferredoxin_3di = strucalph.to_3di(st_ferredoxin)[0][0] + +To align the two 3Di sequences, we merely need a :class:`.SubstitutionMatrix` that +matches the alphabet of the :class:`I3DSequence`. +Like for amino acid and nucleotide sequences, :mod:`biotite.sequence.align` provides +it out of the box with :func:`.SubstitutionMatrix.std_3di_matrix()`. + +.. jupyter-execute:: + + import matplotlib.pyplot as plt + import biotite.sequence.align as align + import biotite.sequence.graphics as graphics + + matrix = align.SubstitutionMatrix.std_3di_matrix() + alignment = align.align_optimal( + ec_ferredoxin_3di, + st_ferredoxin_3di, + matrix, + gap_penalty=(-10, -1), + terminal_penalty=False, + )[0] + + fig, ax = plt.subplots(figsize=(8.0, 2.0)) + graphics.plot_alignment_similarity_based( + ax, alignment, matrix=matrix, labels=["EC", "ST"], symbols_per_line=50 + ) + +If you prefer coloring the symbols in the alignment by their type, you are lucky: +:mod:`biotite.sequence.graphics` provides a +:doc:`color scheme <../../examples/gallery/sequence/misc/color_schemes>` for each of the +supported structural alphabets as well. + +.. jupyter-execute:: + + fig, ax = plt.subplots(figsize=(8.0, 2.0)) + graphics.plot_alignment_type_based( + ax, alignment, labels=["EC", "ST"], symbols_per_line=50 + ) + +Example: Superimposing structures +--------------------------------- +One typical use case of structural alphabets is superimposing structures of remote +homologs. +Here the challenge is finding the corresponding residues in the two structures, whose +squared distance the superimposition algorithm should minimize. +The solution is to use the alignment of the structural alphabet: +One simply inputs the ``CA`` atoms of the aligned residues. + +.. jupyter-execute:: + + def rmsd_from_alignment(fixed, mobile, alignment): + """ + A very simple function that extracts corresponding residues (the 'anchors') + from an alignment and uses them to run a superimposition. + Finally the RMSD of the superimposed structures plus the number of anchors is + returned. + """ + alignment_codes = align.get_codes(alignment) + anchor_mask = ( + # Anchors must be structurally similar + (matrix.score_matrix()[alignment_codes[0], alignment_codes[1]] > 0) + # Gaps are not anchors + & (alignment_codes != -1).all(axis=0) + ) + superimposition_anchors = alignment.trace[anchor_mask] + # Each anchor corresponds to a residue + # Use the CA atoms as representative for each residue + fixed_ca = fixed[fixed.atom_name == "CA"] + mobile_ca = mobile[mobile.atom_name == "CA"] + fixed_anchors = fixed_ca[superimposition_anchors[:, 0]] + mobile_anchors = mobile_ca[superimposition_anchors[:, 1]] + + mobile_anchors, transform = struc.superimpose( + fixed_anchors, + mobile_anchors, + ) + return struc.rmsd(fixed_anchors, mobile_anchors), len(superimposition_anchors) + + rmsd, n_anchors = rmsd_from_alignment(ec_ferredoxin, st_ferredoxin, alignment) + print("Number of corresponding residues found:", n_anchors) + print("RMSD:", rmsd) + +Again, with a classical amino acid sequence based approach the accuracy of the +superimposition would be much lower: +In this case less corresponding residues can be found from the the amino sequence +alignment and the RMSD between them is significantly higher. + +.. jupyter-execute:: + + matrix = align.SubstitutionMatrix.std_protein_matrix() + ec_ferredoxin_seq = struc.to_sequence(ec_ferredoxin)[0][0] + st_ferredoxin_seq = struc.to_sequence(st_ferredoxin)[0][0] + alignment = align.align_optimal( + ec_ferredoxin_seq, + st_ferredoxin_seq, + matrix, + gap_penalty=(-10, -1), + terminal_penalty=False, + )[0] + rmsd, n_anchors = rmsd_from_alignment(ec_ferredoxin, st_ferredoxin, alignment) + print("Number of corresponding residues found:", n_anchors) + print("RMSD:", rmsd) + + fig, ax = plt.subplots(figsize=(8.0, 2.0)) + graphics.plot_alignment_similarity_based( + ax, alignment, matrix=matrix, labels=["EC", "ST"], symbols_per_line=50 + ) + +This shows only a small fraction of the versatility of structural alphabets. +They can also be used to find structural homologs in a large database, to superimpose +multiple structures at once and much more! \ No newline at end of file diff --git a/doc/tutorial/structure/index.rst b/doc/tutorial/structure/index.rst index 32b550994..49e8dbea6 100644 --- a/doc/tutorial/structure/index.rst +++ b/doc/tutorial/structure/index.rst @@ -54,3 +54,4 @@ contains functions for structure analysis and manipulation. measurement segments nucleotide + alphabet From 67b11f2352e745b39bb3d8081c9146b90e36ddc2 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 2 Nov 2024 16:01:47 +0100 Subject: [PATCH 8/9] Add docstring for `structure.alphabet` subpackage --- src/biotite/structure/alphabet/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/biotite/structure/alphabet/__init__.py b/src/biotite/structure/alphabet/__init__.py index baafab6e1..0f538d008 100644 --- a/src/biotite/structure/alphabet/__init__.py +++ b/src/biotite/structure/alphabet/__init__.py @@ -4,6 +4,18 @@ """ A subpackage for converting structures to structural alphabet sequences. + +Structural alphabets represent the local geometry of each residue in a structure as +symbol in a sequence. +This allows using sequence-based functionality from :mod:`biotite.sequence` on +structural data. + +For each supported structural alphabet, this subpackage provides a conversion function +that converts each chain of a given structure into a :class:`Sequence` object from the +respective structural alphabet. + +Note that the structural alphabets use lower-case letters as symbols, in order to +distinguish them better from the nucleotide and amino acid alphabets. """ __name__ = "biotite.structure.alphabet" From 8dc3f67f6e01816ee799651ac3725c80a6712e7c Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 2 Nov 2024 16:44:24 +0100 Subject: [PATCH 9/9] Include cached functions in API docs --- doc/apidoc.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/apidoc.py b/doc/apidoc.py index 8f10f0923..5a67bab3b 100644 --- a/doc/apidoc.py +++ b/doc/apidoc.py @@ -257,8 +257,13 @@ def _is_relevant_type(obj): in [types.FunctionType, types.BuiltinFunctionType, types.MethodType] ) | ( - # Functions from C-extensions - type(obj).__name__ in ["cython_function_or_method", "fused_cython_function"] + # Functions from C-extensions and wrapped functions + type(obj).__name__ + in [ + "cython_function_or_method", + "fused_cython_function", + "_lru_cache_wrapper", + ] ) | ( # Enum instance