Merge pull request #682 from padix-key/struc-alph

Integrate structural alphabets better into Biotite
biotite-dev · Nov 4, 2024 · baafde9 · baafde9
2 parents 10f7977 + 8dc3f67
commit baafde9
Show file tree

Hide file tree

Showing 30 changed files with 516 additions and 277 deletions.
diff --git a/benchmarks/structure/benchmark_alphabet.py b/benchmarks/structure/benchmark_alphabet.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+import pytest
+import biotite.structure.alphabet as strucalph
+import biotite.structure.io.pdbx as pdbx
+from tests.util import data_dir
+
+PDB_ID = "1aki"
+
+
+@pytest.fixture
+def atoms():
+    pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif")
+    return pdbx.get_structure(pdbx_file, model=1, include_bonds=True)
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize("method", [strucalph.to_3di, strucalph.to_protein_blocks])
+def benchmark_structural_alphabet_methods(method, atoms):
+    """
+    Convert a structure to the given structural alphabet.
+    """
+    method(atoms)
diff --git a/doc/apidoc.py b/doc/apidoc.py
@@ -257,8 +257,13 @@ def _is_relevant_type(obj):
             in [types.FunctionType, types.BuiltinFunctionType, types.MethodType]
         )
         | (
-            # Functions from C-extensions
-            type(obj).__name__ in ["cython_function_or_method", "fused_cython_function"]
+            # Functions from C-extensions and wrapped functions
+            type(obj).__name__
+            in [
+                "cython_function_or_method",
+                "fused_cython_function",
+                "_lru_cache_wrapper",
+            ]
         )
         | (
             # Enum instance

diff --git a/doc/conf.py b/doc/conf.py
@@ -183,6 +183,7 @@
             "examples/scripts/structure/molecule",
             "examples/scripts/structure/contacts",
             "examples/scripts/structure/modeling",
+            "examples/scripts/structure/alphabet",
             "examples/scripts/structure/misc",
         ]
     ),

diff --git a/doc/examples/scripts/sequence/misc/color_schemes.py b/doc/examples/scripts/sequence/misc/color_schemes.py
@@ -2,7 +2,7 @@
 Biotite color schemes
 =====================
 
-This script displays the available color schemes.
+This script displays the available color schemes for the different built-in alphabets.
 """
 
 # Code source: Patrick Kunzmann
@@ -14,6 +14,7 @@
 from matplotlib.patches import Rectangle
 import biotite.sequence as seq
 import biotite.sequence.graphics as graphics
+import biotite.structure.alphabet as strucalph
 
 
 def plot_colors(ax, alphabet):
@@ -48,25 +49,34 @@ def plot_colors(ax, alphabet):
 
 nuc_alphabet = seq.NucleotideSequence.alphabet_amb
 prot_alphabet = seq.ProteinSequence.alphabet
-pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
+i3d_alphabet = strucalph.I3DSequence.alphabet
+pb_alphabet = strucalph.ProteinBlocksSequence.alphabet
 
-figure = plt.figure(figsize=(8.0, 5.0))
+figure = plt.figure(figsize=(8.0, 7.0))
 gs = GridSpec(
-    3,
+    4,
     1,
     height_ratios=[
         len(graphics.list_color_scheme_names(alphabet))
-        for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet)
+        for alphabet in (nuc_alphabet, prot_alphabet, i3d_alphabet, pb_alphabet)
     ],
 )
+
 ax = figure.add_subplot(gs[0, 0])
-ax.set_title("Nucleotide color schemes")
+ax.set_title("Nucleotide")
 plot_colors(ax, nuc_alphabet)
+
 ax = figure.add_subplot(gs[1, 0])
-ax.set_title("Protein color schemes")
+ax.set_title("Protein")
 plot_colors(ax, prot_alphabet)
+
 ax = figure.add_subplot(gs[2, 0])
-ax.set_title("Protein block color schemes")
+ax.set_title("3Di")
+plot_colors(ax, i3d_alphabet)
+
+ax = figure.add_subplot(gs[3, 0])
+ax.set_title("Protein Blocks")
 plot_colors(ax, pb_alphabet)
+
 plt.tight_layout()
 plt.show()
diff --git a/doc/examples/scripts/structure/alphabet/README.rst b/doc/examples/scripts/structure/alphabet/README.rst
@@ -0,0 +1,2 @@
+Structural alphabets
+--------------------
diff --git a/doc/examples/scripts/structure/alphabet/msa.py b/doc/examples/scripts/structure/alphabet/msa.py
@@ -0,0 +1,87 @@
+"""
+Multiple Structural alignment of orthologous proteins
+=====================================================
+
+In this example we perform a structural alignment of multiple lysozyme
+variants from different organisms.
+A feasible approach to perfrom such a multiple structure alignment is the
+usage of a structural alphabet:
+At first the structure is translated into a sequence that represents
+the structure.
+Then the sequences can be aligned with the standard sequence alignment
+techniques, using the substitution matrix of the structural alphabet.
+
+In this example, the structural alphabet we will use is called
+*Protein Blocks* (PBs) :footcite:`Brevern2000, Barnoud2017`:
+There are 16 different PBs, represented by the symbols ``a`` to ``p``.
+Each one depicts a different set of the backbone dihedral angles of a
+peptide 5-mer.
+
+.. footbibliography::
+"""
+
+# Code source: Patrick Kunzmann
+# License: BSD 3 clause
+
+import matplotlib.pyplot as plt
+import biotite.database.rcsb as rcsb
+import biotite.sequence.align as align
+import biotite.sequence.graphics as graphics
+import biotite.structure as struc
+import biotite.structure.alphabet as strucalph
+import biotite.structure.io.pdbx as pdbx
+
+PDB_IDS = ["1REX", "1AKI", "1DKJ", "1GD6"]
+
+# Create a PB sequence from each structure
+structural_sequences = []
+organisms = []
+for pdb_id in PDB_IDS:
+    file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif"))
+    # Take only the first model into account
+    array = pdbx.get_structure(file, model=1)
+    # We are only interested in the first protein chain
+    array = array[struc.filter_amino_acids(array)]
+    array = array[array.chain_id == array.chain_id[0]]
+    pb_sequences, _ = strucalph.to_protein_blocks(array)
+    structural_sequences.append(pb_sequences[0].remove_undefined())
+
+    try:
+        organism_name = file.block["entity_src_nat"][
+            "pdbx_organism_scientific"
+        ].as_item()
+    except KeyError:
+        organism_name = file.block["entity_src_gen"][
+            "pdbx_gene_src_scientific_name"
+        ].as_item()
+    generic, specific = organism_name.split(" ")
+    abbreviation = f"{generic[0]}. {specific}"
+    organisms.append(abbreviation)
+
+
+# Perform a multiple sequence alignment of the PB sequences
+matrix = align.SubstitutionMatrix.std_protein_blocks_matrix()
+alignment, order, _, _ = align.align_multiple(
+    structural_sequences, matrix, gap_penalty=(-500, -100), terminal_penalty=False
+)
+
+# Visualize the alignment
+# Order alignment according to guide tree
+alignment = alignment[:, order.tolist()]
+labels = [organisms[i] for i in order]
+fig = plt.figure(figsize=(8.0, 4.0))
+ax = fig.add_subplot(111)
+# The color scheme was generated with the 'Gecos' software
+graphics.plot_alignment_type_based(
+    ax,
+    alignment,
+    labels=labels,
+    symbols_per_line=45,
+    spacing=2,
+    show_numbers=True,
+    color_scheme="flower",
+)
+# Organism names in italic
+ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"})
+fig.tight_layout()
+plt.show()
diff --git a/doc/examples/scripts/structure/protein/pb_alignment.license b/doc/examples/scripts/structure/protein/pb_alignment.license
diff --git a/doc/examples/scripts/structure/protein/pb_alignment.py b/doc/examples/scripts/structure/protein/pb_alignment.py
diff --git a/doc/index.rst b/doc/index.rst
@@ -128,7 +128,7 @@ Biotite documentation
             <h2>Analyze sequence data</h2>
 
         Work with sequences of any kind: from the usual nucleotide and protein
-        sequences to your own sequence types created from a custom alphabet.
+        sequences to structural alphabets like *3Di*.
         Use the rapid and modular alignment tools to identify homologous
         regions or to map reads.
         Eventually, visualize your results in different *Matplotlib* based