Fix usage of mkdssp>=4

padix-key · padix-key · commit 9cb491737e22 · 2025-03-02T16:36:51.000+01:00
diff --git a/.github/workflows/test_and_deploy.yml b/.github/workflows/test_and_deploy.yml
@@ -209,10 +209,6 @@ jobs:
           miniforge-version: latest
       - name: Install distribution
         run: pip install ./dist/*.whl
-      - name: "TEMP: Skip DSSP tests"
-        # TEMP: Omit DSSP tests for now until conda-forge DSSP is functional
-        # (https://github.com/conda-forge/dssp-feedstock/pull/4)
-        run: mamba uninstall dssp
       - name: Run tests
         # Running NCBI BLAST and SRA takes too long
         # The tests on the NCBI Entrez database are not reliable enough
diff --git a/environment.yml b/environment.yml
@@ -7,7 +7,6 @@ channels:
   - anaconda
   - conda-forge
   - bioconda
-  - salilab
 
 dependencies:
   - python =3.11
@@ -33,7 +32,7 @@ dependencies:
   # Interfaced software in biotite.application (can also be installed separately)
   - autodock-vina
   - clustalo
-  - dssp =3
+  - dssp =4
   - mafft
   - muscle =3
   - sra-tools =3
diff --git a/src/biotite/application/dssp/app.py b/src/biotite/application/dssp/app.py
@@ -11,8 +11,13 @@
 import numpy as np
 from biotite.application.application import AppState, requires_state
 from biotite.application.localapp import LocalApp, cleanup_tempfile, get_version
-from biotite.structure.io.pdbx.cif import CIFFile
+from biotite.structure.error import BadStructureError
+from biotite.structure.filter import filter_amino_acids
+from biotite.structure.io.pdbx.cif import CIFCategory, CIFColumn, CIFFile
+from biotite.structure.io.pdbx.component import MaskValue
 from biotite.structure.io.pdbx.convert import set_structure
+from biotite.structure.repair import create_continuous_res_ids
+from biotite.structure.residues import get_residue_starts
 
 
 class DsspApp(LocalApp):
@@ -49,17 +54,19 @@ class DsspApp(LocalApp):
     >>> app.start()
     >>> app.join()
     >>> print(app.get_sse())
-    ['C' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' 'G' 'G' 'G' 'G' 'T' 'C' 'C' 'C'
-     'C' 'C']
+    ['C' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' 'G' 'G' 'G' 'G' 'T' 'C' 'P' 'P'
+     'P' 'C']
     """
 
     def __init__(self, atom_array, bin_path="mkdssp"):
         super().__init__(bin_path)
 
-        # mkdssp requires also the
-        # 'occupancy', 'b_factor' and 'charge' fields
-        # -> Add these annotations to a copy of the input structure
+        if not np.all(filter_amino_acids(atom_array)):
+            raise BadStructureError("The input structure must contain only amino acids")
         self._array = atom_array.copy()
+        # DSSP also requires the
+        # 'occupancy', 'b_factor' and 'charge' fields
+        # -> Add placeholder values for missing annotations
         categories = self._array.get_annotation_categories()
         if "charge" not in categories:
             self._array.set_annotation(
@@ -73,6 +80,10 @@ def __init__(self, atom_array, bin_path="mkdssp"):
             self._array.set_annotation(
                 "occupancy", np.ones(self._array.array_length(), dtype=float)
             )
+        # DSSP>=4 complains about the `pdbx_poly_seq_scheme` category,
+        # if `seq_id` does not start at 1
+        self._array.res_id = create_continuous_res_ids(self._array)
+
         try:
             # The parameters have changed in version 4
             self._new_cli = get_version(bin_path)[0] >= 4
@@ -86,6 +97,9 @@ def __init__(self, atom_array, bin_path="mkdssp"):
     def run(self):
         in_file = CIFFile()
         set_structure(in_file, self._array)
+        in_file.block["pdbx_poly_seq_scheme"] = _create_pdbx_poly_seq_scheme(
+            self._array, in_file.block["atom_site"]["label_entity_id"].as_array(str)
+        )
         in_file.write(self._in_file)
         self._in_file.flush()
         if self._new_cli:
@@ -157,3 +171,46 @@ def annotate_sse(atom_array, bin_path="mkdssp"):
         app.start()
         app.join()
         return app.get_sse()
+
+
+def _create_pdbx_poly_seq_scheme(atom_array, entity_ids):
+    """
+    Create the ``pdbx_poly_seq_scheme`` category, as required by DSSP.
+
+    Parameters
+    ----------
+    atom_array : AtomArray
+        The atom array to create the category from.
+    entity_ids : ndarray, dtype=str
+        The entity ID of each atom.
+
+    Returns
+    -------
+    pdbx_poly_seq_scheme : CIFCategory
+        The ``pdbx_poly_seq_scheme`` category.
+    """
+    res_start_indices = get_residue_starts(atom_array)
+    chain_id = atom_array.chain_id[res_start_indices]
+    res_name = atom_array.res_name[res_start_indices]
+    res_id = atom_array.res_id[res_start_indices]
+    ins_code = atom_array.ins_code[res_start_indices]
+    hetero = atom_array.hetero[res_start_indices]
+    entity_id = entity_ids[res_start_indices]
+
+    poly_seq_scheme = CIFCategory()
+    poly_seq_scheme["asym_id"] = chain_id
+    poly_seq_scheme["entity_id"] = entity_id
+    poly_seq_scheme["seq_id"] = res_id
+    poly_seq_scheme["mon_id"] = res_name
+    poly_seq_scheme["ndb_seq_num"] = res_id
+    poly_seq_scheme["pdb_seq_num"] = res_id
+    poly_seq_scheme["auth_seq_num"] = res_id
+    poly_seq_scheme["pdb_mon_id"] = res_name
+    poly_seq_scheme["auth_mon_id"] = res_name
+    poly_seq_scheme["pdb_strand_id"] = chain_id
+    poly_seq_scheme["pdb_ins_code"] = CIFColumn(
+        ins_code, np.where(ins_code == "", MaskValue.MISSING, MaskValue.PRESENT)
+    )
+    poly_seq_scheme["hetero"] = np.where(hetero, "y", "n")
+
+    return poly_seq_scheme
diff --git a/tests/application/test_dssp.py b/tests/application/test_dssp.py
@@ -3,7 +3,6 @@
 # information.
 
 from os.path import join
-from subprocess import SubprocessError
 import numpy as np
 import pytest
 import biotite.structure as struc
@@ -13,21 +12,38 @@
 from tests.util import data_dir, is_not_installed
 
 
+@pytest.mark.parametrize(
+    "pdb_id",
+    [
+        "1aki",  # Single chain
+        "1igy",  # Multiple chains
+        "5eil",  # Contains non-canonical amino acid
+    ],
+)
 @pytest.mark.skipif(is_not_installed("mkdssp"), reason="DSSP is not installed")
-def test_multiple_chains():
+def test_annotation(pdb_id):
+    """
+    Check if the DSSP annotation has the correct length and reasonable values.
+    """
     atoms = pdbx.get_structure(
-        pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1igy.bcif")), model=1
+        pdbx.BinaryCIFFile.read(join(data_dir("structure"), f"{pdb_id}.bcif")), model=1
     )
-    atoms = atoms[struc.filter_canonical_amino_acids(atoms)]
+    atoms = atoms[struc.filter_amino_acids(atoms)]
     sse = DsspApp.annotate_sse(atoms)
-    assert np.all(np.isin(sse, ["C", "H", "B", "E", "G", "I", "T", "S"]))
+
+    assert np.all(np.isin(sse, ["C", "H", "B", "E", "G", "I", "T", "S", "P"]))
+    # One SSE per residue
     assert len(sse) == struc.get_residue_count(atoms)
 
 
 @pytest.mark.skipif(is_not_installed("mkdssp"), reason="DSSP is not installed")
 def test_invalid_structure():
+    """
+    Check that an exception is raised if the input structure contains non-amino-acid
+    residues.
+    """
     array = strucio.load_structure(join(data_dir("structure"), "5ugo.bcif"))
     # Get DNA chain -> Invalid for DSSP
     chain = array[array.chain_id == "T"]
-    with pytest.raises(SubprocessError):
+    with pytest.raises(struc.BadStructureError):
         DsspApp.annotate_sse(chain)