Merge pull request #823 from padix-key/issue-553

padix-key · web-flow · commit 08266725cad7 · 2025-08-22T14:44:19.000+02:00
Infer hetero residue IDs from author annotation
diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
@@ -55,6 +55,7 @@
 from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
 from biotite.structure.io.pdbx.component import MaskValue
 from biotite.structure.io.pdbx.encoding import StringArrayEncoding
+from biotite.structure.repair import create_continuous_res_ids
 from biotite.structure.residues import (
     get_residue_count,
     get_residue_positions,
@@ -496,12 +497,6 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
             atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
         ).as_array(str),
     )
-    array.set_annotation(
-        "res_id",
-        _get_or_fallback(
-            atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
-        ).as_array(int, -1),
-    )
     array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
     array.set_annotation(
         "res_name",
@@ -518,6 +513,22 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
     )
     array.set_annotation("element", atom_site["type_symbol"].as_array(str))
 
+    # Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all
+    # hetero residues, which makes distinguishing subsequent residues from another
+    # difficult (https://github.com/biotite-dev/biotite/issues/553)
+    res_id = _get_or_fallback(
+        atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
+    ).as_array(int, -1)
+    if not use_author_fields and "auth_seq_id" in atom_site:
+        # Therefore, the `auth_seq_id` is still used to determine residue starts
+        # in `create_continuous_res_ids()`, even if `use_author_fields = False`.
+        res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1)
+        array.set_annotation("res_id", res_id_for_residue_starts)
+        fallback_res_ids = create_continuous_res_ids(array)
+        array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id))
+    else:
+        array.set_annotation("res_id", res_id)
+
     if "atom_id" in extra_fields:
         if "id" in atom_site:
             array.set_annotation("atom_id", atom_site["id"].as_array(int))
diff --git a/src/biotite/structure/segments.py b/src/biotite/structure/segments.py
@@ -62,13 +62,13 @@ def get_segment_starts(
     # Convert mask to indices
     # Add 1, to shift the indices from the end of a segment
     # to the start of a new segment
-    chain_starts = np.where(segment_start_mask)[0] + 1
+    segment_starts = np.where(segment_start_mask)[0] + 1
 
     # The first chain is not included yet -> Insert '[0]'
     if add_exclusive_stop:
-        return np.concatenate(([0], chain_starts, [array.array_length()]))
+        return np.concatenate(([0], segment_starts, [array.array_length()]))
     else:
-        return np.concatenate(([0], chain_starts))
+        return np.concatenate(([0], segment_starts))
 
 
 def apply_segment_wise(starts, data, function, axis=None):
diff --git a/tests/structure/data/edge_cases/README.rst b/tests/structure/data/edge_cases/README.rst
@@ -0,0 +1,10 @@
+# Collection of structure file edges cases
+
+- ``hetatm.pdb``: A simple PDB file containing a custom ligand, whose name is already
+  taken by the CCD.
+  However, since it contains only ``HETATM`` records, the bonds should not be taken from
+  the CCD but from the ``CONECT`` records.
+- ``res_ids.cif``: Subsequent residues have the same ``label_xxx`` annotation, which
+  makes it hard to determine where a new residue starts.
+  However, using ``label_seq_id`` as fallback allows resolving the residue starts.
+  Derived from PDB entry ``5HU8``.
diff --git a/tests/structure/data/edge_cases/hetatm.pdb b/tests/structure/data/edge_cases/hetatm.pdb
diff --git a/tests/structure/data/edge_cases/res_ids.cif b/tests/structure/data/edge_cases/res_ids.cif
diff --git a/tests/structure/io/test_pdb.py b/tests/structure/io/test_pdb.py
@@ -587,7 +587,7 @@ def test_hetatm_intra_residue_bonds():
         ],
         dtype=np.uint32,
     )
-    path = join(data_dir("structure"), "hetatm/ligand.pdb")
+    path = join(data_dir("structure"), "edge_cases", "hetatm.pdb")
 
     pdb_file = pdb.PDBFile.read(path)
     structure = pdb.get_structure(pdb_file, model=1, include_bonds=True)
diff --git a/tests/structure/io/test_pdbx.py b/tests/structure/io/test_pdbx.py
@@ -285,6 +285,38 @@ def test_extra_fields(tmpdir, format):
     assert test_atoms == ref_atoms
 
 
+def test_hetero_residue_borders():
+    """
+    Check if the https://github.com/biotite-dev/biotite/issues/553 is resolved:
+    Even if the ``label_xxx`` annotation is not sufficient to determine residue starts,
+    the ``auth_seq_id`` should be used to create incrementing residue IDs.
+    As reference the structure using author fields is taken, as this problem does not
+    apply for them.
+    The created residue IDs are manually checked, based on the CIF file, representing
+    this edge case.
+    """
+    path = join(data_dir("structure"), "edge_cases", "res_ids.cif")
+    pdbx_file = pdbx.CIFFile.read(path)
+    # The issue does not exist for author fields, hence they represent the reference
+    ref_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=True)
+    test_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
+
+    # The `label_xxx` variant should provide the same residue starts
+    ref_res_starts = struc.get_residue_starts(ref_atoms)
+    assert struc.get_residue_starts(test_atoms).tolist() == ref_res_starts.tolist()
+    # The residue numbering should be incrementing from residue to residue
+    # within the same chain for hetero residues
+    test_het_res_ids, _ = struc.get_residues(test_atoms[test_atoms.hetero])
+    assert (
+        test_het_res_ids.tolist()
+        == [1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 1, 1]
+    )  # fmt: skip
+    # For non-hetero residues, the residue IDs represent the true sequence position
+    # Hence, they represent the original `label_seq_id` values
+    test_protein_res_ids, _ = struc.get_residues(test_atoms[~test_atoms.hetero])
+    assert test_protein_res_ids.tolist() == [5, 6, 7]
+
+
 def test_dynamic_dtype():
     """
     Check if the dtype of an annotation array is automatically adjusted if the
diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py
@@ -27,11 +27,12 @@ def test_pdbx_sequence_consistency(path):
     pdbx_file = pdbx.BinaryCIFFile.read(path)
     ref_sequences = pdbx.get_sequence(pdbx_file)
 
-    atoms = pdbx.get_structure(pdbx_file, use_author_fields=False, model=1)
-    # Remove pure solvent chains
-    # In those chains the "label_seq_id" is usually "."
-    # which is translated to -1
-    atoms = atoms[atoms.res_id != -1]
+    atoms = pdbx.get_structure(
+        pdbx_file, use_author_fields=False, model=1, extra_fields=["label_entity_id"]
+    )
+    # Remove non-polymer chains
+    polymer_entity_ids = pdbx_file.block["entity_poly"]["entity_id"].as_array(str)
+    atoms = atoms[np.isin(atoms.label_entity_id, polymer_entity_ids)]
     test_sequences, _ = struc.to_sequence(atoms, allow_hetero=True)
 
     # Matching against the PDBx file is not trivial

Original file line number	Diff line number	Diff line change
`@@ -587,7 +587,7 @@ def test_hetatm_intra_residue_bonds():`
`587`	`587`	`],`
`588`	`588`	`dtype=np.uint32,`
`589`	`589`	`)`
`590`		`- path = join(data_dir("structure"), "hetatm/ligand.pdb")`
	`590`	`+ path = join(data_dir("structure"), "edge_cases", "hetatm.pdb")`
`591`	`591`
`592`	`592`	`pdb_file = pdb.PDBFile.read(path)`
`593`	`593`	`structure = pdb.get_structure(pdb_file, model=1, include_bonds=True)`