Skip to content

Commit 0826672

Browse files
authored
Merge pull request #823 from padix-key/issue-553
Infer hetero residue IDs from author annotation
2 parents 7474d90 + e4afe9b commit 0826672

File tree

8 files changed

+344
-15
lines changed

8 files changed

+344
-15
lines changed

src/biotite/structure/io/pdbx/convert.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
5656
from biotite.structure.io.pdbx.component import MaskValue
5757
from biotite.structure.io.pdbx.encoding import StringArrayEncoding
58+
from biotite.structure.repair import create_continuous_res_ids
5859
from biotite.structure.residues import (
5960
get_residue_count,
6061
get_residue_positions,
@@ -496,12 +497,6 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
496497
atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
497498
).as_array(str),
498499
)
499-
array.set_annotation(
500-
"res_id",
501-
_get_or_fallback(
502-
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
503-
).as_array(int, -1),
504-
)
505500
array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
506501
array.set_annotation(
507502
"res_name",
@@ -518,6 +513,22 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
518513
)
519514
array.set_annotation("element", atom_site["type_symbol"].as_array(str))
520515

516+
# Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all
517+
# hetero residues, which makes distinguishing subsequent residues from another
518+
# difficult (https://github.com/biotite-dev/biotite/issues/553)
519+
res_id = _get_or_fallback(
520+
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
521+
).as_array(int, -1)
522+
if not use_author_fields and "auth_seq_id" in atom_site:
523+
# Therefore, the `auth_seq_id` is still used to determine residue starts
524+
# in `create_continuous_res_ids()`, even if `use_author_fields = False`.
525+
res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1)
526+
array.set_annotation("res_id", res_id_for_residue_starts)
527+
fallback_res_ids = create_continuous_res_ids(array)
528+
array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id))
529+
else:
530+
array.set_annotation("res_id", res_id)
531+
521532
if "atom_id" in extra_fields:
522533
if "id" in atom_site:
523534
array.set_annotation("atom_id", atom_site["id"].as_array(int))

src/biotite/structure/segments.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ def get_segment_starts(
6262
# Convert mask to indices
6363
# Add 1, to shift the indices from the end of a segment
6464
# to the start of a new segment
65-
chain_starts = np.where(segment_start_mask)[0] + 1
65+
segment_starts = np.where(segment_start_mask)[0] + 1
6666

6767
# The first chain is not included yet -> Insert '[0]'
6868
if add_exclusive_stop:
69-
return np.concatenate(([0], chain_starts, [array.array_length()]))
69+
return np.concatenate(([0], segment_starts, [array.array_length()]))
7070
else:
71-
return np.concatenate(([0], chain_starts))
71+
return np.concatenate(([0], segment_starts))
7272

7373

7474
def apply_segment_wise(starts, data, function, axis=None):
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Collection of structure file edges cases
2+
3+
- ``hetatm.pdb``: A simple PDB file containing a custom ligand, whose name is already
4+
taken by the CCD.
5+
However, since it contains only ``HETATM`` records, the bonds should not be taken from
6+
the CCD but from the ``CONECT`` records.
7+
- ``res_ids.cif``: Subsequent residues have the same ``label_xxx`` annotation, which
8+
makes it hard to determine where a new residue starts.
9+
However, using ``label_seq_id`` as fallback allows resolving the residue starts.
10+
Derived from PDB entry ``5HU8``.
File renamed without changes.

tests/structure/data/edge_cases/res_ids.cif

Lines changed: 275 additions & 0 deletions
Large diffs are not rendered by default.

tests/structure/io/test_pdb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,7 @@ def test_hetatm_intra_residue_bonds():
587587
],
588588
dtype=np.uint32,
589589
)
590-
path = join(data_dir("structure"), "hetatm/ligand.pdb")
590+
path = join(data_dir("structure"), "edge_cases", "hetatm.pdb")
591591

592592
pdb_file = pdb.PDBFile.read(path)
593593
structure = pdb.get_structure(pdb_file, model=1, include_bonds=True)

tests/structure/io/test_pdbx.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,38 @@ def test_extra_fields(tmpdir, format):
285285
assert test_atoms == ref_atoms
286286

287287

288+
def test_hetero_residue_borders():
289+
"""
290+
Check if the https://github.com/biotite-dev/biotite/issues/553 is resolved:
291+
Even if the ``label_xxx`` annotation is not sufficient to determine residue starts,
292+
the ``auth_seq_id`` should be used to create incrementing residue IDs.
293+
As reference the structure using author fields is taken, as this problem does not
294+
apply for them.
295+
The created residue IDs are manually checked, based on the CIF file, representing
296+
this edge case.
297+
"""
298+
path = join(data_dir("structure"), "edge_cases", "res_ids.cif")
299+
pdbx_file = pdbx.CIFFile.read(path)
300+
# The issue does not exist for author fields, hence they represent the reference
301+
ref_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=True)
302+
test_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
303+
304+
# The `label_xxx` variant should provide the same residue starts
305+
ref_res_starts = struc.get_residue_starts(ref_atoms)
306+
assert struc.get_residue_starts(test_atoms).tolist() == ref_res_starts.tolist()
307+
# The residue numbering should be incrementing from residue to residue
308+
# within the same chain for hetero residues
309+
test_het_res_ids, _ = struc.get_residues(test_atoms[test_atoms.hetero])
310+
assert (
311+
test_het_res_ids.tolist()
312+
== [1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 1, 1]
313+
) # fmt: skip
314+
# For non-hetero residues, the residue IDs represent the true sequence position
315+
# Hence, they represent the original `label_seq_id` values
316+
test_protein_res_ids, _ = struc.get_residues(test_atoms[~test_atoms.hetero])
317+
assert test_protein_res_ids.tolist() == [5, 6, 7]
318+
319+
288320
def test_dynamic_dtype():
289321
"""
290322
Check if the dtype of an annotation array is automatically adjusted if the

tests/structure/test_sequence.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,12 @@ def test_pdbx_sequence_consistency(path):
2727
pdbx_file = pdbx.BinaryCIFFile.read(path)
2828
ref_sequences = pdbx.get_sequence(pdbx_file)
2929

30-
atoms = pdbx.get_structure(pdbx_file, use_author_fields=False, model=1)
31-
# Remove pure solvent chains
32-
# In those chains the "label_seq_id" is usually "."
33-
# which is translated to -1
34-
atoms = atoms[atoms.res_id != -1]
30+
atoms = pdbx.get_structure(
31+
pdbx_file, use_author_fields=False, model=1, extra_fields=["label_entity_id"]
32+
)
33+
# Remove non-polymer chains
34+
polymer_entity_ids = pdbx_file.block["entity_poly"]["entity_id"].as_array(str)
35+
atoms = atoms[np.isin(atoms.label_entity_id, polymer_entity_ids)]
3536
test_sequences, _ = struc.to_sequence(atoms, allow_hetero=True)
3637

3738
# Matching against the PDBx file is not trivial

0 commit comments

Comments
 (0)