Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions src/biotite/structure/io/pdbx/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
from biotite.structure.io.pdbx.component import MaskValue
from biotite.structure.io.pdbx.encoding import StringArrayEncoding
from biotite.structure.repair import create_continuous_res_ids
from biotite.structure.residues import (
get_residue_count,
get_residue_positions,
Expand Down Expand Up @@ -496,12 +497,6 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
).as_array(str),
)
array.set_annotation(
"res_id",
_get_or_fallback(
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
).as_array(int, -1),
)
array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
array.set_annotation(
"res_name",
Expand All @@ -518,6 +513,22 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
)
array.set_annotation("element", atom_site["type_symbol"].as_array(str))

# Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all
# hetero residues, which makes distinguishing subsequent residues from another
# difficult (https://github.com/biotite-dev/biotite/issues/553)
res_id = _get_or_fallback(
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
).as_array(int, -1)
if not use_author_fields and "auth_seq_id" in atom_site:
# Therefore, the `auth_seq_id` is still used to determine residue starts
# in `create_continuous_res_ids()`, even if `use_author_fields = False`.
res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1)
array.set_annotation("res_id", res_id_for_residue_starts)
fallback_res_ids = create_continuous_res_ids(array)
array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id))
else:
array.set_annotation("res_id", res_id)

if "atom_id" in extra_fields:
if "id" in atom_site:
array.set_annotation("atom_id", atom_site["id"].as_array(int))
Expand Down
6 changes: 3 additions & 3 deletions src/biotite/structure/segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ def get_segment_starts(
# Convert mask to indices
# Add 1, to shift the indices from the end of a segment
# to the start of a new segment
chain_starts = np.where(segment_start_mask)[0] + 1
segment_starts = np.where(segment_start_mask)[0] + 1

# The first chain is not included yet -> Insert '[0]'
if add_exclusive_stop:
return np.concatenate(([0], chain_starts, [array.array_length()]))
return np.concatenate(([0], segment_starts, [array.array_length()]))
else:
return np.concatenate(([0], chain_starts))
return np.concatenate(([0], segment_starts))


def apply_segment_wise(starts, data, function, axis=None):
Expand Down
10 changes: 10 additions & 0 deletions tests/structure/data/edge_cases/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Collection of structure file edges cases

- ``hetatm.pdb``: A simple PDB file containing a custom ligand, whose name is already
taken by the CCD.
However, since it contains only ``HETATM`` records, the bonds should not be taken from
the CCD but from the ``CONECT`` records.
- ``res_ids.cif``: Subsequent residues have the same ``label_xxx`` annotation, which
makes it hard to determine where a new residue starts.
However, using ``label_seq_id`` as fallback allows resolving the residue starts.
Derived from PDB entry ``5HU8``.
275 changes: 275 additions & 0 deletions tests/structure/data/edge_cases/res_ids.cif

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/structure/io/test_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ def test_hetatm_intra_residue_bonds():
],
dtype=np.uint32,
)
path = join(data_dir("structure"), "hetatm/ligand.pdb")
path = join(data_dir("structure"), "edge_cases", "hetatm.pdb")

pdb_file = pdb.PDBFile.read(path)
structure = pdb.get_structure(pdb_file, model=1, include_bonds=True)
Expand Down
32 changes: 32 additions & 0 deletions tests/structure/io/test_pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,38 @@ def test_extra_fields(tmpdir, format):
assert test_atoms == ref_atoms


def test_hetero_residue_borders():
"""
Check if the https://github.com/biotite-dev/biotite/issues/553 is resolved:
Even if the ``label_xxx`` annotation is not sufficient to determine residue starts,
the ``auth_seq_id`` should be used to create incrementing residue IDs.
As reference the structure using author fields is taken, as this problem does not
apply for them.
The created residue IDs are manually checked, based on the CIF file, representing
this edge case.
"""
path = join(data_dir("structure"), "edge_cases", "res_ids.cif")
pdbx_file = pdbx.CIFFile.read(path)
# The issue does not exist for author fields, hence they represent the reference
ref_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=True)
test_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)

# The `label_xxx` variant should provide the same residue starts
ref_res_starts = struc.get_residue_starts(ref_atoms)
assert struc.get_residue_starts(test_atoms).tolist() == ref_res_starts.tolist()
# The residue numbering should be incrementing from residue to residue
# within the same chain for hetero residues
test_het_res_ids, _ = struc.get_residues(test_atoms[test_atoms.hetero])
assert (
test_het_res_ids.tolist()
== [1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 1, 1]
) # fmt: skip
# For non-hetero residues, the residue IDs represent the true sequence position
# Hence, they represent the original `label_seq_id` values
test_protein_res_ids, _ = struc.get_residues(test_atoms[~test_atoms.hetero])
assert test_protein_res_ids.tolist() == [5, 6, 7]


def test_dynamic_dtype():
"""
Check if the dtype of an annotation array is automatically adjusted if the
Expand Down
11 changes: 6 additions & 5 deletions tests/structure/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ def test_pdbx_sequence_consistency(path):
pdbx_file = pdbx.BinaryCIFFile.read(path)
ref_sequences = pdbx.get_sequence(pdbx_file)

atoms = pdbx.get_structure(pdbx_file, use_author_fields=False, model=1)
# Remove pure solvent chains
# In those chains the "label_seq_id" is usually "."
# which is translated to -1
atoms = atoms[atoms.res_id != -1]
atoms = pdbx.get_structure(
pdbx_file, use_author_fields=False, model=1, extra_fields=["label_entity_id"]
)
# Remove non-polymer chains
polymer_entity_ids = pdbx_file.block["entity_poly"]["entity_id"].as_array(str)
atoms = atoms[np.isin(atoms.label_entity_id, polymer_entity_ids)]
test_sequences, _ = struc.to_sequence(atoms, allow_hetero=True)

# Matching against the PDBx file is not trivial
Expand Down
Loading