diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 80d6b730e..7ea164dfc 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -55,6 +55,7 @@ from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile from biotite.structure.io.pdbx.component import MaskValue from biotite.structure.io.pdbx.encoding import StringArrayEncoding +from biotite.structure.repair import create_continuous_res_ids from biotite.structure.residues import ( get_residue_count, get_residue_positions, @@ -496,12 +497,6 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields): atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id" ).as_array(str), ) - array.set_annotation( - "res_id", - _get_or_fallback( - atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id" - ).as_array(int, -1), - ) array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, "")) array.set_annotation( "res_name", @@ -518,6 +513,22 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields): ) array.set_annotation("element", atom_site["type_symbol"].as_array(str)) + # Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all + # hetero residues, which makes distinguishing subsequent residues from another + # difficult (https://github.com/biotite-dev/biotite/issues/553) + res_id = _get_or_fallback( + atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id" + ).as_array(int, -1) + if not use_author_fields and "auth_seq_id" in atom_site: + # Therefore, the `auth_seq_id` is still used to determine residue starts + # in `create_continuous_res_ids()`, even if `use_author_fields = False`. 
+ res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1) + array.set_annotation("res_id", res_id_for_residue_starts) + fallback_res_ids = create_continuous_res_ids(array) + array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id)) + else: + array.set_annotation("res_id", res_id) + if "atom_id" in extra_fields: if "id" in atom_site: array.set_annotation("atom_id", atom_site["id"].as_array(int)) diff --git a/src/biotite/structure/segments.py b/src/biotite/structure/segments.py index 2d2b33492..dc13749d9 100644 --- a/src/biotite/structure/segments.py +++ b/src/biotite/structure/segments.py @@ -62,13 +62,13 @@ def get_segment_starts( # Convert mask to indices # Add 1, to shift the indices from the end of a segment # to the start of a new segment - chain_starts = np.where(segment_start_mask)[0] + 1 + segment_starts = np.where(segment_start_mask)[0] + 1 # The first chain is not included yet -> Insert '[0]' if add_exclusive_stop: - return np.concatenate(([0], chain_starts, [array.array_length()])) + return np.concatenate(([0], segment_starts, [array.array_length()])) else: - return np.concatenate(([0], chain_starts)) + return np.concatenate(([0], segment_starts)) def apply_segment_wise(starts, data, function, axis=None): diff --git a/tests/structure/data/edge_cases/README.rst b/tests/structure/data/edge_cases/README.rst new file mode 100644 index 000000000..b6f3ef6dd --- /dev/null +++ b/tests/structure/data/edge_cases/README.rst @@ -0,0 +1,10 @@ +# Collection of structure file edge cases + +- ``hetatm.pdb``: A simple PDB file containing a custom ligand, whose name is already + taken by the CCD. + However, since it contains only ``HETATM`` records, the bonds should not be taken from + the CCD but from the ``CONECT`` records. +- ``res_ids.cif``: Subsequent residues have the same ``label_xxx`` annotation, which + makes it hard to determine where a new residue starts. 
+ However, using ``auth_seq_id`` as fallback allows resolving the residue starts. + Derived from PDB entry ``5HU8``. \ No newline at end of file diff --git a/tests/structure/data/hetatm/ligand.pdb b/tests/structure/data/edge_cases/hetatm.pdb similarity index 100% rename from tests/structure/data/hetatm/ligand.pdb rename to tests/structure/data/edge_cases/hetatm.pdb diff --git a/tests/structure/data/edge_cases/res_ids.cif b/tests/structure/data/edge_cases/res_ids.cif new file mode 100644 index 000000000..00e21710d --- /dev/null +++ b/tests/structure/data/edge_cases/res_ids.cif @@ -0,0 +1,275 @@ +data_structure +# +loop_ +_atom_site.group_PDB +_atom_site.id +_atom_site.type_symbol +_atom_site.label_atom_id +_atom_site.label_alt_id +_atom_site.label_comp_id +_atom_site.label_asym_id +_atom_site.label_entity_id +_atom_site.label_seq_id +_atom_site.pdbx_PDB_ins_code +_atom_site.Cartn_x +_atom_site.Cartn_y +_atom_site.Cartn_z +_atom_site.occupancy +_atom_site.B_iso_or_equiv +_atom_site.pdbx_formal_charge +_atom_site.auth_seq_id +_atom_site.auth_comp_id +_atom_site.auth_asym_id +_atom_site.auth_atom_id +_atom_site.pdbx_PDB_model_num +ATOM 1 N N . SER A 1 5 ? 27.545 31.947 20.189 1.00 65.69 ? 0 SER A N 1 +ATOM 2 C CA . SER A 1 5 ? 26.575 33.050 20.472 1.00 73.04 ? 0 SER A CA 1 +ATOM 3 C C . SER A 1 5 ? 25.336 33.012 19.557 1.00 74.05 ? 0 SER A C 1 +ATOM 4 O O . SER A 1 5 ? 25.459 33.004 18.332 1.00 70.23 ? 0 SER A O 1 +ATOM 5 C CB . SER A 1 5 ? 27.254 34.409 20.313 1.00 74.98 ? 0 SER A CB 1 +ATOM 6 O OG . SER A 1 5 ? 26.397 35.452 20.736 1.00 76.08 ? 0 SER A OG 1 +ATOM 7 N N . ASP A 1 6 ? 24.161 33.036 20.180 1.00 76.32 ? 1 ASP A N 1 +ATOM 8 C CA . ASP A 1 6 ? 22.883 32.800 19.502 1.00 76.40 ? 1 ASP A CA 1 +ATOM 9 C C . ASP A 1 6 ? 22.533 33.859 18.447 1.00 73.05 ? 1 ASP A C 1 +ATOM 10 O O . ASP A 1 6 ? 22.789 35.056 18.650 1.00 72.07 ? 1 ASP A O 1 +ATOM 11 C CB . ASP A 1 6 ? 21.758 32.706 20.550 1.00 78.66 ? 1 ASP A CB 1 +ATOM 12 C CG . ASP A 1 6 ? 
21.894 31.485 21.467 1.00 79.34 ? 1 ASP A CG 1 +ATOM 13 O OD1 . ASP A 1 6 ? 22.754 30.612 21.188 1.00 88.44 ? 1 ASP A OD1 1 +ATOM 14 O OD2 . ASP A 1 6 ? 21.142 31.394 22.466 1.00 73.20 ? 1 ASP A OD2 1 +ATOM 15 N N . GLN A 1 7 ? 21.930 33.404 17.343 1.00 71.10 ? 2 GLN A N 1 +ATOM 16 C CA . GLN A 1 7 ? 21.584 34.262 16.187 1.00 71.87 ? 2 GLN A CA 1 +ATOM 17 C C . GLN A 1 7 ? 20.202 33.968 15.630 1.00 70.89 ? 2 GLN A C 1 +ATOM 18 O O . GLN A 1 7 ? 19.834 32.795 15.433 1.00 69.47 ? 2 GLN A O 1 +ATOM 19 C CB . GLN A 1 7 ? 22.581 34.060 15.012 1.00 73.66 ? 2 GLN A CB 1 +ATOM 20 C CG . GLN A 1 7 ? 23.701 35.092 14.901 1.00 80.48 ? 2 GLN A CG 1 +ATOM 21 C CD . GLN A 1 7 ? 24.633 34.862 13.711 1.00 89.25 ? 2 GLN A CD 1 +ATOM 22 O OE1 . GLN A 1 7 ? 24.193 34.512 12.610 1.00 92.21 ? 2 GLN A OE1 1 +ATOM 23 N NE2 . GLN A 1 7 ? 25.940 35.054 13.930 1.00 91.84 ? 2 GLN A NE2 1 +HETATM 11599 C C1 . NAG G 3 . ? 19.046 35.364 -21.980 1.00 99.62 ? 1 NAG G C1 1 +HETATM 11600 C C2 . NAG G 3 . ? 20.161 35.560 -23.014 1.00 113.08 ? 1 NAG G C2 1 +HETATM 11601 C C3 . NAG G 3 . ? 19.880 34.836 -24.361 1.00 112.51 ? 1 NAG G C3 1 +HETATM 11602 C C4 . NAG G 3 . ? 19.521 33.376 -24.080 1.00 108.46 ? 1 NAG G C4 1 +HETATM 11603 C C5 . NAG G 3 . ? 18.245 33.443 -23.224 1.00 106.06 ? 1 NAG G C5 1 +HETATM 11604 C C6 . NAG G 3 . ? 17.522 32.099 -23.065 1.00 103.74 ? 1 NAG G C6 1 +HETATM 11605 C C7 . NAG G 3 . ? 21.181 37.737 -22.458 1.00 109.41 ? 1 NAG G C7 1 +HETATM 11606 C C8 . NAG G 3 . ? 21.195 39.210 -22.781 1.00 108.42 ? 1 NAG G C8 1 +HETATM 11607 N N2 . NAG G 3 . ? 20.325 37.000 -23.179 1.00 111.93 ? 1 NAG G N2 1 +HETATM 11608 O O3 . NAG G 3 . ? 20.953 34.889 -25.286 1.00 117.13 ? 1 NAG G O3 1 +HETATM 11609 O O4 . NAG G 3 . ? 19.432 32.585 -25.267 1.00 100.31 ? 1 NAG G O4 1 +HETATM 11610 O O5 . NAG G 3 . ? 18.606 34.009 -21.963 1.00 103.64 ? 1 NAG G O5 1 +HETATM 11611 O O6 . NAG G 3 . ? 18.049 31.404 -21.946 1.00 105.65 ? 1 NAG G O6 1 +HETATM 11612 O O7 . NAG G 3 . ? 
21.927 37.275 -21.585 1.00 97.83 ? 1 NAG G O7 1 +HETATM 11613 C C1 . FUC G 3 . ? 18.241 29.976 -22.156 1.00 116.01 ? 2 FUC G C1 1 +HETATM 11614 C C2 . FUC G 3 . ? 19.721 29.598 -22.364 1.00 116.67 ? 2 FUC G C2 1 +HETATM 11615 C C3 . FUC G 3 . ? 20.471 30.115 -21.130 1.00 115.74 ? 2 FUC G C3 1 +HETATM 11616 C C4 . FUC G 3 . ? 19.900 29.414 -19.888 1.00 112.37 ? 2 FUC G C4 1 +HETATM 11617 C C5 . FUC G 3 . ? 18.365 29.511 -19.786 1.00 106.28 ? 2 FUC G C5 1 +HETATM 11618 C C6 . FUC G 3 . ? 17.758 28.606 -18.700 1.00 95.30 ? 2 FUC G C6 1 +HETATM 11619 O O2 . FUC G 3 . ? 20.282 30.028 -23.603 1.00 113.93 ? 2 FUC G O2 1 +HETATM 11620 O O3 . FUC G 3 . ? 21.869 29.926 -21.238 1.00 115.27 ? 2 FUC G O3 1 +HETATM 11621 O O4 . FUC G 3 . ? 20.300 28.061 -19.926 1.00 116.16 ? 2 FUC G O4 1 +HETATM 11622 O O5 . FUC G 3 . ? 17.760 29.226 -21.045 1.00 113.02 ? 2 FUC G O5 1 +HETATM 11623 C C1 . NAG H 4 . ? -48.905 38.254 -52.383 1.00 54.85 ? 1 NAG H C1 1 +HETATM 11624 C C2 . NAG H 4 . ? -49.330 39.652 -52.827 1.00 57.23 ? 1 NAG H C2 1 +HETATM 11625 C C3 . NAG H 4 . ? -49.914 40.344 -51.600 1.00 69.59 ? 1 NAG H C3 1 +HETATM 11626 C C4 . NAG H 4 . ? -51.093 39.501 -51.085 1.00 78.75 ? 1 NAG H C4 1 +HETATM 11627 C C5 . NAG H 4 . ? -50.633 38.054 -50.800 1.00 82.78 ? 1 NAG H C5 1 +HETATM 11628 C C6 . NAG H 4 . ? -51.747 37.080 -50.416 1.00 91.53 ? 1 NAG H C6 1 +HETATM 11629 C C7 . NAG H 4 . ? -47.955 41.119 -54.255 1.00 50.85 ? 1 NAG H C7 1 +HETATM 11630 C C8 . NAG H 4 . ? -46.788 42.067 -54.205 1.00 46.25 ? 1 NAG H C8 1 +HETATM 11631 N N2 . NAG H 4 . ? -48.233 40.551 -53.076 1.00 54.48 ? 1 NAG H N2 1 +HETATM 11632 O O3 . NAG H 4 . ? -50.125 41.759 -51.796 1.00 59.48 ? 1 NAG H O3 1 +HETATM 11633 O O4 . NAG H 4 . ? -51.653 40.040 -49.903 1.00 86.16 ? 1 NAG H O4 1 +HETATM 11634 O O5 . NAG H 4 . ? -49.995 37.508 -51.931 1.00 65.53 ? 1 NAG H O5 1 +HETATM 11635 O O6 . NAG H 4 . ? -52.655 36.980 -51.504 1.00 108.64 ? 1 NAG H O6 1 +HETATM 11636 O O7 . NAG H 4 . ? 
-48.586 40.911 -55.311 1.00 45.87 ? 1 NAG H O7 1 +HETATM 11637 C C1 . NAG H 4 . ? -53.059 40.062 -50.122 1.00 98.79 ? 2 NAG H C1 1 +HETATM 11638 C C2 . NAG H 4 . ? -53.837 40.224 -48.810 1.00 101.27 ? 2 NAG H C2 1 +HETATM 11639 C C3 . NAG H 4 . ? -55.184 40.950 -48.952 1.00 109.49 ? 2 NAG H C3 1 +HETATM 11640 C C4 . NAG H 4 . ? -55.326 41.755 -50.254 1.00 114.31 ? 2 NAG H C4 1 +HETATM 11641 C C5 . NAG H 4 . ? -54.707 40.951 -51.399 1.00 112.29 ? 2 NAG H C5 1 +HETATM 11642 C C6 . NAG H 4 . ? -54.949 41.447 -52.821 1.00 107.85 ? 2 NAG H C6 1 +HETATM 11643 C C7 . NAG H 4 . ? -53.436 38.279 -47.360 1.00 98.41 ? 2 NAG H C7 1 +HETATM 11644 C C8 . NAG H 4 . ? -53.869 36.877 -46.987 1.00 96.33 ? 2 NAG H C8 1 +HETATM 11645 N N2 . NAG H 4 . ? -54.128 38.875 -48.327 1.00 100.86 ? 2 NAG H N2 1 +HETATM 11646 O O3 . NAG H 4 . ? -55.416 41.759 -47.812 1.00 108.40 ? 2 NAG H O3 1 +HETATM 11647 O O4 . NAG H 4 . ? -56.683 42.046 -50.504 1.00 123.11 ? 2 NAG H O4 1 +HETATM 11648 O O5 . NAG H 4 . ? -53.333 41.013 -51.124 1.00 101.89 ? 2 NAG H O5 1 +HETATM 11649 O O6 . NAG H 4 . ? -55.141 42.836 -52.825 1.00 98.57 ? 2 NAG H O6 1 +HETATM 11650 O O7 . NAG H 4 . ? -52.489 38.840 -46.801 1.00 88.91 ? 2 NAG H O7 1 +HETATM 11651 C C1 . FUC H 4 . ? -53.799 36.114 -51.259 1.00 111.99 ? 3 FUC H C1 1 +HETATM 11652 C C2 . FUC H 4 . ? -54.724 36.190 -52.477 1.00 112.15 ? 3 FUC H C2 1 +HETATM 11653 C C3 . FUC H 4 . ? -54.051 35.559 -53.715 1.00 105.99 ? 3 FUC H C3 1 +HETATM 11654 C C4 . FUC H 4 . ? -53.387 34.190 -53.437 1.00 110.23 ? 3 FUC H C4 1 +HETATM 11655 C C5 . FUC H 4 . ? -52.723 34.087 -52.038 1.00 116.22 ? 3 FUC H C5 1 +HETATM 11656 C C6 . FUC H 4 . ? -52.370 32.653 -51.603 1.00 111.43 ? 3 FUC H C6 1 +HETATM 11657 O O2 . FUC H 4 . ? -55.093 37.544 -52.709 1.00 97.54 ? 3 FUC H O2 1 +HETATM 11658 O O3 . FUC H 4 . ? -54.962 35.480 -54.794 1.00 91.15 ? 3 FUC H O3 1 +HETATM 11659 O O4 . FUC H 4 . ? -54.285 33.125 -53.685 1.00 100.77 ? 3 FUC H O4 1 +HETATM 11660 O O5 . FUC H 4 . ? 
-53.497 34.740 -51.027 1.00 119.71 ? 3 FUC H O5 1 +HETATM 11661 C C1 . NAG I 3 . ? -15.959 45.383 -3.329 1.00 114.36 ? 1 NAG I C1 1 +HETATM 11662 C C2 . NAG I 3 . ? -16.167 43.869 -3.323 1.00 126.33 ? 1 NAG I C2 1 +HETATM 11663 C C3 . NAG I 3 . ? -17.264 43.527 -4.345 1.00 128.60 ? 1 NAG I C3 1 +HETATM 11664 C C4 . NAG I 3 . ? -18.515 44.443 -4.210 1.00 132.46 ? 1 NAG I C4 1 +HETATM 11665 C C5 . NAG I 3 . ? -18.216 45.934 -3.873 1.00 131.58 ? 1 NAG I C5 1 +HETATM 11666 C C6 . NAG I 3 . ? -19.410 46.729 -3.293 1.00 137.14 ? 1 NAG I C6 1 +HETATM 11667 C C7 . NAG I 3 . ? -14.520 42.017 -2.882 1.00 115.33 ? 1 NAG I C7 1 +HETATM 11668 C C8 . NAG I 3 . ? -13.101 41.547 -3.074 1.00 103.45 ? 1 NAG I C8 1 +HETATM 11669 N N2 . NAG I 3 . ? -14.845 43.207 -3.426 1.00 123.79 ? 1 NAG I N2 1 +HETATM 11670 O O3 . NAG I 3 . ? -17.628 42.171 -4.167 1.00 119.33 ? 1 NAG I O3 1 +HETATM 11671 O O4 . NAG I 3 . ? -19.315 44.343 -5.384 1.00 116.89 ? 1 NAG I O4 1 +HETATM 11672 O O5 . NAG I 3 . ? -17.148 46.051 -2.957 1.00 115.42 ? 1 NAG I O5 1 +HETATM 11673 O O6 . NAG I 3 . ? -19.008 47.784 -2.407 1.00 146.27 ? 1 NAG I O6 1 +HETATM 11674 O O7 . NAG I 3 . ? -15.319 41.299 -2.270 1.00 114.87 ? 1 NAG I O7 1 +HETATM 11675 C C1 . FUC I 3 . ? -20.085 48.712 -2.045 1.00 155.02 ? 2 FUC I C1 1 +HETATM 11676 C C2 . FUC I 3 . ? -20.487 48.599 -0.568 1.00 155.31 ? 2 FUC I C2 1 +HETATM 11677 C C3 . FUC I 3 . ? -19.482 49.225 0.408 1.00 163.37 ? 2 FUC I C3 1 +HETATM 11678 C C4 . FUC I 3 . ? -18.950 50.612 -0.010 1.00 169.60 ? 2 FUC I C4 1 +HETATM 11679 C C5 . FUC I 3 . ? -18.965 50.902 -1.534 1.00 172.53 ? 2 FUC I C5 1 +HETATM 11680 C C6 . FUC I 3 . ? -19.216 52.392 -1.826 1.00 167.74 ? 2 FUC I C6 1 +HETATM 11681 O O2 . FUC I 3 . ? -20.697 47.242 -0.236 1.00 142.74 ? 2 FUC I O2 1 +HETATM 11682 O O3 . FUC I 3 . ? -20.066 49.320 1.695 1.00 156.83 ? 2 FUC I O3 1 +HETATM 11683 O O4 . FUC I 3 . ? -19.561 51.655 0.749 1.00 148.57 ? 2 FUC I O4 1 +HETATM 11684 O O5 . FUC I 3 . ? 
-19.842 50.092 -2.339 1.00 166.37 ? 2 FUC I O5 1 +HETATM 11685 C C1 . NAG J 4 . ? -21.229 80.381 -68.347 1.00 76.61 ? 1 NAG J C1 1 +HETATM 11686 C C2 . NAG J 4 . ? -20.311 80.127 -69.514 1.00 78.77 ? 1 NAG J C2 1 +HETATM 11687 C C3 . NAG J 4 . ? -19.015 80.937 -69.354 1.00 85.97 ? 1 NAG J C3 1 +HETATM 11688 C C4 . NAG J 4 . ? -19.342 82.424 -69.167 1.00 92.01 ? 1 NAG J C4 1 +HETATM 11689 C C5 . NAG J 4 . ? -20.317 82.597 -67.986 1.00 98.01 ? 1 NAG J C5 1 +HETATM 11690 C C6 . NAG J 4 . ? -20.770 84.048 -67.776 1.00 101.97 ? 1 NAG J C6 1 +HETATM 11691 C C7 . NAG J 4 . ? -20.274 77.805 -70.330 1.00 77.56 ? 1 NAG J C7 1 +HETATM 11692 C C8 . NAG J 4 . ? -19.791 76.420 -69.989 1.00 75.20 ? 1 NAG J C8 1 +HETATM 11693 N N2 . NAG J 4 . ? -19.984 78.729 -69.425 1.00 81.63 ? 1 NAG J N2 1 +HETATM 11694 O O3 . NAG J 4 . ? -18.090 80.654 -70.411 1.00 79.48 ? 1 NAG J O3 1 +HETATM 11695 O O4 . NAG J 4 . ? -18.163 83.157 -68.886 1.00 100.80 ? 1 NAG J O4 1 +HETATM 11696 O O5 . NAG J 4 . ? -21.454 81.747 -68.045 1.00 80.22 ? 1 NAG J O5 1 +HETATM 11697 O O6 . NAG J 4 . ? -21.474 84.471 -68.921 1.00 111.97 ? 1 NAG J O6 1 +HETATM 11698 O O7 . NAG J 4 . ? -20.899 78.038 -71.364 1.00 74.34 ? 1 NAG J O7 1 +HETATM 11699 C C1 . NAG J 4 . ? -17.812 84.067 -69.954 1.00 113.08 ? 2 NAG J C1 1 +HETATM 11700 C C2 . NAG J 4 . ? -16.669 84.996 -69.491 1.00 116.63 ? 2 NAG J C2 1 +HETATM 11701 C C3 . NAG J 4 . ? -15.853 85.501 -70.692 1.00 124.69 ? 2 NAG J C3 1 +HETATM 11702 C C4 . NAG J 4 . ? -15.414 84.398 -71.674 1.00 122.52 ? 2 NAG J C4 1 +HETATM 11703 C C5 . NAG J 4 . ? -16.160 83.062 -71.480 1.00 118.89 ? 2 NAG J C5 1 +HETATM 11704 C C6 . NAG J 4 . ? -15.461 82.080 -70.494 1.00 112.44 ? 2 NAG J C6 1 +HETATM 11705 C C7 . NAG J 4 . ? -17.393 86.191 -67.414 1.00 101.72 ? 2 NAG J C7 1 +HETATM 11706 C C8 . NAG J 4 . ? -17.903 87.483 -66.819 1.00 103.36 ? 2 NAG J C8 1 +HETATM 11707 N N2 . NAG J 4 . ? -17.159 86.161 -68.738 1.00 109.45 ? 2 NAG J N2 1 +HETATM 11708 O O3 . NAG J 4 . ? 
-14.727 86.232 -70.241 1.00 128.43 ? 2 NAG J O3 1 +HETATM 11709 O O4 . NAG J 4 . ? -15.632 84.869 -72.998 1.00 114.95 ? 2 NAG J O4 1 +HETATM 11710 O O5 . NAG J 4 . ? -17.524 83.349 -71.159 1.00 117.79 ? 2 NAG J O5 1 +HETATM 11711 O O6 . NAG J 4 . ? -15.648 80.696 -70.751 1.00 97.31 ? 2 NAG J O6 1 +HETATM 11712 O O7 . NAG J 4 . ? -17.226 85.227 -66.676 1.00 83.96 ? 2 NAG J O7 1 +HETATM 11713 C C1 . FUC J 4 . ? -21.468 85.910 -69.064 1.00 128.13 ? 3 FUC J C1 1 +HETATM 11714 C C2 . FUC J 4 . ? -21.933 86.264 -70.489 1.00 128.32 ? 3 FUC J C2 1 +HETATM 11715 C C3 . FUC J 4 . ? -23.460 86.181 -70.651 1.00 130.52 ? 3 FUC J C3 1 +HETATM 11716 C C4 . FUC J 4 . ? -24.218 86.933 -69.534 1.00 135.05 ? 3 FUC J C4 1 +HETATM 11717 C C5 . FUC J 4 . ? -23.664 86.484 -68.157 1.00 140.91 ? 3 FUC J C5 1 +HETATM 11718 C C6 . FUC J 4 . ? -24.357 87.142 -66.953 1.00 132.46 ? 3 FUC J C6 1 +HETATM 11719 O O2 . FUC J 4 . ? -21.296 85.418 -71.427 1.00 117.27 ? 3 FUC J O2 1 +HETATM 11720 O O3 . FUC J 4 . ? -23.861 86.622 -71.932 1.00 123.08 ? 3 FUC J O3 1 +HETATM 11721 O O4 . FUC J 4 . ? -24.251 88.346 -69.725 1.00 118.57 ? 3 FUC J O4 1 +HETATM 11722 O O5 . FUC J 4 . ? -22.236 86.592 -68.067 1.00 143.26 ? 3 FUC J O5 1 +HETATM 11723 C C1 . NAG K 3 . ? 7.968 75.241 -17.274 1.00 108.55 ? 1 NAG K C1 1 +HETATM 11724 C C2 . NAG K 3 . ? 6.858 76.280 -16.938 1.00 104.66 ? 1 NAG K C2 1 +HETATM 11725 C C3 . NAG K 3 . ? 6.223 77.023 -18.132 1.00 108.81 ? 1 NAG K C3 1 +HETATM 11726 C C4 . NAG K 3 . ? 7.343 77.618 -18.976 1.00 116.84 ? 1 NAG K C4 1 +HETATM 11727 C C5 . NAG K 3 . ? 8.148 76.397 -19.420 1.00 126.23 ? 1 NAG K C5 1 +HETATM 11728 C C6 . NAG K 3 . ? 9.052 76.759 -20.596 1.00 130.57 ? 1 NAG K C6 1 +HETATM 11729 C C7 . NAG K 3 . ? 5.784 75.322 -14.982 1.00 108.60 ? 1 NAG K C7 1 +HETATM 11730 C C8 . NAG K 3 . ? 4.524 74.678 -14.444 1.00 100.84 ? 1 NAG K C8 1 +HETATM 11731 N N2 . NAG K 3 . ? 5.751 75.652 -16.266 1.00 108.28 ? 1 NAG K N2 1 +HETATM 11732 O O3 . NAG K 3 . ? 
5.268 77.992 -17.744 1.00 103.45 ? 1 NAG K O3 1 +HETATM 11733 O O4 . NAG K 3 . ? 6.890 78.388 -20.080 1.00 110.24 ? 1 NAG K O4 1 +HETATM 11734 O O5 . NAG K 3 . ? 8.820 75.805 -18.288 1.00 123.59 ? 1 NAG K O5 1 +HETATM 11735 O O6 . NAG K 3 . ? 10.359 76.343 -20.293 1.00 146.21 ? 1 NAG K O6 1 +HETATM 11736 O O7 . NAG K 3 . ? 6.787 75.523 -14.284 1.00 110.05 ? 1 NAG K O7 1 +HETATM 11737 C C1 . FUC K 3 . ? 11.320 77.315 -20.750 1.00 156.20 ? 2 FUC K C1 1 +HETATM 11738 C C2 . FUC K 3 . ? 11.661 78.310 -19.624 1.00 152.80 ? 2 FUC K C2 1 +HETATM 11739 C C3 . FUC K 3 . ? 12.496 77.609 -18.554 1.00 147.57 ? 2 FUC K C3 1 +HETATM 11740 C C4 . FUC K 3 . ? 13.736 76.975 -19.212 1.00 150.88 ? 2 FUC K C4 1 +HETATM 11741 C C5 . FUC K 3 . ? 13.287 75.986 -20.298 1.00 149.95 ? 2 FUC K C5 1 +HETATM 11742 C C6 . FUC K 3 . ? 14.455 75.299 -21.011 1.00 136.12 ? 2 FUC K C6 1 +HETATM 11743 O O2 . FUC K 3 . ? 10.511 78.890 -19.039 1.00 146.86 ? 2 FUC K O2 1 +HETATM 11744 O O3 . FUC K 3 . ? 12.816 78.499 -17.504 1.00 121.77 ? 2 FUC K O3 1 +HETATM 11745 O O4 . FUC K 3 . ? 14.559 77.956 -19.799 1.00 147.11 ? 2 FUC K O4 1 +HETATM 11746 O O5 . FUC K 3 . ? 12.486 76.665 -21.253 1.00 161.74 ? 2 FUC K O5 1 +HETATM 11747 C C1 . NAG L 4 . ? -2.424 31.619 -77.052 1.00 58.72 ? 1 NAG L C1 1 +HETATM 11748 C C2 . NAG L 4 . ? -3.499 30.549 -77.094 1.00 64.56 ? 1 NAG L C2 1 +HETATM 11749 C C3 . NAG L 4 . ? -2.987 29.268 -76.470 1.00 68.83 ? 1 NAG L C3 1 +HETATM 11750 C C4 . NAG L 4 . ? -1.717 28.858 -77.204 1.00 72.31 ? 1 NAG L C4 1 +HETATM 11751 C C5 . NAG L 4 . ? -0.749 30.012 -77.111 1.00 74.54 ? 1 NAG L C5 1 +HETATM 11752 C C6 . NAG L 4 . ? 0.582 29.608 -77.725 1.00 87.21 ? 1 NAG L C6 1 +HETATM 11753 C C7 . NAG L 4 . ? -5.684 31.548 -76.838 1.00 64.77 ? 1 NAG L C7 1 +HETATM 11754 C C8 . NAG L 4 . ? -6.772 31.879 -75.850 1.00 59.16 ? 1 NAG L C8 1 +HETATM 11755 N N2 . NAG L 4 . ? -4.634 30.949 -76.301 1.00 64.51 ? 1 NAG L N2 1 +HETATM 11756 O O3 . NAG L 4 . ? -4.048 28.331 -76.451 1.00 68.00 ? 
1 NAG L O3 1 +HETATM 11757 O O4 . NAG L 4 . ? -0.999 27.809 -76.574 1.00 88.66 ? 1 NAG L O4 1 +HETATM 11758 O O5 . NAG L 4 . ? -1.327 31.111 -77.750 1.00 58.92 ? 1 NAG L O5 1 +HETATM 11759 O O6 . NAG L 4 . ? 0.315 28.557 -78.675 1.00 117.00 ? 1 NAG L O6 1 +HETATM 11760 O O7 . NAG L 4 . ? -5.764 31.816 -78.055 1.00 64.05 ? 1 NAG L O7 1 +HETATM 11761 C C1 . NAG L 4 . ? -1.454 26.544 -77.037 1.00 101.08 ? 2 NAG L C1 1 +HETATM 11762 C C2 . NAG L 4 . ? -0.408 25.440 -77.032 1.00 104.72 ? 2 NAG L C2 1 +HETATM 11763 C C3 . NAG L 4 . ? -0.880 24.562 -78.196 1.00 109.80 ? 2 NAG L C3 1 +HETATM 11764 C C4 . NAG L 4 . ? -2.386 24.235 -78.027 1.00 116.53 ? 2 NAG L C4 1 +HETATM 11765 C C5 . NAG L 4 . ? -3.256 25.124 -77.062 1.00 118.60 ? 2 NAG L C5 1 +HETATM 11766 C C6 . NAG L 4 . ? -4.222 24.369 -76.127 1.00 114.77 ? 2 NAG L C6 1 +HETATM 11767 C C7 . NAG L 4 . ? 1.624 26.376 -75.977 1.00 102.24 ? 2 NAG L C7 1 +HETATM 11768 C C8 . NAG L 4 . ? 3.065 26.794 -76.170 1.00 98.29 ? 2 NAG L C8 1 +HETATM 11769 N N2 . NAG L 4 . ? 0.983 25.893 -77.070 1.00 106.18 ? 2 NAG L N2 1 +HETATM 11770 O O3 . NAG L 4 . ? -0.146 23.359 -78.312 1.00 109.12 ? 2 NAG L O3 1 +HETATM 11771 O O4 . NAG L 4 . ? -2.949 24.289 -79.322 1.00 105.49 ? 2 NAG L O4 1 +HETATM 11772 O O5 . NAG L 4 . ? -2.506 26.027 -76.261 1.00 103.98 ? 2 NAG L O5 1 +HETATM 11773 O O6 . NAG L 4 . ? -5.406 25.128 -75.909 1.00 103.57 ? 2 NAG L O6 1 +HETATM 11774 O O7 . NAG L 4 . ? 1.091 26.506 -74.857 1.00 82.59 ? 2 NAG L O7 1 +HETATM 11775 C C1 . FUC L 4 . ? 1.174 28.655 -79.882 1.00 128.04 ? 3 FUC L C1 1 +HETATM 11776 C C2 . FUC L 4 . ? 0.429 28.148 -81.130 1.00 128.58 ? 3 FUC L C2 1 +HETATM 11777 C C3 . FUC L 4 . ? -0.238 29.357 -81.802 1.00 125.52 ? 3 FUC L C3 1 +HETATM 11778 C C4 . FUC L 4 . ? 0.806 30.457 -82.055 1.00 131.27 ? 3 FUC L C4 1 +HETATM 11779 C C5 . FUC L 4 . ? 1.647 30.782 -80.804 1.00 129.28 ? 3 FUC L C5 1 +HETATM 11780 C C6 . FUC L 4 . ? 2.788 31.775 -81.017 1.00 124.12 ? 3 FUC L C6 1 +HETATM 11781 O O2 . FUC L 4 . ? 
-0.507 27.151 -80.760 1.00 128.22 ? 3 FUC L O2 1 +HETATM 11782 O O3 . FUC L 4 . ? -0.857 28.975 -83.009 1.00 103.01 ? 3 FUC L O3 1 +HETATM 11783 O O4 . FUC L 4 . ? 1.657 29.990 -83.070 1.00 141.78 ? 3 FUC L O4 1 +HETATM 11784 O O5 . FUC L 4 . ? 2.202 29.572 -80.287 1.00 126.88 ? 3 FUC L O5 1 +HETATM 11785 C C1 . NAG M 5 . ? -10.283 24.618 -21.532 1.00 91.75 ? 406 NAG A C1 1 +HETATM 11786 C C2 . NAG M 5 . ? -11.700 23.979 -21.411 1.00 105.35 ? 406 NAG A C2 1 +HETATM 11787 C C3 . NAG M 5 . ? -11.587 22.486 -20.982 1.00 105.03 ? 406 NAG A C3 1 +HETATM 11788 C C4 . NAG M 5 . ? -10.442 22.145 -19.987 1.00 109.97 ? 406 NAG A C4 1 +HETATM 11789 C C5 . NAG M 5 . ? -9.770 23.447 -19.513 1.00 108.78 ? 406 NAG A C5 1 +HETATM 11790 C C6 . NAG M 5 . ? -8.664 23.273 -18.463 1.00 105.30 ? 406 NAG A C6 1 +HETATM 11791 C C7 . NAG M 5 . ? -13.518 25.821 -21.005 1.00 107.29 ? 406 NAG A C7 1 +HETATM 11792 C C8 . NAG M 5 . ? -14.339 26.480 -19.936 1.00 99.08 ? 406 NAG A C8 1 +HETATM 11793 N N2 . NAG M 5 . ? -12.653 24.817 -20.602 1.00 114.39 ? 406 NAG A N2 1 +HETATM 11794 O O3 . NAG M 5 . ? -11.383 21.741 -22.163 1.00 102.22 ? 406 NAG A O3 1 +HETATM 11795 O O4 . NAG M 5 . ? -10.840 21.318 -18.893 1.00 104.37 ? 406 NAG A O4 1 +HETATM 11796 O O5 . NAG M 5 . ? -9.270 24.077 -20.683 1.00 98.04 ? 406 NAG A O5 1 +HETATM 11797 O O6 . NAG M 5 . ? -7.480 22.781 -19.040 1.00 95.00 ? 406 NAG A O6 1 +HETATM 11798 O O7 . NAG M 5 . ? -13.718 26.278 -22.144 1.00 100.33 ? 406 NAG A O7 1 +HETATM 11799 C C1 . NAG N 5 . ? -19.313 73.311 -18.956 1.00 89.91 ? 406 NAG C C1 1 +HETATM 11800 C C2 . NAG N 5 . ? -19.857 74.681 -19.389 1.00 98.94 ? 406 NAG C C2 1 +HETATM 11801 C C3 . NAG N 5 . ? -19.272 75.859 -18.592 1.00 103.88 ? 406 NAG C C3 1 +HETATM 11802 C C4 . NAG N 5 . ? -19.202 75.537 -17.096 1.00 105.36 ? 406 NAG C C4 1 +HETATM 11803 C C5 . NAG N 5 . ? -18.348 74.256 -17.012 1.00 102.03 ? 406 NAG C C5 1 +HETATM 11804 C C6 . NAG N 5 . ? -17.837 73.829 -15.629 1.00 101.07 ? 
406 NAG C C6 1 +HETATM 11805 C C7 . NAG N 5 . ? -20.565 74.835 -21.759 1.00 105.39 ? 406 NAG C C7 1 +HETATM 11806 C C8 . NAG N 5 . ? -20.125 74.974 -23.204 1.00 102.27 ? 406 NAG C C8 1 +HETATM 11807 N N2 . NAG N 5 . ? -19.602 74.825 -20.824 1.00 102.16 ? 406 NAG C N2 1 +HETATM 11808 O O3 . NAG N 5 . ? -20.032 77.032 -18.794 1.00 107.20 ? 406 NAG C O3 1 +HETATM 11809 O O4 . NAG N 5 . ? -18.775 76.679 -16.353 1.00 105.33 ? 406 NAG C O4 1 +HETATM 11810 O O5 . NAG N 5 . ? -19.183 73.242 -17.554 1.00 93.83 ? 406 NAG C O5 1 +HETATM 11811 O O6 . NAG N 5 . ? -18.906 73.677 -14.715 1.00 101.37 ? 406 NAG C O6 1 +HETATM 11812 O O7 . NAG N 5 . ? -21.769 74.739 -21.466 1.00 100.19 ? 406 NAG C O7 1 +HETATM 11813 C C1 . NAG O 5 . ? 21.825 57.497 -41.421 1.00 107.31 ? 406 NAG E C1 1 +HETATM 11814 C C2 . NAG O 5 . ? 22.486 57.008 -42.729 1.00 119.22 ? 406 NAG E C2 1 +HETATM 11815 C C3 . NAG O 5 . ? 23.829 57.707 -43.075 1.00 118.56 ? 406 NAG E C3 1 +HETATM 11816 C C4 . NAG O 5 . ? 24.760 58.053 -41.881 1.00 116.39 ? 406 NAG E C4 1 +HETATM 11817 C C5 . NAG O 5 . ? 24.064 57.649 -40.559 1.00 112.13 ? 406 NAG E C5 1 +HETATM 11818 C C6 . NAG O 5 . ? 24.832 58.076 -39.311 1.00 106.76 ? 406 NAG E C6 1 +HETATM 11819 C C7 . NAG O 5 . ? 21.814 54.632 -43.252 1.00 136.22 ? 406 NAG E C7 1 +HETATM 11820 C C8 . NAG O 5 . ? 22.075 53.175 -42.932 1.00 132.35 ? 406 NAG E C8 1 +HETATM 11821 N N2 . NAG O 5 . ? 22.581 55.540 -42.604 1.00 131.15 ? 406 NAG E N2 1 +HETATM 11822 O O3 . NAG O 5 . ? 23.491 58.896 -43.760 1.00 109.89 ? 406 NAG E O3 1 +HETATM 11823 O O4 . NAG O 5 . ? 26.128 57.614 -42.043 1.00 92.70 ? 406 NAG E O4 1 +HETATM 11824 O O5 . NAG O 5 . ? 22.734 58.186 -40.560 1.00 106.76 ? 406 NAG E O5 1 +HETATM 11825 O O6 . NAG O 5 . ? 24.087 59.055 -38.626 1.00 89.44 ? 406 NAG E O6 1 +HETATM 11826 O O7 . NAG O 5 . ? 20.934 54.910 -44.083 1.00 125.53 ? 
406 NAG E O7 1 diff --git a/tests/structure/io/test_pdb.py b/tests/structure/io/test_pdb.py index 37be2ab7b..aba417afa 100644 --- a/tests/structure/io/test_pdb.py +++ b/tests/structure/io/test_pdb.py @@ -587,7 +587,7 @@ def test_hetatm_intra_residue_bonds(): ], dtype=np.uint32, ) - path = join(data_dir("structure"), "hetatm/ligand.pdb") + path = join(data_dir("structure"), "edge_cases", "hetatm.pdb") pdb_file = pdb.PDBFile.read(path) structure = pdb.get_structure(pdb_file, model=1, include_bonds=True) diff --git a/tests/structure/io/test_pdbx.py b/tests/structure/io/test_pdbx.py index cc805cf6d..6bfc1c5c6 100644 --- a/tests/structure/io/test_pdbx.py +++ b/tests/structure/io/test_pdbx.py @@ -285,6 +285,38 @@ def test_extra_fields(tmpdir, format): assert test_atoms == ref_atoms +def test_hetero_residue_borders(): + """ + Check if the https://github.com/biotite-dev/biotite/issues/553 is resolved: + Even if the ``label_xxx`` annotation is not sufficient to determine residue starts, + the ``auth_seq_id`` should be used to create incrementing residue IDs. + As reference the structure using author fields is taken, as this problem does not + apply for them. + The created residue IDs are manually checked, based on the CIF file, representing + this edge case. 
+ """ + path = join(data_dir("structure"), "edge_cases", "res_ids.cif") + pdbx_file = pdbx.CIFFile.read(path) + # The issue does not exist for author fields, hence they represent the reference + ref_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=True) + test_atoms = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) + + # The `label_xxx` variant should provide the same residue starts + ref_res_starts = struc.get_residue_starts(ref_atoms) + assert struc.get_residue_starts(test_atoms).tolist() == ref_res_starts.tolist() + # The residue numbering should be incrementing from residue to residue + # within the same chain for hetero residues + test_het_res_ids, _ = struc.get_residues(test_atoms[test_atoms.hetero]) + assert ( + test_het_res_ids.tolist() + == [1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 1, 1] + ) # fmt: skip + # For non-hetero residues, the residue IDs represent the true sequence position + # Hence, they represent the original `label_seq_id` values + test_protein_res_ids, _ = struc.get_residues(test_atoms[~test_atoms.hetero]) + assert test_protein_res_ids.tolist() == [5, 6, 7] + + def test_dynamic_dtype(): """ Check if the dtype of an annotation array is automatically adjusted if the diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py index 23b96677b..10616c2ee 100644 --- a/tests/structure/test_sequence.py +++ b/tests/structure/test_sequence.py @@ -27,11 +27,12 @@ def test_pdbx_sequence_consistency(path): pdbx_file = pdbx.BinaryCIFFile.read(path) ref_sequences = pdbx.get_sequence(pdbx_file) - atoms = pdbx.get_structure(pdbx_file, use_author_fields=False, model=1) - # Remove pure solvent chains - # In those chains the "label_seq_id" is usually "." 
- # which is translated to -1 - atoms = atoms[atoms.res_id != -1] + atoms = pdbx.get_structure( + pdbx_file, use_author_fields=False, model=1, extra_fields=["label_entity_id"] + ) + # Remove non-polymer chains + polymer_entity_ids = pdbx_file.block["entity_poly"]["entity_id"].as_array(str) + atoms = atoms[np.isin(atoms.label_entity_id, polymer_entity_ids)] test_sequences, _ = struc.to_sequence(atoms, allow_hetero=True) # Matching against the PDBx file is not trivial