diff --git a/.github/workflows/test_and_deploy.yml b/.github/workflows/test_and_deploy.yml index 5c70431ca..c3df5e28e 100644 --- a/.github/workflows/test_and_deploy.yml +++ b/.github/workflows/test_and_deploy.yml @@ -30,6 +30,8 @@ env: # Once GHA and cibuildwheel are updated this can be removed # mussllinux takes 6+ hrs to build and test so ignore it CIBW_TEST_SKIP: "*musllinux* *-macosx_arm64" + # Configuration for the architecture-agnostic jobs + PY_VERSION: "3.12" # Keep in sync with version in environment.yml jobs: @@ -50,6 +52,43 @@ jobs: run: ruff check + build-internal: + name: Build CCD and wheel for reusing it in several CI jobs + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PY_VERSION }} + - name: Get current CCD for hashing + run: wget https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz + - name: Cache CCD + uses: actions/cache@v4 + id: cache-ccd + with: + path: ./src/biotite/structure/info/components.bcif + key: cache-${{ hashFiles('setup_ccd.py') }}-${{ hashFiles('components.cif.gz') }} + - name: Remove CCD used for hashing + run: rm components.cif.gz + - name: Build internal CCD + if: steps.cache-ccd.outputs.cache-hit != 'true' + run: | + pip install . 
+ python setup_ccd.py + - name: Install build backend + run: pip install build + - name: Build distribution + run: python -m build --wheel + - uses: actions/upload-artifact@v4 + with: + name: internal-build + path: ./dist/*.whl + - uses: actions/upload-artifact@v4 + with: + name: ccd + path: ./src/biotite/structure/info/components.bcif + + generate-wheels-matrix: name: "Generate wheels matrix" runs-on: "ubuntu-latest" @@ -85,13 +124,20 @@ jobs: test-and-build: name: "Build & Test" - needs: "generate-wheels-matrix" + needs: + - generate-wheels-matrix + - build-internal strategy: matrix: include: ${{ fromJson(needs.generate-wheels-matrix.outputs.include) }} runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 + - name: Add internal CCD to Biotite + uses: actions/download-artifact@v4 + with: + name: ccd + path: src/biotite/structure/info # QEMU enables building/testing for non-native architectures (ie arm64) # at the cost of speed @@ -113,11 +159,18 @@ jobs: path: ./wheelhouse/*.whl - make-sdist: + sdist: name: Build source distribution runs-on: ubuntu-latest + needs: + - build-internal steps: - uses: actions/checkout@v4 + - name: Add internal CCD to Biotite + uses: actions/download-artifact@v4 + with: + name: ccd + path: src/biotite/structure/info - name: Build source distribution run: pipx run build --sdist - uses: actions/upload-artifact@v4 @@ -130,20 +183,24 @@ jobs: name: Test interfaces to databases and applications runs-on: ubuntu-latest + needs: + - build-internal defaults: run: shell: bash -l {0} steps: - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 + with: + name: internal-build + path: dist - uses: conda-incubator/setup-miniconda@v3 with: environment-file: environment.yml miniforge-version: latest - - name: Build distribution - run: pip wheel --no-deps -w dist . 
- name: Install distribution - run: pip install .//dist//*.whl + run: pip install ./dist/*.whl - name: "TEMP: Skip DSSP tests" # TEMP: Omit DSSP tests for now until conda-forge DSSP is functional # (https://github.com/conda-forge/dssp-feedstock/pull/4) @@ -161,11 +218,6 @@ jobs: tests//test_modname.py tests//database tests//application - # Use the built distribution for other downstream jobs to reduce build time - - uses: actions/upload-artifact@v4 - with: - name: internal-build - path: ./dist/*.whl test-muscle5: @@ -173,7 +225,7 @@ jobs: runs-on: ubuntu-latest needs: - - test-interfaces + - build-internal defaults: run: shell: bash -l {0} @@ -188,7 +240,7 @@ jobs: with: activate-environment: biotite-dev miniforge-version: latest - python-version: "3.12" + python-version: ${{ env.PY_VERSION }} - name: Install Muscle 5 run: conda install -c bioconda "muscle=5" - name: Install distribution and pytest @@ -197,46 +249,12 @@ jobs: run: pytest --durations=50 tests//application//test_msa.py - make-ccd: - name: Compile the CCD subset for structure.info from the wwPDB CCD - - runs-on: ubuntu-latest - needs: - - test-interfaces - defaults: - run: - shell: bash -l {0} - - steps: - - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - with: - name: internal-build - path: dist - - uses: actions/setup-python@v3 - with: - python-version: "3.12" - - name: Install distribution - run: pip install dist//*.whl - - name: Compile CCD subset - run: python setup_ccd.py - - name: Zip CCD - run: | - cd src/biotite/structure/info - zip -r ${{ github.workspace }}//dist//ccd.zip ccd - cd ${{ github.workspace }} - - uses: actions/upload-artifact@v4 - with: - name: ccd - path: dist//ccd.zip - - - make-docs: + docs: name: Build documentation - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest needs: - - test-interfaces + - build-internal defaults: run: shell: bash -l {0} @@ -258,7 +276,7 @@ jobs: environment-file: environment.yml miniforge-version: latest - name: Install distribution 
- run: pip install dist//*.whl + run: pip install dist/*.whl - name: Build base documentation run: sphinx-build -a -D plot_gallery=0 doc build//doc - name: Build tutorial and gallery @@ -285,16 +303,16 @@ jobs: runs-on: ubuntu-latest if: github.event_name != 'release' needs: - - test-interfaces + - build-internal steps: - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 with: name: internal-build path: dist - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: ${{ env.PY_VERSION }} - name: Install dependencies run: pip install dist//*.whl pytest pytest-codspeed - name: Run benchmarks @@ -310,7 +328,7 @@ jobs: needs: - lint - test-and-build - - make-sdist + - sdist - test-interfaces - test-muscle5 runs-on: ubuntu-latest @@ -334,12 +352,31 @@ jobs: password: ${{ secrets.PYPI_TOKEN }} + upload-ccd: + name: Upload CCD to GitHub Releases + permissions: + contents: write + needs: + - build-internal + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v4 + with: + name: ccd + path: dist + - name: Upload to GitHub Releases + uses: softprops/action-gh-release@v2.0.5 + if: github.event_name == 'release' && github.event.action == 'published' + with: + files: dist//components.bcif + + upload-docs: name: Upload documentation to GitHub Releases permissions: contents: write needs: - - make-docs + - docs runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v4 diff --git a/.gitignore b/.gitignore index 037c3f92e..0e248f8d2 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,9 @@ htmlcov # Ignore version file created by hatch-vcs /src/biotite/version.py +# Ignore internal CCD +/src/biotite/structure/info/components.bcif + # Ignore autogenerated documentation files /doc/static/switcher.json /doc/apidoc diff --git a/doc/apidoc.json b/doc/apidoc.json index b68573551..cce4cb16a 100644 --- a/doc/apidoc.json +++ b/doc/apidoc.json @@ -345,9 +345,29 @@ "base_pairs_from_dot_bracket" ] 
}, - + "biotite.structure.info" : { + "Residues and bonds": [ + "residue", + "bond_type", + "bonds_in_residue" + ], + "Component groups": [ + "amino_acid_names", + "nucleotide_names", + "carbohydrate_names" + ], + "Atom radii": [ + "vdw_radius_single", + "vdw_radius_protor" + ], + "Low-level CCD access": [ + "get_ccd", + "get_from_ccd", + "set_ccd_path" + ] + }, "biotite.structure.io.pdbx" : { - "High-level functionality": [ + "High-level functionality" : [ "get_sequence", "get_model_count", "get_structure", diff --git a/doc/install.rst b/doc/install.rst index 2dcb83028..3b51fc4ef 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -53,38 +53,35 @@ Installation via Conda Installation from source ------------------------ -You can also install Biotite from the +You can also install *Biotite* from the `project repository `_. +However, in addition to building and installing the package, the internal +`Chemical Component Dictionary (CCD) `_ for +:mod:`biotite.structure.info` needs to be built with the ``setup_ccd.py`` script. +The script in turn requires *Biotite*. +The solution to this chicken-and-egg problem is to first install *Biotite* without the +CCD, then build the CCD and finally install *Biotite* again. After cloning the repository, navigate to its top-level directory (the one ``setup.py`` is in) and type the following: .. code-block:: console $ pip install . + $ python setup_ccd.py + $ pip install . + +The ``setup_ccd.py`` script can also be used to update the internal CCD to the current +upstream version from the PDB. Having the *Biotite* installation always pointing to your repository clone is also possible. -Type the following in the top-level directory: +Substitute the installation commands above with the following command instead: .. code-block:: console $ pip install -e . 
- -Updating the Chemical Component Dictionary -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :mod:`biotite.structure.info` subpackage contains a subset from the -`PDB Chemical Component Dictionary (CCD) `_. -The repository ships a potentially outdated version of this subset. -To update this subset to the current upstream CCD version, run - -.. code-block:: console - $ python setup_ccd.py -Afterwards, install *Biotite* again. - - Common issues and solutions --------------------------- diff --git a/pyproject.toml b/pyproject.toml index 2935c5581..2f8f47181 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,7 @@ filterwarnings = [ "ignore:Input structure has no associated 'BondList'", ] + [tool.hatch.build.targets.sdist] exclude = [ "tests", @@ -112,6 +113,14 @@ exclude = [ # .github, .gitignore, .gitattributes ".git*", ] +artifacts = [ + "src/biotite/structure/info/components.bcif" +] + +[tool.hatch.build.targets.wheel] +artifacts = [ + "src/biotite/structure/info/components.bcif" +] [tool.hatch.version] source = "vcs" diff --git a/setup_ccd.py b/setup_ccd.py index 67ce1ed2d..8bcc67f8a 100644 --- a/setup_ccd.py +++ b/setup_ccd.py @@ -1,490 +1,192 @@ import gzip import logging -from dataclasses import dataclass +from collections import defaultdict from io import StringIO from pathlib import Path import numpy as np import requests from biotite.structure.io.pdbx import * - -class ComponentError(Exception): - pass +OUTPUT_CCD = ( + Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "components.bcif" +) +CCD_URL = "https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz" -@dataclass -class ColumnInfo: +def concatenate_ccd(categories=None): """ - Defines how to re-econde a column. + Create the CCD in BinaryCIF format, where each category contains the + data of all blocks. - Attributes + Parameters ---------- - dtype : dtype - The data type of the column. - fill_value : object - The value to fill masked values with. 
- encoding : list of Encoding - The encodings to apply to the data. - alternative : str, optional - The name of an alternative column to use, if the original column - contains masked values and no `fill_value` is given. - """ - - dtype: ... - encoding: ... - fill_value: ... = None - alternative: ... = None - - -MAIN_COLUMNS = { - "id": ColumnInfo( - "U5", - [ - StringArrayEncoding( - data_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding(), - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "name": ColumnInfo( - str, - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT32)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "type": ColumnInfo( - str, - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "formula_weight": ColumnInfo( - "f8", - [ - FixedPointEncoding(factor=1000, src_type=TypeCode.FLOAT64), - ByteArrayEncoding(), - ], - fill_value=0, - ), - "one_letter_code": ColumnInfo( - "U1", - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - 
fill_value="", - ), -} - + categories : list of str, optional + The names of the categories to include. + By default, all categories from the CCD are included. -ATOM_COLUMNS = { - "comp_id": ColumnInfo( - "U5", - [ - StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding(), - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "atom_id": ColumnInfo( - "U6", - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "alt_atom_id": ColumnInfo( - "U6", - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "type_symbol": ColumnInfo( - "U2", - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT8)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "charge": ColumnInfo("i1", [ByteArrayEncoding(type=TypeCode.INT8)], fill_value=0), - "pdbx_model_Cartn_x_ideal": ColumnInfo( - "f4", - [ - FixedPointEncoding(factor=100), - IntegerPackingEncoding(byte_count=2, 
is_unsigned=False), - ByteArrayEncoding(), - ], - alternative="model_Cartn_x", - ), - "pdbx_model_Cartn_y_ideal": ColumnInfo( - "f4", - [ - FixedPointEncoding(factor=100), - IntegerPackingEncoding(byte_count=2, is_unsigned=False), - ByteArrayEncoding(), - ], - alternative="model_Cartn_y", - ), - "pdbx_model_Cartn_z_ideal": ColumnInfo( - "f4", - [ - FixedPointEncoding(factor=100), - IntegerPackingEncoding(byte_count=2, is_unsigned=False), - ByteArrayEncoding(), - ], - alternative="model_Cartn_z", - ), -} + Returns + ------- + compressed_file : BinaryCIFFile + The compressed CCD in BinaryCIF format. + """ -BOND_COLUMNS = { - "comp_id": ColumnInfo( - "U5", - [ - StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding(), - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "atom_id_1": ColumnInfo( - "U6", - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "atom_id_2": ColumnInfo( - "U6", - [ - StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - ) - ], - ), - "value_order": ColumnInfo( - "U4", - [ - StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, 
is_unsigned=True), - ByteArrayEncoding(), - ], - offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)], - ) - ], - ), - "pdbx_aromatic_flag": ColumnInfo( - "U1", - [ - StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding(), - ], - offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)], - ) - ], - ), -} + logging.info("Download and read CCD...") + ccd_cif_text = gzip.decompress(requests.get(CCD_URL).content).decode() + ccd_file = CIFFile.read(StringIO(ccd_cif_text)) + compressed_block = BinaryCIFBlock() + if categories is None: + categories = _list_all_category_names(ccd_file) + for category_name in categories: + logging.info(f"Concatenate and compress '{category_name}' category...") + compressed_block[category_name] = compress( + _concatenate_blocks_into_category(ccd_file, category_name) + ) -CCD_URL = "https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz" + logging.info("Write concatenated CCD into BinaryCIF...") + compressed_file = BinaryCIFFile() + compressed_file["components"] = compressed_block + return compressed_file -def check_presence(pdbx_file, category_name, column_names): +def _concatenate_blocks_into_category(pdbx_file, category_name): """ - For each block in the file, check if each of the given column names - are present and unmasked. - Alternatively, all given column names may be masked/missing. - - This is used to ensure that coordinates are consistent: - If one dimension would be missing and another one would not, - the fallback of only one dimension would be used. - In consequence, the molecule coordinates would be distorted. + Concatenate the given category from all blocks into a single + category. Parameters ---------- pdbx_file : PDBxFile - The file to check. + The PDBx file, whose blocks should be concatenated. category_name : str - The name of the category to check. 
- column_names : list of str - The names of the columns to check. + The name of the category to concatenate. + + Returns + ------- + category : BinaryCIFCategory + The concatenated category. """ - for _, block in pdbx_file.items(): + columns_names = _list_all_column_names(pdbx_file, category_name) + data_chunks = defaultdict(list) + mask_chunks = defaultdict(list) + for block in pdbx_file.values(): if category_name not in block: continue category = block[category_name] - - is_present = column_names[0] in category - for name in column_names: - if (name in category) != is_present: - raise ComponentError("Only some column names are missing") - if not is_present: - return - - is_unmasked = category[column_names[0]].mask is None - for name in column_names: - if (category[name].mask is None) != is_unmasked: - raise ComponentError("Only some column names are masked") - - -def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): + for column_name in columns_names: + if column_name in category: + column = category[column_name] + data_chunks[column_name].append(column.data.array) + if column.mask is not None: + mask_chunks[column_name].append(column.mask.array) + else: + mask_chunks[column_name].append( + np.full(category.row_count, MaskValue.PRESENT, dtype=np.uint8) + ) + else: + # Column is missing in this block + # -> handle it as data masked as 'missing' + data_chunks[column_name].append( + # For now all arrays are of type string anyway, + # as they are read from a CIF file + np.full(category.row_count, "", dtype="U1") + ) + mask_chunks[column_name].append( + np.full(category.row_count, MaskValue.MISSING, dtype=np.uint8) + ) + + bcif_columns = {} + for col_name in columns_names: + data = np.concatenate(data_chunks[col_name]) + mask = np.concatenate(mask_chunks[col_name]) + data = _into_fitting_type(data, mask) + if np.all(mask == MaskValue.PRESENT): + mask = None + bcif_columns[col_name] = BinaryCIFColumn(data, mask) + return 
BinaryCIFCategory(bcif_columns) + + +def _list_all_column_names(pdbx_file, category_name): """ - Concatenate the given category from all blocks into a single - category. + Get all columns that exist in any block for a given category. Parameters ---------- pdbx_file : PDBxFile - The PDBx file, whose blocks should be concatenated. + The PDBx file to search in for the columns. category_name : str - The name of the category to concatenate. - column_infos : dict (str -> ColumnInfo) - Defines which columns of the category to keep and how to re-encode - them, where keys are the column names. + The name of the category to search in. Returns ------- - category : BinaryCIFCategory - The concatenated category. + columns_names : list of str + The names of the columns. """ - column_chunks = {col_name: [] for col_name in column_infos.keys()} - for comp_id, block in pdbx_file.items(): - try: - if category_name not in block: - raise ComponentError(f"Block has no category '{category_name}'") - chunk = {} - category = block[category_name] - for col_name, info in column_infos.items(): - col = category.get(col_name) - if col is None or (col.mask is not None and info.fill_value is None): - # Some/all values are missing and there is no default - # -> Try alternative - if info.alternative is not None: - col = category[info.alternative] - if col.mask is not None: - raise ComponentError( - f"Missing values in alternative " - f"'{info.alternative}'" - ) - else: - raise ComponentError(f"Missing values in column '{col_name}'") - data_array = col.as_array(info.dtype, info.fill_value) - chunk[col_name] = data_array - except ComponentError as e: - logging.warning(f"Skipping '{comp_id}': {e}") - # Append all columns in the chunk after the try-except block - # to avoid appending incomplete chunks - else: - for col_name, data_array in chunk.items(): - column_chunks[col_name].append(data_array) - return BinaryCIFCategory( - { - col_name: BinaryCIFData( - array=np.concatenate(col_data), 
encoding=column_infos[col_name].encoding - ) - for col_name, col_data in column_chunks.items() - } - ) + columns_names = set() + for block in pdbx_file.values(): + if category_name in block: + columns_names.update(block[category_name].keys()) + return sorted(columns_names) -def extract_component_groups(type_dict, include, exclude, file_name): +def _list_all_category_names(pdbx_file): """ - Extract component IDs that matches a given group from the given - dictionary. + Get all categories that exist in any block. Parameters ---------- - type_dict : dict - A dictionary that maps component IDs to their type. - include, exclude : list of str - The keywords to be matched. - file_name : Path - The path the output file to write the extracted component IDs - to. - """ - # Find components that matches the given keywords - comp_ids_for_group = [] - types_for_group = set() - for comp_id, comp_type in type_dict.items(): - if any(keyword in comp_type.lower() for keyword in exclude): - # 'xxx-like' components are not considered - # as they are not real 'xxx' - continue - if any(keyword in comp_type.lower() for keyword in include): - comp_ids_for_group.append(comp_id) - types_for_group.add(comp_type.lower()) - # Remove extracted components from dict - for comp_id in comp_ids_for_group: - del type_dict[comp_id] - # Write extracted components into output file - logging.info( - f"Using the following types for '{file_name.name}':\n" - + ", ".join(types_for_group) - ) - with open(file_name, "w") as file: - for comp_id in comp_ids_for_group: - file.write(comp_id + "\n") - - -def setup_ccd(target_diriectory): - logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s") + pdbx_file : PDBxFile + The PDBx file to search in for the categories. - target_diriectory.mkdir(parents=True, exist_ok=True) + Returns + ------- + category_names : list of str + The names of the categories. 
+ """ + category_names = set() + for block in pdbx_file.values(): + category_names.update(block.keys()) + return sorted(category_names) - logging.info("Downloading and reading CCD...") - ccd_cif_text = gzip.decompress(requests.get(CCD_URL).content).decode() - ccd_file = CIFFile.read(StringIO(ccd_cif_text)) - logging.info("Checking for consistent coordinates...") - check_presence( - ccd_file, "chem_comp_atom", ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"] - ) - check_presence( - ccd_file, - "chem_comp_atom", - ["model_Cartn_x_ideal", "model_Cartn_y_ideal", "model_Cartn_z_ideal"], - ) +def _into_fitting_type(string_array, mask): + """ + Try to find a numeric type for a string ndarray, if possible. - logging.info("Extracting component groups...") - type_dict = { - comp_id: block["chem_comp"]["type"].as_item() - for comp_id, block in ccd_file.items() - } - extract_component_groups( - type_dict, - ["peptide", "amino"], - ["peptide-like"], - target_diriectory / "amino_acids.txt", - ) - extract_component_groups( - type_dict, ["rna", "dna"], [], target_diriectory / "nucleotides.txt" - ) - extract_component_groups( - type_dict, ["saccharide"], [], target_diriectory / "carbohydrates.txt" - ) - remaining_types = set(type_dict.values()) - logging.info( - "The following types are not used in any group:\n" + ", ".join(remaining_types) - ) + Parameters + ---------- + string_array : ndarray, dtype=string + The array to convert. + mask : ndarray, dtype=uint8 + Only values in `string_array` where the mask is ``MaskValue.PRESENT`` are + considered for type conversion. 
- compressed_block = BinaryCIFBlock() - for category_name, column_infos in [ - ("chem_comp", MAIN_COLUMNS), - ("chem_comp_atom", ATOM_COLUMNS), - ("chem_comp_bond", BOND_COLUMNS), - ]: - logging.info(f"Concatenate '{category_name}' category...") - compressed_block[category_name] = concatenate_blocks_into_category( - ccd_file, category_name, column_infos - ) + Returns + ------- + array : ndarray + The array converted into an appropriate dtype. + """ + mask = mask == MaskValue.PRESENT + # Only try to find an appropriate dtype for unmasked values + values = string_array[mask] + try: + # Try to fit into integer type + values = values.astype(int) + except ValueError: + try: + # Try to fit into float type + values = values.astype(float) + except ValueError: + # Keep string type + pass + array = np.zeros(string_array.shape, dtype=values.dtype) + array[mask] = values + return array - logging.info("Write concatenated CCD into BinaryCIF...") - compressed_file = BinaryCIFFile() - compressed_file["components"] = compressed_block - compressed_file.write(target_diriectory / "components.bcif") +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s") + OUTPUT_CCD.parent.mkdir(parents=True, exist_ok=True) -setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd") + compressed_ccd = concatenate_ccd(["chem_comp", "chem_comp_atom", "chem_comp_bond"]) + compressed_ccd.write(OUTPUT_CCD) diff --git a/src/biotite/structure/info/__init__.py b/src/biotite/structure/info/__init__.py index 3c7078ff7..5e2961ed6 100644 --- a/src/biotite/structure/info/__init__.py +++ b/src/biotite/structure/info/__init__.py @@ -16,6 +16,7 @@ from .atoms import * from .bonds import * +from .ccd import * from .groups import * from .masses import * from .misc import * diff --git a/src/biotite/structure/info/atoms.py b/src/biotite/structure/info/atoms.py index 349fb40e4..a404b39bc 100644 --- a/src/biotite/structure/info/atoms.py +++ 
b/src/biotite/structure/info/atoms.py @@ -42,19 +42,19 @@ def residue(res_name): >>> alanine = residue("ALA") >>> # Atoms and geometry >>> print(alanine) - 0 ALA N N -0.970 0.490 1.500 - 0 ALA CA C 0.260 0.420 0.690 - 0 ALA C C -0.090 0.020 -0.720 - 0 ALA O O -1.060 -0.680 -0.920 - 0 ALA CB C 1.200 -0.620 1.300 - 0 ALA OXT O 0.660 0.440 -1.740 - 0 ALA H H -1.380 -0.420 1.480 - 0 ALA H2 H -0.680 0.660 2.450 - 0 ALA HA H 0.750 1.390 0.680 - 0 ALA HB1 H 1.460 -0.330 2.320 - 0 ALA HB2 H 0.720 -1.590 1.310 - 0 ALA HB3 H 2.110 -0.680 0.700 - 0 ALA HXT H 0.440 0.180 -2.650 + 0 ALA N N -0.966 0.493 1.500 + 0 ALA CA C 0.257 0.418 0.692 + 0 ALA C C -0.094 0.017 -0.716 + 0 ALA O O -1.056 -0.682 -0.923 + 0 ALA CB C 1.204 -0.620 1.296 + 0 ALA OXT O 0.661 0.439 -1.742 + 0 ALA H H -1.383 -0.425 1.482 + 0 ALA H2 H -0.676 0.661 2.452 + 0 ALA HA H 0.746 1.392 0.682 + 0 ALA HB1 H 1.459 -0.330 2.316 + 0 ALA HB2 H 0.715 -1.594 1.307 + 0 ALA HB3 H 2.113 -0.676 0.697 + 0 ALA HXT H 0.435 0.182 -2.647 >>> # Bonds >>> print(alanine.atom_name[alanine.bonds.as_array()[:,:2]]) [['N' 'CA'] diff --git a/src/biotite/structure/info/bonds.py b/src/biotite/structure/info/bonds.py index fb851d294..101ed115d 100644 --- a/src/biotite/structure/info/bonds.py +++ b/src/biotite/structure/info/bonds.py @@ -6,6 +6,7 @@ __author__ = "Patrick Kunzmann" __all__ = ["bond_type", "bonds_in_residue"] +import functools from biotite.structure.bonds import BondType from biotite.structure.info.ccd import get_from_ccd @@ -69,6 +70,7 @@ def bond_type(res_name, atom_name1, atom_name2): return None +@functools.cache def bonds_in_residue(res_name): """ Get a dictionary containing all atoms inside a given residue @@ -94,6 +96,10 @@ def bonds_in_residue(res_name): In other functionalities throughout *Biotite* that uses this function. + Notes + ----- + The returned values are cached for faster access in subsequent calls. 
+ Examples -------- >>> bonds = bonds_in_residue("PHE") @@ -126,16 +132,16 @@ def bonds_in_residue(res_name): """ global _intra_bonds if res_name not in _intra_bonds: - chem_comp_bond_dict = get_from_ccd("chem_comp_bond", res_name) - if chem_comp_bond_dict is None: + chem_comp_bond = get_from_ccd("chem_comp_bond", res_name) + if chem_comp_bond is None: _intra_bonds[res_name] = {} else: bonds_for_residue = {} for atom1, atom2, order, aromatic_flag in zip( - chem_comp_bond_dict["atom_id_1"], - chem_comp_bond_dict["atom_id_2"], - chem_comp_bond_dict["value_order"], - chem_comp_bond_dict["pdbx_aromatic_flag"], + chem_comp_bond["atom_id_1"].as_array(), + chem_comp_bond["atom_id_2"].as_array(), + chem_comp_bond["value_order"].as_array(), + chem_comp_bond["pdbx_aromatic_flag"].as_array(), ): bond_type = BOND_TYPES[order, aromatic_flag] bonds_for_residue[atom1.item(), atom2.item()] = bond_type diff --git a/src/biotite/structure/info/ccd.py b/src/biotite/structure/info/ccd.py index e650b2f09..adcbc92f0 100644 --- a/src/biotite/structure/info/ccd.py +++ b/src/biotite/structure/info/ccd.py @@ -4,24 +4,23 @@ __name__ = "biotite.structure.info" __author__ = "Patrick Kunzmann" -__all__ = ["get_ccd", "get_from_ccd"] +__all__ = ["get_ccd", "set_ccd_path", "get_from_ccd"] import functools +import importlib +import inspect +import pkgutil from pathlib import Path import numpy as np -CCD_DIR = Path(__file__).parent / "ccd" -INDEX_COLUMN_NAME = { +_CCD_FILE = Path(__file__).parent / "components.bcif" +_SPECIAL_ID_COLUMN_NAMES = { "chem_comp": "id", - "chem_comp_atom": "comp_id", - "chem_comp_bond": "comp_id", } - -_ccd_block = None -# For each category this index gives the start and stop for each residue -_residue_index = {} +_DEFAULT_ID_COLUMN_NAME = "comp_id" +@functools.cache def get_ccd(): """ Get the internal subset of the PDB @@ -30,8 +29,16 @@ def get_ccd(): Returns ------- - ccd : BinaryCIFFile + ccd : BinaryCIFBlock The CCD. 
+ It contains the categories `chem_comp`, `chem_comp_atom` and `chem_comp_bond`. + + Warnings + -------- + + Consider the return value as read-only. + As other functions cache data from it, changing data may lead to undefined + behavior. References ---------- @@ -42,11 +49,46 @@ def get_ccd(): # Avoid circular import from biotite.structure.io.pdbx.bcif import BinaryCIFFile - global _ccd_block - if _ccd_block is None: - # Load CCD once and cache it for subsequent calls - _ccd_block = BinaryCIFFile.read(CCD_DIR / "components.bcif").block - return _ccd_block + try: + return BinaryCIFFile.read(_CCD_FILE).block + except FileNotFoundError: + raise RuntimeError( + "Internal CCD not found. Please run 'setup_ccd.py' and reinstall Biotite." + ) + + +def set_ccd_path(ccd_path): + """ + Replace the internal *Chemical Component Dictionary* (CCD) with a custom one. + + This function also clears the cache of functions depending on the CCD to ensure + that the new CCD is used. + + Parameters + ---------- + ccd_path : path-like + The path to the custom CCD in BinaryCIF format, prepared with the + ``setup_ccd.py`` script. + + Notes + ----- + This function is intended for advanced users who need to add information for + compounds, which are not part of the internal CCD. + The reason might be that an updated version already exists upstream or that + the user wants to add custom compounds to the CCD. 
+ """ + global _CCD_FILE + _CCD_FILE = Path(ccd_path) + + # Clear caches in all functions in biotite.structure.info + info_modules = [ + importlib.import_module(f"biotite.structure.info.{mod_name}") + for _, mod_name, _ in pkgutil.iter_modules([str(Path(__file__).parent)]) + ] + for module in info_modules: + for _, function in inspect.getmembers(module, callable): + if hasattr(function, "cache_clear"): + function.cache_clear() @functools.cache @@ -69,9 +111,9 @@ def get_from_ccd(category_name, comp_id, column_name=None): Returns ------- - value : ndarray or dict or None - The array of the given column or all columns as dictionary. - ``None`` if the `comp_id` is not found in the category. + slice : BinaryCIFCategory or BinaryCIFColumn + The category or column (if `column_name` is provided) containing only the rows + for the given residue. Notes ----- @@ -83,28 +125,41 @@ def get_from_ccd(category_name, comp_id, column_name=None): .. footbibliography:: """ - global _residue_index - ccd = get_ccd() - category = ccd[category_name] - if category_name not in _residue_index: - _residue_index[category_name] = _index_residues( - category[INDEX_COLUMN_NAME[category_name]].as_array() - ) try: - start, stop = _residue_index[category_name][comp_id] + start, stop = _residue_index(category_name)[comp_id] except KeyError: return None + category = get_ccd()[category_name] if column_name is None: - return { - col_name: category[col_name].as_array()[start:stop] - for col_name in category.keys() - } + return _filter_category(category, slice(start, stop)) else: - return category[column_name].as_array()[start:stop] + return _filter_column(category[column_name], slice(start, stop)) -def _index_residues(id_column): +@functools.cache +def _residue_index(category_name): + """ + Get the start and stop index for each component name in the given + CCD category. + + Parameters + ---------- + category_name : str + The category to determine start and stop indices for each component in. 
+ + Returns + ------- + index : dict (str -> (int, int)) + The index maps each present component name to the corresponding + start and exclusive stop index in `id_column`. + """ + category = get_ccd()[category_name] + id_column_name = _SPECIAL_ID_COLUMN_NAMES.get( + category_name, _DEFAULT_ID_COLUMN_NAME + ) + id_column = category[id_column_name].as_array() + residue_starts = np.where(id_column[:-1] != id_column[1:])[0] + 1 # The final start is the exclusive stop of last residue residue_starts = np.concatenate(([0], residue_starts, [len(id_column)])) @@ -113,3 +168,35 @@ def _index_residues(id_column): comp_id = id_column[residue_starts[i]].item() index[comp_id] = (residue_starts[i], residue_starts[i + 1]) return index + + +def _filter_category(category, index): + """ + Reduce the category to the values for the given index. + """ + # Avoid circular import + from biotite.structure.io.pdbx.bcif import BinaryCIFCategory + + return BinaryCIFCategory( + {key: _filter_column(column, index) for key, column in category.items()} + ) + + +def _filter_column(column, index): + """ + Reduce the column to the values for the given index. + """ + # Avoid circular import + from biotite.structure.io.pdbx.bcif import BinaryCIFColumn, BinaryCIFData + from biotite.structure.io.pdbx.component import MaskValue + + data_array = column.data.array[index] + mask_array = column.mask.array[index] if column.mask is not None else None + return BinaryCIFColumn( + BinaryCIFData(data_array), + ( + BinaryCIFData(mask_array) + if column.mask is not None and (mask_array != MaskValue.PRESENT).any() + else None + ), + ) diff --git a/src/biotite/structure/info/ccd/README.rst b/src/biotite/structure/info/ccd/README.rst deleted file mode 100644 index 0ea73a479..000000000 --- a/src/biotite/structure/info/ccd/README.rst +++ /dev/null @@ -1,8 +0,0 @@ -These files are based on the -`Chemical Component Dictionary `_ -and were created using ``setup_ccd.py``. 
- -To keep the size of the repository small, the original commit should be -rewritten, if the formats of the affected files are compatible with the -original ones. -The name of the commit is ``Add CCD dataset``. \ No newline at end of file diff --git a/src/biotite/structure/info/ccd/amino_acids.txt b/src/biotite/structure/info/ccd/amino_acids.txt deleted file mode 100644 index 84fbd2764..000000000 --- a/src/biotite/structure/info/ccd/amino_acids.txt +++ /dev/null @@ -1,1663 +0,0 @@ -004 -00B -00C -00E -00O -01N -01W -02A -02K -02L -02O -02V -02Y -037 -03E -03Y -04Q -04R -04U -04V -04X -05N -05O -060 -07O -08M -08P -0A0 -0A1 -0A2 -0A8 -0A9 -0AA -0AB -0AC -0AF -0AH -0AK -0AR -0AZ -0BN -0CS -0E5 -0EA -0EH -0FL -0G5 -0GG -0LF -0MU -0NC -0PR -0QL -0QZ -0RJ -0TD -0TH -0UZ -0W6 -0WZ -0X9 -0XL -0XO -0XQ -0Y8 -0Y9 -0YG -11Q -11W -12L -12X -12Y -13E -143 -175 -192 -193 -19W -1AC -1C3 -1E3 -1G2 -1G3 -1G8 -1IC -1IP -1JM -1L1 -1ME -1MH -1OP -1PA -1PI -1QI -1TQ -1TX -1TY -1U8 -1VR -1X6 -1XW -200 -22G -23F -23P -23S -26B -28J -28X -2AD -2AG -2AO -2AS -2CO -2DO -2FM -2GX -2HF -2JC -2JF -2JG -2JH -2JJ -2JN -2KK -2KP -2KY -2KZ -2L5 -2L6 -2L9 -2LT -2LU -2ML -2MR -2MT -2OR -2P0 -2PI -2QD -2QY -2QZ -2R1 -2R3 -2RA -2RX -2SO -2TL -2TY -2VA -2XA -2YC -2YF -2YG -2YH -2YJ -2ZC -30F -30V -31Q -32L -32S -32T -33S -33W -33X -34E -35Y -3A5 -3AH -3AR -3BY -3CF -3CT -3EG -3FG -3GA -3GL -3K4 -3MD -3MM -3MY -3NF -3O3 -3PM -3PX -3QN -3TY -3U0 -3WS -3X9 -3XH -3YM -3ZH -3ZL -41H -41Q -432 -45W -4AF -4AK -4AR -4AW -4BF -4CF -4CG -4CY -4D4 -4DB -4DP -4F3 -4FB -4FO -4FW -4GJ -4HH -4HJ -4HL -4HT -4II -4IK -4IN -4J2 -4J4 -4J5 -4KY -4L0 -4L8 -4LZ -4M8 -4M9 -4MM -4N3 -4N7 -4N8 -4N9 -4NT -4NU -4OG -4OP -4OU -4OV -4PH -4PQ -4QK -4SJ -4U7 -4UD -4WQ -51T -54C -55I -55N -562 -56A -56C -5A6 -5AB -5CR -5CS -5CT -5CW -5DW -5F0 -5FQ -5GG -5GM -5HP -5LB -5LE -5LF -5MW -5OH -5OL -5OM -5OW -5PG -5R5 -5SQ -5T3 -5VV -5X8 -5XU -5ZA -60F -62H -65T -66C -66D -66E -6BR -6CL -6CV -6CW -6DU -6E4 -6FL -6G4 -6GL -6HN -6KM -6KY 
-6M6 -6RK -6V1 -6V9 -6WK -6Y9 -6YJ -6ZS -73C -73N -73O -73P -74P -7C9 -7CC -7HA -7ID -7J3 -7J4 -7JA -7MN -7N8 -7O5 -7OZ -7QK -7R0 -7R6 -7RX -7T2 -7TK -7VN -7VU -7W2 -7WC -7XC -7YO -81R -81S -823 -85F -85G -85J -85L -86N -8JB -8LJ -8RE -8SP -8WY -8YR -999 -99Y -9AT -9BP -9DN -9DS -9E7 -9EV -9JC -9JV -9KK -9KP -9MN -9NE -9NF -9NR -9NV -9OW -9R1 -9R4 -9R7 -9TR -9TU -9TX -9U0 -9U6 -9U9 -9UC -9UF -9V0 -9V6 -9VC -9VF -9VL -9VR -9WV -A0G -A1ADO -A1ADW -A1ADY -A1ADZ -A1D64 -A1H2H -A1H2I -A1H45 -A1LWV -A30 -A3U -A5N -A66 -A8E -A9D -AA3 -AA4 -AA6 -AAR -AB7 -ABA -AC5 -ACA -ACB -ACL -ADD -AE5 -AEA -AEI -AFA -AGD -AGM -AGQ -AGT -AHB -AHH -AHO -AHP -AIB -AJE -AKL -ALA -ALC -ALM -ALN -ALO -ALS -ALT -ALY -AME -AN6 -AN8 -APH -API -APK -APM -APO -APP -AR2 -AR4 -ARF -ARG -ARM -ARO -ARV -AS2 -AS7 -AS9 -ASA -ASB -ASI -ASK -ASL -ASM -ASN -ASP -ASQ -ASX -AVN -AYA -AYG -AZH -AZK -AZS -AZY -B1F -B27 -B2C -B2H -B2N -B3A -B3D -B3E -B3K -B3L -B3M -B3Q -B3S -B3T -B3U -B3X -B3Y -B8R -BB6 -BB7 -BB8 -BB9 -BBC -BCS -BCX -BE2 -BF6 -BF7 -BF9 -BFD -BG1 -BH2 -BHD -BIF -BIL -BIU -BJH -BJO -BL2 -BMT -BNN -BP5 -BPE -BSE -BTA -BTC -BTK -BTR -BUC -BUG -BW5 -BWB -BWV -BXT -BYR -BZK -C0O -C12 -C1T -C1X -C22 -C2N -C3Y -C4R -C5C -C66 -C6C -C99 -CAB -CAF -CAS -CAY -CCL -CCS -CCY -CDE -CDV -CE7 -CEA -CFY -CG6 -CGA -CGH -CGU -CGV -CH6 -CH7 -CHG -CHP -CIR -CIV -CJO -CLB -CLD -CLE -CLG -CLH -CLV -CME -CMH -CML -CMT -CNG -CPC -CPI -CQ1 -CQ2 -CQR -CR0 -CR2 -CR5 -CR7 -CR8 -CRF -CRG -CRK -CRO -CRQ -CRU -CRW -CRX -CS0 -CS1 -CS3 -CS4 -CSA -CSB -CSD -CSE -CSH -CSJ -CSK -CSO -CSP -CSR -CSS -CSU -CSW -CSX -CSY -CSZ -CTE -CTH -CUC -CUD -CWD -CWR -CXM -CY0 -CY1 -CY3 -CY4 -CYA -CYD -CYF -CYG -CYJ -CYM -CYQ -CYR -CYS -CYW -CZ2 -CZO -CZS -CZZ -D0C -D0Q -D11 -D2T -D3P -D4P -DA2 -DAB -DAH -DAL -DAM -DAR -DAS -DBB -DBS -DBU -DBY -DBZ -DC2 -DCY -DDE -DDZ -DFF -DFI -DFO -DGH -DGL -DGN -DHA -DHI -DHL -DHN -DHP -DHV -DI7 -DI8 -DIL -DIR -DIV -DJD -DLE -DLS -DLY -DM0 -DMH -DMK -DMT -DNE -DNG -DNL -DNM -DNP -DNS -DNW -DO2 -DOA -DOH -DON 
-DPL -DPN -DPP -DPQ -DPR -DSE -DSG -DSN -DSP -DTH -DTR -DTY -DV7 -DV9 -DVA -DYA -DYG -DYJ -DYL -DYS -E03 -E0Y -E95 -E9C -E9M -E9V -ECC -ECX -EEP -EFC -EHP -EI4 -EJA -EJM -ELY -EME -EO2 -EOE -ERL -ESB -ESC -ETA -EU0 -EUP -EW6 -EXA -EXL -EXY -EYG -EYS -F0G -F2F -F2Y -F3M -F3T -F6N -F75 -F7P -F7Q -F7S -F7V -F7W -F93 -F9D -FAK -FB5 -FB6 -FC0 -FCL -FDL -FF9 -FFM -FGA -FGL -FGP -FH7 -FHE -FHL -FHO -FIO -FL6 -FLA -FLE -FLT -FME -FOD -FOE -FP9 -FPK -FPR -FQA -FRD -FT6 -FTR -FTY -FVA -FX9 -FXC -FXF -FXL -FY2 -FY3 -FZN -G01 -G1X -G3M -G5G -G8M -G8X -GAU -GCM -GEE -GFT -GGB -GGL -GHC -GHG -GHP -GHW -GL3 -GLH -GLJ -GLM -GLN -GLQ -GLU -GLX -GLY -GLZ -GMA -GME -GMO -GNC -GND -GPL -GQI -GSC -GSU -GT9 -GVL -GYC -GYS -GZB -GZJ -H14 -H5M -H7V -HAC -HAR -HBN -HCL -HCM -HCS -HFA -HG7 -HGL -HGM -HGY -HHI -HHK -HIA -HIC -HIP -HIQ -HIS -HIX -HJH -HJV -HJY -HL2 -HL5 -HLU -HLX -HLY -HM8 -HM9 -HMF -HMR -HNC -HOO -HOX -HP9 -HPC -HPE -HPQ -HQA -HR7 -HRG -HRP -HS8 -HS9 -HSE -HSK -HSL -HSO -HSV -HT7 -HTI -HTN -HTR -HTY -HV5 -HVA -HY3 -HYP -HZP -I1C -I2F -I2M -I3L -I4G -I4O -I58 -I7F -IAE -IAM -IAR -IAS -IB9 -IC0 -ICY -IEL -IEY -IGL -II7 -IIC -IIL -ILE -ILG -ILM -ILX -IML -IO8 -IOR -IOY -IPG -IT1 -IYR -IYT -IZO -J2F -J3D -J7H -J8W -J9A -J9Y -JBY -JJJ -JJK -JJL -JKC -JKH -JLP -JMX -K1R -K5H -K5L -K7K -KBE -KBS -KBV -KCJ -KCR -KCX -KCY -KEO -KFP -KGC -KHB -KJW -KKD -KNB -KOR -KPF -KPI -KPY -KR3 -KSB -KST -KWS -KXV -KY4 -KY7 -KYN -KYQ -KZ1 -KZ4 -KZ7 -KZG -KZV -KZY -L2A -L3O -L4R -L5P -LA2 -LAA -LAL -LAY -LBY -LBZ -LCK -LCX -LDH -LE1 -LED -LEF -LEH -LEI -LEM -LEN -LET -LEU -LGY -LHC -LKE -LLO -LLP -LLY -LLZ -LME -LMF -LMQ -LNE -LNM -LOU -LP6 -LPD -LPG -LPH -LPL -LPS -LRK -LSO -LT0 -LTA -LTR -LTU -LV8 -LVG -LVN -LWI -LWY -LYF -LYH -LYM -LYN -LYO -LYR -LYS -LYU -LYV -LYX -LYZ -M0H -M2L -M2S -M30 -M3L -M3R -M3V -M64 -M6J -MA -MAA -MAI -MBQ -MC1 -MCG -MCL -MCS -MD0 -MD3 -MD5 -MD6 -MDF -MDH -MDO -ME0 -MEA -MED -MEG -MEN -MEQ -MET -MEU -MF3 -MFC -MFH -MFV -MGG -MGN -MGY -MH1 -MH6 -MH8 -MHE -MHL -MHO -MHS 
-MHU -MHV -MHW -MHY -MIR -MIS -MJ1 -MK8 -MKD -MKF -ML3 -MLE -MLL -MLU -MLY -MLZ -MME -MMO -MND -MNL -MNV -MOD -MOZ -MP4 -MP8 -MPH -MPJ -MPQ -MSA -MSE -MSL -MSO -MSP -MT2 -MTY -MV9 -MVA -MYK -MYN -N0A -N10 -N2C -N65 -N7P -N80 -N8P -N9P -NA8 -NAL -NAM -NB8 -NBQ -NC1 -NCB -NCY -NDF -NEM -NEP -NFA -NHL -NIY -NKS -NLB -NLE -NLN -NLO -NLP -NLQ -NLW -NLY -NMC -NMM -NNH -NOT -NPH -NPI -NRG -NRP -NRQ -NSK -NTR -NTY -NVA -NWD -NYB -NYC -NYG -NYS -NZC -NZH -O12 -O2E -O6H -O7A -O7D -O7G -OAS -OBF -OBS -OCS -OCY -OEM -OFM -OGC -OGU -OGZ -OHD -OHI -OHS -OIB -OIC -OIM -OJY -OLD -OLE -OLT -OLZ -OMH -OMT -OMX -OMY -OMZ -ONH -ONL -ORD -ORN -ORQ -OSE -OTB -OTH -OTY -OTZ -OV7 -OWF -OXX -OYL -OZ3 -OZT -OZW -P0A -P1L -P2Q -P2Y -P3Q -P4E -P4F -P9S -PAQ -PAS -PAT -PBB -PBF -PCA -PCC -PCE -PCS -PDD -PDL -PDW -PE1 -PEC -PF5 -PFF -PG1 -PG9 -PGY -PH6 -PH8 -PHA -PHD -PHE -PHI -PHL -PIA -PIV -PJ3 -PLJ -PM3 -PNZ -POK -POM -PPN -PQ4 -PQG -PR3 -PR4 -PR7 -PR9 -PRJ -PRK -PRO -PRQ -PRR -PRS -PRV -PSH -PSW -PTH -PTM -PTR -PVH -PVL -PVO -PXU -PYA -PYH -PYL -PYX -Q2E -Q2K -Q3P -Q3S -Q75 -Q78 -Q8X -QAC -QC4 -QCA -QCD -QCI -QCS -QDS -QFG -QIL -QIP -QLG -QM8 -QMB -QMM -QNQ -QNT -QNW -QNY -QO2 -QO5 -QO8 -QPA -QPH -QQ8 -QQB -QUK -QVA -QX7 -QXV -QYG -QYX -QZA -R0E -R0K -R1A -R2P -R2T -R4K -R6E -RC7 -RE0 -RE3 -RF9 -RGL -RGP -RON -RPI -RT0 -RVJ -RVX -RX9 -RXL -RZ4 -S0R -S12 -S1H -S2C -S2D -S2P -SAC -SAH -SAO -SAR -SBD -SBL -SCH -SCS -SCY -SD2 -SD4 -SDP -SE7 -SEB -SEC -SEE -SEG -SEL -SEM -SEN -SEP -SER -SET -SFE -SGB -SHC -SHP -SHR -SIB -SIC -SKG -SKH -SKJ -SLL -SLR -SLZ -SMC -SME -SMF -SNC -SNK -SNM -SNN -SOC -SOY -SRZ -STY -SUB -SUI -SUN -SVA -SVV -SVW -SVX -SVY -SVZ -SWG -SWW -SXE -SYS -SZF -T09 -T0I -T11 -T3R -T66 -T79 -T7Q -T8L -T9E -TA4 -TAV -TBG -TBM -TCQ -TCR -TDD -TDF -TEF -TFQ -TFR -TGH -TH5 -TH6 -THC -THO -THR -THZ -TIH -TIS -TJI -TLY -TMB -TMD -TNB -TNQ -TNR -TNY -TOQ -TOX -TOZ -TPH -TPJ -TPK -TPL -TPO -TPQ -TQI -TQQ -TQZ -TRF -TRG -TRN -TRO -TRP -TRQ -TRW -TRX -TRY -TS9 -TSQ -TST -TSY -TTQ -TTS -TXY 
-TY1 -TY2 -TY3 -TY5 -TY8 -TY9 -TYB -TYC -TYE -TYI -TYJ -TYN -TYO -TYQ -TYR -TYS -TYT -TYX -TYY -TZB -TZO -U2M -U2X -U3X -U6A -UAL -UB4 -UDS -UF0 -UGY -UIA -UJR -UKD -UKY -UM1 -UM2 -UMA -UN1 -UN2 -UNK -UOX -URV -UU4 -UU5 -UX8 -UXQ -UXY -UY7 -UYA -UZ4 -UZA -UZN -V1C -V1V -V3C -V44 -V4F -V53 -V5F -V5N -V61 -V6W -VAD -VAF -VAH -VAL -VB1 -VDE -VDK -VDL -VEF -VH0 -VHF -VI3 -VLL -VLM -VMS -VNW -VOL -VPV -VR0 -VUB -VVK -VYA -W4T -WCM -WCR -WFP -WLU -WPA -WRP -WVL -WYK -WZJ -X1B -X2W -X5H -X5P -X5V -X60 -X6E -X9Q -XA6 -XC0 -XCN -XDT -XOK -XPL -XPR -XSN -XW1 -XX1 -XXA -XXY -XYC -XYG -Y1V -Y28 -Y57 -YCM -YCP -YHA -YNM -YOF -YPR -YPZ -YRV -YTF -YTH -YWV -YYA -Z01 -Z3E -Z50 -Z70 -Z9J -ZAE -ZAI -ZAL -ZBZ -ZCL -ZDJ -ZFB -ZGL -ZIQ -ZJU -ZKO -ZLF -ZNY -ZRJ -ZSX -ZT6 -ZT9 -ZTC -ZTG -ZTK -ZU0 -ZUK -ZV4 -ZY9 -ZYJ -ZYK -ZZD -ZZJ -ZZU diff --git a/src/biotite/structure/info/ccd/carbohydrates.txt b/src/biotite/structure/info/ccd/carbohydrates.txt deleted file mode 100644 index 9c109072e..000000000 --- a/src/biotite/structure/info/ccd/carbohydrates.txt +++ /dev/null @@ -1,1135 +0,0 @@ -045 -05L -07E -07Y -08U -09X -0AT -0BD -0H0 -0HX -0LP -0MK -0NZ -0TS -0UB -0V4 -0WK -0XY -0YT -10M -12E -145 -147 -149 -14T -15L -16F -16G -16O -17T -18D -18O -18T -1AR -1BW -1CF -1FT -1GL -1GN -1JB -1LL -1NA -1S3 -1S4 -1SD -1X4 -20S -20X -22O -22S -23V -24S -25E -26M -26O -26Q -26R -26V -26W -26Y -27C -289 -291 -293 -2DG -2DR -2F8 -2FG -2FL -2FP -2GL -2GS -2H5 -2HA -2M4 -2M5 -2M8 -2OS -2SI -2WP -2WS -32O -34V -38J -3BU -3CM -3DO -3DY -3FM -3GR -3HD -3J3 -3J4 -3LJ -3LR -3MF -3MG -3MK -3R3 -3S6 -3SA -3YW -40J -42D -445 -44S -46D -46M -46Z -475 -48Z -491 -49A -49S -49T -49V -4AM -4CQ -4GC -4GL -4GP -4JA -4N2 -4NN -4QY -4R1 -4RS -4SG -4U0 -4U1 -4U2 -4UZ -4V5 -50A -510 -51N -56N -57S -5DI -5GF -5GO -5II -5KQ -5KS -5KT -5KV -5L2 -5L3 -5LS -5LT -5MM -5N6 -5QP -5RP -5SA -5SP -5TH -5TJ -5TK -5TM -604 -61J -62I -64K -66O -6BG -6C2 -6DM -6GB -6GP -6GR -6K3 -6KH -6KL -6KS -6KU -6KW -6LA -6LS -6LW -6MJ -6MN -6PG -6PY 
-6PZ -6S2 -6SA -6UD -6Y6 -6YR -6ZC -73E -79J -7CV -7D1 -7GP -7JZ -7K2 -7K3 -7NU -7SA -83Y -89Y -8B7 -8B9 -8EX -8GA -8GG -8GP -8I4 -8LM -8LR -8OQ -8PK -8S0 -8YV -95Z -96O -98U -9AM -9C1 -9CD -9GP -9KJ -9MR -9OK -9PG -9QG -9QZ -9RN -9S7 -9SG -9SJ -9SM -9SP -9T1 -9T7 -9VP -9WJ -9WN -9WZ -9YW -A0K -A1AIO -A1H0Z -A1Q -A2G -A5C -A6P -AAL -AAO -ABC -ABD -ABE -ABF -ABL -AC1 -ACG -ACR -ACX -ADA -ADG -ADR -AF1 -AFD -AFL -AFO -AFP -AFR -AGC -AGH -AGL -AGR -AH2 -AH8 -AHG -AHM -AHR -AIG -ALL -ALX -AMG -AMN -AMU -AMV -ANA -AOG -AOS -AQA -ARA -ARB -ARE -ARI -ARW -ASC -ASG -ASO -AXP -AXR -AY9 -AZC -B0D -B16 -B1H -B1N -B2G -B4G -B6D -B7G -B8D -B9D -BBK -BBV -BCD -BCW -BDF -BDG -BDP -BDR -BDZ -BEM -BFN -BFP -BG6 -BG8 -BGC -BGL -BGN -BGP -BGS -BHG -BM3 -BM7 -BMA -BMX -BND -BNG -BNX -BO1 -BOG -BQY -BRI -BS7 -BTG -BTU -BW3 -BWG -BXF -BXP -BXX -BXY -BZD -C3B -C3G -C3X -C4B -C4W -C4X -C5X -CAP -CBF -CBI -CBK -CDR -CE5 -CE6 -CE8 -CEG -CEX -CEY -CEZ -CGF -CJB -CKB -CKP -CNP -CR1 -CR6 -CRA -CT3 -CTO -CTR -CTT -D0N -D1M -D5E -D6G -DAF -DAG -DAN -DDA -DDB -DDL -DEG -DEL -DFR -DFX -DG0 -DGC -DGD -DGM -DGO -DGS -DGU -DIG -DJB -DJE -DK4 -DKX -DKZ -DL6 -DLD -DLF -DLG -DMU -DNO -DO8 -DOM -DP5 -DPC -DQQ -DQR -DR2 -DR3 -DR4 -DR5 -DRI -DSR -DT6 -DVC -DYM -E3M -E4P -E5G -EAG -EBG -EBQ -EEN -EEQ -EGA -EJT -EMP -EMZ -EPG -EQP -EQV -ERE -ERI -ETT -EUS -F1P -F1X -F55 -F58 -F6P -F8X -FBP -FCA -FCB -FCT -FDP -FDQ -FFC -FFX -FIF -FIX -FK9 -FKD -FMF -FMO -FNG -FNY -FRU -FSA -FSI -FSM -FSR -FSW -FU4 -FUB -FUC -FUD -FUF -FUL -FUY -FVQ -FX1 -FYJ -G0S -G16 -G1P -G20 -G28 -G2F -G3F -G3I -G4D -G4S -G6D -G6P -G6S -G7P -G8Z -GAA -GAC -GAD -GAF -GAL -GAT -GBH -GC1 -GC4 -GC9 -GCB -GCD -GCN -GCO -GCS -GCT -GCU -GCV -GCW -GDA -GDL -GE1 -GE3 -GFP -GIV -GL0 -GL1 -GL2 -GL4 -GL5 -GL6 -GL7 -GL9 -GLA -GLB -GLC -GLD -GLF -GLG -GLO -GLP -GLS -GLT -GLW -GM0 -GMB -GMH -GMT -GMZ -GN1 -GN4 -GNS -GNX -GP0 -GP1 -GP4 -GPH -GPK -GPM -GPO -GPQ -GPU -GPV -GPW -GQ1 -GRF -GRX -GS1 -GS4 -GS9 -GSA -GSD -GTE -GTH -GTK -GTM -GTR -GU0 -GU1 -GU2 
-GU3 -GU4 -GU5 -GU6 -GU8 -GU9 -GUF -GUL -GUP -GUZ -GXL -GXV -GYE -GYG -GYP -GYU -GYV -GZL -H1M -H1S -H2P -H3S -H53 -H6Q -H6Z -HBZ -HD4 -HDL -HMS -HNV -HNW -HSG -HSH -HSJ -HSQ -HSR -HSU -HSX -HSY -HSZ -HTG -HTM -HVC -I57 -IAB -IDC -IDF -IDG -IDR -IDS -IDT -IDU -IDX -IDY -IEM -IN1 -IPT -ISD -ISL -ISX -IVG -IXD -J5B -JFZ -JHM -JLT -JRV -JS2 -JSV -JV4 -JVA -JVS -JZR -K5B -K99 -KBA -KBG -KD5 -KDA -KDB -KDD -KDE -KDF -KDM -KDN -KDO -KDR -KFN -KG1 -KGM -KHP -KME -KO1 -KO2 -KOT -KQC -KTU -L1L -L6N -L6S -L6T -LAG -LAH -LAI -LAK -LAO -LAT -LB2 -LBS -LBT -LCN -LDY -LEC -LER -LFC -LFR -LGC -LGU -LKA -LKS -LM2 -LMO -LMT -LMU -LNV -LOG -LOX -LPK -LRH -LSM -LTG -LTM -LVO -LVZ -LXB -LXC -LXZ -LZ0 -M1F -M1P -M2F -M3M -M3N -M55 -M6D -M6P -M7B -M7P -M8C -MA1 -MA2 -MA3 -MA8 -MAB -MAF -MAG -MAL -MAN -MAT -MAV -MAW -MBE -MBF -MBG -MCU -MDA -MDP -MFA -MFB -MFU -MG5 -MGA -MGC -MGL -MGS -MJJ -MLB -MLR -MMA -MMN -MN0 -MNA -MQG -MQT -MRH -MRP -MSX -MTT -MUB -MUG -MUR -MVP -MXY -MXZ -MYG -N1L -N3U -N9S -NA1 -NAA -NAG -NBG -NBX -NBY -NDG -NED -NFG -NG1 -NG6 -NGA -NGB -NGC -NGE -NGF -NGK -NGL -NGR -NGS -NGY -NGZ -NHF -NLC -NM6 -NM9 -NNG -NPF -NSQ -NT1 -NTF -NTO -NTP -NXD -NYT -O1G -OAK -OEL -OI7 -OPM -ORP -OSU -OTG -OTN -OTU -OX2 -P53 -P6P -P8E -PA1 -PA5 -PAV -PDX -PH5 -PKM -PNA -PNG -PNJ -PNW -PPC -PRP -PSG -PSJ -PSV -PTQ -PUF -PZU -QDK -QIF -QKH -QPS -QV4 -R1P -R1X -R2B -R2G -R5P -RAA -RAE -RAF -RAM -RAO -RAT -RB5 -RBL -RCD -RDP -REL -RER -RF5 -RG1 -RGG -RHA -RHC -RI2 -RIB -RIP -RM4 -RNS -RNT -ROB -ROR -RP3 -RP5 -RP6 -RPA -RR7 -RRJ -RRY -RST -RTG -RTV -RUB -RUG -RUU -RV7 -RVG -RVM -RWI -RY7 -RZM -S6P -S7P -S81 -SA0 -SCG -SCR -SDD -SDY -SEJ -SF6 -SF9 -SFJ -SFU -SG4 -SG5 -SG6 -SG7 -SGA -SGC -SGD -SGN -SGS -SHB -SHD -SHG -SI3 -SIA -SID -SIO -SIZ -SLB -SLM -SLT -SMD -SN5 -SNG -SOE -SOG -SOL -SOR -SR1 -SSG -SSH -STW -STZ -SUC -SUP -SUS -SWE -SZZ -T68 -T6D -T6P -T6T -TA6 -TAG -TCB -TCG -TDG -TEU -TF0 -TFU -TGA -TGK -TGR -TGY -TH1 -TM5 -TM6 -TM9 -TMR -TMX -TNX -TOA -TOC -TQY -TRE -TRV -TS8 -TT7 -TTV 
-TTZ -TU4 -TUG -TUJ -TUP -TUR -TVD -TVG -TVM -TVS -TVV -TVY -TW7 -TWA -TWD -TWG -TWJ -TWY -TXB -TY6 -TYV -U1Y -U2A -U2D -U63 -U8V -U97 -U9A -U9D -U9G -U9J -U9M -UAP -UBH -UBO -UCD -UDC -UEA -V3M -V3P -V71 -VDF -VG1 -VJ1 -VJ4 -VKN -VTB -W9T -WIA -WOO -WUN -WZ1 -WZ2 -WZ4 -X0X -X1P -X1X -X2F -X2Y -X34 -X4S -X5S -X6X -X6Y -XBP -XDP -XDX -XGP -XIL -XKJ -XLF -XLS -XMM -XS2 -XUL -XXM -XXR -XXX -XY6 -XY9 -XYB -XYF -XYL -XYP -XYS -XYT -XYZ -YDR -YIO -YJM -YKR -YO5 -YX0 -YX1 -YYB -YYD -YYH -YYJ -YYK -YYM -YYQ -YYR -YZ0 -YZT -Z0F -Z15 -Z16 -Z2D -Z2T -Z3K -Z3L -Z3Q -Z3U -Z4K -Z4R -Z4S -Z4U -Z4V -Z4W -Z4Y -Z57 -Z5J -Z5L -Z61 -Z6G -Z6H -Z6J -Z6W -Z8H -Z8T -Z9D -Z9E -Z9H -Z9K -Z9L -Z9M -Z9N -Z9W -ZB0 -ZB1 -ZB2 -ZB3 -ZCD -ZCZ -ZD0 -ZDC -ZDM -ZDO -ZEE -ZEL -ZGE -ZMR diff --git a/src/biotite/structure/info/ccd/components.bcif b/src/biotite/structure/info/ccd/components.bcif deleted file mode 100644 index 9f3921dd1..000000000 Binary files a/src/biotite/structure/info/ccd/components.bcif and /dev/null differ diff --git a/src/biotite/structure/info/ccd/nucleotides.txt b/src/biotite/structure/info/ccd/nucleotides.txt deleted file mode 100644 index e01581c6f..000000000 --- a/src/biotite/structure/info/ccd/nucleotides.txt +++ /dev/null @@ -1,798 +0,0 @@ -02I -05A -05H -05K -0A -0AD -0AM -0AP -0AU -0AV -0C -0DA -0DC -0DG -0DT -0G -0KZ -0R8 -0SP -0U -0U1 -0UH -10C -125 -126 -127 -128 -12A -16B -18M -18Q -1AP -1CC -1DP -1FC -1MA -1MG -1PR -1RN -1SC -1TL -1TW -1W5 -1WA -23G -2AR -2AT -2AU -2BD -2BT -2BU -2DA -2DF -2DM -2DT -2EG -2FE -2FI -2GF -2GT -2IA -2JU -2JV -2L8 -2LA -2LF -2MA -2MG -2MU -2NT -2OM -2OT -2PR -2SA -2SG -2ST -31H -31M -3AU -3DA -3DR -3ME -3MU -3TD -3ZO -45A -47C -4AC -4DG -4DU -4E9 -4EN -4MF -4OC -4PC -4PD -4PE -4SC -4SU -4TA -4U3 -50L -50N -56B -574 -5AA -5AT -5BU -5CF -5CG -5CM -5DB -5EJ -5FA -5FC -5FU -5HC -5HM -5HT -5HU -5IC -5IT -5IU -5JO -5MC -5MD -5MU -5NC -5OC -5PC -5PY -5SE -5UA -63G -63H -64P -64T -68Z -6CT -6F7 -6FC -6FK -6FM -6FU -6HA -6HB -6HC -6HG -6HT -6IA 
-6MA -6MC -6MI -6MT -6MZ -6NW -6OG -6OO -6OP -6PO -70U -73W -75B -77Y -7AT -7BG -7DA -7GU -7MG -7OK -7S3 -7S8 -7SN -84E -85Y -8AA -8AG -8AH -8AN -8AZ -8BA -8DT -8EB -8FG -8H2 -8MG -8NI -8OG -8OS -8PI -8PY -8RJ -8RO -8Y9 -8YN -92F -93D -94O -9O4 -9QV -9SI -9SY -9V9 -A -A1H3G -A1P -A23 -A2L -A2M -A34 -A35 -A38 -A39 -A3A -A3P -A40 -A43 -A44 -A47 -A5L -A5M -A5O -A6A -A6C -A6G -A6U -A7C -A7E -A9Z -AAB -ABR -ABS -ABT -AD2 -ADX -AET -AF2 -AFF -AFG -AP7 -AS -ASU -ATD -ATL -ATM -AVC -AWC -B1P -B7C -B8H -B8K -B8N -B8Q -B8T -B8W -B9B -B9H -BGH -BGM -BMN -BMP -BMQ -BOE -BRU -BT5 -BVP -BZG -C -C25 -C2L -C2S -C31 -C32 -C34 -C36 -C37 -C38 -C42 -C43 -C45 -C46 -C49 -C4J -C4S -C5L -C6G -C7R -C7S -CAR -CB2 -CBR -CBV -CCC -CDW -CFL -CFZ -CG1 -CGY -CH -CJ1 -CM0 -CMR -CNU -CP1 -CS8 -CSF -CSL -CSM -CTG -CVC -CX2 -D00 -D1P -D3 -D33 -D3N -D3T -D4B -D4M -DA -DBM -DC -DCG -DCT -DCZ -DDG -DDN -DDX -DFC -DFG -DFT -DG -DG8 -DGI -DHU -DI -DJF -DN -DNR -DOC -DP -DPB -DPY -DRM -DRP -DRT -DRZ -DT -DU -DUZ -DX -DXD -DXN -DZ -DZM -E -E1X -E3C -E6G -E7G -EAN -EDA -EDC -EDI -EHG -EIT -EIX -ENA -ENP -ENQ -EQ0 -EQ4 -EW3 -EWC -EXC -F2T -F3H -F3N -F3O -F4H -F4Q -F5H -F6H -F6U -F6X -F73 -F74 -F7H -F7K -F7O -F7R -F7X -FA2 -FA5 -FAG -FAI -FAX -FDG -FFD -FHU -FMG -FMU -FNU -FOX -G -G25 -G2L -G2S -G31 -G32 -G33 -G35 -G36 -G38 -G42 -G46 -G47 -G48 -G49 -G4P -G7M -GAO -GCK -GDO -GDP -GDR -GF2 -GFL -GH3 -GMS -GMU -GMX -GN7 -GNE -GOM -GRB -GS -GSR -GSS -GX1 -H2U -HDP -HEU -HN0 -HN1 -HOL -HYJ -I -I2T -I4U -I5C -IC -IG -IGU -IKS -ILK -IMC -IOO -IPN -IRN -IU -J0X -J4T -JDT -JLN -JMH -JSP -JW5 -K1F -K2F -K39 -KAG -KAK -KGV -L3X -LC -LCA -LCC -LCG -LCH -LG -LGP -LHH -LHO -LHU -LKC -LMS -LR6 -LSH -LST -LTP -LV2 -LWM -M1G -M1Y -M2G -M3O -M3X -M4C -M5M -M7A -MA6 -MA7 -MAD -MBZ -MCY -MDJ -MDK -MDQ -MDR -MDU -MDV -ME6 -MEP -MF7 -MFO -MFT -MG1 -MGQ -MGV -MHG -MIA -MM7 -MMT -MMX -MNU -MRG -MTR -MTU -MUM -N -N4S -N5I -N5M -N6G -N7X -NCU -NCX -NDN -NDU -NF2 -NMS -NMT -NP3 -NR1 -NRI -NTT -NYM -O2C -O2G -O2Z -OAD -ODP -OFC -OGX 
-OHU -OIP -OIQ -OKN -OKQ -OKT -OMC -OMG -OMU -ONE -OOB -OWR -OYW -P -P1P -P2T -P2U -P4U -P5P -P7G -P9G -PBT -PDU -PG7 -PGN -PGP -PMT -PPU -PPW -PQ1 -PR5 -PRN -PST -PSU -PU -PUY -PVX -PYO -PYY -PZG -QBT -QCK -QSK -QSQ -QUO -R -RBD -RCE -RDG -RFJ -RIA -RMP -RPC -RSP -RSQ -RT -RTP -RUS -RY -S2M -S4A -S4C -S4G -S4U -S6G -S6M -S8M -S8U -SAY -SC -SDE -SDG -SDH -SJO -SMP -SMT -SOS -SPT -SRA -SSU -SUR -T -T0N -T0P -T0Q -T0T -T2S -T2T -T31 -T32 -T36 -T37 -T38 -T39 -T3P -T41 -T48 -T49 -T4S -T5O -T5S -T64 -T6A -TA3 -TAF -TAL -TC -TC1 -TCJ -TCP -TCY -TDY -TED -TFE -TFF -TFO -TFT -TG -TGP -THM -THP -THX -TLB -TLC -TLN -TP1 -TPC -TPG -TS -TSP -TT -TTD -TTI -TTM -TX2 -TYU -U -U23 -U25 -U2L -U2N -U2P -U31 -U33 -U34 -U36 -U37 -U48 -U4M -U5M -U6F -U7B -U8U -UAR -UBD -UBI -UBR -UCL -UD5 -UDP -UEL -UF2 -UFB -UFP -UFR -UFT -UMS -UMX -UOA -UOB -UPE -UPS -UPV -UR3 -URD -URT -URU -URX -US1 -US2 -US3 -US4 -US5 -USM -UVX -UY1 -UY4 -UZL -V3L -VC7 -VET -VSN -WC7 -WUH -WVQ -X -X0F -X0O -X4A -XAD -XAE -XAL -XAR -XCL -XCR -XCS -XCT -XCY -XE6 -XEC -XFC -XGA -XGL -XGR -XGU -XNY -XPB -XSX -XTF -XTH -XTL -XTR -XTS -XTY -XUA -XUG -Y -Y5P -YA4 -YCO -YG -YRR -YWQ -YYG -Z -ZAD -ZBC -ZBU -ZCY -ZDU -ZF9 -ZGU -ZHP diff --git a/src/biotite/structure/info/groups.py b/src/biotite/structure/info/groups.py index 781f9c587..b5bde4c67 100644 --- a/src/biotite/structure/info/groups.py +++ b/src/biotite/structure/info/groups.py @@ -6,14 +6,45 @@ __author__ = "Tom David Müller, Patrick Kunzmann" __all__ = ["amino_acid_names", "nucleotide_names", "carbohydrate_names"] -from pathlib import Path - -CCD_DIR = Path(__file__).parent / "ccd" - - -group_lists = {} - - +import functools +import numpy as np +from biotite.structure.info.ccd import get_ccd + +_AMINO_ACID_TYPES = [ + "D-beta-peptide, C-gamma linking", + "D-gamma-peptide, C-delta linking", + "D-peptide COOH carboxy terminus", + "D-peptide NH3 amino terminus", + "D-peptide linking", + "L-beta-peptide, C-gamma linking", + "L-gamma-peptide, C-delta linking", + 
"L-peptide COOH carboxy terminus", + "L-peptide NH3 amino terminus", + "L-peptide linking", + "peptide linking", +] +_NUCLEOTIDE_TYPES = [ + "DNA OH 3 prime terminus", + "DNA OH 5 prime terminus", + "DNA linking", + "L-DNA linking", + "L-RNA linking", + "RNA OH 3 prime terminus", + "RNA OH 5 prime terminus", + "RNA linking", +] +_CARBOHYDRATE_TYPES = [ + "D-saccharide", + "D-saccharide, alpha linking", + "D-saccharide, beta linking", + "L-saccharide", + "L-saccharide, alpha linking", + "L-saccharide, beta linking", + "saccharide", +] + + +@functools.cache def amino_acid_names(): """ Get a tuple of amino acid three-letter codes according to the @@ -32,9 +63,10 @@ def amino_acid_names(): .. footbibliography:: """ - return _get_group_members("amino_acids") + return _get_group_members(_AMINO_ACID_TYPES) +@functools.cache def nucleotide_names(): """ Get a tuple of nucleotide three-letter codes according to the @@ -53,9 +85,10 @@ def nucleotide_names(): .. footbibliography:: """ - return _get_group_members("nucleotides") + return _get_group_members(_NUCLEOTIDE_TYPES) +@functools.cache def carbohydrate_names(): """ Get a tuple of carbohydrate three-letter codes according to the @@ -74,12 +107,25 @@ def carbohydrate_names(): .. footbibliography:: """ - return _get_group_members("carbohydrates") + return _get_group_members(_CARBOHYDRATE_TYPES) + + +def _get_group_members(match_types): + """ + Identify component IDs that matches a given component *type* from the CCD. + Parameters + ---------- + match_types : list of str + The component types to extract. -def _get_group_members(group_name): - global group_lists - if group_name not in group_lists: - with open(CCD_DIR / f"{group_name}.txt", "r") as file: - group_lists[group_name] = tuple(file.read().split()) - return group_lists[group_name] + Returns + ------- + comp_ids : list of str + The extracted component IDs. 
+ """ + category = get_ccd()["chem_comp"] + comp_ids = category["id"].as_array() + types = category["type"].as_array() + # Ignore case + return comp_ids[np.isin(np.char.lower(types), np.char.lower(match_types))].tolist() diff --git a/src/biotite/structure/info/masses.py b/src/biotite/structure/info/masses.py index e0ac8cd8d..5d2f0690d 100644 --- a/src/biotite/structure/info/masses.py +++ b/src/biotite/structure/info/masses.py @@ -95,15 +95,11 @@ def mass(item, is_residue=None): if is_residue is None: result_mass = _atom_masses.get(item.upper()) if result_mass is None: - result_mass = get_from_ccd( - "chem_comp", item.upper(), "formula_weight" - ).item() + result_mass = _mass_for_residue(item) elif not is_residue: result_mass = _atom_masses.get(item.upper()) else: - result_mass = get_from_ccd( - "chem_comp", item.upper(), "formula_weight" - ).item() + result_mass = _mass_for_residue(item) elif isinstance(item, Atom): result_mass = mass(item.element, is_residue=False) @@ -116,3 +112,10 @@ def mass(item, is_residue=None): if result_mass is None: raise KeyError(f"{item} is not known") return result_mass + + +def _mass_for_residue(res_name): + column = get_from_ccd("chem_comp", res_name.upper(), "formula_weight") + if column is None: + raise KeyError(f"Residue '{res_name}' is not known") + return column.as_item() diff --git a/src/biotite/structure/info/misc.py b/src/biotite/structure/info/misc.py index 57e270568..d61ae6455 100644 --- a/src/biotite/structure/info/misc.py +++ b/src/biotite/structure/info/misc.py @@ -11,19 +11,13 @@ def all_residues(): """ - Get a list of all residues/compound names in the - PDB chemical components dictionary. + Get a list of all residues/compound names in the PDB + *Chemical Component Dictionary* (CCD). Returns ------- residues : list of str - A list of all available The up to 3-letter residue names. 
- - Examples - -------- - - >>> print(all_residues()[1000 : 1010]) - ['0V9', '0VA', '0VB', '0VC', '0VD', '0VE', '0VF', '0VG', '0VH', '0VI'] + A list of all available residue names. """ return get_ccd()["chem_comp"]["id"].as_array().tolist() @@ -51,10 +45,10 @@ def full_name(res_name): >>> print(full_name("MAN")) alpha-D-mannopyranose """ - array = get_from_ccd("chem_comp", res_name.upper(), "name") - if array is None: + column = get_from_ccd("chem_comp", res_name.upper(), "name") + if column is None: return None - return array.item() + return column.as_item() def link_type(res_name): @@ -84,10 +78,10 @@ def link_type(res_name): >>> print(link_type("HOH")) NON-POLYMER """ - array = get_from_ccd("chem_comp", res_name.upper(), "type") - if array is None: + column = get_from_ccd("chem_comp", res_name.upper(), "type") + if column is None: return None - return array.item() + return column.as_item() def one_letter_code(res_name): @@ -107,7 +101,7 @@ def one_letter_code(res_name): ------- one_letter_code : str or None The one-letter code. - None if the compound is not present in the CCD or if no + ``None`` if the compound is not present in the CCD or if no one-letter code is defined for this compound. Examples @@ -135,10 +129,10 @@ def one_letter_code(res_name): None """ - array = get_from_ccd("chem_comp", res_name.upper(), "one_letter_code") - if array is None: + column = get_from_ccd("chem_comp", res_name.upper(), "one_letter_code") + if column is None: return None - item = array.item() - if item == "": + if column.mask is not None: + # Value is masked, i.e. 
inapplicable or missing return None - return item + return column.as_item() diff --git a/src/biotite/structure/info/standardize.py b/src/biotite/structure/info/standardize.py index 558b81f41..3685a6fbe 100644 --- a/src/biotite/structure/info/standardize.py +++ b/src/biotite/structure/info/standardize.py @@ -121,8 +121,8 @@ def standardize_order(atoms): stop = starts[i + 1] res_name = atoms.res_name[start] - standard_atom_names = get_from_ccd("chem_comp_atom", res_name, "atom_id") - if standard_atom_names is None: + chem_comp_atom = get_from_ccd("chem_comp_atom", res_name, "atom_id") + if chem_comp_atom is None: # If the residue is not in the CCD, keep the current order warnings.warn( f"Residue '{res_name}' is not in the CCD, " @@ -131,6 +131,7 @@ def standardize_order(atoms): reordered_indices[start:stop] = np.arange(start, stop) continue + standard_atom_names = chem_comp_atom.as_array() reordered_indices[start:stop] = ( _reorder(atoms.atom_name[start:stop], standard_atom_names) + start ) diff --git a/src/biotite/structure/io/mol/sdf.py b/src/biotite/structure/io/mol/sdf.py index 3b51181ce..fb2dc7d5f 100644 --- a/src/biotite/structure/io/mol/sdf.py +++ b/src/biotite/structure/io/mol/sdf.py @@ -314,19 +314,19 @@ class SDRecord: >>> record = SDRecord(header=Header(mol_name="ALA", dimensions="3D")) >>> record.set_structure(atoms) >>> print(record.get_structure()) - 0 N -0.970 0.490 1.500 - 0 C 0.260 0.420 0.690 - 0 C -0.090 0.020 -0.720 - 0 O -1.060 -0.680 -0.920 - 0 C 1.200 -0.620 1.300 - 0 O 0.660 0.440 -1.740 - 0 H -1.380 -0.420 1.480 - 0 H -0.680 0.660 2.450 - 0 H 0.750 1.390 0.680 - 0 H 1.460 -0.330 2.320 - 0 H 0.720 -1.590 1.310 - 0 H 2.110 -0.680 0.700 - 0 H 0.440 0.180 -2.650 + 0 N -0.966 0.493 1.500 + 0 C 0.257 0.418 0.692 + 0 C -0.094 0.017 -0.716 + 0 O -1.056 -0.682 -0.923 + 0 C 1.204 -0.620 1.296 + 0 O 0.661 0.439 -1.742 + 0 H -1.383 -0.425 1.482 + 0 H -0.676 0.661 2.452 + 0 H 0.746 1.392 0.682 + 0 H 1.459 -0.330 2.316 + 0 H 0.715 -1.594 1.307 + 0 H 2.113 
-0.676 0.697 + 0 H 0.435 0.182 -2.647 >>> # Add the record to an SD file >>> file = SDFile() >>> file["ALA"] = record @@ -335,19 +335,19 @@ class SDRecord: 3D 13 12 0 0 0 0 0 0 0 1 V2000 - -0.9700 0.4900 1.5000 N 0 0 0 0 0 0 0 0 0 0 0 0 - 0.2600 0.4200 0.6900 C 0 0 0 0 0 0 0 0 0 0 0 0 - -0.0900 0.0200 -0.7200 C 0 0 0 0 0 0 0 0 0 0 0 0 - -1.0600 -0.6800 -0.9200 O 0 0 0 0 0 0 0 0 0 0 0 0 - 1.2000 -0.6200 1.3000 C 0 0 0 0 0 0 0 0 0 0 0 0 - 0.6600 0.4400 -1.7400 O 0 0 0 0 0 0 0 0 0 0 0 0 - -1.3800 -0.4200 1.4800 H 0 0 0 0 0 0 0 0 0 0 0 0 - -0.6800 0.6600 2.4500 H 0 0 0 0 0 0 0 0 0 0 0 0 - 0.7500 1.3900 0.6800 H 0 0 0 0 0 0 0 0 0 0 0 0 - 1.4600 -0.3300 2.3200 H 0 0 0 0 0 0 0 0 0 0 0 0 - 0.7200 -1.5900 1.3100 H 0 0 0 0 0 0 0 0 0 0 0 0 - 2.1100 -0.6800 0.7000 H 0 0 0 0 0 0 0 0 0 0 0 0 - 0.4400 0.1800 -2.6500 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.9660 0.4930 1.5000 N 0 0 0 0 0 0 0 0 0 0 0 0 + 0.2570 0.4180 0.6920 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.0940 0.0170 -0.7160 C 0 0 0 0 0 0 0 0 0 0 0 0 + -1.0560 -0.6820 -0.9230 O 0 0 0 0 0 0 0 0 0 0 0 0 + 1.2040 -0.6200 1.2960 C 0 0 0 0 0 0 0 0 0 0 0 0 + 0.6610 0.4390 -1.7420 O 0 0 0 0 0 0 0 0 0 0 0 0 + -1.3830 -0.4250 1.4820 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.6760 0.6610 2.4520 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7460 1.3920 0.6820 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1.4590 -0.3300 2.3160 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7150 -1.5940 1.3070 H 0 0 0 0 0 0 0 0 0 0 0 0 + 2.1130 -0.6760 0.6970 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.4350 0.1820 -2.6470 H 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 0 0 0 1 7 1 0 0 0 0 1 8 1 0 0 0 0 @@ -362,6 +362,7 @@ class SDRecord: 6 13 1 0 0 0 0 M END $$$$ + """ def __init__(self, header=None, ctab=None, metadata=None): @@ -678,19 +679,19 @@ class SDFile(File, MutableMapping): 13 12 0 0 0 0 0 0 0 1 V2000 - -0.9700 0.4900 1.5000 N 0 0 0 0 0 0 0 0 0 0 0 0 - 0.2600 0.4200 0.6900 C 0 0 0 0 0 0 0 0 0 0 0 0 - -0.0900 0.0200 -0.7200 C 0 0 0 0 0 0 0 0 0 0 0 0 - -1.0600 -0.6800 -0.9200 O 0 0 0 0 0 0 0 0 0 0 0 0 - 1.2000 -0.6200 1.3000 C 0 0 0 0 0 0 0 0 0 0 0 0 - 
0.6600 0.4400 -1.7400 O 0 0 0 0 0 0 0 0 0 0 0 0 - -1.3800 -0.4200 1.4800 H 0 0 0 0 0 0 0 0 0 0 0 0 - -0.6800 0.6600 2.4500 H 0 0 0 0 0 0 0 0 0 0 0 0 - 0.7500 1.3900 0.6800 H 0 0 0 0 0 0 0 0 0 0 0 0 - 1.4600 -0.3300 2.3200 H 0 0 0 0 0 0 0 0 0 0 0 0 - 0.7200 -1.5900 1.3100 H 0 0 0 0 0 0 0 0 0 0 0 0 - 2.1100 -0.6800 0.7000 H 0 0 0 0 0 0 0 0 0 0 0 0 - 0.4400 0.1800 -2.6500 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.9660 0.4930 1.5000 N 0 0 0 0 0 0 0 0 0 0 0 0 + 0.2570 0.4180 0.6920 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.0940 0.0170 -0.7160 C 0 0 0 0 0 0 0 0 0 0 0 0 + -1.0560 -0.6820 -0.9230 O 0 0 0 0 0 0 0 0 0 0 0 0 + 1.2040 -0.6200 1.2960 C 0 0 0 0 0 0 0 0 0 0 0 0 + 0.6610 0.4390 -1.7420 O 0 0 0 0 0 0 0 0 0 0 0 0 + -1.3830 -0.4250 1.4820 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.6760 0.6610 2.4520 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7460 1.3920 0.6820 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1.4590 -0.3300 2.3160 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7150 -1.5940 1.3070 H 0 0 0 0 0 0 0 0 0 0 0 0 + 2.1130 -0.6760 0.6970 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.4350 0.1820 -2.6470 H 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 0 0 0 1 7 1 0 0 0 0 1 8 1 0 0 0 0 diff --git a/src/biotite/structure/io/pdbqt/file.py b/src/biotite/structure/io/pdbqt/file.py index 21f883c0a..ae449d797 100644 --- a/src/biotite/structure/io/pdbqt/file.py +++ b/src/biotite/structure/io/pdbqt/file.py @@ -58,48 +58,48 @@ class PDBQTFile(TextFile): >>> mask = file.set_structure(ligand, rotatable_bonds="all") >>> # Print removed nonpolar hydrogen atoms >>> print(ligand[~mask]) - HET 0 BTN H101 H 3.740 1.170 0.970 - HET 0 BTN H102 H 4.070 1.340 -0.770 - HET 0 BTN H91 H 2.800 -0.740 -1.210 - HET 0 BTN H92 H 2.480 -0.910 0.530 - HET 0 BTN H81 H 1.290 1.260 0.520 - HET 0 BTN H82 H 1.620 1.440 -1.220 - HET 0 BTN H71 H 0.350 -0.650 -1.660 - HET 0 BTN H72 H 0.020 -0.820 0.080 - HET 0 BTN H2 H -0.840 1.580 -1.630 - HET 0 BTN H61 H -3.800 1.840 1.290 - HET 0 BTN H62 H -3.370 2.740 -0.200 - HET 0 BTN H5 H -4.310 0.810 -1.210 - HET 0 BTN H4 H -2.450 -0.040 -2.250 + HET 0 BTN H101 H 3.745 
1.171 0.974 + HET 0 BTN H102 H 4.071 1.343 -0.767 + HET 0 BTN H91 H 2.802 -0.740 -1.211 + HET 0 BTN H92 H 2.476 -0.912 0.530 + HET 0 BTN H81 H 1.289 1.265 0.523 + HET 0 BTN H82 H 1.616 1.437 -1.218 + HET 0 BTN H71 H 0.346 -0.646 -1.662 + HET 0 BTN H72 H 0.020 -0.818 0.079 + HET 0 BTN H2 H -0.838 1.576 -1.627 + HET 0 BTN H61 H -3.797 1.837 1.286 + HET 0 BTN H62 H -3.367 2.738 -0.205 + HET 0 BTN H5 H -4.307 0.812 -1.205 + HET 0 BTN H4 H -2.451 -0.038 -2.252 >>> print(file) ROOT - HETATM 1 C11 BTN 0 5.090 -0.280 0.170 1.00 0.00 0.258 C - HETATM 2 O11 BTN 0 4.960 -1.470 0.030 1.00 0.00 -0.264 OA + HETATM 1 C11 BTN 0 5.089 -0.280 0.173 1.00 0.00 0.258 C + HETATM 2 O11 BTN 0 4.956 -1.473 0.030 1.00 0.00 -0.264 OA ENDROOT BRANCH 1 3 - HETATM 3 O12 BTN 0 6.300 0.230 0.440 1.00 0.00 -0.331 OA - HETATM 17 HO2 BTN 0 7.030 -0.390 0.520 1.00 0.00 0.221 HD + HETATM 3 O12 BTN 0 6.299 0.233 0.444 1.00 0.00 -0.331 OA + HETATM 17 HO2 BTN 0 7.034 -0.391 0.517 1.00 0.00 0.221 HD ENDBRANCH 1 3 BRANCH 1 4 - HETATM 4 C10 BTN 0 3.900 0.630 0.040 1.00 0.00 0.105 C + HETATM 4 C10 BTN 0 3.896 0.631 0.039 1.00 0.00 0.105 C BRANCH 4 5 - HETATM 5 C9 BTN 0 2.650 -0.200 -0.280 1.00 0.00 0.010 C + HETATM 5 C9 BTN 0 2.651 -0.200 -0.276 1.00 0.00 0.010 C BRANCH 5 6 - HETATM 6 C8 BTN 0 1.440 0.720 -0.410 1.00 0.00 0.002 C + HETATM 6 C8 BTN 0 1.440 0.725 -0.412 1.00 0.00 0.002 C BRANCH 6 7 - HETATM 7 C7 BTN 0 0.200 -0.110 -0.730 1.00 0.00 0.016 C + HETATM 7 C7 BTN 0 0.196 -0.106 -0.727 1.00 0.00 0.016 C BRANCH 7 8 - HETATM 8 C2 BTN 0 -1.020 0.820 -0.860 1.00 0.00 0.065 C - HETATM 9 S1 BTN 0 -1.420 1.600 0.750 1.00 0.00 -0.154 SA - HETATM 10 C6 BTN 0 -3.200 1.830 0.370 1.00 0.00 0.090 C - HETATM 11 C5 BTN 0 -3.530 0.580 -0.480 1.00 0.00 0.091 C - HETATM 12 N1 BTN 0 -3.970 -0.510 0.410 1.00 0.00 -0.239 NA - HETATM 13 C3 BTN 0 -3.140 -1.550 0.270 1.00 0.00 0.272 C - HETATM 14 O3 BTN 0 -3.270 -2.590 0.890 1.00 0.00 -0.259 OA - HETATM 15 N2 BTN 0 -2.150 -1.340 -0.610 1.00 0.00 -0.239 NA - HETATM 16 C4 BTN 
0 -2.290 0.010 -1.170 1.00 0.00 0.093 C - HETATM 18 HN1 BTN 0 -4.740 -0.470 1.000 1.00 0.00 0.132 HD - HETATM 19 HN2 BTN 0 -1.460 -1.980 -0.840 1.00 0.00 0.132 HD + HETATM 8 C2 BTN 0 -1.015 0.819 -0.863 1.00 0.00 0.065 C + HETATM 9 S1 BTN 0 -1.419 1.604 0.751 1.00 0.00 -0.154 SA + HETATM 10 C6 BTN 0 -3.205 1.827 0.371 1.00 0.00 0.090 C + HETATM 11 C5 BTN 0 -3.530 0.581 -0.476 1.00 0.00 0.091 C + HETATM 12 N1 BTN 0 -3.970 -0.507 0.412 1.00 0.00 -0.239 NA + HETATM 13 C3 BTN 0 -3.141 -1.549 0.271 1.00 0.00 0.272 C + HETATM 14 O3 BTN 0 -3.271 -2.589 0.888 1.00 0.00 -0.259 OA + HETATM 15 N2 BTN 0 -2.154 -1.343 -0.612 1.00 0.00 -0.239 NA + HETATM 16 C4 BTN 0 -2.289 0.010 -1.175 1.00 0.00 0.093 C + HETATM 18 HN1 BTN 0 -4.738 -0.474 1.004 1.00 0.00 0.132 HD + HETATM 19 HN2 BTN 0 -1.462 -1.982 -0.843 1.00 0.00 0.132 HD ENDBRANCH 7 8 ENDBRANCH 6 7 ENDBRANCH 5 6 diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 5c78d55e9..4ff284168 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -1279,17 +1279,28 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non # Swap with the fallback option coord_fields, alt_coord_fields = alt_coord_fields, coord_fields try: - for i, field in enumerate(coord_fields): - array.coord[:, i] = atom_category[field].as_array(np.float32) - except KeyError as err: - key = err.args[0] - warnings.warn( - f"Attribute '{key}' not found within 'chem_comp_atom' category. " - f"The fallback coordinates will be used instead", - UserWarning, + array.coord = _parse_component_coordinates( + [atom_category[field] for field in coord_fields] + ) + except Exception as err: + if isinstance(err, KeyError): + key = err.args[0] + warnings.warn( + f"Attribute '{key}' not found within 'chem_comp_atom' category. 
" + f"The fallback coordinates will be used instead", + UserWarning, + ) + elif isinstance(err, ValueError): + warnings.warn( + "The coordinates are missing for some atoms. " + "The fallback coordinates will be used instead", + UserWarning, + ) + else: + raise + array.coord = _parse_component_coordinates( + [atom_category[field] for field in alt_coord_fields] ) - for i, field in enumerate(alt_coord_fields): - array.coord[:, i] = atom_category[field].as_array(np.float32) try: bond_category = block["chem_comp_bond"] @@ -1319,6 +1330,17 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non return array +def _parse_component_coordinates(coord_columns): + coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32) + for i, column in enumerate(coord_columns): + if column.mask is not None and column.mask.array.any(): + raise ValueError( + "Missing coordinates for some atoms", + ) + coord[:, i] = column.as_array(np.float32) + return coord + + def set_component(pdbx_file, array, data_block=None): """ Set the ``chem_comp_atom`` and, if bonds are available, diff --git a/src/biotite/structure/molecules.py b/src/biotite/structure/molecules.py index f20a5a1b6..6f9cb4669 100644 --- a/src/biotite/structure/molecules.py +++ b/src/biotite/structure/molecules.py @@ -63,54 +63,54 @@ def get_molecule_indices(array): >>> print(len(indices)) 2 >>> print(atp[indices[0]]) - HET 0 ATP PG P 1.200 -0.230 -6.850 - HET 0 ATP O1G O 1.740 1.140 -6.670 - HET 0 ATP O2G O 2.120 -1.040 -7.890 - HET 0 ATP O3G O -0.300 -0.140 -7.420 - HET 0 ATP PB P 0.260 -0.130 -4.450 - HET 0 ATP O1B O 0.810 1.230 -4.300 - HET 0 ATP O2B O -1.230 -0.040 -5.060 - HET 0 ATP O3B O 1.190 -0.990 -5.430 - HET 0 ATP PA P -0.740 0.070 -2.070 - HET 0 ATP O1A O -2.100 0.140 -2.670 - HET 0 ATP O2A O -0.120 1.550 -1.960 - HET 0 ATP O3A O 0.200 -0.840 -3.000 - HET 0 ATP HOG2 H 2.100 -0.550 -8.730 - HET 0 ATP HOG3 H -0.620 -1.050 -7.520 - HET 0 ATP HOB2 H -1.550 -0.950 -5.130 - HET 0 ATP HOA2 H 0.750 
1.460 -1.560 + HET 0 ATP PG P 1.200 -0.226 -6.850 + HET 0 ATP O1G O 1.740 1.140 -6.672 + HET 0 ATP O2G O 2.123 -1.036 -7.891 + HET 0 ATP O3G O -0.302 -0.139 -7.421 + HET 0 ATP PB P 0.255 -0.130 -4.446 + HET 0 ATP O1B O 0.810 1.234 -4.304 + HET 0 ATP O2B O -1.231 -0.044 -5.057 + HET 0 ATP O3B O 1.192 -0.990 -5.433 + HET 0 ATP PA P -0.745 0.068 -2.071 + HET 0 ATP O1A O -2.097 0.143 -2.669 + HET 0 ATP O2A O -0.125 1.549 -1.957 + HET 0 ATP O3A O 0.203 -0.840 -3.002 + HET 0 ATP HOG2 H 2.100 -0.546 -8.725 + HET 0 ATP HOG3 H -0.616 -1.048 -7.522 + HET 0 ATP HOB2 H -1.554 -0.952 -5.132 + HET 0 ATP HOA2 H 0.752 1.455 -1.563 >>> print(atp[indices[1]]) - HET 0 ATP O5' O -0.840 -0.590 -0.600 - HET 0 ATP C5' C -1.690 0.260 0.170 - HET 0 ATP C4' C -1.830 -0.310 1.580 - HET 0 ATP O4' O -0.540 -0.360 2.230 - HET 0 ATP C3' C -2.680 0.630 2.460 - HET 0 ATP O3' O -4.030 0.160 2.530 - HET 0 ATP C2' C -2.010 0.560 3.860 - HET 0 ATP O2' O -2.930 0.040 4.830 - HET 0 ATP C1' C -0.830 -0.420 3.650 - HET 0 ATP N9 N 0.330 0.020 4.430 - HET 0 ATP C8 C 1.300 0.880 4.010 - HET 0 ATP N7 N 2.180 1.040 4.960 - HET 0 ATP C5 C 1.830 0.300 6.030 - HET 0 ATP C6 C 2.390 0.080 7.300 - HET 0 ATP N6 N 3.560 0.710 7.680 - HET 0 ATP N1 N 1.760 -0.750 8.140 - HET 0 ATP C2 C 0.640 -1.350 7.780 - HET 0 ATP N3 N 0.090 -1.180 6.600 - HET 0 ATP C4 C 0.640 -0.370 5.700 - HET 0 ATP H5'1 H -2.680 0.310 -0.300 - HET 0 ATP H5'2 H -1.260 1.260 0.220 - HET 0 ATP H4' H -2.280 -1.300 1.550 - HET 0 ATP H3' H -2.650 1.650 2.080 - HET 0 ATP HO3' H -4.520 0.790 3.090 - HET 0 ATP H2' H -1.650 1.540 4.160 - HET 0 ATP HO2' H -3.670 0.660 4.870 - HET 0 ATP H1' H -1.120 -1.430 3.930 - HET 0 ATP H8 H 1.330 1.360 3.040 - HET 0 ATP HN61 H 3.940 0.550 8.560 - HET 0 ATP HN62 H 4.020 1.300 7.060 - HET 0 ATP H2 H 0.170 -2.010 8.490 + HET 0 ATP O5' O -0.844 -0.587 -0.604 + HET 0 ATP C5' C -1.694 0.260 0.170 + HET 0 ATP C4' C -1.831 -0.309 1.584 + HET 0 ATP O4' O -0.542 -0.355 2.234 + HET 0 ATP C3' C -2.683 0.630 2.465 + HET 0 ATP O3' O 
-4.033 0.165 2.534 + HET 0 ATP C2' C -2.011 0.555 3.856 + HET 0 ATP O2' O -2.926 0.043 4.827 + HET 0 ATP C1' C -0.830 -0.418 3.647 + HET 0 ATP N9 N 0.332 0.015 4.425 + HET 0 ATP C8 C 1.302 0.879 4.012 + HET 0 ATP N7 N 2.184 1.042 4.955 + HET 0 ATP C5 C 1.833 0.300 6.033 + HET 0 ATP C6 C 2.391 0.077 7.303 + HET 0 ATP N6 N 3.564 0.706 7.681 + HET 0 ATP N1 N 1.763 -0.747 8.135 + HET 0 ATP C2 C 0.644 -1.352 7.783 + HET 0 ATP N3 N 0.088 -1.178 6.602 + HET 0 ATP C4 C 0.644 -0.371 5.704 + HET 0 ATP H5'1 H -2.678 0.312 -0.296 + HET 0 ATP H5'2 H -1.263 1.259 0.221 + HET 0 ATP H4' H -2.275 -1.304 1.550 + HET 0 ATP H3' H -2.651 1.649 2.078 + HET 0 ATP HO3' H -4.515 0.788 3.094 + HET 0 ATP H2' H -1.646 1.537 4.157 + HET 0 ATP HO2' H -3.667 0.662 4.867 + HET 0 ATP H1' H -1.119 -1.430 3.931 + HET 0 ATP H8 H 1.334 1.357 3.044 + HET 0 ATP HN61 H 3.938 0.548 8.562 + HET 0 ATP HN62 H 4.015 1.303 7.064 + HET 0 ATP H2 H 0.166 -2.014 8.490 """ if isinstance(array, BondList): bonds = array @@ -181,54 +181,54 @@ def get_molecule_masks(array): >>> print(len(masks)) 2 >>> print(atp[masks[0]]) - HET 0 ATP PG P 1.200 -0.230 -6.850 - HET 0 ATP O1G O 1.740 1.140 -6.670 - HET 0 ATP O2G O 2.120 -1.040 -7.890 - HET 0 ATP O3G O -0.300 -0.140 -7.420 - HET 0 ATP PB P 0.260 -0.130 -4.450 - HET 0 ATP O1B O 0.810 1.230 -4.300 - HET 0 ATP O2B O -1.230 -0.040 -5.060 - HET 0 ATP O3B O 1.190 -0.990 -5.430 - HET 0 ATP PA P -0.740 0.070 -2.070 - HET 0 ATP O1A O -2.100 0.140 -2.670 - HET 0 ATP O2A O -0.120 1.550 -1.960 - HET 0 ATP O3A O 0.200 -0.840 -3.000 - HET 0 ATP HOG2 H 2.100 -0.550 -8.730 - HET 0 ATP HOG3 H -0.620 -1.050 -7.520 - HET 0 ATP HOB2 H -1.550 -0.950 -5.130 - HET 0 ATP HOA2 H 0.750 1.460 -1.560 + HET 0 ATP PG P 1.200 -0.226 -6.850 + HET 0 ATP O1G O 1.740 1.140 -6.672 + HET 0 ATP O2G O 2.123 -1.036 -7.891 + HET 0 ATP O3G O -0.302 -0.139 -7.421 + HET 0 ATP PB P 0.255 -0.130 -4.446 + HET 0 ATP O1B O 0.810 1.234 -4.304 + HET 0 ATP O2B O -1.231 -0.044 -5.057 + HET 0 ATP O3B O 1.192 -0.990 -5.433 + 
HET 0 ATP PA P -0.745 0.068 -2.071 + HET 0 ATP O1A O -2.097 0.143 -2.669 + HET 0 ATP O2A O -0.125 1.549 -1.957 + HET 0 ATP O3A O 0.203 -0.840 -3.002 + HET 0 ATP HOG2 H 2.100 -0.546 -8.725 + HET 0 ATP HOG3 H -0.616 -1.048 -7.522 + HET 0 ATP HOB2 H -1.554 -0.952 -5.132 + HET 0 ATP HOA2 H 0.752 1.455 -1.563 >>> print(atp[masks[1]]) - HET 0 ATP O5' O -0.840 -0.590 -0.600 - HET 0 ATP C5' C -1.690 0.260 0.170 - HET 0 ATP C4' C -1.830 -0.310 1.580 - HET 0 ATP O4' O -0.540 -0.360 2.230 - HET 0 ATP C3' C -2.680 0.630 2.460 - HET 0 ATP O3' O -4.030 0.160 2.530 - HET 0 ATP C2' C -2.010 0.560 3.860 - HET 0 ATP O2' O -2.930 0.040 4.830 - HET 0 ATP C1' C -0.830 -0.420 3.650 - HET 0 ATP N9 N 0.330 0.020 4.430 - HET 0 ATP C8 C 1.300 0.880 4.010 - HET 0 ATP N7 N 2.180 1.040 4.960 - HET 0 ATP C5 C 1.830 0.300 6.030 - HET 0 ATP C6 C 2.390 0.080 7.300 - HET 0 ATP N6 N 3.560 0.710 7.680 - HET 0 ATP N1 N 1.760 -0.750 8.140 - HET 0 ATP C2 C 0.640 -1.350 7.780 - HET 0 ATP N3 N 0.090 -1.180 6.600 - HET 0 ATP C4 C 0.640 -0.370 5.700 - HET 0 ATP H5'1 H -2.680 0.310 -0.300 - HET 0 ATP H5'2 H -1.260 1.260 0.220 - HET 0 ATP H4' H -2.280 -1.300 1.550 - HET 0 ATP H3' H -2.650 1.650 2.080 - HET 0 ATP HO3' H -4.520 0.790 3.090 - HET 0 ATP H2' H -1.650 1.540 4.160 - HET 0 ATP HO2' H -3.670 0.660 4.870 - HET 0 ATP H1' H -1.120 -1.430 3.930 - HET 0 ATP H8 H 1.330 1.360 3.040 - HET 0 ATP HN61 H 3.940 0.550 8.560 - HET 0 ATP HN62 H 4.020 1.300 7.060 - HET 0 ATP H2 H 0.170 -2.010 8.490 + HET 0 ATP O5' O -0.844 -0.587 -0.604 + HET 0 ATP C5' C -1.694 0.260 0.170 + HET 0 ATP C4' C -1.831 -0.309 1.584 + HET 0 ATP O4' O -0.542 -0.355 2.234 + HET 0 ATP C3' C -2.683 0.630 2.465 + HET 0 ATP O3' O -4.033 0.165 2.534 + HET 0 ATP C2' C -2.011 0.555 3.856 + HET 0 ATP O2' O -2.926 0.043 4.827 + HET 0 ATP C1' C -0.830 -0.418 3.647 + HET 0 ATP N9 N 0.332 0.015 4.425 + HET 0 ATP C8 C 1.302 0.879 4.012 + HET 0 ATP N7 N 2.184 1.042 4.955 + HET 0 ATP C5 C 1.833 0.300 6.033 + HET 0 ATP C6 C 2.391 0.077 7.303 + HET 0 ATP N6 
N 3.564 0.706 7.681 + HET 0 ATP N1 N 1.763 -0.747 8.135 + HET 0 ATP C2 C 0.644 -1.352 7.783 + HET 0 ATP N3 N 0.088 -1.178 6.602 + HET 0 ATP C4 C 0.644 -0.371 5.704 + HET 0 ATP H5'1 H -2.678 0.312 -0.296 + HET 0 ATP H5'2 H -1.263 1.259 0.221 + HET 0 ATP H4' H -2.275 -1.304 1.550 + HET 0 ATP H3' H -2.651 1.649 2.078 + HET 0 ATP HO3' H -4.515 0.788 3.094 + HET 0 ATP H2' H -1.646 1.537 4.157 + HET 0 ATP HO2' H -3.667 0.662 4.867 + HET 0 ATP H1' H -1.119 -1.430 3.931 + HET 0 ATP H8 H 1.334 1.357 3.044 + HET 0 ATP HN61 H 3.938 0.548 8.562 + HET 0 ATP HN62 H 4.015 1.303 7.064 + HET 0 ATP H2 H 0.166 -2.014 8.490 """ if isinstance(array, BondList): bonds = array @@ -288,55 +288,55 @@ def molecule_iter(array): ... print(molecule) ... print() New molecule - HET 0 ATP PG P 1.200 -0.230 -6.850 - HET 0 ATP O1G O 1.740 1.140 -6.670 - HET 0 ATP O2G O 2.120 -1.040 -7.890 - HET 0 ATP O3G O -0.300 -0.140 -7.420 - HET 0 ATP PB P 0.260 -0.130 -4.450 - HET 0 ATP O1B O 0.810 1.230 -4.300 - HET 0 ATP O2B O -1.230 -0.040 -5.060 - HET 0 ATP O3B O 1.190 -0.990 -5.430 - HET 0 ATP PA P -0.740 0.070 -2.070 - HET 0 ATP O1A O -2.100 0.140 -2.670 - HET 0 ATP O2A O -0.120 1.550 -1.960 - HET 0 ATP O3A O 0.200 -0.840 -3.000 - HET 0 ATP HOG2 H 2.100 -0.550 -8.730 - HET 0 ATP HOG3 H -0.620 -1.050 -7.520 - HET 0 ATP HOB2 H -1.550 -0.950 -5.130 - HET 0 ATP HOA2 H 0.750 1.460 -1.560 + HET 0 ATP PG P 1.200 -0.226 -6.850 + HET 0 ATP O1G O 1.740 1.140 -6.672 + HET 0 ATP O2G O 2.123 -1.036 -7.891 + HET 0 ATP O3G O -0.302 -0.139 -7.421 + HET 0 ATP PB P 0.255 -0.130 -4.446 + HET 0 ATP O1B O 0.810 1.234 -4.304 + HET 0 ATP O2B O -1.231 -0.044 -5.057 + HET 0 ATP O3B O 1.192 -0.990 -5.433 + HET 0 ATP PA P -0.745 0.068 -2.071 + HET 0 ATP O1A O -2.097 0.143 -2.669 + HET 0 ATP O2A O -0.125 1.549 -1.957 + HET 0 ATP O3A O 0.203 -0.840 -3.002 + HET 0 ATP HOG2 H 2.100 -0.546 -8.725 + HET 0 ATP HOG3 H -0.616 -1.048 -7.522 + HET 0 ATP HOB2 H -1.554 -0.952 -5.132 + HET 0 ATP HOA2 H 0.752 1.455 -1.563 New molecule - HET 0 ATP 
O5' O -0.840 -0.590 -0.600 - HET 0 ATP C5' C -1.690 0.260 0.170 - HET 0 ATP C4' C -1.830 -0.310 1.580 - HET 0 ATP O4' O -0.540 -0.360 2.230 - HET 0 ATP C3' C -2.680 0.630 2.460 - HET 0 ATP O3' O -4.030 0.160 2.530 - HET 0 ATP C2' C -2.010 0.560 3.860 - HET 0 ATP O2' O -2.930 0.040 4.830 - HET 0 ATP C1' C -0.830 -0.420 3.650 - HET 0 ATP N9 N 0.330 0.020 4.430 - HET 0 ATP C8 C 1.300 0.880 4.010 - HET 0 ATP N7 N 2.180 1.040 4.960 - HET 0 ATP C5 C 1.830 0.300 6.030 - HET 0 ATP C6 C 2.390 0.080 7.300 - HET 0 ATP N6 N 3.560 0.710 7.680 - HET 0 ATP N1 N 1.760 -0.750 8.140 - HET 0 ATP C2 C 0.640 -1.350 7.780 - HET 0 ATP N3 N 0.090 -1.180 6.600 - HET 0 ATP C4 C 0.640 -0.370 5.700 - HET 0 ATP H5'1 H -2.680 0.310 -0.300 - HET 0 ATP H5'2 H -1.260 1.260 0.220 - HET 0 ATP H4' H -2.280 -1.300 1.550 - HET 0 ATP H3' H -2.650 1.650 2.080 - HET 0 ATP HO3' H -4.520 0.790 3.090 - HET 0 ATP H2' H -1.650 1.540 4.160 - HET 0 ATP HO2' H -3.670 0.660 4.870 - HET 0 ATP H1' H -1.120 -1.430 3.930 - HET 0 ATP H8 H 1.330 1.360 3.040 - HET 0 ATP HN61 H 3.940 0.550 8.560 - HET 0 ATP HN62 H 4.020 1.300 7.060 - HET 0 ATP H2 H 0.170 -2.010 8.490 + HET 0 ATP O5' O -0.844 -0.587 -0.604 + HET 0 ATP C5' C -1.694 0.260 0.170 + HET 0 ATP C4' C -1.831 -0.309 1.584 + HET 0 ATP O4' O -0.542 -0.355 2.234 + HET 0 ATP C3' C -2.683 0.630 2.465 + HET 0 ATP O3' O -4.033 0.165 2.534 + HET 0 ATP C2' C -2.011 0.555 3.856 + HET 0 ATP O2' O -2.926 0.043 4.827 + HET 0 ATP C1' C -0.830 -0.418 3.647 + HET 0 ATP N9 N 0.332 0.015 4.425 + HET 0 ATP C8 C 1.302 0.879 4.012 + HET 0 ATP N7 N 2.184 1.042 4.955 + HET 0 ATP C5 C 1.833 0.300 6.033 + HET 0 ATP C6 C 2.391 0.077 7.303 + HET 0 ATP N6 N 3.564 0.706 7.681 + HET 0 ATP N1 N 1.763 -0.747 8.135 + HET 0 ATP C2 C 0.644 -1.352 7.783 + HET 0 ATP N3 N 0.088 -1.178 6.602 + HET 0 ATP C4 C 0.644 -0.371 5.704 + HET 0 ATP H5'1 H -2.678 0.312 -0.296 + HET 0 ATP H5'2 H -1.263 1.259 0.221 + HET 0 ATP H4' H -2.275 -1.304 1.550 + HET 0 ATP H3' H -2.651 1.649 2.078 + HET 0 ATP HO3' H -4.515 
0.788 3.094 + HET 0 ATP H2' H -1.646 1.537 4.157 + HET 0 ATP HO2' H -3.667 0.662 4.867 + HET 0 ATP H1' H -1.119 -1.430 3.931 + HET 0 ATP H8 H 1.334 1.357 3.044 + HET 0 ATP HN61 H 3.938 0.548 8.562 + HET 0 ATP HN62 H 4.015 1.303 7.064 + HET 0 ATP H2 H 0.166 -2.014 8.490 """ if array.bonds is None: diff --git a/tests/structure/test_info.py b/tests/structure/test_info.py index 2d823aaf1..90b9cbc90 100644 --- a/tests/structure/test_info.py +++ b/tests/structure/test_info.py @@ -8,10 +8,30 @@ import pytest import biotite.structure as struc import biotite.structure.info as strucinfo +import biotite.structure.io.pdbx as pdbx +from biotite.structure.info.ccd import _CCD_FILE as INTERNAL_CCD_FILE from biotite.structure.io import load_structure from tests.util import data_dir +@pytest.fixture +def fake_ccd_path(tmp_path): + block = pdbx.BinaryCIFBlock() + block["chem_comp"] = pdbx.BinaryCIFCategory({"id": "FOO", "name": "Foo"}) + block["chem_comp_atom"] = pdbx.BinaryCIFCategory({"comp_id": "FOO"}) + block["chem_comp_bond"] = pdbx.BinaryCIFCategory({"comp_id": "FOO"}) + file = pdbx.BinaryCIFFile() + file["components"] = block + original_path = INTERNAL_CCD_FILE + path = tmp_path / "components.bcif" + file.write(path) + + yield path + + # Restore the original internal CCD path + strucinfo.set_ccd_path(original_path) + + @pytest.mark.parametrize( "function, included, excluded", [ @@ -159,3 +179,16 @@ def test_standardize_order(multi_model, seed): assert ( restored[..., restored.element != "H"] == original[..., original.element != "H"] ) + + +def test_set_ccd_path(fake_ccd_path): + """ + Test if the CCD path can be set and the CCD is loaded correctly from it. + """ + # Access CCD before setting it to a new path to check if the cache is cleared + strucinfo.all_residues() + + strucinfo.set_ccd_path(fake_ccd_path) + + # The new fake CCD has only a single compound + assert strucinfo.all_residues() == ["FOO"]