Skip to content

Commit

Permalink
Compute compound groups on-the-fly
Browse files Browse the repository at this point in the history
  • Loading branch information
padix-key committed Nov 2, 2024
1 parent b0e9fe8 commit de688e1
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 80 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ htmlcov
/src/biotite/version.py

# Ignore internal CCD
/src/biotite/structure/info/ccd/
/src/biotite/structure/info/components.bcif

# Ignore autogenerated documentation files
/doc/static/switcher.json
Expand Down
71 changes: 5 additions & 66 deletions setup_ccd.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,42 +7,10 @@
import requests
from biotite.structure.io.pdbx import *

TARGET_DIR = Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd"
OUTPUT_CCD = (
Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "components.bcif"
)
CCD_URL = "https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
# Maps each super-group name to the ``chem_comp`` type strings belonging to it.
# The main script passes each list to ``group_components()``, which compares
# the type strings case-insensitively, and uses the key as the output file stem.
COMPONENT_GROUPS = {
"amino_acids": [
"D-beta-peptide, C-gamma linking",
"D-gamma-peptide, C-delta linking",
"D-peptide COOH carboxy terminus",
"D-peptide NH3 amino terminus",
"D-peptide linking",
"L-beta-peptide, C-gamma linking",
"L-gamma-peptide, C-delta linking",
"L-peptide COOH carboxy terminus",
"L-peptide NH3 amino terminus",
"L-peptide linking",
"peptide linking",
],
"nucleotides": [
"DNA OH 3 prime terminus",
"DNA OH 5 prime terminus",
"DNA linking",
"L-DNA linking",
"L-RNA linking",
"RNA OH 3 prime terminus",
"RNA OH 5 prime terminus",
"RNA linking",
],
"carbohydrates": [
"D-saccharide",
"D-saccharide, alpha linking",
"D-saccharide, beta linking",
"L-saccharide",
"L-saccharide, alpha linking",
"L-saccharide, beta linking",
"saccharide",
],
}


def concatenate_ccd(categories=None):
Expand Down Expand Up @@ -81,29 +49,6 @@ def concatenate_ccd(categories=None):
return compressed_file


def group_components(ccd, match_types):
"""
Identify component IDs that match a *given component type* from the given file.
Parameters
----------
ccd : BinaryCIFFile
The file to look into.
match_types : list of str
The component types to extract.
Returns
-------
comp_ids : list of str
The extracted component IDs.
"""
category = ccd.block["chem_comp"]
comp_ids = category["id"].as_array()
types = category["type"].as_array()
# Lowercase both sides so that the type comparison ignores case
return comp_ids[np.isin(np.char.lower(types), np.char.lower(match_types))].tolist()


def _concatenate_blocks_into_category(pdbx_file, category_name):
"""
Concatenate the given category from all blocks into a single
Expand Down Expand Up @@ -241,13 +186,7 @@ def _into_fitting_type(string_array, mask):

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")
TARGET_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_CCD.parent.mkdir(parents=True, exist_ok=True)

compressed_ccd = concatenate_ccd(["chem_comp", "chem_comp_atom", "chem_comp_bond"])
compressed_ccd.write(TARGET_DIR / "components.bcif")

for super_group, groups in COMPONENT_GROUPS.items():
logging.info(f"Identify all components belonging to '{super_group}' group...")
components = group_components(compressed_ccd, groups)
with open(TARGET_DIR / f"{super_group}.txt", "w") as file:
file.write("\n".join(components) + "\n")
compressed_ccd.write(OUTPUT_CCD)
8 changes: 4 additions & 4 deletions src/biotite/structure/info/ccd.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
from pathlib import Path
import numpy as np

CCD_DIR = Path(__file__).parent / "ccd"
SPECIAL_ID_COLUMN_NAMES = {
_CCD_FILE = Path(__file__).parent / "components.bcif"
_SPECIAL_ID_COLUMN_NAMES = {
"chem_comp": "id",
}
DEFAULT_ID_COLUMN_NAME = "comp_id"
_DEFAULT_ID_COLUMN_NAME = "comp_id"


@functools.cache
Expand Down Expand Up @@ -46,7 +46,7 @@ def get_ccd():
from biotite.structure.io.pdbx.bcif import BinaryCIFFile

try:
return BinaryCIFFile.read(CCD_DIR / "components.bcif").block
return BinaryCIFFile.read(_CCD_FILE).block
except FileNotFoundError:
raise RuntimeError(
"Internal CCD not found. Please run 'setup_ccd.py' and reinstall Biotite."
Expand Down
66 changes: 57 additions & 9 deletions src/biotite/structure/info/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,41 @@
__all__ = ["amino_acid_names", "nucleotide_names", "carbohydrate_names"]

import functools
from pathlib import Path

CCD_DIR = Path(__file__).parent / "ccd"
import numpy as np
from biotite.structure.info.ccd import get_ccd

# ``chem_comp`` type strings whose components are reported by
# ``amino_acid_names()``; ``_get_group_members()`` matches them ignoring case.
_AMINO_ACID_TYPES = [
"D-beta-peptide, C-gamma linking",
"D-gamma-peptide, C-delta linking",
"D-peptide COOH carboxy terminus",
"D-peptide NH3 amino terminus",
"D-peptide linking",
"L-beta-peptide, C-gamma linking",
"L-gamma-peptide, C-delta linking",
"L-peptide COOH carboxy terminus",
"L-peptide NH3 amino terminus",
"L-peptide linking",
"peptide linking",
]
# ``chem_comp`` type strings whose components are reported by
# ``nucleotide_names()``.
_NUCLEOTIDE_TYPES = [
"DNA OH 3 prime terminus",
"DNA OH 5 prime terminus",
"DNA linking",
"L-DNA linking",
"L-RNA linking",
"RNA OH 3 prime terminus",
"RNA OH 5 prime terminus",
"RNA linking",
]
# ``chem_comp`` type strings whose components are reported by
# ``carbohydrate_names()``.
_CARBOHYDRATE_TYPES = [
"D-saccharide",
"D-saccharide, alpha linking",
"D-saccharide, beta linking",
"L-saccharide",
"L-saccharide, alpha linking",
"L-saccharide, beta linking",
"saccharide",
]


@functools.cache
Expand All @@ -31,7 +63,7 @@ def amino_acid_names():
.. footbibliography::
"""
return _get_group_members("amino_acids")
return _get_group_members(_AMINO_ACID_TYPES)


@functools.cache
Expand All @@ -53,7 +85,7 @@ def nucleotide_names():
.. footbibliography::
"""
return _get_group_members("nucleotides")
return _get_group_members(_NUCLEOTIDE_TYPES)


@functools.cache
Expand All @@ -75,9 +107,25 @@ def carbohydrate_names():
.. footbibliography::
"""
return _get_group_members("carbohydrates")
return _get_group_members(_CARBOHYDRATE_TYPES)


def _get_group_members(match_types):
"""
Identify component IDs that match a given component *type* from the CCD.
Parameters
----------
match_types : list of str
The component types to extract.
def _get_group_members(group_name):
with open(CCD_DIR / f"{group_name}.txt", "r") as file:
return tuple(file.read().split())
Returns
-------
comp_ids : list of str
The extracted component IDs.
"""
category = get_ccd()["chem_comp"]
comp_ids = category["id"].as_array()
types = category["type"].as_array()
# Lowercase both sides so that the type comparison ignores case
# NOTE(review): the earlier file-based version returned a tuple, while this
# returns a list. The public ``*_names()`` callers are wrapped in
# ``functools.cache`` and return this value directly, so a caller that mutates
# the list would alter the cached result - confirm whether a tuple is intended.
return comp_ids[np.isin(np.char.lower(types), np.char.lower(match_types))].tolist()

0 comments on commit de688e1

Please sign in to comment.