Compute compound groups on-the-fly

biotite-dev · Nov 2, 2024 · de688e1 · de688e1
1 parent b0e9fe8
commit de688e1
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 80 deletions.
diff --git a/.gitignore b/.gitignore
@@ -21,7 +21,7 @@ htmlcov
 /src/biotite/version.py
 
 # Ignore internal CCD
-/src/biotite/structure/info/ccd/
+/src/biotite/structure/info/components.bcif
 
 # Ignore autogenerated documentation files
 /doc/static/switcher.json

diff --git a/setup_ccd.py b/setup_ccd.py
@@ -7,42 +7,10 @@
 import requests
 from biotite.structure.io.pdbx import *
 
-TARGET_DIR = Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd"
+OUTPUT_CCD = (
+    Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "components.bcif"
+)
 CCD_URL = "https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
-COMPONENT_GROUPS = {
-    "amino_acids": [
-        "D-beta-peptide, C-gamma linking",
-        "D-gamma-peptide, C-delta linking",
-        "D-peptide COOH carboxy terminus",
-        "D-peptide NH3 amino terminus",
-        "D-peptide linking",
-        "L-beta-peptide, C-gamma linking",
-        "L-gamma-peptide, C-delta linking",
-        "L-peptide COOH carboxy terminus",
-        "L-peptide NH3 amino terminus",
-        "L-peptide linking",
-        "peptide linking",
-    ],
-    "nucleotides": [
-        "DNA OH 3 prime terminus",
-        "DNA OH 5 prime terminus",
-        "DNA linking",
-        "L-DNA linking",
-        "L-RNA linking",
-        "RNA OH 3 prime terminus",
-        "RNA OH 5 prime terminus",
-        "RNA linking",
-    ],
-    "carbohydrates": [
-        "D-saccharide",
-        "D-saccharide, alpha linking",
-        "D-saccharide, beta linking",
-        "L-saccharide",
-        "L-saccharide, alpha linking",
-        "L-saccharide, beta linking",
-        "saccharide",
-    ],
-}
 
 
 def concatenate_ccd(categories=None):
@@ -81,29 +49,6 @@ def concatenate_ccd(categories=None):
     return compressed_file
 
 
-def group_components(ccd, match_types):
-    """
-    Identify component IDs that matches a *given component type* from the given file.
-
-    Parameters
-    ----------
-    ccd : BinaryCIFFile
-        The file to look into-
-    match_types : list of str
-        The component types to extract.
-
-    Returns
-    -------
-    comp_ids : list of str
-        The extracted component IDs.
-    """
-    category = ccd.block["chem_comp"]
-    comp_ids = category["id"].as_array()
-    types = category["type"].as_array()
-    # Ignore case
-    return comp_ids[np.isin(np.char.lower(types), np.char.lower(match_types))].tolist()
-
-
 def _concatenate_blocks_into_category(pdbx_file, category_name):
     """
     Concatenate the given category from all blocks into a single
@@ -241,13 +186,7 @@ def _into_fitting_type(string_array, mask):
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")
-    TARGET_DIR.mkdir(parents=True, exist_ok=True)
+    OUTPUT_CCD.parent.mkdir(parents=True, exist_ok=True)
 
     compressed_ccd = concatenate_ccd(["chem_comp", "chem_comp_atom", "chem_comp_bond"])
-    compressed_ccd.write(TARGET_DIR / "components.bcif")
-
-    for super_group, groups in COMPONENT_GROUPS.items():
-        logging.info(f"Identify all components belonging to '{super_group}' group...")
-        components = group_components(compressed_ccd, groups)
-        with open(TARGET_DIR / f"{super_group}.txt", "w") as file:
-            file.write("\n".join(components) + "\n")
+    compressed_ccd.write(OUTPUT_CCD)
diff --git a/src/biotite/structure/info/ccd.py b/src/biotite/structure/info/ccd.py
@@ -10,11 +10,11 @@
 from pathlib import Path
 import numpy as np
 
-CCD_DIR = Path(__file__).parent / "ccd"
-SPECIAL_ID_COLUMN_NAMES = {
+_CCD_FILE = Path(__file__).parent / "components.bcif"
+_SPECIAL_ID_COLUMN_NAMES = {
     "chem_comp": "id",
 }
-DEFAULT_ID_COLUMN_NAME = "comp_id"
+_DEFAULT_ID_COLUMN_NAME = "comp_id"
 
 
 @functools.cache
@@ -46,7 +46,7 @@ def get_ccd():
     from biotite.structure.io.pdbx.bcif import BinaryCIFFile
 
     try:
-        return BinaryCIFFile.read(CCD_DIR / "components.bcif").block
+        return BinaryCIFFile.read(_CCD_FILE).block
     except FileNotFoundError:
         raise RuntimeError(
             "Internal CCD not found. Please run 'setup_ccd.py' and reinstall Biotite."

diff --git a/src/biotite/structure/info/groups.py b/src/biotite/structure/info/groups.py
@@ -7,9 +7,41 @@
 __all__ = ["amino_acid_names", "nucleotide_names", "carbohydrate_names"]
 
 import functools
-from pathlib import Path
-
-CCD_DIR = Path(__file__).parent / "ccd"
+import numpy as np
+from biotite.structure.info.ccd import get_ccd
+
+_AMINO_ACID_TYPES = [
+    "D-beta-peptide, C-gamma linking",
+    "D-gamma-peptide, C-delta linking",
+    "D-peptide COOH carboxy terminus",
+    "D-peptide NH3 amino terminus",
+    "D-peptide linking",
+    "L-beta-peptide, C-gamma linking",
+    "L-gamma-peptide, C-delta linking",
+    "L-peptide COOH carboxy terminus",
+    "L-peptide NH3 amino terminus",
+    "L-peptide linking",
+    "peptide linking",
+]
+_NUCLEOTIDE_TYPES = [
+    "DNA OH 3 prime terminus",
+    "DNA OH 5 prime terminus",
+    "DNA linking",
+    "L-DNA linking",
+    "L-RNA linking",
+    "RNA OH 3 prime terminus",
+    "RNA OH 5 prime terminus",
+    "RNA linking",
+]
+_CARBOHYDRATE_TYPES = [
+    "D-saccharide",
+    "D-saccharide, alpha linking",
+    "D-saccharide, beta linking",
+    "L-saccharide",
+    "L-saccharide, alpha linking",
+    "L-saccharide, beta linking",
+    "saccharide",
+]
 
 
 @functools.cache
@@ -31,7 +63,7 @@ def amino_acid_names():
     .. footbibliography::
 
     """
-    return _get_group_members("amino_acids")
+    return _get_group_members(_AMINO_ACID_TYPES)
 
 
 @functools.cache
@@ -53,7 +85,7 @@ def nucleotide_names():
     .. footbibliography::
 
     """
-    return _get_group_members("nucleotides")
+    return _get_group_members(_NUCLEOTIDE_TYPES)
 
 
 @functools.cache
@@ -75,9 +107,25 @@ def carbohydrate_names():
     .. footbibliography::
 
     """
-    return _get_group_members("carbohydrates")
+    return _get_group_members(_CARBOHYDRATE_TYPES)
+
+
+def _get_group_members(match_types):
+    """
+    Identify component IDs that match one of the given component *types* in the CCD.
 
+    Parameters
+    ----------
+    match_types : list of str
+        The component types to extract, compared case-insensitively.
 
-def _get_group_members(group_name):
-    with open(CCD_DIR / f"{group_name}.txt", "r") as file:
-        return tuple(file.read().split())
+    Returns
+    -------
+    comp_ids : tuple of str
+        The extracted component IDs; immutable, as callers cache and share them.
+    """
+    category = get_ccd()["chem_comp"]
+    # Ignore case
+    types = np.char.lower(category["type"].as_array())
+    is_member = np.isin(types, np.char.lower(match_types))
+    return tuple(category["id"].as_array()[is_member].tolist())