bytedance · taivu1998 · May 10, 2026
diff --git a/docs/infer_json_format.md b/docs/infer_json_format.md
@@ -20,10 +20,16 @@ Here is an overview of the JSON file format:
 The JSON file consists of a list of dictionaries, where each dictionary represents a set of sequences you want to model. 
 Even if you are modeling only one set of sequences, the top-level structure should still be a list.
 
-Each dictionary contains the following three keys:
+Each dictionary contains the following keys:
 * `name`: A string representing the name of the inference job.
 * `sequences`: A list of dictionaries that describe the entities (e.g., proteins, DNA, RNA, small molecules, and ions) involved in the inference.
 * `covalent_bonds`: An optional list of dictionaries that define the covalent bonds between atoms from different entities.
+* `userCCD`: Optional inline CCD mmCIF text that defines additional chemical components for this job.
+* `userCCDPath`: Optional path to a CCD mmCIF file that defines additional chemical components for this job. Relative paths are resolved from the input JSON file location.
+
+`userCCD` and `userCCDPath` are mutually exclusive: specify at most one of them per job. Use them when a modification or CCD ligand is not present in the default Protenix CCD cache. For example, a custom protein PTM can be defined in a user CCD file and referenced from `proteinChain.modifications` via `ptmType: "CCD_<your_code>"`, where `<your_code>` is the ID of the component defined in that file (e.g., `CCD_UAA` for a component with ID `UAA`).
+
+The user CCD component must define every atom name referenced by the PTM or covalent-bond fields. Protein PTM components should be peptide-like CCD entries with standard backbone atom names such as `N`, `CA`, `C`, and `O`, and with the leaving-atom flag (`_chem_comp_atom.pdbx_leaving_atom_flag`) set correctly for atoms such as `OXT` when present.
 
 Details of `sequences` and `covalent_bonds` are provided below.
 
@@ -395,4 +401,4 @@ The contents of each output file are as follows:
     - `has_clash` - Boolean flag indicating if there are steric clashes in the predicted structure.
     - `disorder` - Predicted regions of intrinsic disorder within the protein, highlighting residues that may be flexible or unstructured.
     - `ranking_score` - Predicted confidence score for ranking complexes. Higher values indicate greater confidence.
-    - `num_recycles`: Number of recycling steps used during inference.
+    - `num_recycles` - Number of recycling steps used during inference.
diff --git a/examples/custom_ptm_components.cif b/examples/custom_ptm_components.cif
@@ -0,0 +1,66 @@
+data_UAA
+#
+_chem_comp.id                                    UAA
+_chem_comp.name                                  'USER DEFINED AMINO ACID'
+_chem_comp.type                                  'L-PEPTIDE LINKING'
+_chem_comp.pdbx_type                             ATOMP
+_chem_comp.formula                               'C3 H7 N O2'
+_chem_comp.mon_nstd_parent_comp_id               ?
+_chem_comp.pdbx_synonyms                         ?
+_chem_comp.pdbx_formal_charge                    0
+_chem_comp.pdbx_initial_date                     2026-05-10
+_chem_comp.pdbx_modified_date                    2026-05-10
+_chem_comp.pdbx_ambiguous_flag                   N
+_chem_comp.pdbx_release_status                   REL
+_chem_comp.pdbx_replaced_by                      ?
+_chem_comp.pdbx_replaces                         ?
+_chem_comp.formula_weight                        89.094
+_chem_comp.one_letter_code                       K
+_chem_comp.three_letter_code                     UAA
+_chem_comp.pdbx_model_coordinates_db_code        ?
+_chem_comp.pdbx_model_coordinates_details        ?
+_chem_comp.pdbx_ideal_coordinates_details        ?
+_chem_comp.pdbx_ideal_coordinates_missing_flag   N
+_chem_comp.pdbx_model_coordinates_missing_flag   N
+_chem_comp.pdbx_processing_site                  ?
+#
+loop_
+_chem_comp_atom.comp_id
+_chem_comp_atom.atom_id
+_chem_comp_atom.alt_atom_id
+_chem_comp_atom.type_symbol
+_chem_comp_atom.charge
+_chem_comp_atom.pdbx_align
+_chem_comp_atom.pdbx_aromatic_flag
+_chem_comp_atom.pdbx_leaving_atom_flag
+_chem_comp_atom.pdbx_stereo_config
+_chem_comp_atom.model_Cartn_x
+_chem_comp_atom.model_Cartn_y
+_chem_comp_atom.model_Cartn_z
+_chem_comp_atom.pdbx_model_Cartn_x_ideal
+_chem_comp_atom.pdbx_model_Cartn_y_ideal
+_chem_comp_atom.pdbx_model_Cartn_z_ideal
+_chem_comp_atom.pdbx_component_atom_id
+_chem_comp_atom.pdbx_component_comp_id
+_chem_comp_atom.pdbx_ordinal
+UAA N   N   N 0 1 N N N  0.000  0.000  0.000  0.000  0.000  0.000 N   UAA 1
+UAA CA  CA  C 0 1 N N N  1.450  0.000  0.000  1.450  0.000  0.000 CA  UAA 2
+UAA C   C   C 0 1 N N N  2.020  1.410  0.000  2.020  1.410  0.000 C   UAA 3
+UAA O   O   O 0 1 N N N  1.320  2.400  0.000  1.320  2.400  0.000 O   UAA 4
+UAA CB  CB  C 0 1 N N N  1.980 -0.780 -1.200  1.980 -0.780 -1.200 CB  UAA 5
+UAA OXT OXT O 0 1 N Y N  3.250  1.560  0.000  3.250  1.560  0.000 OXT UAA 6
+#
+loop_
+_chem_comp_bond.comp_id
+_chem_comp_bond.atom_id_1
+_chem_comp_bond.atom_id_2
+_chem_comp_bond.value_order
+_chem_comp_bond.pdbx_aromatic_flag
+_chem_comp_bond.pdbx_stereo_config
+_chem_comp_bond.pdbx_ordinal
+UAA N  CA  SING N N 1
+UAA CA C   SING N N 2
+UAA C  O   DOUB N N 3
+UAA CA CB  SING N N 4
+UAA C  OXT SING N N 5
+#
diff --git a/examples/example_custom_ptm_userccd.json b/examples/example_custom_ptm_userccd.json
@@ -0,0 +1,20 @@
+[
+  {
+    "name": "custom_ptm_userccd",
+    "userCCDPath": "custom_ptm_components.cif",
+    "sequences": [
+      {
+        "proteinChain": {
+          "sequence": "AKT",
+          "count": 1,
+          "modifications": [
+            {
+              "ptmType": "CCD_UAA",
+              "ptmPosition": 2
+            }
+          ]
+        }
+      }
+    ]
+  }
+]
diff --git a/protenix/data/core/ccd.py b/protenix/data/core/ccd.py
@@ -17,7 +17,7 @@
 import pickle
 from collections import defaultdict
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import biotite
 import biotite.structure as struc
@@ -310,7 +310,9 @@ def get_ccd_ref_info(
 
 # Modified from biotite to use consistent ccd components file
 def _connect_inter_residue(
-    atoms: AtomArray, residue_starts: np.ndarray
+    atoms: AtomArray,
+    residue_starts: np.ndarray,
+    get_mol_type_fn: Callable[[str], str] = get_mol_type,
 ) -> struc.BondList:
     """
     Create a :class:`BondList` containing the bonds between adjacent
@@ -355,8 +357,8 @@ def _connect_inter_residue(
             continue
 
         # Get link type for this residue from RCSB components.cif
-        curr_link = get_mol_type(res_names[curr_start_i])
-        next_link = get_mol_type(res_names[next_start_i])
+        curr_link = get_mol_type_fn(res_names[curr_start_i])
+        next_link = get_mol_type_fn(res_names[next_start_i])
 
         if curr_link == "protein" and next_link in "protein":
             curr_connect_atom_name = "C"