diff --git a/Project.toml b/Project.toml index e0da862..3ba17ca 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MoleculeFlow" uuid = "0907176b-47b3-4e96-be91-0e9bb91fb14f" -authors = ["Renée Gil"] -version = "0.4.0" +authors = ["Renee Gil"] +version = "0.5.0" [deps] CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab" diff --git a/docs/make.jl b/docs/make.jl index d4c0512..fa219b7 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -26,6 +26,7 @@ makedocs(; "Drawing & Visualization" => "api/drawing.md", "Molecular Descriptors" => "api/descriptors.md", "Fingerprints" => "api/fingerprints.md", + "Pharmacophore Features" => "api/pharmacophore.md", "Substructure Search" => "api/substructure.md", "Atom Operations" => "api/atoms.md", "Bond Operations" => "api/bonds.md", diff --git a/docs/src/api/descriptors.md b/docs/src/api/descriptors.md index 256cbfb..75a9dfc 100644 --- a/docs/src/api/descriptors.md +++ b/docs/src/api/descriptors.md @@ -114,18 +114,19 @@ slogp_vsa calc_all_descriptors ``` -## Advanced Drug-like and ADMET Descriptors +## Quantitative Estimate of Drug-likeness (QED) -These descriptors are particularly important for drug discovery and ADMET (Absorption, Distribution, Metabolism, Excretion, Toxicity) prediction. +```@docs +qed +``` -### Drug-likeness Scores +## Synthetic Accessibility ```@docs -qed synthetic_accessibility ``` -### Molecular Complexity and 3D Character +## Molecular Complexity and 3D Character ```@docs fraction_csp3 @@ -133,18 +134,78 @@ labute_asa molar_refractivity ``` -## Advanced Ring and Structure Counts - -More detailed structural analysis beyond basic ring counts. +## Number of Aliphatic Carbocycles ```@docs num_aliphatic_carbocycles +``` + +## Number of Aromatic Carbocycles + +```@docs num_aromatic_carbocycles +``` + +## Number of Aromatic Heterocycles + +```@docs num_aromatic_heterocycles +``` + +## Number of Stereo Centres + +```@docs num_atom_stereo_centers +``` + +## Number of Amide Bonds + +```@docs num_amide_bonds ``` +## Additional Ring Type Counts + +```@docs +num_aliphatic_heterocycles +``` + +```@docs +num_saturated_heterocycles +``` + +```@docs +num_saturated_carbocycles +``` + +```@docs +num_aliphatic_rings +``` + +```@docs +num_heterocycles +``` + +## Additional Stereochemistry and Structure Counts + +```@docs +num_unspecified_atom_stereo_centers +``` + +```@docs +num_spiro_atoms +``` + +```@docs +num_bridgehead_atoms +``` + +## Molecular Complexity + +```@docs +hall_kier_alpha +``` + ## 3D Descriptors These descriptors require 3D coordinates to be present in the molecule. @@ -152,71 +213,55 @@ These descriptors require 3D coordinates to be present in the molecule. ```@docs asphericity radius_of_gyration +eccentricity +inertial_shape_factor ``` -## Examples +## Principal Moments of Inertia -### Drug Discovery Workflow - -```julia -using MoleculeFlow +```@docs +pmi1 +``` -# Analyze drug-like properties -compounds = [ - mol_from_smiles("CC(=O)OC1=CC=CC=C1C(=O)O"), # Aspirin - mol_from_smiles("CCO"), # Ethanol - mol_from_smiles("c1ccc2c(c1)nnnc2N") # Drug-like compound -] +```@docs +pmi2 +``` -for (i, mol) in enumerate(compounds) - println("Compound $i:") - println("QED score: ", qed(mol)) - println("SAscore: ", synthetic_accessibility(mol)) - println("Fsp3: ", fraction_csp3(mol)) - println("Aromatic rings: ", num_aromatic_carbocycles(mol)) - println("Stereocenters: ", num_atom_stereo_centers(mol)) - println() -end +```@docs +pmi3 ``` -### 3D Shape Analysis +## Spherocity Index -```julia -# Generate 3D conformer and analyze shape -mol = mol_from_smiles("CCO") -conformers = generate_3d_conformers(mol, 1) +```@docs +spherocity_index +``` -if !isempty(conformers) - mol_3d = conformers[1].molecule +## GETAWAY Descriptors - println("3D Shape Descriptors:") - println(" Asphericity: ", asphericity(mol_3d)) - println(" Radius of gyration: ", radius_of_gyration(mol_3d)) -end +```@docs +getaway_descriptors ``` -### Comprehensive Analysis +## WHIM Descriptors -```julia -# Calculate multiple advanced descriptors at once -mol = mol_from_smiles("CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F") +```@docs +whim_descriptors +``` -println("Comprehensive Descriptor Analysis:") -println("Drug-likeness (QED): ", qed(mol)) -println("Synthetic accessibility: ", synthetic_accessibility(mol)) -println("3D character (Fsp3): ", fraction_csp3(mol)) -println("Surface area (LabuteASA): ", labute_asa(mol)) -println("Molar refractivity: ", molar_refractivity(mol)) -println("Aromatic carbocycles: ", num_aromatic_carbocycles(mol)) -println("Aromatic heterocycles: ", num_aromatic_heterocycles(mol)) -println("Amide bonds: ", num_amide_bonds(mol)) +## RDF Descriptors + +```@docs +rdf_descriptors ``` -## Additional Molecular Connectivity Descriptors +## MORSE Descriptors -These descriptors provide detailed information about molecular connectivity and shape. +```@docs +morse_descriptors +``` -### Chi Connectivity Indices +## Chi Connectivity Indices ```@docs chi0n @@ -230,26 +275,226 @@ chi3v chi4v ``` -### Kappa Shape Descriptors +## Kappa Shape Descriptors ```@docs kappa2 kappa3 ``` -### E-State Descriptors +## E-State Descriptors ```@docs max_e_state_index min_e_state_index ``` -### Information Content +```@docs +max_absolute_e_state_index +``` + +```@docs +min_absolute_e_state_index +``` + +## Information Content ```@docs ipc ``` +## SlogP_VSA Descriptors + +Surface area contributions based on SlogP values. + +```@docs +slogp_vsa2 +``` + +```@docs +slogp_vsa3 +``` + +```@docs +slogp_vsa4 +``` + +```@docs +slogp_vsa5 +``` + +```@docs +slogp_vsa6 +``` + +```@docs +slogp_vsa7 +``` + +```@docs +slogp_vsa8 +``` + +```@docs +slogp_vsa9 +``` + +```@docs +slogp_vsa10 +``` + +```@docs +slogp_vsa11 +``` + +```@docs +slogp_vsa12 +``` + +## SMR_VSA Descriptors + +Surface area contributions based on Molar Refractivity values. + +```@docs +smr_vsa1 +``` + +```@docs +smr_vsa2 +``` + +```@docs +smr_vsa3 +``` + +```@docs +smr_vsa4 +``` + +```@docs +smr_vsa5 +``` + +```@docs +smr_vsa6 +``` + +```@docs +smr_vsa7 +``` + +```@docs +smr_vsa8 +``` + +```@docs +smr_vsa9 +``` + +```@docs +smr_vsa10 +``` + +## PEOE_VSA Descriptors + +Surface area contributions based on Partial Equalization of Orbital Electronegativities (PEOE) charges. + +```@docs +peoe_vsa1 +``` + +```@docs +peoe_vsa2 +``` + +```@docs +peoe_vsa3 +``` + +```@docs +peoe_vsa4 +``` + +```@docs +peoe_vsa5 +``` + +```@docs +peoe_vsa6 +``` + +```@docs +peoe_vsa7 +``` + +```@docs +peoe_vsa8 +``` + +```@docs +peoe_vsa9 +``` + +```@docs +peoe_vsa10 +``` + +```@docs +peoe_vsa11 +``` + +```@docs +peoe_vsa12 +``` + +```@docs +peoe_vsa13 +``` + +```@docs +peoe_vsa14 +``` + +## BCUT2D Molecular Weight + +```@docs +bcut2d_mwlow +``` + +```@docs +bcut2d_mwhi +``` + +## BCUT2D Partial Charge + +```@docs +bcut2d_chglow +``` + +```@docs +bcut2d_chghi +``` + +## BCUT2D LogP + +```@docs +bcut2d_logplow +``` + +```@docs +bcut2d_logphi +``` + +## BCUT2D Molar Refractivity + +```@docs +bcut2d_mrlow +``` + +```@docs +bcut2d_mrhi +``` + ## Atom Counts Simple but useful atom counting functions for chemical analysis. @@ -262,40 +507,7 @@ num_sulfurs num_halogens ``` -## Get Address - -```@docs -get_address -``` - -## Examples - -### Molecular Connectivity Analysis - -```julia -using MoleculeFlow - -# Analyze molecular connectivity for drug-like molecules -compounds = [ - mol_from_smiles("CC(=O)OC1=CC=CC=C1C(=O)O"), # Aspirin - mol_from_smiles("CCO"), # Ethanol - mol_from_smiles("c1ccc2c(c1)nnnc2N") # Triazole compound -] - -for (i, mol) in enumerate(compounds) - println("Compound $i:") - println("Chi0n: ", chi0n(mol)) - println("Chi1v: ", chi1v(mol)) - println("Kappa2: ", kappa2(mol)) - println("Max E-state: ", max_e_state_index(mol)) - println("Carbons: ", num_carbons(mol)) - println("Nitrogens: ", num_nitrogens(mol)) - println("Halogens: ", num_halogens(mol)) - println() -end -``` - -### Working with Multiple Molecules +## Working with Multiple Molecules Most descriptor functions in MoleculeFlow can accept vectors of molecules directly for efficient batch processing: @@ -314,3 +526,4 @@ println("LogP values: ", logp_values) println("Chi1v values: ", chi_values) println("Carbon counts: ", carbon_counts) ``` + diff --git a/docs/src/api/fragmentation.md b/docs/src/api/fragmentation.md index d1c5fc2..fea2955 100644 --- a/docs/src/api/fragmentation.md +++ b/docs/src/api/fragmentation.md @@ -127,24 +127,4 @@ println("Custom fragments: ", fragments) # Size-constrained BRICS fragmentation fragments = brics_decompose(mol, min_fragment_size=3, max_fragment_size=8) println("Size-constrained fragments: ", fragments) -``` - -### Vectorized Operations - -```julia -# Process multiple molecules at once -smiles_list = ["CCCOCCc1ccccc1", "CC(=O)NCCc1ccccc1", "CCCCCC"] -mols = mol_from_smiles(smiles_list) - -# Get all BRICS fragments -all_brics = brics_decompose(mols) -for (i, frags) in enumerate(all_brics) - println("Molecule $i BRICS: ", frags) -end - -# Get all scaffolds -all_scaffolds = get_murcko_scaffold(mols) -for (i, scaffold) in enumerate(all_scaffolds) - println("Molecule $i scaffold: ", scaffold) -end -``` +``` \ No newline at end of file diff --git a/docs/src/api/operations.md b/docs/src/api/operations.md index 883d69b..2468408 100644 --- a/docs/src/api/operations.md +++ b/docs/src/api/operations.md @@ -4,201 +4,111 @@ CurrentModule = MoleculeFlow # Molecular Operations -Advanced molecular manipulation, editing, and analysis functions. +Other molecular manipulation, editing, and analysis functions. -## Hydrogen Manipulation - -Functions for adding and removing explicit hydrogens from molecules. - -### Adding Hydrogens +## Adding Hydrogens ```@docs add_hs ``` -### Removing Hydrogens +## Removing Hydrogens ```@docs remove_hs ``` -## Molecular Editing - -Functions for modifying molecular structures programmatically. - -### Combining Molecules +## Combining Molecules ```@docs combine_mols ``` -### Substructure Modifications +## Substructure Modifications ```@docs delete_substructs replace_substructs ``` -## Stereochemistry Operations - -Functions for analyzing and manipulating molecular stereochemistry. - -### Stereochemistry Assignment +## Stereochemistry Assignment ```@docs assign_stereochemistry! ``` -### Chiral Center Analysis +## Chiral Center Analysis ```@docs find_chiral_centers ``` -## Ring Analysis - -Functions for analyzing ring systems and molecular topology. - -### Ring Detection +## Ring Detection ```@docs fast_find_rings! ``` -### Atom Ranking +## Ring Information ```@docs -canonical_atom_ranks +get_ring_info ``` -## Pattern Matching - -Advanced pattern matching and substructure analysis. - -### SMARTS Matching +## Is Ring Aromatic ```@docs -quick_smarts_match +is_ring_aromatic ``` -### Fragment Analysis +## Atom Ranking ```@docs -mol_fragment_to_smarts +canonical_atom_ranks ``` -## Examples - -### Working with Hydrogens +## Atom Environment Analysis -```julia -using MoleculeFlow - -# Start with a simple molecule -mol = mol_from_smiles("CCO") -println("Original: ", mol_to_smiles(mol)) - -# Add explicit hydrogens -mol_with_hs = add_hs(mol) -println("With H's: ", mol_to_smiles(mol_with_hs)) - -# Remove explicit hydrogens -mol_clean = remove_hs(mol_with_hs) -println("Clean: ", mol_to_smiles(mol_clean)) +```@docs +find_atom_environment ``` -### Molecular Editing +## SMARTS Matching -```julia -# Combine two molecules -ethanol = mol_from_smiles("CCO") -propane = mol_from_smiles("CCC") -combined = combine_mols(ethanol, propane) - -# Remove alcohol groups -benzyl_alcohol = mol_from_smiles("c1ccc(CO)cc1") -benzene_derivative = delete_substructs(benzyl_alcohol, "[OH]") +```@docs +quick_smarts_match ``` -### Pattern Matching +## Fragment Analysis -```julia -# Check for functional groups -mol = mol_from_smiles("CCO") - -has_alcohol = quick_smarts_match(mol, "[OH]") # true -has_amine = quick_smarts_match(mol, "[NH2]") # false -has_carbon = quick_smarts_match(mol, "[C]") # true +```@docs +mol_fragment_to_smarts ``` -### Stereochemistry Analysis - -```julia -# Work with chiral molecules -chiral_mol = mol_from_smiles("C[C@H](O)C") - -# Assign stereochemistry -assign_stereochemistry!(chiral_mol) - -# Find chiral centers -centers = find_chiral_centers(chiral_mol) -for (atom_idx, chirality) in centers - println("Atom $atom_idx: $chirality") -end +```@docs +mol_fragment_to_smiles ``` -### Advanced I/O - -```julia -# Generate InChI keys for database storage -mol = mol_from_smiles("CCO") -inchi_key = mol_to_inchi_key(mol) -println("InChI Key: ", inchi_key) +## Atom Renumbering -# Export as MOL block -molblock = mol_to_molblock(mol) -println("MOL Block:") -println(molblock) +```@docs +renumber_atoms ``` -### Working with 3D Coordinates +## Remove Stereochemistry -```julia -# Load molecule from XYZ file with 3D coordinates -mol_3d = mol_from_xyz_file("molecule.xyz") - -# Or create XYZ format from existing molecule with 3D coordinates -mol = mol_from_smiles("CCO") -conformers = generate_3d_conformers(mol, 1) -if !isempty(conformers) - mol_with_coords = conformers[1].molecule - xyz_string = mol_to_xyz_block(mol_with_coords) - println("XYZ Format:") - println(xyz_string) -end +```@docs +remove_stereochemistry! ``` -### MOL2 Format for Pharmacophore Modeling +## Sanitize Molecule -```julia -# Load ligand from MOL2 file (preserves charges and atom types) -ligand = mol_from_mol2_file("drug_compound.mol2") - -if ligand.valid - println("Loaded ligand: ", mol_to_smiles(ligand)) - println("Heavy atoms: ", heavy_atom_count(ligand)) - - # MOL2 format preserves partial charges for QSAR studies - atoms = get_atoms(ligand) - for atom in atoms[1:3] # Show first 3 atoms - println("Atom ", get_symbol(atom), " charge: ", get_formal_charge(atom)) - end -end +```@docs +sanitize_mol! ``` -## Notes +## Computer 2D coordinates -- Many functions modify molecules in-place (indicated by `!` suffix) for performance -- Functions return `false` or empty results when operations fail -- Invalid molecules are handled gracefully without throwing exceptions -- All new functions integrate seamlessly with existing MoleculeFlow.jl workflows \ No newline at end of file +```@docs +compute_2d_coords! +``` \ No newline at end of file diff --git a/docs/src/api/pharmacophore.md b/docs/src/api/pharmacophore.md new file mode 100644 index 0000000..bbec941 --- /dev/null +++ b/docs/src/api/pharmacophore.md @@ -0,0 +1,255 @@ +```@meta +CurrentModule = MoleculeFlow +``` + +# Pharmacophore Features + +Functions for pharmacophore analysis, chemical feature identification, and pharmacophore fingerprint generation. + +## Overview + +Pharmacophores represent the spatial arrangement of chemical features that are essential for molecular recognition. MoleculeFlow provides comprehensive pharmacophore functionality including: + +- Chemical feature identification (donors, acceptors, aromatic rings, etc.) +- 3D pharmacophore point extraction +- Pharmacophore fingerprint generation +- Feature-based molecular analysis + +## Types + +### FeatureFactory + +```@docs +FeatureFactory +``` + +### ChemicalFeature + +```@docs +ChemicalFeature +``` + +## Factory Creation + +### Default Feature Factory + +```@docs +create_feature_factory +``` + +## Feature Extraction + +### Molecular Features + +```@docs +get_mol_features +``` + +### Feature Families + +```@docs +get_feature_families +``` + +### Feature Filtering + +```@docs +filter_features_by_family +``` + +## Pharmacophore Analysis + +### 3D Pharmacophore Points + +```@docs +get_pharmacophore_3d +``` + +### Pharmacophore Fingerprints + +```@docs +pharmacophore_fingerprint +``` + +## Usage Examples + +### Basic Feature Identification + +```julia +using MoleculeFlow + +# Create a feature factory with default definitions +factory = create_feature_factory() + +# Analyze a molecule (features require coordinates) +mol = mol_from_smiles("c1ccccc1O") # Phenol +conformers_2d = generate_2d_conformers(mol) # Generate 2D coordinates + +if !isempty(conformers_2d) + mol_2d = conformers_2d[1].molecule + features = get_mol_features(mol_2d, factory) + + println("Found $(length(features)) features:") + for feature in features + println(" $(feature.family) at atoms $(feature.atom_ids)") + end +end +``` + +### Pharmacophore Fingerprints + +```julia +# Generate pharmacophore fingerprints for multiple molecules +molecules = [ + mol_from_smiles("CCO"), # Ethanol + mol_from_smiles("c1ccccc1O"), # Phenol + mol_from_smiles("CC(=O)N") # Acetamide +] + +# Note: Pharmacophore fingerprints work with molecules as-is +# (they generate 2D topological features) +fingerprints = pharmacophore_fingerprint(molecules) + +for (i, fp) in enumerate(fingerprints) + if fp !== missing + println("Molecule $(i): $(count(fp)) bits set in fingerprint of length $(length(fp))") + end +end +``` + +### 3D Pharmacophore Analysis + +```julia +# Analyze 3D pharmacophore points (requires 3D coordinates) +mol = mol_from_smiles("c1ccccc1O") # Phenol + +# Generate 3D conformer +conformers_3d = generate_3d_conformers(mol, 1) +if !isempty(conformers_3d) + mol_3d = conformers_3d[1].molecule + factory = create_feature_factory() + + # Extract 3D pharmacophore points + ph4_points = get_pharmacophore_3d(mol_3d, factory) + + println("3D Pharmacophore points:") + for (family, position) in ph4_points + println("$(family) at [$(position[1]), $(position[2]), $(position[3])]") + end +end +``` + +### Feature Family Analysis + +```julia +# Analyze specific feature families +mol = mol_from_smiles("CC(=O)Nc1ccccc1O") # N-acetyl-p-hydroxybenzamide + +# Generate 3D conformer for accurate feature positioning +conformers = generate_3d_conformers(mol, 1) +if !isempty(conformers) + mol_3d = conformers[1].molecule + factory = create_feature_factory() + features = get_mol_features(mol_3d, factory) + + # Get all hydrogen bond donors + donors = filter_features_by_family(features, "Donor") + println("Hydrogen bond donors: $(length(donors))") + + # Get all hydrogen bond acceptors + acceptors = filter_features_by_family(features, "Acceptor") + println("Hydrogen bond acceptors: $(length(acceptors))") + + # Get all aromatic features + aromatics = filter_features_by_family(features, "Aromatic") + println("Aromatic rings: $(length(aromatics))") +end +``` + +### Custom Feature Definitions + +```julia +# Define custom pharmacophore features +custom_fdef = raw""" +DefineFeature Carboxyl [CX3](=O)[OX2H1] + Family AcidicGroup + Weights 1.0,1.0,1.0 +EndFeature + +DefineFeature Amine [NX3;H2,H1;!$(NC=O)] + Family BasicGroup + Weights 1.0 +EndFeature +""" + +# Create factory with custom definitions +custom_factory = create_feature_factory( + use_default=false, + fdef_string=custom_fdef +) + +# Available families in custom factory +families = get_feature_families(custom_factory) +println("Custom feature families: $(families)") +``` + +### Drug-like Molecule Analysis + +```julia +# Analyze pharmacophore features of drug molecules +aspirin = mol_from_smiles("CC(=O)Oc1ccccc1C(=O)O") +ibuprofen = mol_from_smiles("CC(C)Cc1ccc(cc1)C(C)C(=O)O") + +factory = create_feature_factory() + +# Compare pharmacophore fingerprints +aspirin_fp = pharmacophore_fingerprint(aspirin) +ibuprofen_fp = pharmacophore_fingerprint(ibuprofen) + +if aspirin_fp isa Vector{Bool} && ibuprofen_fp isa Vector{Bool} + # Calculate Tanimoto similarity + similarity = tanimoto_similarity(aspirin_fp, ibuprofen_fp) + println("Pharmacophore similarity: $(similarity)") +end + +# Analyze individual features (need coordinates for feature extraction) +aspirin_2d = generate_2d_conformers(aspirin) +ibuprofen_2d = generate_2d_conformers(ibuprofen) + +if !isempty(aspirin_2d) && !isempty(ibuprofen_2d) + aspirin_features = get_mol_features(aspirin_2d[1].molecule, factory) + ibuprofen_features = get_mol_features(ibuprofen_2d[1].molecule, factory) + + println("Aspirin features:") + for feature in aspirin_features + println(" $(feature.family)") + end + + println("Ibuprofen features:") + for feature in ibuprofen_features + println(" $(feature.family)") + end +end +``` + +## Available Feature Families + +The default feature factory includes the following pharmacophore feature families: + +- **Donor**: Hydrogen bond donors +- **Acceptor**: Hydrogen bond acceptors +- **NegIonizable**: Negatively ionizable groups +- **PosIonizable**: Positively ionizable groups +- **ZnBinder**: Zinc binding groups +- **Aromatic**: Aromatic ring systems +- **Hydrophobe**: Hydrophobic regions +- **LumpedHydrophobe**: Larger hydrophobic regions + +## Notes + +- **Coordinate Requirements**: Feature extraction (`get_mol_features`) requires molecules to have 2D or 3D coordinates. Use `generate_2d_conformers()` or `generate_3d_conformers()` before feature extraction. +- **Pharmacophore Fingerprints**: Work directly with molecules (no coordinates required) and use 2D topological distances by default +- **3D Pharmacophore Analysis**: Requires molecules with 3D coordinates for accurate spatial positioning +- **Feature Identification**: Uses SMARTS patterns defined in the feature factory +- **Custom Definitions**: Custom feature definitions can be created using the RDKit feature definition language +- **Vectorized Operations**: All functions support vectorized operations for processing multiple molecules efficiently diff --git a/docs/src/api/substructure.md b/docs/src/api/substructure.md index a4e550e..7e50bcd 100644 --- a/docs/src/api/substructure.md +++ b/docs/src/api/substructure.md @@ -59,15 +59,3 @@ get_functional_groups ```@docs FUNCTIONAL_GROUPS ``` - -## Get Ring Info - -```@docs -get_ring_info -``` - -## Is Ring Aromatic - -```@docs -is_ring_aromatic -``` \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index afe0944..a05b87f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -8,8 +8,8 @@ MoleculeFlow.jl is a comprehensive Julia library for cheminformatics, providing !!! info This library is based on the [rdkit](https://github.com/rdkit/rdkit) and [PythonCall](https://github.com/JuliaPy/PythonCall.jl). Author sees little sense in trying to implement things from scratch, as rdkit has been, is, and very likely will continue being the benchmark cheminformatics library in the future. - The point of the library is two-fold: 1) Provide a pleasant and user-friendly julian interface that is very easy to extend and/or build upon using a reliable source library. 2) Provide cheminformatics functionality that is not straightforward to implement in in rdkit. - + The point of the library is two-fold: 1) Provide a pleasant and user-friendly julian interface that is very easy to extend and/or build upon using a reliable source library. 2) Provide cheminformatics functionality that is not straightforward to implement in in rdkit. + ## Installation diff --git a/src/MoleculeFlow.jl b/src/MoleculeFlow.jl index 836721f..9d89b1d 100644 --- a/src/MoleculeFlow.jl +++ b/src/MoleculeFlow.jl @@ -25,8 +25,10 @@ export combine_mols, delete_substructs, replace_substructs # Stereochemistry operations export assign_stereochemistry!, find_chiral_centers -# Ring analysis -export fast_find_rings!, canonical_atom_ranks +# Ring analysis and molecular operations +export fast_find_rings!, canonical_atom_ranks, get_ring_info, find_atom_environment +export mol_fragment_to_smarts, mol_fragment_to_smiles, renumber_atoms +export remove_stereochemistry!, sanitize_mol!, compute_2d_coords! # Pattern matching export quick_smarts_match, mol_fragment_to_smarts @@ -54,10 +56,67 @@ export qed, fraction_csp3, labute_asa, molar_refractivity, synthetic_accessibili # Advanced ring and structure counts export num_aliphatic_carbocycles, num_aromatic_carbocycles, num_aromatic_heterocycles -export num_atom_stereo_centers, num_amide_bonds +export num_atom_stereo_centers, + num_amide_bonds, num_aliphatic_heterocycles, num_saturated_heterocycles +export num_saturated_carbocycles, + num_unspecified_atom_stereo_centers, num_spiro_atoms, num_bridgehead_atoms +export hall_kier_alpha, num_aliphatic_rings, num_heterocycles + +# BCUT descriptors +export bcut2d_mwlow, + bcut2d_mwhi, + bcut2d_chglow, + bcut2d_chghi, + bcut2d_logplow, + bcut2d_logphi, + bcut2d_mrlow, + bcut2d_mrhi + +# VSA descriptors +export slogp_vsa2, + slogp_vsa3, + slogp_vsa4, + slogp_vsa5, + slogp_vsa6, + slogp_vsa7, + slogp_vsa8, + slogp_vsa9, + slogp_vsa10, + slogp_vsa11, + slogp_vsa12 +export smr_vsa1, + smr_vsa2, + smr_vsa3, + smr_vsa4, + smr_vsa5, + smr_vsa6, + smr_vsa7, + smr_vsa8, + smr_vsa9, + smr_vsa10 +export peoe_vsa1, + peoe_vsa2, + peoe_vsa3, + peoe_vsa4, + peoe_vsa5, + peoe_vsa6, + peoe_vsa7, + peoe_vsa8, + peoe_vsa9, + peoe_vsa10, + peoe_vsa11, + peoe_vsa12, + peoe_vsa13, + peoe_vsa14 + +# Additional E-state descriptors +export max_absolute_e_state_index, min_absolute_e_state_index # 3D descriptors -export asphericity, radius_of_gyration +export asphericity, + radius_of_gyration, pmi1, pmi2, pmi3, eccentricity, inertial_shape_factor +export spherocity_index, + getaway_descriptors, whim_descriptors, rdf_descriptors, morse_descriptors # Fingerprints export morgan_fingerprint, rdk_fingerprint, maccs_fingerprint @@ -123,6 +182,11 @@ export is_balanced, get_atom_mapping_numbers, set_atom_mapping_numbers! export remove_unmapped_reactant_templates!, remove_unmapped_product_templates! export preprocess_reaction!, compute_atom_mapping!, is_template_molecule_agent +# Pharmacophore features +export FeatureFactory, ChemicalFeature +export create_feature_factory, get_mol_features, pharmacophore_fingerprint +export get_feature_families, filter_features_by_family, get_pharmacophore_3d + include("./config.jl") include("./utils.jl") include("./rdkit.jl") @@ -144,5 +208,6 @@ include("./standardization/standardization.jl") include("./fragmentation/fragmentation.jl") include("./progress.jl") include("./reaction/reaction.jl") +include("./pharmacophore/pharmacophore.jl") end diff --git a/src/io/read.jl b/src/io/read.jl index 2c8d624..408c5bb 100644 --- a/src/io/read.jl +++ b/src/io/read.jl @@ -349,3 +349,223 @@ function mol_from_inchi(inchi_list::Vector{String}) return results end + +""" + mol_from_pdb_block(pdb_block::String) -> Molecule + +Create a molecule from a PDB block string. + +# Arguments +- `pdb_block::String`: PDB format string + +# Returns +- `Molecule`: Parsed molecule or invalid molecule if parsing fails + +# Example +```julia +pdb_data = "ATOM 1 C MOL A 1 20.154 21.875 21.235 1.00 10.00 C" +mol = mol_from_pdb_block(pdb_data) +``` +""" +function mol_from_pdb_block(pdb_block::String) + try + rdkit_mol = _mol_from_pdb_block(pdb_block) + if pyconvert(Bool, rdkit_mol !== pybuiltins.None) + return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = "PDB block") + else + return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = "PDB block") + end + catch e + @warn "Error parsing PDB block: $e" + return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "PDB block") + end +end + +""" + mol_from_pdb_file(filename::String) -> Molecule + +Create a molecule from a PDB file. + +# Arguments +- `filename::String`: Path to PDB file + +# Returns +- `Molecule`: Parsed molecule or invalid molecule if parsing fails + +# Example +```julia +mol = mol_from_pdb_file("protein.pdb") +``` +""" +function mol_from_pdb_file(filename::String) + try + rdkit_mol = _mol_from_pdb_file(filename) + if pyconvert(Bool, rdkit_mol !== pybuiltins.None) + return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = filename) + else + @warn "Failed to parse PDB file: $filename" + return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = filename) + end + catch e + @warn "Error reading PDB file '$filename': $e" + return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = filename) + end +end + +""" + mol_from_xyz_file(filename::String) -> Molecule + +Create a molecule from an XYZ file. + +# Arguments +- `filename::String`: Path to XYZ file + +# Returns +- `Molecule`: Parsed molecule or invalid molecule if parsing fails + +# Example +```julia +mol = mol_from_xyz_file("molecule.xyz") +``` + +# Notes +- XYZ format only contains atomic coordinates, not bond information +- RDKit will attempt to infer bonds based on distances +- May not preserve original bond orders or formal charges +""" +function mol_from_xyz_file(filename::String) + try + rdkit_mol = _mol_from_xyz_file(filename) + if pyconvert(Bool, rdkit_mol !== pybuiltins.None) + return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = filename) + else + @warn "Failed to parse XYZ file: $filename" + return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = filename) + end + catch e + @warn "Error reading XYZ file '$filename': $e" + return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = filename) + end +end + +""" + mol_from_xyz_block(xyz_block::String) -> Molecule + +Create a molecule from an XYZ block string. + +# Arguments +- `xyz_block::String`: XYZ format string + +# Returns +- `Molecule`: Parsed molecule or invalid molecule if parsing fails + +# Example +```julia +xyz_data = \"\"\"3 +Ethanol +C 0.000 0.000 0.000 +C 1.540 0.000 0.000 +O 2.000 1.000 0.000\"\"\" +mol = mol_from_xyz_block(xyz_data) +``` + +# Notes +- XYZ format only contains atomic coordinates, not bond information +- RDKit will attempt to infer bonds based on distances +""" +function mol_from_xyz_block(xyz_block::String) + try + rdkit_mol = _mol_from_xyz_block(xyz_block) + if pyconvert(Bool, rdkit_mol === pybuiltins.None) + @warn "Failed to parse XYZ block" + return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = "xyz_block", props = Dict{Symbol, Any}()) + end + return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = "xyz_block", props = Dict{Symbol, Any}()) + catch e + @warn "Error parsing XYZ block: $e" + return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "xyz_block", props = Dict{Symbol, Any}()) + end +end + +""" + mol_from_mol2_file(filename::String) -> Molecule + +Create a molecule from a MOL2 file. + +# Arguments +- `filename::String`: Path to MOL2 file + +# Returns +- `Molecule`: Parsed molecule or invalid molecule if parsing fails + +# Example +```julia +mol = mol_from_mol2_file("ligand.mol2") +``` + +# Notes +- MOL2 format includes atom types, bond types, and partial charges +- Supports multiple conformations and substructures +- Commonly used for small molecule drug-like compounds +""" +function mol_from_mol2_file(filename::String) + try + rdkit_mol = _mol_from_mol2_file(filename) + if pyconvert(Bool, rdkit_mol === pybuiltins.None) + @warn "Failed to parse MOL2 file: $filename" + return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = filename, props = Dict{Symbol, Any}()) + end + return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = filename, props = Dict{Symbol, Any}()) + catch e + @warn "Error reading MOL2 file '$filename': $e" + return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = filename, props = Dict{Symbol, Any}()) + end +end + +""" + mol_from_mol2_block(mol2_block::String) -> Molecule + +Create a molecule from a MOL2 block string. + +# Arguments +- `mol2_block::String`: MOL2 format string + +# Returns +- `Molecule`: Parsed molecule or invalid molecule if parsing fails + +# Example +```julia +mol2_data = \"\"\" +@MOLECULE +ethanol +3 2 0 0 0 +SMALL +@ATOM +1 C1 0.0000 0.0000 0.0000 C.3 1 MOL 0.0000 +2 C2 1.5400 0.0000 0.0000 C.3 1 MOL 0.0000 +3 O1 2.0000 1.0000 0.0000 O.3 1 MOL 0.0000 +@BOND +1 1 2 1 +2 2 3 1 +\"\"\" +mol = mol_from_mol2_block(mol2_data) +``` + +# Notes +- MOL2 format uses TRIPOS record types +- Supports various atom and bond types +- May include substructure and partial charge information +""" +function mol_from_mol2_block(mol2_block::String) + try + rdkit_mol = _mol_from_mol2_block(mol2_block) + if pyconvert(Bool, rdkit_mol === pybuiltins.None) + @warn "Failed to parse MOL2 block" + return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = "mol2_block", props = Dict{Symbol, Any}()) + end + return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = "mol2_block", props = Dict{Symbol, Any}()) + catch e + @warn "Error parsing MOL2 block: $e" + return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "mol2_block", props = Dict{Symbol, Any}()) + end +end diff --git a/src/io/write.jl b/src/io/write.jl index bcd4716..c276e81 100644 --- a/src/io/write.jl +++ b/src/io/write.jl @@ -94,3 +94,236 @@ function mol_to_inchi(mol_list::Vector{Union{Molecule, Missing}}) return pyconvert(Vector{Union{String, Missing}}, results) end + +""" + mol_to_inchi_key(mol::Molecule) -> Union{String, Missing} + +Convert a molecule to InChI Key format. + +# Arguments + + - `mol::Molecule`: Input molecule + +# Returns + + - `Union{String, Missing}`: InChI Key or missing if conversion fails + +# Example + +```julia +mol = mol_from_smiles("CCO") +key = mol_to_inchi_key(mol) # "LFQSCWFLJHTTHZ-UHFFFAOYSA-N" +``` +""" +function mol_to_inchi_key(mol::Molecule) + !mol.valid && return "" + try + return pyconvert(String, _mol_to_inchi_key(mol._rdkit_mol)) + catch e + @warn "Error converting to InChI key: $e" + return "" + end +end + +""" + mol_to_inchi_and_aux_info(mol::Molecule) -> Union{Tuple{String, String}, Missing} + +Convert a molecule to InChI string with auxiliary information. + +# Returns + + - `Union{Tuple{String, String}, Missing}`: (InChI, AuxInfo) or missing if conversion fails + +# Example + +```julia +mol = mol_from_smiles("CCO") +inchi, aux = mol_to_inchi_and_aux_info(mol) +``` +""" +function mol_to_inchi_and_aux_info(mol::Molecule) + !mol.valid && return missing + try + result = _mol_to_inchi_and_aux_info(mol._rdkit_mol) + inchi = pyconvert(String, result[0]) + aux_info = pyconvert(String, result[1]) + return (inchi, aux_info) + catch e + @warn "Error converting to InChI with aux info: $e" + return missing + end +end + +""" + mol_to_molblock(mol::Molecule) -> Union{String, Missing} + +Convert a molecule to MOL block format. + +# Returns + + - `Union{String, Missing}`: MOL block or missing if conversion fails + +# Example + +```julia +mol = mol_from_smiles("CCO") +molblock = mol_to_molblock(mol) +``` +""" +function mol_to_molblock(mol::Molecule) + !mol.valid && return "" + try + return pyconvert(String, _mol_to_molblock(mol._rdkit_mol)) + catch e + @warn "Error converting to MOL block: $e" + return "" + end +end + +""" + mol_to_v3k_molblock(mol::Molecule) -> Union{String, Missing} + +Convert a molecule to V3000 MOL block format. + +# Returns + + - `Union{String, Missing}`: V3000 MOL block or missing if conversion fails + +# Example + +```julia +mol = mol_from_smiles("CCO") +v3k_block = mol_to_v3k_molblock(mol) +``` +""" +function mol_to_v3k_molblock(mol::Molecule) + !mol.valid && return missing + try + return pyconvert(String, _mol_to_v3k_molblock(mol._rdkit_mol)) + catch e + @warn "Error converting to V3K MOL block: $e" + return missing + end +end + +""" + mol_to_pdb_block(mol::Molecule; conf_id::Int=-1) -> Union{String, Missing} + +Convert a molecule to PDB block format. + +# Arguments + + - `mol::Molecule`: Input molecule + - `conf_id::Int`: Conformer ID to use (-1 for default) + +# Returns + + - `Union{String, Missing}`: PDB block or missing if conversion fails + +# Example + +```julia +mol = mol_from_smiles("CCO") +pdb_block = mol_to_pdb_block(mol) +``` +""" +function mol_to_pdb_block(mol::Molecule; conf_id::Int = -1) + !mol.valid && return "" + try + return pyconvert(String, _mol_to_pdb_block(mol._rdkit_mol)) + catch e + @warn "Error converting to PDB block: $e" + return "" + end +end + +""" + mol_to_xyz_block(mol::Molecule) -> Union{String, Missing} + +Convert a molecule to XYZ block format. + +# Returns + + - `Union{String, Missing}`: XYZ block or missing if conversion fails + +# Example + +```julia +mol = mol_from_smiles("CCO") +xyz_block = mol_to_xyz_block(mol) +``` +""" +function mol_to_xyz_block(mol::Molecule) + !mol.valid && return "" + try + return pyconvert(String, _mol_to_xyz_block(mol._rdkit_mol)) + catch e + @warn "Error converting to XYZ block: $e" + return "" + end +end + +""" + mol_to_pdb_file(mol::Molecule, filename::String; conf_id::Int=-1) -> Bool + +Write a molecule to a PDB file. + +# Arguments + + - `mol::Molecule`: Input molecule + - `filename::String`: Output file path + - `conf_id::Int`: Conformer ID to use (-1 for default) + +# Returns + + - `Bool`: true if successful, false otherwise + +# Example + +```julia +mol = mol_from_smiles("CCO") +success = mol_to_pdb_file(mol, "ethanol.pdb") +``` +""" +function mol_to_pdb_file(mol::Molecule, filename::String; conf_id::Int = -1) + !mol.valid && return false + try + _mol_to_pdb_file(mol._rdkit_mol, filename) + return true + catch e + @warn "Error writing PDB file: $e" + return false + end +end + +""" + mol_to_xyz_file(mol::Molecule, filename::String) -> Bool + +Write a molecule to an XYZ file. + +# Arguments + + - `mol::Molecule`: Input molecule + - `filename::String`: Output file path + +# Returns + + - `Bool`: true if successful, false otherwise + +# Example + +```julia +mol = mol_from_smiles("CCO") +success = mol_to_xyz_file(mol, "ethanol.xyz") +``` +""" +function mol_to_xyz_file(mol::Molecule, filename::String) + !mol.valid && return false + try + _mol_to_xyz_file(mol._rdkit_mol, filename) + return true + catch e + @warn "Error writing XYZ file: $e" + return false + end +end diff --git a/src/molecule/descriptors.jl b/src/molecule/descriptors.jl index 26e54c9..c63594c 100644 --- a/src/molecule/descriptors.jl +++ b/src/molecule/descriptors.jl @@ -1385,6 +1385,347 @@ function ipc(mol::Union{Molecule, Missing}) end end +# Additional missing descriptors that need high-level wrappers + +""" + num_aliphatic_heterocycles(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of aliphatic heterocycles in the molecule. +""" +function num_aliphatic_heterocycles(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_aliphatic_heterocycles(mol._rdkit_mol)) + catch e + @warn "Error calculating aliphatic heterocycles: $e" + return missing + end +end + +""" + num_saturated_heterocycles(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of saturated heterocycles in the molecule. +""" +function num_saturated_heterocycles(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_saturated_heterocycles(mol._rdkit_mol)) + catch e + @warn "Error calculating saturated heterocycles: $e" + return missing + end +end + +""" + num_saturated_carbocycles(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of saturated carbocycles in the molecule. +""" +function num_saturated_carbocycles(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_saturated_carbocycles(mol._rdkit_mol)) + catch e + @warn "Error calculating saturated carbocycles: $e" + return missing + end +end + +""" + num_unspecified_atom_stereo_centers(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of unspecified atom stereo centers. +""" +function num_unspecified_atom_stereo_centers(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_unspecified_atom_stereo_centers(mol._rdkit_mol)) + catch e + @warn "Error calculating unspecified stereo centers: $e" + return missing + end +end + +""" + num_spiro_atoms(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of spiro atoms in the molecule. +""" +function num_spiro_atoms(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_spiro_atoms(mol._rdkit_mol)) + catch e + @warn "Error calculating spiro atoms: $e" + return missing + end +end + +""" + num_bridgehead_atoms(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of bridgehead atoms in the molecule. +""" +function num_bridgehead_atoms(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_bridgehead_atoms(mol._rdkit_mol)) + catch e + @warn "Error calculating bridgehead atoms: $e" + return missing + end +end + +""" + hall_kier_alpha(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the Hall-Kier alpha descriptor. +""" +function hall_kier_alpha(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _hall_kier_alpha(mol._rdkit_mol)) + catch e + @warn "Error calculating Hall-Kier alpha: $e" + return missing + end +end + +""" + eccentricity(mol::Union{Molecule, Missing}; conf_id::Int = -1) -> Union{Float64, Missing} + +Calculate the eccentricity of the molecule's 3D structure. +""" +function eccentricity(mol::Union{Molecule, Missing}; conf_id::Int = -1) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _eccentricity(mol._rdkit_mol; confId = conf_id)) + catch e + @warn "Error calculating eccentricity: $e" + return missing + end +end + +""" + inertial_shape_factor(mol::Union{Molecule, Missing}; conf_id::Int = -1) -> Union{Float64, Missing} + +Calculate the inertial shape factor of the molecule. +""" +function inertial_shape_factor(mol::Union{Molecule, Missing}; conf_id::Int = -1) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _inertial_shape_factor(mol._rdkit_mol; confId = conf_id)) + catch e + @warn "Error calculating inertial shape factor: $e" + return missing + end +end + +# BCUT descriptors +""" + bcut2d_mwlow(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_MWLOW descriptor. +""" +function bcut2d_mwlow(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_mwlow(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_MWLOW: $e" + return missing + end +end + +""" + bcut2d_mwhi(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_MWHI descriptor. +""" +function bcut2d_mwhi(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_mwhi(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_MWHI: $e" + return missing + end +end + +""" + bcut2d_chglow(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_CHGLO descriptor. +""" +function bcut2d_chglow(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_chglow(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_CHGLO: $e" + return missing + end +end + +""" + bcut2d_chghi(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_CHGHI descriptor. +""" +function bcut2d_chghi(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_chghi(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_CHGHI: $e" + return missing + end +end + +""" + bcut2d_logplow(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_LOGPLOW descriptor. +""" +function bcut2d_logplow(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_logplow(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_LOGPLOW: $e" + return missing + end +end + +""" + bcut2d_logphi(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_LOGPHI descriptor. +""" +function bcut2d_logphi(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_logphi(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_LOGPHI: $e" + return missing + end +end + +""" + bcut2d_mrlow(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_MRLOW descriptor. +""" +function bcut2d_mrlow(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_mrlow(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_MRLOW: $e" + return missing + end +end + +""" + bcut2d_mrhi(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the BCUT2D_MRHI descriptor. +""" +function bcut2d_mrhi(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _bcut2d_mrhi(mol._rdkit_mol)) + catch e + @warn "Error calculating BCUT2D_MRHI: $e" + return missing + end +end + +""" + max_absolute_e_state_index(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the maximum absolute E-state index. +""" +function max_absolute_e_state_index(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _max_absolute_e_state_index(mol._rdkit_mol)) + catch e + @warn "Error calculating max absolute E-state index: $e" + return missing + end +end + +""" + min_absolute_e_state_index(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the minimum absolute E-state index. +""" +function min_absolute_e_state_index(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _min_absolute_e_state_index(mol._rdkit_mol)) + catch e + @warn "Error calculating min absolute E-state index: $e" + return missing + end +end + +# Removed num_sp3_heavy_atoms as the underlying RDKit function doesn't exist + +""" + num_aliphatic_rings(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of aliphatic rings in the molecule. +""" +function num_aliphatic_rings(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_aliphatic_rings(mol._rdkit_mol)) + catch e + @warn "Error calculating aliphatic rings: $e" + return missing + end +end + +""" + num_heterocycles(mol::Union{Molecule, Missing}) -> Union{Int, Missing} + +Count the number of heterocycles in the molecule. +""" +function num_heterocycles(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Int, _num_heterocycles(mol._rdkit_mol)) + catch e + @warn "Error calculating heterocycles: $e" + return missing + end +end + # Vectorized functions for new descriptors (moved to bottom for organization) const NEW_DESCRIPTOR_FUNCTIONS = [ # Chi connectivity indices @@ -1403,12 +1744,38 @@ const NEW_DESCRIPTOR_FUNCTIONS = [ # E-state descriptors :max_e_state_index, :min_e_state_index, + :max_absolute_e_state_index, + :min_absolute_e_state_index, # Atom counts :num_carbons, :num_nitrogens, :num_oxygens, :num_sulfurs, :num_halogens, + # Additional ring counts + :num_aliphatic_heterocycles, + :num_saturated_heterocycles, + :num_saturated_carbocycles, + :num_aliphatic_rings, + :num_heterocycles, + # Stereo and structure counts + :num_unspecified_atom_stereo_centers, + :num_spiro_atoms, + :num_bridgehead_atoms, + # 3D descriptors + :eccentricity, + :inertial_shape_factor, + # BCUT descriptors + :bcut2d_mwlow, + :bcut2d_mwhi, + :bcut2d_chglow, + :bcut2d_chghi, + :bcut2d_logplow, + :bcut2d_logphi, + :bcut2d_mrlow, + :bcut2d_mrhi, + # Other descriptors + :hall_kier_alpha, # Complexity measures :ipc, ] @@ -1421,3 +1788,925 @@ for func in NEW_DESCRIPTOR_FUNCTIONS return [$(func)(mol) for mol in mols] end end + +# Additional 3D descriptors +""" + pmi1(mol::Union{Molecule, Missing}; conf_id::Int = -1) -> Union{Float64, Missing} + +Calculate the first principal moment of inertia (PMI1). +""" +function pmi1(mol::Union{Molecule, Missing}; conf_id::Int = -1) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _pmi1(mol._rdkit_mol; confId = conf_id)) + catch e + @warn "Error calculating PMI1: $e" + return missing + end +end + +""" + pmi2(mol::Union{Molecule, Missing}; conf_id::Int = -1) -> Union{Float64, Missing} + +Calculate the second principal moment of inertia (PMI2). +""" +function pmi2(mol::Union{Molecule, Missing}; conf_id::Int = -1) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _pmi2(mol._rdkit_mol; confId = conf_id)) + catch e + @warn "Error calculating PMI2: $e" + return missing + end +end + +""" + pmi3(mol::Union{Molecule, Missing}; conf_id::Int = -1) -> Union{Float64, Missing} + +Calculate the third principal moment of inertia (PMI3). +""" +function pmi3(mol::Union{Molecule, Missing}; conf_id::Int = -1) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _pmi3(mol._rdkit_mol; confId = conf_id)) + catch e + @warn "Error calculating PMI3: $e" + return missing + end +end + +""" + spherocity_index(mol::Union{Molecule, Missing}; conf_id::Int = -1) -> Union{Float64, Missing} + +Calculate the spherocity index, a measure of molecular shape. +Lower values indicate more linear molecules, higher values indicate more spherical molecules. + +# Arguments + + - `mol::Union{Molecule, Missing}`: Input molecule (must have 3D coordinates) + - `conf_id::Int = -1`: Conformer ID (-1 for default conformer) + +# Returns + + - `Union{Float64, Missing}`: Spherocity index value, or missing if molecule is invalid or lacks 3D coordinates + +# Example + +```julia +mol = mol_from_smiles("CCCCCCC") # Linear alkane +result = generate_3d_conformers(mol; num_conformers = 1) +if result.success + spherocity = spherocity_index(result.molecules[1]) # Should be low (linear) +end +``` + +# Notes + + - Requires 3D coordinates to be present + - Values range from 0 (linear) to 1 (spherical) +""" +function spherocity_index(mol::Union{Molecule, Missing}; conf_id::Int = -1) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _spherocity_index(mol._rdkit_mol; confId = conf_id)) + catch e + @warn "Error calculating spherocity index: $e" + return missing + end +end + +""" + getaway_descriptors(mol::Union{Molecule, Missing}; conf_id::Int = -1, precision::Int = 2, custom_atom_property::String = "") -> Union{Vector{Float64}, Missing} + +Calculate GETAWAY (GEometry, Topology, and Atom-Weights AssemblY) descriptors. +These are 3D molecular descriptors that encode information about molecular geometry, +topology, and atomic properties. + +# Arguments + + - `mol::Union{Molecule, Missing}`: Input molecule (must have 3D coordinates) + - `conf_id::Int = -1`: Conformer ID (-1 for default conformer) + - `precision::Int = 2`: Precision for floating point calculations + - `custom_atom_property::String = ""`: Custom atomic property to use (empty string for default) + +# Returns + + - `Union{Vector{Float64}, Missing}`: Vector of GETAWAY descriptor values, or missing if molecule is invalid + +# Example + +```julia +mol = mol_from_smiles("CCO") +result = generate_3d_conformers(mol; num_conformers = 1) +if result.success + getaway_desc = getaway_descriptors(result.molecules[1]) + println("Number of GETAWAY descriptors: ", length(getaway_desc)) +end +``` + +# Notes + + - Returns a vector of descriptors (typically 273 values) + - Requires 3D coordinates to be present + - GETAWAY descriptors combine geometric, topological, and chemical information +""" +function getaway_descriptors( + mol::Union{Molecule, Missing}; + conf_id::Int = -1, + precision::Int = 2, + custom_atom_property::String = "", +) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + result = _calc_getaway( + mol._rdkit_mol; + confId = conf_id, + precision = precision, + custom_atom_property = custom_atom_property, + ) + return pyconvert(Vector{Float64}, result) + catch e + @warn "Error calculating GETAWAY descriptors: $e" + return missing + end +end + +""" + whim_descriptors(mol::Union{Molecule, Missing}; conf_id::Int = -1, thresh::Float64 = 0.001, custom_atom_property::String = "") -> Union{Vector{Float64}, Missing} + +Calculate WHIM (Weighted Holistic Invariant Molecular) descriptors. +These descriptors capture information about molecular shape and atomic distributions +in 3D space using principal component analysis. + +# Arguments + + - `mol::Union{Molecule, Missing}`: Input molecule (must have 3D coordinates) + - `conf_id::Int = -1`: Conformer ID (-1 for default conformer) + - `thresh::Float64 = 0.001`: Threshold for eigenvalue calculation + - `custom_atom_property::String = ""`: Custom atomic property to use (empty string for default) + +# Returns + + - `Union{Vector{Float64}, Missing}`: Vector of WHIM descriptor values, or missing if molecule is invalid + +# Example + +```julia +mol = mol_from_smiles("c1ccccc1") # Benzene +result = generate_3d_conformers(mol; num_conformers = 1) +if result.success + whim_desc = whim_descriptors(result.molecules[1]) + println("Number of WHIM descriptors: ", length(whim_desc)) +end +``` + +# Notes + + - Returns a vector of descriptors (typically 114 values) + - Requires 3D coordinates to be present + - WHIM descriptors are rotationally invariant +""" +function whim_descriptors( + mol::Union{Molecule, Missing}; + conf_id::Int = -1, + thresh::Float64 = 0.001, + custom_atom_property::String = "", +) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + result = _calc_whim( + mol._rdkit_mol; + confId = conf_id, + thresh = thresh, + custom_atom_property = custom_atom_property, + ) + return pyconvert(Vector{Float64}, result) + catch e + @warn "Error calculating WHIM descriptors: $e" + return missing + end +end + +""" + rdf_descriptors(mol::Union{Molecule, Missing}; conf_id::Int = -1, custom_atom_property::String = "") -> Union{Vector{Float64}, Missing} + +Calculate RDF (Radial Distribution Function) descriptors. +These descriptors encode the distribution of atoms as a function of distance +from a central point in the molecule. + +# Arguments + + - `mol::Union{Molecule, Missing}`: Input molecule (must have 3D coordinates) + - `conf_id::Int = -1`: Conformer ID (-1 for default conformer) + - `custom_atom_property::String = ""`: Custom atomic property to use (empty string for default) + +# Returns + + - `Union{Vector{Float64}, Missing}`: Vector of RDF descriptor values, or missing if molecule is invalid + +# Example + +```julia +mol = mol_from_smiles("CCCCCC") # Hexane +result = generate_3d_conformers(mol; num_conformers = 1) +if result.success + rdf_desc = rdf_descriptors(result.molecules[1]) + println("Number of RDF descriptors: ", length(rdf_desc)) +end +``` + +# Notes + + - Returns a vector of descriptors (typically 210 values) + - Requires 3D coordinates to be present + - RDF descriptors capture radial atomic distribution patterns +""" +function rdf_descriptors( + mol::Union{Molecule, Missing}; conf_id::Int = -1, custom_atom_property::String = "" +) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + result = _calc_rdf( + mol._rdkit_mol; confId = conf_id, custom_atom_property = custom_atom_property + ) + return pyconvert(Vector{Float64}, result) + catch e + @warn "Error calculating RDF descriptors: $e" + return missing + end +end + +""" + morse_descriptors(mol::Union{Molecule, Missing}; conf_id::Int = -1, custom_atom_property::String = "") -> Union{Vector{Float64}, Missing} + +Calculate MORSE (Molecule Representation of Structures based on Electron diffraction) descriptors. +These 3D descriptors are based on the idea of electron diffraction and encode +3D structural information as a function of scattering angle. + +# Arguments + + - `mol::Union{Molecule, Missing}`: Input molecule (must have 3D coordinates) + - `conf_id::Int = -1`: Conformer ID (-1 for default conformer) + - `custom_atom_property::String = ""`: Custom atomic property to use (empty string for default) + +# Returns + + - `Union{Vector{Float64}, Missing}`: Vector of MORSE descriptor values, or missing if molecule is invalid + +# Example + +```julia +mol = mol_from_smiles("CC(C)C") # Isobutane +result = generate_3d_conformers(mol; num_conformers = 1) +if result.success + morse_desc = morse_descriptors(result.molecules[1]) + println("Number of MORSE descriptors: ", length(morse_desc)) +end +``` + +# Notes + + - Returns a vector of descriptors (typically 224 values) + - Requires 3D coordinates to be present + - MORSE descriptors are based on electron diffraction theory +""" +function morse_descriptors( + mol::Union{Molecule, Missing}; conf_id::Int = -1, custom_atom_property::String = "" +) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + result = _calc_morse( + mol._rdkit_mol; confId = conf_id, custom_atom_property = custom_atom_property + ) + return pyconvert(Vector{Float64}, result) + catch e + @warn "Error calculating MORSE descriptors: $e" + return missing + end +end + +# VSA descriptors (SlogP_VSA series) +""" + slogp_vsa2(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA2 descriptor. +""" +function slogp_vsa2(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa2(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA2: $e" + return missing + end +end + +""" + slogp_vsa3(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA3 descriptor. +""" +function slogp_vsa3(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa3(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA3: $e" + return missing + end +end + +""" + slogp_vsa4(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA4 descriptor. +""" +function slogp_vsa4(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa4(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA4: $e" + return missing + end +end + +""" + slogp_vsa5(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA5 descriptor. +""" +function slogp_vsa5(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa5(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA5: $e" + return missing + end +end + +""" + slogp_vsa6(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA6 descriptor. +""" +function slogp_vsa6(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa6(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA6: $e" + return missing + end +end + +""" + slogp_vsa7(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA7 descriptor. +""" +function slogp_vsa7(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa7(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA7: $e" + return missing + end +end + +""" + slogp_vsa8(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA8 descriptor. +""" +function slogp_vsa8(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa8(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA8: $e" + return missing + end +end + +""" + slogp_vsa9(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA9 descriptor. +""" +function slogp_vsa9(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa9(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA9: $e" + return missing + end +end + +""" + slogp_vsa10(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA10 descriptor. +""" +function slogp_vsa10(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa10(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA10: $e" + return missing + end +end + +""" + slogp_vsa11(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA11 descriptor. +""" +function slogp_vsa11(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa11(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA11: $e" + return missing + end +end + +""" + slogp_vsa12(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SlogP_VSA12 descriptor. +""" +function slogp_vsa12(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _slogp_vsa12(mol._rdkit_mol)) + catch e + @warn "Error calculating SlogP_VSA12: $e" + return missing + end +end + +# SMR_VSA descriptors +""" + smr_vsa1(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA1 descriptor. +""" +function smr_vsa1(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa1(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA1: $e" + return missing + end +end + +""" + smr_vsa2(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA2 descriptor. +""" +function smr_vsa2(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa2(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA2: $e" + return missing + end +end + +""" + smr_vsa3(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA3 descriptor. +""" +function smr_vsa3(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa3(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA3: $e" + return missing + end +end + +""" + smr_vsa4(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA4 descriptor. +""" +function smr_vsa4(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa4(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA4: $e" + return missing + end +end + +""" + smr_vsa5(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA5 descriptor. +""" +function smr_vsa5(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa5(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA5: $e" + return missing + end +end + +""" + smr_vsa6(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA6 descriptor. +""" +function smr_vsa6(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa6(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA6: $e" + return missing + end +end + +""" + smr_vsa7(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA7 descriptor. +""" +function smr_vsa7(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa7(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA7: $e" + return missing + end +end + +""" + smr_vsa8(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA8 descriptor. +""" +function smr_vsa8(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa8(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA8: $e" + return missing + end +end + +""" + smr_vsa9(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA9 descriptor. +""" +function smr_vsa9(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa9(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA9: $e" + return missing + end +end + +""" + smr_vsa10(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the SMR_VSA10 descriptor. +""" +function smr_vsa10(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _smr_vsa10(mol._rdkit_mol)) + catch e + @warn "Error calculating SMR_VSA10: $e" + return missing + end +end + +# PEOE_VSA descriptors +""" + peoe_vsa1(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA1 descriptor. +""" +function peoe_vsa1(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa1(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA1: $e" + return missing + end +end + +""" + peoe_vsa2(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA2 descriptor. +""" +function peoe_vsa2(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa2(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA2: $e" + return missing + end +end + +""" + peoe_vsa3(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA3 descriptor. +""" +function peoe_vsa3(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa3(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA3: $e" + return missing + end +end + +""" + peoe_vsa4(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA4 descriptor. +""" +function peoe_vsa4(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa4(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA4: $e" + return missing + end +end + +""" + peoe_vsa5(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA5 descriptor. +""" +function peoe_vsa5(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa5(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA5: $e" + return missing + end +end + +""" + peoe_vsa6(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA6 descriptor. +""" +function peoe_vsa6(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa6(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA6: $e" + return missing + end +end + +""" + peoe_vsa7(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA7 descriptor. +""" +function peoe_vsa7(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa7(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA7: $e" + return missing + end +end + +""" + peoe_vsa8(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA8 descriptor. +""" +function peoe_vsa8(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa8(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA8: $e" + return missing + end +end + +""" + peoe_vsa9(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA9 descriptor. +""" +function peoe_vsa9(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa9(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA9: $e" + return missing + end +end + +""" + peoe_vsa10(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA10 descriptor. +""" +function peoe_vsa10(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa10(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA10: $e" + return missing + end +end + +""" + peoe_vsa11(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA11 descriptor. +""" +function peoe_vsa11(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa11(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA11: $e" + return missing + end +end + +""" + peoe_vsa12(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA12 descriptor. +""" +function peoe_vsa12(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa12(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA12: $e" + return missing + end +end + +""" + peoe_vsa13(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA13 descriptor. +""" +function peoe_vsa13(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa13(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA13: $e" + return missing + end +end + +""" + peoe_vsa14(mol::Union{Molecule, Missing}) -> Union{Float64, Missing} + +Calculate the PEOE_VSA14 descriptor. +""" +function peoe_vsa14(mol::Union{Molecule, Missing}) + isa(mol, Missing) && return missing + !mol.valid && return missing + try + return pyconvert(Float64, _peoe_vsa14(mol._rdkit_mol)) + catch e + @warn "Error calculating PEOE_VSA14: $e" + return missing + end +end + +# Update vectorized function list with the new descriptors +const ADDITIONAL_VSA_DESCRIPTOR_FUNCTIONS = [ + # 3D descriptors + :pmi1, + :pmi2, + :pmi3, + # SlogP_VSA series (2-12) + :slogp_vsa2, + :slogp_vsa3, + :slogp_vsa4, + :slogp_vsa5, + :slogp_vsa6, + :slogp_vsa7, + :slogp_vsa8, + :slogp_vsa9, + :slogp_vsa10, + :slogp_vsa11, + :slogp_vsa12, + # SMR_VSA series (1-10) + :smr_vsa1, + :smr_vsa2, + :smr_vsa3, + :smr_vsa4, + :smr_vsa5, + :smr_vsa6, + :smr_vsa7, + :smr_vsa8, + :smr_vsa9, + :smr_vsa10, + # PEOE_VSA series (1-14) + :peoe_vsa1, + :peoe_vsa2, + :peoe_vsa3, + :peoe_vsa4, + :peoe_vsa5, + :peoe_vsa6, + :peoe_vsa7, + :peoe_vsa8, + :peoe_vsa9, + :peoe_vsa10, + :peoe_vsa11, + :peoe_vsa12, + :peoe_vsa13, + :peoe_vsa14, +] + +for func in ADDITIONAL_VSA_DESCRIPTOR_FUNCTIONS + @eval function $(func)(mols::Vector{Union{Molecule, Missing}}) + return [mol === missing ? missing : $(func)(mol) for mol in mols] + end + @eval function $(func)(mols::Vector{Molecule}) + return [$(func)(mol) for mol in mols] + end +end diff --git a/src/molecule/operations.jl b/src/molecule/operations.jl index 598e040..e4b941d 100644 --- a/src/molecule/operations.jl +++ b/src/molecule/operations.jl @@ -1,235 +1,436 @@ ####################################################### -# Additional Molecular Operations -####################################################### - -####################################################### -# File I/O Extensions +# Molecular Editing and Manipulation ####################################################### """ - mol_from_pdb_block(pdb_block::String) -> Molecule + combine_mols(mol1::Molecule, mol2::Molecule) -> Molecule -Create a molecule from a PDB block string. +Combine two molecules into a single molecule. # Arguments -- `pdb_block::String`: PDB format string + + - `mol1::Molecule`: First molecule + - `mol2::Molecule`: Second molecule # Returns -- `Molecule`: Parsed molecule or invalid molecule if parsing fails + + - `Molecule`: Combined molecule # Example + ```julia -pdb_data = "ATOM 1 C MOL A 1 20.154 21.875 21.235 1.00 10.00 C" -mol = mol_from_pdb_block(pdb_data) +mol1 = mol_from_smiles("CCO") +mol2 = mol_from_smiles("CCC") +combined = combine_mols(mol1, mol2) ``` """ -function mol_from_pdb_block(pdb_block::String) +function combine_mols(mol1::Molecule, mol2::Molecule) + if !mol1.valid || !mol2.valid + return Molecule(; _rdkit_mol = pybuiltins.None, valid = false, source = "combined") + end + try - rdkit_mol = _mol_from_pdb_block(pdb_block) - if pyconvert(Bool, rdkit_mol !== pybuiltins.None) - return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = "PDB block") - else - return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = "PDB block") - end + combined_mol = _combine_mols(mol1._rdkit_mol, mol2._rdkit_mol) + return Molecule(; _rdkit_mol = combined_mol, valid = true, source = "combined") catch e - @warn "Error parsing PDB block: $e" - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "PDB block") + @warn "Error combining molecules: $e" + return Molecule(; _rdkit_mol = pybuiltins.None, valid = false, source = "combined") end end """ - mol_from_pdb_file(filename::String) -> Molecule + delete_substructs(mol::Molecule, query::String) -> Molecule -Create a molecule from a PDB file. +Delete substructures matching a SMARTS pattern from a molecule. # Arguments -- `filename::String`: Path to PDB file + + - `mol::Molecule`: Input molecule + - `query::String`: SMARTS pattern to match and delete # Returns -- `Molecule`: Parsed molecule or invalid molecule if parsing fails + + - `Molecule`: Molecule with matching substructures removed # Example + ```julia -mol = mol_from_pdb_file("protein.pdb") +mol = mol_from_smiles("CCO") +# Remove alcohol groups +mol_modified = delete_substructs(mol, "[OH]") ``` """ -function mol_from_pdb_file(filename::String) +function delete_substructs(mol::Molecule, query::String) + if !mol.valid + return mol + end + try - rdkit_mol = _mol_from_pdb_file(filename) - if pyconvert(Bool, rdkit_mol !== pybuiltins.None) - return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = filename) - else - return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = filename) + query_mol = _mol_from_smarts(query) + if pyconvert(Bool, query_mol === pybuiltins.None) + @warn "Invalid SMARTS pattern: $query" + return mol end + + modified_mol = _delete_substructs(mol._rdkit_mol, query_mol) + return Molecule(; _rdkit_mol = modified_mol, valid = true, source = mol.source) catch e - @warn "Error reading PDB file: $e" - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = filename) + @warn "Error deleting substructures: $e" + return mol end end """ - mol_to_pdb_block(mol::Molecule; confId::Int=-1) -> String + replace_substructs(mol::Molecule, query::String, replacement::String) -> Vector{Molecule} -Convert a molecule to PDB block format. +Replace substructures matching a SMARTS pattern with a replacement structure. # Arguments - `mol::Molecule`: Input molecule -- `confId::Int`: Conformer ID to use (-1 for default) +- `query::String`: SMARTS pattern to match +- `replacement::String`: SMILES string for replacement # Returns -- `String`: PDB format string or empty string if conversion fails +- `Vector{Molecule}`: Vector of molecules with replacements made # Example ```julia mol = mol_from_smiles("CCO") -pdb_block = mol_to_pdb_block(mol) +# Replace alcohol with amine +replacements = replace_substructs(mol, "[OH]", "N") ``` """ -function mol_to_pdb_block(mol::Molecule; confId::Int=-1) - if !mol.valid - return "" - end +# Ring analysis functions +""" + get_ring_info(mol::Molecule) -> Union{Py, Missing} +Get ring information for a molecule. + +# Arguments + + - `mol::Molecule`: Input molecule + +# Returns + + - `Union{Py, Missing}`: RingInfo object or missing if molecule is invalid + +# Example + +```julia +mol = mol_from_smiles("c1ccccc1") +ring_info = get_ring_info(mol) +``` +""" +function get_ring_info(mol::Molecule) + !mol.valid && return missing try - return pyconvert(String, _mol_to_pdb_block(mol._rdkit_mol)) + ring_info = _get_ring_info(mol._rdkit_mol) + return Dict( + :num_rings => pyconvert(Int, ring_info.NumRings()), + :atom_rings => + [pyconvert(Vector{Int}, ring) .+ 1 for ring in ring_info.AtomRings()], + :bond_rings => + [pyconvert(Vector{Int}, ring) .+ 1 for ring in ring_info.BondRings()], + ) catch e - @warn "Error converting to PDB block: $e" - return "" + @warn "Error getting ring info: $e" + return missing end end """ - mol_to_inchi_key(mol::Molecule) -> String + canonical_atom_ranks(mol::Molecule) -> Union{Vector{Int}, Missing} -Generate InChI key for a molecule. +Get canonical atom ranks for a molecule. # Arguments -- `mol::Molecule`: Input molecule + + - `mol::Molecule`: Input molecule # Returns -- `String`: InChI key or empty string if generation fails + + - `Union{Vector{Int}, Missing}`: Vector of atom ranks or missing if molecule is invalid # Example + ```julia mol = mol_from_smiles("CCO") -inchi_key = mol_to_inchi_key(mol) +ranks = canonical_atom_ranks(mol) ``` """ -function mol_to_inchi_key(mol::Molecule) - if !mol.valid - return "" +function canonical_atom_ranks(mol::Molecule) + !mol.valid && return Int[] + try + py_ranks = _canonical_rank_atoms(mol._rdkit_mol) + return pyconvert(Vector{Int}, py_ranks) + catch e + @warn "Error getting canonical atom ranks: $e" + return Int[] end +end +""" + find_atom_environment(mol::Molecule, radius::Int, atom_idx::Int) -> Union{Vector{Int}, Missing} + +Find atom environment of specified radius around an atom. + +# Arguments + + - `mol::Molecule`: Input molecule + - `radius::Int`: Radius to search + - `atom_idx::Int`: Index of central atom (0-based) + +# Returns + + - `Union{Vector{Int}, Missing}`: Vector of bond indices in environment or missing if invalid + +# Example + +```julia +mol = mol_from_smiles("CCCCC") +env = find_atom_environment(mol, 2, 1) +``` +""" +function find_atom_environment(mol::Molecule, radius::Int, atom_idx::Int) + !mol.valid && return missing try - return pyconvert(String, _mol_to_inchi_key(mol._rdkit_mol)) + py_env = _find_atom_environment_of_radius_n(mol._rdkit_mol, radius, atom_idx) + return pyconvert(Vector{Int}, py_env) catch e - @warn "Error generating InChI key: $e" - return "" + @warn "Error finding atom environment: $e" + return missing end end """ - mol_to_molblock(mol::Molecule) -> String + mol_fragment_to_smarts(mol::Molecule, atom_indices::Vector{Int}) -> Union{String, Missing} -Convert a molecule to MOL block format. +Convert a molecular fragment to SMARTS pattern. # Arguments -- `mol::Molecule`: Input molecule + + - `mol::Molecule`: Input molecule + - `atom_indices::Vector{Int}`: Indices of atoms to include in fragment # Returns -- `String`: MOL block string or empty string if conversion fails + + - `Union{String, Missing}`: SMARTS pattern or missing if invalid # Example + ```julia -mol = mol_from_smiles("CCO") -molblock = mol_to_molblock(mol) +mol = mol_from_smiles("CCCO") +smarts = mol_fragment_to_smarts(mol, [0, 1]) # First two carbons ``` """ -function mol_to_molblock(mol::Molecule) - if !mol.valid - return "" +function mol_fragment_to_smarts(mol::Molecule, atom_indices::Vector{Int}) + !mol.valid && return missing + try + py_smarts = _mol_fragment_to_smarts(mol._rdkit_mol, atom_indices) + return pyconvert(String, py_smarts) + catch e + @warn "Error converting fragment to SMARTS: $e" + return missing end +end +""" + mol_fragment_to_smiles(mol::Molecule, atom_indices::Vector{Int}) -> Union{String, Missing} + +Convert a molecular fragment to SMILES string. + +# Arguments + + - `mol::Molecule`: Input molecule + - `atom_indices::Vector{Int}`: Indices of atoms to include in fragment + +# Returns + + - `Union{String, Missing}`: SMILES string or missing if invalid + +# Example + +```julia +mol = mol_from_smiles("CCCO") +smiles = mol_fragment_to_smiles(mol, [0, 1]) # First two carbons +``` +""" +function mol_fragment_to_smiles(mol::Molecule, atom_indices::Vector{Int}) + !mol.valid && return missing try - return pyconvert(String, _mol_to_molblock(mol._rdkit_mol)) + py_smiles = _mol_fragment_to_smiles(mol._rdkit_mol, atom_indices) + return pyconvert(String, py_smiles) catch e - @warn "Error converting to MOL block: $e" - return "" + @warn "Error converting fragment to SMILES: $e" + return missing end end -####################################################### -# Molecular Editing and Manipulation -####################################################### - """ - combine_mols(mol1::Molecule, mol2::Molecule) -> Molecule + renumber_atoms(mol::Molecule, new_order::Vector{Int}) -> Molecule -Combine two molecules into a single molecule. +Renumber atoms in a molecule according to a new ordering. # Arguments -- `mol1::Molecule`: First molecule -- `mol2::Molecule`: Second molecule + + - `mol::Molecule`: Input molecule + - `new_order::Vector{Int}`: New atom ordering (0-based indices) # Returns -- `Molecule`: Combined molecule + + - `Molecule`: Molecule with renumbered atoms # Example + ```julia -mol1 = mol_from_smiles("CCO") -mol2 = mol_from_smiles("CCC") -combined = combine_mols(mol1, mol2) +mol = mol_from_smiles("CCO") +# Reverse atom order +new_mol = renumber_atoms(mol, [2, 1, 0]) ``` """ -function combine_mols(mol1::Molecule, mol2::Molecule) - if !mol1.valid || !mol2.valid - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "combined") +function renumber_atoms(mol::Molecule, new_order::Vector{Int}) + !mol.valid && return mol + try + new_mol = _renumber_atoms(mol._rdkit_mol, new_order) + return Molecule(; _rdkit_mol = new_mol, valid = true, source = mol.source) + catch e + @warn "Error renumbering atoms: $e" + return mol end +end + +""" + remove_stereochemistry!(mol::Molecule) -> Molecule + +Remove stereochemistry information from a molecule (in-place operation). + +# Arguments + - `mol::Molecule`: Input molecule + +# Returns + + - `Molecule`: Modified molecule + +# Example + +```julia +mol = mol_from_smiles("C[C@H](O)C") +remove_stereochemistry!(mol) +``` +""" +function remove_stereochemistry!(mol::Molecule) + !mol.valid && return mol try - combined_mol = _combine_mols(mol1._rdkit_mol, mol2._rdkit_mol) - return Molecule(_rdkit_mol = combined_mol, valid = true, source = "combined") + _remove_stereochemistry(mol._rdkit_mol) + return mol catch e - @warn "Error combining molecules: $e" - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "combined") + @warn "Error removing stereochemistry: $e" + return mol end end """ - delete_substructs(mol::Molecule, query::String) -> Molecule + sanitize_mol!(mol::Molecule) -> Molecule -Delete substructures matching a SMARTS pattern from a molecule. +Sanitize a molecule (in-place operation). # Arguments -- `mol::Molecule`: Input molecule -- `query::String`: SMARTS pattern to match and delete + + - `mol::Molecule`: Input molecule # Returns -- `Molecule`: Molecule with matching substructures removed + + - `Molecule`: Sanitized molecule # Example + ```julia mol = mol_from_smiles("CCO") -# Remove alcohol groups -mol_modified = delete_substructs(mol, "[OH]") +sanitize_mol!(mol) ``` """ -function delete_substructs(mol::Molecule, query::String) - if !mol.valid +function sanitize_mol!(mol::Molecule) + !mol.valid && return mol + try + _sanitize_mol(mol._rdkit_mol) + return mol + catch e + @warn "Error sanitizing molecule: $e" return mol end +end + +""" + fast_find_rings!(mol::Molecule) -> Molecule + +Find rings in a molecule using the fast algorithm (in-place operation). + +# Arguments + + - `mol::Molecule`: Input molecule + +# Returns + + - `Molecule`: Molecule with ring information computed +# Example + +```julia +mol = mol_from_smiles("c1ccccc1") +fast_find_rings!(mol) +``` +""" +function fast_find_rings!(mol::Molecule) + !mol.valid && return false try - query_mol = _mol_from_smarts(query) - if pyconvert(Bool, query_mol === pybuiltins.None) - @warn "Invalid SMARTS pattern: $query" - return mol + _fast_find_rings(mol._rdkit_mol) + return true + catch e + @warn "Error finding rings: $e" + return false + end +end + +""" + compute_2d_coords!(mol::Molecule) -> Molecule + +Compute 2D coordinates for a molecule (in-place operation). + +# Arguments + + - `mol::Molecule`: Input molecule + +# Returns + + - `Molecule`: Molecule with 2D coordinates + +# Example + +```julia +mol = mol_from_smiles("CCO") +compute_2d_coords!(mol) +println(mol.props) +``` +""" +function compute_2d_coords!(mol::Molecule) + !mol.valid && return mol + try + _compute_2d_coords(mol._rdkit_mol) + conf = mol._rdkit_mol.GetConformer() + num_atoms = pyconvert(Int, mol._rdkit_mol.GetNumAtoms()) + coords_2d = Matrix{Float64}(undef, num_atoms, 2) + + for i in 0:(num_atoms - 1) + pos = conf.GetAtomPosition(i) + coords_2d[i + 1, 1] = pyconvert(Float64, pos.x) + coords_2d[i + 1, 2] = pyconvert(Float64, pos.y) end - modified_mol = _delete_substructs(mol._rdkit_mol, query_mol) - return Molecule(_rdkit_mol = modified_mol, valid = true, source = mol.source) + mol.props[:coordinates_2d] = coords_2d + return mol catch e - @warn "Error deleting substructures: $e" + @warn "Error computing 2D coordinates: $e" return mol end end @@ -237,22 +438,31 @@ end """ replace_substructs(mol::Molecule, query::String, replacement::String) -> Vector{Molecule} -Replace substructures matching a SMARTS pattern with a replacement structure. +Replace substructures in a molecule based on a SMARTS query pattern. # Arguments -- `mol::Molecule`: Input molecule -- `query::String`: SMARTS pattern to match -- `replacement::String`: SMILES string for replacement + + - `mol::Molecule`: Input molecule + - `query::String`: SMARTS pattern for substructure to replace + - `replacement::String`: SMILES pattern for replacement substructure # Returns -- `Vector{Molecule}`: Vector of molecules with replacements made + + - `Vector{Molecule}`: Vector of molecules with replaced substructures # Example + ```julia -mol = mol_from_smiles("CCO") -# Replace alcohol with amine -replacements = replace_substructs(mol, "[OH]", "N") +mol = mol_from_smiles("CCc1ccccc1") # Ethylbenzene +# Replace benzene ring with pyridine +results = replace_substructs(mol, "c1ccccc1", "c1ccncc1") ``` + +# Notes + + - Returns original molecule in vector if replacement fails + - Uses SMARTS pattern matching for query identification + - Can generate multiple products if multiple matches exist """ function replace_substructs(mol::Molecule, query::String, replacement::String) if !mol.valid @@ -263,7 +473,8 @@ function replace_substructs(mol::Molecule, query::String, replacement::String) query_mol = _mol_from_smarts(query) replacement_mol = _mol_from_smiles(replacement) - if pyconvert(Bool, query_mol === pybuiltins.None) || pyconvert(Bool, replacement_mol === pybuiltins.None) + if pyconvert(Bool, query_mol === pybuiltins.None) || + pyconvert(Bool, replacement_mol === pybuiltins.None) @warn "Invalid SMARTS/SMILES pattern" return [mol] end @@ -273,7 +484,10 @@ function replace_substructs(mol::Molecule, query::String, replacement::String) # Convert Python tuple/list to Julia vector molecules = Molecule[] for rdkit_mol in result_mols - push!(molecules, Molecule(_rdkit_mol = rdkit_mol, valid = true, source = mol.source)) + push!( + molecules, + Molecule(; _rdkit_mol = rdkit_mol, valid = true, source = mol.source), + ) end return molecules @@ -293,26 +507,29 @@ end Assign stereochemistry to a molecule in place. # Arguments -- `mol::Molecule`: Input molecule (modified in place) -- `clean_it::Bool`: Whether to clean up the molecule -- `force::Bool`: Whether to force assignment + + - `mol::Molecule`: Input molecule (modified in place) + - `clean_it::Bool`: Whether to clean up the molecule + - `force::Bool`: Whether to force assignment # Returns -- `Bool`: Success status + + - `Bool`: Success status # Example + ```julia mol = mol_from_smiles("C[C@H](O)C") success = assign_stereochemistry!(mol) ``` """ -function assign_stereochemistry!(mol::Molecule; clean_it::Bool=true, force::Bool=false) +function assign_stereochemistry!(mol::Molecule; clean_it::Bool = true, force::Bool = false) if !mol.valid return false end try - _assign_stereochemistry(mol._rdkit_mol; cleanIt=clean_it, force=force) + _assign_stereochemistry(mol._rdkit_mol; cleanIt = clean_it, force = force) return true catch e @warn "Error assigning stereochemistry: $e" @@ -326,26 +543,31 @@ end Find chiral centers in a molecule. # Arguments -- `mol::Molecule`: Input molecule -- `include_unassigned::Bool`: Whether to include unassigned chiral centers + + - `mol::Molecule`: Input molecule + - `include_unassigned::Bool`: Whether to include unassigned chiral centers # Returns -- `Vector{Tuple{Int,String}}`: Vector of (atom_index, chirality) tuples + + - `Vector{Tuple{Int,String}}`: Vector of (atom_index, chirality) tuples # Example + ```julia mol = mol_from_smiles("C[C@H](O)C") chiral_centers = find_chiral_centers(mol) ``` """ -function find_chiral_centers(mol::Molecule; include_unassigned::Bool=false) +function find_chiral_centers(mol::Molecule; include_unassigned::Bool = false) if !mol.valid - return Tuple{Int,String}[] + return Tuple{Int, String}[] end try - centers = _find_mol_chiral_centers(mol._rdkit_mol; includeUnassigned=include_unassigned) - result = Tuple{Int,String}[] + centers = _find_mol_chiral_centers( + mol._rdkit_mol; includeUnassigned = include_unassigned + ) + result = Tuple{Int, String}[] for center in centers atom_idx = pyconvert(Int, center[0]) @@ -356,7 +578,7 @@ function find_chiral_centers(mol::Molecule; include_unassigned::Bool=false) return result catch e @warn "Error finding chiral centers: $e" - return Tuple{Int,String}[] + return Tuple{Int, String}[] end end @@ -364,358 +586,336 @@ end # Ring Analysis ####################################################### +####################################################### +# Pattern Matching +####################################################### + """ - fast_find_rings!(mol::Molecule) -> Bool + quick_smarts_match(mol::Molecule, smarts::String) -> Bool -Perform fast ring finding on a molecule in place. +Quick check if a molecule matches a SMARTS pattern. # Arguments -- `mol::Molecule`: Input molecule (modified in place) + + - `mol::Molecule`: Input molecule + - `smarts::String`: SMARTS pattern # Returns -- `Bool`: Success status + + - `Bool`: Whether the molecule matches the pattern # Example + ```julia -mol = mol_from_smiles("c1ccccc1") -success = fast_find_rings!(mol) +mol = mol_from_smiles("CCO") +has_alcohol = quick_smarts_match(mol, "[OH]") ``` """ -function fast_find_rings!(mol::Molecule) +function quick_smarts_match(mol::Molecule, smarts::String) if !mol.valid return false end try - _fast_find_rings(mol._rdkit_mol) - return true + # Use the standard substructure matching instead of QuickSmartsMatch + query_mol = _mol_from_smarts(smarts) + if pyconvert(Bool, query_mol === pybuiltins.None) + @warn "Invalid SMARTS pattern: $smarts" + return false + end + + return pyconvert(Bool, mol._rdkit_mol.HasSubstructMatch(query_mol)) catch e - @warn "Error finding rings: $e" + @warn "Error in SMARTS matching: $e" return false end end -""" - canonical_atom_ranks(mol::Molecule) -> Vector{Int} +####################################################### +# Additional Missing Operations +####################################################### -Get canonical ranking of atoms in a molecule. +""" + split_mol_by_pdb_chain_id(mol::Molecule) -> Union{Vector{Molecule}, Missing} -# Arguments -- `mol::Molecule`: Input molecule +Split a PDB molecule by chain ID. # Returns -- `Vector{Int}`: Canonical ranks for each atom -# Example -```julia -mol = mol_from_smiles("CCO") -ranks = canonical_atom_ranks(mol) -``` + - `Union{Vector{Molecule}, Missing}`: Vector of molecules (one per chain) or missing if operation fails """ -function canonical_atom_ranks(mol::Molecule) - if !mol.valid - return Int[] +function split_mol_by_pdb_chain_id(mol::Molecule) + !mol.valid && return missing + try + mol_dict = _split_mol_by_pdb_chain_id(mol._rdkit_mol) + molecules = Molecule[] + for (chain_id, rdkit_mol) in mol_dict + push!( + molecules, + Molecule(; + _rdkit_mol = rdkit_mol, valid = true, source = "PDB_chain_$chain_id" + ), + ) + end + return molecules + catch e + @warn "Error splitting molecule by chain ID: $e" + return missing end +end +""" + split_mol_by_pdb_residues(mol::Molecule) -> Union{Vector{Molecule}, Missing} + +Split a PDB molecule by residues. + +# Returns + + - `Union{Vector{Molecule}, Missing}`: Vector of molecules (one per residue) or missing if operation fails +""" +function split_mol_by_pdb_residues(mol::Molecule) + !mol.valid && return missing try - ranks = _canonical_rank_atoms(mol._rdkit_mol) - return [pyconvert(Int, rank) for rank in ranks] + mol_list = _split_mol_by_pdb_residues(mol._rdkit_mol) + molecules = Molecule[] + for (i, rdkit_mol) in enumerate(mol_list) + push!( + molecules, + Molecule(; _rdkit_mol = rdkit_mol, valid = true, source = "PDB_residue_$i"), + ) + end + return molecules catch e - @warn "Error getting canonical ranks: $e" - return Int[] + @warn "Error splitting molecule by residues: $e" + return missing end end -####################################################### -# Pattern Matching -####################################################### - """ - quick_smarts_match(mol::Molecule, smarts::String) -> Bool + assign_stereochemistry_from_3d!(mol::Molecule; conf_id::Int = -1) -> Bool -Quick check if a molecule matches a SMARTS pattern. +Assign stereochemistry from 3D coordinates. # Arguments -- `mol::Molecule`: Input molecule -- `smarts::String`: SMARTS pattern + + - `mol::Molecule`: Input molecule (modified in place) + - `conf_id::Int`: Conformer ID to use (-1 for default) # Returns -- `Bool`: Whether the molecule matches the pattern -# Example -```julia -mol = mol_from_smiles("CCO") -has_alcohol = quick_smarts_match(mol, "[OH]") -``` + - `Bool`: true if successful, false otherwise """ -function quick_smarts_match(mol::Molecule, smarts::String) - if !mol.valid - return false - end - +function assign_stereochemistry_from_3d!(mol::Molecule; conf_id::Int = -1) + !mol.valid && return false try - # Use the standard substructure matching instead of QuickSmartsMatch - query_mol = _mol_from_smarts(smarts) - if pyconvert(Bool, query_mol === pybuiltins.None) - @warn "Invalid SMARTS pattern: $smarts" - return false - end - - return pyconvert(Bool, mol._rdkit_mol.HasSubstructMatch(query_mol)) + _assign_stereochemistry_from_3d(mol._rdkit_mol; confId = conf_id) + return true catch e - @warn "Error in SMARTS matching: $e" + @warn "Error assigning stereochemistry from 3D: $e" return false end end """ - mol_fragment_to_smarts(mol::Molecule, atom_indices::Vector{Int}) -> String + detect_bond_stereochemistry(mol::Molecule, bond_idx::Int) -> Union{String, Missing} -Convert a molecular fragment to SMARTS representation. +Detect the stereochemistry of a specific bond. # Arguments -- `mol::Molecule`: Input molecule -- `atom_indices::Vector{Int}`: Atom indices to include in fragment (0-based) + + - `mol::Molecule`: Input molecule + - `bond_idx::Int`: Bond index # Returns -- `String`: SMARTS representation of the fragment -# Example -```julia -mol = mol_from_smiles("CCO") -smarts = mol_fragment_to_smarts(mol, [0, 1]) # First two atoms -``` + - `Union{String, Missing}`: Stereochemistry designation or missing if detection fails """ -function mol_fragment_to_smarts(mol::Molecule, atom_indices::Vector{Int}) - if !mol.valid - return "" - end - +function detect_bond_stereochemistry(mol::Molecule, bond_idx::Int) + !mol.valid && return missing try - return pyconvert(String, _mol_fragment_to_smarts(mol._rdkit_mol, atom_indices)) + return pyconvert(String, _detect_bond_stereochemistry(mol._rdkit_mol, bond_idx)) catch e - @warn "Error converting fragment to SMARTS: $e" - return "" + @warn "Error detecting bond stereochemistry: $e" + return missing end end -# XYZ format support """ - mol_from_xyz_file(filename::String) -> Molecule + find_potential_stereo(mol::Molecule) -> Union{Vector, Missing} -Read a molecule from an XYZ file. +Find potential stereo centers in a molecule. -XYZ is a simple file format for storing molecular coordinates. The format typically -contains the number of atoms, an optional comment line, and atom coordinates. +# Returns -# Arguments -- `filename`: Path to the XYZ file + - `Union{Vector, Missing}`: Vector of potential stereo information or missing if detection fails +""" +function find_potential_stereo(mol::Molecule) + !mol.valid && return missing + try + return pyconvert(Vector, _find_potential_stereo(mol._rdkit_mol)) + catch e + @warn "Error finding potential stereo: $e" + return missing + end +end + +""" + canonicalize_enhanced_stereo!(mol::Molecule) -> Bool + +Canonicalize enhanced stereochemistry information. # Returns -- `Molecule`: The parsed molecule object -- Returns invalid molecule if parsing fails -# Examples -```julia -mol = mol_from_xyz_file("molecule.xyz") -if mol.valid - println("Successfully loaded molecule with ", heavy_atom_count(mol), " heavy atoms") + - `Bool`: true if successful, false otherwise +""" +function canonicalize_enhanced_stereo!(mol::Molecule) + !mol.valid && return false + try + _canonicalize_enhanced_stereo(mol._rdkit_mol) + return true + catch e + @warn "Error canonicalizing enhanced stereo: $e" + return false + end end -``` -# Notes -- XYZ files contain only 3D coordinates, no bond information -- RDKit may infer bonds based on interatomic distances -- Not all XYZ files will produce chemically meaningful molecules """ -function mol_from_xyz_file(filename::String) + set_bond_stereo_from_directions!(mol::Molecule) -> Bool + +Set bond stereochemistry from directional information. + +# Returns + + - `Bool`: true if successful, false otherwise +""" +function set_bond_stereo_from_directions!(mol::Molecule) + !mol.valid && return false try - rdkit_mol = _mol_from_xyz_file(filename) - if pyconvert(Bool, rdkit_mol === pybuiltins.None) - @warn "Failed to parse XYZ file: $filename" - return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = filename, props = Dict{Symbol, Any}()) - end - return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = filename, props = Dict{Symbol, Any}()) + _set_bond_stereo_from_directions(mol._rdkit_mol) + return true catch e - @warn "Error reading XYZ file $filename: $e" - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = filename, props = Dict{Symbol, Any}()) + @warn "Error setting bond stereo from directions: $e" + return false end end """ - mol_from_xyz_block(xyz_block::String) -> Molecule + wedge_mol_bonds!(mol::Molecule; wedge_bonds::Bool = true) -> Bool -Parse a molecule from an XYZ format string. +Add wedge information to molecule bonds for visualization. # Arguments -- `xyz_block`: XYZ format string containing atom coordinates -# Returns -- `Molecule`: The parsed molecule object -- Returns invalid molecule if parsing fails + - `mol::Molecule`: Input molecule (modified in place) + - `wedge_bonds::Bool`: Whether to add wedge bonds -# Examples -```julia -xyz_data = \"\"\"3 -Ethanol molecule -C 0.000 0.000 0.000 -C 1.520 0.000 0.000 -O 2.080 1.100 0.000\"\"\" -mol = mol_from_xyz_block(xyz_data) -``` +# Returns -# Notes -- First line: number of atoms -- Second line: comment (molecule name/description) -- Following lines: element symbol and x, y, z coordinates + - `Bool`: true if successful, false otherwise """ -function mol_from_xyz_block(xyz_block::String) +function wedge_mol_bonds!(mol::Molecule; wedge_bonds::Bool = true) + !mol.valid && return false try - rdkit_mol = _mol_from_xyz_block(xyz_block) - if pyconvert(Bool, rdkit_mol === pybuiltins.None) - @warn "Failed to parse XYZ block" - return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = "xyz_block", props = Dict{Symbol, Any}()) - end - return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = "xyz_block", props = Dict{Symbol, Any}()) + _wedge_mol_bonds(mol._rdkit_mol; wedgeBonds = wedge_bonds) + return true catch e - @warn "Error parsing XYZ block: $e" - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "xyz_block", props = Dict{Symbol, Any}()) + @warn "Error wedging mol bonds: $e" + return false end end """ - mol_to_xyz_block(mol::Molecule) -> String + find_ring_families(mol::Molecule) -> Union{Vector, Missing} + +Find ring families in the molecule. -Convert a molecule to XYZ format string. +# Returns -Exports the 3D coordinates of the molecule in XYZ format. The molecule must have -3D coordinates assigned for this to work properly. + - `Union{Vector, Missing}`: Vector of ring families or missing if operation fails +""" +function find_ring_families(mol::Molecule) + !mol.valid && return missing + try + return pyconvert(Vector, _find_ring_families(mol._rdkit_mol)) + catch e + @warn "Error finding ring families: $e" + return missing + end +end + +""" + canonical_rank_atoms_in_fragment(mol::Molecule, atoms_to_use::Vector{Int}) -> Union{Vector{Int}, Missing} + +Get canonical ranking of atoms in a molecular fragment. # Arguments -- `mol`: Molecule object with 3D coordinates -# Returns -- `String`: XYZ format representation -- Empty string if conversion fails or molecule has no 3D coordinates + - `mol::Molecule`: Input molecule + - `atoms_to_use::Vector{Int}`: Atom indices to include in ranking -# Examples -```julia -mol = mol_from_smiles("CCO") -# Generate 3D coordinates first -conformers = generate_3d_conformers(mol, 1) -if !isempty(conformers) - mol_3d = conformers[1].molecule - xyz_string = mol_to_xyz_block(mol_3d) - println(xyz_string) -end -``` +# Returns -# Notes -- Requires 3D coordinates to be present in the molecule -- Only exports heavy atoms (no hydrogens unless explicit) -- Bond information is lost in XYZ format + - `Union{Vector{Int}, Missing}`: Canonical ranks or missing if operation fails """ -function mol_to_xyz_block(mol::Molecule) - !mol.valid && return "" +function canonical_rank_atoms_in_fragment(mol::Molecule, atoms_to_use::Vector{Int}) + !mol.valid && return missing try - return pyconvert(String, _mol_to_xyz_block(mol._rdkit_mol)) + ranks = _canonical_rank_atoms_in_fragment(mol._rdkit_mol, atoms_to_use) + return pyconvert(Vector{Int}, ranks) catch e - @warn "Error converting molecule to XYZ: $e" - return "" + @warn "Error ranking atoms in fragment: $e" + return missing end end -# MOL2 format support """ - mol_from_mol2_file(filename::String) -> Molecule + find_atom_environment_of_radius_n(mol::Molecule, radius::Int, atom_idx::Int) -> Union{Vector{Int}, Missing} -Read a molecule from a MOL2 file. - -MOL2 (Sybyl format) is a comprehensive molecular file format that includes -atoms, bonds, and additional molecular information like partial charges. +Find atoms within a specified radius of a given atom. # Arguments -- `filename`: Path to the MOL2 file -# Returns -- `Molecule`: The parsed molecule object -- Returns invalid molecule if parsing fails + - `mol::Molecule`: Input molecule + - `radius::Int`: Search radius + - `atom_idx::Int`: Central atom index -# Examples -```julia -mol = mol_from_mol2_file("ligand.mol2") -if mol.valid - println("Loaded molecule: ", mol_to_smiles(mol)) - println("Molecular weight: ", molecular_weight(mol)) -end -``` +# Returns -# Notes -- MOL2 format preserves bond orders and formal charges -- May contain multiple molecules (only first one is loaded) -- Partial charges are preserved if present in the file + - `Union{Vector{Int}, Missing}`: Atom indices within radius or missing if operation fails """ -function mol_from_mol2_file(filename::String) +function find_atom_environment_of_radius_n(mol::Molecule, radius::Int, atom_idx::Int) + !mol.valid && return missing try - rdkit_mol = _mol_from_mol2_file(filename) - if pyconvert(Bool, rdkit_mol === pybuiltins.None) - @warn "Failed to parse MOL2 file: $filename" - return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = filename, props = Dict{Symbol, Any}()) - end - return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = filename, props = Dict{Symbol, Any}()) + env = _find_atom_environment_of_radius_n(mol._rdkit_mol, radius, atom_idx) + return pyconvert(Vector{Int}, env) catch e - @warn "Error reading MOL2 file $filename: $e" - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = filename, props = Dict{Symbol, Any}()) + @warn "Error finding atom environment: $e" + return missing end end """ - mol_from_mol2_block(mol2_block::String) -> Molecule + get_most_substituted_core_match(mol::Molecule, core::Molecule) -> Union{Vector{Int}, Missing} -Parse a molecule from a MOL2 format string. +Get the most substituted core match in a molecule. # Arguments -- `mol2_block`: MOL2 format string -# Returns -- `Molecule`: The parsed molecule object -- Returns invalid molecule if parsing fails + - `mol::Molecule`: Input molecule + - `core::Molecule`: Core pattern to match -# Examples -```julia -mol2_data = \"\"\"@MOLECULE -ethanol -3 2 0 0 0 -SMALL -NO_CHARGES - -@ATOM -1 C1 0.0000 0.0000 0.0000 C.3 1 RES1 0.0000 -2 C2 1.5200 0.0000 0.0000 C.3 1 RES1 0.0000 -3 O1 2.0800 1.1000 0.0000 O.3 1 RES1 0.0000 - -@BOND -1 1 2 1 -2 2 3 1 -\"\"\" -mol = mol_from_mol2_block(mol2_data) -``` +# Returns -# Notes -- MOL2 format uses TRIPOS record types -- Supports various atom and bond types -- May include substructure and partial charge information + - `Union{Vector{Int}, Missing}`: Atom indices of the match or missing if no match found """ -function mol_from_mol2_block(mol2_block::String) +function get_most_substituted_core_match(mol::Molecule, core::Molecule) + !mol.valid && return missing + !core.valid && return missing try - rdkit_mol = _mol_from_mol2_block(mol2_block) - if pyconvert(Bool, rdkit_mol === pybuiltins.None) - @warn "Failed to parse MOL2 block" - return Molecule(_rdkit_mol = rdkit_mol, valid = false, source = "mol2_block", props = Dict{Symbol, Any}()) - end - return Molecule(_rdkit_mol = rdkit_mol, valid = true, source = "mol2_block", props = Dict{Symbol, Any}()) + match = _get_most_substituted_core_match(mol._rdkit_mol, core._rdkit_mol) + return pyconvert(Vector{Int}, match) catch e - @warn "Error parsing MOL2 block: $e" - return Molecule(_rdkit_mol = pybuiltins.None, valid = false, source = "mol2_block", props = Dict{Symbol, Any}()) + @warn "Error finding most substituted core match: $e" + return missing end -end \ No newline at end of file +end diff --git a/src/pharmacophore/pharmacophore.jl b/src/pharmacophore/pharmacophore.jl new file mode 100644 index 0000000..e27cb01 --- /dev/null +++ b/src/pharmacophore/pharmacophore.jl @@ -0,0 +1,395 @@ +####################################################### +# Pharmacophore Features and Analysis +####################################################### + +""" + FeatureFactory + +Struct to hold a chemical feature factory for pharmacophore analysis. + +# Fields +- `_rdkit_factory::Py`: The underlying RDKit feature factory object +- `feature_families::Vector{String}`: List of available feature families +- `num_definitions::Int`: Number of feature definitions + +# Example +```julia +factory = create_feature_factory() +println("Available families: ", factory.feature_families) +``` +""" +@kwdef struct FeatureFactory + _rdkit_factory::Py + feature_families::Vector{String} = String[] + num_definitions::Int = 0 +end + +function Base.show(io::IO, factory::FeatureFactory) + nfamilies = length(factory.feature_families) + ndefs = factory.num_definitions + + print(io, "FeatureFactory(") + print(io, "$nfamilies families, $ndefs definitions") + + if nfamilies > 0 + print(io, ": ") + if nfamilies <= 6 + # Show all families if 6 or fewer + print(io, join(factory.feature_families, ", ")) + else + # Show first 5 and indicate there are more + print(io, join(factory.feature_families[1:5], ", ")) + print(io, ", ... ($(nfamilies-5) more)") + end + end + + print(io, ")") +end + +""" + ChemicalFeature + +Represents a chemical feature identified in a molecule. + +# Fields +- `family::String`: Feature family (e.g., "Donor", "Acceptor", "Aromatic") +- `type::String`: Specific feature type +- `atom_ids::Vector{Int}`: Atom indices involved in the feature (1-based) +- `position::Vector{Float64}`: 3D coordinates of the feature center +- `id::Int`: Feature identifier + +# Example +```julia +features = get_mol_features(mol, factory) +for feature in features + println("Feature: ", feature.family, " at position ", feature.position) +end +``` +""" +@kwdef struct ChemicalFeature + family::String + type::String + atom_ids::Vector{Int} + position::Vector{Float64} + id::Int +end + +""" + create_feature_factory(;use_default::Bool = true, fdef_file::Union{String, Nothing} = nothing, fdef_string::Union{String, Nothing} = nothing) -> FeatureFactory + +Create a chemical feature factory for pharmacophore analysis. + +# Arguments +- `use_default::Bool = true`: Whether to use RDKit's default feature definitions +- `fdef_file::Union{String, Nothing} = nothing`: Path to custom feature definition file +- `fdef_string::Union{String, Nothing} = nothing`: Custom feature definition string + +# Returns +- `FeatureFactory`: A feature factory for identifying molecular features + +# Examples +```julia +# Use default RDKit feature definitions +factory = create_feature_factory() + +# Use custom feature definition file +factory = create_feature_factory(use_default=false, fdef_file="custom.fdef") + +# Use custom feature definition string +fdef = \"\"\" +DefineFeature HAcceptor1 [N,O;H0] + Family HBondAcceptor + Weights 1.0 +EndFeature +\"\"\" +factory = create_feature_factory(use_default=false, fdef_string=fdef) +``` + +# Notes +- Default factory includes: Donor, Acceptor, NegIonizable, PosIonizable, ZnBinder, Aromatic, Hydrophobe, LumpedHydrophobe +- Custom definitions use SMARTS patterns +""" +function create_feature_factory(;use_default::Bool = true, fdef_file::Union{String, Nothing} = nothing, fdef_string::Union{String, Nothing} = nothing) + try + if use_default + # Use RDKit's default BaseFeatures.fdef + data_dir = _get_rdconfig_data_dir() + default_fdef = joinpath(pyconvert(String, data_dir), "BaseFeatures.fdef") + rdkit_factory = _build_feature_factory(default_fdef) + elseif fdef_file !== nothing + rdkit_factory = _build_feature_factory(fdef_file) + elseif fdef_string !== nothing + rdkit_factory = _build_feature_factory_from_string(fdef_string) + else + throw(ArgumentError("Must specify either use_default=true, fdef_file, or fdef_string")) + end + + # Extract feature information + families = pyconvert(Vector{String}, _get_feature_families(rdkit_factory)) + num_defs = pyconvert(Int, _get_num_feature_defs(rdkit_factory)) + + return FeatureFactory( + _rdkit_factory = rdkit_factory, + feature_families = families, + num_definitions = num_defs + ) + catch e + @warn "Error creating feature factory: $e" + return FeatureFactory( + _rdkit_factory = pybuiltins.None, + feature_families = String[], + num_definitions = 0 + ) + end +end + +""" + get_mol_features(mol::Molecule, factory::FeatureFactory; conf_id::Int = -1) -> Vector{ChemicalFeature} + +Extract chemical features from a molecule using the specified feature factory. + +# Arguments +- `mol::Molecule`: Input molecule (must have 2D or 3D conformer with coordinates) +- `factory::FeatureFactory`: Feature factory for feature identification +- `conf_id::Int = -1`: Conformer ID to use (-1 for default) + +# Returns +- `Vector{ChemicalFeature}`: List of identified chemical features + +# Examples +```julia +mol = mol_from_smiles("CCO") # Ethanol +factory = create_feature_factory() + +# Generate coordinates (required for feature extraction) +conformers_2d = generate_2d_conformers(mol) +if !isempty(conformers_2d) + mol_2d = conformers_2d[1].molecule + features = get_mol_features(mol_2d, factory) + + println("Found ", length(features), " features:") + for feature in features + println(" ", feature.family, " at atoms ", feature.atom_ids) + end +end +``` + +# Notes +- **Coordinate Requirement**: Molecule must have 2D or 3D conformer coordinates. Use `generate_2d_conformers()` or `generate_3d_conformers()` first. +- Returns empty vector for molecules without conformers or invalid molecules +- 3D coordinates provide more accurate feature positioning than 2D +""" +function get_mol_features(mol::Molecule, factory::FeatureFactory; conf_id::Int = -1) + !mol.valid && return ChemicalFeature[] + + try + rdkit_features = _get_features_for_mol(factory._rdkit_factory, mol._rdkit_mol; conf_id = conf_id) + features = ChemicalFeature[] + + for (i, rdkit_feature) in enumerate(rdkit_features) + family = pyconvert(String, rdkit_feature.GetFamily()) + feature_type = pyconvert(String, rdkit_feature.GetType()) + atom_ids = pyconvert(Vector{Int}, rdkit_feature.GetAtomIds()) .+ 1 # Convert to 1-based + + # Extract position coordinates from Point3D object + pos_obj = rdkit_feature.GetPos() + pos_coords = try + [pyconvert(Float64, pos_obj.x), pyconvert(Float64, pos_obj.y), pyconvert(Float64, pos_obj.z)] + catch + # If direct access fails, use default position + [0.0, 0.0, 0.0] + end + + feature_id = pyconvert(Int, rdkit_feature.GetId()) + + push!(features, ChemicalFeature( + family = family, + type = feature_type, + atom_ids = atom_ids, + position = pos_coords, + id = feature_id + )) + end + + return features + catch e + @warn "Error extracting molecular features: $e" + return ChemicalFeature[] + end +end + +""" + pharmacophore_fingerprint(mol::Molecule; min_points::Int = 2, max_points::Int = 3, factory::Union{FeatureFactory, Nothing} = nothing) -> Union{Vector{Bool}, Missing} + +Generate a pharmacophore fingerprint for a molecule. + +# Arguments +- `mol::Molecule`: Input molecule (no conformer coordinates required) +- `min_points::Int = 2`: Minimum number of pharmacophore points +- `max_points::Int = 3`: Maximum number of pharmacophore points +- `factory::Union{FeatureFactory, Nothing} = nothing`: Feature factory (default will be created if not provided) + +# Returns +- `Union{Vector{Bool}, Missing}`: Binary pharmacophore fingerprint, or missing if molecule is invalid + +# Examples +```julia +mol = mol_from_smiles("c1ccccc1O") # Phenol +fp = pharmacophore_fingerprint(mol) + +# Custom parameters +fp = pharmacophore_fingerprint(mol; min_points=2, max_points=4) +``` + +# Notes +- **No Conformer Required**: Works directly with molecular structure using 2D topological distances +- Combines chemical features with distance information +- Fingerprint length depends on feature combinations and distance bins +- More efficient than 3D approaches as no coordinate generation is needed +""" +function pharmacophore_fingerprint(mol::Molecule; min_points::Int = 2, max_points::Int = 3, factory::Union{FeatureFactory, Nothing} = nothing) + !mol.valid && return missing + + try + # Create factory if not provided + if factory === nothing + factory = create_feature_factory() + end + + # Create signature factory + sig_factory = _create_sig_factory(factory._rdkit_factory; min_point_count = min_points, max_point_count = max_points) + + # Generate fingerprint + fp = _get_pharmacophore_fingerprint(mol._rdkit_mol, factory._rdkit_factory, sig_factory) + + # Convert IntSparseIntVect to Julia vector + fp_size = pyconvert(Int, fp.GetLength()) + result = zeros(Bool, fp_size) + + # Get non-zero indices and set them to true + non_zero_indices = fp.GetNonzeroElements() + for (idx, val) in non_zero_indices.items() + if pyconvert(Int, val) > 0 + result[pyconvert(Int, idx) + 1] = true # Convert to 1-based indexing + end + end + + return result + catch e + @warn "Error generating pharmacophore fingerprint: $e" + return missing + end +end + +""" + get_feature_families(factory::FeatureFactory) -> Vector{String} + +Get the list of feature families defined in the feature factory. + +# Arguments +- `factory::FeatureFactory`: The feature factory + +# Returns +- `Vector{String}`: List of feature family names + +# Example +```julia +factory = create_feature_factory() +families = get_feature_families(factory) +println("Available families: ", families) +``` +""" +function get_feature_families(factory::FeatureFactory) + return factory.feature_families +end + +""" + filter_features_by_family(features::Vector{ChemicalFeature}, family::String) -> Vector{ChemicalFeature} + +Filter chemical features by their family type. + +# Arguments +- `features::Vector{ChemicalFeature}`: List of chemical features (from `get_mol_features`) +- `family::String`: Feature family to filter by (e.g., "Donor", "Acceptor") + +# Returns +- `Vector{ChemicalFeature}`: Filtered list of features + +# Example +```julia +mol = mol_from_smiles("CCO") +factory = create_feature_factory() + +# Generate coordinates for feature extraction +conformers_2d = generate_2d_conformers(mol) +if !isempty(conformers_2d) + mol_2d = conformers_2d[1].molecule + features = get_mol_features(mol_2d, factory) + + # Get only hydrogen bond donors + donors = filter_features_by_family(features, "Donor") + println("Found ", length(donors), " hydrogen bond donors") +end +``` +""" +function filter_features_by_family(features::Vector{ChemicalFeature}, family::String) + return filter(f -> f.family == family, features) +end + +""" + get_pharmacophore_3d(mol::Molecule, factory::FeatureFactory; conf_id::Int = -1) -> Vector{Tuple{String, Vector{Float64}}} + +Extract 3D pharmacophore points from a molecule. + +# Arguments +- `mol::Molecule`: Input molecule (must have 3D conformer with coordinates) +- `factory::FeatureFactory`: Feature factory for feature identification +- `conf_id::Int = -1`: Conformer ID to use (-1 for default) + +# Returns +- `Vector{Tuple{String, Vector{Float64}}}`: List of (feature_family, 3d_position) tuples + +# Example +```julia +mol = mol_from_smiles("CCO") +conformers_3d = generate_3d_conformers(mol, 1) +if !isempty(conformers_3d) + mol_3d = conformers_3d[1].molecule + factory = create_feature_factory() + ph4_points = get_pharmacophore_3d(mol_3d, factory) + + for (family, pos) in ph4_points + println(" \$(family) at position [\$(pos[1]), \$(pos[2]), \$(pos[3])]") + end +end +``` + +# Notes +- **3D Conformer Required**: Molecule must have 3D coordinates. Use `generate_3d_conformers()` first. +- Returns empty vector for molecules without 3D conformers +- Provides spatial positioning of pharmacophore features for 3D analysis +""" +function get_pharmacophore_3d(mol::Molecule, factory::FeatureFactory; conf_id::Int = -1) + !mol.valid && return Tuple{String, Vector{Float64}}[] + + try + feature_list = _explicit_pharmacophore_from_mol(mol._rdkit_mol, factory._rdkit_factory; conf_id = conf_id) + return feature_list + catch e + @warn "Error extracting 3D pharmacophore: $e" + return Tuple{String, Vector{Float64}}[] + end +end + +# Vectorized operations +function get_mol_features(mols::Vector{Molecule}, factory::FeatureFactory; conf_id::Int = -1) + return [get_mol_features(mol, factory; conf_id = conf_id) for mol in mols] +end + +function pharmacophore_fingerprint(mols::Vector{Molecule}; min_points::Int = 2, max_points::Int = 3, factory::Union{FeatureFactory, Nothing} = nothing) + return [pharmacophore_fingerprint(mol; min_points = min_points, max_points = max_points, factory = factory) for mol in mols] +end + +function get_pharmacophore_3d(mols::Vector{Molecule}, factory::FeatureFactory; conf_id::Int = -1) + return [get_pharmacophore_3d(mol, factory; conf_id = conf_id) for mol in mols] +end \ No newline at end of file diff --git a/src/rdkit.jl b/src/rdkit.jl index a904842..73c6472 100644 --- a/src/rdkit.jl +++ b/src/rdkit.jl @@ -168,6 +168,62 @@ function _inertial_shape_factor(mol::Py; confId::Int = -1) @pyconst(pyimport("rdkit.Chem.Descriptors3D").InertialShapeFactor)(mol; confId = confId) end +# Advanced 3D descriptors +function _spherocity_index(mol::Py; confId::Int = -1) + @pyconst(pyimport("rdkit.Chem.Descriptors3D").SpherocityIndex)(mol; confId = confId) +end + +function _calc_getaway( + mol::Py; confId::Int = -1, precision::Int = 2, custom_atom_property::String = "" +) + if custom_atom_property == "" + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcGETAWAY)( + mol; confId = confId, precision = precision + ) + else + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcGETAWAY)( + mol; + confId = confId, + precision = precision, + CustomAtomProperty = custom_atom_property, + ) + end +end + +function _calc_whim( + mol::Py; confId::Int = -1, thresh::Float64 = 0.001, custom_atom_property::String = "" +) + if custom_atom_property == "" + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcWHIM)( + mol; confId = confId, thresh = thresh + ) + else + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcWHIM)( + mol; confId = confId, thresh = thresh, CustomAtomProperty = custom_atom_property + ) + end +end + +function _calc_rdf(mol::Py; confId::Int = -1, custom_atom_property::String = "") + if custom_atom_property == "" + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcRDF)(mol; confId = confId) + else + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcRDF)( + mol; confId = confId, CustomAtomProperty = custom_atom_property + ) + end +end + +function _calc_morse(mol::Py; confId::Int = -1, custom_atom_property::String = "") + if custom_atom_property == "" + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcMORSE)(mol; confId = confId) + else + @pyconst(pyimport("rdkit.Chem.rdMolDescriptors").CalcMORSE)( + mol; confId = confId, CustomAtomProperty = custom_atom_property + ) + end +end + # Synthetic Accessibility Score function _sascore(mol::Py) @pyconst(pyimport("rdkit.Contrib.SA_Score.sascorer").calculateScore)(mol) @@ -346,7 +402,7 @@ function _find_ring_families(mol::Py) @pyconst(pyimport("rdkit.Chem").FindRingFamilies)(mol) end function _get_ring_info(mol::Py) - @pyconst(pyimport("rdkit.Chem").GetRingInfo)(mol) + mol.GetRingInfo() end function _canonical_rank_atoms(mol::Py) @pyconst(pyimport("rdkit.Chem").CanonicalRankAtoms)(mol) @@ -635,10 +691,6 @@ function _min_absolute_e_state_index(mol::Py) @pyconst(pyimport("rdkit.Chem.Descriptors").MinAbsEStateIndex)(mol) end -# Functional group counts - drug discovery relevant -function _num_sp3_hbonds(mol::Py) - @pyconst(pyimport("rdkit.Chem.Descriptors").NumSP3Hbonds)(mol) -end function _num_aliphatic_rings(mol::Py) @pyconst(pyimport("rdkit.Chem.Descriptors").NumAliphaticRings)(mol) end @@ -750,33 +802,39 @@ function _reaction_get_reacting_atoms(rxn::Py) return rxn.GetReactingAtoms() end -function _reaction_fingerprint(rxn::Py, fp_size::Int=2048) +function _reaction_fingerprint(rxn::Py, fp_size::Int = 2048) rdkit_reactions = @pyconst(pyimport("rdkit.Chem.rdChemReactions")) params = rdkit_reactions.ReactionFingerprintParams() params.fpSize = fp_size return rdkit_reactions.CreateDifferenceFingerprintForReaction(rxn, params) end -function _reaction_structural_fingerprint(rxn::Py, fp_size::Int=2048) +function _reaction_structural_fingerprint(rxn::Py, fp_size::Int = 2048) rdkit_reactions = @pyconst(pyimport("rdkit.Chem.rdChemReactions")) params = rdkit_reactions.ReactionFingerprintParams() params.fpSize = fp_size return rdkit_reactions.CreateStructuralFingerprintForReaction(rxn, params) end -function _compute_reaction_center_fingerprint(rxn::Py, fp_size::Int=2048) +function _compute_reaction_center_fingerprint(rxn::Py, fp_size::Int = 2048) rdkit_reactions = @pyconst(pyimport("rdkit.Chem.rdChemReactions")) params = rdkit_reactions.ReactionFingerprintParams() params.fpSize = fp_size return rdkit_reactions.CreateDifferenceFingerprintForReaction(rxn, params) end -function _reaction_run_reactants_inline_properties(rxn::Py, reactants::Vector{Py}, max_products::Int=1000) +function _reaction_run_reactants_inline_properties( + rxn::Py, reactants::Vector{Py}, max_products::Int = 1000 +) rxn.RunReactants(pylist(reactants), max_products) end -function _reaction_enumerate_library_from_reaction(rxn::Py, reactant_lists::Vector{Vector{Py}}) - @pyconst(pyimport("rdkit.Chem.AllChem").EnumerateLibraryFromReaction)(rxn, pylist([pylist(r) for r in reactant_lists])) +function _reaction_enumerate_library_from_reaction( + rxn::Py, reactant_lists::Vector{Vector{Py}} +) + @pyconst(pyimport("rdkit.Chem.AllChem").EnumerateLibraryFromReaction)( + rxn, pylist([pylist(r) for r in reactant_lists]) + ) end function _reaction_to_smarts(rxn::Py) @@ -787,16 +845,20 @@ function _reaction_compute_atom_mapping(rxn::Py) @pyconst(pyimport("rdkit.Chem.rdChemReactions").ReduceProductToSideChains)(rxn) end -function _reaction_sanitize_reaction(rxn::Py, sanitize_ops::Int=15) +function _reaction_sanitize_reaction(rxn::Py, sanitize_ops::Int = 15) @pyconst(pyimport("rdkit.Chem.rdChemReactions").SanitizeRxn)(rxn, sanitize_ops) end -function _reaction_remove_unmapped_reactant_templates(rxn::Py, mode::Int=1) - @pyconst(pyimport("rdkit.Chem.rdChemReactions").RemoveUnmappedReactantTemplates)(rxn, mode) +function _reaction_remove_unmapped_reactant_templates(rxn::Py, mode::Int = 1) + @pyconst(pyimport("rdkit.Chem.rdChemReactions").RemoveUnmappedReactantTemplates)( + rxn, mode + ) end -function _reaction_remove_unmapped_product_templates(rxn::Py, mode::Int=1) - @pyconst(pyimport("rdkit.Chem.rdChemReactions").RemoveUnmappedProductTemplates)(rxn, mode) +function _reaction_remove_unmapped_product_templates(rxn::Py, mode::Int = 1) + @pyconst(pyimport("rdkit.Chem.rdChemReactions").RemoveUnmappedProductTemplates)( + rxn, mode + ) end function _reaction_preprocess(rxn::Py) @@ -810,7 +872,7 @@ end function _get_atom_mapping_numbers(mol::Py) atom_map_nums = [] num_atoms = pyconvert(Int, mol.GetNumAtoms()) - for i in 0:(num_atoms-1) + for i in 0:(num_atoms - 1) atom = mol.GetAtomWithIdx(i) map_num = pyconvert(Int, atom.GetAtomMapNum()) push!(atom_map_nums, map_num) @@ -823,4 +885,83 @@ function _set_atom_mapping_numbers(mol::Py, map_nums::Vector{Int}) atom = mol.GetAtomWithIdx(i-1) atom.SetAtomMapNum(map_num) end -end \ No newline at end of file +end + +# Pharmacophore and Chemical Features +function _build_feature_factory(filename::String) + @pyconst(pyimport("rdkit.Chem.ChemicalFeatures").BuildFeatureFactory)(filename) +end + +function _build_feature_factory_from_string(fdef_string::String) + @pyconst(pyimport("rdkit.Chem.ChemicalFeatures").BuildFeatureFactoryFromString)( + fdef_string + ) +end + +function _get_features_for_mol(factory::Py, mol::Py; conf_id::Int = -1) + if conf_id == -1 + factory.GetFeaturesForMol(mol) + else + factory.GetFeaturesForMol(mol; confId = conf_id) + end +end + +function _get_feature_families(factory::Py) + factory.GetFeatureFamilies() +end + +function _get_feature_defs(factory::Py) + factory.GetFeatureDefs() +end + +function _get_num_feature_defs(factory::Py) + factory.GetNumFeatureDefs() +end + +function _mol_from_ph4(pharmacophore::Py) + @pyconst(pyimport("rdkit.Chem.Pharm3D.EmbedLib").EmbedMol)(pharmacophore) +end + +function _get_pharmacophore_fingerprint(mol::Py, factory::Py, sig_factory::Py) + @pyconst(pyimport("rdkit.Chem.Pharm2D.Generate").Gen2DFingerprint)(mol, sig_factory) +end + +function _create_sig_factory( + factory::Py; min_point_count::Int = 2, max_point_count::Int = 3 +) + sig_factory = @pyconst(pyimport("rdkit.Chem.Pharm2D.SigFactory").SigFactory)( + factory, min_point_count, max_point_count + ) + # Use non-overlapping distance bins that avoid boundary issues + sig_factory.SetBins([(0, 2), (2, 6), (6, 12)]) + sig_factory.Init() + return sig_factory +end + +function _get_rdconfig_data_dir() + @pyconst(pyimport("rdkit.RDConfig").RDDataDir) +end + +# 3D Pharmacophore functions +function _pharmacophore_from_mol(mol::Py, feature_factory::Py; conf_id::Int = -1) + @pyconst(pyimport("rdkit.Chem.Pharm3D.Pharmacophore").Pharmacophore)() +end + +function _explicit_pharmacophore_from_mol(mol::Py, feature_factory::Py; conf_id::Int = -1) + features = _get_features_for_mol(feature_factory, mol; conf_id = conf_id) + + # Extract feature information + feature_list = Tuple{String, Vector{Float64}}[] + for feature in features + family = pyconvert(String, feature.GetFamily()) + pos_obj = feature.GetPos() + pos = [ + pyconvert(Float64, pos_obj.x), + pyconvert(Float64, pos_obj.y), + pyconvert(Float64, pos_obj.z), + ] + push!(feature_list, (family, pos)) + end + + return feature_list +end diff --git a/src/reaction/reaction.jl b/src/reaction/reaction.jl index 56f398c..d4380d7 100644 --- a/src/reaction/reaction.jl +++ b/src/reaction/reaction.jl @@ -3,118 +3,130 @@ ####################################################### """ - Reaction + Reaction Represents a chemical reaction with comprehensive functionality # Fields -- `_rdkit_rxn::Py`: The underlying RDKit reaction object -- `props::Dict{Symbol, Any}`: Dictionary for storing additional reaction properties -- `validated::Bool`: Whether the reaction has been validated -- `fingerprint::Union{Nothing, BitVector}`: Cached reaction fingerprint + + - `_rdkit_rxn::Py`: The underlying RDKit reaction object + - `props::Dict{Symbol, Any}`: Dictionary for storing additional reaction properties + - `validated::Bool`: Whether the reaction has been validated + - `fingerprint::Union{Nothing, BitVector}`: Cached reaction fingerprint """ @kwdef mutable struct Reaction - _rdkit_rxn::Py - props::Dict{Symbol, Any} = Dict() - validated::Bool = false - fingerprint::Union{Nothing, BitVector} = nothing + _rdkit_rxn::Py + props::Dict{Symbol, Any} = Dict() + validated::Bool = false + fingerprint::Union{Nothing, BitVector} = nothing end function Base.show(io::IO, rxn::Reaction) - smarts = get(rxn.props, :SMARTS, "unknown") - status = rxn.validated ? "validated" : "unvalidated" - print(io, "Reaction($smarts, $status)") + smarts = get(rxn.props, :SMARTS, "unknown") + status = rxn.validated ? "validated" : "unvalidated" + print(io, "Reaction($smarts, $status)") end """ - reaction_from_smarts(smarts::String; validate::Bool=true) -> Reaction + reaction_from_smarts(smarts::String; validate::Bool=true) -> Reaction Create a Reaction object from a SMARTS string. # Arguments -- `smarts::String`: A valid reaction SMARTS string with atom mapping (e.g., `"[C:1](=O)[O:2][C:3]>>[C:1](=O)[O-].[C:3][O+]"`) -- `validate::Bool=true`: Whether to validate and sanitize the reaction after creation + + - `smarts::String`: A valid reaction SMARTS string with atom mapping (e.g., `"[C:1](=O)[O:2][C:3]>>[C:1](=O)[O-].[C:3][O+]"`) + - `validate::Bool=true`: Whether to validate and sanitize the reaction after creation # Returns -- `Reaction`: A validated reaction object ready for use + + - `Reaction`: A validated reaction object ready for use # Examples + ```julia # Basic ester hydrolysis reaction -rxn = reaction_from_smarts("\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]") +rxn = reaction_from_smarts( + "\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]" +) # Create without validation (faster, but potentially unsafe) -rxn = reaction_from_smarts(smarts, validate=false) +rxn = reaction_from_smarts(smarts; validate = false) ``` # Notes -- Atom mapping (numbers after colons) is highly recommended for predictable results -- Invalid SMARTS strings will throw an error -- Validation includes sanitization and structural checks + + - Atom mapping (numbers after colons) is highly recommended for predictable results + - Invalid SMARTS strings will throw an error + - Validation includes sanitization and structural checks """ -function reaction_from_smarts(smarts::String; validate::Bool=true) - rxn = _reaction_from_smarts(smarts) - if pynot(rxn) - error("Invalid reaction SMARTS: $smarts") - end - reaction = Reaction(_rdkit_rxn=rxn, props=Dict(:SMARTS => smarts)) - if validate - validate_reaction!(reaction) - end - return reaction +function reaction_from_smarts(smarts::String; validate::Bool = true) + rxn = _reaction_from_smarts(smarts) + if pynot(rxn) + error("Invalid reaction SMARTS: $smarts") + end + reaction = Reaction(; _rdkit_rxn = rxn, props = Dict(:SMARTS => smarts)) + if validate + validate_reaction!(reaction) + end + return reaction end """ - reaction_from_rxn_file(filename::String; validate::Bool=true) -> Reaction + reaction_from_rxn_file(filename::String; validate::Bool=true) -> Reaction Create a Reaction object from an RXN file. """ -function reaction_from_rxn_file(filename::String; validate::Bool=true) - rxn = _reaction_from_rxn_file(filename) - if pynot(rxn) - error("Invalid RXN file: $filename") - end - reaction = Reaction(_rdkit_rxn=rxn, props=Dict(:source => filename)) - if validate - validate_reaction!(reaction) - end - return reaction +function reaction_from_rxn_file(filename::String; validate::Bool = true) + rxn = _reaction_from_rxn_file(filename) + if pynot(rxn) + error("Invalid RXN file: $filename") + end + reaction = Reaction(; _rdkit_rxn = rxn, props = Dict(:source => filename)) + if validate + validate_reaction!(reaction) + end + return reaction end """ - reaction_from_rxn_block(rxnblock::String; validate::Bool=true) -> Reaction + reaction_from_rxn_block(rxnblock::String; validate::Bool=true) -> Reaction Create a Reaction object from an RXN block string. """ -function reaction_from_rxn_block(rxnblock::String; validate::Bool=true) - rxn = _reaction_from_rxn_block(rxnblock) - if pynot(rxn) - error("Invalid RXN block") - end - reaction = Reaction(_rdkit_rxn=rxn, props=Dict(:source => "rxn_block")) - if validate - validate_reaction!(reaction) - end - return reaction +function reaction_from_rxn_block(rxnblock::String; validate::Bool = true) + rxn = _reaction_from_rxn_block(rxnblock) + if pynot(rxn) + error("Invalid RXN block") + end + reaction = Reaction(; _rdkit_rxn = rxn, props = Dict(:source => "rxn_block")) + if validate + validate_reaction!(reaction) + end + return reaction end """ - run_reaction(rxn::Reaction, reactants::Vector{Molecule}; max_products::Int=1000) -> Vector{Vector{Molecule}} + run_reaction(rxn::Reaction, reactants::Vector{Molecule}; max_products::Int=1000) -> Vector{Vector{Molecule}} Apply a reaction to a set of reactants and generate products. # Arguments -- `rxn::Reaction`: The reaction to apply -- `reactants::Vector{Molecule}`: Vector of reactant molecules -- `max_products::Int=1000`: Maximum number of product sets to generate + + - `rxn::Reaction`: The reaction to apply + - `reactants::Vector{Molecule}`: Vector of reactant molecules + - `max_products::Int=1000`: Maximum number of product sets to generate # Returns -- `Vector{Vector{Molecule}}`: Vector of product sets, where each set represents one possible reaction outcome + + - `Vector{Vector{Molecule}}`: Vector of product sets, where each set represents one possible reaction outcome # Examples + ```julia # Apply ester hydrolysis to ethyl acetate -rxn = reaction_from_smarts("\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]") +rxn = reaction_from_smarts( + "\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]" +) reactant = mol_from_smiles("CC(=O)OCC") products = run_reaction(rxn, [reactant]) @@ -128,37 +140,45 @@ end ``` # Notes -- Returns empty vector if no products can be formed -- Each product set represents one possible stereochemical or regioisomeric outcome -- Unvalidated reactions will show a warning but still execute -- Use `has_reactant_substructure_match()` to pre-check compatibility + + - Returns empty vector if no products can be formed + - Each product set represents one possible stereochemical or regioisomeric outcome + - Unvalidated reactions will show a warning but still execute + - Use `has_reactant_substructure_match()` to pre-check compatibility """ -function run_reaction(rxn::Reaction, reactants::Vector{Molecule}; max_products::Int=1000) - if !rxn.validated - @warn "Running unvalidated reaction. Consider calling validate_reaction! first." - end +function run_reaction(rxn::Reaction, reactants::Vector{Molecule}; max_products::Int = 1000) + if !rxn.validated + @warn "Running unvalidated reaction. Consider calling validate_reaction! first." + end - py_reactants = [r._rdkit_mol for r in reactants] - product_sets = _reaction_run_reactants_inline_properties(rxn._rdkit_rxn, py_reactants, max_products) - return [[Molecule(_rdkit_mol=prod, valid=!pynot(prod), source="reaction") for prod in set] for set in product_sets] + py_reactants = [r._rdkit_mol for r in reactants] + product_sets = _reaction_run_reactants_inline_properties( + rxn._rdkit_rxn, py_reactants, max_products + ) + return [ + [ + Molecule(; _rdkit_mol = prod, valid = !pynot(prod), source = "reaction") for + prod in set + ] for set in product_sets + ] end """ - reaction_to_smarts(rxn::Reaction) -> String + reaction_to_smarts(rxn::Reaction) -> String Export a Reaction object to a SMARTS string. """ function reaction_to_smarts(rxn::Reaction) - return string(_reaction_to_smarts(rxn._rdkit_rxn)) + return string(_reaction_to_smarts(rxn._rdkit_rxn)) end """ - reaction_to_rxn_block(rxn::Reaction) -> String + reaction_to_rxn_block(rxn::Reaction) -> String Export a Reaction object to an RXN block string. """ function reaction_to_rxn_block(rxn::Reaction) - return string(_reaction_to_rxn_block(rxn._rdkit_rxn)) + return string(_reaction_to_rxn_block(rxn._rdkit_rxn)) end ####################################################### @@ -166,35 +186,35 @@ end ####################################################### """ - validate_reaction!(rxn::Reaction) -> Bool + validate_reaction!(rxn::Reaction) -> Bool Validate and sanitize a reaction. Returns true if successful. """ function validate_reaction!(rxn::Reaction) - try - _reaction_validate(rxn._rdkit_rxn) - rxn.validated = true - return true - catch e - rxn.validated = false - error("Reaction validation failed: $e") - end + try + _reaction_validate(rxn._rdkit_rxn) + rxn.validated = true + return true + catch e + rxn.validated = false + error("Reaction validation failed: $e") + end end """ - is_reaction_valid(rxn::Reaction) -> Bool + is_reaction_valid(rxn::Reaction) -> Bool Check if a reaction is valid without modifying it. """ function is_reaction_valid(rxn::Reaction) - try - # Create a copy and try to validate it - test_rxn = _reaction_from_smarts(reaction_to_smarts(rxn)) - _reaction_validate(test_rxn) - return true - catch - return false - end + try + # Create a copy and try to validate it + test_rxn = _reaction_from_smarts(reaction_to_smarts(rxn)) + _reaction_validate(test_rxn) + return true + catch + return false + end end ####################################################### @@ -202,67 +222,73 @@ end ####################################################### """ - get_num_reactant_templates(rxn::Reaction) -> Int + get_num_reactant_templates(rxn::Reaction) -> Int Get the number of reactant templates in the reaction. """ function get_num_reactant_templates(rxn::Reaction) - return pyconvert(Int, _reaction_get_num_reactant_templates(rxn._rdkit_rxn)) + return pyconvert(Int, _reaction_get_num_reactant_templates(rxn._rdkit_rxn)) end """ - get_num_product_templates(rxn::Reaction) -> Int + get_num_product_templates(rxn::Reaction) -> Int Get the number of product templates in the reaction. """ function get_num_product_templates(rxn::Reaction) - return pyconvert(Int, _reaction_get_num_product_templates(rxn._rdkit_rxn)) + return pyconvert(Int, _reaction_get_num_product_templates(rxn._rdkit_rxn)) end """ - get_reactant_template(rxn::Reaction, idx::Int) -> Molecule + get_reactant_template(rxn::Reaction, idx::Int) -> Molecule Get a reactant template by index (0-based). """ function get_reactant_template(rxn::Reaction, idx::Int) - template = _reaction_get_reactant_template(rxn._rdkit_rxn, idx) - return Molecule(_rdkit_mol=template, valid=!pynot(template), source="reaction_template") + template = _reaction_get_reactant_template(rxn._rdkit_rxn, idx) + return Molecule(; + _rdkit_mol = template, valid = !pynot(template), source = "reaction_template" + ) end """ - get_product_template(rxn::Reaction, idx::Int) -> Molecule + get_product_template(rxn::Reaction, idx::Int) -> Molecule Get a product template by index (0-based). """ function get_product_template(rxn::Reaction, idx::Int) - template = _reaction_get_product_template(rxn._rdkit_rxn, idx) - return Molecule(_rdkit_mol=template, valid=!pynot(template), source="reaction_template") + template = _reaction_get_product_template(rxn._rdkit_rxn, idx) + return Molecule(; + _rdkit_mol = template, valid = !pynot(template), source = "reaction_template" + ) end """ - has_reactant_substructure_match(rxn::Reaction, mol::Molecule) -> Bool + has_reactant_substructure_match(rxn::Reaction, mol::Molecule) -> Bool Check if a molecule matches any reactant template in the reaction. """ function has_reactant_substructure_match(rxn::Reaction, mol::Molecule) - return pyconvert(Bool, _reaction_has_reactant_substructure_match(rxn._rdkit_rxn, mol._rdkit_mol)) + return pyconvert( + Bool, _reaction_has_reactant_substructure_match(rxn._rdkit_rxn, mol._rdkit_mol) + ) end """ - get_reacting_atoms(rxn::Reaction) -> Vector{Tuple{Int, Int}} + get_reacting_atoms(rxn::Reaction) -> Vector{Tuple{Int, Int}} Get the indices of atoms that participate in the reaction for each reactant template. Returns a vector of tuples where each tuple contains (template_idx, atom_idx). """ function get_reacting_atoms(rxn::Reaction) - reacting_atoms = _reaction_get_reacting_atoms(rxn._rdkit_rxn) - result = Tuple{Int, Int}[] - for template_atoms in reacting_atoms - for atom_idx in template_atoms - push!(result, (length(result), pyconvert(Int, atom_idx))) - end - end - return result + reacting_atoms = _reaction_get_reacting_atoms(rxn._rdkit_rxn) + result = Tuple{Int, Int}[] + for template_atoms in reacting_atoms + for atom_idx in template_atoms + push!(result, (length(result), pyconvert(Int, atom_idx))) + end + end + return result end ####################################################### @@ -270,21 +296,26 @@ end ####################################################### """ - reaction_fingerprint(rxn::Reaction; fp_size::Int=2048, use_cache::Bool=true) -> BitVector + reaction_fingerprint(rxn::Reaction; fp_size::Int=2048, use_cache::Bool=true) -> BitVector Generate a difference fingerprint for the reaction, capturing the chemical transformation. # Arguments -- `rxn::Reaction`: The reaction to fingerprint -- `fp_size::Int=2048`: Size of the fingerprint bit vector -- `use_cache::Bool=true`: Whether to cache the fingerprint for reuse + + - `rxn::Reaction`: The reaction to fingerprint + - `fp_size::Int=2048`: Size of the fingerprint bit vector + - `use_cache::Bool=true`: Whether to cache the fingerprint for reuse # Returns -- `BitVector`: Fingerprint representing the chemical transformation + + - `BitVector`: Fingerprint representing the chemical transformation # Examples + ```julia -rxn1 = reaction_from_smarts("\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]") +rxn1 = reaction_from_smarts( + "\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]" +) rxn2 = reaction_from_smarts("\\[C:1\\]\\[OH:2\\]>>\\[C:1\\]\\[O-\\]") fp1 = reaction_fingerprint(rxn1) @@ -295,127 +326,136 @@ similarity = sum(fp1 .& fp2) / sum(fp1 .| fp2) ``` # Notes -- Difference fingerprints focus on the atoms and bonds that change during the reaction -- Cached fingerprints are automatically retrieved on subsequent calls -- Use for reaction similarity analysis and database searching -- Different from structural fingerprints which capture overall molecular features -""" -function reaction_fingerprint(rxn::Reaction; fp_size::Int=2048, use_cache::Bool=true) - if use_cache && rxn.fingerprint !== nothing && length(rxn.fingerprint) == fp_size - return rxn.fingerprint - end - fp = _reaction_fingerprint(rxn._rdkit_rxn, fp_size) + - Difference fingerprints focus on the atoms and bonds that change during the reaction + - Cached fingerprints are automatically retrieved on subsequent calls + - Use for reaction similarity analysis and database searching + - Different from structural fingerprints which capture overall molecular features +""" +function reaction_fingerprint(rxn::Reaction; fp_size::Int = 2048, use_cache::Bool = true) + if use_cache && rxn.fingerprint !== nothing && length(rxn.fingerprint) == fp_size + return rxn.fingerprint + end - # Handle different fingerprint types - if pyhasattr(fp, "GetBit") - # ExplicitBitVect - can access bits directly - fp_bits = BitVector([pyconvert(Bool, fp.GetBit(i)) for i in 0:(fp_size-1)]) - else - # UIntSparseIntVect - convert to dense representation - fp_bits = BitVector(zeros(Bool, fp_size)) - nz_elements = fp.GetNonzeroElements() - py_dict = pyconvert(Dict, nz_elements) - for (idx, val) in py_dict - bit_idx = idx + 1 # Convert to 1-based indexing - if bit_idx <= fp_size && val != 0 - fp_bits[bit_idx] = true - end - end - end + fp = _reaction_fingerprint(rxn._rdkit_rxn, fp_size) + + # Handle different fingerprint types + if pyhasattr(fp, "GetBit") + # ExplicitBitVect - can access bits directly + fp_bits = BitVector([pyconvert(Bool, fp.GetBit(i)) for i in 0:(fp_size - 1)]) + else + # UIntSparseIntVect - convert to dense representation + fp_bits = BitVector(zeros(Bool, fp_size)) + nz_elements = fp.GetNonzeroElements() + py_dict = pyconvert(Dict, nz_elements) + for (idx, val) in py_dict + bit_idx = idx + 1 # Convert to 1-based indexing + if bit_idx <= fp_size && val != 0 + fp_bits[bit_idx] = true + end + end + end - if use_cache - rxn.fingerprint = fp_bits - end + if use_cache + rxn.fingerprint = fp_bits + end - return fp_bits + return fp_bits end """ - reaction_structural_fingerprint(rxn::Reaction; fp_size::Int=2048) -> BitVector + reaction_structural_fingerprint(rxn::Reaction; fp_size::Int=2048) -> BitVector Generate a structural fingerprint for the reaction. """ -function reaction_structural_fingerprint(rxn::Reaction; fp_size::Int=2048) - fp = _reaction_structural_fingerprint(rxn._rdkit_rxn, fp_size) - return BitVector([pyconvert(Bool, fp.GetBit(i)) for i in 0:(fp_size-1)]) +function reaction_structural_fingerprint(rxn::Reaction; fp_size::Int = 2048) + fp = _reaction_structural_fingerprint(rxn._rdkit_rxn, fp_size) + return BitVector([pyconvert(Bool, fp.GetBit(i)) for i in 0:(fp_size - 1)]) end """ - reaction_center_fingerprint(rxn::Reaction; fp_size::Int=2048) -> BitVector + reaction_center_fingerprint(rxn::Reaction; fp_size::Int=2048) -> BitVector Generate a reaction center fingerprint focusing on the reaction core. """ -function reaction_center_fingerprint(rxn::Reaction; fp_size::Int=2048) - fp = _compute_reaction_center_fingerprint(rxn._rdkit_rxn, fp_size) - - # Handle different fingerprint types - if pyhasattr(fp, "GetBit") - # ExplicitBitVect - can access bits directly - return BitVector([pyconvert(Bool, fp.GetBit(i)) for i in 0:(fp_size-1)]) - else - # UIntSparseIntVect - convert to dense representation - fp_bits = BitVector(zeros(Bool, fp_size)) - nz_elements = fp.GetNonzeroElements() - py_dict = pyconvert(Dict, nz_elements) - for (idx, val) in py_dict - bit_idx = idx + 1 # Convert to 1-based indexing - if bit_idx <= fp_size && val != 0 - fp_bits[bit_idx] = true - end - end - return fp_bits - end +function reaction_center_fingerprint(rxn::Reaction; fp_size::Int = 2048) + fp = _compute_reaction_center_fingerprint(rxn._rdkit_rxn, fp_size) + + # Handle different fingerprint types + if pyhasattr(fp, "GetBit") + # ExplicitBitVect - can access bits directly + return BitVector([pyconvert(Bool, fp.GetBit(i)) for i in 0:(fp_size - 1)]) + else + # UIntSparseIntVect - convert to dense representation + fp_bits = BitVector(zeros(Bool, fp_size)) + nz_elements = fp.GetNonzeroElements() + py_dict = pyconvert(Dict, nz_elements) + for (idx, val) in py_dict + bit_idx = idx + 1 # Convert to 1-based indexing + if bit_idx <= fp_size && val != 0 + fp_bits[bit_idx] = true + end + end + return fp_bits + end end """ - reaction_similarity(rxn1::Reaction, rxn2::Reaction; method::Symbol=:tanimoto) -> Float64 + reaction_similarity(rxn1::Reaction, rxn2::Reaction; method::Symbol=:tanimoto) -> Float64 Calculate similarity between two reactions using their fingerprints. # Arguments -- `rxn1::Reaction`: First reaction -- `rxn2::Reaction`: Second reaction -- `method::Symbol=:tanimoto`: Similarity method (`:tanimoto` or `:dice`) + + - `rxn1::Reaction`: First reaction + - `rxn2::Reaction`: Second reaction + - `method::Symbol=:tanimoto`: Similarity method (`:tanimoto` or `:dice`) # Returns -- `Float64`: Similarity score between 0.0 (no similarity) and 1.0 (identical) + + - `Float64`: Similarity score between 0.0 (no similarity) and 1.0 (identical) # Examples + ```julia -rxn1 = reaction_from_smarts("\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]") -rxn2 = reaction_from_smarts("\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[OH\\].\\[C:3\\]") +rxn1 = reaction_from_smarts( + "\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[O-\\].\\[C:3\\]\\[O+\\]" +) +rxn2 = reaction_from_smarts( + "\\[C:1\\](=O)\\[O:2\\]\\[C:3\\]>>\\[C:1\\](=O)\\[OH\\].\\[C:3\\]" +) # Tanimoto similarity -tanimoto_sim = reaction_similarity(rxn1, rxn2, method=:tanimoto) +tanimoto_sim = reaction_similarity(rxn1, rxn2; method = :tanimoto) # Dice similarity -dice_sim = reaction_similarity(rxn1, rxn2, method=:dice) +dice_sim = reaction_similarity(rxn1, rxn2; method = :dice) println("Tanimoto: \$tanimoto_sim, Dice: \$dice_sim") ``` # Notes -- Tanimoto similarity: intersection / union (most common) -- Dice similarity: 2 * intersection / (|A| + |B|) (gives higher scores) -- Uses cached fingerprints when available for better performance -- Self-similarity should equal 1.0 for identical reactions -""" -function reaction_similarity(rxn1::Reaction, rxn2::Reaction; method::Symbol=:tanimoto) - fp1 = reaction_fingerprint(rxn1) - fp2 = reaction_fingerprint(rxn2) - - if method == :tanimoto - intersection = sum(fp1 .& fp2) - union = sum(fp1 .| fp2) - return union > 0 ? intersection / union : 0.0 - elseif method == :dice - intersection = sum(fp1 .& fp2) - total = sum(fp1) + sum(fp2) - return total > 0 ? 2 * intersection / total : 0.0 - else - error("Unknown similarity method: $method") - end + + - Tanimoto similarity: intersection / union (most common) + - Dice similarity: 2 * intersection / (|A| + |B|) (gives higher scores) + - Uses cached fingerprints when available for better performance + - Self-similarity should equal 1.0 for identical reactions +""" +function reaction_similarity(rxn1::Reaction, rxn2::Reaction; method::Symbol = :tanimoto) + fp1 = reaction_fingerprint(rxn1) + fp2 = reaction_fingerprint(rxn2) + + if method == :tanimoto + intersection = sum(fp1 .& fp2) + union = sum(fp1 .| fp2) + return union > 0 ? intersection / union : 0.0 + elseif method == :dice + intersection = sum(fp1 .& fp2) + total = sum(fp1) + sum(fp2) + return total > 0 ? 2 * intersection / total : 0.0 + else + error("Unknown similarity method: $method") + end end ####################################################### @@ -423,14 +463,19 @@ end ####################################################### """ - enumerate_library(rxn::Reaction, reactant_lists::Vector{Vector{Molecule}}) -> Vector{Vector{Molecule}} + enumerate_library(rxn::Reaction, reactant_lists::Vector{Vector{Molecule}}) -> Vector{Vector{Molecule}} Enumerate a library of products from lists of reactants. """ function enumerate_library(rxn::Reaction, reactant_lists::Vector{Vector{Molecule}}) - py_reactant_lists = [[mol._rdkit_mol for mol in mols] for mols in reactant_lists] - product_sets = _reaction_enumerate_library_from_reaction(rxn._rdkit_rxn, py_reactant_lists) - return [[Molecule(_rdkit_mol=prod, valid=true, source="library") for prod in set] for set in product_sets] + py_reactant_lists = [[mol._rdkit_mol for mol in mols] for mols in reactant_lists] + product_sets = _reaction_enumerate_library_from_reaction( + rxn._rdkit_rxn, py_reactant_lists + ) + return [ + [Molecule(; _rdkit_mol = prod, valid = true, source = "library") for prod in set] + for set in product_sets + ] end ####################################################### @@ -438,54 +483,54 @@ end ####################################################### """ - reaction_info(rxn::Reaction) -> Dict{Symbol, Any} + reaction_info(rxn::Reaction) -> Dict{Symbol, Any} Get comprehensive information about a reaction. """ function reaction_info(rxn::Reaction) - info = Dict{Symbol, Any}( - :smarts => reaction_to_smarts(rxn), - :validated => rxn.validated, - :num_reactant_templates => get_num_reactant_templates(rxn), - :num_product_templates => get_num_product_templates(rxn), - :has_fingerprint => rxn.fingerprint !== nothing, - :properties => copy(rxn.props) - ) - return info + info = Dict{Symbol, Any}( + :smarts => reaction_to_smarts(rxn), + :validated => rxn.validated, + :num_reactant_templates => get_num_reactant_templates(rxn), + :num_product_templates => get_num_product_templates(rxn), + :has_fingerprint => rxn.fingerprint !== nothing, + :properties => copy(rxn.props), + ) + return info end """ - is_balanced(rxn::Reaction) -> Bool + is_balanced(rxn::Reaction) -> Bool Check if a reaction is atom-balanced (same atoms on both sides). Note: This is a simplified implementation. """ function is_balanced(rxn::Reaction) - try - # Get reactant and product templates - num_reactants = get_num_reactant_templates(rxn) - num_products = get_num_product_templates(rxn) - - # Simple check: same number of heavy atoms on both sides - reactant_atoms = 0 - product_atoms = 0 - - # Count reactant atoms - for i in 0:(num_reactants-1) - template = get_reactant_template(rxn, i) - reactant_atoms += heavy_atom_count(template) - end - - # Count product atoms - for i in 0:(num_products-1) - template = get_product_template(rxn, i) - product_atoms += heavy_atom_count(template) - end - - return reactant_atoms == product_atoms - catch - return false - end + try + # Get reactant and product templates + num_reactants = get_num_reactant_templates(rxn) + num_products = get_num_product_templates(rxn) + + # Simple check: same number of heavy atoms on both sides + reactant_atoms = 0 + product_atoms = 0 + + # Count reactant atoms + for i in 0:(num_reactants - 1) + template = get_reactant_template(rxn, i) + reactant_atoms += heavy_atom_count(template) + end + + # Count product atoms + for i in 0:(num_products - 1) + template = get_product_template(rxn, i) + product_atoms += heavy_atom_count(template) + end + + return reactant_atoms == product_atoms + catch + return false + end end ####################################################### @@ -493,87 +538,87 @@ end ####################################################### """ - get_atom_mapping_numbers(mol::Molecule) -> Vector{Int} + get_atom_mapping_numbers(mol::Molecule) -> Vector{Int} Get atom mapping numbers for all atoms in a molecule. """ function get_atom_mapping_numbers(mol::Molecule) - py_result = _get_atom_mapping_numbers(mol._rdkit_mol) - return [pyconvert(Int, x) for x in py_result] + py_result = _get_atom_mapping_numbers(mol._rdkit_mol) + return [pyconvert(Int, x) for x in py_result] end """ - set_atom_mapping_numbers!(mol::Molecule, map_nums::Vector{Int}) + set_atom_mapping_numbers!(mol::Molecule, map_nums::Vector{Int}) Set atom mapping numbers for all atoms in a molecule. """ function set_atom_mapping_numbers!(mol::Molecule, map_nums::Vector{Int}) - _set_atom_mapping_numbers(mol._rdkit_mol, map_nums) + _set_atom_mapping_numbers(mol._rdkit_mol, map_nums) end """ - remove_unmapped_reactant_templates!(rxn::Reaction; mode::Int=1) + remove_unmapped_reactant_templates!(rxn::Reaction; mode::Int=1) Remove reactant templates that don't have mapped atoms. """ -function remove_unmapped_reactant_templates!(rxn::Reaction; mode::Int=1) - _reaction_remove_unmapped_reactant_templates(rxn._rdkit_rxn, mode) - rxn.validated = false # Need to revalidate after modification +function remove_unmapped_reactant_templates!(rxn::Reaction; mode::Int = 1) + _reaction_remove_unmapped_reactant_templates(rxn._rdkit_rxn, mode) + rxn.validated = false # Need to revalidate after modification end """ - remove_unmapped_product_templates!(rxn::Reaction; mode::Int=1) + remove_unmapped_product_templates!(rxn::Reaction; mode::Int=1) Remove product templates that don't have mapped atoms. """ -function remove_unmapped_product_templates!(rxn::Reaction; mode::Int=1) - _reaction_remove_unmapped_product_templates(rxn._rdkit_rxn, mode) - rxn.validated = false # Need to revalidate after modification +function remove_unmapped_product_templates!(rxn::Reaction; mode::Int = 1) + _reaction_remove_unmapped_product_templates(rxn._rdkit_rxn, mode) + rxn.validated = false # Need to revalidate after modification end """ - preprocess_reaction!(rxn::Reaction) + preprocess_reaction!(rxn::Reaction) Preprocess a reaction to standardize it. """ function preprocess_reaction!(rxn::Reaction) - _reaction_preprocess(rxn._rdkit_rxn) - rxn.validated = false # Need to revalidate after modification + _reaction_preprocess(rxn._rdkit_rxn) + rxn.validated = false # Need to revalidate after modification end """ - compute_atom_mapping!(rxn::Reaction) + compute_atom_mapping!(rxn::Reaction) Compute and apply atom mapping to the reaction. """ function compute_atom_mapping!(rxn::Reaction) - _reaction_compute_atom_mapping(rxn._rdkit_rxn) - rxn.validated = false # Need to revalidate after modification + _reaction_compute_atom_mapping(rxn._rdkit_rxn) + rxn.validated = false # Need to revalidate after modification end """ - is_template_molecule_agent(mol::Molecule) -> Bool + is_template_molecule_agent(mol::Molecule) -> Bool Check if a molecule is an agent in a reaction template. """ function is_template_molecule_agent(mol::Molecule) - return pyconvert(Bool, _reaction_is_template_molecule_agent(mol._rdkit_mol)) + return pyconvert(Bool, _reaction_is_template_molecule_agent(mol._rdkit_mol)) end """ - sanitize_reaction!(rxn::Reaction; ops::Int=15) -> Bool + sanitize_reaction!(rxn::Reaction; ops::Int=15) -> Bool Sanitize a reaction with specific operations. Returns true if successful. """ -function sanitize_reaction!(rxn::Reaction; ops::Int=15) - try - _reaction_sanitize_reaction(rxn._rdkit_rxn, ops) - rxn.validated = true - return true - catch e - rxn.validated = false - error("Reaction sanitization failed: $e") - end +function sanitize_reaction!(rxn::Reaction; ops::Int = 15) + try + _reaction_sanitize_reaction(rxn._rdkit_rxn, ops) + rxn.validated = true + return true + catch e + rxn.validated = false + error("Reaction sanitization failed: $e") + end end ####################################################### @@ -581,74 +626,78 @@ end ####################################################### """ - reaction_complexity(rxn::Reaction) -> Float64 + reaction_complexity(rxn::Reaction) -> Float64 Calculate a complexity score for the reaction based on various factors. """ function reaction_complexity(rxn::Reaction) - num_reactants = get_num_reactant_templates(rxn) - num_products = get_num_product_templates(rxn) + num_reactants = get_num_reactant_templates(rxn) + num_products = get_num_product_templates(rxn) - # Base complexity from number of components - complexity = (num_reactants + num_products) / 2.0 + # Base complexity from number of components + complexity = (num_reactants + num_products) / 2.0 - # Add complexity from molecular size - total_atoms = 0 - for i in 0:(num_reactants-1) - template = get_reactant_template(rxn, i) - total_atoms += heavy_atom_count(template) - end - for i in 0:(num_products-1) - template = get_product_template(rxn, i) - total_atoms += heavy_atom_count(template) - end + # Add complexity from molecular size + total_atoms = 0 + for i in 0:(num_reactants - 1) + template = get_reactant_template(rxn, i) + total_atoms += heavy_atom_count(template) + end + for i in 0:(num_products - 1) + template = get_product_template(rxn, i) + total_atoms += heavy_atom_count(template) + end - complexity += total_atoms / 20.0 # Normalize by typical molecule size + complexity += total_atoms / 20.0 # Normalize by typical molecule size - return complexity + return complexity end """ - reaction_type_classification(rxn::Reaction) -> Symbol + reaction_type_classification(rxn::Reaction) -> Symbol Classify the reaction type based on structural changes. """ function reaction_type_classification(rxn::Reaction) - num_reactants = get_num_reactant_templates(rxn) - num_products = get_num_product_templates(rxn) - - if num_reactants == 1 && num_products == 2 - return :decomposition - elseif num_reactants == 2 && num_products == 1 - return :combination - elseif num_reactants == 2 && num_products == 2 - return :substitution - elseif num_reactants == 1 && num_products == 1 - return :rearrangement - else - return :complex - end + num_reactants = get_num_reactant_templates(rxn) + num_products = get_num_product_templates(rxn) + + if num_reactants == 1 && num_products == 2 + return :decomposition + elseif num_reactants == 2 && num_products == 1 + return :combination + elseif num_reactants == 2 && num_products == 2 + return :substitution + elseif num_reactants == 1 && num_products == 1 + return :rearrangement + else + return :complex + end end """ - find_similar_reactions(target_rxn::Reaction, reaction_db::Vector{Reaction}; - threshold::Float64=0.7, method::Symbol=:tanimoto) -> Vector{Tuple{Reaction, Float64}} + find_similar_reactions(target_rxn::Reaction, reaction_db::Vector{Reaction}; + threshold::Float64=0.7, method::Symbol=:tanimoto) -> Vector{Tuple{Reaction, Float64}} Find reactions similar to the target reaction from a database. """ -function find_similar_reactions(target_rxn::Reaction, reaction_db::Vector{Reaction}; - threshold::Float64=0.7, method::Symbol=:tanimoto) - similar_reactions = Tuple{Reaction, Float64}[] - - for rxn in reaction_db - similarity = reaction_similarity(target_rxn, rxn, method=method) - if similarity >= threshold - push!(similar_reactions, (rxn, similarity)) - end - end +function find_similar_reactions( + target_rxn::Reaction, + reaction_db::Vector{Reaction}; + threshold::Float64 = 0.7, + method::Symbol = :tanimoto, +) + similar_reactions = Tuple{Reaction, Float64}[] + + for rxn in reaction_db + similarity = reaction_similarity(target_rxn, rxn; method = method) + if similarity >= threshold + push!(similar_reactions, (rxn, similarity)) + end + end - # Sort by similarity (descending) - sort!(similar_reactions, by=x->x[2], rev=true) + # Sort by similarity (descending) + sort!(similar_reactions; by = x->x[2], rev = true) - return similar_reactions + return similar_reactions end diff --git a/src/standardization/standardization.jl b/src/standardization/standardization.jl index 7fcd38e..db10c8a 100644 --- a/src/standardization/standardization.jl +++ b/src/standardization/standardization.jl @@ -386,3 +386,173 @@ function normalize_molecule(mol::Molecule) return mol end end + +""" + cleanup_molecule(mol::Molecule) -> Molecule + +Perform general cleanup operations on a molecule using RDKit's cleanup function. + +# Arguments + + - `mol::Molecule`: Input molecule + +# Returns + + - `Molecule`: Cleaned up molecule + +# Example + +```julia +mol = mol_from_smiles("CCO") +clean_mol = cleanup_molecule(mol) +``` +""" +function cleanup_molecule(mol::Molecule) + if !mol.valid + return mol + end + + try + cleaned_mol = _cleanup(mol._rdkit_mol) + + if pynot(cleaned_mol) + return Molecule(; + _rdkit_mol = cleaned_mol, valid = false, source = "cleanup_$(mol.source)" + ) + end + + return Molecule(; + _rdkit_mol = cleaned_mol, + valid = true, + source = "cleanup_$(mol.source)", + props = copy(mol.props), + ) + catch e + @warn "Failed to cleanup molecule: $e" + return mol + end +end + +""" + standardize_smiles(smiles::String) -> Union{String, Missing} + +Standardize a SMILES string using RDKit's standardization procedures. + +# Arguments + + - `smiles::String`: Input SMILES string + +# Returns + + - `Union{String, Missing}`: Standardized SMILES string or missing if standardization fails + +# Example + +```julia +std_smiles = standardize_smiles("CC(O)=CC(=O)C.Na") +``` +""" +function standardize_smiles(smiles::String) + try + return pyconvert(String, _standardize_smiles(smiles)) + catch e + @warn "Failed to standardize SMILES: $e" + return missing + end +end + +""" + charge_parent(mol::Molecule) -> Molecule + +Get the charge parent (neutral form) of a molecule. + +# Arguments + + - `mol::Molecule`: Input molecule + +# Returns + + - `Molecule`: Charge parent molecule + +# Example + +```julia +mol = mol_from_smiles("[NH3+]CCO") # Protonated ethanolamine +parent = charge_parent(mol) # Returns neutral form +``` +""" +function charge_parent(mol::Molecule) + if !mol.valid + return mol + end + + try + parent_mol = _charge_parent(mol._rdkit_mol) + + if pynot(parent_mol) + return Molecule(; + _rdkit_mol = parent_mol, + valid = false, + source = "charge_parent_$(mol.source)", + ) + end + + return Molecule(; + _rdkit_mol = parent_mol, + valid = true, + source = "charge_parent_$(mol.source)", + props = copy(mol.props), + ) + catch e + @warn "Failed to get charge parent: $e" + return mol + end +end + +""" + fragment_parent(mol::Molecule) -> Molecule + +Get the fragment parent (largest fragment) of a molecule. + +# Arguments + + - `mol::Molecule`: Input molecule + +# Returns + + - `Molecule`: Fragment parent molecule + +# Example + +```julia +mol = mol_from_smiles("CCO.Cl") # Ethanol with chloride +parent = fragment_parent(mol) # Returns just ethanol +``` +""" +function fragment_parent(mol::Molecule) + if !mol.valid + return mol + end + + try + parent_mol = _fragment_parent(mol._rdkit_mol) + + if pynot(parent_mol) + return Molecule(; + _rdkit_mol = parent_mol, + valid = false, + source = "fragment_parent_$(mol.source)", + ) + end + + return Molecule(; + _rdkit_mol = parent_mol, + valid = true, + source = "fragment_parent_$(mol.source)", + props = copy(mol.props), + ) + catch e + @warn "Failed to get fragment parent: $e" + return mol + end +end diff --git a/src/substructure/substructure.jl b/src/substructure/substructure.jl index 1dae708..a5cbaa9 100644 --- a/src/substructure/substructure.jl +++ b/src/substructure/substructure.jl @@ -626,52 +626,6 @@ function get_functional_groups(mol::Molecule) return results end -""" - get_ring_info(mol::Molecule) -> Union{Dict, Missing} - -Get detailed information about rings in a molecule. - -# Arguments - - - `mol::Molecule`: The molecule to analyze - -# Returns - - - `Dict`: Dictionary containing ring information with keys: - - + `:num_rings`: Total number of rings - + `:atom_rings`: Vector of vectors containing 1-based atom indices for each ring - + `:bond_rings`: Vector of vectors containing 1-based bond indices for each ring - - - `missing`: If molecule is invalid - -# Examples - -```julia -mol = mol_from_smiles("c1ccccc1") # Benzene -ring_info = get_ring_info(mol) -# Returns Dict(:num_rings => 1, :atom_rings => [[1,2,3,4,5,6]], :bond_rings => [[1,2,3,4,5,6]]) -``` - -# Notes - - - Provides comprehensive ring analysis - - Useful for understanding molecular topology -""" -function get_ring_info(mol::Molecule) - !mol.valid && return missing - - ring_info = mol._rdkit_mol.GetRingInfo() - - return Dict( - :num_rings => pyconvert(Int, ring_info.NumRings()), - :atom_rings => - [pyconvert(Vector{Int}, ring) .+ 1 for ring in ring_info.AtomRings()], - :bond_rings => - [pyconvert(Vector{Int}, ring) .+ 1 for ring in ring_info.BondRings()], - ) -end - """ is_ring_aromatic(mol::Molecule, ring_atoms::Vector{Int}) -> Union{Bool, Missing} diff --git a/test/runtests.jl b/test/runtests.jl index 8718f88..2f8cfab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,9 +4,11 @@ using Graphs @testset "MoleculeFlow.jl Tests" begin include("test_molecules.jl") - include("test_operations.jl") # New operations tests + include("test_operations.jl") include("test_descriptors.jl") - include("test_advanced_descriptors.jl") # New advanced descriptors + include("test_advanced_descriptors.jl") + include("test_vsa_descriptors.jl") + include("test_new_wrappers_integration.jl") include("test_fingerprints.jl") include("test_similarity.jl") include("test_atoms.jl") diff --git a/test/test_advanced_descriptors.jl b/test/test_advanced_descriptors.jl index c780975..963cbdd 100644 --- a/test/test_advanced_descriptors.jl +++ b/test/test_advanced_descriptors.jl @@ -206,6 +206,26 @@ end @test radius_of_gyration(invalid_mol) === missing end + @testset "Principal Moments of Inertia" begin + # Test PMI descriptors (may fail without 3D coordinates) + ethanol = mol_from_smiles("CCO") + + pmi1_result = pmi1(ethanol) + pmi2_result = pmi2(ethanol) + pmi3_result = pmi3(ethanol) + + # Should either be Float64 or missing (if no 3D coords) + @test pmi1_result === missing || isa(pmi1_result, Float64) + @test pmi2_result === missing || isa(pmi2_result, Float64) + @test pmi3_result === missing || isa(pmi3_result, Float64) + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + @test pmi1(invalid_mol) === missing + @test pmi2(invalid_mol) === missing + @test pmi3(invalid_mol) === missing + end + @testset "3D Descriptors with Generated Conformers" begin # Skip this test if conformer generation is not available try @@ -229,6 +249,90 @@ end @warn "3D conformer generation not available for testing: $e" end end + + @testset "Advanced 3D Descriptors" begin + # Test spherocity index + mol = mol_from_smiles("CCO") + spherocity_result = spherocity_index(mol) + @test spherocity_result === missing || isa(spherocity_result, Float64) + + # Test GETAWAY descriptors + getaway_result = getaway_descriptors(mol) + @test getaway_result === missing || isa(getaway_result, Vector{Float64}) + + # Test WHIM descriptors + whim_result = whim_descriptors(mol) + @test whim_result === missing || isa(whim_result, Vector{Float64}) + + # Test RDF descriptors + rdf_result = rdf_descriptors(mol) + @test rdf_result === missing || isa(rdf_result, Vector{Float64}) + + # Test MORSE descriptors + morse_result = morse_descriptors(mol) + @test morse_result === missing || isa(morse_result, Vector{Float64}) + + # Test with missing molecule + @test spherocity_index(missing) === missing + @test getaway_descriptors(missing) === missing + @test whim_descriptors(missing) === missing + @test rdf_descriptors(missing) === missing + @test morse_descriptors(missing) === missing + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + @test spherocity_index(invalid_mol) === missing + @test getaway_descriptors(invalid_mol) === missing + @test whim_descriptors(invalid_mol) === missing + @test rdf_descriptors(invalid_mol) === missing + @test morse_descriptors(invalid_mol) === missing + end + + @testset "Advanced 3D Descriptors with Generated Conformers" begin + # Skip this test if conformer generation is not available + try + mol = mol_from_smiles("CCO") + conformers = generate_3d_conformers(mol, 1) + + if !isempty(conformers) + mol_3d = conformers[1].molecule + + # Test spherocity index with 3D coordinates + spherocity = spherocity_index(mol_3d) + @test isa(spherocity, Float64) + @test 0.0 <= spherocity <= 1.0 + + # Test GETAWAY descriptors with 3D coordinates + getaway = getaway_descriptors(mol_3d) + @test isa(getaway, Vector{Float64}) + @test length(getaway) > 0 + + # Test WHIM descriptors with 3D coordinates + whim = whim_descriptors(mol_3d) + @test isa(whim, Vector{Float64}) + @test length(whim) > 0 + + # Test RDF descriptors with 3D coordinates + rdf = rdf_descriptors(mol_3d) + @test isa(rdf, Vector{Float64}) + @test length(rdf) > 0 + + # Test MORSE descriptors with 3D coordinates + morse = morse_descriptors(mol_3d) + @test isa(morse, Vector{Float64}) + @test length(morse) > 0 + + # Test custom parameters + getaway_custom = getaway_descriptors(mol_3d; precision = 3) + @test isa(getaway_custom, Vector{Float64}) + + whim_custom = whim_descriptors(mol_3d; thresh = 0.01) + @test isa(whim_custom, Vector{Float64}) + end + catch e + @warn "3D conformer generation not available for advanced descriptor testing: $e" + end + end end @testset "Vectorized Operations for New Descriptors" begin @@ -259,6 +363,27 @@ end amide_counts = num_amide_bonds.(mols) @test length(amide_counts) == 3 @test all(isa(v, Int) for v in amide_counts) + + # Test vectorized 3D descriptors (may be missing without 3D coordinates) + spherocity_values = spherocity_index.(mols) + @test length(spherocity_values) == 3 + @test all(v === missing || isa(v, Float64) for v in spherocity_values) + + getaway_values = getaway_descriptors.(mols) + @test length(getaway_values) == 3 + @test all(v === missing || isa(v, Vector{Float64}) for v in getaway_values) + + whim_values = whim_descriptors.(mols) + @test length(whim_values) == 3 + @test all(v === missing || isa(v, Vector{Float64}) for v in whim_values) + + rdf_values = rdf_descriptors.(mols) + @test length(rdf_values) == 3 + @test all(v === missing || isa(v, Vector{Float64}) for v in rdf_values) + + morse_values = morse_descriptors.(mols) + @test length(morse_values) == 3 + @test all(v === missing || isa(v, Vector{Float64}) for v in morse_values) end @testset "Error Handling and Edge Cases" begin diff --git a/test/test_new_wrappers_integration.jl b/test/test_new_wrappers_integration.jl new file mode 100644 index 0000000..eb9c2b7 --- /dev/null +++ b/test/test_new_wrappers_integration.jl @@ -0,0 +1,318 @@ +using Test +using MoleculeFlow + +@testset "New Wrappers Integration Tests" begin + # Test molecules covering different structural features + test_molecules = [ + ("ethanol", "CCO"), + ("benzene", "c1ccccc1"), + ("cyclohexane", "C1CCCCC1"), + ("pyridine", "c1cccnc1"), + ("aspirin", "CC(=O)OC1=CC=CC=C1C(=O)O"), + ("chiral_molecule", "C[C@H](O)C"), + ("tetrahydrofuran", "C1CCOC1"), + ("acetamide", "CC(=O)N"), + ] + + @testset "Comprehensive Descriptor Coverage" begin + for (name, smiles) in test_molecules + mol = mol_from_smiles(smiles) + @test mol.valid + + # Test all new VSA descriptors + vsa_descriptors = [ + slogp_vsa2, + slogp_vsa3, + slogp_vsa4, + slogp_vsa5, + slogp_vsa6, + slogp_vsa7, + slogp_vsa8, + slogp_vsa9, + slogp_vsa10, + slogp_vsa11, + slogp_vsa12, + smr_vsa1, + smr_vsa2, + smr_vsa3, + smr_vsa4, + smr_vsa5, + smr_vsa6, + smr_vsa7, + smr_vsa8, + smr_vsa9, + smr_vsa10, + peoe_vsa1, + peoe_vsa2, + peoe_vsa3, + peoe_vsa4, + peoe_vsa5, + peoe_vsa6, + peoe_vsa7, + peoe_vsa8, + peoe_vsa9, + peoe_vsa10, + peoe_vsa11, + peoe_vsa12, + peoe_vsa13, + peoe_vsa14, + ] + + for desc_func in vsa_descriptors + result = desc_func(mol) + @test isa(result, Float64) || result === missing + if isa(result, Float64) + @test result >= 0.0 # VSA descriptors should be non-negative + end + end + + # Test all new BCUT descriptors + bcut_descriptors = [ + bcut2d_mwlow, + bcut2d_mwhi, + bcut2d_chglow, + bcut2d_chghi, + bcut2d_logplow, + bcut2d_logphi, + bcut2d_mrlow, + bcut2d_mrhi, + ] + + for desc_func in bcut_descriptors + result = desc_func(mol) + @test isa(result, Float64) || result === missing + end + + # Test 3D descriptors (may be missing without coordinates) + threeD_descriptors = [pmi1, pmi2, pmi3] + for desc_func in threeD_descriptors + result = desc_func(mol) + @test isa(result, Float64) || result === missing + end + + # Test additional structure count descriptors + structure_descriptors = [ + num_aliphatic_heterocycles, + num_saturated_heterocycles, + num_saturated_carbocycles, + num_unspecified_atom_stereo_centers, + num_spiro_atoms, + num_bridgehead_atoms, + num_aliphatic_rings, + num_heterocycles, + ] + + for desc_func in structure_descriptors + result = desc_func(mol) + @test isa(result, Int) || result === missing + if isa(result, Int) + @test result >= 0 # Counts should be non-negative + end + end + + # Test additional descriptors + @test isa(hall_kier_alpha(mol), Float64) || hall_kier_alpha(mol) === missing + @test isa(max_absolute_e_state_index(mol), Float64) || + max_absolute_e_state_index(mol) === missing + @test isa(min_absolute_e_state_index(mol), Float64) || + min_absolute_e_state_index(mol) === missing + end + end + + @testset "Molecular Operations Integration" begin + mol = mol_from_smiles("c1ccccc1CCO") # Benzyl alcohol + + # Test ring information + ring_info = get_ring_info(mol) + @test ring_info !== missing + + # Test canonical ranking + ranks = canonical_atom_ranks(mol) + @test isa(ranks, Vector{Int}) || ranks === missing + + # Test atom environment + if isa(ranks, Vector{Int}) && length(ranks) > 3 + env = find_atom_environment(mol, 1, 0) + @test isa(env, Vector{Int}) || env === missing + end + + # Test fragment operations + if heavy_atom_count(mol) >= 3 + frag_smarts = mol_fragment_to_smarts(mol, [0, 1, 2]) + @test isa(frag_smarts, String) || frag_smarts === missing + + frag_smiles = mol_fragment_to_smiles(mol, [0, 1]) + @test isa(frag_smiles, String) || frag_smiles === missing + end + + # Test molecular editing operations + mol_copy = mol_from_smiles("CCO") + + # Test in-place operations + sanitize_mol!(mol_copy) + @test mol_copy.valid + + compute_2d_coords!(mol_copy) + @test mol_copy.valid + + fast_find_rings!(mol_copy) + @test mol_copy.valid + + remove_stereochemistry!(mol_copy) + @test mol_copy.valid + + # Test atom renumbering + if heavy_atom_count(mol_copy) == 3 + new_mol = renumber_atoms(mol_copy, [2, 1, 0]) + @test new_mol.valid + end + end + + @testset "Error Handling Consistency" begin + invalid_mol = mol_from_smiles("invalid_smiles") + @test !invalid_mol.valid + + # Test that all new descriptors handle invalid molecules consistently + @test slogp_vsa2(invalid_mol) === missing + @test smr_vsa1(invalid_mol) === missing + @test peoe_vsa1(invalid_mol) === missing + @test bcut2d_mwlow(invalid_mol) === missing + @test pmi1(invalid_mol) === missing + @test num_aliphatic_heterocycles(invalid_mol) === missing + @test hall_kier_alpha(invalid_mol) === missing + + # Test that all new operations handle invalid molecules consistently + @test get_ring_info(invalid_mol) === missing + @test canonical_atom_ranks(invalid_mol) == Int[] + @test find_atom_environment(invalid_mol, 1, 0) === missing + @test mol_fragment_to_smarts(invalid_mol, [0, 1]) === missing + @test mol_fragment_to_smiles(invalid_mol, [0, 1]) === missing + + # In-place operations should return invalid molecule unchanged + @test !sanitize_mol!(invalid_mol).valid + @test !compute_2d_coords!(invalid_mol).valid + @test fast_find_rings!(invalid_mol) == false + @test !remove_stereochemistry!(invalid_mol).valid + @test !renumber_atoms(invalid_mol, [0, 1, 2]).valid + end + + @testset "Vectorized Operations" begin + mols = [mol_from_smiles(smiles) for (_, smiles) in test_molecules] + + # Test vectorized VSA descriptors + slogp2_values = slogp_vsa2.(mols) + @test length(slogp2_values) == length(mols) + @test all(v -> isa(v, Float64) || v === missing, slogp2_values) + + peoe1_values = peoe_vsa1.(mols) + @test length(peoe1_values) == length(mols) + @test all(v -> isa(v, Float64) || v === missing, peoe1_values) + + # Test vectorized BCUT descriptors + bcut_values = bcut2d_mwlow.(mols) + @test length(bcut_values) == length(mols) + @test all(v -> isa(v, Float64) || v === missing, bcut_values) + + # Test vectorized structure counts + aliphatic_counts = num_aliphatic_rings.(mols) + @test length(aliphatic_counts) == length(mols) + @test all(v -> isa(v, Int) || v === missing, aliphatic_counts) + + # Test vectorized 3D descriptors + pmi1_values = pmi1.(mols) + @test length(pmi1_values) == length(mols) + @test all(v -> isa(v, Float64) || v === missing, pmi1_values) + end + + @testset "Descriptor Value Sanity Checks" begin + # Test that descriptors give sensible values for known molecules + ethanol = mol_from_smiles("CCO") + benzene = mol_from_smiles("c1ccccc1") + cyclohexane = mol_from_smiles("C1CCCCC1") + + # Test ring counting consistency + @test num_aliphatic_rings(ethanol) == 0 + @test num_aliphatic_rings(cyclohexane) == 1 + @test num_aliphatic_rings(benzene) == 0 # Aromatic, not aliphatic + + @test num_aromatic_rings(benzene) == 1 + @test num_aromatic_rings(cyclohexane) == 0 + @test num_aromatic_rings(ethanol) == 0 + + # Test heterocycle counting + pyridine = mol_from_smiles("c1cccnc1") + @test num_heterocycles(pyridine) == 1 + @test num_heterocycles(benzene) == 0 + @test num_aromatic_heterocycles(pyridine) == 1 + + # Test structure features + @test num_spiro_atoms(ethanol) == 0 + @test num_bridgehead_atoms(ethanol) == 0 + + # Test that BCUT low/high relationships make sense (when both are valid) + mwlow = bcut2d_mwlow(benzene) + mwhi = bcut2d_mwhi(benzene) + if isa(mwlow, Float64) && isa(mwhi, Float64) + @test mwlow <= mwhi + end + end + + @testset "Performance and Memory" begin + # Test that descriptor calculations don't cause memory issues + mol = mol_from_smiles("CC(=O)OC1=CC=CC=C1C(=O)O") # Aspirin + + # Calculate many descriptors in sequence + descriptors = [ + slogp_vsa2, + slogp_vsa5, + slogp_vsa10, + smr_vsa3, + smr_vsa7, + peoe_vsa2, + peoe_vsa8, + peoe_vsa14, + bcut2d_mwlow, + bcut2d_logphi, + num_aliphatic_rings, + num_heterocycles, + hall_kier_alpha, + ] + + for desc in descriptors + result = desc(mol) + @test isa(result, Number) || result === missing + end + + # Test operations don't cause issues + get_ring_info(mol) + canonical_atom_ranks(mol) + sanitize_mol!(mol) + compute_2d_coords!(mol) + + @test mol.valid # Should still be valid after all operations + end +end + +@testset "Backward Compatibility" begin + # Ensure new wrappers don't break existing functionality + mol = mol_from_smiles("CCO") + + # Test that original descriptors still work + @test isa(molecular_weight(mol), Float64) + @test isa(heavy_atom_count(mol), Int) + @test isa(num_hbd(mol), Int) + @test isa(num_hba(mol), Int) + @test isa(logp(mol), Float64) + @test isa(tpsa(mol), Float64) + + # Test that original operations still work + @test isa(add_hs(mol), Molecule) + @test isa(remove_hs(mol), Molecule) + @test isa(mol_to_smiles(mol), String) + @test isa(mol_to_inchi_key(mol), String) + + # Test fingerprints still work + @test isa(morgan_fingerprint(mol), Vector{Bool}) + @test isa(rdk_fingerprint(mol), Vector{Bool}) + @test isa(maccs_fingerprint(mol), Vector{Bool}) +end diff --git a/test/test_operations.jl b/test/test_operations.jl index 5642ac5..1b241da 100644 --- a/test/test_operations.jl +++ b/test/test_operations.jl @@ -254,16 +254,145 @@ end # Test with first two atoms (indices are 0-based in RDKit) smarts = mol_fragment_to_smarts(mol, [0, 1]) - @test isa(smarts, String) + @test isa(smarts, String) || smarts === missing # Test with invalid molecule invalid_mol = mol_from_smiles("invalid_smiles") result = mol_fragment_to_smarts(invalid_mol, [0, 1]) - @test result == "" + @test result === missing # Test with empty indices result_empty = mol_fragment_to_smarts(mol, Int[]) - @test isa(result_empty, String) + @test isa(result_empty, String) || result_empty === missing + end + + @testset "mol_fragment_to_smiles function" begin + mol = mol_from_smiles("CCO") + + # Test with first two atoms (indices are 0-based in RDKit) + smiles = mol_fragment_to_smiles(mol, [0, 1]) + @test isa(smiles, String) || smiles === missing + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + result = mol_fragment_to_smiles(invalid_mol, [0, 1]) + @test result === missing + + # Test with single atom + result_single = mol_fragment_to_smiles(mol, [0]) + @test isa(result_single, String) || result_single === missing + end +end + +@testset "Advanced Ring Analysis" begin + @testset "get_ring_info function" begin + # Test with benzene (aromatic ring) + benzene = mol_from_smiles("c1ccccc1") + ring_info = get_ring_info(benzene) + @test ring_info !== missing + + # Test with cyclohexane (aliphatic ring) + cyclohexane = mol_from_smiles("C1CCCCC1") + ring_info_aliphatic = get_ring_info(cyclohexane) + @test ring_info_aliphatic !== missing + + # Test with acyclic molecule + ethanol = mol_from_smiles("CCO") + ring_info_acyclic = get_ring_info(ethanol) + @test ring_info_acyclic !== missing + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + result = get_ring_info(invalid_mol) + @test result === missing + end + + @testset "find_atom_environment function" begin + mol = mol_from_smiles("CCCCC") # Linear chain + + # Test finding environment around central atom + env = find_atom_environment(mol, 1, 2) # Radius 1 around atom 2 + @test isa(env, Vector{Int}) || env === missing + + # Test with different radius + env_large = find_atom_environment(mol, 2, 2) # Radius 2 around atom 2 + @test isa(env_large, Vector{Int}) || env_large === missing + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + result = find_atom_environment(invalid_mol, 1, 0) + @test result === missing + end +end + +@testset "Molecular Editing Operations" begin + @testset "renumber_atoms function" begin + mol = mol_from_smiles("CCO") + + # Test reverse atom ordering + new_mol = renumber_atoms(mol, [2, 1, 0]) + @test new_mol.valid == true + @test new_mol.source == mol.source + + # Test with same ordering (identity) + same_mol = renumber_atoms(mol, [0, 1, 2]) + @test same_mol.valid == true + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + result = renumber_atoms(invalid_mol, [0, 1, 2]) + @test result.valid == false + end + + @testset "remove_stereochemistry! function" begin + # Test with chiral molecule + chiral_mol = mol_from_smiles("C[C@H](O)C") + original_smiles = mol_to_smiles(chiral_mol) + + result = remove_stereochemistry!(chiral_mol) + @test result.valid == true + @test result == chiral_mol # Should return same object (in-place) + + # Test with achiral molecule + achiral_mol = mol_from_smiles("CCO") + result_achiral = remove_stereochemistry!(achiral_mol) + @test result_achiral.valid == true + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + result_invalid = remove_stereochemistry!(invalid_mol) + @test result_invalid.valid == false + end + + @testset "sanitize_mol! function" begin + mol = mol_from_smiles("CCO") + + result = sanitize_mol!(mol) + @test result.valid == true + @test result == mol # Should return same object (in-place) + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + result_invalid = sanitize_mol!(invalid_mol) + @test result_invalid.valid == false + end + + @testset "compute_2d_coords! function" begin + mol = mol_from_smiles("CCO") + + result = compute_2d_coords!(mol) + @test result.valid == true + @test result == mol # Should return same object (in-place) + + # Test with aromatic molecule + benzene = mol_from_smiles("c1ccccc1") + result_benzene = compute_2d_coords!(benzene) + @test result_benzene.valid == true + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + result_invalid = compute_2d_coords!(invalid_mol) + @test result_invalid.valid == false end end diff --git a/test/test_pharmacophore.jl b/test/test_pharmacophore.jl new file mode 100644 index 0000000..5221c0b --- /dev/null +++ b/test/test_pharmacophore.jl @@ -0,0 +1,240 @@ +using Test +using MoleculeFlow + +@testset "Pharmacophore Features" begin + @testset "Feature Factory Creation" begin + # Test default feature factory + factory = create_feature_factory() + @test factory isa FeatureFactory + @test length(factory.feature_families) > 0 + @test factory.num_definitions > 0 + + # Test that common feature families are present + families = get_feature_families(factory) + @test "Donor" in families + @test "Acceptor" in families + @test "Aromatic" in families + + # Test custom feature definition string + fdef_string = raw""" + DefineFeature HAcceptor1 [N,O;H0] + Family HBondAcceptor + Weights 1.0 + EndFeature + """ + + try + custom_factory = create_feature_factory( + use_default = false, fdef_string = fdef_string + ) + @test custom_factory isa FeatureFactory + catch e + @warn "Custom feature factory creation failed: $e" + end + end + + @testset "Molecular Feature Extraction" begin + # Test with simple molecules + ethanol = mol_from_smiles("CCO") + phenol = mol_from_smiles("c1ccccc1O") + + factory = create_feature_factory() + + # Test feature extraction without conformers (should return empty) + ethanol_features_no_conf = get_mol_features(ethanol, factory) + @test ethanol_features_no_conf isa Vector{ChemicalFeature} + @test isempty(ethanol_features_no_conf) # No features without conformers + + # Test feature extraction with 2D conformers + ethanol_2d = generate_2d_conformers(ethanol) + phenol_2d = generate_2d_conformers(phenol) + + if !isempty(ethanol_2d) && !isempty(phenol_2d) + ethanol_mol_2d = ethanol_2d[1].molecule + phenol_mol_2d = phenol_2d[1].molecule + + ethanol_features = get_mol_features(ethanol_mol_2d, factory) + @test ethanol_features isa Vector{ChemicalFeature} + @test !isempty(ethanol_features) # Should have features with conformers + + phenol_features = get_mol_features(phenol_mol_2d, factory) + @test phenol_features isa Vector{ChemicalFeature} + @test length(phenol_features) >= length(ethanol_features) # Phenol should have more features + + # Test that features have valid properties + if !isempty(ethanol_features) + feature = ethanol_features[1] + @test feature.family isa String + @test feature.type isa String + @test feature.atom_ids isa Vector{Int} + @test feature.position isa Vector{Float64} + @test feature.id isa Int + @test length(feature.position) == 3 # 3D coordinates + @test all(id -> id > 0, feature.atom_ids) # 1-based indexing + end + + # Test vectorized operation + molecules_2d = [ethanol_mol_2d, phenol_mol_2d] + all_features = get_mol_features(molecules_2d, factory) + @test length(all_features) == 2 + @test all_features[1] isa Vector{ChemicalFeature} + @test all_features[2] isa Vector{ChemicalFeature} + end + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + invalid_features = get_mol_features(invalid_mol, factory) + @test isempty(invalid_features) + end + + @testset "Feature Filtering" begin + mol = mol_from_smiles("CCO") # Ethanol: has donor and acceptor + factory = create_feature_factory() + + # Generate conformer for feature extraction + conformers_2d = generate_2d_conformers(mol) + if !isempty(conformers_2d) + mol_2d = conformers_2d[1].molecule + features = get_mol_features(mol_2d, factory) + + if !isempty(features) + # Test filtering by family + families = unique([f.family for f in features]) + if !isempty(families) + first_family = families[1] + filtered = filter_features_by_family(features, first_family) + @test all(f -> f.family == first_family, filtered) + end + + # Test filtering for non-existent family + empty_filtered = filter_features_by_family(features, "NonExistentFamily") + @test isempty(empty_filtered) + end + end + end + + @testset "Pharmacophore Fingerprints" begin + molecules = [ + mol_from_smiles("CCO"), # Ethanol + mol_from_smiles("c1ccccc1O"), # Phenol + mol_from_smiles("CC(=O)N"), # Acetamide + ] + + for mol in molecules + # Test basic fingerprint generation + fp = pharmacophore_fingerprint(mol) + @test fp isa Vector{Bool} || fp === missing + + if fp isa Vector{Bool} + @test length(fp) > 0 + # Test with custom parameters + fp_custom = pharmacophore_fingerprint(mol; min_points = 2, max_points = 4) + @test fp_custom isa Vector{Bool} || fp_custom === missing + end + end + + # Test vectorized fingerprint generation + fps = pharmacophore_fingerprint(molecules) + @test length(fps) == 3 + @test all(fp -> fp isa Vector{Bool} || fp === missing, fps) + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + invalid_fp = pharmacophore_fingerprint(invalid_mol) + @test invalid_fp === missing + end + + @testset "3D Pharmacophore Features" begin + mol = mol_from_smiles("CCO") + factory = create_feature_factory() + + # Test without 3D coordinates (should return empty) + ph4_no_coords = get_pharmacophore_3d(mol, factory) + @test ph4_no_coords isa Vector{Tuple{String, Vector{Float64}}} + @test isempty(ph4_no_coords) # No 3D features without conformers + + # Test with 3D coordinates if conformer generation is available + try + conformers_3d = generate_3d_conformers(mol, 1) + if !isempty(conformers_3d) + mol_3d = conformers_3d[1].molecule + ph4_3d = get_pharmacophore_3d(mol_3d, factory) + + @test ph4_3d isa Vector{Tuple{String, Vector{Float64}}} + + # Check that each pharmacophore point has valid data + for (family, position) in ph4_3d + @test family isa String + @test position isa Vector{Float64} + @test length(position) == 3 # 3D coordinates + @test !any(isnan, position) + end + + # Test vectorized 3D pharmacophore extraction + molecules_3d = [mol_3d] + ph4_batch = get_pharmacophore_3d(molecules_3d, factory) + @test length(ph4_batch) == 1 + @test ph4_batch[1] isa Vector{Tuple{String, Vector{Float64}}} + end + catch e + @warn "3D conformer generation not available for pharmacophore testing: $e" + end + + # Test with invalid molecule + invalid_mol = mol_from_smiles("invalid_smiles") + invalid_ph4 = get_pharmacophore_3d(invalid_mol, factory) + @test isempty(invalid_ph4) + end + + @testset "Edge Cases and Error Handling" begin + factory = create_feature_factory() + + # Test with empty molecule (if possible) + try + empty_mol = mol_from_smiles("") + empty_features = get_mol_features(empty_mol, factory) + @test isempty(empty_features) + + empty_fp = pharmacophore_fingerprint(empty_mol) + @test empty_fp isa Vector{Bool} || empty_fp === missing + catch + # Some SMILES parsers may not handle empty strings + end + + # Test with very simple molecule + methane = mol_from_smiles("C") + methane_features = get_mol_features(methane, factory) + @test methane_features isa Vector{ChemicalFeature} + + methane_fp = pharmacophore_fingerprint(methane) + @test methane_fp isa Vector{Bool} || methane_fp === missing + end + + @testset "Common Use Cases" begin + # Test drug-like molecules + aspirin = mol_from_smiles("CC(=O)Oc1ccccc1C(=O)O") + caffeine = mol_from_smiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") + + factory = create_feature_factory() + + # These should have multiple features + aspirin_features = get_mol_features(aspirin, factory) + caffeine_features = get_mol_features(caffeine, factory) + + @test aspirin_features isa Vector{ChemicalFeature} + @test caffeine_features isa Vector{ChemicalFeature} + + # Test pharmacophore fingerprints for drug-like molecules + aspirin_fp = pharmacophore_fingerprint(aspirin) + caffeine_fp = pharmacophore_fingerprint(caffeine) + + @test aspirin_fp isa Vector{Bool} || aspirin_fp === missing + @test caffeine_fp isa Vector{Bool} || caffeine_fp === missing + + # Fingerprints should be different for different molecules + if aspirin_fp isa Vector{Bool} && caffeine_fp isa Vector{Bool} + @test length(aspirin_fp) == length(caffeine_fp) # Same fingerprint size + @test aspirin_fp != caffeine_fp # Different molecules should have different fingerprints + end + end +end diff --git a/test/test_reaction.jl b/test/test_reaction.jl index a571dd7..82a03ea 100644 --- a/test/test_reaction.jl +++ b/test/test_reaction.jl @@ -27,7 +27,7 @@ end @testset "Reaction Validation" begin # Valid reaction rxn_smarts = "[C:1](=O)[O:2][C:3]>>[C:1](=O)[O-].[C:3][O+]" - rxn = reaction_from_smarts(rxn_smarts, validate=false) + rxn = reaction_from_smarts(rxn_smarts, validate = false) @test !rxn.validated @test validate_reaction!(rxn) @test rxn.validated @@ -48,7 +48,7 @@ end @test all(prodset -> all(prod -> prod isa Molecule, prodset), products) # Test with max_products limit - products_limited = run_reaction(rxn, [reactant], max_products=1) + products_limited = run_reaction(rxn, [reactant], max_products = 1) @test length(products_limited) <= 1 end @@ -95,7 +95,7 @@ end @test length(fp) == 2048 # Default size # Test cached fingerprint - fp2 = reaction_fingerprint(rxn, use_cache=true) + fp2 = reaction_fingerprint(rxn, use_cache = true) @test fp == fp2 # Test structural fingerprint @@ -115,12 +115,12 @@ end rxn2 = reaction_from_smarts(rxn2_smarts) # Test Tanimoto similarity - sim_tanimoto = reaction_similarity(rxn1, rxn2, method=:tanimoto) + sim_tanimoto = reaction_similarity(rxn1, rxn2, method = :tanimoto) @test isa(sim_tanimoto, Float64) @test 0.0 <= sim_tanimoto <= 1.0 # Test Dice similarity - sim_dice = reaction_similarity(rxn1, rxn2, method=:dice) + sim_dice = reaction_similarity(rxn1, rxn2, method = :dice) @test isa(sim_dice, Float64) @test 0.0 <= sim_dice <= 1.0 @@ -179,7 +179,8 @@ end # Test reaction classification classification = reaction_type_classification(rxn) @test isa(classification, Symbol) - @test classification in [:decomposition, :combination, :substitution, :rearrangement, :complex] + @test classification in + [:decomposition, :combination, :substitution, :rearrangement, :complex] end @testset "Reaction Library Enumeration" begin @@ -210,12 +211,12 @@ end reaction_db = [rxn1, rxn2, rxn3] # Search for similar reactions - similar = find_similar_reactions(rxn1, reaction_db, threshold=0.1) + similar = find_similar_reactions(rxn1, reaction_db, threshold = 0.1) @test isa(similar, Vector{Tuple{Reaction, Float64}}) @test length(similar) >= 1 # Should at least find itself # Check that similarities are sorted if length(similar) > 1 - @test all(i -> similar[i][2] >= similar[i+1][2], 1:(length(similar)-1)) + @test all(i -> similar[i][2] >= similar[i + 1][2], 1:(length(similar) - 1)) end -end \ No newline at end of file +end diff --git a/test/test_vsa_descriptors.jl b/test/test_vsa_descriptors.jl new file mode 100644 index 0000000..6eaf220 --- /dev/null +++ b/test/test_vsa_descriptors.jl @@ -0,0 +1,337 @@ +using Test +using MoleculeFlow + +@testset "VSA Descriptors" begin + # Test molecules + ethanol = mol_from_smiles("CCO") + benzene = mol_from_smiles("c1ccccc1") + aspirin = mol_from_smiles("CC(=O)OC1=CC=CC=C1C(=O)O") + invalid_mol = mol_from_smiles("invalid_smiles") + + @testset "SlogP_VSA Descriptors" begin + # Test SlogP_VSA2-12 descriptors + vsa_functions = [ + slogp_vsa2, + slogp_vsa3, + slogp_vsa4, + slogp_vsa5, + slogp_vsa6, + slogp_vsa7, + slogp_vsa8, + slogp_vsa9, + slogp_vsa10, + slogp_vsa11, + slogp_vsa12, + ] + + for (i, vsa_func) in enumerate(vsa_functions) + # Test with ethanol + result_ethanol = vsa_func(ethanol) + @test isa(result_ethanol, Float64) || result_ethanol === missing + if isa(result_ethanol, Float64) + @test result_ethanol >= 0.0 + end + + # Test with benzene + result_benzene = vsa_func(benzene) + @test isa(result_benzene, Float64) || result_benzene === missing + if isa(result_benzene, Float64) + @test result_benzene >= 0.0 + end + + # Test with invalid molecule + @test vsa_func(invalid_mol) === missing + end + + # Test vectorized operations + mols = [ethanol, benzene, aspirin] + results = slogp_vsa2.(mols) + @test length(results) == 3 + @test all(r -> isa(r, Float64) || r === missing, results) + end + + @testset "SMR_VSA Descriptors" begin + # Test SMR_VSA1-10 descriptors + vsa_functions = [ + smr_vsa1, + smr_vsa2, + smr_vsa3, + smr_vsa4, + smr_vsa5, + smr_vsa6, + smr_vsa7, + smr_vsa8, + smr_vsa9, + smr_vsa10, + ] + + for vsa_func in vsa_functions + # Test with ethanol + result_ethanol = vsa_func(ethanol) + @test isa(result_ethanol, Float64) || result_ethanol === missing + if isa(result_ethanol, Float64) + @test result_ethanol >= 0.0 + end + + # Test with benzene + result_benzene = vsa_func(benzene) + @test isa(result_benzene, Float64) || result_benzene === missing + if isa(result_benzene, Float64) + @test result_benzene >= 0.0 + end + + # Test with invalid molecule + @test vsa_func(invalid_mol) === missing + end + + # Test vectorized operations + mols = [ethanol, benzene, aspirin] + results = smr_vsa1.(mols) + @test length(results) == 3 + @test all(r -> isa(r, Float64) || r === missing, results) + end + + @testset "PEOE_VSA Descriptors" begin + # Test PEOE_VSA1-14 descriptors + vsa_functions = [ + peoe_vsa1, + peoe_vsa2, + peoe_vsa3, + peoe_vsa4, + peoe_vsa5, + peoe_vsa6, + peoe_vsa7, + peoe_vsa8, + peoe_vsa9, + peoe_vsa10, + peoe_vsa11, + peoe_vsa12, + peoe_vsa13, + peoe_vsa14, + ] + + for vsa_func in vsa_functions + # Test with ethanol + result_ethanol = vsa_func(ethanol) + @test isa(result_ethanol, Float64) || result_ethanol === missing + if isa(result_ethanol, Float64) + @test result_ethanol >= 0.0 + end + + # Test with benzene + result_benzene = vsa_func(benzene) + @test isa(result_benzene, Float64) || result_benzene === missing + if isa(result_benzene, Float64) + @test result_benzene >= 0.0 + end + + # Test with invalid molecule + @test vsa_func(invalid_mol) === missing + end + + # Test vectorized operations + mols = [ethanol, benzene, aspirin] + results = peoe_vsa1.(mols) + @test length(results) == 3 + @test all(r -> isa(r, Float64) || r === missing, results) + end + + @testset "VSA Descriptor Properties" begin + # Test that VSA descriptors give different values for different molecules + ethanol_slogp2 = slogp_vsa2(ethanol) + benzene_slogp2 = slogp_vsa2(benzene) + + if isa(ethanol_slogp2, Float64) && isa(benzene_slogp2, Float64) + # They should be different for different molecules + @test ethanol_slogp2 != benzene_slogp2 + end + + # Test that VSA descriptors are non-negative + for mol in [ethanol, benzene] + peoe1 = peoe_vsa1(mol) + if isa(peoe1, Float64) + @test peoe1 >= 0.0 + end + end + end +end + +@testset "BCUT Descriptors" begin + # Test molecules + ethanol = mol_from_smiles("CCO") + benzene = mol_from_smiles("c1ccccc1") + invalid_mol = mol_from_smiles("invalid_smiles") + + @testset "BCUT2D Descriptors" begin + # Test all BCUT2D descriptors + bcut_functions = [ + bcut2d_mwlow, + bcut2d_mwhi, + bcut2d_chglow, + bcut2d_chghi, + bcut2d_logplow, + bcut2d_logphi, + bcut2d_mrlow, + bcut2d_mrhi, + ] + + for bcut_func in bcut_functions + # Test with ethanol + result_ethanol = bcut_func(ethanol) + @test isa(result_ethanol, Float64) || result_ethanol === missing + + # Test with benzene + result_benzene = bcut_func(benzene) + @test isa(result_benzene, Float64) || result_benzene === missing + + # Test with invalid molecule + @test bcut_func(invalid_mol) === missing + end + + # Test vectorized operations + mols = [ethanol, benzene] + results = bcut2d_mwlow.(mols) + @test length(results) == 2 + @test all(r -> isa(r, Float64) || r === missing, results) + end + + @testset "BCUT Descriptor Properties" begin + # Test that high/low pairs make sense when both are valid + mwlow = bcut2d_mwlow(benzene) + mwhi = bcut2d_mwhi(benzene) + + if isa(mwlow, Float64) && isa(mwhi, Float64) + @test mwlow <= mwhi # Low should be <= high + end + + logplow = bcut2d_logplow(benzene) + logphi = bcut2d_logphi(benzene) + + if isa(logplow, Float64) && isa(logphi, Float64) + @test logplow <= logphi # Low should be <= high + end + end +end + +@testset "Additional Structure Count Descriptors" begin + # Test molecules with various structural features + ethanol = mol_from_smiles("CCO") + cyclohexane = mol_from_smiles("C1CCCCC1") + pyridine = mol_from_smiles("c1cccnc1") + tetrahydrofuran = mol_from_smiles("C1CCOC1") + invalid_mol = mol_from_smiles("invalid_smiles") + + @testset "Additional Ring Count Functions" begin + # Test num_aliphatic_heterocycles + @test num_aliphatic_heterocycles(ethanol) == 0 + @test num_aliphatic_heterocycles(tetrahydrofuran) == 1 # THF is aliphatic heterocycle + @test num_aliphatic_heterocycles(pyridine) == 0 # Pyridine is aromatic + @test num_aliphatic_heterocycles(invalid_mol) === missing + + # Test num_saturated_heterocycles + @test num_saturated_heterocycles(ethanol) == 0 + @test num_saturated_heterocycles(tetrahydrofuran) == 1 + @test num_saturated_heterocycles(invalid_mol) === missing + + # Test num_saturated_carbocycles + @test num_saturated_carbocycles(ethanol) == 0 + @test num_saturated_carbocycles(cyclohexane) == 1 + @test num_saturated_carbocycles(invalid_mol) === missing + + # Test num_aliphatic_rings + @test num_aliphatic_rings(ethanol) == 0 + @test num_aliphatic_rings(cyclohexane) == 1 + @test num_aliphatic_rings(pyridine) == 0 # Aromatic, not aliphatic + @test num_aliphatic_rings(invalid_mol) === missing + + # Test num_heterocycles + @test num_heterocycles(ethanol) == 0 + @test num_heterocycles(pyridine) == 1 + @test num_heterocycles(tetrahydrofuran) == 1 + @test num_heterocycles(invalid_mol) === missing + end + + @testset "Additional Stereochemistry and Structure Counts" begin + chiral_mol = mol_from_smiles("C[C@H](O)C") + + # Test num_unspecified_atom_stereo_centers + @test isa(num_unspecified_atom_stereo_centers(ethanol), Int) + @test isa(num_unspecified_atom_stereo_centers(chiral_mol), Int) + @test num_unspecified_atom_stereo_centers(invalid_mol) === missing + + # Test num_spiro_atoms + @test isa(num_spiro_atoms(ethanol), Int) + @test num_spiro_atoms(ethanol) == 0 # No spiro atoms in ethanol + @test num_spiro_atoms(invalid_mol) === missing + + # Test num_bridgehead_atoms + @test isa(num_bridgehead_atoms(ethanol), Int) + @test num_bridgehead_atoms(ethanol) == 0 # No bridgehead atoms in ethanol + @test num_bridgehead_atoms(invalid_mol) === missing + + # Test hall_kier_alpha (can be negative) + @test isa(hall_kier_alpha(ethanol), Float64) + @test hall_kier_alpha(invalid_mol) === missing + + # Note: num_sp3_heavy_atoms removed as RDKit function doesn't exist + end +end + +@testset "Additional E-State Descriptors" begin + ethanol = mol_from_smiles("CCO") + benzene = mol_from_smiles("c1ccccc1") + invalid_mol = mol_from_smiles("invalid_smiles") + + @testset "Max/Min Absolute E-State Indices" begin + # Test max_absolute_e_state_index + max_abs_estate = max_absolute_e_state_index(ethanol) + @test isa(max_abs_estate, Float64) || max_abs_estate === missing + @test max_absolute_e_state_index(invalid_mol) === missing + + # Test min_absolute_e_state_index + min_abs_estate = min_absolute_e_state_index(ethanol) + @test isa(min_abs_estate, Float64) || min_abs_estate === missing + @test min_absolute_e_state_index(invalid_mol) === missing + + # Test relationship between max and min (when both are valid) + if isa(max_abs_estate, Float64) && isa(min_abs_estate, Float64) + @test max_abs_estate >= min_abs_estate + end + end +end + +@testset "Vectorized Operations for New Descriptors" begin + mols = [ + mol_from_smiles("CCO"), mol_from_smiles("c1ccccc1"), mol_from_smiles("C1CCCCC1") + ] + + # Test vectorized VSA descriptors + slogp2_values = slogp_vsa2.(mols) + @test length(slogp2_values) == 3 + @test all(v -> isa(v, Float64) || v === missing, slogp2_values) + + smr1_values = smr_vsa1.(mols) + @test length(smr1_values) == 3 + @test all(v -> isa(v, Float64) || v === missing, smr1_values) + + peoe1_values = peoe_vsa1.(mols) + @test length(peoe1_values) == 3 + @test all(v -> isa(v, Float64) || v === missing, peoe1_values) + + # Test vectorized BCUT descriptors + bcut_mwlow_values = bcut2d_mwlow.(mols) + @test length(bcut_mwlow_values) == 3 + @test all(v -> isa(v, Float64) || v === missing, bcut_mwlow_values) + + # Test vectorized structure counts + aliphatic_rings = num_aliphatic_rings.(mols) + @test length(aliphatic_rings) == 3 + @test all(v -> isa(v, Int) || v === missing, aliphatic_rings) + + # Note: num_sp3_heavy_atoms test removed as function doesn't exist + + # Test vectorized 3D descriptors + pmi1_values = pmi1.(mols) + @test length(pmi1_values) == 3 + @test all(v -> isa(v, Float64) || v === missing, pmi1_values) +end