diff --git a/.gitignore b/.gitignore index 06dfc56..24fcec7 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ build/ .gradle # EFGF evaluation test output -ErtlFunctionalGroupsFinderEvaluationTest_Output/ \ No newline at end of file +ErtlFunctionalGroupsFinderEvaluationTest_Output/ +ChEBI_complete.sdf +/Output/ \ No newline at end of file diff --git a/License-header/License-header.txt b/License-header/License-header.txt index d1b5969..9b2deb0 100644 --- a/License-header/License-header.txt +++ b/License-header/License-header.txt @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) $today.year Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/build.gradle b/build.gradle index ee92eea..e2dae14 100644 --- a/build.gradle +++ b/build.gradle @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (C) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (C) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java index 43c471b..499487d 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinder.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * @@ -20,7 +20,10 @@ package org.openscience.cdk.tools; 
+import org.openscience.cdk.PseudoAtom; +import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.graph.ConnectedComponents; +import org.openscience.cdk.graph.ConnectivityChecker; import org.openscience.cdk.graph.GraphUtil; import org.openscience.cdk.graph.GraphUtil.EdgeToBondMap; import org.openscience.cdk.interfaces.IAtom; @@ -30,6 +33,7 @@ import org.openscience.cdk.interfaces.ILonePair; import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.interfaces.ISingleElectron; +import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import java.util.ArrayDeque; import java.util.ArrayList; @@ -38,748 +42,1292 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Queue; import java.util.Set; + /** - * Finds and extracts a molecules's functional groups in a purely rule-based manner. - * + * Finds and extracts a molecule's functional groups in a purely rule-based manner. * This class implements Peter Ertl's algorithm for the automated detection and extraction * of functional groups in organic molecules - * [Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]. + * ([Ertl P. An algorithm to identify functional groups in organic molecules. J Cheminform. 2017; 9:36.]) + * and has been described in a scientific publication + * ([Fritsch, S., Neumann, S., Schaub, J. et al. ErtlFunctionalGroupsFinder: automated rule-based functional group detection with the Chemistry Development Kit (CDK). J Cheminform. 2019; 11:37.]). + *
+ *
In brief, the algorithm iterates through all atoms in the input molecule and marks hetero atoms and specific carbon atoms + * (e.g. those in non-aromatic double or triple bonds) as being part of a functional group. Connected groups of marked + * atoms are extracted as separate functional groups, together with their unmarked, "environmental" carbon atoms. These + * environments can be important, e.g. to differentiate an alcohol from a phenol, but are less important in other cases. + * To account for this, Ertl also devised a "generalization" scheme that generalizes the functional group environments + * in a way that accounts for their varying significance in different cases. Most environmental atoms are exchanged with + * pseudo ("R") atoms there. All these functionalities are available in ErtlFunctionalGroupsFinder. Additionally, the + * marked atoms alone, completely without their environments, can be extracted. + *
+ *
To apply functional group detection to an input molecule, its atom types need to be set and aromaticity needs + * to be detected beforehand: + *
+ * //Prepare input
+ * SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance());
+ * IAtomContainer tmpInputMol = tmpSmiPar.parseSmiles("C[C@@H]1CN(C[C@H](C)N1)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F"); //PubChem CID 5257
+ * AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpInputMol);
+ * Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet());
+ * tmpAromaticity.apply(tmpInputMol);
+ * //Identify functional groups
+ * ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(); //default: generalization turned on
+ * List{@literal <}IAtomContainer{@literal >} tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol);
+ * 
+ * In order to only identify functional groups in standardised, organic structures, ErtlFunctionalGroupsFinder can + * be configured to only accept molecules that do *not* contain any metal, metalloid, or pseudo (R) atoms or formal charges. + * Structures consisting of more than one unconnected component (e.g. ion and counter-ion) are also not accepted if(!) the + * strict input restrictions are turned on (they are turned off by default). + * This can be done via a boolean parameter in a variant of the central find() method. + * To identify molecules that need to be filtered from the input set or preprocessed in this use case, convenience methods are + * available in this class. Please note that structural properties like formal charges and the others mentioned above + * are not expected to cause issues (exceptions) when processed by this class, but they are not explicitly regarded by + * the Ertl algorithm and hence not by this implementation either. They might therefore cause unexpected behaviour in functional + * group identification. For example, a charge is not listed as a reason to mark a carbon atom. + *
+ *
Note: this implementation is not thread-safe. Each parallel thread should have its own instance of this class. * * @author Sebastian Fritsch, Jonas Schaub - * @version 1.2 + * @version 1.3 */ public class ErtlFunctionalGroupsFinder { - - private static ILoggingTool log = LoggingToolFactory.createLoggingTool(ErtlFunctionalGroupsFinder.class); - private static final String CARBONYL_C_MARKER = "Carbonyl-C"; - private final Set nonmetalAtomicNumbers; - private final Mode mode; - private EdgeToBondMap bondMap; - private int[][] adjList; - private HashSet markedAtoms; - private HashMap aromaticHeteroAtoms; // key: atom idx, value: isInGroup - private Map> environmentsMap; - /** - * Defines the working mode. + * Defines the mode for generalizing functional group environments (default), keeping them whole, or only extracting marked atoms. */ - public static enum Mode{ + public static enum Mode { /** * Default mode including the generalization step. */ DEFAULT, /** - * Skips the generalization step. Functional groups will keep their full "environment". + * Skips the generalization step. Functional groups will keep their full environment. + */ + NO_GENERALIZATION, + /** + * Functional groups will only consist of atoms marked according to the conditions defined by Ertl, environments + * will be completely ignored. */ - NO_GENERALIZATION; + ONLY_MARKED_ATOMS; } - - private enum EnvironmentCalCType { C_AROMATIC, C_ALIPHATIC }; - + // + /** + * Defines whether an environmental carbon atom is aromatic or aliphatic. Only for internal use for caching this + * info in the EnvironmentalC instances (see private class below). + */ + private static enum EnvironmentalCType { + /** + * Aromatic environmental carbon. + */ + C_AROMATIC, + /** + * Aliphatic environmental carbon. + */ + C_ALIPHATIC; + } + // /** * Describes one carbon atom in the environment of a marked atom. It can either be aromatic * or aliphatic and also contains a clone of its connecting bond. 
*/ - private class EnvironmentalC{ - private EnvironmentCalCType type; - private int bondIndex; - private IBond.Order bondOrder; - private IBond.Stereo bondStereo; - private boolean[] bondFlags; - - public EnvironmentalC(EnvironmentCalCType type, IBond bond, int indexInBond) { - this.type = type; - - bondIndex = indexInBond; - bondOrder = bond.getOrder(); - bondStereo = bond.getStereo(); - bondFlags = bond.getFlags(); + private class EnvironmentalC { + /** + * Indicates whether carbon atom is aromatic or aliphatic. + */ + private final EnvironmentalCType type; + // + /** + * Bond index of the original C atom. + */ + private final int bondIndex; + // + /** + * Order of the bond connecting this environmental C atom to the marked functional group atom. + */ + private final IBond.Order bondOrder; + // + /** + * Stereo information of the bond connecting this environmental C atom to the marked functional group atom. + */ + private final IBond.Stereo bondStereo; + // + /** + * Flags of the bond connecting this environmental C atom to the marked functional group atom. IChemObjecflags + * are properties defined by an integer value (array position) and a boolean value. + */ + private final boolean[] bondFlags; + // + /** + * Default constructor defining all fields. Order, stereo, and flags are taken from the IBond object directly. + * + * @param aType aromatic or aliphatic + * @param aConnectingBond bond instance connecting to the marked atom + * @param anIndexInBond index of the atom in the connecting bond + */ + public EnvironmentalC(EnvironmentalCType aType, IBond aConnectingBond, int anIndexInBond) { + this.type = aType; + this.bondIndex = anIndexInBond; + this.bondOrder = aConnectingBond.getOrder(); + this.bondStereo = aConnectingBond.getStereo(); + this.bondFlags = aConnectingBond.getFlags(); } - - public EnvironmentCalCType getType() { - return type; + // + /** + * Returns the type, i.e. whether this carbon atom is aromatic or aliphatic. 
+ * + * @return EnvironmentalCType enum constant + */ + public EnvironmentalCType getType() { + return this.type; } - - public IBond createBond(IAtom targetAtom, IAtom cAtom) { - IBond bond = targetAtom.getBuilder().newInstance(IBond.class); - if(bondIndex == 0) { - bond.setAtoms(new IAtom[] {cAtom, targetAtom}); + // + /** + * Method for translating this instance back into a "real" IAtom instance when expanding the functional group + * environment, transferring all the cached properties, except the type(!). + * + * @param aTargetAtom marked functional group atom + * @param anEnvCAtom new carbon atom instance that should receive all the cached properties except the type(!); + * element, atom type "C" and implicit hydrogen count = 0 should be set already; type can later + * be set via .setIsAromatic(boolean); + * @return new bond connecting marked FG atom and environment atom in the correct order and with the cached properties + */ + public IBond createBond(IAtom aTargetAtom, IAtom anEnvCAtom) { + IBond tmpBond = aTargetAtom.getBuilder().newInstance(IBond.class); + if (this.bondIndex == 0) { + tmpBond.setAtoms(new IAtom[] {anEnvCAtom, aTargetAtom}); } else { - bond.setAtoms(new IAtom[] {targetAtom, cAtom}); + tmpBond.setAtoms(new IAtom[] {aTargetAtom, anEnvCAtom}); } - bond.setOrder(bondOrder); - bond.setStereo(bondStereo); - bond.setFlags(bondFlags); - - return bond; + tmpBond.setOrder(this.bondOrder); + tmpBond.setStereo(this.bondStereo); + tmpBond.setFlags(this.bondFlags); + return tmpBond; } } - + // + /** + * CDK logging tool instance for this class. Use ErtlFunctionalGroupsFinder.LOGGING_TOOL.setLevel(ILoggingTool.DEBUG); + * to activate debug messages. + */ + public static final ILoggingTool LOGGING_TOOL = LoggingToolFactory.createLoggingTool(ErtlFunctionalGroupsFinder.class); + // + /** + * Property name for marking carbonyl carbon atoms via IAtom properties. 
+ */ + public static final String CARBONYL_C_MARKER = "EFGF-Carbonyl-C"; + // + /** + * Set of atomic numbers of nonmetal elements, namely hydrogen, carbon, nitrogen, oxygen, phosphorus, sulfur, selenium, + * halogens (fluorine, chlorine, bromine, iodine), and noble gases (helium, neon, argon, krypton, xenon, radon). + * Atoms of these elements are exclusively accepted in the input molecule if(!) the strict input restrictions are + * activated (turned off by default). + */ + public static final Set NONMETAL_ATOMIC_NUMBERS = Set.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); + // /** - * Default constructor for ErtlFunctionalGroupsFinder. + * Environment mode setting, defining whether environments should be generalized (default) or kept as whole. + */ + private Mode envMode; + // + /** + * Map of bonds in the input molecule, cache(!). + */ + private EdgeToBondMap bondMapCache; + // + /** + * Adjacency list representation of input molecule, cache(!). + */ + private int[][] adjListCache; + // + /** + * Set for atoms marked as being part of a functional group, represented by an internal index based on the atom + * count in the input molecule, cache(!). + */ + private HashSet markedAtomsCache; + // + /** + * HashMap for storing aromatic hetero-atom indices and whether they have already been assigned to a larger functional + * group. If false, they form single-atom FG by themselves, cache(!). + * key: atom idx, value: isInGroup + */ + private HashMap aromaticHeteroAtomIndicesToIsInGroupBoolMapCache; + // + /** + * HashMap for storing marked atom to connected environmental carbon atom relations, cache(!). + */ + private HashMap> markedAtomToConnectedEnvCMapCache; + // + /** + * Default constructor for ErtlFunctionalGroupsFinder with functional group generalization turned ON. */ public ErtlFunctionalGroupsFinder() { this(Mode.DEFAULT); } - + // /** - * Constructor for ErtlFunctionalGroupsFinder. 
+ * Constructor for ErtlFunctionalGroupsFinder that allows setting the treatment of environments in the identified + * functional groups. Default: environments will be generalized; no generalization: environments will be kept as whole; + * only marked atoms: no environmental atoms whatsoever will be attached to the extracted functional groups. * - * @param mode working mode (see {@code ErtlFunctionalGroupsFinder.Mode}). + * @param anEnvMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). */ - public ErtlFunctionalGroupsFinder(Mode mode) { - this.mode = mode; - - // init non-metal and non-metalloid atom numbers - nonmetalAtomicNumbers = Set.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); //ImmutableSet.of(1, 2, 6, 7, 8, 9, 10, 15, 16, 17, 18, 34, 35, 36, 53, 54, 86); + public ErtlFunctionalGroupsFinder(Mode anEnvMode) { + Objects.requireNonNull(anEnvMode, "Given environment mode cannot be null."); + this.envMode = anEnvMode; } - + // /** - * Find all functional groups contained in a molecule. + * Constructs a new ErtlFunctionalGroupsFinder instance with generalization of returned functional groups turned ON. * - * NOTE: The input must consist of one connected structure and may not contain charged atoms, metals or metalloids. + * @return new ErtlFunctionalGroupsFinder instance that generalizes returned functional groups + */ + public static ErtlFunctionalGroupsFinder newErtlFunctionalGroupsFinderGeneralizingMode() { + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + return tmpEFGF; + } + // + /** + * Constructs a new ErtlFunctionalGroupsFinder instance with generalization of returned functional groups turned OFF. + * The FG will have their full environments. * - * @param container the molecule which contains the functional groups (may not contain charged atoms, metals, - * metalloids or unconnected components!) 
- * @return a list with all functional groups found in the molecule. + * @return new ErtlFunctionalGroupsFinder instance that does NOT generalize returned functional groups */ - public List find(IAtomContainer container){ - return find(container, true); + public static ErtlFunctionalGroupsFinder newErtlFunctionalGroupsFinderFullEnvironmentMode() { + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + return tmpEFGF; } - + // + /** + * Constructs a new ErtlFunctionalGroupsFinder instance that extracts only the marked atoms of the functional groups, + * no attached environmental atoms. + * + * @return new ErtlFunctionalGroupsFinder instance that extracts only marked atoms + */ + public static ErtlFunctionalGroupsFinder newErtlFunctionalGroupsFinderOnlyMarkedAtomsMode() { + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + return tmpEFGF; + } + // + /** + * Allows setting the treatment of functional group environments after extraction. Default: environments will be + * generalized; no generalization: environments will be kept as whole; only marked atoms: no environmental atoms + * whatsoever will be attached to the extracted functional groups. + * + * @param anEnvMode mode for treating functional group environments (see {@link ErtlFunctionalGroupsFinder.Mode}). + */ + public void setEnvMode(Mode anEnvMode) { + Objects.requireNonNull(anEnvMode, "Given environment mode cannot be null."); + this.envMode = anEnvMode; + } + // + /** + * Returns the current setting for the treatment of functional group environments after extraction. + * + * @return currently set environment mode + */ + public Mode getEnvMode() { + return this.envMode; + } + // + /** + * Find all functional groups in a molecule. The input atom container instance is cloned before processing to leave + * the input container intact. + *

+ * Note: The strict input restrictions from previous versions (no charged atoms, metals, metalloids or + * unconnected components) do not apply anymore by default. They can be turned on again in another variant of + * this method below. + *

+ * + * @param aMolecule the molecule to identify functional groups in + * @throws CloneNotSupportedException if cloning is not possible + * @return a list with all functional groups found in the molecule + */ + public List find(IAtomContainer aMolecule) throws CloneNotSupportedException { + return this.find(aMolecule, true, false); + } + // /** - * Find all functional groups contained in a molecule. + * Find all functional groups in a molecule. + *

+ * Note: The strict input restrictions from previous versions (no charged atoms, metals, metalloids or + * unconnected components) do not apply anymore by default. They can be turned on again in another variant of + * this method below. + *

* - * NOTE: The input must consist of one connected structure and may not contain charged atoms, metals or metalloids. + * @param aMolecule the molecule to identify functional groups in + * @param aShouldInputBeCloned use 'false' to reuse the input container's bonds and atoms in the extraction of the functional + * groups; this may speed up the extraction and lower the memory consumption for processing large + * amounts of data but corrupts the original input container; use 'true' to work with a clone and + * leave the input container intact + * @throws CloneNotSupportedException if cloning is not possible + * @return a list with all functional groups found in the molecule + */ + public List find(IAtomContainer aMolecule, boolean aShouldInputBeCloned) throws CloneNotSupportedException { + return this.find(aMolecule, aShouldInputBeCloned, false); + } + // + /** + * Find all functional groups in a molecule. * - * @param container the molecule which contains the functional groups (may not contain charged atoms, metals, - * metalloids or unconnected components!) - * @param clone Use 'false' to reuse the input container's bonds and atoms in the extraction of the functional - * groups. This may speed up the extraction and lower the memory consumption for processing large - * amounts of data but corrupts the original input container. - * Use 'true' to work with a clone and leave the input container intact (default). - * @return a list with all functional groups found in the molecule. - */ - public List find(IAtomContainer container, boolean clone){ - // work with a clone? 
- IAtomContainer mol; - if(clone){ - try { - mol = container.clone(); - } catch (CloneNotSupportedException e) { - throw new IllegalStateException("Atom container could not be cloned"); + * @param aMolecule the molecule to identify functional groups in + * @param aShouldInputBeCloned use 'false' to reuse the input container's bonds and atoms in the extraction of the functional + * groups; this may speed up the extraction and lower the memory consumption for processing large + * amounts of data but corrupts the original input container; use 'true' to work with a clone and + * leave the input container intact + * @param anAreInputRestrictionsApplied if true, the input must consist of one connected structure and may not + * contain charged atoms, metals or metalloids; an IllegalArgumentException will + * be thrown otherwise; see convenience methods in this class for detecting + * illegal input structures for this case + * @throws CloneNotSupportedException if cloning is not possible + * @throws IllegalArgumentException if input restrictions are applied and the given molecule does not fulfill them + * @return a list with all functional groups found in the molecule + */ + public List find(IAtomContainer aMolecule, boolean aShouldInputBeCloned, boolean anAreInputRestrictionsApplied) + throws CloneNotSupportedException, IllegalArgumentException { + this.clearCache(); + IAtomContainer tmpMolecule; + if (aShouldInputBeCloned) { + tmpMolecule = aMolecule.clone(); + } else { + tmpMolecule = aMolecule; + } + for (IAtom tmpAtom : tmpMolecule.atoms()) { + if(Objects.isNull(tmpAtom.getImplicitHydrogenCount())) { + tmpAtom.setImplicitHydrogenCount(0); } } - else{ - mol = container; + this.bondMapCache = EdgeToBondMap.withSpaceFor(tmpMolecule); + this.adjListCache = GraphUtil.toAdjList(tmpMolecule, this.bondMapCache); + if (anAreInputRestrictionsApplied) { + // throws IllegalArgumentException if constraints are not met + // only done now because adjacency list cache is needed in 
the method + this.checkConstraints(tmpMolecule); } - - // init GraphUtil & EdgeToBondMap - bondMap = EdgeToBondMap.withSpaceFor(mol); - adjList = GraphUtil.toAdjList(mol, bondMap); - - checkConstraints(mol); - - // atom marking - markAtoms(mol); - + this.markAtoms(tmpMolecule); // extract raw groups - List groups = extractGroups(mol); - + List tmpFunctionalGroupsList = this.extractGroups(tmpMolecule); // handle environment - if(mode == Mode.DEFAULT) { - expandGeneralizedEnvironments(groups); + if (this.envMode == Mode.DEFAULT) { + this.expandGeneralizedEnvironments(tmpFunctionalGroupsList); + } else if (this.envMode == Mode.NO_GENERALIZATION) { + this.expandFullEnvironments(tmpFunctionalGroupsList); + } else if (this.envMode == Mode.ONLY_MARKED_ATOMS) { + //do nothing + }else { + throw new IllegalArgumentException("Unknown mode."); + } + this.clearCache(); + return tmpFunctionalGroupsList; + } + // + /** + * Applies the always necessary preprocessing for functional group detection. Atom types are set and aromaticity detected + * in the input molecule. + *
NOTE: This changes properties and flags in the given atom container instance. If you + * want to retain your object unchanged for future calculations, use the IAtomContainer's + * clone() method. + * + * @param aMolecule the molecule to process + * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen + * ElectronDonation model can massively influence the extracted functional groups of a molecule + * when using ErtlFunctionGroupsFinder! + * @throws NullPointerException if any parameter is null + * @throws IllegalArgumentException if the input molecule causes any other type of exception while processing + */ + public static void applyPreprocessing(IAtomContainer aMolecule, Aromaticity anAromaticityModel) throws NullPointerException, IllegalArgumentException { + Objects.requireNonNull(aMolecule, "Given atom container is 'null'."); + Objects.requireNonNull(anAromaticityModel, "Given aromaticity model is 'null'."); + try { + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(aMolecule); + anAromaticityModel.apply(aMolecule); + } catch (Exception anException) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn(anException); + throw new IllegalArgumentException(anException); + } + } + // + /** + * Returns the unmodifiable set containing the atomic numbers that can be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). These nonmetal elements include + * hydrogen, carbon, nitrogen, oxygen, phosphorus, sulfur, selenium, halogens (fluorine, chlorine, bromine, iodine), + * and noble gases (helium, neon, argon, krypton, xenon, radon). + * All other atomic numbers represent metal, metalloid, or pseudo ('R') atoms. + *
Convenience method analogous to using ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS directly. + * + * @return all valid atomic numbers for ErtlFunctionalGroupsFinder.find() if input restrictions are activated + */ + public static Set getNonmetalAtomicNumbers() { + return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS; + } + // + /** + * Checks whether a given atom is a metal, metalloid, or pseudo atom judging by its atomic number. These atoms + * cannot be passed on to ErtlFunctionalGroupsFinder.find() if(!) input restrictions are enabled (turned off by default). + * + * @param anAtom the atom to check + * @return true, if the atomic number is not in the nonmetal atomic numbers set or 'null' + * @throws NullPointerException if the given atom is 'null' + */ + public static boolean isMetalMetalloidOrPseudoAtom(IAtom anAtom) throws NullPointerException { + Objects.requireNonNull(anAtom, "Given atom is 'null'."); + if (Objects.isNull(anAtom.getAtomicNumber())) { + return true; } - else if (mode == Mode.NO_GENERALIZATION) { - expandFullEnvironments(groups); + return !ErtlFunctionalGroupsFinder.isNonmetal(anAtom); + } + // + /** + * Iterates through all atoms in the given molecule and checks them for metal, metalloid, and pseudo ("R") atoms. If this + * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). If you are using the strict input restrictions to + * only identify functional groups in standardised, organic structures, you should filter the molecules where this + * method returns true from your input set. + *
This method scales linearly with O(n) with n: number of atoms in the given + * molecule. + * + * @param aMolecule the molecule to check + * @return true, if the molecule contains one or more metal, metalloid, or pseudo ("R") atoms + * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' + */ + public static boolean containsMetalMetalloidOrPseudoAtom(IAtomContainer aMolecule) throws NullPointerException { + Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); + boolean tmpIsAtomicNumberInvalid; + for (IAtom tmpAtom : aMolecule.atoms()) { + // throws NullPointerException if tmpAtom is 'null' + tmpIsAtomicNumberInvalid = ErtlFunctionalGroupsFinder.isMetalMetalloidOrPseudoAtom(tmpAtom); + if (tmpIsAtomicNumberInvalid) { + return true; + } } - else { - throw new IllegalStateException("Unknown mode."); + return false; + } + // + /** + * Checks whether a given atom is charged. These atoms cannot be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). + * + * @param anAtom the atom to check + * @return true, if the atom is charged + * @throws NullPointerException if the given atom is 'null' + */ + public static boolean isCharged(IAtom anAtom) throws NullPointerException { + Objects.requireNonNull(anAtom, "Given atom is 'null'."); + Integer tmpFormalCharge = anAtom.getFormalCharge(); + if (Objects.isNull(tmpFormalCharge)) { + return false; } - - // clear fields - bondMap = null; - adjList = null; - markedAtoms = null; - aromaticHeteroAtoms = null; - environmentsMap = null; - - return groups; + return (tmpFormalCharge.intValue() != 0); } - + // + /** + * Iterates through all atoms in the given molecule and checks whether they are charged. If this + * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() + * if(!) input restrictions are enabled (turned off by default). 
If you are using the strict input restrictions to + * only identify functional groups in standardised, organic structures, you can try to neutralise the charges in the + * molecules where this method returns true by standardisation routines. + *
This method scales linearly with O(n) with n: number of atoms in the given + * molecule. + * + * @param aMolecule the molecule to check + * @return true, if the molecule contains one or more charged atoms + * @throws NullPointerException if the given molecule is 'null' + */ + public static boolean containsChargedAtom(IAtomContainer aMolecule) throws NullPointerException { + Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); + boolean tmpIsAtomCharged; + for (IAtom tmpAtom : aMolecule.atoms()) { + //Throws NullPointerException if tmpAtom is 'null' + tmpIsAtomCharged = ErtlFunctionalGroupsFinder.isCharged(tmpAtom); + if (tmpIsAtomCharged) { + return true; + } + } + return false; + } + // + /** + * Checks whether the given molecule consists of two or more unconnected structures, e.g. ion and counter-ion. This + * would make it unfit to be passed to ErtlFunctionalGroupsFinder.find() if(!) the input restrictions are turned on (turned off by default). + * If you are using the strict input restrictions to only identify functional groups in standardised, organic structures, + * you can try to select the biggest connected component in the input atom containers where this method returns true + * and only pass that to ErtlFunctionalGroupsFinder. + * Note: this is a convenience method basically applying ConnectivityChecker.isConnected(aMolecule);. + * + * @param aMolecule the molecule to check + * @return true, if the molecule consists of two or more unconnected structures + * @throws NullPointerException if the given molecule is 'null' + */ + public static boolean isStructureUnconnected(IAtomContainer aMolecule) throws NullPointerException { + //Developer's note: the private checkConstraints() method is not used here because it is intertwined with the + // find() method for speed-up; but it basically does the same. 
+ Objects.requireNonNull(aMolecule, "Given molecule is 'null'"); + boolean tmpIsConnected = ConnectivityChecker.isConnected(aMolecule); + return (!tmpIsConnected); + } + // + /** + * Checks whether the given molecule represented by an atom container can be passed on to the + * ErtlFunctionalGroupsFinder.find() method without problems even if(!) the input restrictions are turned on (turned off by default). + *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms or consists of + * multiple unconnected parts. Some of these issues (charges and multiple unconnected components) can be solved by + * respective standardisation routines. + * + * @param aMolecule the molecule to check + * @return true if the given molecule is a valid parameter for ErtlFunctionalGroupsFinder.find() method if(!) the input restrictions are turned on (turned off by default) + * @throws NullPointerException if parameter is 'null' + * @throws IllegalArgumentException if the input molecule causes any other type of exception while processing + */ + public static boolean isValidInputMoleculeWithRestrictionsTurnedOn(IAtomContainer aMolecule) throws NullPointerException, IllegalArgumentException { + Objects.requireNonNull(aMolecule, "Given molecule is null."); + boolean tmpIsValid; + try { + tmpIsValid = !(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) + || ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); + } catch (Exception anException) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.warn(anException); + throw new IllegalArgumentException(anException); + } + return tmpIsValid; + } + // + /** + * Clear caches related to the input molecule. Note, these are not proper caches, there are no results cached. Here, + * only data taken from the input molecule is saved for only one execution of the find() method, to facilitate + * communication between the private methods involved. + */ + private void clearCache() { + this.bondMapCache = null; + this.adjListCache = null; + this.markedAtomsCache = null; + this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache = null; + this.markedAtomToConnectedEnvCMapCache = null; + } + // /** * Mark all atoms and store them in a set for further processing. 
* - * @param molecule Molecule with atoms to mark + * @param aMolecule molecule with atoms to mark */ - private void markAtoms(IAtomContainer molecule) { - if(isDbg()) log.debug("########## Starting search for atoms to mark ... ##########"); - + private void markAtoms(IAtomContainer aMolecule) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting search for atoms to mark ... ##########"); + } // store marked atoms - markedAtoms = new HashSet(molecule.getAtomCount()); //Sets.newHashSetWithExpectedSize(molecule.getAtomCount()); + this.markedAtomsCache = new HashSet<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); // store aromatic heteroatoms - aromaticHeteroAtoms = new HashMap<>(); - - for(int idx = 0; idx < molecule.getAtomCount(); idx++) { - // skip atoms that already got marked in a previous iteration - if(markedAtoms.contains(idx)) { + this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); + for (int idx = 0; idx < aMolecule.getAtomCount(); idx++) { + // skip atoms that were already marked in a previous iteration + if (this.markedAtomsCache.contains(idx)) { continue; } - IAtom cAtom = molecule.getAtom(idx); - // skip aromatic atoms but add them to set - if(cAtom.isAromatic()) { - if(isHeteroatom(cAtom)) { - aromaticHeteroAtoms.put(idx, false); + IAtom tmpAtom = aMolecule.getAtom(idx); + // skip aromatic atoms but add aromatic HETERO-atoms to map for later processing + if (tmpAtom.isAromatic()) { + if (this.isHeteroatom(tmpAtom)) { + this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.put(idx, false); } continue; } - - int atomicNr = cAtom.getAtomicNumber(); - + int tmpAtomicNr = tmpAtom.getAtomicNumber(); // if C... 
- if(atomicNr == 6) { - boolean isMarked = false; // to detect if foor loop ran with or without marking the C atom - int oNSCounter = 0; // count for the number of connected O, N & S atoms - for(int connectedIdx : adjList[idx]) { - IAtom connectedAtom = molecule.getAtom(connectedIdx); - IBond connectedBond = bondMap.get(idx, connectedIdx); - - // if connected to Heteroatom or C in aliphatic double or triple bond... [CONDITIONS 2.1 & 2.2] - if(connectedAtom.getAtomicNumber() != 1 && ((connectedBond.getOrder() == Order.DOUBLE - || connectedBond.getOrder() == Order.TRIPLE) && !connectedBond.isAromatic())) { - - // set the connected atom as marked - if(markedAtoms.add(connectedIdx)) { - String connectedAtomCondition = connectedAtom.getAtomicNumber() == 6 ? "2.1/2.2" : "1"; - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition %s", - connectedIdx, connectedAtom.getSymbol(), connectedAtomCondition)); + if (tmpAtomicNr == 6) { + // to detect if for loop ran with or without marking the C atom + boolean tmpIsMarked = false; + // count for the number of connected O, N & S atoms to detect acetal carbons + int tmpConnectedONSatomsCounter = 0; + for (int tmpConnectedIdx : this.adjListCache[idx]) { + IAtom tmpConnectedAtom = aMolecule.getAtom(tmpConnectedIdx); + IBond tmpConnectedBond = this.bondMapCache.get(idx, tmpConnectedIdx); + + // if connected to heteroatom or C in aliphatic double or triple bond... 
[CONDITIONS 2.1 & 2.2] + if (tmpConnectedAtom.getAtomicNumber() != 1 + && ((tmpConnectedBond.getOrder() == Order.DOUBLE || tmpConnectedBond.getOrder() == Order.TRIPLE) + && !tmpConnectedBond.isAromatic())) { + + // set the *connected* atom as marked (add() true if this set did not already contain the specified element) + if (this.markedAtomsCache.add(tmpConnectedIdx)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition %s", + tmpConnectedIdx, + tmpConnectedAtom.getSymbol(), + tmpConnectedAtom.getAtomicNumber() == 6 ? "2.1/2.2" : "1")); + } + } + // set the *current* atom as marked and break out of connected atoms + tmpIsMarked = true; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.1/2.2", + idx, + tmpAtom.getSymbol())); } - - // set the current atom as marked and break out of connected atoms - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.1/2.2", - idx, cAtom.getSymbol())); - isMarked = true; - // but check for carbonyl-C before break - if(connectedAtom.getAtomicNumber() == 8 && connectedBond.getOrder() == Order.DOUBLE - && adjList[idx].length == 3) { - if(isDbg()) log.debug(" - was flagged as Carbonly-C"); - cAtom.setProperty(CARBONYL_C_MARKER, true); + if (tmpConnectedAtom.getAtomicNumber() == 8 + && tmpConnectedBond.getOrder() == Order.DOUBLE + && this.adjListCache[idx].length == 3) { + tmpAtom.setProperty(CARBONYL_C_MARKER, true); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("- was flagged as Carbonly-C"); + } } - + // break out of connected atoms break; - } - // if connected to O/N/S in single bond... 
- else if((connectedAtom.getAtomicNumber() == 7 - || connectedAtom.getAtomicNumber() == 8 - || connectedAtom.getAtomicNumber() == 16) - && connectedBond.getOrder() == Order.SINGLE){ + } else if ((tmpConnectedAtom.getAtomicNumber() == 7 + || tmpConnectedAtom.getAtomicNumber() == 8 + || tmpConnectedAtom.getAtomicNumber() == 16) + && tmpConnectedBond.getOrder() == Order.SINGLE) { + // if connected to O/N/S in single bond... // if connected O/N/S is not aromatic... - if(!connectedAtom.isAromatic()) { + if (!tmpConnectedAtom.isAromatic()) { // set the connected O/N/S atom as marked - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 1", - connectedIdx, connectedAtom.getSymbol())); - markedAtoms.add(connectedIdx); - + this.markedAtomsCache.add(tmpConnectedIdx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 1", + tmpConnectedIdx, + tmpConnectedAtom.getSymbol())); + } // if "acetal C" (2+ O/N/S in single bonds connected to sp3-C)... 
[CONDITION 2.3] - boolean isAllSingleBonds = true; - for(int connectedInSphere2Idx : adjList[connectedIdx]) { - IBond sphere2Bond = bondMap.get(connectedIdx, connectedInSphere2Idx); - if(sphere2Bond.getOrder() != Order.SINGLE) { - isAllSingleBonds = false; + boolean tmpIsAllSingleBonds = true; + for (int tmpConnectedInSphere2Idx : this.adjListCache[tmpConnectedIdx]) { + IBond tmpSphere2Bond = this.bondMapCache.get(tmpConnectedIdx, tmpConnectedInSphere2Idx); + if (tmpSphere2Bond.getOrder() != Order.SINGLE) { + tmpIsAllSingleBonds = false; break; } } - if(isAllSingleBonds) { - oNSCounter++; - if(oNSCounter > 1 && adjList[idx].length + cAtom.getImplicitHydrogenCount() == 4) { + if (tmpIsAllSingleBonds) { + tmpConnectedONSatomsCounter++; + if (tmpConnectedONSatomsCounter > 1 && this.adjListCache[idx].length + tmpAtom.getImplicitHydrogenCount() == 4) { // set as marked and break out of connected atoms - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.3", - idx, cAtom.getSymbol())); - isMarked = true; + tmpIsMarked = true; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.3", + idx, + tmpAtom.getSymbol())); + } break; } } } - // if part of oxirane, aziridine and thiirane ring... [CONDITION 2.4] - for(int connectedInSphere2Idx : adjList[connectedIdx]) { - IAtom connectedInSphere2Atom = molecule.getAtom(connectedInSphere2Idx); - if(connectedInSphere2Atom.getAtomicNumber() == 6) { - for(int connectedInSphere3Idx : adjList[connectedInSphere2Idx]) { - IAtom connectedInSphere3Atom = molecule.getAtom(connectedInSphere3Idx); - if(connectedInSphere3Atom.equals(cAtom)) { + // if part of oxirane, aziridine, or thiirane ring... 
[CONDITION 2.4] + for (int tmpConnectedInSphere2Idx : this.adjListCache[tmpConnectedIdx]) { + IAtom tmpConnectedInSphere2Atom = aMolecule.getAtom(tmpConnectedInSphere2Idx); + if (tmpConnectedInSphere2Atom.getAtomicNumber() == 6) { + for (int tmpConnectedInSphere3Idx : this.adjListCache[tmpConnectedInSphere2Idx]) { + IAtom tmpConnectedInSphere3Atom = aMolecule.getAtom(tmpConnectedInSphere3Idx); + if (tmpConnectedInSphere3Atom.equals(tmpAtom)) { // set connected atoms as marked - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", - connectedInSphere2Idx, connectedInSphere2Atom.getSymbol())); - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", - connectedInSphere3Idx, connectedInSphere3Atom.getSymbol())); - markedAtoms.add(connectedInSphere2Idx); - markedAtoms.add(connectedInSphere3Idx); + this.markedAtomsCache.add(tmpConnectedInSphere2Idx); + this.markedAtomsCache.add(tmpConnectedInSphere3Idx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.4", + tmpConnectedInSphere2Idx, + tmpConnectedInSphere2Atom.getSymbol())); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.4", + tmpConnectedInSphere3Idx, + tmpConnectedInSphere3Atom.getSymbol())); + } // set current atom as marked and break out of connected atoms - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 2.4", - idx, cAtom.getSymbol())); - isMarked = true; + tmpIsMarked = true; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 2.4", + idx, + tmpAtom.getSymbol())); + } break; } } } - } - } - } - if(isMarked) { - markedAtoms.add(idx); + } //end of for loop iterating over second sphere atoms + } // end of else if connected to O/N/S in single bond + } //end of for loop that iterates 
over all connected atoms of the carbon atom + if (tmpIsMarked) { + this.markedAtomsCache.add(idx); continue; } // if none of the conditions 2.X apply, we have an unmarked C (not relevant here) - } - // if H... - else if (atomicNr == 1){ + } else if (tmpAtomicNr == 1) { + // if H... // convert to implicit H - IAtom connectedAtom; - try { - connectedAtom = molecule.getAtom(adjList[idx][0]); - } - catch(ArrayIndexOutOfBoundsException e) { - break; + IAtom tmpConnectedAtom; + if (this.adjListCache[idx].length > 0) { + tmpConnectedAtom = aMolecule.getAtom(this.adjListCache[idx][0]); + } else { + //unconnected, explicit hydrogen atoms (like e.g. in CHEBI:365445) have an array of bond partners of size 0 + // nothing to do about them, but they also do not concern us + continue; } - - - if(connectedAtom.getImplicitHydrogenCount() == null) { - connectedAtom.setImplicitHydrogenCount(1); + if (Objects.isNull(tmpConnectedAtom.getImplicitHydrogenCount())) { + tmpConnectedAtom.setImplicitHydrogenCount(1); + } else { + tmpConnectedAtom.setImplicitHydrogenCount(tmpConnectedAtom.getImplicitHydrogenCount() + 1); } - else { - connectedAtom.setImplicitHydrogenCount(connectedAtom.getImplicitHydrogenCount() + 1); + continue; + } else if (this.isHeteroatom(tmpAtom)) { + // if heteroatom... (CONDITION 1) + this.markedAtomsCache.add(idx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Marking Atom #%d (%s) - Met condition 1", + idx, + tmpAtom.getSymbol())); } continue; - } - // if heteroatom... (CONDITION 1) - else { - if(isDbg()) log.debug(String.format("Marking Atom #%d (%s) - Met condition 1", idx, cAtom.getSymbol())); - markedAtoms.add(idx); + } else { + //pseudo (R) atom, ignored continue; } + } //end of for loop that iterates over all atoms in the mol + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "########## End of search. Marked %d/%d atoms. 
##########", + this.markedAtomsCache.size(), + aMolecule.getAtomCount())); } - if(isDbg()) log.debug(String.format("########## End of search. Marked %d/%d atoms. ##########", markedAtoms.size(), molecule.getAtomCount())); } - + // /** * Searches the molecule for groups of connected marked atoms and extracts each as a new functional group. - * The extraction process includes marked atom's "environments". Connected H's are captured implicitly. + * The extraction process includes marked atoms' "environments". Connected H's are captured implicitly. * - * @param molecule the molecule which contains the functional groups + * @param aMolecule the molecule which contains the functional groups * @return a list of all functional groups (including "environments") extracted from the molecule */ - private List extractGroups(IAtomContainer molecule) { - if(isDbg()) log.debug("########## Starting identification & extraction of functional groups... ##########"); - - environmentsMap = new HashMap>(molecule.getAtomCount());//Maps.newHashMapWithExpectedSize(molecule.getAtomCount()); - int[] atomIdxToFGMap = new int[molecule.getAtomCount()]; - Arrays.fill(atomIdxToFGMap, -1); - int fGroupIdx = -1; - - while(!markedAtoms.isEmpty()) { + private List extractGroups(IAtomContainer aMolecule) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting identification & extraction of functional groups... 
##########"); + } + this.markedAtomToConnectedEnvCMapCache = new HashMap<>((int) ((aMolecule.getAtomCount() / 0.75f) + 2), 0.75f); + int[] tmpAtomIdxToFGArray = new int[aMolecule.getAtomCount()]; + Arrays.fill(tmpAtomIdxToFGArray, -1); + int tmpFunctionalGroupIdx = -1; + while (!this.markedAtomsCache.isEmpty()) { // search for another functional group - fGroupIdx++; - + tmpFunctionalGroupIdx++; // get next markedAtom as the starting node for the search - int beginIdx = markedAtoms.iterator().next(); - if(isDbg()) log.debug(String.format("Searching new functional group from atom #%d (%s)...", beginIdx, molecule.getAtom(beginIdx).getSymbol())); - + int tmpBeginIdx = this.markedAtomsCache.iterator().next(); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Searching new functional group from atom #%d (%s)...", + tmpBeginIdx, + aMolecule.getAtom(tmpBeginIdx).getSymbol())); + } // do a BFS from there - Queue queue = new ArrayDeque<>(); - queue.add(beginIdx); - - while(!queue.isEmpty()) { - int currentIdx = queue.poll(); - + Queue tmpQueue = new ArrayDeque<>(); + tmpQueue.add(tmpBeginIdx); + while (!tmpQueue.isEmpty()) { + int tmpCurrentQueueIdx = tmpQueue.poll(); // we are only interested in marked atoms that are not yet included in a group - if(!markedAtoms.contains(currentIdx)){ + if (!this.markedAtomsCache.contains(tmpCurrentQueueIdx)) { continue; } - // if it isn't... 
- IAtom currentAtom = molecule.getAtom(currentIdx); - if(isDbg()) log.debug(String.format(" visiting marked atom: #%d (%s)", currentIdx, currentAtom.getSymbol())); - + IAtom tmpCurrentAtom = aMolecule.getAtom(tmpCurrentQueueIdx); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("\tvisiting marked atom: #%d (%s)", + tmpCurrentQueueIdx, + tmpCurrentAtom.getSymbol())); + } // add its index to the functional group - atomIdxToFGMap[currentIdx] = fGroupIdx; + tmpAtomIdxToFGArray[tmpCurrentQueueIdx] = tmpFunctionalGroupIdx; // also scratch the index from markedAtoms - markedAtoms.remove(currentIdx); - - // and take look at the connected atoms - List currentEnvironment = new ArrayList<>(); - for(int connectedIdx : adjList[currentIdx]) { + this.markedAtomsCache.remove(tmpCurrentQueueIdx); + // and take a look at the connected atoms + List tmpCurrentEnvironment = new ArrayList<>(); + for (int tmpConnectedIdx : this.adjListCache[tmpCurrentQueueIdx]) { // add connected marked atoms to queue - if(markedAtoms.contains(connectedIdx)) { - queue.add(connectedIdx); + if (this.markedAtomsCache.contains(tmpConnectedIdx)) { + tmpQueue.add(tmpConnectedIdx); continue; } - // ignore already handled connected atoms - if(atomIdxToFGMap[connectedIdx] >= 0){ + if (tmpAtomIdxToFGArray[tmpConnectedIdx] >= 0) { continue; } - // add unmarked connected aromatic heteroatoms - IAtom connectedAtom = molecule.getAtom(connectedIdx); - if(isHeteroatom(connectedAtom) && connectedAtom.isAromatic()) { - if(isDbg()) log.debug(" added connected aromatic heteroatom " + connectedAtom.getSymbol()); - atomIdxToFGMap[connectedIdx] = fGroupIdx; + IAtom tmpConnectedAtom = aMolecule.getAtom(tmpConnectedIdx); + if (this.isHeteroatom(tmpConnectedAtom) && tmpConnectedAtom.isAromatic()) { + tmpAtomIdxToFGArray[tmpConnectedIdx] = tmpFunctionalGroupIdx; // note that this aromatic heteroatom has been added to a group - aromaticHeteroAtoms.put(connectedIdx, true); + 
this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.put(tmpConnectedIdx, true); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\tadded connected aromatic heteroatom " + + tmpConnectedAtom.getSymbol()); + } } - // add unmarked connected atoms to current marked atom's environment - IBond connectedBond = bondMap.get(currentIdx, connectedIdx); - - EnvironmentCalCType type; - if (connectedAtom.getAtomicNumber() == 6) { - if(connectedAtom.isAromatic()) - type = EnvironmentCalCType.C_AROMATIC; - else - type = EnvironmentCalCType.C_ALIPHATIC; + IBond tmpConnectedBond = this.bondMapCache.get(tmpCurrentQueueIdx, tmpConnectedIdx); + EnvironmentalCType tmpEnvironmentalCType; + if (tmpConnectedAtom.getAtomicNumber() == 6) { + if (tmpConnectedAtom.isAromatic()) { + tmpEnvironmentalCType = EnvironmentalCType.C_AROMATIC; + } else { + tmpEnvironmentalCType = EnvironmentalCType.C_ALIPHATIC; + } } else { // aromatic heteroatom, so just ignore continue; } - currentEnvironment.add(new EnvironmentalC(type, connectedBond, connectedBond.getBegin() == connectedAtom ? 0 : 1)); - } - environmentsMap.put(currentAtom, currentEnvironment); - + tmpCurrentEnvironment.add(new EnvironmentalC( + tmpEnvironmentalCType, + tmpConnectedBond, + tmpConnectedBond.getBegin().equals(tmpConnectedAtom) ? 
0 : 1)); + } //end of loop of connected atoms + this.markedAtomToConnectedEnvCMapCache.put(tmpCurrentAtom, tmpCurrentEnvironment); // debug logging - if(isDbg()) { - int cAromCount = 0, cAliphCount = 0; - for(EnvironmentalC comp : currentEnvironment) { - if(comp.getType() == EnvironmentCalCType.C_AROMATIC) - cAromCount++; - else if(comp.getType() == EnvironmentCalCType.C_ALIPHATIC) - cAliphCount++; + if (ErtlFunctionalGroupsFinder.isDbg()) { + int tmpCAromCount = 0; + int tmpCAliphCount = 0; + for(EnvironmentalC tmpEnvC : tmpCurrentEnvironment) { + if (tmpEnvC.getType() == EnvironmentalCType.C_AROMATIC) { + tmpCAromCount++; + } else if (tmpEnvC.getType() == EnvironmentalCType.C_ALIPHATIC) { + tmpCAliphCount++; + } } - log.debug(String.format(" logged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", cAromCount, cAliphCount, currentAtom.getImplicitHydrogenCount())); + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t\tlogged marked atom's environment: C_ar:%d, C_al:%d (and %d implicit hydrogens)", + tmpCAromCount, + tmpCAliphCount, + tmpCurrentAtom.getImplicitHydrogenCount())); } + } // end of BFS + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\tsearch completed."); } - - if(isDbg()) log.debug(" search completed."); - } - - // also create FG for lone aromatic heteroatoms, not connected to a FG yet. - for(int atomIdx : aromaticHeteroAtoms.keySet()) { - if(!aromaticHeteroAtoms.get(atomIdx)) { - fGroupIdx++; - atomIdxToFGMap[atomIdx] = fGroupIdx; - if(isDbg()) log.debug("Created FG for lone aromatic heteroatom: " + molecule.getAtom(atomIdx).getSymbol()); + } //markedAtoms is empty now + // also create FG for lone aromatic heteroatoms, not connected to an FG yet. 
+ for (int tmpAtomIdx : this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.keySet()) { + if (!this.aromaticHeteroAtomIndicesToIsInGroupBoolMapCache.get(tmpAtomIdx).booleanValue()) { + tmpFunctionalGroupIdx++; + tmpAtomIdxToFGArray[tmpAtomIdx] = tmpFunctionalGroupIdx; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("Created FG for lone aromatic heteroatom: " + + aMolecule.getAtom(tmpAtomIdx).getSymbol()); + } } } - - List fGs = partitionIntoGroups(molecule, atomIdxToFGMap, fGroupIdx + 1); - - if(isDbg()) log.debug(String.format("########## Found & extracted %d functional groups. ##########", fGroupIdx + 1)); - return fGs; + List tmpFunctionalGroupsList = this.partitionIntoGroups(aMolecule, tmpAtomIdxToFGArray, tmpFunctionalGroupIdx + 1); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("########## Found & extracted %d functional groups. ##########", + tmpFunctionalGroupIdx + 1)); + } + return tmpFunctionalGroupsList; } - + // /** - * Generalizes the full environments of functional groups, providing a good balance between preserving - * meaningful detail and generalization. + * Generalizes the full environments of functional groups, according to the Ertl generalization algorithm, providing + * a good balance between preserving meaningful detail and generalization. * - * @param fGroups the list of functional groups including "environments" + * @param aFunctionalGroupsList the list of functional groups including "environments" */ - private void expandGeneralizedEnvironments(List fGroups){ - if(isDbg()) log.debug("########## Starting generalization of functional groups... ##########"); - - for(IAtomContainer fGroup : fGroups) { - int atomCount = fGroup.getAtomCount(); - - if(isDbg()) log.debug(String.format("Generalizing functional group (%d atoms)...", atomCount)); - - // prechecking for special cases... 
- if(fGroup.getAtomCount() == 1) { - IAtom atom = fGroup.getAtom(0); - List environment = environmentsMap.get(atom); - - if(environment != null) { - int envCCount = environment.size(); - - // for H2N-C_env & HO-C_env -> do not replace H & C_env by R! - if((atom.getAtomicNumber() == 8 && envCCount == 1) - || (atom.getAtomicNumber() == 7 && envCCount == 1)){ - if(isDbg()) log.debug(String.format(" - found single atomic N or O FG with one env. C. Expanding environment...", atom.getSymbol())); - expandEnvironment(atom, fGroup); - - int hCount = atom.getImplicitHydrogenCount(); - if(hCount != 0) { - if(isDbg()) log.debug(String.format(" - adding %d hydrogens...", hCount)); - addHydrogens(atom, hCount, fGroup); - atom.setImplicitHydrogenCount(0); + private void expandGeneralizedEnvironments(List aFunctionalGroupsList) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting generalization of functional groups... ##########"); + } + for (IAtomContainer tmpFunctionalGroup : aFunctionalGroupsList) { + int tmpAtomCount = tmpFunctionalGroup.getAtomCount(); + if(ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("Generalizing functional group (%d atoms)...", tmpAtomCount)); + } + // pre-checking for special cases... + if (tmpFunctionalGroup.getAtomCount() == 1) { + IAtom tmpAtom = tmpFunctionalGroup.getAtom(0); + List tmpEnvironment = this.markedAtomToConnectedEnvCMapCache.get(tmpAtom); + + if (!Objects.isNull(tmpEnvironment)) { + int tmpEnvCCount = tmpEnvironment.size(); + // for H2N-C_env & HO-C_env -> do not replace H & C_env by R to differentiate primary/secondary/tertiary amine and alcohol vs. 
phenol + if ((tmpAtom.getAtomicNumber() == 8 && tmpEnvCCount == 1) + || (tmpAtom.getAtomicNumber() == 7 && tmpEnvCCount == 1)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t- found single atomic %s FG with one env. C. Expanding environment...", + tmpAtom.getSymbol())); + } + this.expandEnvironment(tmpAtom, tmpFunctionalGroup); + int tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); + if (tmpAtomImplicitHydrogenCount != 0) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t- adding %d hydrogens...", tmpAtomImplicitHydrogenCount)); } - continue; + this.addHydrogens(tmpAtom, tmpAtomImplicitHydrogenCount, tmpFunctionalGroup); + tmpAtom.setImplicitHydrogenCount(0); + } + continue; } // for HN-(C_env)-C_env & HS-C_env -> do not replace H by R! (only C_env!) - if((atom.getAtomicNumber() == 7 && envCCount == 2) - || (atom.getAtomicNumber() == 16 && envCCount == 1)) { - if(isDbg()) log.debug(" - found sec. amine or simple thiol"); - int hCount = atom.getImplicitHydrogenCount(); - if(hCount != 0) { - if(isDbg()) log.debug(String.format(" - adding %d hydrogens...", hCount)); - addHydrogens(atom, hCount, fGroup); - atom.setImplicitHydrogenCount(0); + if ((tmpAtom.getAtomicNumber() == 7 && tmpEnvCCount == 2) + || (tmpAtom.getAtomicNumber() == 16 && tmpEnvCCount == 1)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- found sec. 
amine or simple thiol"); + } + int tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); + if (tmpAtomImplicitHydrogenCount != 0) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("\t- adding %d hydrogens...", + tmpAtomImplicitHydrogenCount)); + } + this.addHydrogens(tmpAtom, tmpAtomImplicitHydrogenCount, tmpFunctionalGroup); + tmpAtom.setImplicitHydrogenCount(0); } - if(isDbg()) log.debug(" - expanding environment..."); - expandEnvironmentGeneralized(atom, fGroup); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- expanding environment..."); + } + this.expandEnvironmentGeneralized(tmpAtom, tmpFunctionalGroup); continue; } - } - else if(isHeteroatom(atom)) { - int rAtomCount = atom.getValency(); - Integer hCount = atom.getImplicitHydrogenCount(); - if(hCount != null && hCount != 0) { - atom.setImplicitHydrogenCount(0); + } else if (this.isHeteroatom(tmpAtom)) { + // env is null and marked atoms is a hetero atom -> single aromatic heteroatom + int tmpRAtomCount = tmpAtom.getValency(); + Integer tmpAtomImplicitHydrogenCount = tmpAtom.getImplicitHydrogenCount(); + if (tmpAtomImplicitHydrogenCount != null && tmpAtomImplicitHydrogenCount != 0) { + tmpAtom.setImplicitHydrogenCount(0); } - String atomTypeName = atom.getAtomTypeName(); - if(isDbg()) log.debug(String.format(" - found single aromatic heteroatom (%s, Atomtype %s). Adding %d R-Atoms...", atom.getSymbol(), atomTypeName, rAtomCount)); - addRAtoms(atom, rAtomCount, fGroup); + String tmpAtomTypeName = tmpAtom.getAtomTypeName(); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t- found single aromatic heteroatom (%s, Atomtype %s). 
Adding %d R-Atoms...", + tmpAtom.getSymbol(), + tmpAtomTypeName, + tmpRAtomCount)); + } + this.addRAtoms(tmpAtom, tmpRAtomCount, tmpFunctionalGroup); continue; } - } - + } // end of pre-check for special one-atom FG cases // get atoms to process - List fGroupAtoms = new ArrayList(fGroup.getAtomCount());//Lists.newArrayList(fGroup.atoms()); - fGroup.atoms().forEach(fGroupAtoms::add); - - // process atoms... - for(IAtom atom : fGroupAtoms) { - List environment = environmentsMap.get(atom); - - if(environment == null) { - if(atom.getImplicitHydrogenCount() != 0) { - atom.setImplicitHydrogenCount(0); + List tmpFunctionalGroupAtoms = new ArrayList<>(tmpFunctionalGroup.getAtomCount()); + tmpFunctionalGroup.atoms().forEach(tmpFunctionalGroupAtoms::add); + // process individual functional group atoms... + for (IAtom tmpFunctionalGroupAtom : tmpFunctionalGroupAtoms) { + List tmpFGenvCs = this.markedAtomToConnectedEnvCMapCache.get(tmpFunctionalGroupAtom); + if (tmpFGenvCs == null) { + if (tmpFunctionalGroupAtom.getImplicitHydrogenCount() != 0) { + tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); + } + int tmpRAtomCount = tmpFunctionalGroupAtom.getValency() - 1; + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t- found connected aromatic heteroatom (%s). Adding %d R-Atoms...", + tmpFunctionalGroupAtom.getSymbol(), + tmpRAtomCount)); } - int rAtomCount = atom.getValency() - 1; - if(isDbg()) log.debug(String.format(" - found connected aromatic heteroatom (%s). Adding %d R-Atoms...", atom.getSymbol(), rAtomCount)); - addRAtoms(atom, rAtomCount, fGroup); + this.addRAtoms(tmpFunctionalGroupAtom, tmpRAtomCount, tmpFunctionalGroup); } - // processing carbons... 
- if(atom.getAtomicNumber() == 6) { - if(atom.getProperty(CARBONYL_C_MARKER) == null) { - if(atom.getImplicitHydrogenCount() != 0) { - atom.setImplicitHydrogenCount(0); + if (tmpFunctionalGroupAtom.getAtomicNumber() == 6) { + if (Objects.isNull(tmpFunctionalGroupAtom.getProperty(ErtlFunctionalGroupsFinder.CARBONYL_C_MARKER))) { + if (tmpFunctionalGroupAtom.getImplicitHydrogenCount() != 0) { + tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); + } + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- ignoring environment for marked carbon atom"); } - if(isDbg()) log.debug(" - ignoring environment for marked carbon atom"); continue; - } - else { - if(isDbg()) log.debug(" - found carbonyl-carbon. Expanding environment..."); - expandEnvironmentGeneralized(atom, fGroup); + } else { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t- found carbonyl-carbon. Expanding environment..."); + } + this.expandEnvironmentGeneralized(tmpFunctionalGroupAtom, tmpFunctionalGroup); continue; } - } - // processing heteroatoms... - else { - if(isDbg()) log.debug(String.format(" - found heteroatom (%s). Expanding environment...", atom.getSymbol())); - expandEnvironmentGeneralized(atom, fGroup); + } else { // processing heteroatoms... + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format("\t- found heteroatom (%s). Expanding environment...", + tmpFunctionalGroupAtom.getSymbol())); + } + this.expandEnvironmentGeneralized(tmpFunctionalGroupAtom, tmpFunctionalGroup); continue; } } + } //end of loop over given functional groups list + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Generalization of functional groups completed. ##########"); } - - if(isDbg()) log.debug("########## Generalization of functional groups completed. 
##########"); } - + // /** * Expands the full environments of functional groups, converted into atoms and bonds. * - * @param fGroups the list of functional groups including "environments" + * @param aFunctionalGroupsList the list of functional groups including their "environments" */ - private void expandFullEnvironments(List fGroups) { - if(isDbg()) log.debug("########## Starting expansion of full environments for functional groups... ##########"); - - for(IAtomContainer fGroup : fGroups) { - int atomCount = fGroup.getAtomCount(); - if(isDbg()) log.debug(String.format("Expanding environment on functional group (%d atoms)...", atomCount)); - - for(int i = 0; i < atomCount; i++) { - IAtom atom = fGroup.getAtom(i); - - if(isDbg()) log.debug(String.format(" - Atom #%d - Expanding environment...", i)); - expandEnvironment(atom, fGroup); - - int hCount = atom.getImplicitHydrogenCount(); - if(hCount != 0) { - if(isDbg()) log.debug(String.format(" - adding %d hydrogens...", hCount)); - addHydrogens(atom, hCount, fGroup); - atom.setImplicitHydrogenCount(0); + private void expandFullEnvironments(List aFunctionalGroupsList) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Starting expansion of full environments for functional groups... 
##########"); + } + for (IAtomContainer tmpFunctionalGroup : aFunctionalGroupsList) { + int tmpAtomCount = tmpFunctionalGroup.getAtomCount(); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "Expanding environment on functional group (%d atoms)...", tmpAtomCount)); + } + for (int i = 0; i < tmpAtomCount; i++) { + IAtom tmpFunctionalGroupAtom = tmpFunctionalGroup.getAtom(i); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + " - Atom #%d - Expanding environment...", i)); + } + this.expandEnvironment(tmpFunctionalGroupAtom, tmpFunctionalGroup); + int tmpImplicitHydrogenCount = tmpFunctionalGroupAtom.getImplicitHydrogenCount(); + if (tmpImplicitHydrogenCount != 0) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t- adding %d hydrogens...", tmpImplicitHydrogenCount)); + } + this.addHydrogens(tmpFunctionalGroupAtom, tmpImplicitHydrogenCount, tmpFunctionalGroup); + tmpFunctionalGroupAtom.setImplicitHydrogenCount(0); } } } - - if(isDbg()) log.debug("########## Expansion of full environments for functional groups completed. ##########"); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("########## Expansion of full environments for functional groups completed. ##########"); + } } + // + /** + * Expand the environment of one atom in a functional group. Takes all environmental C atoms cached earlier and + * re-adds them to the atom as environment. 
+ * + * @param aFunctionalGroupAtom the atom whose environment to expand + * @param aFunctionalGroup the functional group container that the atom is part of + */ + private void expandEnvironment(IAtom aFunctionalGroupAtom, IAtomContainer aFunctionalGroup) { + List tmpEnvCAtomsList = this.markedAtomToConnectedEnvCMapCache.get(aFunctionalGroupAtom); - private void expandEnvironment(IAtom atom, IAtomContainer container) { - List environment = environmentsMap.get(atom); - - if(environment == null || environment.isEmpty()) { - if(isDbg()) log.debug(" found no environment to expand."); + if (Objects.isNull(tmpEnvCAtomsList) || tmpEnvCAtomsList.isEmpty()) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\tfound no environment to expand."); + } return; } - - int cAromCount = 0, cAliphCount = 0; - for(EnvironmentalC envC : environment) { - IAtom cAtom = atom.getBuilder().newInstance(IAtom.class, "C"); - cAtom.setAtomTypeName("C"); - cAtom.setImplicitHydrogenCount(0); - if(envC.getType() == EnvironmentCalCType.C_AROMATIC) { - cAtom.setIsAromatic(true); - cAromCount++; - } - else { - cAliphCount++; + int tmpAromaticCAtomCount = 0; + int tmpAliphaticCAtomCount = 0; + for (EnvironmentalC tmpEnvCAtom : tmpEnvCAtomsList) { + IAtom tmpCAtom = aFunctionalGroupAtom.getBuilder().newInstance(IAtom.class, "C"); + tmpCAtom.setAtomTypeName("C"); + tmpCAtom.setImplicitHydrogenCount(0); + if (tmpEnvCAtom.getType() == EnvironmentalCType.C_AROMATIC) { + tmpCAtom.setIsAromatic(true); + tmpAromaticCAtomCount++; + } else { + tmpAliphaticCAtomCount++; } - - IBond bond = envC.createBond(atom, cAtom); - - container.addAtom(cAtom); - container.addBond(bond); + IBond tmpBond = tmpEnvCAtom.createBond(aFunctionalGroupAtom, tmpCAtom); + aFunctionalGroup.addAtom(tmpCAtom); + aFunctionalGroup.addBond(tmpBond); + } + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t\texpanded environment: %dx C_ar 
and %dx C_al", + tmpAromaticCAtomCount, + tmpAliphaticCAtomCount)); } - - if(isDbg()) log.debug(String.format(" expanded environment: %dx C_ar and %dx C_al", cAromCount, cAliphCount)); } - - // only call this on marked heteroatoms / carbonyl-C's! - private void expandEnvironmentGeneralized(IAtom atom, IAtomContainer container) { - - List environment = environmentsMap.get(atom); - - if(environment == null) { - if(isDbg()) log.debug(" found no environment to expand."); + // + /** + * Expand the generalized environment of marked heteroatoms and carbonyl-Cs in a functional group. + * Takes all environmental C atoms cached earlier and re-adds them to the atom as environment. + * Note: only call this on marked heteroatoms / carbonyl-C's! + * + * @param aFunctionalGroupAtom the atom whose environment to expand + * @param aFunctionalGroup the functional group container that the atom is part of + */ + private void expandEnvironmentGeneralized(IAtom aFunctionalGroupAtom, IAtomContainer aFunctionalGroup) { + List tmpEnvironment = this.markedAtomToConnectedEnvCMapCache.get(aFunctionalGroupAtom); + if (Objects.isNull(tmpEnvironment)) { + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\tfound no environment to expand."); + } return; } - - int rAtomCount = environment.size(); - int rAtomsForCCount = rAtomCount; - if(atom.getAtomicNumber() == 8 && atom.getImplicitHydrogenCount() == 1) { - addHydrogens(atom, 1, container); - atom.setImplicitHydrogenCount(0); - if(isDbg()) log.debug(" expanded hydrogen on connected OH-Group"); - } - else if(isHeteroatom(atom)) rAtomCount += atom.getImplicitHydrogenCount(); - addRAtoms(atom, rAtomCount, container); - - if(atom.getImplicitHydrogenCount() != 0) { - atom.setImplicitHydrogenCount(0); + int tmpRAtomCount = tmpEnvironment.size(); + int tmpRAtomsForCCount = tmpRAtomCount; + if (aFunctionalGroupAtom.getAtomicNumber() == 8 && aFunctionalGroupAtom.getImplicitHydrogenCount() == 1) { + 
this.addHydrogens(aFunctionalGroupAtom, 1, aFunctionalGroup); + aFunctionalGroupAtom.setImplicitHydrogenCount(0); + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug("\t\texpanded hydrogen on connected OH-Group"); + } + } else if (this.isHeteroatom(aFunctionalGroupAtom)) { + tmpRAtomCount += aFunctionalGroupAtom.getImplicitHydrogenCount(); + } + this.addRAtoms(aFunctionalGroupAtom, tmpRAtomCount, aFunctionalGroup); + if (aFunctionalGroupAtom.getImplicitHydrogenCount() != 0) { + aFunctionalGroupAtom.setImplicitHydrogenCount(0); + } + if (ErtlFunctionalGroupsFinder.isDbg()) { + ErtlFunctionalGroupsFinder.LOGGING_TOOL.debug(String.format( + "\t\texpanded environment: %dx R-atom (incl. %d for H replacement)", + tmpRAtomCount, + tmpRAtomCount - tmpRAtomsForCCount)); } - - if(isDbg()) log.debug(String.format(" expanded environment: %dx R-atom (incl. %d for H replacement)", rAtomCount, rAtomCount - rAtomsForCCount)); } - - private static final boolean isHeteroatom(IAtom atom) { - int atomicNr = atom.getAtomicNumber(); - return atomicNr != 1 && atomicNr != 6; + // + /** + * Checks whether the given atom is a hetero-atom (i.e. non-carbon and non-hydrogen). Pseudo (R) atoms will also return false! + * + * @param anAtom the atom to test + * @return true if the given atom is neither a carbon nor a hydrogen or pseudo atom + */ + private boolean isHeteroatom(IAtom anAtom) { + Integer tmpAtomicNr = anAtom.getAtomicNumber(); + return tmpAtomicNr != 1 && tmpAtomicNr != 6 && tmpAtomicNr != 0 && tmpAtomicNr != null && !(anAtom instanceof PseudoAtom); } - - private final boolean isNonmetal(IAtom atom) { - return nonmetalAtomicNumbers.contains(atom.getAtomicNumber()); + // + /** + * Checks whether the given atom is from an element in the organic subset, i.e. not a metal or metalloid atom. + * See the public constant set of non-metal atomic numbers declared in this class. Pseudo (R) atoms will also return false. 
+ * Given as static method here because it is used by static public utility methods (developer's note). + * + * @param anAtom atom to check + * @return true if the given atom is organic and not a metal or metalloid atom + */ + private static boolean isNonmetal(IAtom anAtom) { + Integer tmpAtomicNumber = anAtom.getAtomicNumber(); + if (Objects.isNull(tmpAtomicNumber)) { + return false; + } + int tmpAtomicNumberInt = tmpAtomicNumber.intValue(); + return ErtlFunctionalGroupsFinder.NONMETAL_ATOMIC_NUMBERS.contains(tmpAtomicNumberInt); } - - private void addHydrogens(IAtom atom, int number, IAtomContainer container) { - for(int i = 0; i < number; i++) { - IAtom hydrogen = atom.getBuilder().newInstance(IAtom.class, "H"); - hydrogen.setAtomTypeName("H"); - hydrogen.setImplicitHydrogenCount(0); - - container.addAtom(hydrogen); - container.addBond(atom.getBuilder().newInstance(IBond.class, atom, hydrogen, Order.SINGLE)); + // + /** + * Add explicit hydrogen atoms to an atom in a molecule. + * + * @param anAtom the atom to add the explicit hydrogen atoms to + * @param aNrOfHydrogenAtoms the number of explicit hydrogens atoms to add + * @param aMolecule the molecule the atom belongs to + */ + private void addHydrogens(IAtom anAtom, int aNrOfHydrogenAtoms, IAtomContainer aMolecule) { + for (int i = 0; i < aNrOfHydrogenAtoms; i++) { + IAtom tmpHydrogenAtom = anAtom.getBuilder().newInstance(IAtom.class, "H"); + tmpHydrogenAtom.setAtomTypeName("H"); + tmpHydrogenAtom.setImplicitHydrogenCount(0); + aMolecule.addAtom(tmpHydrogenAtom); + aMolecule.addBond(anAtom.getBuilder().newInstance(IBond.class, anAtom, tmpHydrogenAtom, Order.SINGLE)); } } - - private void addRAtoms(IAtom atom, int number, IAtomContainer container) { - for(int i = 0; i < number; i++) { - IPseudoAtom rAtom = atom.getBuilder().newInstance(IPseudoAtom.class, "R"); - rAtom.setAttachPointNum(1); - rAtom.setImplicitHydrogenCount(0); - - container.addAtom(rAtom); - 
container.addBond(atom.getBuilder().newInstance(IBond.class, atom, rAtom, Order.SINGLE)); + // + /** + * Add pseudo ("R") atoms to an atom in a molecule. + * + * @param anAtom the atom to add the pseudo atoms to + * @param aNrOfRAtoms the number of pseudo atoms to add + * @param aMolecule the molecule the atom belongs to + */ + private void addRAtoms(IAtom anAtom, int aNrOfRAtoms, IAtomContainer aMolecule) { + for (int i = 0; i < aNrOfRAtoms; i++) { + IPseudoAtom tmpRAtom = anAtom.getBuilder().newInstance(IPseudoAtom.class, "R"); + tmpRAtom.setAttachPointNum(1); + tmpRAtom.setImplicitHydrogenCount(0); + aMolecule.addAtom(tmpRAtom); + aMolecule.addBond(anAtom.getBuilder().newInstance(IBond.class, anAtom, tmpRAtom, Order.SINGLE)); } } - - private List partitionIntoGroups(IAtomContainer sourceContainer, int[] atomIdxToFGMap, int fGroupCount) { - List groups = new ArrayList<>(fGroupCount); - for(int i = 0; i < fGroupCount; i++) { - groups.add(sourceContainer.getBuilder().newInstance(IAtomContainer.class)); + // + /** + * Partitions the marked atoms and their processed environments into separate functional groups and builds atom containers + * for them as final step before returning them. Transfers the respective atoms, bonds, single electrons, and lone + * pairs from the source atom container to the new functional group atom containers. 
+ * + * @param aSourceContainer molecule atom container to take atoms, bonds, and electron objects from + * @param anAtomIdxToFGIdxMap array that maps atom indices (array positions) to functional group indices that the atoms belong to + * @param aFunctionalGroupCount maximum functional group index (+1) to know how many functional group atom containers to build + * @return list of partitioned functional group atom containers + */ + private List partitionIntoGroups(IAtomContainer aSourceContainer, int[] anAtomIdxToFGIdxMap, int aFunctionalGroupCount) { + List tmpFunctionalGroups = new ArrayList<>(aFunctionalGroupCount); + for (int i = 0; i < aFunctionalGroupCount; i++) { + tmpFunctionalGroups.add(aSourceContainer.getBuilder().newInstance(IAtomContainer.class)); } - - Map atomtoFGMap = new HashMap(sourceContainer.getAtomCount());//Maps.newHashMapWithExpectedSize(sourceContainer.getAtomCount()); - + Map tmpAtomtoFGMap = new HashMap<>((int) ((aSourceContainer.getAtomCount() / 0.75f) + 2), 0.75f); // atoms - for(int atomIdx = 0; atomIdx < sourceContainer.getAtomCount(); atomIdx++) { - int fGroupId = atomIdxToFGMap[atomIdx]; - - if(fGroupId == -1) { + for (int tmpAtomIdx = 0; tmpAtomIdx < aSourceContainer.getAtomCount(); tmpAtomIdx++) { + int tmpFGroupIdx = anAtomIdxToFGIdxMap[tmpAtomIdx]; + if (tmpFGroupIdx == -1) { continue; } - - IAtom atom = sourceContainer.getAtom(atomIdx); - IAtomContainer myGroup = groups.get(fGroupId); - myGroup.addAtom(atom); - atomtoFGMap.put(atom, myGroup); + IAtom tmpAtom = aSourceContainer.getAtom(tmpAtomIdx); + IAtomContainer tmpPartitionedFunctionalGroup = tmpFunctionalGroups.get(tmpFGroupIdx); + tmpPartitionedFunctionalGroup.addAtom(tmpAtom); + tmpAtomtoFGMap.put(tmpAtom, tmpPartitionedFunctionalGroup); } - // bonds - for(IBond bond : sourceContainer.bonds()) { - IAtomContainer beginGroup = atomtoFGMap.get(bond.getBegin()); - IAtomContainer endGroup = atomtoFGMap.get(bond.getEnd()); - - if(beginGroup == null || endGroup == null || 
beginGroup != endGroup) + for (IBond tmpBond : aSourceContainer.bonds()) { + // check whether begin and end atom of the bond have been correctly assigned to the same FG + IAtomContainer tmpFGofBeginAtom = tmpAtomtoFGMap.get(tmpBond.getBegin()); + IAtomContainer tmpFGofEndAtom = tmpAtomtoFGMap.get(tmpBond.getEnd()); + if (Objects.isNull(tmpFGofBeginAtom) || Objects.isNull(tmpFGofEndAtom) || tmpFGofBeginAtom != tmpFGofEndAtom) { continue; - - beginGroup.addBond(bond); + } + tmpFGofBeginAtom.addBond(tmpBond); } - // single electrons - for (ISingleElectron electron : sourceContainer.singleElectrons()) { - IAtomContainer group = atomtoFGMap.get(electron.getAtom()); - if(group != null) - group.addSingleElectron(electron); + for (ISingleElectron tmpSingleElectron : aSourceContainer.singleElectrons()) { + IAtomContainer tmpFunctionalGroup = tmpAtomtoFGMap.get(tmpSingleElectron.getAtom()); + if (!Objects.isNull(tmpFunctionalGroup)) { + tmpFunctionalGroup.addSingleElectron(tmpSingleElectron); + } } - // lone pairs - for (ILonePair lonePair : sourceContainer.lonePairs()) { - IAtomContainer group = atomtoFGMap.get(lonePair.getAtom()); - if(group != null) - group.addLonePair(lonePair); + for (ILonePair tmpLonePair : aSourceContainer.lonePairs()) { + IAtomContainer tmpFunctionalGroup = tmpAtomtoFGMap.get(tmpLonePair.getAtom()); + if (!Objects.isNull(tmpFunctionalGroup)) { + tmpFunctionalGroup.addLonePair(tmpLonePair); + } } - - return groups; - } - - private boolean isDbg() { - return log.isDebugEnabled(); + return tmpFunctionalGroups; } - - private boolean checkConstraints(IAtomContainer molecule) { - for(IAtom atom : molecule.atoms()) { - if(atom.getFormalCharge() != null && atom.getFormalCharge() != 0) { + // + /** + * Checks input molecule for charged atoms, metal or metalloid atoms, and whether it consists of more than one unconnected structures. + * If one of the cases applies, an IllegalArgumentException is thrown with a specific error message. 
NOTE: adjacency + * list cache must already be set-up! + * + * @param aMolecule the molecule to check + * @throws IllegalArgumentException if one of the constraints is not met + */ + private void checkConstraints(IAtomContainer aMolecule) throws IllegalArgumentException { + for (IAtom tmpAtom : aMolecule.atoms()) { + if (ErtlFunctionalGroupsFinder.isCharged(tmpAtom)) { throw new IllegalArgumentException("Input molecule must not contain any charges."); } - if(!isNonmetal(atom)) { - throw new IllegalArgumentException("Input molecule must not contain metals or metalloids."); - } - if(atom.getImplicitHydrogenCount() == null) { - atom.setImplicitHydrogenCount(0); + if (!ErtlFunctionalGroupsFinder.isNonmetal(tmpAtom)) { + throw new IllegalArgumentException("Input molecule must not contain metal, metalloid, or pseudo atoms."); } } - - ConnectedComponents cc = new ConnectedComponents(adjList); - if(cc.nComponents() != 1) { - throw new IllegalArgumentException("Input molecule must consist of only a single connected stucture."); + Objects.requireNonNull(this.adjListCache, "Adjacency list cache must already be set-up for this check!"); + //Developer's note: this method does not use the public isStructureUnconnected() method because it is intertwined with the + // find() method for speed-up; but it basically does the same. + ConnectedComponents tmpConnectedComponents = new ConnectedComponents(this.adjListCache); + if (tmpConnectedComponents.nComponents() > 1) { + throw new IllegalArgumentException("Input molecule must consist of only a single connected structure."); } - - return true; + } + // + /** + * Returns whether the CDK logging tool of this class (logger) is currently configured to log debug messages. + *

+ * Use ErtlFunctionalGroupsFinder.LOGGING_TOOL.setLevel(ILoggingTool.DEBUG); to activate debug messages. + *

+ * + * @return true if debug messages are enabled + */ + private static boolean isDbg() { + return ErtlFunctionalGroupsFinder.LOGGING_TOOL.isDebugEnabled(); } } diff --git a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java index 9ec9468..f47803c 100644 --- a/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java +++ b/src/main/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtility.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * @@ -35,16 +35,12 @@ import org.openscience.cdk.interfaces.IAtomContainerSet; import org.openscience.cdk.interfaces.IAtomType; import org.openscience.cdk.interfaces.IChemObjectBuilder; -import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import org.openscience.cdk.tools.manipulator.AtomTypeManipulator; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Objects; import java.util.logging.Level; @@ -52,7 +48,7 @@ /** * This class gives utility methods for using ErtlFunctionalGroupsFinder, - * a CDK-based implementation, published here, of the + * a CDK-based implementation, published here of the * Ertl algorithm for automated functional groups detection. * The methods of this class are basically public static re-implementations of the routines used for testing and * evaluating the ErtlFunctionalGroupsFinder, as described in the publication. 
@@ -90,51 +86,18 @@ public int encode(IAtom anAtom, IAtomContainer aContainer) { // // // - /** - * Atomic numbers that ErtlFunctionalGroupsFinder accepts, see getValidAtomicNumbers() - */ - private static final int[] VALID_ATOMIC_NUMBERS = new int[] {1,2,6,7,8,9,10,15,16,17,18,34,35,36,53,54,86}; - - /** - * Atomic numbers that ErtlFunctionalGroupsFinder accepts, loaded into a hash set for quick determination; set is - * filled in static initializer (see below) - */ - private static final HashSet VALID_ATOMIC_NUMBERS_SET = new HashSet<>(20, 1); - /** * Logger of this class */ private static final Logger LOGGER = Logger.getLogger(ErtlFunctionalGroupsFinderUtility.class.getName()); // // - // - /** - * Static initializer that sets up hash maps/sets used by static methods. - */ - static { - for (int i : ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS) { - ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS_SET.add(i); - } - } - // - // private ErtlFunctionalGroupsFinderUtility() { - + // only created because JavaDoc task complained. } // // // - /** - * Returns an integer array containing all atomic numbers that can be passed on to ErtlFunctionalGroupsFinder.find(). - * All other atomic numbers are invalid because they represent metal, metalloid or pseudo ('R') atoms. - * - * @return all valid atomic numbers for ErtlFunctionalGroupsFinder.find() - */ - public static int[] getValidAtomicNumbers() { - return Arrays.copyOf(ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS, - ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS.length); - } - /** * Constructs a CDK MoleculeHashGenerator that is configured to count frequencies of the functional groups * returned by ErtlFunctionalGroupsFinder. 
It takes elements, bond order sum, and aromaticity of the atoms in @@ -156,48 +119,12 @@ public static MoleculeHashGenerator getFunctionalGroupHashGenerator() { .molecular(); return tmpHashGenerator; } - - /** - * Constructs a new ErtlFunctionalGroupsFinder object with generalization of returned functional groups turned ON. - * - * @return new ErtlFunctionalGroupsFinder object that generalizes returned functional groups - */ - public static ErtlFunctionalGroupsFinder getErtlFunctionalGroupsFinderGeneralizingMode() { - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - return tmpEFGF; - } - - /** - * Constructs a new ErtlFunctionalGroupsFinder object with generalization of returned functional groups turned OFF. - * The FG will contain their full environments. - * - * @return new ErtlFunctionalGroupsFinder object that does NOT generalize returned functional groups - */ - public static ErtlFunctionalGroupsFinder getErtlFunctionalGroupsFinderNotGeneralizingMode() { - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); - return tmpEFGF; - } // // // - /** - * Checks whether the given molecule consists of two or more unconnected structures, e.g. ion and counter-ion. This - * would make it unfit to be passed to ErtlFunctionalGroupsFinder.find(). This can be fixed by preprocessing, see - * selectBiggestUnconnectedComponent() below. 
- * - * @param aMolecule the molecule to check - * @return true, if the molecule consists of two or more unconnected structures - * @throws NullPointerException if the given molecule is 'null' - */ - public static boolean isStructureUnconnected(IAtomContainer aMolecule) throws NullPointerException { - Objects.requireNonNull(aMolecule, "Given molecule is 'null'"); - boolean tmpIsConnected = ConnectivityChecker.isConnected(aMolecule); - return (!tmpIsConnected); - } - /** * Checks whether the atom count or bond count of the given molecule is zero. The ErtlFunctionalGroupsFinder.find() - * method would still accept these molecules but it is not recommended to pass them on (simply makes not much sense). + * method would still accept these molecules, but it is not recommended to pass them on (simply makes not much sense). * * @param aMolecule the molecule to check * @return true, if the atom or bond count of the molecule is zero @@ -209,108 +136,15 @@ public static boolean isAtomOrBondCountZero(IAtomContainer aMolecule) throws Nul int tmpBondCount = aMolecule.getBondCount(); return (tmpAtomCount == 0 || tmpBondCount == 0); } - - /** - * Iterates through all atoms in the given molecule and checks whether they are charged. If this method returns - * 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() but should be filtered or the - * charges neutralized (see neutralizeCharges() below). - *
If no charged atoms are found, this method scales linearly with O(n) with n: number of atoms in the given - * molecule. - * - * @param aMolecule the molecule to check - * @return true, if the molecule contains one or more charged atoms - * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' - */ - public static boolean isMoleculeCharged(IAtomContainer aMolecule) throws NullPointerException { - Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); - int tmpAtomCount = aMolecule.getAtomCount(); - if (tmpAtomCount == 0) { - return false; - } - Iterable tmpAtoms = aMolecule.atoms(); - boolean tmpIsAtomCharged; - for (IAtom tmpAtom : tmpAtoms) { - //Throws NullPointerException if tmpAtom is 'null' - tmpIsAtomCharged = ErtlFunctionalGroupsFinderUtility.isAtomCharged(tmpAtom); - if (tmpIsAtomCharged) { - return true; - } - } - return false; - } - - /** - * Checks whether a given atom is charged. - * - * @param anAtom the atom to check - * @return true, if the atom is charged - * @throws NullPointerException if the given atom or its formal charge is 'null' - */ - public static boolean isAtomCharged(IAtom anAtom) throws NullPointerException { - Objects.requireNonNull(anAtom, "Given atom is 'null'."); - Integer tmpFormalCharge = anAtom.getFormalCharge(); - Objects.requireNonNull(tmpFormalCharge, "Formal charge is 'null'."); - return (tmpFormalCharge.intValue() != 0); - } - - /** - * Checks whether a given atom is a metal, metalloid or pseudo atom judging by its atomic number. Atoms with invalid - * atomic numbers (metal, metalloid or pseudo ('R') atoms) cannot be passed on to ErtlFunctionalGroupsFinder.find() - * but should be filtered. 
- * - * @param anAtom the atom to check - * @return true, if the atomic number is invalid or 'null' - * @throws NullPointerException if the given atom or its atomic number is 'null' - */ - public static boolean isAtomicNumberInvalid(IAtom anAtom) throws NullPointerException { - Objects.requireNonNull(anAtom, "Given atom is 'null'."); - Integer tmpAtomicNumber = anAtom.getAtomicNumber(); - Objects.requireNonNull(tmpAtomicNumber, "Atomic number is 'null'."); - int tmpAtomicNumberInt = tmpAtomicNumber.intValue(); - boolean tmpIsAtomicNumberValid = ErtlFunctionalGroupsFinderUtility.VALID_ATOMIC_NUMBERS_SET.contains(tmpAtomicNumberInt); - return !tmpIsAtomicNumberValid; - } - - /** - * Iterates through all atoms in the given molecule and checks whether their atomic numbers are invalid. If this - * method returns 'true', the molecule cannot be passed on to ErtlFunctionalGroupsFinder.find() but should be - * filtered. - *
If no invalid atoms are found, this method scales linearly with O(n) with n: number of atoms in the given - * molecule. - * - * @param aMolecule the molecule to check - * @return true, if the molecule contains one or more atoms with invalid atomic numbers - * @throws NullPointerException if the given molecule (or one of its atoms) is 'null' - */ - public static boolean containsInvalidAtomicNumbers(IAtomContainer aMolecule) throws NullPointerException { - Objects.requireNonNull(aMolecule, "Given molecule is 'null'."); - int tmpAtomCount = aMolecule.getAtomCount(); - if (tmpAtomCount == 0) { - return false; - } - Iterable tmpAtoms = aMolecule.atoms(); - boolean tmpIsAtomicNumberInvalid; - for (IAtom tmpAtom : tmpAtoms) { - //Throws NullPointerException if tmpAtom is 'null' - tmpIsAtomicNumberInvalid = ErtlFunctionalGroupsFinderUtility.isAtomicNumberInvalid(tmpAtom); - if (tmpIsAtomicNumberInvalid) { - return true; - } - } - return false; - } - + // /** * Checks whether the given molecule represented by an atom container should NOT be passed on to the - * ErtlFunctionalGroupsFinder.find() method but instead be filtered. + * ErtlFunctionalGroupsFinder.find() method but instead be filtered if(!) strict input restrictions are turned on (turned off by default). *
In detail, this function returns true if the given atom container contains metal, metalloid, or pseudo atoms * or has an atom or bond count equal to zero. *
If this method returns false, this does NOT mean the molecule can be passed on to find() without a problem. It * still might need to be preprocessed first. * - * @see ErtlFunctionalGroupsFinderUtility#isValidArgumentForFindMethod(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the atom container to check * @return true if the given atom container should be discarded * @throws NullPointerException if parameter is 'null' @@ -318,19 +152,16 @@ public static boolean containsInvalidAtomicNumbers(IAtomContainer aMolecule) thr public static boolean shouldBeFiltered(IAtomContainer aMolecule) throws NullPointerException { return ErtlFunctionalGroupsFinderUtility.shouldBeFiltered(aMolecule, true); } - + // /** * Checks whether the given molecule represented by an atom container should NOT be passed on to the - * ErtlFunctionalGroupsFinder.find() method but instead be filtered. + * ErtlFunctionalGroupsFinder.find() method but instead be filtered if(!) strict input restrictions are turned on (turned off by default). *
In detail, this function returns true if the given atom container contains metal, metalloid, or pseudo atoms * or has an atom or bond count equal to zero. If the second parameter is set to "false", single atom molecules * (bond count is 0) are accepted and not recommended to be filtered if they fulfill the other requirements. *
If this method returns false, this does NOT mean the molecule can be passed on to find() without a problem. It * still might need to be preprocessed first. * - * @see ErtlFunctionalGroupsFinderUtility#isValidArgumentForFindMethod(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the atom container to check * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will return false (do not filter) * @return true if the given atom container should be discarded @@ -341,10 +172,10 @@ public static boolean shouldBeFiltered(IAtomContainer aMolecule, boolean areSing boolean tmpShouldBeFiltered; try { if (areSingleAtomsFiltered) { - tmpShouldBeFiltered = (ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpShouldBeFiltered = (ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || ErtlFunctionalGroupsFinderUtility.isAtomOrBondCountZero(aMolecule)); } else { - tmpShouldBeFiltered = (ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpShouldBeFiltered = (ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || aMolecule.getAtomCount() == 0); } @@ -356,18 +187,14 @@ public static boolean shouldBeFiltered(IAtomContainer aMolecule, boolean areSing } return tmpShouldBeFiltered; } - + // /** * Checks whether the given molecule represented by an atom container needs to be preprocessed before it is passed - * on to the ErtlFunctionalGroupsFinder.find() method because it is unconnected or contains charged atoms. + * on to the ErtlFunctionalGroupsFinder.find() method because it is unconnected or contains charged atoms if(!) + * strict input restrictions are turned on (turned off by default). *
It is advised to check via shouldBeFiltered() whether the given molecule should be discarded anyway before * calling this function. * - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#isValidArgumentForFindMethod(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) - * @see ErtlFunctionalGroupsFinderUtility#neutralizeCharges(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#selectBiggestUnconnectedComponent(IAtomContainer) * @param aMolecule the atom container to check * @return true is the given molecule needs to be preprocessed * @throws NullPointerException if parameter is 'null' @@ -376,8 +203,8 @@ public static boolean shouldBePreprocessed(IAtomContainer aMolecule) throws Null Objects.requireNonNull(aMolecule, "Given molecule is null."); boolean tmpNeedsPreprocessing; try { - tmpNeedsPreprocessing = (ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule)); + tmpNeedsPreprocessing = (ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); } catch (Exception anException) { ErtlFunctionalGroupsFinderUtility.LOGGER.log(Level.WARNING, anException.toString() + " Molecule ID: " + ErtlFunctionalGroupsFinderUtility.getIDForLogging(aMolecule), @@ -386,17 +213,13 @@ public static boolean shouldBePreprocessed(IAtomContainer aMolecule) throws Null } return tmpNeedsPreprocessing; } - + // /** * Checks whether the given molecule represented by an atom container can be passed on to the - * ErtlFunctionalGroupsFinder.find() method without problems. + * ErtlFunctionalGroupsFinder.find() method without problems if(!) strict input restrictions are turned on (turned off by default). *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms, contains * multiple unconnected parts, or has an atom or bond count of zero. * - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the molecule to check * @return true if the given molecule is a valid parameter for ErtlFunctionalGroupsFinder.find() method * @throws NullPointerException if parameter is 'null' @@ -404,18 +227,14 @@ public static boolean shouldBePreprocessed(IAtomContainer aMolecule) throws Null public static boolean isValidArgumentForFindMethod(IAtomContainer aMolecule) throws NullPointerException { return ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(aMolecule, true); } - + // /** * Checks whether the given molecule represented by an atom container can be passed on to the - * ErtlFunctionalGroupsFinder.find() method without problems. + * ErtlFunctionalGroupsFinder.find() method without problems if(!) strict input restrictions are turned on (turned off by default). *
This method will return false if the molecule contains any metal, metalloid, pseudo, or charged atoms, contains * multiple unconnected parts, or has an atom or bond count of zero. If the second parameter is set to "false", single atom molecules * (bond count is 0) are accepted and not recommended to be filtered if they fulfill the other requirements. * - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#applyFiltersAndPreprocessing(IAtomContainer, Aromaticity) * @param aMolecule the molecule to check * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will return true (do not filter) * @return true if the given molecule is a valid parameter for ErtlFunctionalGroupsFinder.find() method @@ -426,15 +245,15 @@ public static boolean isValidArgumentForFindMethod(IAtomContainer aMolecule, boo boolean tmpIsValid; try { if (areSingleAtomsFiltered) { - tmpIsValid = !(ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpIsValid = !(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || ErtlFunctionalGroupsFinderUtility.isAtomOrBondCountZero(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule)); + || ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); } else { - tmpIsValid = !(ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule) + tmpIsValid = !(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule) || aMolecule.getAtomCount() == 0 - || ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule) - || ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule)); + || 
ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule) + || ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule)); } } catch (Exception anException) { ErtlFunctionalGroupsFinderUtility.LOGGER.log(Level.SEVERE, @@ -473,7 +292,7 @@ public static IAtomContainer selectBiggestUnconnectedComponent(IAtomContainer aM tmpBiggestComponent.setProperties(aMolecule.getProperties()); return tmpBiggestComponent; } - + // /** * Neutralizes charged atoms in the given atom container by zeroing the formal atomic charges and filling up free * valences with implicit hydrogen atoms (according to the CDK atom types). This procedure allows a more general @@ -499,7 +318,7 @@ public static void neutralizeCharges(IAtomContainer aMolecule) throws NullPointe ErtlFunctionalGroupsFinderUtility.neutralizeCharges(tmpAtom, aMolecule); } } - + // /** * Neutralizes a charged atom in the given parent atom container by zeroing the formal atomic charge and filling up free * valences with implicit hydrogen atoms (according to the CDK atom types). @@ -512,7 +331,6 @@ public static void neutralizeCharges(IAtomContainer aMolecule) throws NullPointe * @throws NullPointerException if anAtom or aParentMolecule is 'null' * @throws CDKException if the atom is not part of the molecule or no matching atom type can be determined for the * atom or there is a problem with adding the implicit hydrogen atoms. - * @see ErtlFunctionalGroupsFinderUtility#neutralizeCharges(IAtomContainer) */ public static void neutralizeCharges(IAtom anAtom, IAtomContainer aParentMolecule) throws NullPointerException, CDKException { Objects.requireNonNull(anAtom, "Given atom is 'null'."); @@ -544,7 +362,7 @@ public static void neutralizeCharges(IAtom anAtom, IAtomContainer aParentMolecul tmpHAdder.addImplicitHydrogens(aParentMolecule, anAtom); } } - + // /** * Convenience method to perceive atom types for all IAtoms in the IAtomContainer, using the * CDK AtomContainerManipulator or rather the CDKAtomTypeMatcher. 
If the matcher finds a matching atom type, the @@ -568,7 +386,7 @@ public static void perceiveAtomTypesAndConfigureAtoms(IAtomContainer aMolecule) //Might throw CDKException but it is unclear in what case AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(aMolecule); } - + // /** * Convenience method for applying the given aromaticity model to the given molecule. Any existing aromaticity flags * are removed - even if no aromatic bonds were found. This follows the idea of applying an aromaticity model to a @@ -607,7 +425,7 @@ public static boolean applyAromaticityDetection(IAtomContainer aMolecule, Aromat } return tmpIsAromatic; } - + // /** * Checks whether the given molecule represented by an atom container should be filtered instead of being passed * on to the ErtlFunctionalGroupsFinder.find() method and if not, applies necessary preprocessing steps. @@ -615,15 +433,12 @@ public static boolean applyAromaticityDetection(IAtomContainer aMolecule, Aromat * to the given atom container that is always needed (setting atom types and applying an aromaticity model) and * preprocessing steps that are only needed in specific cases (selecting the biggest unconnected component, neutralizing * charges). Molecules processed by this method can be passed on to find() without problems (Caution: The return value - * of this method is 'null' if the molecule should be filtered!). + * of this method is 'null' if the molecule should be filtered!) if(!) strict input restrictions are turned on (turned off by default). *
NOTE: This method changes major properties and the composition of the given IAtomContainer object! If you * want to retain your object unchanged for future calculations, use the IAtomContainer's * clone() method. *
NOTE2: The returned IAtomContainer object is the same as the one given as parameter! * - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) * @param aMolecule the molecule to check and process * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen * ElectronDonation model can massively influence the extracted function groups of a molecule when using @@ -635,7 +450,7 @@ public static boolean applyAromaticityDetection(IAtomContainer aMolecule, Aromat public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecule, Aromaticity anAromaticityModel) throws NullPointerException { return ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(aMolecule, anAromaticityModel, true); } - + // /** * Checks whether the given molecule represented by an atom container should be filtered instead of being passed * on to the ErtlFunctionalGroupsFinder.find() method and if not, applies necessary preprocessing steps. @@ -643,18 +458,15 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu * to the given atom container that is always needed (setting atom types and applying an aromaticity model) and * preprocessing steps that are only needed in specific cases (selecting the biggest unconnected component, neutralizing * charges). Molecules processed by this method can be passed on to find() without problems (Caution: The return value - * of this method is 'null' if the molecule should be filtered!). + * of this method is 'null' if the molecule should be filtered!) if(!) strict input restrictions are turned on (turned off by default). *
NOTE: This method changes major properties and the composition of the given IAtomContainer object! If you * want to retain your object unchanged for future calculations, use the IAtomContainer's * clone() method. *
NOTE2: The returned IAtomContainer object is the same as the one given as parameter! * - * @see ErtlFunctionalGroupsFinder#find(IAtomContainer, boolean) - * @see ErtlFunctionalGroupsFinderUtility#shouldBeFiltered(IAtomContainer) - * @see ErtlFunctionalGroupsFinderUtility#shouldBePreprocessed(IAtomContainer) * @param aMolecule the molecule to check and process * @param anAromaticityModel the aromaticity model to apply to the molecule in preprocessing; Note: The chosen - * ElectronDonation model can massively influence the extracted function groups of a molecule when using + * ElectronDonation model can massively influence the extracted functional groups of a molecule when using * ErtlFunctionGroupsFinder! * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will be processed and * not return null @@ -680,17 +492,17 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu } } //From structures containing two or more unconnected structures (e.g. 
ions) choose the largest structure - boolean tmpIsUnconnected = ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(aMolecule); + boolean tmpIsUnconnected = ErtlFunctionalGroupsFinder.isStructureUnconnected(aMolecule); if (tmpIsUnconnected) { aMolecule = ErtlFunctionalGroupsFinderUtility.selectBiggestUnconnectedComponent(aMolecule); } //Filter - boolean tmpContainsInvalidAtoms = ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(aMolecule); + boolean tmpContainsInvalidAtoms = ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(aMolecule); if (tmpContainsInvalidAtoms) { return null; } //Neutralize charges if there are any - boolean tmpIsCharged = ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(aMolecule); + boolean tmpIsCharged = ErtlFunctionalGroupsFinder.containsChargedAtom(aMolecule); if (tmpIsCharged) { ErtlFunctionalGroupsFinderUtility.neutralizeCharges(aMolecule); } @@ -707,83 +519,6 @@ public static IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecu //
// // - /** - * Extracts functional groups from the given molecule, using the Ertl algorithm / ErtlFunctionalGroupsFinder, but - * only the marked atoms of every functional group are returned. They do not contain their environment (i.e. connected, - * unmarked carbon atoms) and are also not generalized. - * - * @param aMolecule the molecule to extracts functional groups from; it is not cloned in this method! - * @return List of IAtomContainer objects representing the detected functional groups - * @throws NullPointerException if the given atom container is null - * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; - * check methods for filtering and preprocessing in this case - */ - public static List findMarkedAtoms(IAtomContainer aMolecule) throws NullPointerException, IllegalArgumentException { - return ErtlFunctionalGroupsFinderUtility.findMarkedAtoms(aMolecule, true); - } - - /** - * Extracts functional groups from the given molecule, using the Ertl algorithm / ErtlFunctionalGroupsFinder, but - * only the marked atoms of every functional group are returned. They do not contain their environment (i.e. connected, - * unmarked carbon atoms) and are also not generalized. - * - * @param aMolecule the molecule to extracts functional groups from; it is not cloned in this method! 
- * @param areSingleAtomsFiltered if false, molecules with bond count 0 but atom count 1 will be processed and not raise - * an IllegalArgumentException - * @return List of IAtomContainer objects representing the detected functional groups - * @throws NullPointerException if the given atom container is null - * @throws IllegalArgumentException if the given atom container cannot be passed to ErtlFunctionalGroupsFinder; - * check methods for filtering and preprocessing in this case - */ - public static List findMarkedAtoms(IAtomContainer aMolecule, boolean areSingleAtomsFiltered) throws NullPointerException, IllegalArgumentException { - Objects.requireNonNull(aMolecule, "Given molecule is null."); - if (aMolecule.isEmpty()) { - return new ArrayList(0); - } - boolean tmpCanBeFragmented = ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(aMolecule, areSingleAtomsFiltered); - if (!tmpCanBeFragmented) { - throw new IllegalArgumentException("Given molecule cannot be fragmented but needs to be filtered or preprocessed."); - } - HashMap tmpIdToAtomMap = new HashMap<>(aMolecule.getAtomCount() + 1, 1); - for (int i = 0; i < aMolecule.getAtomCount(); i++) { - IAtom tmpAtom = aMolecule.getAtom(i); - tmpAtom.setProperty("EFGFUtility.INDEX", i); - tmpIdToAtomMap.put(i, tmpAtom); - } - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); - List tmpFunctionalGroups = tmpEFGF.find(aMolecule, false); - if (tmpFunctionalGroups.isEmpty()) { - return tmpFunctionalGroups; - } - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroups) { - for (int i = 0; i < tmpFunctionalGroup.getAtomCount(); i++) { - IAtom tmpAtom = tmpFunctionalGroup.getAtom(i); - if (Objects.isNull(tmpAtom.getProperty("EFGFUtility.INDEX"))) { - if (tmpAtom instanceof IPseudoAtom && "R".equals(((IPseudoAtom)tmpAtom).getLabel())) { - //atom is a pseudo atom added by the EFGF in generalization - tmpFunctionalGroup.removeAtom(tmpAtom); - i = i - 1; 
- continue; - } else if (tmpAtom.getSymbol().equals("C")){ - //atom is an environmental C added by the EFGF - tmpFunctionalGroup.removeAtom(tmpAtom); - i = i - 1; - continue; - } else if (tmpAtom.getSymbol().equals("H")) { - //atom is an explicit H added by the EFGF - tmpFunctionalGroup.removeAtom(tmpAtom); - i = i - 1; - continue; - } else { - //unknown atom - throw new IllegalArgumentException("Something went wrong, identified unknown added atom."); - } - } - } - } - return tmpFunctionalGroups; - } - /** * Replaces the environmental carbon or pseudo-atoms (new IAtom objects) inserted by the EFGF in an identified * functional group with the carbon IAtom objects from the original molecule object. @@ -882,7 +617,7 @@ public static void restoreOriginalEnvironmentalCarbons( } } } - + // /** * Gives the pseudo SMILES code for a given molecule / functional group. In this notation, aromatic atoms are marked * by asterisks (*) and pseudo atoms are indicated by 'R'. diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java b/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java index 4980d7c..0f04e40 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/ErtlFunctionalGroupsFinderPerformanceSnapshotApp.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * @@ -62,58 +62,60 @@ /** * An application for testing the performance of the ErtlFunctionalGroupsFinder.find() method under parallelization on * multiple threads. + *

+ * Legacy code that still assumes that the old input restrictions are turned on. * * @author Jonas Schaub * @version 1.2.0.0 */ public class ErtlFunctionalGroupsFinderPerformanceSnapshotApp { - + // // /** * Name of file for logging occurred exceptions */ private static final String EXCEPTIONS_LOG_FILE_NAME = "Exceptions_Log.txt"; - + // /** * Name of file for writing results */ private static final String RESULTS_FILE_NAME = "Results.txt"; - + // /** * All allowed atomic numbers to pass to the ErtlFunctionalGroupsFinder; * String will be split and resulting integers passed to a set */ private static final String NON_METALLIC_ATOMIC_NUMBERS = "1,2,6,7,8,9,10,15,16,17,18,34,35,36,53,54,86"; // - + // // /** * All allowed atomic numbers to pass to the ErtlFunctionalGroupsFinder as a set of integers (will be parsed from * NON_METALLIC_ATOMIC_NUMBERS) */ private Set nonMetallicAtomicNumbersSet; - + // /** * The working directory (the jar-file's directory) */ private String workingPath; - + // /** * The given number of different threads to use */ private int numberOfThreadsToUse; - + // /** * All molecules loaded from the SD file */ private IAtomContainer[] moleculesArray; - + // /** * The aromaticity model in use */ private Aromaticity aromaticityModel; // - + // // /** * Instantiates and starts the application. It first loads all molecules from a given SD file into memory and then @@ -265,7 +267,7 @@ public ErtlFunctionalGroupsFinderPerformanceSnapshotApp(String[] anArgs) throws } } // - + // // /** * Performs all preprocessing needed for the ErtlFunctionalGroupsFinder and throws an IllegalArgumentException @@ -311,7 +313,7 @@ private IAtomContainer applyFiltersAndPreprocessing(IAtomContainer aMolecule) th this.aromaticityModel.apply(aMolecule); return aMolecule; } - + // /** * Appends the given exception's stack trace to a log file. 
* diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java b/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java index 9191158..90ee117 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/ExtractFunctionalGroupsTask.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * @@ -33,11 +33,11 @@ * @version 1.2 */ public class ExtractFunctionalGroupsTask implements Callable { - + // private final IAtomContainer[] moleculesArray; - + // private final ErtlFunctionalGroupsFinder ertlFinder; - + // /** * Instantiates the thread. * @@ -48,7 +48,7 @@ public ExtractFunctionalGroupsTask(IAtomContainer[] aListOfMolecules) { this.moleculesArray = aListOfMolecules; this.ertlFinder = new ErtlFunctionalGroupsFinder(); } - + // /** * Applies the ErtlFunctionalGroupsFinder.find(IAtomContainer container, boolean clone) method on all given * molecules (parameter clone = false) and counts the occurring exceptions. 
@@ -68,5 +68,4 @@ public Integer call() throws Exception { } return tmpExceptionsCounter; } - } diff --git a/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java b/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java index 0500b9e..954af5d 100644 --- a/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java +++ b/src/main/java/org/openscience/cdk/tools/efgf/app/Main.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * @@ -27,11 +27,11 @@ * @version 1.2 */ public class Main { - + // private Main() { - + // only created because JavaDoc task complained. } - + // /** * Starts the application. Command line arguments must be the name of an SD-file to read (must be located in the * same directory as the application's .jar file) and the number of different threads to use for calculation. 
@@ -46,5 +46,4 @@ public static void main(String[] args) { System.exit(1); } } - } diff --git a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderEvaluationTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java similarity index 85% rename from src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderEvaluationTest.java rename to src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java index c108c32..4aedfe7 100644 --- a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderEvaluationTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderEvaluationTest.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * @@ -18,16 +18,17 @@ * along with this program. If not, see . 
*/ -package org.openscience.cdk.tools.test; +package org.openscience.cdk.tools; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.openscience.cdk.Atom; import org.openscience.cdk.CDKConstants; import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.aromaticity.ElectronDonation; import org.openscience.cdk.atomtype.CDKAtomTypeMatcher; +import org.openscience.cdk.depict.DepictionGenerator; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.graph.ConnectivityChecker; import org.openscience.cdk.graph.CycleFinder; @@ -46,9 +47,6 @@ import org.openscience.cdk.silent.SilentChemObjectBuilder; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; -import org.openscience.cdk.smiles.SmilesParser; -import org.openscience.cdk.tools.CDKHydrogenAdder; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder; import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder.Mode; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import org.openscience.cdk.tools.manipulator.AtomTypeManipulator; @@ -67,36 +65,46 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import java.util.Set; /** * This test class can be used to read an SD file containing chemical structures, to extract their functional groups using * the ErtlFunctionalGroupsFinder with different settings (i.e. electron donation model and cycle finder algorithm), and write - * the functional groups with their associated frequency under the given settings in this SD file to a CSV file. + * the identified functional groups with their associated frequency under the given settings in this SD file to a CSV file. *

- * To run correctly the constant SD_FILE_PATH must be set to where to find the specific file on the local system. + * To run correctly, the constant SD_FILE_TEST_RESOURCE_NAME must be set to the name of the SD file to analyse which must be + * situated in the test resources folder. *

- * All written files will be placed in a new folder in the same directory as the read SD file. + * All written files will be placed in the output folder. *

- * Note for addition of new tests: Only one SD file should be analyzed per test method (since some mechanisms work under + * Note for addition of new tests: Only one SD file should be analysed per test method (since some mechanisms work under * that assumption). + *

+ * NOTE that this code was written before the class ErtlFunctionalGroupsFinderUtility was implemented to make this type + * of analysis more straightforward using its utility methods. This test class here therefore does not use the EFGFUtility class. + * In fact, the routines used here were extracted to develop the EFGFUtility class. + * This test class was also developed and used before EFGF was reworked before version 1.3. It can now only be seen as outdated example/legacy code + * on how to analyse larger datasets using EFGF!!! It is also documentation on how the analyses presented in the scientific + * article about EFGF were conducted. * * @author Jonas Schaub - * @version 1.2 + * @version 1.3 */ +@Disabled("Legacy code and lengthy analyses") public class ErtlFunctionalGroupsFinderEvaluationTest { // - // + // /** - * Path to SD file that should be analyzed + * Name of SD file in test resources folder that should be analyzed */ - private static final String SD_FILE_PATH = "...\\ChEBI_lite_3star_subset.sdf"; + private static final String SD_FILE_TEST_RESOURCE_NAME = "ChEBI_lite_3star_subset.sdf"; /** - * Directory for output files; Will be created as sub-folder in the working directory (the directory of the read SD file) + * Folder name for output files; will be created in repo root directory */ private static final String OUTPUT_FOLDER_FROM_WORKING_DIRECTORY = "ErtlFunctionalGroupsFinderEvaluationTest_Output"; @@ -108,7 +116,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { /** * Separator for file name segments (test identifier, file name, time stamp) */ - private static final String FILE_NAME_ADDITION_SEPERATOR = "_"; + private static final String FILE_NAME_ADDITION_SEPARATOR = "_"; // /** @@ -142,10 +150,10 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { * First lines in the exceptions log file */ private static final String EXCEPTIONS_LOG_FILE_HEADER = "Following molecules led to the specified exceptions:" - + 
System.getProperty("line.separator") + + System.lineSeparator() + "(Note: If too many exceptions are thrown too fast the JVM stops filling in the complete stack trace. " + "You need to be looking at an earlier stack trace to see the details.)" - + System.getProperty("line.separator"); + + System.lineSeparator(); // // @@ -236,7 +244,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { /** * Separator for the output file's values */ - private static final String OUTPUT_FILE_SEPERATOR = ","; + private static final String OUTPUT_FILE_SEPARATOR = ","; /** * Placeholder String for every functional group's SMILES code whose real SMILES representation could not be @@ -365,7 +373,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { // /** - * Directory for all produced files; It will be the directory where th SD file that is analyzed was loaded from + * Directory for all produced files; It will be the directory where the SD file that is analyzed was loaded from */ private String outputDirectory; @@ -432,7 +440,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { * functional groups and its values are inner HashMaps that hold the (pseudo) SMILES representation of a functional * group and its frequencies for different settings as String-Object pairs, plus an exemplary molecule of origin */ - private HashMap masterHashMap; + private HashMap> masterHashMap; /** * A map that gives a certain element symbol for a placeholder atom marking a specific aromatic atom in pseudo SMILES @@ -462,7 +470,7 @@ public class ErtlFunctionalGroupsFinderEvaluationTest { /** * Constructor *

- * Note: it does not initialize any class variables (except 5) because that would be unnecessary when it is called by a + * Note: it does not initialize any class variables because that would be unnecessary when it is called by a * test method inherited from CDKTestCase; these initializations are done by initialize(). */ public ErtlFunctionalGroupsFinderEvaluationTest() { @@ -476,8 +484,6 @@ public ErtlFunctionalGroupsFinderEvaluationTest() { // // - - // /** * Test for analyzing molecules in an SD file for all four different electron donation models supplied by the cdk: * daylight, cdk, piBonds, cdkAllowingExocyclic and the aromaticity model cdkLegacy. @@ -488,7 +494,7 @@ public ErtlFunctionalGroupsFinderEvaluationTest() { */ @Test public void testElectronDonationDependency() throws Exception { - this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH, + this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME, ErtlFunctionalGroupsFinderEvaluationTest.ELECTRON_DONATION_TEST_IDENTIFIER, true); } @@ -504,7 +510,7 @@ public void testElectronDonationDependency() throws Exception { */ @Test public void testElectronDonationDependencyNoMultiples() throws Exception { - this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH, + this.analyzeElectronDonationDependency(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME, ErtlFunctionalGroupsFinderEvaluationTest.ELECTRON_DONATION_NO_MULTIPLES_TEST_IDENTIFIER, false); } @@ -519,12 +525,12 @@ public void testElectronDonationDependencyNoMultiples() throws Exception { */ @Test public void testCycleFinderDependency() throws Exception { - this.initializeWithFileOperations(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH, + this.initializeWithFileOperations(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME, 
ErtlFunctionalGroupsFinderEvaluationTest.CYCLE_FINDER_TEST_IDENTIFIER); Assumptions.assumeTrue(this.isTestAbleToRun); - System.out.println("\nLoading file with path: " + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH); - File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH); + System.out.println("\nLoading file with path: " + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME); + File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME).getPath()); int tmpRequiredNumberOfReaders = 6; IteratingSDFReader[] tmpReaders = new IteratingSDFReader[tmpRequiredNumberOfReaders]; try { @@ -561,7 +567,7 @@ public void testCycleFinderDependency() throws Exception { } this.saveData(); System.out.println("\nFinished!"); - System.out.println("\nNumber of occured exceptions: " + this.exceptionsCounter); + System.out.println("\nNumber of occurred exceptions: " + this.exceptionsCounter); } /** @@ -574,11 +580,10 @@ public void testCycleFinderDependency() throws Exception { public void testPerformance() throws Exception { this.initialize(true, "PerformanceTest"); //First, check if the SD file is present and ignore test if it is not - String tmpPathToSDFile = ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_PATH; - System.out.println("\nLoading file with path: " + tmpPathToSDFile); - File tmpSDFile = new File(tmpPathToSDFile); + System.out.println("\nLoading file with path: " + ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME); + File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME).getPath()); if (!tmpSDFile.canRead()) { - System.out.println("\n\tUnable to find or read a file with path \"" + tmpPathToSDFile + "\"."); + System.out.println("\n\tUnable to find or read a file with path \"" + 
ErtlFunctionalGroupsFinderEvaluationTest.SD_FILE_TEST_RESOURCE_NAME + "\"."); System.out.println("\nTest is ignored."); Assumptions.assumeTrue(false); return; @@ -620,123 +625,126 @@ public void testPerformance() throws Exception { long tmpEndTime = System.currentTimeMillis(); System.out.println("\nExtraction of functional groups from these molecules took " + (tmpEndTime - tmpStartTime) + " ms.\n"); } - // - - // + // /** - * Test for correct MoleculeHashGenerator settings/performance on some examples. + * Reads the ChEBI lite 3-star subset and determines the functional groups in it to compare how many input molecules + * cause exceptions with vs. without the earlier EFGF input restrictions. + * ChEBI lite 3-star subset SDF: 1396 structures in file (some will automatically be skipped by SDF reader). + * If needed, it can also generate images of those molecules that would be filtered and their functional groups. * - * @throws java.lang.Exception if initialize() throws an exception or a SMILES code can not be parsed into a molecule + * @throws Exception if anything goes wrong */ @Test - public void testMoleculeHashGeneratorSettings() throws Exception { - this.initialize(false, ""); - SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - - /*Chebi70986, Chebi16238 and Chebi57692 all contain the same functional group with pseudo SMILES code - "O=C1N=C(C(=NR)C(=O)N1R)N(R)R", but different hybridizations in the resulting atom containers. But their hash - codes should be the same under the given settings. 
This is tested exemplary for many similar cases*/ - String[] tmpSmilesArray = {"OC[C@@H](O)[C@@H](O)[C@@H](O)CN1CC(CO)N=C2C(=O)NC(=O)N=C12", - "Cc1cc2nc3c(nc(=O)[nH]c3=O)n(C[C@H](O)[C@H](O)[C@H](O)COP(O)(=O)OP(O)(=O)OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)n3cnc4c(N)ncnc34)c2cc1C", - "Cc1cc2nc3c(nc(=O)[n-]c3=O)n(C[C@H](O)[C@H](O)[C@H](O)COP([O-])(=O)OP([O-])(=O)OC[C@H]3O[C@H]([C@H](O)[C@@H]3O)n3cnc4c(N)ncnc34)c2cc1C"}; - List tmpHashCodesList = new LinkedList<>(); - for (String tmpSmilesCode : tmpSmilesArray) { - IAtomContainer tmpParsedMolecule = tmpSmilesParser.parseSmiles(tmpSmilesCode); - tmpParsedMolecule = this.applyFiltersAndPreprocessing(tmpParsedMolecule); - Aromaticity.cdkLegacy().apply(tmpParsedMolecule); - List tmpFunctionalGroups = this.ertlFGFinderGenOn.find(tmpParsedMolecule); - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroups) { - if (this.getPseudoSmilesCode(tmpFunctionalGroup).equals("O=C1N=C(C(=NR)C(=O)N1R)N(R)R")) { - tmpHashCodesList.add(this.molHashGenerator.generate(tmpFunctionalGroup)); + public void readChebiLite3StarSubset() throws Exception { + // change to true to activate image generation! + boolean tmpDepictFilteredMols = false; + IteratingSDFReader tmpChebiSDFReader = null; + try { + tmpChebiSDFReader = new IteratingSDFReader( + ErtlFunctionalGroupsFinderTest.class.getResourceAsStream("ChEBI_lite_3star_subset.sdf"), + SilentChemObjectBuilder.getInstance(), + true); + } catch (Exception e) { + System.out.println("\nSD file could not be found. 
Test is ignored."); + Assumptions.assumeTrue(false); + return; + } + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.DEFAULT); + int tmpMoleculeCouter = 0; + int tmpExceptionsCounter = 0; + int tmpExceptionsWithRestrictionsCounter = 0; + int tmpExceptionsWithoutRestrictionsCounter = 0; + int tmpMoleculesFilteredCounter = 0; + while (tmpChebiSDFReader.hasNext()) { + IAtomContainer tmpMolecule = null; + tmpMoleculeCouter++; + try { + tmpMolecule = tmpChebiSDFReader.next(); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); + tmpAromaticity.apply(tmpMolecule); + tmpMolecule = tmpMolecule.clone(); + } catch (Exception anException) { + tmpExceptionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not parse molecule! 
Counter: " + tmpMoleculeCouter); } + continue; } - } - for (Long tmpHashCode1 : tmpHashCodesList) { - for (Long tmpHashCode2 : tmpHashCodesList) { - Assertions.assertEquals(tmpHashCode1.longValue(), tmpHashCode2.longValue()); + try { + if (ErtlFunctionalGroupsFinder.isValidInputMoleculeWithRestrictionsTurnedOn(tmpMolecule)) { + List tmpFGList = tmpEFGF.find(tmpMolecule, false, true); + } else { + tmpMoleculesFilteredCounter++; + try { + if (tmpDepictFilteredMols) { + DepictionGenerator tmpDepictGen = new DepictionGenerator().withSize(712, 712).withFillToFit().withMargin(10); + String tmpSourceFolder = new File("").getAbsolutePath(); + tmpDepictGen.depict(tmpMolecule).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + ".png"); + List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); + int i = 0; + for (IAtomContainer tmpFG : tmpFGList) { + tmpDepictGen.depict(tmpFG).writeTo(tmpSourceFolder + File.separator + "Output" + File.separator + tmpMolecule.getProperty("ChEBI ID").toString().replace(':', '_') + "_" + i + ".png"); + i++; + } + } + } catch (Exception anException) { + anException.printStackTrace(); + break; + } + } + } catch (Exception anException) { + tmpExceptionsWithRestrictionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not identify FG in molecule! Counter: " + tmpMoleculeCouter); + } + } + try { + List tmpFGList = tmpEFGF.find(tmpMolecule, false, false); + } catch (Exception anException) { + tmpExceptionsWithoutRestrictionsCounter++; + if (!Objects.isNull(tmpMolecule)) { + System.out.println(tmpMolecule.getProperty("ChEBI ID") + "," + anException.toString() + "," + tmpMoleculeCouter); + } else { + System.out.println("Could not identify FG in molecule! 
Counter: " + tmpMoleculeCouter); + } + continue; } } - - /*Functional groups like the tertiary amine or the hydroxyl group appear with aromatic and non-aromatic central - atoms. These two cases should be discrimated by the MoleculeHashGenerator under the given settings*/ - String tmpTertiaryAmineSmiles = "*N(*)*"; - IAtomContainer tmpAromMol = tmpSmilesParser.parseSmiles(tmpTertiaryAmineSmiles); - IAtomContainer tmpNonAromMol = tmpSmilesParser.parseSmiles(tmpTertiaryAmineSmiles); - for (IAtom tmpAtom : tmpAromMol.atoms()) { - if (tmpAtom.getSymbol().equals("N")) - tmpAtom.setIsAromatic(true); - } - Assertions.assertNotEquals(this.molHashGenerator.generate(tmpAromMol), this.molHashGenerator.generate(tmpNonAromMol)); - String tmpHydroxylGroupSmiles = "[H]O[C]"; - tmpAromMol = tmpSmilesParser.parseSmiles(tmpHydroxylGroupSmiles); - tmpNonAromMol = tmpSmilesParser.parseSmiles(tmpHydroxylGroupSmiles); - for (IAtom tmpAtom : tmpAromMol.atoms()) { - if (tmpAtom.getSymbol().equals("C")) - tmpAtom.setIsAromatic(true); - } - Assertions.assertNotEquals(this.molHashGenerator.generate(tmpAromMol), this.molHashGenerator.generate(tmpNonAromMol)); - - /*The following are examples of different (unique!) SMILES codes representing the same functional groups. 
- They should be assigned the same hash code*/ - HashMap tmpEquivalentSmilesMap = new HashMap<>(20); - tmpEquivalentSmilesMap.put("*[N](*)=C(N(*)*)N(*)*", "*N(*)C(=[N](*)*)N(*)*"); - tmpEquivalentSmilesMap.put("*SC1=[N](*)[C]=[C]N1*", "*SC=1N(*)[C]=[C][N]1*"); - tmpEquivalentSmilesMap.put("*[N]1=[C][C]=[C]N1*", "*N1[C]=[C][C]=[N]1*"); - tmpEquivalentSmilesMap.put("*[N](*)=[C]N(*)*", "*N(*)[C]=[N](*)*"); - tmpEquivalentSmilesMap.put("*N(*)[C]=[C][C]=[C][C]=[C][C]=[C][C]=[N](*)*", "*[N](*)=[C][C]=[C][C]=[C][C]=[C][C]=[C]N(*)*"); - tmpEquivalentSmilesMap.put("*[N](*)=C(N(*)*)N(*)P(=O)(O[H])O[H]", "*N(*)C(=[N](*)*)N(*)P(=O)(O[H])O[H]"); - tmpEquivalentSmilesMap.put("[O]I(=O)=O", "O=I(=O)[O]"); - tmpEquivalentSmilesMap.put("[O]Br(=O)=O", "O=Br(=O)[O]"); - tmpEquivalentSmilesMap.put("[O]Cl(=O)(=O)=O", "O=Cl(=O)(=O)[O]"); - tmpEquivalentSmilesMap.put("[C]=[C][C]=[C]C#C[C]=[C]C#[C]", "[C]#C[C]=[C]C#C[C]=[C][C]=[C]"); - tmpEquivalentSmilesMap.put("*N1[C]=[C][C]=[N]1*", "*[N]1=[C][C]=[C]N1*"); - tmpEquivalentSmilesMap.put("O=C(*)O*", "*OC(*)=O"); - for (String tmpKeySmiles : tmpEquivalentSmilesMap.keySet()) { - IAtomContainer tmpKeyMol = tmpSmilesParser.parseSmiles(tmpKeySmiles); - IAtomContainer tmpValueMol = tmpSmilesParser.parseSmiles(tmpEquivalentSmilesMap.get(tmpKeySmiles)); - Assertions.assertEquals(this.molHashGenerator.generate(tmpKeyMol), this.molHashGenerator.generate(tmpValueMol)); - } - } - - /** - * Test for correct preprocessing (neutralization of charges and selection of biggest fragment). 
- * - * @throws Exception if initialize() throws an exception or a SMILES code can not be parsed into a molecule - */ - @Test - public void testPreprocessing() throws Exception { - this.initialize(false, ""); - String tmpSmiles = "CC[O-].C"; - SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - IAtomContainer tmpMol = tmpSmilesParser.parseSmiles(tmpSmiles); - tmpMol = this.applyFiltersAndPreprocessing(tmpMol); - SmilesGenerator tmpGenerator = SmilesGenerator.unique(); - Assertions.assertEquals("OCC", tmpGenerator.create(tmpMol)); + System.out.println("Number of parsed molecules: " + tmpMoleculeCouter); + System.out.println("Exceptions while preprocessing: " + tmpExceptionsCounter); + System.out.println("Molecules that would be filtered due to input restrictions: " + tmpMoleculesFilteredCounter); + System.out.println("Exceptions with restrictions (prefiltered): " + tmpExceptionsWithRestrictionsCounter); + System.out.println("Exceptions without restrictions: " + tmpExceptionsWithoutRestrictionsCounter); } // - // - // /** * Analyzes molecules in an SD file for all four different electron donation models supplied by the cdk: * daylight, cdk, piBonds, cdkAllowingExocyclic and the aromaticity model cdkLegacy. 
* - * @param anSDFilePath absolute path of the SD file to analyze - * @param aTestIdentifier a folder with this name will be created in the output directory and it will be added to + * @param anSDFileResourceName name of the SD file to analyse which must be situated in the test resources folder + * @param aTestIdentifier a folder with this name will be created in the output directory, and it will be added to * the output and log files' names for association of test and files; may be null or empty * @param anAreMultiplesCounted if false, functional groups that occur multiple times in the same molecule will * only be counted once * @throws java.lang.Exception if initializeWithFileOperations() throws an exception or an unexpected exception occurs */ - private void analyzeElectronDonationDependency(String anSDFilePath, + private void analyzeElectronDonationDependency( + String anSDFileResourceName, String aTestIdentifier, - boolean anAreMultiplesCounted) throws Exception { - this.initializeWithFileOperations(anSDFilePath, aTestIdentifier); + boolean anAreMultiplesCounted) + throws Exception { + this.initializeWithFileOperations(anSDFileResourceName, aTestIdentifier); Assumptions.assumeTrue(this.isTestAbleToRun); - System.out.println("\nLoading file with path: " + anSDFilePath); - File tmpSDFile = new File(anSDFilePath); + System.out.println("\nLoading file with path: " + anSDFileResourceName); + File tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(anSDFileResourceName).getPath()); int tmpRequiredNumberOfReaders = 5; IteratingSDFReader[] tmpReaders = new IteratingSDFReader[tmpRequiredNumberOfReaders]; try { @@ -776,7 +784,7 @@ private void analyzeElectronDonationDependency(String anSDFilePath, } this.saveData(); System.out.println("\nFinished!"); - System.out.println("\nNumber of occured exceptions: " + this.exceptionsCounter); + System.out.println("\nNumber of occurred exceptions: " + this.exceptionsCounter); } /** @@ -806,7 +814,7 @@ 
private void initialize(boolean aShouldPrintHeader, String aTestIdentifier) { .molecular(); this.ertlFGFinderGenOff = new ErtlFunctionalGroupsFinder(Mode.NO_GENERALIZATION); this.ertlFGFinderGenOn = new ErtlFunctionalGroupsFinder(Mode.DEFAULT); - this.masterHashMap = new HashMap(ErtlFunctionalGroupsFinderEvaluationTest.MASTER_HASHMAP_INITIAL_CAPACITY, + this.masterHashMap = new HashMap<>(ErtlFunctionalGroupsFinderEvaluationTest.MASTER_HASHMAP_INITIAL_CAPACITY, ErtlFunctionalGroupsFinderEvaluationTest.MASTER_HASHMAP_LOAD_FACTOR); this.settingsKeysList = new LinkedList<>(); this.exceptionsCounter = 0; @@ -816,7 +824,7 @@ private void initialize(boolean aShouldPrintHeader, String aTestIdentifier) { for (int i = 0; i < tmpMetalNumbersStrings.length; i++) { tmpMetalNumbersInt[i] = Integer.parseInt(tmpMetalNumbersStrings[i]); } - this.nonMetallicAtomicNumbersSet = new HashSet(Arrays.asList(tmpMetalNumbersInt)); + this.nonMetallicAtomicNumbersSet = new HashSet<>(Arrays.asList(tmpMetalNumbersInt)); this.pseudoSmilesAromaticElementToPlaceholderElementMap = new HashMap<>(10, 1); this.pseudoSmilesAromaticElementToPlaceholderElementMap.put("C", "Ce"); this.pseudoSmilesAromaticElementToPlaceholderElementMap.put("N", "Nd"); @@ -848,30 +856,36 @@ private void initialize(boolean aShouldPrintHeader, String aTestIdentifier) { /** * Initializes all class variables and determines the output directory. 
* - * @param anSDFilePath absolute path of the SD file to analyze for a quick pre-check if it is present and the test - * is therefore meant to run; may be empty but not null + * @param anSDFileResourceName name of the SD file to analyse which must be situated in the test resources folder * @param aTestIdentifier a folder with this name will be created in the output directory and it will be added to * the output and log files' names for association of test and files; may be null or empty * @throws java.lang.Exception if one the FileWriter instances can not be instantiated, more than * Integer.MAX-VALUE tests are to be run this minute (error in the naming of output files), aPathOfSDFile is null or * an unexpected exception occurs. */ - private void initializeWithFileOperations(String anSDFilePath, String aTestIdentifier) throws Exception { + private void initializeWithFileOperations(String anSDFileResourceName, String aTestIdentifier) throws Exception { System.out.println("\n#########################################################################\n"); System.out.println("Starting new test, identifier: " + aTestIdentifier); System.out.println("\nInitializing class variables..."); this.isTestAbleToRun = true; //First, check if the SD file is present and ignore test if it is not - File tmpSDFile = new File(anSDFilePath); - if (!tmpSDFile.canRead() || tmpSDFile.getAbsoluteFile().getParent() == null) { - System.out.println("\n\tUnable to find or read a file with path \"" + anSDFilePath + "\" or to get its parent directory."); - System.out.println("\nTest is ignored."); + File tmpSDFile = null; + try { + tmpSDFile = new File(ErtlFunctionalGroupsFinderEvaluationTest.class.getResource(anSDFileResourceName).getPath()); + } catch (Exception e) { + this.isTestAbleToRun = false; + } + if (!tmpSDFile.canRead()) { this.isTestAbleToRun = false; + } + if (!this.isTestAbleToRun) { + System.out.println("\n\tUnable to find or read a file with path \"" + anSDFileResourceName + "\"."); 
+ System.out.println("\nTest is ignored."); Assumptions.assumeTrue(false); return; } //Determine the output directory - String tmpOutputRootDirectory = tmpSDFile.getAbsoluteFile().getParent() + File.separator; + String tmpOutputRootDirectory = new File("").getAbsolutePath() + File.separator; this.outputDirectory = tmpOutputRootDirectory + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FOLDER_FROM_WORKING_DIRECTORY + File.separator @@ -887,9 +901,9 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent ErtlFunctionalGroupsFinderEvaluationTest.DATE_TIME_FORMAT_PATTERN)); //Set up exceptions log file File tmpExceptionsLogFile = new File(this.outputDirectory + File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_TYPE); int tmpFilesInThisMinuteCounter = 1; @@ -899,9 +913,9 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent tmpNumberAddedToFileName = true; while (tmpFilesInThisMinuteCounter <= Integer.MAX_VALUE) { tmpExceptionsLogFile = new File(this.outputDirectory + File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + "(" + tmpFilesInThisMinuteCounter + ")" + ErtlFunctionalGroupsFinderEvaluationTest.EXCEPTIONS_LOG_FILE_TYPE); 
@@ -924,16 +938,16 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent File tmpFilteredMoleculesFile; if (tmpNumberAddedToFileName) { tmpFilteredMoleculesFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + "(" + tmpFilesInThisMinuteCounter + ")" + ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_TYPE); } else { tmpFilteredMoleculesFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + ErtlFunctionalGroupsFinderEvaluationTest.FILTERED_MOLECULES_FILE_TYPE); } @@ -946,17 +960,17 @@ private void initializeWithFileOperations(String anSDFilePath, String aTestIdent File tmpOutputFile; if (tmpNumberAddedToFileName) { tmpOutputFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + "(" + tmpFilesInThisMinuteCounter + ")" + 
ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_TYPE); } else { tmpOutputFile = new File(this.outputDirectory+ File.separator + aTestIdentifier - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_NAME - + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPERATOR + + ErtlFunctionalGroupsFinderEvaluationTest.FILE_NAME_ADDITION_SEPARATOR + tmpDateTimeAddition + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_TYPE); } @@ -1077,9 +1091,10 @@ private void calculateAbsoluteFGFrequencies( } try { aReader.close(); - } catch (IOException anIOException) { } - //Since the filters remain the same in every iteration filtered molecules must be logged only once - //(assuming that only one SD file is analyzed in a test) + } catch (IOException anIOException) { + //Since the filters remain the same in every iteration filtered molecules must be logged only once + //(assuming that only one SD file is analyzed in a test) + } if (!this.areFilteredMoleculesLogged) { this.areFilteredMoleculesLogged = true; } @@ -1275,7 +1290,7 @@ private void enterFunctionalGroupsIntoMasterMap( } //Case: functional group is already in the master HashMap if (this.masterHashMap.containsKey(tmpHashCode)) { - HashMap tmpInnerMap = (HashMap)this.masterHashMap.get(tmpHashCode); + HashMap tmpInnerMap = this.masterHashMap.get(tmpHashCode); //And a key-value pair for this settings key is already present too -> raise frequency by one if (tmpInnerMap.containsKey(aSettingsKey)) { int tmpFrequency = (int)tmpInnerMap.get(aSettingsKey); @@ -1287,7 +1302,7 @@ private void enterFunctionalGroupsIntoMasterMap( } //The functional group did not occur before -> create a new inner HashMap for this molecule } else { - HashMap tmpNewInnerMap = new HashMap( + HashMap tmpNewInnerMap = new HashMap<>( 
ErtlFunctionalGroupsFinderEvaluationTest.INNER_HASHMAPS_INITIAL_CAPACITY); tmpNewInnerMap.put(ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_KEY, anFGContainingMolecule); tmpNewInnerMap.put(aSettingsKey, 1); @@ -1328,51 +1343,50 @@ private void saveData() { } System.out.println("\nWriting to file..."); //Writing the output file's header - String tmpFileHeader = ErtlFunctionalGroupsFinderEvaluationTest.HASH_CODE_KEY - + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + StringBuilder tmpFileHeaderBuilder = new StringBuilder(ErtlFunctionalGroupsFinderEvaluationTest.HASH_CODE_KEY + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.PSEUDO_SMILES_CODE_KEY - + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR - + ErtlFunctionalGroupsFinderEvaluationTest.SMILES_CODE_KEY; + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + + ErtlFunctionalGroupsFinderEvaluationTest.SMILES_CODE_KEY); for (String tmpSettingsKey : this.settingsKeysList) { - tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpSettingsKey; + tmpFileHeaderBuilder.append(ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR).append(tmpSettingsKey); } - tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + String tmpFileHeader = tmpFileHeaderBuilder.toString(); + tmpFileHeader += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_KEY; this.dataOutputPrintWriter.println(tmpFileHeader); this.dataOutputPrintWriter.flush(); - Iterator tmpFunctionalGroupsIterator = this.masterHashMap.keySet().iterator(); + Iterator tmpFunctionalGroupsIterator = this.masterHashMap.keySet().iterator(); //Iteration for all molecules in the master HashMap while (tmpFunctionalGroupsIterator.hasNext()) { - long tmpHashCode = (long)tmpFunctionalGroupsIterator.next(); - HashMap tmpInnerMap 
= (HashMap)this.masterHashMap.get(tmpHashCode); + long tmpHashCode = tmpFunctionalGroupsIterator.next(); + HashMap tmpInnerMap = this.masterHashMap.get(tmpHashCode); String tmpSmilesCode = (String) tmpInnerMap.get(ErtlFunctionalGroupsFinderEvaluationTest.SMILES_CODE_KEY); String tmpPseudoSmilesCode = (String) tmpInnerMap.get(ErtlFunctionalGroupsFinderEvaluationTest.PSEUDO_SMILES_CODE_KEY); //Writing the record for this functional group - String tmpRecord = tmpHashCode - + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + StringBuilder tmpRecordBuilder = new StringBuilder(tmpHashCode + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpPseudoSmilesCode - + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR - + tmpSmilesCode; + + ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + + tmpSmilesCode); for (String tmpSettingsKey : this.settingsKeysList) { - if (tmpInnerMap.get(tmpSettingsKey) == null) { - tmpInnerMap.put(tmpSettingsKey, 0); - } - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR - + tmpInnerMap.get(tmpSettingsKey); + tmpInnerMap.putIfAbsent(tmpSettingsKey, 0); + tmpRecordBuilder.append(ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR).append(tmpInnerMap.get(tmpSettingsKey)); } + String tmpRecord = tmpRecordBuilder.toString(); IAtomContainer tmpMoleculeOfOrigin = (IAtomContainer)tmpInnerMap.get( ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_KEY); String tmpChebiId = tmpMoleculeOfOrigin.getProperty("ChEBI ID"); String tmpChemblId = tmpMoleculeOfOrigin.getProperty("chembl_id"); String tmpCdkTitle = tmpMoleculeOfOrigin.getProperty(CDKConstants.TITLE); if (tmpChebiId != null) { - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpChebiId; + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpChebiId; } else if (tmpChemblId != null) { - tmpRecord += 
ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpChemblId; + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpChemblId; } else if (tmpCdkTitle != null) { - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpCdkTitle; + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + tmpCdkTitle; } else { - tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPERATOR + tmpRecord += ErtlFunctionalGroupsFinderEvaluationTest.OUTPUT_FILE_SEPARATOR + ErtlFunctionalGroupsFinderEvaluationTest.MOLECULE_OF_ORIGIN_ID_PLACEHOLDER; } this.dataOutputPrintWriter.println(tmpRecord); @@ -1443,7 +1457,7 @@ private String getPseudoSmilesCode(IAtomContainer aMolecule) throws CDKException */ private void logFilteredMolecule(IAtomContainer aMolecule, int aCounter, String aCause) { if(!this.areFileOperationsActivated) { - System.out.println("\nFile operations are not activated, invokation of logFilteredMolecule() is therefore not possible."); + System.out.println("\nFile operations are not activated, invocation of logFilteredMolecule() is therefore not possible."); return; } this.filteredMoleculesPrintWriter.println(); @@ -1481,7 +1495,7 @@ private void logFilteredMolecule(IAtomContainer aMolecule, int aCounter, String */ private void logException(Exception anException, String aSettingsKey, IAtomContainer aMolecule) { if(!this.areFileOperationsActivated) { - System.out.println("\nFile operations are not activated, invokation of logException() is therefore not possible."); + System.out.println("\nFile operations are not activated, invocation of logException() is therefore not possible."); return; } this.exceptionsCounter++; diff --git a/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java new file mode 100644 index 0000000..5319b61 --- /dev/null +++ 
b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderTest.java @@ -0,0 +1,746 @@ +/* + * ErtlFunctionalGroupsFinder for CDK + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * + * Source code is available at + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package org.openscience.cdk.tools; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.openscience.cdk.AtomContainer; +import org.openscience.cdk.PseudoAtom; +import org.openscience.cdk.aromaticity.Aromaticity; +import org.openscience.cdk.aromaticity.ElectronDonation; +import org.openscience.cdk.exception.InvalidSmilesException; +import org.openscience.cdk.graph.Cycles; +import org.openscience.cdk.interfaces.IAtom; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.interfaces.IBond.Order; +import org.openscience.cdk.interfaces.IChemObjectBuilder; +import org.openscience.cdk.interfaces.IPseudoAtom; +import org.openscience.cdk.isomorphism.Mappings; +import org.openscience.cdk.isomorphism.Pattern; +import org.openscience.cdk.isomorphism.VentoFoggia; +import org.openscience.cdk.silent.SilentChemObjectBuilder; +import org.openscience.cdk.smiles.SmiFlavor; +import org.openscience.cdk.smiles.SmilesGenerator; +import 
org.openscience.cdk.smiles.SmilesParser; +import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; + +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * Test for ErtlFunctionalGroupsFinder. + * + * @author Sebastian Fritsch, Jonas Schaub + * @version 1.3 + */ +public class ErtlFunctionalGroupsFinderTest { + /** + * Constructor. + */ + public ErtlFunctionalGroupsFinderTest() { + super(); + } + // + /** + * Example code to be used in the GitHub wiki of the project. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void gitHubWikiTest() throws Exception { + //Prepare input + SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer tmpInputMol = tmpSmiPar.parseSmiles("C[C@@H]1CN(C[C@H](C)N1)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F"); //PubChem CID 5257 + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpInputMol); + Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); + tmpAromaticity.apply(tmpInputMol); + //Identify functional groups + ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(); //default: generalization turned on + List tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); + for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { + String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); + System.out.println(tmpSmilesString); + } + //non-generalized functional groups + System.out.println("----------------"); + tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); + for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { + String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); + System.out.println(tmpSmilesString); + } 
+ } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind1() throws Exception { + String tmpMoleculeSmiles = "Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])S(=O)(=O)[R]", "[c]N(H)H", "NarR3", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind2() throws Exception { + String tmpMoleculeSmiles = "NC(=N)c1ccc(\\\\C=C\\\\c2ccc(cc2O)C(=N)N)cc1"; + String[] tmpExpectedFGs = new String[] {"[R]N=C-N([R])[R]", "[C]=[C]", "[c]OH", "[R]N=C-N([R])[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind3() throws Exception { + String tmpMoleculeSmiles = "CC(=O)Nc1nnc(s1)S(=O)(=O)N"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]S(=O)(=O)N([R])[R]", "NarR3", "NarR3", "SarR2"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind4() throws Exception { + String tmpMoleculeSmiles = "NS(=O)(=O)c1cc2c(NCNS2(=O)=O)cc1Cl"; + String[] tmpExpectedFGs = new String[] {"[R]S(=O)(=O)N([R])[R]", "[R]S(=O)(=O)N([R])[C]N([R])[R]", "[R]Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind5() throws Exception { + String tmpMoleculeSmiles = "CNC1=Nc2ccc(Cl)cc2C(=N(=O)C1)c3ccccc3"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])[C]=N[R]", "[R]Cl", "[R]N(=O)=[C]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind6() throws Exception { + String tmpMoleculeSmiles = "Cc1onc(c2ccccc2)c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](N4C3=O)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"O=C([R])N([R])[R]", "O=C([R])N([R])[C]S[R]", "O=C([R])OH", "OarR2", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind7() throws Exception { + String tmpMoleculeSmiles = "Clc1ccccc1C2=NCC(=O)Nc3ccc(cc23)N(=O)=O"; + String[] tmpExpectedFGs = new String[] {"[R]Cl", "[R]N=[C]", "[R]C(=O)N([R])[R]", "O=N([R])=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind8() throws Exception { + String tmpMoleculeSmiles = "COc1cc(cc(C(=O)NCC2CCCN2CC=C)c1OC)S(=O)(=O)N"; + String[] tmpExpectedFGs = new String[] {"[R]O[R]", "[R]N([R])C(=O)[R]", "N([R])([R])[R]", "[C]=[C]", "[R]O[R]", "[R]S(=O)(=O)N([R])[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind9() throws Exception { + String tmpMoleculeSmiles = "Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl"; + String[] tmpExpectedFGs = new String[] {"[R]Cl", "[R]N(H)[R]", "O=C(OH)[R]", "[R]Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind10() throws Exception { + String tmpMoleculeSmiles = "Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1"; + String[] tmpExpectedFGs = new String[] {"[R]Cl", "[R]O[R]", "[R]N([R])[C]=N[R]", "[R]N([H])[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind11() throws Exception { + String tmpMoleculeSmiles = "FC(F)(F)CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13"; + String[] tmpExpectedFGs = new String[] {"[R]F", "[R]F", "[R]F", "O=C([R])N([R])[R]", "[R]N=[C]", "[R]Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind12() throws Exception { + String tmpMoleculeSmiles = "OC[C@H]1O[C@H](C[C@@H]1O)n2cnc3[C@H](O)CNC=Nc23"; + String[] tmpExpectedFGs = new String[] {"[C]O[H]", "[R]O[R]", "[C]OH", "[C]OH", "[R]N=CN([R])[R]", "NarR3", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind13() throws Exception { + String tmpMoleculeSmiles = "CCN[C@H]1C[C@H](C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])H", "O=S(=O)([R])[R]", "[R]S(=O)(=O)N([R])[R]", "SarR2"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind14() throws Exception { + String tmpMoleculeSmiles = "C[C@@H](O)[C@@H]1[C@H]2[C@@H](C)C(=C(N2C1=O)C(=O)O)S[C@@H]3CN[C@@H](C3)C(=O)N(C)C"; + String[] tmpExpectedFGs = new String[] {"[C]O[H]", "O=C([R])N([R])C(C(=O)(OH))=[C]S[R]", "[R]N(H)[R]", "[R]N([R])C([R])=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind15() throws Exception { + String tmpMoleculeSmiles = "C[C@@H]1CN(C[C@H](C)N1)c2c(F)c(N)c3C(=O)C(=CN(C4CC4)c3c2F)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])[R]", "[R]N([H])[R]", "[R]F", "[c]N(H)H", "[c]=O", "[R]F", "[R]C(=O)OH", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind16() throws Exception { + String tmpMoleculeSmiles = "CC(=CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)C"; + String[] tmpExpectedFGs = new String[] {"[C]=[C]", "[R]C(=O)N([R])N([R])C(=O)[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind17() throws Exception { + String tmpMoleculeSmiles = "Clc1ccc2N=C3NC(=O)CN3Cc2c1Cl"; + String[] tmpExpectedFGs = new String[] {"Cl[R]", "[R]N=C(N([R])[R])N([R])C(=O)[R]", "Cl[R]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind18() throws Exception { + String tmpMoleculeSmiles = "CC(=O)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](O)[C@H](O)CO)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]N([R])C(=N[R])N([R])[R]", "O=C(OH)C(=[C])O[R]" , "[C]OH", "[C]OH", "[C]OH"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. 
+ * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind19() throws Exception { + String tmpMoleculeSmiles = "C[C@H](O)[C@H](O)[C@H]1CNc2nc(N)nc(O)c2N1"; + String[] tmpExpectedFGs = new String[] {"[C]OH", "[C]OH", "[R]N(H)[R]" , "[c]N(H)H", "[c]OH", "[R]N(H)[R]", "NarR3", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule taken from Figure 1 of the original Ertl algorithm article. + * + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + @Test + public void testFind20() throws Exception { + String tmpMoleculeSmiles = "N[C@@H]1CCCCN(C1)c2c(Cl)cc3C(=O)C(=CN(C4CC4)c3c2Cl)C(=O)O"; + String[] tmpExpectedFGs = new String[] {"[C]N([H])[H]", "[R]N([R])[R]", "[R]Cl" , "[c]=O", "[R]Cl", "[R]C(=O)OH", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule. Specifically, the extraction of only the marked atoms + * in a functional group is tested. This feature was added in a later version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testOnlyMarkedAtoms1() throws Exception { + String tmpMoleculeSmiles = "CCO[Si](OCC)(OCC)OCC"; //Tetraethyl Orthosilicate + String[] tmpExpectedFGs = new String[]{"[O][Si]([O])([O])[O]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + } + // + /** + * Tests correct functional group identification on an example molecule. Specifically, the extraction of only the marked atoms + * in a functional group is tested. This feature was added in a later version. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testOnlyMarkedAtoms2() throws Exception { + String tmpMoleculeSmiles = "Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"; //same mol as testFind1() from the Ertl figure + String[] tmpExpectedFGs = new String[] {"O=[S](=O)[NH]", "[NH2]", "Nar" , "Nar"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + } + // + /** + * Tests correct functional group identification on an example molecule. Specifically, the extraction of only the marked atoms + * in a functional group is tested. This feature was added in a later version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testOnlyMarkedAtoms3() throws Exception { + String tmpMoleculeSmiles = "CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"; //CHEMBL1201736 + String[] tmpExpectedFGs = new String[] {"[O]N=[C]C(=O)[NH]", "[C]=C(C(=O)[O-])N([C]=O)[CH][S]", "[N+]", "[NH2]", "Cl", "Sar", "Nar"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.ONLY_MARKED_ATOMS); + } + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules1() throws Exception { + String tmpMoleculeSmiles = "CC(=O)OC1=CC=CC=C1C(=O)[O+]"; //charged ASA + String[] tmpExpectedFGs = new String[] {"*OC(*)=O", "*C(=O)[O+]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules2() throws Exception { + String tmpMoleculeSmiles = "C1=CC(=CC=C1[N+](=O)[O-])O"; //Nitrophenol + String[] tmpExpectedFGs = new String[] {"*[N+](=O)[O-]", "[H]O[c]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules3() throws Exception { + String tmpMoleculeSmiles = "C[N+](C)(C)C"; //Tetramethylammonium + String[] tmpExpectedFGs = new String[] {"*[N+](*)(*)*"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with formal charges. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testChargedMolecules4() throws Exception { + String tmpMoleculeSmiles = "c1ccccc1[CH+]C(Br)C"; //Carbenium ion in beta position to Br + // carbenium ion is ignored since a charge is not a reason to mark carbon atom + String[] tmpExpectedFGs = new String[] {"[C]Br"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + + tmpMoleculeSmiles = "c1ccccc1[CH+]C(Br)C"; //Carbenium ion in beta position to Br + // carbenium ion is ignored since a charge is not a reason to mark carbon atom + tmpExpectedFGs = new String[] {"[C]Br"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + + tmpMoleculeSmiles = "c1ccccc1[C+](Br)C"; //Carbenium ion in alpha position to Br + // carbenium ion is extracted as 
environmental carbon and replaced by a new atom instance as all env carbon atoms in EFGF; so it lost its charge! + tmpExpectedFGs = new String[] {"[C]Br"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); + } + // + /** + * Tests correct functional group identification on an example molecule with a disconnected structure. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testDisconnectedMolecules1() throws Exception { + String tmpMoleculeSmiles = "CC(=O)O.CC(=O)O.C1=CC(=CC=C1NC(=NC(=NCCCCCCN=C(N)N=C(N)NC2=CC=C(C=C2)Cl)N)N)Cl"; //Chlorhexidine Diacetate + String[] tmpExpectedFGs = new String[] {"*C(=O)O[H]", "*C(=O)O[H]", "*N=C(N=C(N(*)*)N(*)*)N(*)*", "*N=C(N=C(N(*)*)N(*)*)N(*)*", "*Cl", "*Cl"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with a disconnected structure. + * This was not allowed in a previous version. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testDisconnectedMolecules2() throws Exception { + String tmpMoleculeSmiles = "C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"; //Sodium edetate + String[] tmpExpectedFGs = new String[] {"*N(*)*", "*C(=O)[O-]", "*C(=O)[O-]", "*N(*)*", "*C(=O)[O-]", "*C(=O)[O-]", "[Na+]", "[Na+]", "[Na+]", "[Na+]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with metal/metalloid atoms. + * + * Note: all atoms are marked as hetero atoms by EFGF that are not H or C. So, metals and metalloids get treated like + * any other hetero atom. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testMetalsMetalloids1() throws Exception { + String tmpMoleculeSmiles = "CCO[Si](OCC)(OCC)OCC"; //Tetraethyl Orthosilicate + String[] tmpExpectedFGs = new String[]{"*O[Si](O*)(O*)O*"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with metal/metalloid atoms. + * + * Note: all atoms are marked as hetero atoms by EFGF that are not H or C. So, metals and metalloids get treated like + * any other hetero atom. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testMetalsMetalloids2() throws Exception { + String tmpMoleculeSmiles = "O.O.O=[Al]O[Si](=O)O[Si](=O)O[Al]=O"; //Kaolin + String[] tmpExpectedFGs = new String[]{"*O*", "*O*", "O=[Al]O[Si](=O)O[Si](=O)O[Al]=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests correct functional group identification on an example molecule with pseudo (R) atoms. + * + * Note: these pseudo (R) atoms are simply ignored by EFGF. + * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testRAtoms1() throws Exception { + String tmpMoleculeSmiles = "OCC(CO[*])OC([*])=O"; //CHEBI:598 + String[] tmpExpectedFGs = new String[]{"[H]O[C]", "[C][O]", "*O[C]=O"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Tests that a former bug concerning unconnected, explicit hydrogen atoms does not occur anymore. 
+ * + * @throws Exception if anything goes wrong + * @author Jonas Schaub + */ + @Test + public void testHydrogenBug() throws Exception { + String tmpMoleculeSmiles = "[H+].[H+].[O-]C(=O)\\C=C/C([O-])=O.[H][C@@]12Cc3c[nH]c4cccc(C1=C[C@@H](COC(=O)C1CCCCC1)CN2C)c34"; //CHEBI:365445 + String[] tmpExpectedFGs = new String[]{"O=C([O-])[C]=[C]C(=O)[O-]", "[C]=[C]", "*OC(*)=O", "[R]N([R])[R]", "NarR3"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + + tmpMoleculeSmiles = "[HH].O=C1N([C@H](C)C(C1=C(O)[C@]2([C@]3([C@H](C=C([C@H]2[C@@H](C(=O)O)CC)C)C[C@H](C)CC3)C)C)=O)C"; //CHEBI:223373 + tmpExpectedFGs = new String[]{"*C(=O)C(=[C]O[H])C(=O)N(*)*", "[C]=[C]", "*C(=O)O[H]"}; + this.testFind(tmpMoleculeSmiles, tmpExpectedFGs); + } + // + /** + * Applies EFGF to detect functional groups in the given molecule and compares the identified FG to the given + * expected FG, using i.a. an identity search. Note that the order of the given FG must match the order of the detected + * FG. The expected FG can contain pseudo-SMILES code for some specific cases, where aromatic atoms are marked using + * "-ar" and pseudo-atoms (R) can be included. Uses the electron donation model daylight and the cycle finder "all" + * for aromaticity detection in the input molecule. + * + * @param aMoleculeSmiles input molecule to detect FG in + * @param anExpectedFGPseudoSmilesArray expected FG + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + private void testFind(String aMoleculeSmiles, String[] anExpectedFGPseudoSmilesArray) throws Exception { + this.testFind(aMoleculeSmiles, anExpectedFGPseudoSmilesArray, new Aromaticity(ElectronDonation.daylight(), Cycles.all()), + ErtlFunctionalGroupsFinder.Mode.DEFAULT); + } + // + /** + * Applies EFGF to detect functional groups in the given molecule and compares the identified FG to the given + * expected FG, using i.a. an identity search. Note that the order of the given FG must match the order of the detected + * FG. 
The expected FG can contain pseudo-SMILES code for some specific cases, where aromatic atoms are marked using + * "-ar" and pseudo-atoms (R) can be included. The given aromaticity model is used for preprocessing the input molecule. + * + * @param aMoleculeSmiles input molecule to detect FG in + * @param anExpectedFGPseudoSmilesArray expected FG + * @param anAromaticityModel for aromaticity detection in preprocessing of the input molecule + * @param aFunctionalGroupEnvironmentMode to configure the EFGF used here + * @throws Exception if anything goes wrong + * @author Sebastian Fritsch + */ + private void testFind(String aMoleculeSmiles, String[] anExpectedFGPseudoSmilesArray, Aromaticity anAromaticityModel, + ErtlFunctionalGroupsFinder.Mode aFunctionalGroupEnvironmentMode) + throws Exception { + // prepare input + SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer tmpMolecule = tmpSmilesParser.parseSmiles(aMoleculeSmiles); + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpMolecule); + anAromaticityModel.apply(tmpMolecule); + // find functional groups + ErtlFunctionalGroupsFinder tmpFGFinder = new ErtlFunctionalGroupsFinder(aFunctionalGroupEnvironmentMode); + List tmpFunctionalgroupsList = tmpFGFinder.find(tmpMolecule); + // get expected groups + List tmpExpectedFGs = new LinkedList<>(); + for (String tmpFGString : anExpectedFGPseudoSmilesArray) { + tmpExpectedFGs.add(this.buildFunctionalGroup(tmpFGString)); + } + // compare + this.assertIsomorphism(tmpExpectedFGs, tmpFunctionalgroupsList); + } + // + /** + * Asserts the isomorphism between two lists of functional group atom containers. Compares their sizes, atom counts, + * bond counts, performs an identity match using the Vento-Foggia algorithm, and checks that aromaticity annotations + * match for the atoms and bonds. + * NOTE: actual and expected functional groups must be in the same order! 
+ * + * @param anExpectedFGsList list of expected functional groups + * @param anActualFGsList list of actual functional groups + * @author Sebastian Fritsch + */ + private void assertIsomorphism(List anExpectedFGsList, List anActualFGsList) { + Assertions.assertEquals(anExpectedFGsList.size(), anActualFGsList.size(), + "Number of functional groups does not match the expected number of groups"); + for (int i = 0; i < anExpectedFGsList.size(); i++) { + IAtomContainer tmpExpectedFG = anExpectedFGsList.get(i); + IAtomContainer tmpActualFG = anActualFGsList.get(i); + Assertions.assertEquals(tmpExpectedFG.getAtomCount(), tmpActualFG.getAtomCount(), + "Groups #" + i + ": different atom count"); + Assertions.assertEquals(tmpExpectedFG.getBondCount(), tmpActualFG.getBondCount(), + "Groups #" + i + ": different bond count"); + Pattern tmpExpectedFGPattern = VentoFoggia.findIdentical(tmpExpectedFG); + Assertions.assertTrue(tmpExpectedFGPattern.matches(tmpActualFG), "Groups #" + i + ": not isomorphic"); + Mappings tmpExpFGinActFGmappings = tmpExpectedFGPattern.matchAll(tmpActualFG); + Map tmpAtomMap = tmpExpFGinActFGmappings.toAtomMap().iterator().next(); + for (Map.Entry tmpMapEntry : tmpAtomMap.entrySet()) { + IAtom tmpExpectedAtom = tmpMapEntry.getKey(); + IAtom tmpActualAtom = tmpMapEntry.getValue(); + Assertions.assertEquals(tmpExpectedAtom.isAromatic(), tmpActualAtom.isAromatic(), + "Groups #" + i + ": Atom aromaticity does not match (" + + tmpActualAtom.getSymbol() + tmpActualAtom.isAromatic() + + ":" + + tmpExpectedAtom.getSymbol() + tmpExpectedAtom.isAromatic() + + ")"); + } + Map tmpBondMap = tmpExpFGinActFGmappings.toBondMap().iterator().next(); + for (Map.Entry tmpMapEntry : tmpBondMap.entrySet()) { + IBond tmpExpectedBond = tmpMapEntry.getKey(); + IBond tmpActualBond = tmpMapEntry.getValue(); + Assertions.assertEquals(tmpExpectedBond.isAromatic(), tmpActualBond.isAromatic(), + "Groups #" + i + ": Bond aromaticity does not match"); + } + } + } + // + /** + * 
Constructs a functional group atom container object from a given SMILES or pseudo-SMILES code. + * Pseudo-SMILES codes have aromatic atoms marked by "-ar", e.g. "Nar", and contain pseudo-atoms given as "R". + * But the only available cases here are "NarR3", "SarR2", and "OarR2". There is no general treatment of any pseudo-SMILES + * code! If the given string does not match any of the given three templates, it has to be a valid SMILES string! + * + * @param aFunctionalGroupPseudoSmiles SMILES code or specific pseudo-SMILES code + * @return functional group atom container built from the given code + * @author Sebastian Fritsch + */ + private IAtomContainer buildFunctionalGroup(String aFunctionalGroupPseudoSmiles) { + IAtom a1, a2, a3, a4, a5, a6, a7, a8, a9; + IBond b1, b2, b3, b4, b5, b6, b7, b8, b9; + IChemObjectBuilder tmpBuilder = SilentChemObjectBuilder.getInstance(); + IAtomContainer tmpFunctionalGroup; + // custom templates: + switch (aFunctionalGroupPseudoSmiles) { + case "NarR3": + a1 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a2 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a3 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a4 = tmpBuilder.newInstance(IAtom.class, "N"); + a4.setIsAromatic(true); + + b1 = tmpBuilder.newInstance(IBond.class, a1, a4, Order.SINGLE); + b2 = tmpBuilder.newInstance(IBond.class, a2, a4, Order.SINGLE); + b3 = tmpBuilder.newInstance(IBond.class, a3, a4, Order.SINGLE); + + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1, a2, a3, a4}); + tmpFunctionalGroup.setBonds(new IBond[] {b1, b2, b3}); + return tmpFunctionalGroup; + case "SarR2": + a1 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a2 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a3 = tmpBuilder.newInstance(IAtom.class, "S"); + a3.setIsAromatic(true); + + b1 = tmpBuilder.newInstance(IBond.class, a1, a3, Order.SINGLE); + b2 = tmpBuilder.newInstance(IBond.class, a2, a3, Order.SINGLE); + + tmpFunctionalGroup = new 
AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1, a2, a3}); + tmpFunctionalGroup.setBonds(new IBond[] {b1, b2}); + return tmpFunctionalGroup; + case "OarR2": + a1 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a2 = tmpBuilder.newInstance(IPseudoAtom.class, "R"); + a3 = tmpBuilder.newInstance(IAtom.class, "O"); + a3.setIsAromatic(true); + + b1 = tmpBuilder.newInstance(IBond.class, a1, a3, Order.SINGLE); + b2 = tmpBuilder.newInstance(IBond.class, a2, a3, Order.SINGLE); + + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1, a2, a3}); + tmpFunctionalGroup.setBonds(new IBond[] {b1, b2}); + return tmpFunctionalGroup; + case "Nar": + a1 = tmpBuilder.newInstance(IAtom.class, "N"); + a1.setIsAromatic(true); + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1}); + return tmpFunctionalGroup; + case "Sar": + a1 = tmpBuilder.newInstance(IAtom.class, "S"); + a1.setIsAromatic(true); + tmpFunctionalGroup = new AtomContainer(); + tmpFunctionalGroup.setAtoms(new IAtom[] {a1}); + return tmpFunctionalGroup; + default: + // treat as normal SMILES code + try { + SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); + try { + if (aFunctionalGroupPseudoSmiles.equals("[c]=O")) { + tmpSmilesParser.kekulise(false); + } + tmpFunctionalGroup = tmpSmilesParser.parseSmiles(aFunctionalGroupPseudoSmiles); + } catch(InvalidSmilesException e) { + tmpSmilesParser.kekulise(false); + tmpFunctionalGroup = tmpSmilesParser.parseSmiles(aFunctionalGroupPseudoSmiles); + } + for(IAtom a : tmpFunctionalGroup.atoms()) { + if (a instanceof PseudoAtom) { + a.setSymbol("R"); + } + } + return tmpFunctionalGroup; + } catch(InvalidSmilesException e) { + throw new IllegalArgumentException("Input string '" + aFunctionalGroupPseudoSmiles + " could not be found as a template " + + "and is not a valid SMILES string."); + } + } + } +} diff --git 
a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderUtilityTest.java b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java similarity index 85% rename from src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderUtilityTest.java rename to src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java index 9611ffb..4f46b75 100644 --- a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderUtilityTest.java +++ b/src/test/java/org/openscience/cdk/tools/ErtlFunctionalGroupsFinderUtilityTest.java @@ -1,6 +1,6 @@ /* * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny + * Copyright (c) 2024 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny * * Source code is available at * @@ -18,7 +18,7 @@ * along with this program. If not, see . */ -package org.openscience.cdk.tools.test; +package org.openscience.cdk.tools; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -30,10 +30,7 @@ import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinderUtility; -import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.List; @@ -74,7 +71,7 @@ public void testPseudoSmilesGeneration() throws Exception { Assertions.assertEquals(tmpTestPairsMap.get(tmpSmilesCode), tmpPseudoSmilesCode); } } - + // /** * Test for correct MoleculeHashGenerator settings/performance on some examples. 
* @@ -83,7 +80,7 @@ public void testPseudoSmilesGeneration() throws Exception { @Test public void testMoleculeHashGeneratorSettings() throws Exception { SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - ErtlFunctionalGroupsFinder tmpGeneralizingEFGF = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpGeneralizingEFGF = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderGeneralizingMode(); MoleculeHashGenerator tmpHashGenerator = ErtlFunctionalGroupsFinderUtility.getFunctionalGroupHashGenerator(); /*Chebi70986, Chebi16238 and Chebi57692 all contain the same functional group with pseudo SMILES code "O=C1N=C(C(=NR)C(=O)N1R)N(R)R", but different hybridizations in the resulting atom containers. But their hash @@ -146,7 +143,7 @@ public void testMoleculeHashGeneratorSettings() throws Exception { Assertions.assertEquals(tmpHashGenerator.generate(tmpKeyMol), tmpHashGenerator.generate(tmpValueMol)); } } - + // /** * Test for correct preprocessing (neutralization of charges and selection of biggest fragment). * @@ -163,7 +160,7 @@ public void testPreprocessing() throws Exception { SmilesGenerator tmpGenerator = new SmilesGenerator(SmiFlavor.Unique); Assertions.assertEquals("OCC", tmpGenerator.create(tmpMol)); } - + // /** * Tests the restoration of environmental carbon atom objects on one example molecule. Nothing is asserted here, it * is meant for visual inspection. 
@@ -176,7 +173,7 @@ public void testRestorationOfEnvironmentalCarbons() throws Exception { SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Unique); //Adenophostin B, COCONUT ID CNP0214672 IAtomContainer tmpMolecule = tmpSmiPar.parseSmiles("O=C(OCC1OC(OC2C(OC(N3C=NC=4C(=NC=NC43)N)C2OP(=O)(O)O)CO)C(O)C(OP(=O)(O)O)C1OP(=O)(O)O)C"); - ErtlFunctionalGroupsFinder tmpEFGFFullEnv = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderNotGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpEFGFFullEnv = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderFullEnvironmentMode(); tmpMolecule = ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(tmpMolecule, Aromaticity.cdkLegacy()); List tmpFGList = tmpEFGFFullEnv.find(tmpMolecule, false); System.out.println("FGs with full environment returned by EFGF:"); @@ -194,7 +191,7 @@ public void testRestorationOfEnvironmentalCarbons() throws Exception { System.out.println(tmpSmiGen.create(tmpFG)); } tmpMolecule = tmpSmiPar.parseSmiles("O=C(OCC1OC(OC2C(OC(N3C=NC=4C(=NC=NC43)N)C2OP(=O)(O)O)CO)C(O)C(OP(=O)(O)O)C1OP(=O)(O)O)C"); - ErtlFunctionalGroupsFinder tmpEFGFgeneralized = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpEFGFgeneralized = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderGeneralizingMode(); tmpMolecule = ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(tmpMolecule, Aromaticity.cdkLegacy()); tmpFGList = tmpEFGFgeneralized.find(tmpMolecule, false); System.out.println("FGs with generalized environment returned by EFGF:"); @@ -212,7 +209,7 @@ public void testRestorationOfEnvironmentalCarbons() throws Exception { System.out.println(tmpSmiGen.create(tmpFG)); } } - + // /** * Imports a charged molecule with a counter-ion from ChEMBL to test the filtering and preprocessing routines * of ErtlFunctionalGroupsFinderUtility. 
@@ -224,10 +221,10 @@ public void testOnMolecule() throws Exception { SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); //CHEMBL1201736 IAtomContainer tmpMolecule = tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); - Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.isStructureUnconnected(tmpMolecule)); - Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.isMoleculeCharged(tmpMolecule)); + Assertions.assertTrue(ErtlFunctionalGroupsFinder.isStructureUnconnected(tmpMolecule)); + Assertions.assertTrue(ErtlFunctionalGroupsFinder.containsChargedAtom(tmpMolecule)); Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.isAtomOrBondCountZero(tmpMolecule)); - Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.containsInvalidAtomicNumbers(tmpMolecule)); + Assertions.assertFalse(ErtlFunctionalGroupsFinder.containsMetalMetalloidOrPseudoAtom(tmpMolecule)); Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.shouldBeFiltered(tmpMolecule)); Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.shouldBePreprocessed(tmpMolecule)); Assertions.assertFalse(ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(tmpMolecule)); @@ -237,37 +234,25 @@ public void testOnMolecule() throws Exception { ErtlFunctionalGroupsFinderUtility.perceiveAtomTypesAndConfigureAtoms(tmpMolecule); ErtlFunctionalGroupsFinderUtility.applyAromaticityDetection(tmpMolecule, Aromaticity.cdkLegacy()); Assertions.assertTrue(ErtlFunctionalGroupsFinderUtility.isValidArgumentForFindMethod(tmpMolecule)); - ErtlFunctionalGroupsFinder tmpEFGF = ErtlFunctionalGroupsFinderUtility.getErtlFunctionalGroupsFinderGeneralizingMode(); + ErtlFunctionalGroupsFinder tmpEFGF = ErtlFunctionalGroupsFinder.newErtlFunctionalGroupsFinderGeneralizingMode(); List tmpFGList = tmpEFGF.find(tmpMolecule); for (IAtomContainer tmpFG : tmpFGList) { 
System.out.println(ErtlFunctionalGroupsFinderUtility.createPseudoSmilesCode(tmpFG)); } } - + // /** - * Tests the extraction of only atoms marked by the Ertl algorithm as functional groups, implemented in - * ErtlFunctionalGroupsFinderUtility as a third option to "full environment" / "generalized environment". - * - * @throws Exception if anything goes wrong + * Test charge neutralization. */ @Test - public void testFindMarkedAtoms() throws Exception { + public void testNeutralization() throws Exception { SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - //CHEMBL1201736 - IAtomContainer tmpMolecule = tmpSmiPar.parseSmiles("CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N+]3(C)CCCC3)CS[C@H]12)c1csc(N)n1.Cl"); - tmpMolecule = ErtlFunctionalGroupsFinderUtility.applyFiltersAndPreprocessing(tmpMolecule, Aromaticity.cdkLegacy()); - List tmpFGList = ErtlFunctionalGroupsFinderUtility.findMarkedAtoms(tmpMolecule); - List tmpPseudoSmilesList = new ArrayList<>(6); - for (IAtomContainer tmpFG : tmpFGList) { - String tmpPseudoSmiles = ErtlFunctionalGroupsFinderUtility.createPseudoSmilesCode(tmpFG); - System.out.println(tmpPseudoSmiles); - tmpPseudoSmilesList.add(tmpPseudoSmiles); - } - Assertions.assertTrue(tmpPseudoSmilesList.contains("[N]C(=O)[C]=N[O]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[C]=C(C(=O)[O])N([C]=O)[C][S]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[N]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[S*]")); - Assertions.assertTrue(tmpPseudoSmilesList.contains("[N*]")); - Assertions.assertTrue(tmpPseudoSmilesList.size() == 6); + IAtomContainer tmpAmmonia = tmpSmiPar.parseSmiles("[NH4+]"); + ErtlFunctionalGroupsFinderUtility.neutralizeCharges(tmpAmmonia); + SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical); + System.out.println(tmpSmiGen.create(tmpAmmonia)); + IAtomContainer tmpNitro = tmpSmiPar.parseSmiles("C[N+](=O)[O-]"); + 
ErtlFunctionalGroupsFinderUtility.neutralizeCharges(tmpNitro); + System.out.println(tmpSmiGen.create(tmpNitro)); } } diff --git a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderTest.java b/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderTest.java deleted file mode 100644 index 28b2d9b..0000000 --- a/src/test/java/org/openscience/cdk/tools/test/ErtlFunctionalGroupsFinderTest.java +++ /dev/null @@ -1,385 +0,0 @@ -/* - * ErtlFunctionalGroupsFinder for CDK - * Copyright (c) 2023 Sebastian Fritsch, Stefan Neumann, Jonas Schaub, Christoph Steinbeck, and Achim Zielesny - * - * Source code is available at - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -package org.openscience.cdk.tools.test; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import org.openscience.cdk.AtomContainer; -import org.openscience.cdk.PseudoAtom; -import org.openscience.cdk.aromaticity.Aromaticity; -import org.openscience.cdk.aromaticity.ElectronDonation; -import org.openscience.cdk.exception.InvalidSmilesException; -import org.openscience.cdk.graph.Cycles; -import org.openscience.cdk.interfaces.IAtom; -import org.openscience.cdk.interfaces.IAtomContainer; -import org.openscience.cdk.interfaces.IBond; -import org.openscience.cdk.interfaces.IBond.Order; -import org.openscience.cdk.interfaces.IChemObjectBuilder; -import org.openscience.cdk.interfaces.IPseudoAtom; -import org.openscience.cdk.isomorphism.Mappings; -import org.openscience.cdk.isomorphism.Pattern; -import org.openscience.cdk.isomorphism.VentoFoggia; -import org.openscience.cdk.silent.SilentChemObjectBuilder; -import org.openscience.cdk.smiles.SmiFlavor; -import org.openscience.cdk.smiles.SmilesGenerator; -import org.openscience.cdk.smiles.SmilesParser; -import org.openscience.cdk.tools.ErtlFunctionalGroupsFinder; -import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; - -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -/** - * Test for ErtlFunctionalGroupsFinder. 
- * - * @author Sebastian Fritsch - * @version 1.2 - */ -public class ErtlFunctionalGroupsFinderTest { - - public ErtlFunctionalGroupsFinderTest() { - super(); - } - - @Test - public void testFind1() throws Exception { - String moleculeSmiles = "Cc1cc(C)nc(NS(=O)(=O)c2ccc(N)cc2)n1"; - String[] expectedFGs = new String[] {"[R]N([R])S(=O)(=O)[R]", "[c]N(H)H", "NarR3", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind2() throws Exception{ - String moleculeSmiles = "NC(=N)c1ccc(\\\\C=C\\\\c2ccc(cc2O)C(=N)N)cc1"; - String[] expectedFGs = new String[] {"[R]N=C-N([R])[R]", "[C]=[C]", "[c]OH", "[R]N=C-N([R])[R]"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind3() throws Exception { - String moleculeSmiles = "CC(=O)Nc1nnc(s1)S(=O)(=O)N"; - String[] expectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]S(=O)(=O)N([R])[R]", "NarR3", "NarR3", "SarR2"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind4() throws Exception { - String moleculeSmiles = "NS(=O)(=O)c1cc2c(NCNS2(=O)=O)cc1Cl"; - String[] expectedFGs = new String[] {"[R]S(=O)(=O)N([R])[R]", "[R]S(=O)(=O)N([R])[C]N([R])[R]", "[R]Cl"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind5() throws Exception { - String moleculeSmiles = "CNC1=Nc2ccc(Cl)cc2C(=N(=O)C1)c3ccccc3"; - String[] expectedFGs = new String[] {"[R]N([R])[C]=N[R]", "[R]Cl", "[R]N(=O)=[C]"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind6() throws Exception { - String moleculeSmiles = "Cc1onc(c2ccccc2)c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](N4C3=O)C(=O)O"; - String[] expectedFGs = new String[] {"O=C([R])N([R])[R]", "O=C([R])N([R])[C]S[R]", "O=C([R])OH", "OarR2", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind7() throws Exception { - String moleculeSmiles = "Clc1ccccc1C2=NCC(=O)Nc3ccc(cc23)N(=O)=O"; - String[] expectedFGs = new String[] {"[R]Cl", "[R]N=[C]", 
"[R]C(=O)N([R])[R]", "O=N([R])=O"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind8() throws Exception { - String moleculeSmiles = "COc1cc(cc(C(=O)NCC2CCCN2CC=C)c1OC)S(=O)(=O)N"; - String[] expectedFGs = new String[] {"[R]O[R]", "[R]N([R])C(=O)[R]", "N([R])([R])[R]", "[C]=[C]", "[R]O[R]", "[R]S(=O)(=O)N([R])[R]"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind9() throws Exception { - String moleculeSmiles = "Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl"; - String[] expectedFGs = new String[] {"[R]Cl", "[R]N(H)[R]", "O=C(OH)[R]", "[R]Cl"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind10() throws Exception { - String moleculeSmiles = "Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1"; - String[] expectedFGs = new String[] {"[R]Cl", "[R]O[R]", "[R]N([R])[C]=N[R]", "[R]N([H])[R]"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind11() throws Exception { - String moleculeSmiles = "FC(F)(F)CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13"; - String[] expectedFGs = new String[] {"[R]F", "[R]F", "[R]F", "O=C([R])N([R])[R]", "[R]N=[C]", "[R]Cl"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind12() throws Exception { - String moleculeSmiles = "OC[C@H]1O[C@H](C[C@@H]1O)n2cnc3[C@H](O)CNC=Nc23";; - String[] expectedFGs = new String[] {"[C]O[H]", "[R]O[R]", "[C]OH", "[C]OH", "[R]N=CN([R])[R]", "NarR3", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind13() throws Exception { - String moleculeSmiles = "CCN[C@H]1C[C@H](C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N"; - String[] expectedFGs = new String[] {"[R]N([R])H", "O=S(=O)([R])[R]", "[R]S(=O)(=O)N([R])[R]", "SarR2"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind14() throws Exception { - String moleculeSmiles = "C[C@@H](O)[C@@H]1[C@H]2[C@@H](C)C(=C(N2C1=O)C(=O)O)S[C@@H]3CN[C@@H](C3)C(=O)N(C)C"; - String[] expectedFGs = new String[] {"[C]O[H]", 
"O=C([R])N([R])C(C(=O)(OH))=[C]S[R]", "[R]N(H)[R]", "[R]N([R])C([R])=O"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind15() throws Exception { - String moleculeSmiles = "C[C@@H]1CN(C[C@H](C)N1)c2c(F)c(N)c3C(=O)C(=CN(C4CC4)c3c2F)C(=O)O"; - String[] expectedFGs = new String[] {"[R]N([R])[R]", "[R]N([H])[R]", "[R]F", "[c]N(H)H", "[c]=O", "[R]F", "[R]C(=O)OH", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind16() throws Exception { - String moleculeSmiles = "CC(=CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)C"; - String[] expectedFGs = new String[] {"[C]=[C]", "[R]C(=O)N([R])N([R])C(=O)[R]"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind17() throws Exception { - String moleculeSmiles = "Clc1ccc2N=C3NC(=O)CN3Cc2c1Cl"; - String[] expectedFGs = new String[] {"Cl[R]", "[R]N=C(N([R])[R])N([R])C(=O)[R]", "Cl[R]"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind18() throws Exception { - String moleculeSmiles = "CC(=O)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](O)[C@H](O)CO)C(=O)O"; - String[] expectedFGs = new String[] {"[R]N([R])C(=O)[R]", "[R]N([R])C(=N[R])N([R])[R]", "O=C(OH)C(=[C])O[R]" , "[C]OH", "[C]OH", "[C]OH"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind19() throws Exception { - String moleculeSmiles = "C[C@H](O)[C@H](O)[C@H]1CNc2nc(N)nc(O)c2N1"; - String[] expectedFGs = new String[] {"[C]OH", "[C]OH", "[R]N(H)[R]" , "[c]N(H)H", "[c]OH", "[R]N(H)[R]", "NarR3", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); - } - - @Test - public void testFind20() throws Exception { - String moleculeSmiles = "N[C@@H]1CCCCN(C1)c2c(Cl)cc3C(=O)C(=CN(C4CC4)c3c2Cl)C(=O)O"; - String[] expectedFGs = new String[] {"[C]N([H])[H]", "[R]N([R])[R]", "[R]Cl" , "[c]=O", "[R]Cl", "[R]C(=O)OH", "NarR3"}; - testFind(moleculeSmiles, expectedFGs); - } - - /** - * Example code to be used in the GitHub wiki of the project. 
- * - * @throws Exception if anything goes wrong - * @author Jonas Schaub - */ - @Test - public void gitHubWikiTest() throws Exception { - //Prepare input - SmilesParser tmpSmiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - IAtomContainer tmpInputMol = tmpSmiPar.parseSmiles("C[C@@H]1CN(C[C@H](C)N1)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F"); //PubChem CID 5257 - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(tmpInputMol); - Aromaticity tmpAromaticity = new Aromaticity(ElectronDonation.cdk(), Cycles.cdkAromaticSet()); - tmpAromaticity.apply(tmpInputMol); - //Identify functional groups - ErtlFunctionalGroupsFinder tmpEFGF = new ErtlFunctionalGroupsFinder(); //default: generalization turned on - List tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); - SmilesGenerator tmpSmiGen = new SmilesGenerator(SmiFlavor.Canonical | SmiFlavor.UseAromaticSymbols); - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { - String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); - System.out.println(tmpSmilesString); - } - //non-generalized functional groups - System.out.println("----------------"); - tmpEFGF = new ErtlFunctionalGroupsFinder(ErtlFunctionalGroupsFinder.Mode.NO_GENERALIZATION); - tmpFunctionalGroupsList = tmpEFGF.find(tmpInputMol); - for (IAtomContainer tmpFunctionalGroup : tmpFunctionalGroupsList) { - String tmpSmilesString = tmpSmiGen.create(tmpFunctionalGroup); - System.out.println(tmpSmilesString); - } - } - - private void testFind(String moleculeSmiles, String[] fGStrings) throws Exception { - testFind(moleculeSmiles, fGStrings, new Aromaticity(ElectronDonation.daylight(), Cycles.all())); - } - - private void testFind(String moleculeSmiles, String[] fGStrings, Aromaticity aromaticity) throws Exception { - // prepare input - SmilesParser smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - IAtomContainer mol = smilesParser.parseSmiles(moleculeSmiles); - 
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol); - aromaticity.apply(mol); - - // find functional groups - ErtlFunctionalGroupsFinder fgFinder = new ErtlFunctionalGroupsFinder(); - List fGs = fgFinder.find(mol); - - // get expected groups - List expectedFGs = new LinkedList<>(); - for (String fGString : fGStrings) { - expectedFGs.add(buildFunctionalGroup(fGString)); - } - - // compare - this.assertIsomorphism(expectedFGs, fGs); - } - - /** - * NOTE: actual and expected functional groups must be in the same order! - * - * @param expectedFGs list of expected functional groups - * @param actualFGs list of actual functional groups - * @throws Exception if anything does not work as planned - */ - private void assertIsomorphism(List expectedFGs, List actualFGs) { - Assertions.assertEquals(expectedFGs.size(), actualFGs.size(), - "Number of functional groups does not match the expected number of groups"); - - for(int i = 0; i < expectedFGs.size(); i++) { - IAtomContainer cExp = expectedFGs.get(i); - IAtomContainer cAct = actualFGs.get(i); - - Assertions.assertEquals(cExp.getAtomCount(), cAct.getAtomCount(), - "Groups #" + i + ": different atom count"); - Assertions.assertEquals(cExp.getBondCount(), cAct.getBondCount(), - "Groups #" + i + ": different bond count"); - - Pattern pattern = VentoFoggia.findIdentical(cExp); - - Assertions.assertTrue(pattern.matches(cAct), "Groups #" + i + ": not isomorph"); - - Mappings mappings = pattern.matchAll(cAct); - - Map atomMap = mappings.toAtomMap().iterator().next(); - for (Map.Entry e : atomMap.entrySet()) { - IAtom atomExp = e.getKey(); - IAtom atomAct = e.getValue(); - Assertions.assertEquals(atomExp.isAromatic(), atomAct.isAromatic(), - "Groups #" + i + ": Atom aromaticity does not match" - + atomAct.getSymbol() + atomAct.isAromatic() + atomExp.getSymbol() - + atomExp.isAromatic()); - } - - Map bondMap = mappings.toBondMap().iterator().next(); - for (Map.Entry e : bondMap.entrySet()) { - IBond bondExp = 
e.getKey(); - IBond bondAct = e.getValue(); - Assertions.assertEquals(bondExp.isAromatic(), bondAct.isAromatic(), - "Groups #" + i + ": Bond aromaticity does not match"); - } - } - } - - private IAtomContainer buildFunctionalGroup(String string) { - IAtom a1, a2, a3, a4, a5, a6, a7, a8, a9; - IBond b1, b2, b3, b4, b5, b6, b7, b8, b9; - IChemObjectBuilder builder = SilentChemObjectBuilder.getInstance(); - IAtomContainer container; - - // custom templates - switch(string) { - case "NarR3": - a1 = builder.newInstance(IPseudoAtom.class, "R"); - a2 = builder.newInstance(IPseudoAtom.class, "R"); - a3 = builder.newInstance(IPseudoAtom.class, "R"); - a4 = builder.newInstance(IAtom.class, "N"); - a4.setIsAromatic(true); - - b1 = builder.newInstance(IBond.class, a1, a4, Order.SINGLE); - b2 = builder.newInstance(IBond.class, a2, a4, Order.SINGLE); - b3 = builder.newInstance(IBond.class, a3, a4, Order.SINGLE); - - container = new AtomContainer(); - container.setAtoms(new IAtom[] {a1, a2, a3, a4}); - container.setBonds(new IBond[] {b1, b2, b3}); - return container; - - case "SarR2": - a1 = builder.newInstance(IPseudoAtom.class, "R"); - a2 = builder.newInstance(IPseudoAtom.class, "R"); - a3 = builder.newInstance(IAtom.class, "S"); - a3.setIsAromatic(true); - - b1 = builder.newInstance(IBond.class, a1, a3, Order.SINGLE); - b2 = builder.newInstance(IBond.class, a2, a3, Order.SINGLE); - - container = new AtomContainer(); - container.setAtoms(new IAtom[] {a1, a2, a3}); - container.setBonds(new IBond[] {b1, b2}); - return container; - - case "OarR2": - a1 = builder.newInstance(IPseudoAtom.class, "R"); - a2 = builder.newInstance(IPseudoAtom.class, "R"); - a3 = builder.newInstance(IAtom.class, "O"); - a3.setIsAromatic(true); - - b1 = builder.newInstance(IBond.class, a1, a3, Order.SINGLE); - b2 = builder.newInstance(IBond.class, a2, a3, Order.SINGLE); - - container = new AtomContainer(); - container.setAtoms(new IAtom[] {a1, a2, a3}); - container.setBonds(new IBond[] {b1, b2}); - return 
container; - - // smiles - default: - try { - SmilesParser smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - try { - if(string.equals("[c]=O")) - smilesParser.kekulise(false); - container = smilesParser.parseSmiles(string); - } - catch(InvalidSmilesException e) { - smilesParser.kekulise(false); - container = smilesParser.parseSmiles(string); - } - - for(IAtom a : container.atoms()) { - if(a instanceof PseudoAtom) { - a.setSymbol("R"); - } - } - return container; - } - catch(InvalidSmilesException e) { - throw new IllegalArgumentException("Input string '" + string + " could not be found as a template " + - "and is not a valid SMILES string."); - } - } - } -} diff --git a/src/test/resources/ChEBI_lite_3star_subset.sdf b/src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset.sdf similarity index 100% rename from src/test/resources/ChEBI_lite_3star_subset.sdf rename to src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset.sdf diff --git a/src/test/resources/ChEBI_lite_3star_subset_readme.md b/src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset_readme.md similarity index 100% rename from src/test/resources/ChEBI_lite_3star_subset_readme.md rename to src/test/resources/org/openscience/cdk/tools/ChEBI_lite_3star_subset_readme.md