Skip to content

Commit 052ccdf

Browse files
authored Dec 13, 2024
Reformat loaders for different smiles paths (#1211)
* Reformat loaders for different smiles paths
* merge test_compound tests
* allow for lists of smiles or .smi file lists to be loaded into a compound
* remove extra large_smiles test
* remove duplicated tests
1 parent 9acf939 commit 052ccdf

File tree

2 files changed

+133
-72
lines changed

2 files changed

+133
-72
lines changed
 

‎mbuild/conversion.py

+108 -72
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,11 @@ def load(
8888
structure's position (recommended).
8989
"""
9090
# First check if we are loading from an object
91-
if not isinstance(filename_or_object, str):
91+
if not (
92+
isinstance(filename_or_object, str)
93+
or isinstance(filename_or_object, list)
94+
or isinstance(filename_or_object, tuple)
95+
):
9296
return load_object(
9397
obj=filename_or_object,
9498
compound=compound,
@@ -97,19 +101,26 @@ def load(
97101
**kwargs,
98102
)
99103
# Second check if we are loading SMILES strings
100-
elif smiles:
104+
elif smiles and (backend is None or backend.lower() == "rdkit"):
101105
# Ignore the box info for SMILES (its never there)
102-
ignore_box_warn = True
103-
return load_smiles(
106+
seed = kwargs.get("seed", 0)
107+
return load_rdkit_smiles(
104108
smiles_or_filename=filename_or_object,
105109
compound=compound,
106110
infer_hierarchy=infer_hierarchy,
107-
ignore_box_warn=ignore_box_warn,
108-
backend=backend,
109-
**kwargs,
111+
ignore_box_warn=True,
112+
seed=seed,
110113
)
111-
# Last, if none of the above, load from file
114+
elif smiles and isinstance(backend, str) and backend.lower() == "pybel":
115+
return load_pybel_smiles(
116+
smiles_or_filename=filename_or_object,
117+
compound=compound,
118+
infer_hierarchy=infer_hierarchy,
119+
ignore_box_warn=True,
120+
)
121+
112122
else:
123+
# Last, if none of the above, load from file
113124
return load_file(
114125
filename=filename_or_object,
115126
relative_to_module=relative_to_module,
@@ -191,14 +202,12 @@ def load_object(
191202
raise ValueError(f"Object of type {type(obj).__name__} is not supported.")
192203

193204

194-
def load_smiles(
205+
def load_pybel_smiles(
195206
smiles_or_filename,
196207
compound=None,
197208
infer_hierarchy=True,
198209
ignore_box_warn=False,
199-
backend="rdkit",
200210
coords_only=False,
201-
**kwargs,
202211
):
203212
"""Load a SMILES string as an mBuild Compound.
204213
@@ -216,88 +225,115 @@ def load_smiles(
216225
If True, ignore warning if no box is present.
217226
coords_only : bool, optional, default=False
218227
Only load the coordinates into a provided compound.
219-
backend : str, optional, default='rdkit'
220-
The smiles loading backend, either 'rdkit' or 'pybel'
221228
222229
Returns
223230
-------
224231
compound : mb.Compound
225232
"""
226-
# Initialize an mb.Compound if none is provided
227-
if not compound:
233+
if compound is None:
228234
compound = mb.Compound()
229235

230-
test_path = Path(smiles_or_filename)
236+
pybel = import_("pybel")
237+
# First we try treating smiles_or_filename as a SMILES string
238+
try:
239+
mymol = pybel.readstring("smi", smiles_or_filename)
240+
mymolGen = [mymol]
241+
# Now we treat it as a filename
242+
except (OSError, IOError):
243+
mymolGen = pybel.readfile("smi", smiles_or_filename)
244+
245+
for mymol in mymolGen:
246+
mymol.make3D()
247+
from_pybel(
248+
pybel_mol=mymol,
249+
compound=compound,
250+
infer_hierarchy=infer_hierarchy,
251+
ignore_box_warn=ignore_box_warn,
252+
)
253+
254+
return compound
231255

232-
# Will try to support list of smiles strings in the future
233-
if backend is None:
234-
backend = "rdkit"
235256

236-
if backend == "rdkit":
237-
rdkit = import_("rdkit") # noqa: F841
238-
from rdkit import Chem
257+
def load_rdkit_smiles(
258+
smiles_or_filename,
259+
compound=None,
260+
infer_hierarchy=True,
261+
ignore_box_warn=False,
262+
coords_only=False,
263+
seed=0,
264+
):
265+
"""Load a SMILES string as an mBuild Compound.
239266
240-
if test_path.exists():
241-
# assuming this is a smi file now
242-
mymol = Chem.SmilesMolSupplier(smiles_or_filename)
243-
if not mymol:
244-
raise ValueError(
245-
"Provided smiles string or file was invalid. Refer to the "
246-
"above RDKit error messages for additional information."
247-
)
248-
mol_list = [mol for mol in mymol]
249-
if len(mol_list) == 1:
250-
rdmol = mymol[0]
251-
else:
252-
rdmol = mymol[0]
253-
warn(
254-
"More than one SMILES string in file, more than one SMILES "
255-
f"string is not supported, using {Chem.MolToSmiles(rdmol)}"
256-
)
257-
else:
258-
rdmol = Chem.MolFromSmiles(smiles_or_filename)
267+
Loading SMILES string from a string, a list, or a file using RDKit by
268+
default. Must have rdkit or pybel packages installed.
269+
270+
Parameters
271+
----------
272+
smiles_or_filename : str
273+
SMILES string or file of SMILES string to load
274+
compound : mb.Compound
275+
The host mbuild Compound
276+
infer_hierarchy : bool, optional, default=True
277+
ignore_box_warn : bool, optional, default=False
278+
If True, ignore warning if no box is present.
279+
coords_only : bool, optional, default=False
280+
Only load the coordinates into a provided compound.
259281
260-
seed = kwargs.get("smiles_seed", 0)
282+
Returns
283+
-------
284+
compound : mb.Compound
285+
"""
286+
# Initialize an mb.Compound if none is provided
287+
if not compound:
288+
compound = mb.Compound()
289+
290+
if not seed: # default rdkit seed
291+
seed = 0
292+
293+
rdkit = import_("rdkit") # noqa: F841
294+
from rdkit import Chem
295+
296+
if isinstance(smiles_or_filename, (tuple, list)):
297+
for mol in smiles_or_filename:
298+
rdmol = Chem.MolFromSmiles(mol)
299+
from_rdkit(
300+
rdkit_mol=rdmol,
301+
compound=compound,
302+
coords_only=coords_only,
303+
smiles_seed=seed,
304+
)
305+
return compound
261306

307+
rdmol = Chem.MolFromSmiles(smiles_or_filename)
308+
if rdmol: # return right away if the smiles loads properly
262309
return from_rdkit(
263310
rdkit_mol=rdmol,
264311
compound=compound,
265312
coords_only=coords_only,
266313
smiles_seed=seed,
267314
)
268-
elif backend == "pybel":
269-
pybel = import_("pybel")
270-
# First we try treating filename_or_object as a SMILES string
271-
try:
272-
mymol = pybel.readstring("smi", smiles_or_filename)
273-
# Now we treat it as a filename
274-
except (OSError, IOError):
275-
# For now, we only support reading in a single smiles molecule,
276-
# but pybel returns a generator, so we get the first molecule
277-
# and warn the user if there is more
278-
279-
mymol_generator = pybel.readfile("smi", smiles_or_filename)
280-
mymol_list = list(mymol_generator)
281-
if len(mymol_list) == 1:
282-
mymol = mymol_list[0]
283-
else:
284-
mymol = mymol_list[0]
285-
warn(
286-
"More than one SMILES string in file, more than one SMILES "
287-
f"string is not supported, using {mymol.write('smi')}"
288-
)
289-
mymol.make3D()
290-
return from_pybel(
291-
pybel_mol=mymol,
315+
316+
# Try to assume it's a smiles file
317+
mymol = Chem.SmilesMolSupplier(smiles_or_filename, titleLine=0)
318+
if not mymol:
319+
raise ValueError(
320+
"Provided smiles string or file was invalid. Refer to the "
321+
"above RDKit error messages for additional information."
322+
)
323+
molList = [mol for mol in mymol]
324+
for rdmol in molList:
325+
from_rdkit(
326+
rdkit_mol=rdmol,
292327
compound=compound,
293-
infer_hierarchy=infer_hierarchy,
294-
ignore_box_warn=ignore_box_warn,
328+
coords_only=coords_only,
329+
smiles_seed=seed,
295330
)
296-
else:
331+
if not compound:
297332
raise ValueError(
298-
"Expected SMILES loading backend 'rdkit' or 'pybel'. "
299-
f"Was provided: {backend}"
333+
"Expected SMILES loading backend 'rdkit' failed to load any compouds."
334+
f"Check the SMILES string of .smi file passed to {smiles_or_filename=}"
300335
)
336+
return compound
301337

302338

303339
def load_file(
@@ -409,7 +445,7 @@ def load_file(
409445
elif extension == ".txt":
410446
warn(".txt file detected, loading as a SMILES string")
411447
# Fail-safe measure
412-
compound = load_smiles(filename, compound)
448+
compound = load_pybel_smiles(filename, compound)
413449

414450
# Then mdtraj reader
415451
elif backend == "mdtraj":

‎mbuild/tests/test_compound.py

+25
Original file line numberDiff line numberDiff line change
@@ -2537,9 +2537,34 @@ def test_catalog_bondgraph_types(self, benzene):
25372537
== "particle_graph"
25382538
)
25392539

2540+
def test_load_large_smiles(self):
2541+
cpd = mb.load(
2542+
(
2543+
"CC1C(=O)NC(C(=O)NC(CSC(C2=C(C3=CC4=C(C(SCC(C(=O)N1)"
2544+
"NC(=O)C(CCCCN)NC(=O)C(CCC(=O)N)NC(=O)C(C(C)C)N)C)C(=C("
2545+
"[N-]4)C=C5C(=C(C(=N5)C=C6C(=C(C(=CC2=N3)[N-]6)C)CCC(=O"
2546+
")[O-])CCC(=O)[O-])C)C)C)C)C(=O)NC(CC7=CNC=N7)C(=O)NC(C"
2547+
"(C)O)C(=O)NC(C(C)C)C(=O)NC(CCC(=O)[O-])C(=O)[O-])CCC(="
2548+
"O)N.[Na+].[Na+].[Na+].[Na+].[Fe+2]"
2549+
),
2550+
smiles=True,
2551+
)
2552+
assert cpd.n_particles == 244
2553+
25402554
def test_reset_labels(self):
25412555
ethane = mb.load("CC", smiles=True)
25422556
Hs = ethane.particles_by_name("H")
25432557
ethane.remove(Hs, reset_labels=True)
25442558
ports = set(f"port[{i}]" for i in range(6))
25452559
assert ports.issubset(set(ethane.labels.keys()))
2560+
2561+
def test_load_molfile(self):
2562+
with open("ethane.smi", "w") as f:
2563+
f.writelines("CC ethane\nCCC propane\nCCCC butane") # write a test file
2564+
2565+
cpd = mb.load("ethane.smi", smiles=True, backend="rdkit")
2566+
assert cpd.n_particles == 33
2567+
2568+
def test_load_list_of_smiles(self):
2569+
cpd = mb.load(["C", "O"], smiles=True)
2570+
assert len(cpd.children) == 8

0 commit comments

Comments
 (0)
Please sign in to comment.