From cdbbfa9c5fbf804ce20bba281614ca67f30a9dd6 Mon Sep 17 00:00:00 2001
From: Osman Mamun
Date: Wed, 27 Mar 2024 15:21:52 -0500
Subject: [PATCH] xyz file reader added

---
Review note: src/data/get_mol_objects.py below was edited during review
(in-memory XYZ parsing, atom-count based slicing, "*^" exponent handling,
optional data_dir).  The blob id in its "index" line and the commit id in
the first line of this mail predate those edits.

 .gitignore                  |  1 +
 src/data/get_mol_objects.py | 87 +++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 src/data/get_mol_objects.py

diff --git a/.gitignore b/.gitignore
index a1086d0..502bc9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 .coverage
 *.egg-info/
 *.db
+raw_data
diff --git a/src/data/get_mol_objects.py b/src/data/get_mol_objects.py
new file mode 100644
index 0000000..cb804fc
--- /dev/null
+++ b/src/data/get_mol_objects.py
@@ -0,0 +1,87 @@
+"""Read QM9-style ``.xyz`` files into RDKit molecules plus target values."""
+
+from pathlib import Path
+
+from rdkit import Chem
+
+# Position of each target in the tab-separated fields of the second line
+# of an xyz file (field 0 is not a target, so the numbering starts at 1).
+TARGET_MAP = {
+    "RC_A": 1,
+    "RC_B": 2,
+    "RC_C": 3,
+    "mu": 4,
+    "alpha": 5,
+    "homo": 6,
+    "lumo": 7,
+    "gap": 8,
+    "r2": 9,
+    "zpve": 10,
+    "U_0": 11,
+    "U_298": 12,
+    "H_enthalpy": 13,
+    "G_free_energy": 14,
+    "Cv": 15,
+}
+
+
+def _to_float(field):
+    """Parse a number, also accepting the ``1.2*^-5`` exponent spelling.
+
+    NOTE(review): QM9 xyz files reportedly use that spelling for some
+    values -- confirm against the raw data.
+    """
+    return float(field.replace("*^", "e"))
+
+
+def get_mol_data(idx, targets, data_dir="raw_data"):
+    """
+    Gets the molecular data for the MOBOQM9 model.
+
+    Reads ``<data_dir>/dsgdb9nsd_<idx as 6 digits>.xyz``, picks the requested
+    targets from its second line and builds an RDKit molecule from the
+    atom records that follow.
+
+    args:
+        idx: Integer index of the data file.
+        targets: Iterable of target names; each must be a key of
+            TARGET_MAP.
+        data_dir: Directory holding the xyz files.
+
+    returns:
+        Tuple (mol, target_list): whatever Chem.MolFromXYZBlock returns
+        for the atom records, and the target values as floats in the
+        same order as ``targets``.
+
+    raises:
+        FileNotFoundError: If the xyz file does not exist.
+        KeyError: If a name in ``targets`` is not in TARGET_MAP.
+        ValueError: If the atom count or a target value is not numeric.
+    """
+    xyz_path = Path(data_dir) / f"dsgdb9nsd_{idx:06d}.xyz"
+    lines = xyz_path.read_text().split("\n")
+
+    props = lines[1].split("\t")
+    target_list = [_to_float(props[TARGET_MAP[target]]) for target in targets]
+
+    # The first line gives the number of atom records.  Slicing by it keeps
+    # exactly those records however many trailing lines follow and whether
+    # or not the file ends with a newline.
+    n_atoms = int(lines[0])
+    atom_lines = [
+        # Drop the last tab-separated column of every atom record and
+        # rewrite "*^" exponents so the XYZ parser sees plain floats.
+        "\t".join(line.replace("*^", "e").split("\t")[:-1])
+        for line in lines[2:2 + n_atoms]
+    ]
+
+    # Parse from memory instead of via a scratch "data.xyz": nothing in the
+    # working directory can be overwritten, concurrent calls cannot collide,
+    # and no file is left behind when parsing fails.
+    xyz_block = "\n".join(lines[:2] + atom_lines)
+    mol = Chem.MolFromXYZBlock(xyz_block)
+    return mol, target_list
+
+
+if __name__ == "__main__":
+    print(get_mol_data(199, ["mu", "alpha"]))