generated from ersilia-os/eos-template
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4623127
commit f0597c4
Showing
16 changed files
with
263 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
.DS_Store | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from rdkit import Chem | ||
from rdkit.Chem import AllChem | ||
|
||
def _InitialiseNeutralisationReactions():
    """Build the (pattern, replacement) pairs used to neutralise charges.

    Returns:
        A list of 2-tuples. The first element is the query molecule parsed
        from a SMARTS pattern; the second is the replacement molecule parsed
        from SMILES with the second positional argument of
        ``Chem.MolFromSmiles`` set to ``False``.
    """
    charged_to_neutral = (
        ('[n+;H]', 'n'),  # Imidazoles
        ('[N+;!H0]', 'N'),  # Amines
        ('[$([O-]);!$([O-][#7])]', 'O'),  # Carboxylic acids and alcohols
        ('[S-;X1]', 'S'),  # Thiols
        ('[$([N-;X2]S(=O)=O)]', 'N'),  # Sulfonamides
        ('[$([N-;X2][C,N]=C)]', 'N'),  # Enamines
        ('[n-]', '[nH]'),  # Tetrazoles
        ('[$([S-]=O)]', 'S'),  # Sulfoxides
        ('[$([N-]C=O)]', 'N'),  # Amides
    )

    reactions = []
    for smarts, replacement in charged_to_neutral:
        query = Chem.MolFromSmarts(smarts)
        neutral = Chem.MolFromSmiles(replacement, False)
        reactions.append((query, neutral))
    return reactions
|
||
# Module-level cache of the default neutralisation reactions; filled lazily on
# the first call to NeutraliseCharges() that does not supply its own reactions.
_reactions = None


def NeutraliseCharges(smiles, reactions=None):
    """Neutralise charged groups in a molecule given as a SMILES string.

    Args:
        smiles: SMILES string of the molecule to neutralise.
        reactions: Optional list of (query, replacement) molecule pairs. When
            omitted, the pairs from ``_InitialiseNeutralisationReactions()``
            are built once and cached in the module-level ``_reactions``.

    Returns:
        A tuple ``(smiles_out, replaced)``. When at least one pattern matched,
        ``smiles_out`` is the SMILES of the modified molecule and ``replaced``
        is ``True``. Otherwise the input is returned unchanged with ``False``;
        this includes inputs that are not strings or cannot be parsed.
    """
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions = _InitialiseNeutralisationReactions()
        reactions = _reactions

    # MolFromSmiles yields None for unparsable input; guard against that (and
    # against non-string input such as None) instead of failing further down.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return (smiles, False)

    replaced = False
    for reactant, product in reactions:
        # Apply one replacement at a time until the pattern no longer matches.
        while mol.HasSubstructMatch(reactant):
            replaced = True
            mol = AllChem.ReplaceSubstructs(mol, reactant, product)[0]

    if replaced:
        return (Chem.MolToSmiles(mol, True), True)
    return (smiles, False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import tempfile | ||
from tqdm import tqdm | ||
from rdkit import Chem | ||
from rdkit.Chem import AllChem | ||
from rdkit.ML.Descriptors import MoleculeDescriptors | ||
from rdkit.Chem import Descriptors | ||
|
||
|
||
def calculate_conformers(smiles_list):
    """Embed and optimise conformers for each SMILES, round-tripping via SDF.

    For every SMILES the molecule is parsed, hydrogens are added, up to 10
    conformers are embedded with ETKDG parameters, each conformer is optimised
    with MMFF, the conformers are aligned, and the molecule is written to a
    temporary SD file under the name ``molecule_<index>``. The file is then
    read back and removed.

    Args:
        smiles_list: Sequence of SMILES strings (must support ``len()``).

    Returns:
        List of molecules read back from the SD file, skipping records the
        supplier returns as ``None``. The result can therefore be shorter
        than ``smiles_list``.

    NOTE(review): ``w.write`` is called without a conformer id, so presumably
    only the default conformer of each molecule is stored - confirm.
    """
    import os  # local import: this module has no file-level ``import os``

    names = ["molecule_{}".format(i) for i in range(len(smiles_list))]

    print("Creating temporary file to store conformers")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".sdf") as tmp:
        tmp_name = tmp.name

    try:
        print("Generating conformers")
        w = Chem.SDWriter(tmp_name)
        try:
            pairs = zip(names, smiles_list)
            for name, smi in tqdm(pairs, total=len(smiles_list)):
                m = Chem.MolFromSmiles(smi)
                m2 = Chem.AddHs(m)
                # run the new CSD-based method
                cids = AllChem.EmbedMultipleConfs(
                    m2, numConfs=10, params=AllChem.ETKDG()
                )
                for cid in cids:
                    AllChem.MMFFOptimizeMolecule(m2, confId=cid)
                m2.SetProp("_Name", name)
                AllChem.AlignMolConformers(m2)
                w.write(m2)
        finally:
            # Close the writer even if a molecule fails, so the file handle
            # is released before the cleanup below.
            w.close()

        print("Reading the conformers file")
        supplier = Chem.SDMolSupplier(tmp_name)
        dataset = [mol for mol in supplier if mol is not None]
        del supplier  # drop the reader before deleting the file
        return dataset
    finally:
        # The file was created with delete=False, so it must be removed
        # explicitly. Cleanup is best-effort and must not mask a real error.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
|
||
|
||
def descriptors_calculator(smiles_list):
    """Compute a table of RDKit descriptors for a list of SMILES.

    Molecules are first passed through ``calculate_conformers``; descriptors
    are then computed for every molecule it returns, using all names listed
    in ``Descriptors._descList``.

    Args:
        smiles_list: SMILES strings forwarded to ``calculate_conformers``.

    Returns:
        A pandas DataFrame with one row per molecule returned by
        ``calculate_conformers`` and one column per descriptor name.
    """
    print("Calculating conformers")
    molecules = calculate_conformers(smiles_list)

    print("Calculating descriptors")
    descriptor_names = [entry[0] for entry in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    print(len(descriptor_names))

    rows = []
    for molecule in molecules:
        rows.append(calculator.CalcDescriptors(molecule))

    return pd.DataFrame(np.array(rows), columns=descriptor_names)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,4 @@ | ||
smiles | ||
smiles | ||
Clc1c(N(S(=O)(C)=O)CCO)c2c(NC(C(N2)=O)=O)cc1Cl | ||
Clc1ccc(C(N2CCN(CCOCCO)CC2)c3ccccc3)cc1 | ||
OC(C1CCN(CCCC(O)C2=CC=C(C(C)(C)C)C=C2)CC1)(C3=CC=CC=C3)C4=CC=CC=C4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
rfc_score,gbc_score,logreg_score | ||
0.1714101727051474,2.165122025378177e-09,0.00011447820403389557 | ||
0.6954498139522349,0.9999999942868296,0.9226070246759587 | ||
0.5897135663254606,0.9999999835095177,0.8595850881766514 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import os | ||
import pandas as pd | ||
import joblib | ||
import sys | ||
import numpy as np | ||
from rdkit import Chem | ||
from sklearn.preprocessing import MinMaxScaler | ||
|
||
root = os.path.dirname(os.path.abspath(__file__)) | ||
sys.path.append(os.path.join(root, "..", "code")) | ||
from charge_neutralizer import NeutraliseCharges | ||
from descriptors_calculator import descriptors_calculator | ||
|
||
# Read the "CPSMs" sheet of datasetsCompounds.xlsx, located in this script's
# directory, and show the first rows as a quick sanity check.
print("Loading the molecules")
dataset_path = os.path.join(root, "datasetsCompounds.xlsx")
df = pd.read_excel(dataset_path, sheet_name="CPSMs")
print(df.head())
|
||
def str2float(x):
    """Convert *x* to ``float``, returning NaN when it cannot be converted.

    Only conversion failures are absorbed (wrong type, malformed text, or a
    value too large for a float). Other exceptions, such as
    ``KeyboardInterrupt``, propagate to the caller.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan
|
||
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Experimental logBB values; entries that cannot be parsed become NaN.
y_logbb = np.array([str2float(x) for x in df["logBB value"].tolist()])

# Canonicalise the input SMILES. Entries that are not strings or that RDKit
# cannot parse are recorded as None so they can be dropped consistently below.
smiles_list = []
for smi in df["SMILES"].tolist():
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    smiles_list.append(None if mol is None else Chem.MolToSmiles(mol))

# Drop unparsable molecules from the labels as well, so that descriptor rows
# and labels keep the same order and length.
parsable = np.array([smi is not None for smi in smiles_list], dtype=bool)
if not parsable.all():
    print("Skipping {} molecules whose SMILES could not be parsed".format(int((~parsable).sum())))
y_logbb = y_logbb[parsable]

print("Neutralizing SMILES")
neutralized_smiles = [NeutraliseCharges(smi)[0] for smi in smiles_list if smi is not None]

print("Calculating descriptors")
descriptors = np.array(descriptors_calculator(neutralized_smiles))
# descriptors_calculator can return fewer rows than inputs; training on
# misaligned rows/labels would be silently wrong, so fail loudly instead.
if descriptors.shape[0] != len(neutralized_smiles):
    raise RuntimeError(
        "Descriptor rows ({}) do not match the number of molecules ({})".format(
            descriptors.shape[0], len(neutralized_smiles)
        )
    )

# All artefacts are written to the checkpoints directory two levels up.
checkpoints_dir = os.path.join(root, "..", "..", "checkpoints")
os.makedirs(checkpoints_dir, exist_ok=True)

# MinMax scaling
scaler = MinMaxScaler()
scaler.fit(descriptors)

# Save scaler
joblib.dump(scaler, os.path.join(checkpoints_dir, "scaler.joblib"))

# Scale descriptors; missing values are replaced by 0 after scaling.
descriptors_scaled = np.array(pd.DataFrame(scaler.transform(descriptors)).fillna(0))

# Binarize classification: 1 when logBB > 0.1, 0 otherwise, NaN when unknown.
y_logbb_class = np.array(
    [np.nan if np.isnan(value) else int(value > 0.1) for value in y_logbb]
)
print(y_logbb_class)

# Get data: keep only molecules with a known label.
mask = ~np.isnan(y_logbb_class)
x = descriptors_scaled[mask]
y = y_logbb_class[mask]

# Setting up for ML
seed = 42
skfold = model_selection.StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)

# Models. The seed is passed to the stochastic estimators so that repeated
# runs of this script produce the same checkpoints.
opt_rfc = RandomForestClassifier(
    n_estimators=700,
    max_depth=5,
    max_features='log2',
    min_samples_leaf=2,
    random_state=seed,
)
opt_gbc = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=15,
    max_features='log2',
    min_samples_leaf=4,
    random_state=seed,
)
opt_logreg = LogisticRegression(penalty='l2', C=10)

# Cross-validate each model, then refit it on all labelled data and save it.
for estimator, filename in (
    (opt_rfc, "model1_rfc.joblib"),
    (opt_gbc, "model2_gbc.joblib"),
    (opt_logreg, "model3_logreg.joblib"),
):
    scores = model_selection.cross_val_score(estimator, x, y, cv=skfold, scoring='accuracy')
    print("Stratified 10-fold CV Model accuracy %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100))
    estimator.fit(x, y)
    joblib.dump(estimator, os.path.join(checkpoints_dir, filename))
Binary file not shown.