Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
miquelduranfrigola committed Oct 23, 2024
1 parent 4623127 commit f0597c4
Show file tree
Hide file tree
Showing 16 changed files with 263 additions and 25 deletions.
Binary file added .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
26 changes: 12 additions & 14 deletions metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,22 @@ Identifier: eos3mk2
Slug: bbbp-marine-kinase-inhibitors
Status: In progress
Title: BBBP model tested on marine-derived kinase inhibitors
Description: Developed binary ensemble classifiers to predict the Blood-Brain Barrier
(BBB) permeability of small organic compounds. Applied our best models to natural
Description: A set of three binary classifiers (random forest, gradient boosting classifier,
and logistic regression) to predict the Blood-Brain Barrier
(BBB) permeability of small organic compounds. The best models were applied to natural
products of marine origin, able to inhibit kinases associated with neurodegenerative
disorders.
Mode: ''
Task: []
Input: []
Input Shape: ''
Output: []
Output Type: []
Output Shape: ''
Interpretation: ''
disorders. The training set size was around 300 compounds.
Mode: 'Retrained'
Task: 'Classification'
Input: 'Compound'
Input Shape: 'Single'
Output: 'Probability'
Output Type: 'Float'
Output Shape: 'List'
Interpretation: 'Classification score over three classifiers, namely random forest (rfc), gradient boosting classifier (gbc), and logistic regression (logres).'
Tag:
- Drug-likeness
- Permeability
Publication: https://pubmed.ncbi.nlm.nih.gov/30699889/
Source Code: https://github.com/plissonf/BBB-Models
License: MIT
DockerHub: https://hub.docker.com/r/ersiliaos/eos3mk2
Docker Architecture:
- AMD64
Binary file added model/.DS_Store
Binary file not shown.
Binary file added model/checkpoints/model1_rfc.joblib
Binary file not shown.
Binary file added model/checkpoints/model2_gbc.joblib
Binary file not shown.
Binary file added model/checkpoints/model3_logreg.joblib
Binary file not shown.
Binary file added model/checkpoints/scaler.joblib
Binary file not shown.
Binary file added model/framework/.DS_Store
Binary file not shown.
45 changes: 45 additions & 0 deletions model/framework/code/charge_neutralizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from rdkit import Chem
from rdkit.Chem import AllChem

def _InitialiseNeutralisationReactions():
    """Build the substructure transformations used to neutralise charges.

    Returns a list of ``(query, product)`` tuples, where ``query`` is a
    SMARTS-derived mol matching a charged group and ``product`` is the
    neutral replacement.  The product SMILES are parsed with
    ``sanitize=False`` so fragments such as ``[nH]`` are accepted.
    """
    # (charged-group SMARTS, neutral replacement SMILES)
    transformations = (
        ('[n+;H]', 'n'),                      # imidazoles
        ('[N+;!H0]', 'N'),                    # amines
        ('[$([O-]);!$([O-][#7])]', 'O'),      # carboxylic acids and alcohols
        ('[S-;X1]', 'S'),                     # thiols
        ('[$([N-;X2]S(=O)=O)]', 'N'),         # sulfonamides
        ('[$([N-;X2][C,N]=C)]', 'N'),         # enamines
        ('[n-]', '[nH]'),                     # tetrazoles
        ('[$([S-]=O)]', 'S'),                 # sulfoxides
        ('[$([N-]C=O)]', 'N'),                # amides
    )
    return [
        (Chem.MolFromSmarts(smarts), Chem.MolFromSmiles(smiles, False))
        for smarts, smiles in transformations
    ]

# Lazily-built, module-level cache of the neutralisation reactions.
_reactions = None


def NeutraliseCharges(smiles, reactions=None):
    """Neutralise charged groups in a SMILES string.

    Parameters
    ----------
    smiles : str or None
        Input SMILES.  Invalid or ``None`` input is returned unchanged
        (the fit script can legitimately pass ``None`` for unparseable
        training compounds).
    reactions : list of (query, product) tuples, optional
        Override the default transformations; when omitted, the cached
        result of ``_InitialiseNeutralisationReactions`` is used.

    Returns
    -------
    tuple
        ``(neutralised_smiles, True)`` if any replacement was applied,
        otherwise ``(smiles, False)``.
    """
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions = _InitialiseNeutralisationReactions()
        reactions = _reactions
    # Guard against invalid input: the original crashed with an
    # AttributeError on mol.HasSubstructMatch when parsing failed.
    mol = Chem.MolFromSmiles(smiles) if smiles is not None else None
    if mol is None:
        return (smiles, False)
    replaced = False
    for reactant, product in reactions:
        # Apply each transformation repeatedly until no match remains.
        while mol.HasSubstructMatch(reactant):
            replaced = True
            mol = AllChem.ReplaceSubstructs(mol, reactant, product)[0]
    if replaced:
        return (Chem.MolToSmiles(mol, True), True)
    return (smiles, False)
58 changes: 58 additions & 0 deletions model/framework/code/descriptors_calculator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import numpy as np
import pandas as pd
import tempfile
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors


def calculate_conformers(smiles_list):
    """Generate 3D conformers for a list of SMILES.

    For each molecule, hydrogens are added, up to 10 conformers are
    embedded with ETKDG and MMFF-optimised, conformers are aligned, and
    the result is written to a temporary SDF file which is then read back.

    Parameters
    ----------
    smiles_list : list of str
        SMILES strings; every entry must be parseable by RDKit.

    Returns
    -------
    list of rdkit Mol
        Molecules read back from the SDF (entries the SDF reader cannot
        parse are dropped, so the result may be shorter than the input —
        NOTE(review): downstream code assumes equal length; confirm).

    Raises
    ------
    ValueError
        If any SMILES cannot be parsed.
    """
    import os  # local import: only needed for temp-file cleanup

    names = ["molecule_{}".format(i) for i in range(len(smiles_list))]

    print("Creating temporary file to store conformers")
    # delete=False so the file survives the context manager; we remove it
    # ourselves in the finally block below (the original leaked it).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".sdf") as tmp:
        tmp_name = tmp.name

    try:
        print("Generating conformers")
        writer = Chem.SDWriter(tmp_name)
        rms_dict = dict()
        for j in tqdm(range(len(smiles_list))):
            mol = Chem.MolFromSmiles(smiles_list[j])
            if mol is None:
                # Fail with a clear message instead of the original's
                # AttributeError on the next line.
                raise ValueError("Invalid SMILES: {}".format(smiles_list[j]))
            mol_h = Chem.AddHs(mol)
            # Embed conformers with the CSD-based ETKDG method.
            cids = AllChem.EmbedMultipleConfs(mol_h, numConfs=10, params=AllChem.ETKDG())
            for cid in cids:
                _ = AllChem.MMFFOptimizeMolecule(mol_h, confId=cid)
            mol_h.SetProp("_Name", names[j])

            rmslist = []
            AllChem.AlignMolConformers(mol_h, RMSlist=rmslist)
            rms_dict[names[j]] = rmslist
            writer.write(mol_h)
        writer.close()

        print("Reading the conformers file")
        dataset = [mol for mol in Chem.SDMolSupplier(tmp_name) if mol is not None]
        return dataset
    finally:
        # Always clean up the temporary SDF, even on error.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)


def descriptors_calculator(smiles_list):
    """Compute the full RDKit 2D descriptor set for a list of SMILES.

    Conformers are generated first (descriptors are computed on the
    molecules read back from the conformer SDF), then every descriptor in
    ``Descriptors._descList`` is evaluated.

    Parameters
    ----------
    smiles_list : list of str
        Input SMILES strings.

    Returns
    -------
    pandas.DataFrame
        One row per molecule, one column per RDKit descriptor name.
    """
    print("Calculating conformers")
    dataset = calculate_conformers(smiles_list)

    print("Calculating descriptors")
    descriptor_names = [d[0] for d in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    print(len(descriptor_names))

    values = np.array([calc.CalcDescriptors(mol) for mol in dataset])
    # Keep a valid 2-D shape even when no molecule survived conformer
    # generation; otherwise DataFrame construction fails on a 1-D empty
    # array that does not match the column count.
    values = values.reshape(-1, len(descriptor_names))

    return pd.DataFrame(values, columns=descriptor_names)
53 changes: 43 additions & 10 deletions model/framework/code/main.py
Original file line number Diff line number Diff line change
# Entry point for the eos3mk2 BBBP model: reads SMILES from a one-column CSV,
# neutralises charges, computes RDKit descriptors, scales them, and scores each
# molecule with three pre-trained classifiers (random forest, gradient
# boosting, logistic regression).  Usage: python main.py <input.csv> <output.csv>
import os
import csv
import sys
import joblib
import numpy as np
import pandas as pd

# parse arguments
input_file = sys.argv[1]
output_file = sys.argv[2]

# current file directory
root = os.path.dirname(os.path.abspath(__file__))
checkpoints_dir = os.path.join(root, "..", "..", "checkpoints")

# append system path so the sibling helper modules are importable
sys.path.append(root)
from charge_neutralizer import NeutraliseCharges
from descriptors_calculator import descriptors_calculator

# loading scaler
scaler = joblib.load(os.path.join(checkpoints_dir, "scaler.joblib"))

# loading models
model_1 = joblib.load(os.path.join(checkpoints_dir, "model1_rfc.joblib"))
model_2 = joblib.load(os.path.join(checkpoints_dir, "model2_gbc.joblib"))
model_3 = joblib.load(os.path.join(checkpoints_dir, "model3_logreg.joblib"))

# read SMILES from .csv file, assuming one column with header
with open(input_file, "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    smiles_list = [r[0] for r in reader]

# neutralize charges; NeutraliseCharges returns (smiles, changed_flag) and we
# keep only the neutralised SMILES
neutralized_smiles = [NeutraliseCharges(smiles)[0] for smiles in smiles_list]

# calculate descriptors
descriptors = np.array(descriptors_calculator(neutralized_smiles))

# normalize descriptors; NaNs produced by the scaler are imputed with 0, the
# same convention used when the scaler was fit
descriptors = np.array(pd.DataFrame(scaler.transform(descriptors)).fillna(0))

# run models: probability of the positive (BBB-permeable) class
output_1 = model_1.predict_proba(descriptors)[:, 1]
output_2 = model_2.predict_proba(descriptors)[:, 1]
output_3 = model_3.predict_proba(descriptors)[:, 1]

# reconstruct output: one row of three scores per input molecule
outputs = [
    [float(output_1[i]), float(output_2[i]), float(output_3[i])]
    for i in range(len(smiles_list))
]

# check input and output have the same length
assert len(smiles_list) == len(outputs)

# write output in a .csv file; newline="" prevents blank rows on Windows
with open(output_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["rfc_score", "gbc_score", "logreg_score"])  # header
    writer.writerows(outputs)
5 changes: 4 additions & 1 deletion model/framework/examples/input.csv
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
smiles
smiles
Clc1c(N(S(=O)(C)=O)CCO)c2c(NC(C(N2)=O)=O)cc1Cl
Clc1ccc(C(N2CCN(CCOCCO)CC2)c3ccccc3)cc1
OC(C1CCN(CCCC(O)C2=CC=C(C(C)(C)C)C=C2)CC1)(C3=CC=CC=C3)C4=CC=CC=C4
4 changes: 4 additions & 0 deletions model/framework/examples/output.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
rfc_score,gbc_score,logreg_score
0.1714101727051474,2.165122025378177e-09,0.00011447820403389557
0.6954498139522349,0.9999999942868296,0.9226070246759587
0.5897135663254606,0.9999999835095177,0.8595850881766514
95 changes: 95 additions & 0 deletions model/framework/fit/01_fit_scaler_and_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import os
import pandas as pd
import joblib
import sys
import numpy as np
from rdkit import Chem
from sklearn.preprocessing import MinMaxScaler

root = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(root, "..", "code"))
from charge_neutralizer import NeutraliseCharges
from descriptors_calculator import descriptors_calculator

print("Loading the molecules")
df = pd.read_excel(os.path.join(root, "datasetsCompounds.xlsx"), sheet_name="CPSMs")
print(df.head())

def str2float(x):
    """Best-effort float conversion; unparseable values become NaN.

    Parameters
    ----------
    x : object
        A cell value from the spreadsheet (string, number, or None).

    Returns
    -------
    float
        ``float(x)``, or ``numpy.nan`` when conversion is impossible.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # Narrowed from a bare except, which would also swallow
        # KeyboardInterrupt/SystemExit.
        return np.nan

# Regression target: numeric logBB where available, NaN otherwise.
y_logbb = np.array([str2float(x) for x in df["logBB value"].tolist()])

# Canonicalise SMILES via an RDKit round-trip; unparseable entries become None.
smiles_list = []
for smi in df["SMILES"].tolist():
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        smiles_list += [None]
    else:
        smiles_list += [Chem.MolToSmiles(mol)]

print("Neutralizing SMILES")
# Keep only the neutralised SMILES; index [1] would be the changed-flag.
neutralized_smiles = []
for smiles in smiles_list:
    neutralized_smiles += [NeutraliseCharges(smiles)[0]]

print("Calculating descriptors")
descriptors = np.array(descriptors_calculator(neutralized_smiles))

# MinMax scaling fitted on the full descriptor matrix.
scaler = MinMaxScaler()
scaler.fit(descriptors)

# Save scaler so inference (main.py) applies the identical transform.
joblib.dump(scaler, os.path.join(root, "..", "..", "checkpoints", "scaler.joblib"))

# Scale descriptors; NaNs produced by scaling are imputed with 0 (same
# convention used at inference time).
descriptors_scaled = np.array(pd.DataFrame(scaler.transform(descriptors)).fillna(0))

# Binarize classification: logBB > 0.1 is labelled permeable (1), else 0;
# missing logBB stays NaN and is masked out below.
y_logbb_class = []
for y in y_logbb:
    if np.isnan(y):
        y_logbb_class += [np.nan]
        continue
    if y > 0.1:
        y_logbb_class += [1]
    else:
        y_logbb_class += [0]
y_logbb_class = np.array(y_logbb_class)
print(y_logbb_class)

# Get data: keep only compounds with a known label.
mask = ~np.isnan(y_logbb_class)
x = descriptors_scaled[mask]
y = y_logbb_class[mask]

# Setting up for ML: stratified 10-fold CV with a fixed seed for reproducibility.
from sklearn import model_selection
seed = 42
skfold = model_selection.StratifiedKFold(n_splits=10, random_state=seed, shuffle= True)

# Model 1: random forest (hyperparameters presumably from a prior search —
# TODO confirm provenance).
from sklearn.ensemble import RandomForestClassifier
opt_rfc = RandomForestClassifier(n_estimators=700, max_depth=5, max_features='log2', min_samples_leaf = 2)
scores = model_selection.cross_val_score(opt_rfc, x, y, cv=skfold, scoring='accuracy')
print("Stratified 10-fold CV Model accuracy %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100))
opt_rfc.fit(x, y)
joblib.dump(opt_rfc, os.path.join(root, "..", "..", "checkpoints", "model1_rfc.joblib"))

# Model 2: gradient boosting classifier.
from sklearn.ensemble import GradientBoostingClassifier
opt_gbc = GradientBoostingClassifier(n_estimators=300, max_depth=15, max_features='log2', min_samples_leaf = 4)
scores = model_selection.cross_val_score(opt_gbc, x, y, cv=skfold, scoring='accuracy')
print("Stratified 10-fold CV Model accuracy %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100))
opt_gbc.fit(x, y)
joblib.dump(opt_gbc, os.path.join(root, "..", "..", "checkpoints", "model2_gbc.joblib"))

# Model 3: L2-regularised logistic regression.
from sklearn.linear_model import LogisticRegression
opt_logreg = LogisticRegression(penalty = 'l2', C = 10)
scores = model_selection.cross_val_score(opt_logreg, x, y, cv=skfold, scoring='accuracy')
print("Stratified 10-fold CV Model accuracy %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100))
opt_logreg.fit(x, y)
joblib.dump(opt_logreg, os.path.join(root, "..", "..", "checkpoints", "model3_logreg.joblib"))
Binary file added model/framework/fit/datasetsCompounds.xlsx
Binary file not shown.

0 comments on commit f0597c4

Please sign in to comment.