-
Notifications
You must be signed in to change notification settings - Fork 10
/
isim_utils.py
154 lines (123 loc) · 4.4 KB
/
isim_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from isim_comp import calculate_isim
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
def pairwise_average(fingerprints: np.ndarray, n_ary: str = 'RR'):
    """
    Average the two-object iSIM value over every ordered pair of distinct objects.

    Parameters:
    fingerprints: numpy array of fingerprints, one object per row
    n_ary: name of the similarity index, forwarded to calculate_isim

    Returns:
    average: mean of calculate_isim over all ordered pairs (i, j) with i != j;
             np.mean of an empty list (nan) when there are fewer than two objects
    """
    n_objects = len(fingerprints)

    # Each unordered pair is visited twice, once as (i, j) and once as (j, i).
    sims = [
        calculate_isim(np.array([fingerprints[first], fingerprints[second]]), n_ary = n_ary)
        for first in range(n_objects)
        for second in range(n_objects)
        if first != second
    ]

    return np.mean(sims)
def binary_fps(smiles: list, fp_type: str = 'RDKIT', n_bits: int = 2048):
    """
    This function generates binary fingerprints for the dataset.

    Parameters:
    smiles: list of SMILES strings
    fp_type: type of fingerprint to generate ['RDKIT', 'ECFP4', 'ECFP6', or 'MACCS']
    n_bits: number of bits for the fingerprint (used by 'RDKIT', 'ECFP4' and
            'ECFP6'; 'MACCS' keys take no size argument, so it is not used there)

    Returns:
    fingerprints: numpy array of fingerprints, one row per SMILES string

    Raises:
    ValueError: if fp_type is not one of the supported types, or if a SMILES
                string cannot be parsed into a molecule
    """
    # Pick the bit-vector generator once, before touching any molecule, so an
    # unsupported fp_type is reported immediately.
    if fp_type == 'RDKIT':
        def make_bitvect(mol):
            return Chem.RDKFingerprint(mol, fpSize=n_bits)
    elif fp_type == 'ECFP4':
        def make_bitvect(mol):
            return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    elif fp_type == 'ECFP6':
        def make_bitvect(mol):
            return AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=n_bits)
    elif fp_type == 'MACCS':
        # Chem.MACCSkeys is only available as an attribute once the submodule
        # has been imported, so import it explicitly here.
        from rdkit.Chem import MACCSkeys

        def make_bitvect(mol):
            return MACCSkeys.GenMACCSKeys(mol)
    else:
        raise ValueError('Invalid fingerprint type: {!r}'.format(fp_type))

    fingerprints = []
    for smi in smiles:
        # Generate the mol object. MolFromSmiles signals an unparsable SMILES
        # by returning None; non-string input raises instead.
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception as err:
            raise ValueError('Invalid SMILES: {!r}'.format(smi)) from err
        if mol is None:
            raise ValueError('Invalid SMILES: {!r}'.format(smi))

        # ConvertToNumpyArray fills the destination array in place.
        fingerprint = np.array([])
        DataStructs.cDataStructs.ConvertToNumpyArray(make_bitvect(mol), fingerprint)
        fingerprints.append(fingerprint)

    return np.array(fingerprints)
def real_fps(smiles):
    """
    This function generates real number fingerprints for the dataset.

    Every descriptor in rdkit's Descriptors._descList is evaluated for each
    molecule. A descriptor that fails for a molecule is recorded as NaN, and
    any descriptor column containing NaN is removed from the result.

    Parameters:
    smiles: list of SMILES strings

    Returns:
    fingerprints: 2-D numpy float array with one row per SMILES string and one
                  column per descriptor that was NaN-free for all molecules;
                  an empty (0, 0) array when smiles is empty

    Raises:
    ValueError: if a SMILES string cannot be parsed into a molecule
    """
    fps = []
    for smi in smiles:
        # Generate the mol object. MolFromSmiles signals an unparsable SMILES
        # by returning None; non-string input raises instead.
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception as err:
            raise ValueError('Invalid SMILES: {!r}'.format(smi)) from err
        if mol is None:
            raise ValueError('Invalid SMILES: {!r}'.format(smi))

        # Generate the fingerprint and append to the list
        des = []
        for nm, fn in Descriptors._descList:
            try:
                val = fn(mol)
            except Exception:
                # Best effort: report the failure and mark the value as a real
                # NaN so the array stays numeric and the column is dropped below.
                print('Error computing descriptor: ', nm)
                val = np.nan
            des.append(val)
        fps.append(des)

    # Nothing to compute; the 2-D column filter below needs at least one row.
    if not fps:
        return np.empty((0, 0))

    # Drop columns with NaN values
    fps = np.array(fps, dtype=float)
    return fps[:, ~np.isnan(fps).any(axis = 0)]
def minmax_norm(fps):
    """
    This function performs min-max normalization on the dataset.

    Each numeric column is rescaled to [0, 1] as (x - min) / (max - min).
    Columns whose values are all equal (max == min) carry no information and
    are removed, as are columns containing NaN. Non-numeric columns are not
    rescaled.

    Parameters:
    fps: numpy array of fingerprints (anything accepted by pandas.DataFrame)

    Returns:
    fps: normalized numpy array of fingerprints
    """
    # Turn the array into a DataFrame
    df = pd.DataFrame(fps)

    # Normalize the data column by column
    numeric_columns = df.select_dtypes(include = [np.number]).columns
    constant_columns = []
    for column in numeric_columns:
        values = df[column]
        low = values.min()
        span = values.max() - low
        if span == 0:
            # Dividing by a zero range would only produce NaN; drop the column
            # explicitly instead of relying on that.
            constant_columns.append(column)
            continue
        df[column] = (values - low) / span

    df = df.drop(columns = constant_columns).dropna(axis = 'columns')

    # Return the normalized data as a numpy array
    return df.to_numpy()
def rdkit_pairwise_sim(fingerprints):
    """
    Average the Tanimoto similarity over every unordered pair of fingerprints.

    Parameters:
    fingerprints: sequence of fingerprints accepted by
                  DataStructs.BulkTanimotoSimilarity; must support len() and slicing

    Returns:
    average: mean of all pairwise similarities; np.mean of an empty list (nan)
             when there are fewer than two fingerprints
    """
    count = len(fingerprints)

    # Compare each fingerprint only against those after it, so every pair is
    # counted exactly once.
    sims = [
        value
        for idx in range(count - 1)
        for value in DataStructs.BulkTanimotoSimilarity(fingerprints[idx], fingerprints[idx + 1:])
    ]

    return np.mean(sims)