# utils_newbg.py
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union, Any, Literal
from torch.utils.data import Dataset
import numpy as np
import logging
from time import time
from datetime import timedelta
from itertools import combinations, product
from random import sample, shuffle
from utils import pair_weights
import pandas as pd
from collections import Counter, defaultdict
from pprint import pprint
logger = logging.getLogger('twosteprt.utils')
info = logger.info
warning = logger.warning
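# SPECIAL_FEATURES lists (feature_fn, feature_size) tuples: each feature_fn maps
# an RDKit atom to `feature_size` numeric values and is consumed by
# compute_special_features below. Hypothetical entry, for illustration only:
#   SPECIAL_FEATURES = [(lambda a: [a.GetFormalCharge()], 1)]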
SPECIAL_FEATURES = []
SPECIAL_FEATURES_SIZE = sum([x[1] for x in SPECIAL_FEATURES])
def compute_special_features(mol, sysfeatures):
features = np.zeros((mol.GetNumAtoms(), SPECIAL_FEATURES_SIZE + (len(sysfeatures) if sysfeatures is not None else 0)))
for i, a in enumerate(mol.GetAtoms()):
j = 0
for f, n in SPECIAL_FEATURES:
res = f(a)
assert n == len(res)
features[i, j:j+n] = res
j += n
if sysfeatures is not None:
features[i, j:] = sysfeatures
return features
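# The returned matrix has shape (n_atoms, SPECIAL_FEATURES_SIZE + len(sysfeatures)):
# the special atom features fill the left block of each row, and the
# (atom-independent) system features are broadcast into the right block.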
def sysfeature_graph(smiles, graph, sysfeatures, bond_or_atom='bond', special_features=False):
from dmpnn_graph import dmpnn_graph as mol2graph
if bond_or_atom == 'bond':
return mol2graph(smiles, bond_features_extra=np.array([sysfeatures] * int(graph.n_bonds / 2)))
elif bond_or_atom == 'atom':
if not special_features:
return mol2graph(smiles, atom_features_extra=np.array([sysfeatures] * graph.n_atoms))
else:
from chemprop.rdkit import make_mol
mol = make_mol(smiles, False, False, False)
return mol2graph(mol, atom_features_extra=compute_special_features(mol, sysfeatures))
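# Illustrative call (assumes `graph` was produced by dmpnn_graph and `sysfeatures`
# is a 1-D array of system descriptors):
#   g = sysfeature_graph('CCO', graph, sysfeatures, bond_or_atom='atom')
# The n_bonds / 2 above reflects that the graph counts each bond in both
# directions, while bond_features_extra expects one row per undirected bond.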
@dataclass
class RankDataset(Dataset):
x_mols: List[Union[Any, str]] # mol graphs or SMILES
x_extra: Union[np.ndarray, List[List[float]]] # extra compound features, e.g., logp
x_sys: Union[np.ndarray, List[List[float]]] # system features
x_ids: List[str] # ID (e.g., smiles) for each sample
y: Union[np.ndarray, List[float]] # retention times
    x_sys_global_num: Optional[int] = None # number of leading x_sys features (exclusive slice end index) that are global for the whole dataset
    use_pair_weights: bool=True # weigh pairs by their RT difference (soft-thresholded by epsilon)
epsilon: float=0.5 # soft threshold for retention time difference
discard_smaller_than_epsilon: bool=False # don't weigh by rt diff; simply discard any pairs with rt_diff<epsilon
use_group_weights: bool=True # weigh number of samples per group
cluster: bool=False # cluster datasets with same column params for calculating
# group weights
    downsample_groups: bool=False # use the smallest per-group pair count as the pair cap for every group
downsample_always_confl: bool=False # include all conflicting pairs also when downsampling
    downsample_factor: float=1.0 # if greater than 1, some clusters may end up with fewer pairs
group_weights_only_intra_cluster: bool=False # group-weights are used, but only for weighing within a cluster
weight_steepness: float=20 # steepness of the pair_weight_fn
    weight_mid: float=0.75 # midpoint of the pair_weight_fn (RT-difference factor at which the weight reaches 0.5)
dynamic_weights: bool=True # adapt epsilon to gradient length
pair_step: int=1 # step size for generating pairs
pair_stop: Optional[int]=None # stop number for generating pairs
dataset_info: Optional[List[str]] = None # Dataset ID for each datum
void_info: Optional[Dict[str, float]] = None # void time mapping for dataset IDs
void: Optional[float]=None # global void time
no_inter_pairs: bool=True # don't generate inter dataset pairs
no_intra_pairs: bool=False # don't generate intra dataset pairs
    max_indices_size: Optional[int]=None # limit for the size of indices
    max_num_pairs: Optional[int]=None # limit for the number of pairs per dataset/group
    y_neg: bool=False # -1 instead of 0 for negative pair
    y_float: bool=False # yield target values as floats instead of as longs
    conflicting_smiles_pairs: dict = field(default_factory=dict) # conflicting pairs (smiles)
only_confl: bool=False # gather only conflicting pairs
confl_weight: float=1. # weight modifier for conflicting pairs
add_sysfeatures_to_graphs: bool=False
sysfeatures_graphs_mode: Literal['bond', 'atom']='bond'
include_special_atom_features: bool=False
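    # Construction sketch (illustrative values only; in the real pipeline the
    # graphs/SMILES, feature arrays, and dataset IDs come from the surrounding
    # training code):
    #   ds = RankDataset(x_mols=smiles, x_extra=extra, x_sys=sysf, x_ids=smiles,
    #                    y=rts, dataset_info=ds_ids, void_info={'0001': 1.5})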
def __post_init__(self):
if (isinstance(self.x_extra, np.ndarray)):
self.x_extra = self.x_extra.astype('float32')
if (isinstance(self.x_sys, np.ndarray)):
self.x_sys = self.x_sys.astype('float32')
# assert dimensions etc.
assert len(self.x_mols) == len(self.x_extra) == len(self.x_sys) == len(self.x_ids) == len(self.y)
if (self.dataset_info is not None):
assert len(self.y) == len(self.dataset_info)
assert not (self.no_inter_pairs and self.no_intra_pairs), (
'no_inter_pairs and no_intra_pairs can\'t be both set')
# preprocess doublets
self.preprocess_doublets()
# transform single compounds(+info) into pairs for ranking
transformed = self._transform_pairwise()
self.x1_indices = transformed['x1_indices']
self.x2_indices = transformed['x2_indices']
self.y_trans = transformed['y_trans']
if (self.y_float):
self.y_trans = self.y_trans.astype('float32')
self.weights = transformed['weights']
self.is_confl = transformed['is_confl']
# for including sysfeatures into graphs, graphs have to be recomputed
if (self.add_sysfeatures_to_graphs or self.include_special_atom_features):
if self.add_sysfeatures_to_graphs:
print('add system features to graphs')
if self.include_special_atom_features:
print('add special atom features to graphs')
for i in range(len(self.x_mols)):
self.x_mols[i] = sysfeature_graph(self.x_ids[i], self.x_mols[i], self.x_sys[i] if self.add_sysfeatures_to_graphs else None,
bond_or_atom=self.sysfeatures_graphs_mode,
special_features=self.include_special_atom_features)
def _transform_pairwise(self):
x1_indices = []
x2_indices = []
y_trans = []
weights = []
is_confl = []
# group by dataset
groups = {}
pair_nrs = {}
group_index_start = {}
group_index_end = {}
groups_max_rts = defaultdict(float)
# confl_pair_report = {}
if (self.dataset_info is None):
groups['unk'] = list(range(len(self.y)))
else:
for i in range(len(self.y)):
groups.setdefault(self.dataset_info[i], []).append(i)
groups_max_rts[self.dataset_info[i]] = max(groups_max_rts[self.dataset_info[i]],
self.y[i])
print(f'{groups_max_rts=}')
# preprocess confl pair list for O(1) lookup
# and disregard confl pairs not conflicting for this training set
confl_pairs_lookup = {k for k, v in self.conflicting_smiles_pairs.items()
if any(all(xi in groups for xi in x) for x in v)}
print(f'using {len(confl_pairs_lookup)} out of the {len(self.conflicting_smiles_pairs)} '
'conflicting pairs provided')
# same-dataset pairs
inter_pair_nr = intra_pair_nr = 0
if (not self.no_intra_pairs):
info('computing intra-dataset pairs...')
t0 = time()
for group in groups:
group_index_start[group] = len(weights)
group_void_rt = (self.void_info[group] if self.void_info is not None
and group in self.void_info else self.void)
pair_nr = 0
# get conflicting smiles pairs indices
confl_indices = set()
if (len(confl_pairs_lookup) > 0):
for i, j in combinations(groups[group], 2):
if frozenset((self.x_ids[i], self.x_ids[j])) in confl_pairs_lookup:
confl_indices.add(frozenset((i, j)))
it = self.dataset_pair_it(groups[group], self.pair_step, self.pair_stop,
max_indices_size=self.max_indices_size,
max_num_pairs=self.max_num_pairs,
obl_indices=confl_indices)
if (logger.level <= logging.INFO):
from tqdm import tqdm
it = tqdm(it)
doublets_filtered = 0
for i, j, w in it:
# filter out invalid pairs due to doublets
if (hasattr(self, 'doublet_rt_ranges') and
((group, self.x_ids[i]) in self.doublet_rt_ranges or
(group, self.x_ids[j]) in self.doublet_rt_ranges)):
min_i = (self.doublet_rt_ranges[(group, self.x_ids[i])][0]
if (group, self.x_ids[i]) in self.doublet_rt_ranges
else self.y[i])
max_i = (self.doublet_rt_ranges[(group, self.x_ids[i])][1]
if (group, self.x_ids[i]) in self.doublet_rt_ranges
else self.y[i])
min_j = (self.doublet_rt_ranges[(group, self.x_ids[j])][0]
if (group, self.x_ids[j]) in self.doublet_rt_ranges
else self.y[j])
max_j = (self.doublet_rt_ranges[(group, self.x_ids[j])][1]
if (group, self.x_ids[j]) in self.doublet_rt_ranges
else self.y[j])
if (((max_i >= min_j) and (max_i <= max_j))
or ((min_i >= min_j) and (min_i <= max_j))
or ((min_i >= min_j) and (max_i <= max_j))
or ((max_i >= max_j) and (min_i <= min_j))):
# print(f'filtered doublet pair ({self.x_ids[i]}, {self.x_ids[j]}); ranges '
# f'{(min_i, max_i)}, {(min_j, max_j)}')
doublets_filtered += 1
continue
res = self.get_pair(self.y, i, j, group_void_rt or 0, group_void_rt or 0, self.y_neg)
if (res is None):
continue
pos_idx, neg_idx, yi = res
x1_indices.append(pos_idx)
x2_indices.append(neg_idx)
y_trans.append(yi)
# weights
weights.append(w)
# is conflicting pair?
is_confl.append(frozenset((pos_idx, neg_idx)) in confl_indices)
pair_nr += 1
pair_nrs[group] = pair_nr
intra_pair_nr += pair_nr
group_index_end[group] = len(weights)
info(f'filtered out {doublets_filtered} invalid pairs due to doublets for group {group}')
info(f'done ({str(timedelta(seconds=time() - t0))} elapsed)')
# between groups
if (not self.no_inter_pairs):
            info('computing inter-dataset pairs...')
t0 = time()
inter_group_nr = len(list(combinations(groups, 2)))
it = combinations(groups, 2)
if (logger.level <= logging.INFO):
from tqdm import tqdm
it = tqdm(list(it))
for group1, group2 in it:
group_index_start[(group1, group2)] = len(weights)
void_i = (self.void_info[group1] if self.void_info is not None
and group1 in self.void_info else self.void)
void_j = (self.void_info[group2] if self.void_info is not None
and group2 in self.void_info else self.void)
pair_nr = 0
n = min(max(len(groups[group1]), len(groups[group2])), self.max_indices_size or 1e9)
max_pair_nr = (n * np.ceil((self.pair_stop if self.pair_stop is not None else n) / self.pair_step)
* (1/(inter_group_nr / len(groups)))).astype(int)
potential_pairs = self.get_comparable_pairs(groups[group1], groups[group2], self.y, self.x_ids,
void_i=void_i or 0, void_j=void_j or 0,
y_neg=self.y_neg, epsilon=self.epsilon,
pairs_compute_threshold=10 * max_pair_nr)
info(f'{group1}, {group2} {max_pair_nr=}, {(len(potential_pairs))=}')
for pos_idx, neg_idx, yi in iter(sample(potential_pairs, min(max_pair_nr, len(potential_pairs)))):
x1_indices.append(pos_idx)
x2_indices.append(neg_idx)
y_trans.append(yi)
weights.append(1.0) # absolute rt difference of pairs of two different datasets can't be compared
is_confl.append(None)
pair_nr += 1
pair_nrs[(group1, group2)] = pair_nr
inter_pair_nr += pair_nr
group_index_end[(group1, group2)] = len(weights)
info(f'done ({str(timedelta(seconds=time() - t0))} elapsed)')
info(f'{inter_pair_nr=}, {intra_pair_nr=}')
# cluster groups by system params
if (len(pair_nrs) > 0):
print(f'number of pairs per dataset ({len(pair_nrs)}): min={min(pair_nrs.values())}, max={max(pair_nrs.values())}')
pair_nrs_precluster = pair_nrs.copy()
pair_nrs_cluster_min = {}
if (self.cluster):
cluster_sys = {g: self.x_sys[x1_indices[group_index_start[g]]][:self.x_sys_global_num] for g in pair_nrs
                           if group_index_end[g] != group_index_start[g]}  # skip empty groups
clusters = {}
for g, sysf in cluster_sys.items():
clusters.setdefault(tuple(sysf), []).append(g)
pprint(clusters)
clusters = list(clusters.values())
pprint(pair_nrs)
for c in clusters:
pair_num_sum = sum([pair_nrs[g] for g in c])
pair_num_min = min([pair_nrs[g] for g in c])
for g in c:
pair_nrs[g] = pair_num_sum
pair_nrs_cluster_min[g] = pair_num_min
if (len(pair_nrs) > 0):
print(f'number of pairs per cluster ({len(clusters)}): min={min(pair_nrs.values())}, max={max(pair_nrs.values())}')
self.dataset_clusters = clusters
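        # At this point, datasets sharing identical global system features have
        # been merged into clusters: pair_nrs[g] now holds the cluster total,
        # while pair_nrs_precluster[g] keeps the per-dataset count used for
        # downsampling below.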
nr_group_pairs_max = max(list(pair_nrs.values()) + [0])
        downsample_nr = min(list(pair_nrs.values()) + [np.inf]) * self.downsample_factor
pprint(pair_nrs)
info('computing pair weights')
for g in pair_nrs:
weight_modifier = self.confl_weight # confl pairs are already balanced by weight; here they can be boosted additionally
if (self.downsample_groups):
downsample_nr_g = int(np.ceil(downsample_nr / (pair_nrs[g] / pair_nrs_precluster[g])))
actual_downsample_nr_g = min([downsample_nr_g, group_index_end[g] - group_index_start[g]])
print(f'{g}: {actual_downsample_nr_g=} = {downsample_nr=} / ({pair_nrs[g]=} / {pair_nrs_precluster[g]=})'
+ (f' [SHOULD BE {downsample_nr_g} ({actual_downsample_nr_g/downsample_nr_g:.0%})]'
if downsample_nr_g != actual_downsample_nr_g else ''))
downsample_whitelist = set(sample(range(group_index_start[g], group_index_end[g]), actual_downsample_nr_g))
# TODO: make sure many conflicting pairs are included in the sample
for i in range(group_index_start[g], group_index_end[g]):
if self.downsample_groups and i not in downsample_whitelist:
if self.downsample_always_confl and frozenset([self.x_ids[x1_indices[i]], self.x_ids[x2_indices[i]]]) in self.conflicting_smiles_pairs:
pass # with this option, conflicting pairs are never removed in downsampling
else:
weights[i] = None
continue
                rt_diff = (np.inf if isinstance(g, tuple) # no statement can be made for inter-group pairs
or not self.use_pair_weights
else np.abs(self.y[x1_indices[i]] - self.y[x2_indices[i]]))
if self.use_group_weights:
if self.group_weights_only_intra_cluster:
nr_group_pairs = pair_nrs_precluster[g]
nr_group_pairs_max = pair_nrs_cluster_min[g]
else:
nr_group_pairs = pair_nrs[g]
else:
nr_group_pairs = nr_group_pairs_max
weights_mod = pair_weights(smiles1=self.x_ids[x1_indices[i]], smiles2=self.x_ids[x2_indices[i]],
rt_diff=rt_diff,
nr_group_pairs=nr_group_pairs, nr_group_pairs_max=nr_group_pairs_max,
confl_weights_modifier=weight_modifier, confl_pair_list=self.conflicting_smiles_pairs,
only_confl=self.only_confl,
weight_steepness=self.weight_steepness,
weight_mid=self.weight_mid,
max_rt=groups_max_rts[g] if self.dynamic_weights else None,
epsilon=self.epsilon, discard_smaller_than_epsilon=self.discard_smaller_than_epsilon)
if (rt_diff < self.epsilon and weights_mod is not None and self.discard_smaller_than_epsilon):
print(rt_diff, 'should this pair not have been discarded?')
weights[i] = (weights_mod * weights[i]) if weights_mod is not None else None
# NOTE: pair weights can be "None"
info('done. removing None weights')
# remove Nones
x1_indices_new = []
x2_indices_new = []
y_trans_new = []
weights_new = []
is_confl_new = []
removed_counter = 0
        for i in range(len(y_trans)):
if (weights[i] is not None):
x1_indices_new.append(x1_indices[i])
x2_indices_new.append(x2_indices[i])
y_trans_new.append(y_trans[i])
weights_new.append(weights[i])
is_confl_new.append(is_confl[i])
else:
removed_counter += 1
info(f'removed {removed_counter} (of {len(y_trans)}) pairs for having "None" weights')
info('done generating pairs')
        return dict(x1_indices=np.asarray(x1_indices_new),
x2_indices=np.asarray(x2_indices_new),
y_trans=np.asarray(y_trans_new),
weights=np.asarray(weights_new),
is_confl=np.asarray(is_confl_new))
@staticmethod
def weight_fn(x, steep=4, mid=0.75):
"""sigmoid function with f(0) → 0, f(2) → 1, f(0.75) = 0.5"""
return 1 / (1 + np.exp(-steep * (x - mid)))
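    # With the defaults (steep=4, mid=0.75): weight_fn(0.0) ≈ 0.047,
    # weight_fn(0.75) == 0.5 and weight_fn(2.0) ≈ 0.993, matching the
    # asymptotics stated in the docstring.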
@staticmethod
def dataset_pair_it(indices, pair_step=1, pair_stop=None,
max_indices_size=None, max_num_pairs=None,
obl_indices=set()):
n = len(indices)
if (max_indices_size is not None):
it = sorted(sample(list(range(n)), min(max_indices_size, n)))
elif (max_num_pairs is not None):
it = sample(list(range(n)), n)
else:
it = range(n)
non_obl_pairs = 0
do_break = False
for i in it:
if do_break:
break
for j in range(i + 1,
(n if pair_stop is None else min(i + pair_stop, n)),
pair_step):
if (frozenset((indices[i], indices[j])) not in obl_indices):
                    if (max_num_pairs is not None and non_obl_pairs >= max_num_pairs):
do_break = True
break
yield indices[i], indices[j], 1.0
non_obl_pairs += 1
if (len(obl_indices) > 0):
obl_weight = non_obl_pairs / len(obl_indices)
print(f'{non_obl_pairs} non-conflicting pairs, {len(obl_indices)} conflicting pairs; weight: {obl_weight:.2f}')
for i, j in obl_indices:
yield i, j, obl_weight
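    # Example: list(RankDataset.dataset_pair_it([4, 7, 9])) yields
    # (4, 7, 1.0), (4, 9, 1.0), (7, 9, 1.0) -- all index pairs with weight 1.0.
    # pair_step/pair_stop thin out the j-range, and obl_indices (conflicting
    # pairs) are always emitted afterwards with a balancing weight.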
@staticmethod
def inter_dataset_pair_it(indices1, indices2, pair_step=1, pair_stop=None,
nr_groups_norm=1, max_indices_size=None):
max_ = max(len(indices1), len(indices2))
if (max_indices_size is not None):
max_ = min(max_, max_indices_size)
all_combs = list(product(indices1, indices2))
k = (max_ * np.ceil((pair_stop if pair_stop is not None else max_) / pair_step)
* nr_groups_norm).astype(int)
return iter(sample(all_combs, min(k, len(all_combs))))
@staticmethod
def get_pair(y, i, j, void_i=0, void_j=0, y_neg=False):
        # pos: compound eluting second, neg: compound eluting first; the ordering
        # (pos, neg) gets label 1, the flipped ordering (neg, pos) gets label 0 (or -1 if y_neg)
pos_idx, neg_idx = (i, j) if y[i] > y[j] else (j, i)
# void
if (y[i] < void_i and y[j] < void_j):
# don't take pairs where both compounds are in void volume
return None
# balanced class
if 1 != (-1)**(pos_idx + neg_idx):
return pos_idx, neg_idx, 1
else:
return neg_idx, pos_idx, (-1 if y_neg else 0)
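    # Example: get_pair([1.0, 2.0], 0, 1) returns (1, 0, 1): compound 1 elutes
    # after compound 0, and the index-parity check decides whether the pair is
    # emitted as (pos, neg, 1) or flipped to (neg, pos, 0/-1), so that both
    # target classes stay balanced.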
def preprocess_doublets(self):
doublet_rt_ranges = {} # {(ds, id_): (1.2, 2.1)}
for i in range(len(self.y)):
rt = self.y[i]
id_ = self.x_ids[i]
            ds = self.dataset_info[i] if self.dataset_info is not None else 'unk'
if ((ds, id_) not in doublet_rt_ranges):
doublet_rt_ranges[(ds, id_)] = (rt, rt)
doublet_rt_ranges[(ds, id_)] = (min(rt, *doublet_rt_ranges[(ds, id_)]),
max(rt, *doublet_rt_ranges[(ds, id_)]))
self.doublet_rt_ranges = {k: v for k, v in doublet_rt_ranges.items()
if v[0] != v[1]}
# stats on doublets: how many per dataset? mean/median rt difference per doublet
data = pd.DataFrame.from_records([{'dataset': k[0], 'rt_diff': v[1] - v[0]}
for k, v in self.doublet_rt_ranges.items()])
if len(data) > 0:
stats = data.groupby('dataset').rt_diff.agg(['count', 'mean', 'median'])
print('doublet stats:\n' + stats.to_string())
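    # E.g., if compound X occurs twice in dataset '0001' at RTs 1.2 and 2.1,
    # doublet_rt_ranges[('0001', X)] becomes (1.2, 2.1); _transform_pairwise
    # later drops any pair whose RT ranges overlap such an ambiguous window.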
def get_comparable_pairs(self, indices_i, indices_j, rts, ids,
void_i=0, void_j=0, y_neg=False, epsilon=0.5,
pairs_compute_threshold=None):
pairs = set()
def make_pairs(indices_pre, indices_post):
for i, (i_pre, i_post) in enumerate(product(indices_pre, indices_post)):
yield (i_post, i_pre, 1) if 1 == (-1)**i else (i_pre, i_post, -1 if y_neg else 0)
inters = list(set([ids[i] for i in indices_i]) & set([ids[j] for j in indices_j]))
shuffle(inters)
# TODO: problem if IDs not unique, assert this somewhere!
for id_k in inters:
if (pairs_compute_threshold is not None and len(pairs) > pairs_compute_threshold):
info('too many inter-pairs to consider; aborting with compute threshold')
warning('inter-pairs might be unbalanced due to their potentially large number!')
break
k_i = [i for i in indices_i if ids[i] == id_k][0]
k_j = [j for j in indices_j if ids[j] == id_k][0]
if (rts[k_i] < void_i or rts[k_j] < void_j):
continue
pre_is = [i for i in indices_i if rts[i] + epsilon < rts[k_i] and rts[i] >= void_i]
post_is = [i for i in indices_i if rts[i] > rts[k_i] + epsilon and rts[i] >= void_i]
pre_js = [j for j in indices_j if rts[j] + epsilon < rts[k_j] and rts[j] >= void_j]
post_js = [j for j in indices_j if rts[j] > rts[k_j] + epsilon and rts[j] >= void_j]
pairs |= set(make_pairs(pre_is, post_js))
pairs |= set(make_pairs(pre_js, post_is))
return list(pairs)
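    # Rationale: absolute RTs are not comparable across datasets, but for an
    # anchor compound k present in both, anything eluting clearly (by epsilon)
    # before k in one run must elute before anything eluting clearly after k in
    # the other run -- only those cross-dataset pairs are safe to rank.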
def remove_indices(self, indices):
assert all(len(_) == len(self.x1_indices) for _ in [
self.x1_indices, self.x2_indices, self.y_trans, self.weights,
self.is_confl])
x1_indices_new = []
x2_indices_new = []
y_trans_new = []
weights_new = []
is_confl_new = []
indices = set(indices)
for i in range(len(self.x1_indices)):
if (i not in indices):
x1_indices_new.append(self.x1_indices[i])
x2_indices_new.append(self.x2_indices[i])
y_trans_new.append(self.y_trans[i])
weights_new.append(self.weights[i])
is_confl_new.append(self.is_confl[i])
self.x1_indices = np.asarray(x1_indices_new)
self.x2_indices = np.asarray(x2_indices_new)
self.y_trans = np.asarray(y_trans_new)
self.weights = np.asarray(weights_new)
self.is_confl = np.asarray(is_confl_new)
def __len__(self):
return self.y_trans.shape[0]
def __getitem__(self, index):
        # x1_sys == x2_sys for the first `x_sys_global_num` features
# returns ((graph, extra, sys) x 2, y, weight)
return (((self.x_mols[self.x1_indices[index]], self.x_extra[self.x1_indices[index]],
self.x_sys[self.x1_indices[index]]),
(self.x_mols[self.x2_indices[index]], self.x_extra[self.x2_indices[index]],
self.x_sys[self.x2_indices[index]])),
self.y_trans[index], self.weights[index], self.is_confl[index])
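# Each item is ((graph, extra, sys) for the positive and the negative compound,
# target, pair weight, conflict flag). Illustrative unpacking, `ds` being a
# constructed RankDataset:
#   ((g1, e1, s1), (g2, e2, s2)), y, w, confl = ds[0]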
def check_integrity(x: RankDataset, clean=False):
pairs = {}
for i, (x1, x2, y) in enumerate(zip(x.x1_indices, x.x2_indices, x.y_trans)):
p = tuple(sorted([x.x_ids[x1], x.x_ids[x2]]))
if (p[0] == x.x_ids[x2]):
y = (-1 if x.y_neg else 0) if y == 1 else 1
pairs.setdefault(p, []).append(
(x.dataset_info[x1], y, x.x_sys[x1][:x.x_sys_global_num], i))
    # NOTE: comparing only the global sys features makes the most sense here, although,
    # due to differing gradient positions, pairs flagged in this manner *can technically be valid*.
records = []
clean_indices = []
same_settings_datasets = []
for v in pairs.values():
nr_confl = nr_invalid = nr_combs = 0
invalid = []
for (ds_i, y_i, sys_i, i), (ds_j, y_j, sys_j, j) in combinations(v, 2):
nr_combs += 1
if (y_i != y_j):
if (ds_i == ds_j):
print(ds_i, x.x_ids[x.x1_indices[i]], x.x_ids[x.x2_indices[i]],
x.x_ids[x.x1_indices[j]], x.x_ids[x.x2_indices[j]])
nr_confl += 1
if ((sys_i == sys_j).all()):
nr_invalid += 1
same_settings_datasets.append((ds_i, ds_j))
invalid.append((i, j))
if (clean):
            # greedy algorithm: repeatedly remove the index involved in the most invalid pairs
while (len(invalid) > 0):
max_i = Counter(np.asarray(invalid).flatten()).most_common()[0][0]
clean_indices.append(max_i)
invalid = [_ for _ in invalid if max_i not in _]
records.append(dict(nr_combs=nr_combs, nr_confl=nr_confl, nr_invalid=nr_invalid))
stats = pd.DataFrame.from_records(records)
if (len(stats) != 0):
print(f'conflicting pairs percentage: {stats.nr_confl.sum() / stats.nr_combs.sum():.2%}')
print(f'conflicting pairs percentage (averaged): {(stats.nr_confl / stats.nr_combs).mean():.2%}')
print(f'invalid conflicting pairs percentage: {stats.nr_invalid.sum() / stats.nr_confl.sum():.2%}')
print(f'invalid pairs percentage (of total): {stats.nr_invalid.sum() / stats.nr_combs.sum():.2%}')
# dss = pd.merge(dss, pd.read_csv(os.path.join('../RepoRT/', 'ph_info.csv'), sep='\t', index_col=0)[REL_ONEHOT_COLUMNS], how='left', left_index=True, right_index=True)
# for ds1, ds2 in same_settings_datasets:
# assert all(x[0][0] == x[0][1] or np.isnan(x[0][0]) and np.isnan(x[0][1]) for x in
# zip(dss.loc[[ds1, ds2], ['column.id', 'column.flowrate', 'column.length'] +
# ['class.pH.A', 'class.pH.B', 'class.solvent'] + ['H']].values.transpose()))
return stats, clean_indices, same_settings_datasets
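# Cleaning sketch (illustrative, `ds` being a constructed RankDataset):
#   stats, clean_indices, _ = check_integrity(ds, clean=True)
#   ds.remove_indices(clean_indices)
# This flags pairs labeled inconsistently under identical global system
# settings and drops the indices involved in the most invalid pairs.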