@@ -39,16 +39,24 @@ def select_compound_pairs_wrapper(
39
39
settings .same_prob_bins ,
40
40
settings .include_diagonal )
41
41
42
- aimed_nr_of_pairs_per_bin = determine_aimed_nr_of_pairs_per_bin (available_pairs_per_bin_matrix ,
43
- settings ,
44
- nr_of_inchikeys = len (inchikeys14_unique ))
45
-
46
- pair_frequency_matrixes = balanced_selection_of_pairs_per_bin (available_pairs_per_bin_matrix ,
47
- settings .max_pair_resampling ,
48
- aimed_nr_of_pairs_per_bin )
49
-
50
- selected_pairs_per_bin = convert_to_selected_pairs_list (pair_frequency_matrixes , available_pairs_per_bin_matrix ,
51
- available_scores_per_bin_matrix , inchikeys14_unique )
42
+ aimed_nr_of_pairs_per_bin , bin_priorities = determine_aimed_nr_of_pairs_per_bin (
43
+ available_pairs_per_bin_matrix ,
44
+ settings ,
45
+ nr_of_inchikeys = len (inchikeys14_unique )
46
+ )
47
+
48
+ pair_frequency_matrixes = balanced_selection_of_pairs_per_bin (
49
+ available_pairs_per_bin_matrix ,
50
+ settings .max_pair_resampling ,
51
+ aimed_nr_of_pairs_per_bin
52
+ )
53
+
54
+ selected_pairs_per_bin = convert_to_selected_pairs_list (
55
+ pair_frequency_matrixes ,
56
+ available_pairs_per_bin_matrix ,
57
+ available_scores_per_bin_matrix ,
58
+ inchikeys14_unique
59
+ )
52
60
return [pair for pairs in selected_pairs_per_bin for pair in pairs ]
53
61
54
62
@@ -143,12 +151,19 @@ def compute_jaccard_similarity_per_bin(
143
151
144
152
def determine_aimed_nr_of_pairs_per_bin (available_pairs_per_bin_matrix , settings , nr_of_inchikeys ):
145
153
"""Determines the aimed_nr_of_pairs_per_bin.
146
- If the settings given are higher than the highest possible number of pairs it is lowered to that"""
154
+
155
+ If the settings given are higher than the highest possible number of pairs it is lowered to that.
156
+ """
147
157
148
158
# Select the nr_of_pairs_per_bin to use
149
159
nr_of_available_pairs_per_bin = get_nr_of_available_pairs_in_bin (available_pairs_per_bin_matrix )
150
160
lowest_max_number_of_pairs = min (nr_of_available_pairs_per_bin ) * settings .max_pair_resampling
151
161
print (f"The available nr of pairs per bin are: { nr_of_available_pairs_per_bin } " )
162
+
163
+ # Set bin priority from lowest to highest no. of available pairs
164
+ bin_priority = np .argsort (nr_of_available_pairs_per_bin )
165
+ print (f"Bin priorities will be orderd accordingly: { [settings .same_prob_bins [i ] for i in bin_priority ]} " )
166
+
152
167
aimed_nr_of_pairs_per_bin = settings .average_pairs_per_bin * nr_of_inchikeys
153
168
if lowest_max_number_of_pairs < aimed_nr_of_pairs_per_bin :
154
169
print (f"Warning: The average_pairs_per_bin: { settings .average_pairs_per_bin } cannot be reached, "
@@ -158,13 +173,14 @@ def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings
158
173
f"Instead the lowest number of available pairs in a bin times the resampling is used, "
159
174
f"which is: { lowest_max_number_of_pairs } " )
160
175
aimed_nr_of_pairs_per_bin = lowest_max_number_of_pairs
161
- return aimed_nr_of_pairs_per_bin
176
+ return aimed_nr_of_pairs_per_bin , bin_priority
162
177
163
178
164
179
def balanced_selection_of_pairs_per_bin (
165
180
available_pairs_per_bin_matrix : np .ndarray ,
166
181
max_pair_resampling : int ,
167
- nr_of_pairs_per_bin : int
182
+ nr_of_pairs_per_bin : int ,
183
+ bin_priority : np .ndarray = None ,
168
184
) -> np .ndarray :
169
185
"""From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.
170
186
@@ -190,11 +206,16 @@ def balanced_selection_of_pairs_per_bin(
190
206
Resampling means that the exact same inchikey pair is added multiple times to the list of pairs.
191
207
nr_of_pairs_per_bin:
192
208
The number of pairs that should be sampled for each tanimoto bin.
209
+ bin_priority:
210
+ Bins will be processed in the order given in bin_priority. Default is set to None in which case no change
211
+ to the order will be done.
193
212
"""
213
+ if bin_priority is None :
214
+ bin_priority = np .arange (0 , available_pairs_per_bin_matrix .shape [0 ])
194
215
195
216
inchikey_count = np .zeros (available_pairs_per_bin_matrix .shape [1 ])
196
217
pair_frequency_matrixes = []
197
- for pairs_in_bin in available_pairs_per_bin_matrix :
218
+ for pairs_in_bin in available_pairs_per_bin_matrix [ bin_priority ] :
198
219
pair_frequencies , inchikey_count = select_balanced_pairs (pairs_in_bin ,
199
220
inchikey_count ,
200
221
nr_of_pairs_per_bin ,
0 commit comments