Merge pull request #124 from matchms/models_with_metadata
Train models with Metadata
florian-huber authored Aug 11, 2023
2 parents d6ccc93 + 4fb591f commit fb3b9e5
Showing 17 changed files with 858 additions and 150 deletions.
13 changes: 2 additions & 11 deletions .github/workflows/CI_build.yml
@@ -83,25 +83,16 @@ jobs:
      run: |
        which python
        python --version
-    - name: Install Tensorflow version 2.4
+    - name: Install Tensorflow version 2.6
      run: |
        python -m pip install --upgrade pip
-        pip install "tensorflow>=2.4,<2.5"
+        pip install "tensorflow>=2.6,<2.7"
    - name: Install other dependencies
      run: |
        pip install -e .[dev,train]
    - name: Show pip list
      run: |
        pip list
-    - name: Run test with tensorflow version 2.4
-      run: pytest
-    - name: Install Tensorflow version 2.6
-      run: |
-        pip install --upgrade "tensorflow>=2.6,<2.7"
-        pip install --upgrade "numpy>1.20,<1.24.0"
-    - name: Show pip list
-      run: |
-        pip list
    - name: Run test with tensorflow version 2.6
      run: pytest
    - name: Install Tensorflow version 2.8
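Note on the change above: the first CI job now targets TensorFlow 2.6 instead of 2.4, which makes the separate 2.6 upgrade step redundant. A minimal sketch of checking an environment against that pin; only the version bounds come from the workflow, the rest (including the packaging dependency) is illustrative:

# Illustrative only: mirror the CI pin "tensorflow>=2.6,<2.7" at runtime.
from packaging.version import Version
import tensorflow as tf

assert Version("2.6") <= Version(tf.__version__) < Version("2.7"), \
    f"expected TensorFlow 2.6.x, found {tf.__version__}"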
1 change: 1 addition & 0 deletions .gitignore
@@ -32,6 +32,7 @@ output/
models_trained/
computed_results/
notebooks/.ipynb_checkpoints/
+notebooks/train_with_additional_metadata
__pycache__/


8 changes: 2 additions & 6 deletions ms2deepscore/MS2DeepScoreMonteCarlo.py
@@ -109,12 +109,8 @@ def _create_monte_carlo_base(self):
        dropout_in_first_layer = ('dropout' in self.model.base.layers[3].name)

        # re-build base network with dropout layers always on
-        base = self.model.get_base_model(input_dim=self.input_vector_dim,
-                                         base_dims=base_dims,
-                                         embedding_dim=self.output_vector_dim,
-                                         dropout_rate=dropout_rate,
-                                         dropout_always_on=True,
-                                         dropout_in_first_layer=dropout_in_first_layer)
+        base = self.model.get_base_model(base_dims=base_dims, embedding_dim=self.output_vector_dim, dropout_rate=dropout_rate,
+                                         dropout_in_first_layer=dropout_in_first_layer, dropout_always_on=True)
        base.set_weights(self.model.base.get_weights())
        return base
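
For context, `_create_monte_carlo_base` rebuilds the base network with `dropout_always_on=True`, which is what enables Monte Carlo dropout: repeated stochastic forward passes whose spread estimates model uncertainty. A minimal sketch of that downstream idea, assuming a generic Keras model; the function and argument names are hypothetical, not this package's API:

import numpy as np

def monte_carlo_embeddings(model, inputs, n_draws=10):
    # Hypothetical helper: run n_draws stochastic passes (dropout stays active
    # via training=True) and return the ensemble mean and per-dimension spread.
    draws = np.stack([model(inputs, training=True).numpy() for _ in range(n_draws)])
    return draws.mean(axis=0), draws.std(axis=0)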

17 changes: 4 additions & 13 deletions ms2deepscore/data_generators.py
@@ -365,9 +365,10 @@ def __init__(self, binned_spectrums: List[BinnedSpectrumType],
        self.on_epoch_end()

    def __len__(self):
-        """Denotes the number of batches per epoch"""
-        # TODO: this means we don't see all data every epoch, because the last half-empty batch
-        # is omitted. I guess that is expected behavior? --> Yes, with the shuffling in each epoch that seem OK to me (and makes the code easier).
+        """Denotes the number of batches per epoch.
+        NB1: self.reference_scores_df only contains 'selected' inchikeys, see `self._data_selection`.
+        NB2: We don't see all data every epoch, because the last half-empty batch is omitted.
+        """
        return int(self.settings["num_turns"]) * int(np.floor(len(self.binned_spectrums) / self.settings["batch_size"]))

    def _spectrum_pair_generator(self, batch_index: int) -> Iterator[SpectrumPair]:
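The batch count above deliberately drops the final partial batch of every turn. A worked example of the formula with illustrative settings:

import numpy as np

num_turns, batch_size, n_spectrums = 2, 32, 1000  # illustrative values
batches_per_epoch = int(num_turns) * int(np.floor(n_spectrums / batch_size))
assert batches_per_epoch == 62  # 2 * floor(1000 / 32); 8 spectra per turn are skipped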
@@ -382,7 +383,6 @@ def _spectrum_pair_generator(self, batch_index: int) -> Iterator[SpectrumPair]:
        for index in indexes:
            spectrum1 = self.binned_spectrums[index]
            inchikey1 = spectrum1.get("inchikey")[:14]
-
            # Randomly pick the desired target score range and pick matching ID
            target_score_range = same_prob_bins[np.random.choice(np.arange(len(same_prob_bins)))]
            inchikey2 = self._find_match_in_range(inchikey1, target_score_range)
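The sampling above first draws one of the equal-probability Tanimoto bins, then looks for a partner InChIKey whose reference score to `inchikey1` falls inside that bin. A self-contained sketch of the same scheme; the toy bins, score table, and helper name are illustrative:

import numpy as np

same_prob_bins = [(0.0, 0.5), (0.5, 0.8), (0.8, 1.0)]            # toy score bins
reference_scores = {("AAA", "BBB"): 0.91, ("AAA", "CCC"): 0.42}  # toy ground truth

def find_match_in_range(inchikey1, rng):
    # Sample a target bin uniformly, then collect partners scoring inside it.
    low, high = same_prob_bins[rng.integers(len(same_prob_bins))]
    candidates = [key2 for (key1, key2), score in reference_scores.items()
                  if key1 == inchikey1 and low < score <= high]
    return rng.choice(candidates) if candidates else None

print(find_match_in_range("AAA", np.random.default_rng(0)))  # "BBB", "CCC", or None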
@@ -393,7 +393,6 @@ def on_epoch_end(self):
"""Updates indexes after each epoch"""
self.indexes = np.tile(np.arange(len(self.binned_spectrums)), int(self.settings["num_turns"]))
if self.settings["shuffle"]:

np.random.shuffle(self.indexes)

def _exclude_not_selected_inchikeys(self, reference_scores_df: pd.DataFrame) -> pd.DataFrame:
@@ -476,7 +475,6 @@ def __len__(self):
        This is expected behavior, with the shuffling this is OK.
        """
        return int(self.settings["num_turns"]) * int(np.floor(len(self.reference_scores_df) / self.settings["batch_size"]))
-
    def _spectrum_pair_generator(self, batch_index: int) -> Iterator[SpectrumPair]:
        """
        Generate spectrum pairs for batch. For each 'source' inchikey pick an inchikey in the
@@ -486,7 +484,6 @@ def _spectrum_pair_generator(self, batch_index: int) -> Iterator[SpectrumPair]:
        batch_size = self.settings["batch_size"]
        # Go through all indexes
        indexes = self.indexes[batch_index * batch_size:(batch_index + 1) * batch_size]
-
        for index in indexes:
            inchikey1 = self.reference_scores_df.index[index]
            # Randomly pick the desired target score range and pick matching inchikey
@@ -495,14 +492,12 @@ def _spectrum_pair_generator(self, batch_index: int) -> Iterator[SpectrumPair]:
            spectrum1 = self._get_spectrum_with_inchikey(inchikey1)
            spectrum2 = self._get_spectrum_with_inchikey(inchikey2)
            yield SpectrumPair(spectrum1, spectrum2)
-
    @ staticmethod
    def _data_selection(reference_scores_df, selected_inchikeys):
        """
        Select labeled data to generate from based on `selected_inchikeys`
        """
        return reference_scores_df.loc[selected_inchikeys, selected_inchikeys]
-
    def on_epoch_end(self):
        """Updates indexes after each epoch"""
        self.indexes = np.tile(np.arange(len(self.reference_scores_df)), int(self.settings["num_turns"]))
@@ -514,24 +509,20 @@ class Container:
"""
Helper class for DataGenerator
"""

def __init__(self, spectrum_pair, tanimoto_score, dim, _data_augmentation, additional_inputs=None):
self.spectrum_left = spectrum_pair[0]
self.spectrum_right = spectrum_pair[1]
self.spectrum_values_left = np.zeros((dim, ))
self.spectrum_values_right = np.zeros((dim, ))
self.idx_left, self.values_left = _data_augmentation(self.spectrum_left.binned_peaks)
self.idx_right, self.values_right = _data_augmentation(self.spectrum_right.binned_peaks)

self.spectrum_values_left[self.idx_left] = self.values_left
self.spectrum_values_right[self.idx_right] = self.values_right

self.additional_inputs_left = []
self.additional_inputs_right = []
for additional_input in additional_inputs:
self.additional_inputs_left.append([float(self.spectrum_left.get(additional_input))])
self.additional_inputs_right.append([float(self.spectrum_right.get(additional_input))])

self.tanimoto_score = tanimoto_score
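
The loop above is where the new metadata enters a batch: every field listed in `additional_inputs` is read from the spectrum metadata and stored as a one-element float list per side. A sketch of the batch layout the generator can then feed to the Siamese network; batch size, dimensions, and the example field are assumptions, while the four input names match the head model below:

import numpy as np

batch_size, binned_dim, n_extra = 32, 10000, 2  # e.g. two fields such as "precursor_mz"
batch = {
    "input_a": np.zeros((batch_size, binned_dim)),  # left binned spectra
    "input_a_2": np.zeros((batch_size, n_extra)),   # left additional metadata
    "input_b": np.zeros((batch_size, binned_dim)),  # right binned spectra
    "input_b_2": np.zeros((batch_size, n_extra)),   # right additional metadata
}
targets = np.zeros((batch_size,))  # Tanimoto scores (assumed target layout)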


56 changes: 20 additions & 36 deletions ms2deepscore/models/SiameseModel.py
@@ -83,18 +83,10 @@ def __init__(self,

        if keras_model is None:
            # Create base model
-            self.base = self.get_base_model(input_dim=self.input_dim,
-                                            base_dims=base_dims,
-                                            embedding_dim=embedding_dim,
-                                            dropout_rate=dropout_rate,
-                                            dropout_in_first_layer=dropout_in_first_layer,
-                                            l1_reg=l1_reg,
-                                            l2_reg=l2_reg,
-                                            additional_input=self.nr_of_additional_inputs)
+            self.base = self.get_base_model(base_dims=base_dims, embedding_dim=embedding_dim, dropout_rate=dropout_rate,
+                                            dropout_in_first_layer=dropout_in_first_layer, l1_reg=l1_reg, l2_reg=l2_reg)
            # Create head model
-            self.model = self._get_head_model(input_dim=self.input_dim,
-                                              additional_input=self.nr_of_additional_inputs,
-                                              base_model=self.base)
+            self.model = self._get_head_model()
        else:
            self._construct_from_keras_model(keras_model)

@@ -113,22 +105,18 @@ def save(self, filename: Union[str, Path]):
            f.attrs['spectrum_binner'] = self.spectrum_binner.to_json()
            f.attrs['additional_input'] = self.nr_of_additional_inputs

-    @staticmethod
-    def get_base_model(input_dim: int,
+    def get_base_model(self,
                        base_dims: Tuple[int, ...] = (600, 500, 500),
                        embedding_dim: int = 400,
                        dropout_rate: float = 0.25,
                        dropout_in_first_layer: bool = False,
                        l1_reg: float = 1e-6,
                        l2_reg: float = 1e-6,
-                       dropout_always_on: bool = False,
-                       additional_input=0) -> keras.Model:
+                       dropout_always_on: bool = False) -> keras.Model:
        """Create base model for Siamese network.
        Parameters
        ----------
-        input_dim : int
-            Dimension of the input vectors.
        base_dims
            Tuple of integers depicting the dimensions of the desired hidden
            layers of the base model
@@ -147,15 +135,13 @@ def get_base_model(input_dim: int,
            model training, but switched off during inference. When set to True,
            dropout layers will always be on, which is used for ensembling via
            Monte Carlo dropout.
-        additional_input
-            Default is 0, shape of additional inputs
        """
        # pylint: disable=too-many-arguments, disable=too-many-locals

        dropout_starting_layer = 0 if dropout_in_first_layer else 1
-        base_input = Input(shape=input_dim, name='base_input')
-        if (additional_input > 0):
-            side_input = Input(shape=additional_input, name="additional_input")
+        base_input = Input(shape=self.input_dim, name='base_input')
+        if self.nr_of_additional_inputs > 0:
+            side_input = Input(shape=self.nr_of_additional_inputs, name="additional_input")
            model_input = concatenate([base_input, side_input], axis=1)
        else:
            model_input = base_input
@@ -174,28 +160,26 @@ def get_base_model(input_dim: int,
            model_layer = Dropout(dropout_rate, name='dropout'+str(i+1))(model_layer)

        embedding = Dense(embedding_dim, activation='relu', name='embedding')(model_layer)
-        if additional_input > 0:
+        if self.nr_of_additional_inputs > 0:
            return keras.Model(inputs=[base_input, side_input], outputs=[embedding], name='base')

        return keras.Model(inputs=[base_input], outputs=[embedding], name='base')

-    @staticmethod
-    def _get_head_model(input_dim: int,
-                        additional_input: int,
-                        base_model: keras.Model):
-        input_a = Input(shape=input_dim, name="input_a")
-        input_b = Input(shape=input_dim, name="input_b")
+    def _get_head_model(self):

-        if additional_input > 0:
-            input_a_2 = Input(shape=additional_input, name="input_a_2")
-            input_b_2 = Input(shape=additional_input, name="input_b_2")
+        input_a = Input(shape=self.input_dim, name="input_a")
+        input_b = Input(shape=self.input_dim, name="input_b")
+
+        if self.nr_of_additional_inputs > 0:
+            input_a_2 = Input(shape=self.nr_of_additional_inputs, name="input_a_2")
+            input_b_2 = Input(shape=self.nr_of_additional_inputs, name="input_b_2")
            inputs = [input_a, input_a_2, input_b, input_b_2]

-            embedding_a = base_model([input_a, input_a_2])
-            embedding_b = base_model([input_b, input_b_2])
+            embedding_a = self.base([input_a, input_a_2])
+            embedding_b = self.base([input_b, input_b_2])
        else:
-            embedding_a = base_model(input_a)
-            embedding_b = base_model(input_b)
+            embedding_a = self.base(input_a)
+            embedding_b = self.base(input_b)
            inputs = [input_a, input_b]

        cosine_similarity = keras.layers.Dot(axes=(1, 1),
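Taken together: the base tower now concatenates the binned spectrum with the metadata inputs before the first dense layer, and the head runs two weight-sharing copies of that tower into a cosine score. A minimal sketch of the same wiring with illustrative dimensions; `normalize=True` on the Dot layer is an assumption, since the original line is truncated in this view:

from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, concatenate

input_dim, n_extra, embedding_dim = 10000, 2, 400  # illustrative sizes

# Base tower: binned spectrum + additional metadata -> embedding.
base_input = Input(shape=(input_dim,), name="base_input")
side_input = Input(shape=(n_extra,), name="additional_input")
x = concatenate([base_input, side_input], axis=1)
x = Dense(600, activation="relu")(x)  # first entry of base_dims
embedding = Dense(embedding_dim, activation="relu", name="embedding")(x)
base = keras.Model([base_input, side_input], embedding, name="base")

# Siamese head: two inputs through the same tower, compared by cosine similarity.
input_a = Input(shape=(input_dim,), name="input_a")
input_a_2 = Input(shape=(n_extra,), name="input_a_2")
input_b = Input(shape=(input_dim,), name="input_b")
input_b_2 = Input(shape=(n_extra,), name="input_b_2")
embedding_a = base([input_a, input_a_2])  # shared weights
embedding_b = base([input_b, input_b_2])
cosine = keras.layers.Dot(axes=(1, 1), normalize=True)([embedding_a, embedding_b])
model = keras.Model([input_a, input_a_2, input_b, input_b_2], cosine)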
67 changes: 63 additions & 4 deletions ms2deepscore/plotting.py
@@ -56,15 +56,18 @@ def plot_histograms(histograms,
    alpha = 1.0 #0.5

    # Create plot
-    plt.figure(figsize=(10,10))
+    plt.figure(figsize=(10, 10))

    # Loop over each bin.
    for i in range(0, len(histograms)):
        data = histograms[len(histograms)-i-1][0]
-        data = data/max(data)
-        plt.fill_between(histograms[0][1][:100], -shift*i, [(-shift*i + x) for x in data], color=cmap1(i/10), alpha=alpha)
+        # Normalize the data to have the same area under the curve
+        data = data/sum(data)*len(data)/4
+        plt.fill_between(histograms[0][1][:len(data)], -shift*i, [(-shift*i + x) for x in data], color=cmap1(i/10), alpha=alpha)
        if i > 0:
-            plt.plot(histograms[0][1][:100], [(-shift*i + x) for x in data], color="white")
+            plt.plot(histograms[0][1][:len(data)], [(-shift*i + x) for x in data], color="white")
+        if bin_content:
+            # Writes down the number of pairs per bin
+            plt.text(0.01, -shift*i+shift/6, f"{bin_content[::-1][i]} pairs")#, color="white")

    plt.xticks(fontsize=14)
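The new scaling normalizes each histogram's area rather than its peak, so curves with very different pair counts stay comparable. A small numeric illustration of the two scalings:

import numpy as np

data = np.array([1.0, 4.0, 2.0, 1.0])
peak_scaled = data / max(data)                  # old: maximum forced to 1.0
area_scaled = data / sum(data) * len(data) / 4  # new: values sum to len(data) / 4
assert np.isclose(area_scaled.sum(), 1.0)       # here len(data) == 4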
@@ -219,3 +222,59 @@ def derive_scatter_data(reference_scores,
                                               (bins_y[j] + bins_y[j+1])/2,
                                               idx[0].shape[0]))
    return confusion_like_matrix, confusion_like_matrix_scatter
+
+
+def tanimoto_dependent_losses(scores, scores_ref, ref_score_bins):
+    """Compute errors (RMSE and MAE) for different bins of the reference scores (scores_ref).
+    Parameters
+    ----------
+    scores
+        Scores that should be evaluated
+    scores_ref
+        Reference scores (= ground truth).
+    ref_score_bins
+        Bins for the reference score to evaluate the performance of scores.
+    """
+    bin_content = []
+    rmses = []
+    maes = []
+    bounds = []
+    ref_scores_bins_inclusive = ref_score_bins.copy()
+    ref_scores_bins_inclusive[0] = -np.inf
+    ref_scores_bins_inclusive[-1] = np.inf
+    for i in range(len(ref_scores_bins_inclusive) - 1):
+        low = ref_scores_bins_inclusive[i]
+        high = ref_scores_bins_inclusive[i + 1]
+        bounds.append((low, high))
+        idx = np.where((scores_ref >= low) & (scores_ref < high))
+        bin_content.append(idx[0].shape[0])
+        maes.append(np.abs(scores_ref[idx] - scores[idx]).mean())
+        rmses.append(np.sqrt(np.square(scores_ref[idx] - scores[idx]).mean()))
+    return bin_content, bounds, rmses, maes


+def plot_rmse_per_bin(predicted_scores, true_scores):
+    ref_score_bins = np.linspace(0, 1.0, 11)
+    bin_content, bounds, rmses, _ = tanimoto_dependent_losses(
+        predicted_scores,
+        true_scores,
+        ref_score_bins)
+
+    _, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(4, 5), dpi=120)
+
+    ax1.plot(np.arange(len(rmses)), rmses, "o:", color="crimson")
+    ax1.set_title('RMSE')
+    ax1.set_ylabel("RMSE")
+    ax1.grid(True)
+
+    ax2.plot(np.arange(len(rmses)), bin_content, "o:", color="teal")
+    ax2.set_title('# of spectrum pairs')
+    ax2.set_ylabel("# of spectrum pairs")
+    ax2.set_xlabel("Tanimoto score bin")
+    plt.yscale('log')
+    plt.xticks(np.arange(len(rmses)),
+               [f"{a:.1f} to < {b:.1f}" for (a, b) in bounds], fontsize=9, rotation='vertical')
+    ax2.grid(True)
+    plt.tight_layout()
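
A hedged usage sketch of the two new helpers, using synthetic stand-in scores (real inputs would be model predictions and ground-truth Tanimoto scores):

import numpy as np
import matplotlib.pyplot as plt
from ms2deepscore.plotting import plot_rmse_per_bin

rng = np.random.default_rng(0)
true_scores = rng.uniform(0, 1, 5000)
predicted_scores = np.clip(true_scores + rng.normal(0, 0.1, 5000), 0, 1)

plot_rmse_per_bin(predicted_scores, true_scores)  # RMSE and pair count per Tanimoto bin
plt.show()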
