Merge pull request #98 from zdk123/dupe-inchikeys

svenvanderburg · web-flow · commit 517d8482a4a8 · 2022-03-09T08:43:59.000+01:00
Report duplicate Inchikeys
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+
+## [0.2.3] - 2022-03-02
+
+## Fixed
+
+- Fixes issue [#97](https://github.com/matchms/ms2deepscore/pull/97) by raising a ValueError when the user-supplied `reference_scores_df` DataFrame contains duplicate InChIKey14 entries.
+
 ## [Unreleased]
 
 ## Changed
diff --git a/ms2deepscore/__version__.py b/ms2deepscore/__version__.py
@@ -1 +1 @@
-__version__ = '0.2.2'
+__version__ = '0.2.3'
diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py
@@ -85,11 +85,17 @@ def __init__(self, binned_spectrums: List[BinnedSpectrumType],
     def _collect_and_validate_inchikeys(self):
         """Collect all inchikeys14 (first 14 characters) of all binned_spectrums
         and check if all are present in the reference scores as well.
+        Raise a ValueError if the reference_scores_df index holds duplicate inchikeys.
         """
         self.spectrum_inchikeys = np.array([s.get("inchikey")[:14] for s in self.binned_spectrums])
         for inchikey in np.unique(self.spectrum_inchikeys):
             assert inchikey in self.reference_scores_df.index, \
                 "InChIKey in given spectrum not found in reference scores"
+        inchikeys = self.reference_scores_df.index
+        if not inchikeys.is_unique:
+            raise ValueError("Duplicate InChIKeys-14 detected in reference_scores_df: %s" % list(inchikeys[inchikeys.duplicated()].unique()))
+
+
 
     @staticmethod
     def _validate_labels(reference_scores_df: pd.DataFrame):
@@ -316,7 +322,7 @@ def __init__(self, binned_spectrums: List[BinnedSpectrumType],
             by inchikeys. Columns and index should be inchikeys, the value in a row x column
             depicting the similarity score for that pair. Must be symmetric
             (reference_scores_df[i,j] == reference_scores_df[j,i]) and column names should be
-            identical to the index.
+            identical to the index and unique.
         dim
             Input vector dimension.
         As part of **settings, defaults for the following parameters can be set:
@@ -460,7 +466,7 @@ def __init__(self, binned_spectrums: List[BinnedSpectrumType], selected_inchikey
     def __len__(self):
         """Denotes the number of batches per epoch
         NB1: self.reference_scores_df only contains 'selected' inchikeys, see `self._data_selection`.
-        NB2: We don't see all data every epoch, because the last half-empty batch is omitted. 
+        NB2: We don't see all data every epoch, because the last half-empty batch is omitted.
         This is expected behavior, with the shuffling this is OK.
         """
         return int(self.settings["num_turns"]) * int(np.floor(len(self.reference_scores_df) / self.settings[
diff --git a/tests/test_models_duplicate_inchikeys.py b/tests/test_models_duplicate_inchikeys.py
@@ -0,0 +1,38 @@
+import os
+from pathlib import Path
+
+from ms2deepscore import SpectrumBinner
+from ms2deepscore.data_generators import DataGeneratorAllSpectrums, DataGeneratorAllInchikeys
+from tests.test_user_worfklow import load_processed_spectrums, get_reference_scores
+
+import pytest
+
+
+def test_error_duplicate_inchikeys():
+    """Both data generators must raise ValueError when given duplicated inchikeys."""
+    # Load the shared test data (spectra first, then the reference scores).
+    all_spectrums = load_processed_spectrums()
+    reference_scores = get_reference_scores()
+
+    # Take the first 30 rows/columns twice so each selected label is repeated.
+    doubled = 2 * list(range(30))
+    duplicated_scores = reference_scores.iloc[doubled, doubled]
+    unique_keys = duplicated_scores.index[:60].unique()
+
+    # Keep only spectra whose inchikey14 was selected, then bin them.
+    kept = [spec for spec in all_spectrums if spec.get("inchikey")[:14] in unique_keys]
+    binner = SpectrumBinner(400, mz_min=10.0, mz_max=500.0, peak_scaling=0.5)
+    binned = binner.fit_transform(kept)
+    n_bins = len(binner.known_bins)
+
+    # Arguments common to both generators; each construction is expected to fail.
+    shared_kwargs = dict(
+        binned_spectrums=binned,
+        reference_scores_df=duplicated_scores,
+        dim=n_bins,
+    )
+    with pytest.raises(ValueError):
+        DataGeneratorAllInchikeys(selected_inchikeys=unique_keys, **shared_kwargs)
+
+    with pytest.raises(ValueError):
+        DataGeneratorAllSpectrums(**shared_kwargs)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '0.2.2'`
	`1`	`+__version__ = '0.2.3'`