Skip to content

Commit 517d848

Browse files
Merge pull request #98 from zdk123/dupe-inchikeys
Report duplicate Inchikeys
2 parents 965e680 + 857be4a commit 517d848

File tree

4 files changed

+54
-3
lines changed

4 files changed

+54
-3
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
9+
## [0.2.3] - 2022-03-02
10+
11+
## Fixed
12+
13+
- Fixes issue [#97](https://github.com/matchms/ms2deepscore/pull/97) by raising a `ValueError` when the user-supplied `reference_scores_df` DataFrame contains duplicate InChIKey14 values.
14+
815
## [Unreleased]
916

1017
## Changed

ms2deepscore/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.2.2'
1+
__version__ = '0.2.3'

ms2deepscore/data_generators.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,17 @@ def __init__(self, binned_spectrums: List[BinnedSpectrumType],
8585
def _collect_and_validate_inchikeys(self):
8686
"""Collect all inchikeys14 (first 14 characters) of all binned_spectrums
8787
and check if all are present in the reference scores as well.
88+
Check for duplicate inchikeys.
8889
"""
8990
self.spectrum_inchikeys = np.array([s.get("inchikey")[:14] for s in self.binned_spectrums])
9091
for inchikey in np.unique(self.spectrum_inchikeys):
9192
assert inchikey in self.reference_scores_df.index, \
9293
"InChIKey in given spectrum not found in reference scores"
94+
inchikeys = self.reference_scores_df.index
95+
if len(set(inchikeys)) != len(inchikeys):
96+
raise ValueError("Duplicate InChIKeys-14 detected in reference_scores_df: %s" % list(inchikeys[inchikeys.duplicated()]))
97+
98+
9399

94100
@staticmethod
95101
def _validate_labels(reference_scores_df: pd.DataFrame):
@@ -316,7 +322,7 @@ def __init__(self, binned_spectrums: List[BinnedSpectrumType],
316322
by inchikeys. Columns and index should be inchikeys, the value in a row x column
317323
depicting the similarity score for that pair. Must be symmetric
318324
(reference_scores_df[i,j] == reference_scores_df[j,i]) and column names should be
319-
identical to the index.
325+
identical to the index and unique.
320326
dim
321327
Input vector dimension.
322328
As part of **settings, defaults for the following parameters can be set:
@@ -460,7 +466,7 @@ def __init__(self, binned_spectrums: List[BinnedSpectrumType], selected_inchikey
460466
def __len__(self):
461467
"""Denotes the number of batches per epoch
462468
NB1: self.reference_scores_df only contains 'selected' inchikeys, see `self._data_selection`.
463-
NB2: We don't see all data every epoch, because the last half-empty batch is omitted.
469+
NB2: We don't see all data every epoch, because the last half-empty batch is omitted.
464470
This is expected behavior, with the shuffling this is OK.
465471
"""
466472
return int(self.settings["num_turns"]) * int(np.floor(len(self.reference_scores_df) / self.settings[
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import os
2+
from pathlib import Path
3+
4+
from ms2deepscore import SpectrumBinner
5+
from ms2deepscore.data_generators import DataGeneratorAllSpectrums, DataGeneratorAllInchikeys
6+
from tests.test_user_worfklow import load_processed_spectrums, get_reference_scores
7+
8+
import pytest
9+
10+
11+
def test_error_duplicate_inchikeys():
12+
"""Test an expected error when duplicate inchikeys are given to DataGenerator"""
13+
## Get test data ##
14+
spectrums = load_processed_spectrums()
15+
tanimoto_scores_df = get_reference_scores()
16+
17+
## Create duplicate inchikeys ##
18+
sel = list(range(30)) + list(range(30))
19+
tanimoto_scores_df = tanimoto_scores_df.iloc[sel, sel]
20+
selected_inchikeys = tanimoto_scores_df.index[:60].unique()
21+
22+
## Subset spectra to selected inchikeys and bin
23+
spectrums = [s for s in spectrums if s.get("inchikey")[:14] in selected_inchikeys]
24+
spectrum_binner = SpectrumBinner(400, mz_min=10.0, mz_max=500.0, peak_scaling=0.5)
25+
binned_spectrums = spectrum_binner.fit_transform(spectrums)
26+
dimension = len(spectrum_binner.known_bins)
27+
28+
## Setup DataGenerator
29+
with pytest.raises(ValueError):
30+
DataGeneratorAllInchikeys(binned_spectrums=binned_spectrums,
31+
selected_inchikeys=selected_inchikeys,
32+
reference_scores_df=tanimoto_scores_df,
33+
dim=dimension)
34+
35+
with pytest.raises(ValueError):
36+
DataGeneratorAllSpectrums(binned_spectrums=binned_spectrums,
37+
reference_scores_df=tanimoto_scores_df,
38+
dim=dimension)

0 commit comments

Comments
 (0)