Skip to content

Commit

Permalink
Merge pull request #15 from matchms/update_to_matchms018
Browse files Browse the repository at this point in the history
updates to recent matchms changes
  • Loading branch information
florian-huber authored Feb 3, 2023
2 parents c11690c + 0cd6c37 commit 56b3057
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 260 deletions.
13 changes: 11 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.4.0] - 2023-02-03

### Changed

- removed network creation functions which have meanwhile been incorporated in matchms
- switch default id fieldname from "spectrumid" to "spectrum_id" (matchms > 0.14.0)
- adapt test to new Scores object design (matchms > 0.18.0)

## [0.3.0] - 2021-12-19

### Added
Expand Down Expand Up @@ -55,8 +63,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- This is the initial version mostly taken from https://github.com/iomega/spec2vec_gnps_data_analysis

[Unreleased]: https://github.com/matchms/matchmsextras/compare/0.3.0...HEAD
[0.2.3]: https://github.com/matchms/matchmsextras/compare/0.2.3...0.3.0
[Unreleased]: https://github.com/matchms/matchmsextras/compare/0.4.0...HEAD
[0.4.0]: https://github.com/matchms/matchmsextras/compare/0.3.0...0.4.0
[0.3.0]: https://github.com/matchms/matchmsextras/compare/0.2.3...0.3.0
[0.2.3]: https://github.com/matchms/matchmsextras/compare/0.2.2...0.2.3
[0.2.2]: https://github.com/matchms/matchmsextras/compare/0.2.1...0.2.2
[0.2.1]: https://github.com/matchms/matchmsextras/compare/0.2.0...0.2.1
Expand Down
2 changes: 1 addition & 1 deletion matchmsextras/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.3.0'
__version__ = '0.4.0'
113 changes: 1 addition & 112 deletions matchmsextras/networking.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,123 +11,12 @@
from matplotlib import pyplot as plt
import matplotlib
from matchms import Spectrum, Scores
from matchms.networking.networking_functions import get_top_hits

# ----------------------------------------------------------------------------
# ---------------- Graph / networking related functions ----------------------
# ----------------------------------------------------------------------------

def get_top_hits(scores, identifier: str = "spectrum_id",
                 top_n: int = 25, search_by: str = "queries") -> Tuple[dict, dict]:
    """Get top_n highest scores (and indices) for every entry.

    Args:
    --------
    scores
        Matchms Scores object containing all Spec2Vec similarities. Only
        ``scores.queries``, ``scores.references`` and ``scores.scores`` are
        used; ``scores.scores`` is indexed as ``[reference, query]``.
    identifier
        Unique identifier for each spectrum in scores. Will also be used for
        node names.
    top_n
        Return the indexes and scores for the top_n highest scores. Values
        below 2 are raised to 2 (a notice is printed).
    search_by
        Chose between 'queries' or 'references' which decides if the top_n matches
        for every spectrum in scores.queries or in scores.references will be
        collected and returned

    Returns:
    --------
    Two dictionaries keyed by the spectrum identifier: the first maps to the
    indices of the best matches (sorted by descending score), the second to
    the corresponding scores.

    Raises:
    --------
    AssertionError
        If search_by is neither 'queries' nor 'references'.
    """
    if search_by not in ("queries", "references"):
        # Raised explicitly instead of using `assert`, so the check is not
        # stripped when Python runs with -O. The exception type is unchanged.
        raise AssertionError("search_by must be 'queries' or 'references'")
    if top_n < 2:
        top_n = 2
        print("Set top_n to minimum value of 2")

    similars_idx = {}
    similars_scores = {}
    score_matrix = scores.scores

    if search_by == "queries":
        # One column per query: rank all references for that query.
        for i, spec in enumerate(scores.queries):
            spec_id = spec.get(identifier)
            best = score_matrix[:, i].argsort()[::-1][:top_n]
            similars_idx[spec_id] = best
            similars_scores[spec_id] = score_matrix[best, i]
    else:
        # One row per reference: rank all queries for that reference.
        for i, spec in enumerate(scores.references):
            spec_id = spec.get(identifier)
            best = score_matrix[i, :].argsort()[::-1][:top_n]
            similars_idx[spec_id] = best
            similars_scores[spec_id] = score_matrix[i, best]
    return similars_idx, similars_scores


def create_network(scores: Scores,
                   identifier: str = "spectrum_id",
                   top_n: int = 20,
                   max_links: int = 10,
                   cutoff: float = 0.7,
                   link_method: str = 'single') -> nx.Graph:
    """
    Function to create network from given top-n similarity values. Expects that
    similarities given in scores are from an all-vs-all comparison including all
    possible pairs.

    Args:
    --------
    scores
        Matchms Scores object containing all Spec2Vec similarities.
    identifier
        Unique identifier for each spectrum in scores. Will also be used for
        node names.
    top_n
        Consider edge between spectrumA and spectrumB if score falls into
        top_n for spectrumA or spectrumB (link_method="single"), or into
        top_n for spectrumA and spectrumB (link_method="mutual"). From those
        potential links, only max_links will be kept, so top_n must be >= max_links.
    max_links
        Maximum number of links to add per node. Default = 10.
        Due to incoming links, total number of links per node can be higher.
    cutoff
        Threshold for given similarities. Edges/Links will only be made for
        similarities >= cutoff. Default = 0.7.
    link_method
        Chose between 'single' and 'mutual'. 'single will add all links based
        on individual nodes. 'mutual' will only add links if that link appears
        in the given top-n list for both nodes.

    Raises:
    --------
    AssertionError
        If top_n < max_links or if queries and references differ.
    ValueError
        If link_method is neither 'single' nor 'mutual'.
    """
    # Explicit raises (same exception type as the former asserts) so that the
    # validation is not stripped when Python runs with -O.
    if top_n < max_links:
        raise AssertionError("top_n must be >= max_links")
    if not np.all(scores.queries == scores.references):
        raise AssertionError("Expected symmetric scores object with queries==references")
    if link_method not in ("single", "mutual"):
        raise ValueError("Link method not kown")

    # dict.fromkeys removes duplicates while keeping a deterministic node order.
    unique_ids = list(dict.fromkeys(s.get(identifier) for s in scores.queries))

    # Initialize network graph, add nodes
    msnet = nx.Graph()
    msnet.add_nodes_from(unique_ids)

    # Collect location and score of highest scoring candidates for every query.
    # The identifier must be forwarded; otherwise the returned dictionaries are
    # keyed by the default field and the lookups below fail for custom identifiers.
    similars_idx, similars_scores = get_top_hits(scores, identifier=identifier,
                                                 top_n=top_n,
                                                 search_by="queries")

    # Add edges based on global threshold (cutoff) for weights
    for i, spec in enumerate(scores.queries):
        query_id = spec.get(identifier)
        query_scores = similars_scores[query_id]

        ref_candidates = np.array([scores.references[x].get(identifier)
                                   for x in similars_idx[query_id]])
        # Keep candidates above the cutoff, drop self-links, cap at max_links.
        keep = (query_scores >= cutoff) & (ref_candidates != query_id)
        idx = np.where(keep)[0][:max_links]
        if link_method == "mutual":
            # Only keep links where this query is also a top hit of the candidate.
            idx = [x for x in idx if i in similars_idx[ref_candidates[x]]]

        new_edges = [(query_id, str(ref_candidates[x]), float(query_scores[x]))
                     for x in idx]
        msnet.add_weighted_edges_from(new_edges)

    return msnet


def create_network_asymmetric(scores: Scores,
identifier: str = "spectrum_id",
add_links_from_queries: bool = True,
Expand Down
2 changes: 1 addition & 1 deletion matchmsextras/pubchem_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
import pubchempy as pcp
import numpy as np
from matchms.utils import is_valid_inchikey
from matchms.metadata_utils import is_valid_inchikey


logger = logging.getLogger("matchms")
Expand Down
159 changes: 15 additions & 144 deletions tests/test_networking.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pytest
import numpy as np
from matchms import Spectrum, calculate_scores
from matchms.networking import SimilarityNetwork
from matchms.similarity import FingerprintSimilarity
from matchmsextras.networking import create_network, create_network_asymmetric
from matchmsextras.networking import get_top_hits
from matchmsextras.networking import create_network_asymmetric
from matchmsextras.networking import dilate_cluster
from matchmsextras.networking import extract_networking_metadata

Expand Down Expand Up @@ -50,61 +50,14 @@ def create_dummy_scores_symmetric():
return scores


def test_get_top_hits_by_references():
    """Check scores and indices of the top hits collected per reference spectrum."""
    expected_scores_ref = {'ref_spec_0': np.array([0.66666667, 0.5 , 0. ]),
                           'ref_spec_1': np.array([0.66666667, 0.5 , 0. ]),
                           'ref_spec_2': np.array([0.66666667, 0.66666667, 0.5 ]),
                           'ref_spec_3': np.array([0.8, 0.5, 0.5]),
                           'ref_spec_4': np.array([1. , 0.8, 0.5])}
    expected_idx_ref = {'ref_spec_0': np.array([0, 2, 1], dtype=np.int64),
                        'ref_spec_1': np.array([1, 2, 0], dtype=np.int64),
                        'ref_spec_2': np.array([1, 0, 2], dtype=np.int64),
                        'ref_spec_3': np.array([2, 1, 0], dtype=np.int64),
                        'ref_spec_4': np.array([0, 2, 1], dtype=np.int64)}
    scores = create_dummy_scores()

    # First pass: top_n=10 exceeds the three hits expected per reference.
    idx_ref, scores_ref = get_top_hits(scores, top_n=10, search_by="references")
    for key, found_scores in scores_ref.items():
        assert np.allclose(found_scores, expected_scores_ref[key], atol=1e-5), \
            "Expected different selected scores"
    for key, found_idx in idx_ref.items():
        assert np.allclose(found_idx, expected_idx_ref[key], atol=1e-5), \
            "Expected different selected indices"

    # Test lower top_n
    idx_ref, scores_ref = get_top_hits(scores, top_n=2, search_by="references")
    for key, found_scores in scores_ref.items():
        assert np.allclose(found_scores, expected_scores_ref[key][:2], atol=1e-5), \
            "Expected different selected scores"
    for key, found_idx in idx_ref.items():
        assert np.allclose(found_idx, expected_idx_ref[key][:2], atol=1e-5), \
            "Expected different selected indices"

def test_get_top_hits_by_queries():
    """Check scores and indices of the top hits collected per query spectrum."""
    expected_scores_query = {'query_spec_0': np.array([1. , 0.66666667, 0.66666667, 0.5 , 0. ]),
                             'query_spec_1': np.array([0.66666667, 0.66666667, 0.5 , 0.5 , 0. ]),
                             'query_spec_2': np.array([0.8, 0.8, 0.5, 0.5, 0.5])}
    expected_idx_query = {'query_spec_0': np.array([4, 2, 0, 3, 1], dtype=np.int64),
                          'query_spec_1': np.array([2, 1, 4, 3, 0], dtype=np.int64),
                          'query_spec_2': np.array([4, 3, 2, 1, 0], dtype=np.int64)}
    scores = create_dummy_scores()

    # First pass: top_n=10 exceeds the five hits expected per query.
    idx_query, scores_query = get_top_hits(scores, top_n=10, search_by="queries")
    for key, found_scores in scores_query.items():
        assert np.allclose(found_scores, expected_scores_query[key], atol=1e-5), \
            "Expected different selected scores"
    for key, found_idx in idx_query.items():
        assert np.allclose(found_idx, expected_idx_query[key], atol=1e-5), \
            "Expected different selected indices"

    # Test lower top_n
    idx_query, scores_query = get_top_hits(scores, top_n=2, search_by="queries")
    for key, found_scores in scores_query.items():
        assert np.allclose(found_scores, expected_scores_query[key][:2], atol=1e-5), \
            "Expected different selected scores"
    for key, found_idx in idx_query.items():
        assert np.allclose(found_idx, expected_idx_query[key][:2], atol=1e-5), \
            "Expected different selected indices"
@pytest.fixture()
def network_symmetric():
    """Fixture: graph built by SimilarityNetwork from the symmetric dummy scores.

    Uses a score cutoff of 0.7 and returns the ``graph`` attribute of the
    network object after ``create_network`` has been called on the scores.
    """
    symmetric_scores = create_dummy_scores_symmetric()
    similarity_network = SimilarityNetwork(score_cutoff=0.7)
    similarity_network.create_network(symmetric_scores)
    return similarity_network.graph


def test_create_network_asymmetric():
Expand All @@ -127,7 +80,7 @@ def test_create_network_asymmetric():
'ref_spec_3',
'ref_spec_4',
'ref_spec_4']
assert len(edges_list) == np.sum(scores.scores > cutoff), \
assert len(edges_list) == np.sum(scores.scores.to_array() > cutoff), \
"Expected different number of edges"
assert np.all([(x[0] in nodes_with_edges) for x in edges_list]), "Expected different edges in graph"
assert np.all([(x[1] in nodes_with_edges) for x in edges_list]), "Expected different edges in graph"
Expand All @@ -137,96 +90,14 @@ def test_create_network_asymmetric():
"Expected different edge weight"


def test_create_network_symmetric_wrong_input():
    """Test if function is used with non-symmetric scores object"""
    scores = create_dummy_scores()
    with pytest.raises(AssertionError) as msg:
        _ = create_network(scores)

    expected_msg = "Expected symmetric scores object with queries==references"
    # Compare against the raised exception itself; str() of the ExceptionInfo
    # wrapper is a repr-style string and not a stable place to look for the text.
    assert expected_msg in str(msg.value), "Expected different exception"


def test_create_network_symmetric():
    """Build a graph from symmetric scores and check edge count and isolated nodes."""
    msnet = create_network(create_dummy_scores_symmetric(), cutoff=0.7)

    edges_list = sorted(msnet.edges())
    # These reference spectra are expected to stay unconnected at this cutoff.
    nodes_without_edges = ('ref_spec_0', 'ref_spec_1', 'ref_spec_2')

    assert len(edges_list) == 5, "Expected different number of edges"
    assert all(edge[0] not in nodes_without_edges for edge in edges_list), \
        "Expected this node to have no edges"
    assert all(edge[1] not in nodes_without_edges for edge in edges_list), \
        "Expected this node to have no edges"


def test_create_network_symmetric_higher_cutoff():
    """With cutoff 0.9 only the query_spec_0 / ref_spec_4 link is expected."""
    msnet = create_network(create_dummy_scores_symmetric(), cutoff=0.9)

    edges_list = sorted(msnet.edges())
    linked_nodes = ['query_spec_0', 'ref_spec_4']

    assert len(edges_list) == 1, "Expected only one link"
    first_edge = edges_list[0]
    assert first_edge[0] in linked_nodes, \
        "Expected different node to have a link"
    assert first_edge[1] in linked_nodes, \
        "Expected different node to have a link"


def test_create_network_symmetric_mutual_method():
    """Test creating a graph from a Scores object with link_method="mutual"."""
    cutoff = 0.7
    scores = create_dummy_scores_symmetric()
    # Change some scores (symmetrically) before building the network.
    scores._scores[7, 6] = scores._scores[6, 7] = 0.85
    scores._scores[7, 5] = scores._scores[5, 7] = 0.75
    scores._scores[7, 3] = scores._scores[3, 7] = 0.7

    msnet = create_network(scores, cutoff=cutoff, top_n=3,
                           max_links=3, link_method="mutual")
    nodes_with_edges = ['query_spec_0', 'query_spec_1', 'query_spec_2', 'ref_spec_4']
    edges_list = sorted(msnet.edges())
    # The message previously said "four" although three edges are asserted.
    assert len(edges_list) == 3, "Expected only three links"
    assert all(x[0] in nodes_with_edges for x in edges_list), "Expected different edges in graph"
    assert all(x[1] in nodes_with_edges for x in edges_list), "Expected different edges in graph"


def test_create_network_symmetric_max_links_1():
    """Build a graph with max_links=1 (single linking) and check the resulting edges."""
    msnet = create_network(create_dummy_scores_symmetric(), cutoff=0.7,
                           max_links=1, link_method="single")

    edges_list = sorted(msnet.edges())
    # These reference spectra are expected to stay unconnected.
    nodes_without_edges = ('ref_spec_0', 'ref_spec_1', 'ref_spec_2')

    assert len(edges_list) == 3, "Expected different number of edges"
    assert all(edge[0] not in nodes_without_edges for edge in edges_list), \
        "Expected this node to have no edges"
    assert all(edge[1] not in nodes_without_edges for edge in edges_list), \
        "Expected this node to have no edges"


def test_dilate_cluster():
# Create graph
cutoff=0.7
def test_dilate_cluster(network_symmetric):
scores = create_dummy_scores_symmetric()
msnet = create_network(scores, cutoff=cutoff, top_n=3, max_links=3)
assert len(msnet.edges()) == 5, \
assert len(network_symmetric.edges()) == 5, \
"Expected different number of edges before dilating"

# Dilation step
msnet_dilated, links_added = dilate_cluster(msnet, scores)
assert len(msnet.edges()) == 12, \
msnet_dilated, _ = dilate_cluster(network_symmetric, scores)
assert len(msnet_dilated.edges()) == 12, \
"Expected different number of edges after dilating"


Expand Down

0 comments on commit 56b3057

Please sign in to comment.