From d5cbe4d4d34a271eea84926897599f39c41c7be5 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Dec 2023 18:14:57 +0100 Subject: [PATCH 1/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 169612c..02ac69b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- Logo for SeMRA + Logo for SeMRA

# Semantic Mapping Reasoning Assembler (SeMRA) From 2151ec6c0fcd3cbd4579d1464b28f997a074f87e Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 10:23:04 +0100 Subject: [PATCH 2/5] Minor API refactoring --- src/semra/api.py | 41 +++++++++++++++++++++-------------------- tests/test_api.py | 28 ++++++++++++++-------------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/src/semra/api.py b/src/semra/api.py index 000c579..37ca9ad 100644 --- a/src/semra/api.py +++ b/src/semra/api.py @@ -268,20 +268,21 @@ def infer_mutations( for mapping in _tqdm(mappings, desc="Adding mutated predicates", progress=progress): rv.append(mapping) confidence = pairs.get((mapping.s.prefix, mapping.o.prefix)) - if confidence is not None and mapping.p == old: - inferred_mapping = Mapping( - s=mapping.s, - p=new, - o=mapping.o, - evidence=[ - ReasonedEvidence( - justification=KNOWLEDGE_MAPPING, - mappings=[mapping], - confidence_factor=confidence, - ) - ], - ) - rv.append(inferred_mapping) + if confidence is None or mapping.p != old: + continue + inferred_mapping = Mapping( + s=mapping.s, + p=new, + o=mapping.o, + evidence=[ + ReasonedEvidence( + justification=KNOWLEDGE_MAPPING, + mappings=[mapping], + confidence_factor=confidence, + ) + ], + ) + rv.append(inferred_mapping) return rv @@ -313,12 +314,12 @@ def keep_object_prefixes(mappings: Iterable[Mapping], prefixes: str | Iterable[s ] -def filter_prefixes(mappings: Iterable[Mapping], prefixes: Iterable[str]) -> list[Mapping]: +def filter_prefixes(mappings: Iterable[Mapping], prefixes: Iterable[str], *, progress: bool = True) -> list[Mapping]: """Filter out mappings whose subject or object are in the given list of prefixes.""" prefixes = set(prefixes) return [ mapping - for mapping in _tqdm(mappings, desc=f"Filtering out {len(prefixes)} prefixes") + for mapping in _tqdm(mappings, desc=f"Filtering out {len(prefixes)} prefixes", progress=progress) if mapping.s.prefix not in prefixes and mapping.o.prefix not in prefixes ] @@ -375,8 +376,8 @@ def project( mappings: list[Mapping], source_prefix: str, target_prefix: str, *, return_sus: bool = False, progress: bool = False ) -> list[Mapping] | tuple[list[Mapping], list[Mapping]]: """Ensure that each identifier only appears as the subject of one mapping.""" - mappings = keep_subject_prefixes(mappings, source_prefix) - mappings = keep_object_prefixes(mappings, target_prefix) + mappings = keep_subject_prefixes(mappings, source_prefix, progress=progress) + mappings = keep_object_prefixes(mappings, target_prefix, progress=progress) m2m_mappings = get_many_to_many(mappings) mappings = filter_mappings(mappings, m2m_mappings, progress=progress) mappings = assemble_evidences(mappings, progress=progress) @@ -459,11 +460,11 @@ def deduplicate_evidence(evidence: list[Evidence]) -> list[Evidence]: return list(d.values()) -def validate_mappings(mappings: list[Mapping]) -> None: +def validate_mappings(mappings: list[Mapping], *, progress: bool = True) -> None: """Validate mappings against the Bioregistry and raise an error on the first invalid.""" import bioregistry - for mapping in tqdm(mappings, desc="Validating mappings", unit_scale=True, unit="mapping"): + for mapping in tqdm(mappings, desc="Validating mappings", unit_scale=True, unit="mapping", disable=not progress): if bioregistry.normalize_prefix(mapping.s.prefix) != mapping.s.prefix: raise ValueError(f"invalid subject prefix.\n\nMapping: {mapping}\n\nSubject:{mapping.s}.") if bioregistry.normalize_prefix(mapping.o.prefix) != mapping.o.prefix: diff --git a/tests/test_api.py b/tests/test_api.py index 512c026..b19881d 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -31,9 +31,9 @@ def _exact(s, o, evidence: list[SimpleEvidence] | None = None) -> Mapping: EV = SimpleEvidence( justification=MANUAL_MAPPING, - mapping_set=MappingSet(name="test_mapping_set"), + mapping_set=MappingSet(name="test_mapping_set", confidence=0.95), ) -MS = MappingSet(name="test") +MS = MappingSet(name="test", confidence=0.95) class TestOperations(unittest.TestCase): @@ -87,7 +87,7 @@ def test_index(self): ) m1 = Mapping(s=r1, p=EXACT_MATCH, o=r2, evidence=[e1]) m2 = Mapping(s=r1, p=EXACT_MATCH, o=r2, evidence=[e2]) - index = get_index([m1, m2]) + index = get_index([m1, m2], progress=False) self.assertIn(m1.triple, index) self.assertEqual(1, len(index)) self.assertEqual(2, len(index[m1.triple])) @@ -130,10 +130,10 @@ def test_infer_exact_match(self): backwards_msg = "backwards inference is not supposed to be done here" - index = get_index(infer_chains([m1, m2], backwards=False)) + index = get_index(infer_chains([m1, m2], backwards=False, progress=False), progress=False) self.assertNotIn(m4_inv.triple, index, msg=backwards_msg) - index = get_index(infer_chains([m1, m2, m3], backwards=False)) + index = get_index(infer_chains([m1, m2, m3], backwards=False, progress=False), progress=False) self.assert_same_triples([m1, m2, m3, m4, m4, m5, m6], index) self.assertNotIn(m4_inv.triple, index, msg=backwards_msg) self.assertNotIn(m5_inv.triple, index, msg=backwards_msg) @@ -164,12 +164,12 @@ def test_no_infer(self): def test_infer_broad_match_1(self): r1, r2, r3, r4 = _get_references(4) m1, m2, m3 = line(r1, EXACT_MATCH, r2, BROAD_MATCH, r3, EXACT_MATCH, r4) - m4 = Mapping(s=r1, p=BROAD_MATCH, o=r3) - m5 = Mapping(s=r1, p=BROAD_MATCH, o=r4) - m6 = Mapping(s=r2, p=BROAD_MATCH, o=r4) - m4_i = Mapping(o=r1, p=NARROW_MATCH, s=r3) - m5_i = Mapping(o=r1, p=NARROW_MATCH, s=r4) - m6_i = Mapping(o=r2, p=NARROW_MATCH, s=r4) + m4 = Mapping(s=r1, p=BROAD_MATCH, o=r3, evidence=[EV]) + m5 = Mapping(s=r1, p=BROAD_MATCH, o=r4, evidence=[EV]) + m6 = Mapping(s=r2, p=BROAD_MATCH, o=r4, evidence=[EV]) + m4_i = Mapping(o=r1, p=NARROW_MATCH, s=r3, evidence=[EV]) + m5_i = Mapping(o=r1, p=NARROW_MATCH, s=r4, evidence=[EV]) + m6_i = Mapping(o=r2, p=NARROW_MATCH, s=r4, evidence=[EV]) # Check inference over two steps self.assert_same_triples( @@ -258,7 +258,7 @@ def test_filter_self(self): m2 = Mapping(s=r12, p=EXACT_MATCH, o=r22) m3 = Mapping(s=r11, p=EXACT_MATCH, o=r13) mappings = [m1, m2, m3] - self.assert_same_triples([m1, m2], filter_self_matches(mappings)) + self.assert_same_triples([m1, m2], filter_self_matches(mappings, progress=False)) def test_filter_negative(self): """Test filtering out mappings within a given prefix.""" @@ -268,7 +268,7 @@ def test_filter_negative(self): m2 = Mapping(s=r12, p=EXACT_MATCH, o=r22) mappings = [m1, m2] negative = [m2] - self.assert_same_triples([m1], filter_mappings(mappings, negative)) + self.assert_same_triples([m1], filter_mappings(mappings, negative, progress=False)) def test_project(self): """Test projecting into a given source/target pair.""" @@ -280,7 +280,7 @@ def test_project(self): m2_i = Mapping(o=r12, p=EXACT_MATCH, s=r22) m3 = Mapping(s=r11, p=EXACT_MATCH, o=r31) mappings = [m1, m2, m2_i, m3] - self.assert_same_triples([m1, m2], project(mappings, "p1", "p2")) + self.assert_same_triples([m1, m2], project(mappings, "p1", "p2", progress=False)) def test_get_many_to_many(self): """Test getting many-to-many mappings.""" From a8091bb36f05c0116f3c50557c853fabd1d090f9 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 10:25:42 +0100 Subject: [PATCH 3/5] Update probability model and docs --- src/semra/io.py | 17 +++++-- src/semra/struct.py | 112 +++++++++++++++++++++++--------------------- tests/test_api.py | 32 ++++++++++++- 3 files changed, 102 insertions(+), 59 deletions(-) diff --git a/src/semra/io.py b/src/semra/io.py index f5aad28..4e1bada 100644 --- a/src/semra/io.py +++ b/src/semra/io.py @@ -391,7 +391,7 @@ def _get_sssom_row(mapping: Mapping, e: Evidence): ",".join(sorted(e.mapping_set_names)), mapping_set_version, mapping_set_license, - round(confidence, CONFIDENCE_PRECISION) if (confidence := e.confidence) is not None else "", + _safe_confidence(e), e.author.curie if e.author else "", e.explanation, ) @@ -436,6 +436,13 @@ def _neo4j_bool(b: bool, /) -> Literal["true", "false"]: # noqa:FBT001 return "true" if b else "false" # type:ignore +def _safe_confidence(x) -> str: + confidence = x.get_confidence() + if confidence is None: + return "" + return str(round(confidence, CONFIDENCE_PRECISION)) + + def write_neo4j( mappings: list[Mapping], directory: str | Path, @@ -515,7 +522,7 @@ def write_neo4j( mapping.s.curie, mapping.p.curie, mapping.o.curie, - round(c, CONFIDENCE_PRECISION) if (c := mapping.confidence) is not None else "", + _safe_confidence(mapping), _neo4j_bool(mapping.has_primary), _neo4j_bool(mapping.has_secondary), _neo4j_bool(mapping.has_tertiary), @@ -566,7 +573,7 @@ def write_neo4j( "mapping", "semra.mapping", mapping.p.curie, - mapping.confidence and round(mapping.confidence, CONFIDENCE_PRECISION), + _safe_confidence(mapping), _neo4j_bool(mapping.has_primary), _neo4j_bool(mapping.has_secondary), _neo4j_bool(mapping.has_tertiary), @@ -585,7 +592,7 @@ def write_neo4j( mapping_set.name, mapping_set.license or "", mapping_set.version or "", - c if (c := mapping_set.confidence) is not None else "", + _safe_confidence(mapping_set), ) for mapping_set in sorted(mapping_sets.values(), key=lambda n: n.curie) ), @@ -600,7 +607,7 @@ def write_neo4j( "semra.evidence", evidence.evidence_type, evidence.justification.curie, - c if (c := evidence.confidence) is not None else "", + _safe_confidence(evidence), ) for evidence in sorted(evidences.values(), key=lambda row: row.curie) ), diff --git a/src/semra/struct.py b/src/semra/struct.py index 9289c63..a4e60c5 100644 --- a/src/semra/struct.py +++ b/src/semra/struct.py @@ -8,7 +8,7 @@ from collections.abc import Iterable from hashlib import md5 from itertools import islice -from typing import Annotated, Literal, Optional, Union +from typing import Annotated, ClassVar, Literal, Optional, Union import pydantic from curies import Reference @@ -37,6 +37,7 @@ def triple_key(triple: Triple) -> tuple[str, str, str]: return triple[0].curie, triple[2].curie, triple[1].curie +EPSILON = 1e-6 EvidenceType = Literal["simple", "mutated", "reasoned"] JUSTIFICATION_FIELD = Field(description="A SSSOM-compliant justification") @@ -47,43 +48,70 @@ def _md5_hexdigest(picklable) -> str: return hasher.hexdigest() -class EvidenceMixin: +class KeyedMixin: + """A mixin for a class that can be hashed and CURIE-encoded.""" + + #: The prefix for CURIEs for instances of this class + _prefix: ClassVar[str] + + def __init_subclass__(cls, *, prefix: str, **kwargs): + cls._prefix = prefix + def key(self): + """Return a picklable key.""" raise NotImplementedError def hexdigest(self) -> str: + """Generate a hexadecimal representation of the MD5 hash of the pickled key() for this class.""" key = self.key() return _md5_hexdigest(key) def get_reference(self) -> Reference: - return Reference(prefix="semra.evidence", identifier=self.hexdigest()) + """Get a CURIE reference using this class's prefix and its hexadecimal representation.""" + return Reference(prefix=self._prefix, identifier=self.hexdigest()) @property def curie(self) -> str: + """Get a string representing the CURIE.""" return self.get_reference().curie -class MappingSet(pydantic.BaseModel): +class ConfidenceMixin: + def get_confidence(self) -> float: + raise NotImplementedError + + +class EvidenceMixin: + @property + def explanation(self) -> str: + return "" + + +class MappingSet(pydantic.BaseModel, ConfidenceMixin, KeyedMixin, prefix="semra.mappingset"): + """Represents a set of semantic mappings. + + For example, this might correspond to: + + 1. All the mappings extracted from an ontology + 2. All the mappings published with a database + 3. All the mappings inferred by SeMRA based on a given configuration + """ + name: str = Field(..., description="Name of the mapping set") version: Optional[str] = Field(default=None, description="The version of the dataset from which the mapping comes") license: Optional[str] = Field(default=None, description="License name or URL for mapping set") - confidence: Optional[float] = Field(default=None, description="Mapping set level confidence") + confidence: float = Field(..., description="Mapping set level confidence") def key(self): - return self.name, self.version or "", self.license or "", 1.0 if self.confidence is None else self.confidence + """Get a picklable key representing the mapping set.""" + return self.name, self.version or "", self.license or "", self.confidence - def hexdigest(self) -> str: - return _md5_hexdigest(self.key()) + def get_confidence(self) -> float: + """Get the confidence for the mapping set.""" + return self.confidence - def get_reference(self) -> Reference: - return Reference(prefix="semra.mappingset", identifier=self.hexdigest()) - @property - def curie(self) -> str: - return self.get_reference().curie - - -class SimpleEvidence(pydantic.BaseModel, EvidenceMixin): +class SimpleEvidence(pydantic.BaseModel, KeyedMixin, EvidenceMixin, ConfidenceMixin, prefix="semra.evidence"): """Evidence for a mapping. Ideally, this matches the SSSOM data model. @@ -108,6 +136,7 @@ class Config: ], ) uuid: UUID4 = Field(default_factory=uuid.uuid4) + confidence: Optional[float] = Field(None, description="The confidence") def key(self): """Get a key suitable for hashing the evidence. @@ -116,22 +145,17 @@ def key(self): Note: this should be extended to include basically _all_ fields """ - return (self.evidence_type, self.justification, self.author, self.mapping_set.key(), self.uuid) + return self.evidence_type, self.justification, self.author, self.mapping_set.key(), self.uuid @property def mapping_set_names(self) -> set[str]: return {self.mapping_set.name} - @property - def confidence(self) -> Optional[float]: - return self.mapping_set.confidence - - @property - def explanation(self) -> str: - return "" + def get_confidence(self) -> float: + return self.confidence if self.confidence is not None else self.mapping_set.confidence -class ReasonedEvidence(pydantic.BaseModel, EvidenceMixin): +class ReasonedEvidence(pydantic.BaseModel, KeyedMixin, EvidenceMixin, ConfidenceMixin, prefix="semra.evidence"): """A complex evidence based on multiple mappings.""" class Config: @@ -145,7 +169,7 @@ class Config: ..., description="A list of mappings and their evidences consumed to create this evidence" ) author: Optional[Reference] = None - confidence_factor: float = 1.0 + confidence_factor: float = Field(1.0, description="The probability that the reasoning method is correct") def key(self): return ( @@ -154,13 +178,9 @@ def key(self): *((*m.triple, *(e.key() for e in m.evidence)) for m in self.mappings), ) - @property - def confidence(self) -> Optional[float]: - confidences = [mapping.confidence for mapping in self.mappings] - nn_confidences = [c for c in confidences if c is not None] - if not nn_confidences: - return None - return self.confidence_factor * _joint_probability(nn_confidences) + def get_confidence(self) -> float: + confidences = [mapping.get_confidence() for mapping in self.mappings] + return _joint_probability([self.confidence_factor, *confidences]) @property def mapping_set(self) -> None: @@ -183,7 +203,7 @@ def explanation(self) -> str: ] -class Mapping(pydantic.BaseModel): +class Mapping(pydantic.BaseModel, ConfidenceMixin, KeyedMixin, prefix="semra.mapping"): """A semantic mapping.""" class Config: @@ -202,30 +222,16 @@ def triple(self) -> Triple: return self.s, self.p, self.o @classmethod - def from_triple(cls, triple: Triple, evidence: Union[list[Evidence], None] = None) -> Mapping: + def from_triple(cls, triple: Triple, evidence: Optional[list[Evidence]] = None) -> Mapping: """Instantiate a mapping from a triple.""" s, p, o = triple return cls(s=s, p=p, o=o, evidence=evidence or []) - @property - def confidence(self) -> Optional[float]: + def get_confidence(self) -> float: + """Get the mapping's confidence by aggregating its evidences' confidences in a binomial model.""" if not self.evidence: - return None - confidences = [e.confidence for e in self.evidence] - nn_confidences = [c for c in confidences if c is not None] - if not nn_confidences: - return None - return _joint_probability(nn_confidences) - - def hexdigest(self) -> str: - return _md5_hexdigest(self.triple) - - def get_reference(self) -> Reference: - return Reference(prefix="semra.mapping", identifier=self.hexdigest()) - - @property - def curie(self) -> str: - return self.get_reference().curie + raise ValueError("can not calculate confidence since no evidence") + return _joint_probability(e.get_confidence() for e in self.evidence) @property def has_primary(self) -> bool: diff --git a/tests/test_api.py b/tests/test_api.py index b19881d..c6fe7d2 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -4,6 +4,7 @@ from semra.api import ( BROAD_MATCH, + DB_XREF, EXACT_MATCH, NARROW_MATCH, Index, @@ -13,11 +14,12 @@ get_index, get_many_to_many, infer_chains, + infer_mutations, infer_reversible, keep_prefixes, project, ) -from semra.rules import MANUAL_MAPPING +from semra.rules import KNOWLEDGE_MAPPING, MANUAL_MAPPING from semra.struct import Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence, line, triple_key @@ -295,3 +297,31 @@ def test_get_many_to_many(self): m4 = Mapping(s=a3, p=EXACT_MATCH, o=b2) self.assert_same_triples([m3, m4], get_many_to_many([m2, m3, m4])) + + +class TestUpgrades(unittest.TestCase): + """Test inferring mutations.""" + + def test_infer_mutations(self): + """Test inferring mutations.""" + (a1,) = _get_references(1, prefix="a") + (b1,) = _get_references(1, prefix="b") + original_confidence = 0.95 + mutation_confidence = 0.80 + m1 = Mapping(s=a1, p=DB_XREF, o=b1, evidence=[SimpleEvidence(confidence=original_confidence, mapping_set=MS)]) + new_mappings = infer_mutations( + [m1], {("a", "b"): mutation_confidence}, old=DB_XREF, new=EXACT_MATCH, progress=False + ) + self.assertEqual(2, len(new_mappings)) + new_m1, new_m2 = new_mappings + self.assertEqual(m1, new_m1) + self.assertEqual(a1, new_m2.s) + self.assertEqual(EXACT_MATCH, new_m2.p) + self.assertEqual(b1, new_m2.o) + self.assertEqual(1, len(new_m2.evidence)) + new_evidence = new_m2.evidence[0] + self.assertIsInstance(new_evidence, ReasonedEvidence) + new_confidence = new_evidence.get_confidence() + self.assertIsNotNone(new_confidence) + self.assertEqual(1 - (1 - original_confidence) * (1 - mutation_confidence), new_confidence) + self.assertEqual(KNOWLEDGE_MAPPING, new_evidence.justification) From ce4f34a448a46b3bdbebf358ab899d001368b693 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 11:06:27 +0100 Subject: [PATCH 4/5] Add py38 tests --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0eab69d..1c70cb3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - python-version: [ "3.11", "3.9" ] + python-version: [ "3.11", "3.8" ] pydantic: [ "pydantic1", "pydantic2" ] steps: - uses: actions/checkout@v2 From 8d1d4b472ddb50d34d5ab1cf2e326f54d54aa928 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 11:07:16 +0100 Subject: [PATCH 5/5] Add confidence filter function --- src/semra/api.py | 11 +++++++++++ tests/test_api.py | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/semra/api.py b/src/semra/api.py index 37ca9ad..f2091b5 100644 --- a/src/semra/api.py +++ b/src/semra/api.py @@ -498,3 +498,14 @@ def summarize_prefixes(mappings: list[Mapping]) -> pd.DataFrame: [(prefix, bioregistry.get_name(prefix), bioregistry.get_description(prefix)) for prefix in sorted(prefixes)], columns=["prefix", "name", "description"], ).set_index("prefix") + + +def filter_minimum_confidence(mappings: Iterable[Mapping], cutoff: float = 0.7) -> Iterable[Mapping]: + """Filter mappings below a given confidence.""" + for mapping in mappings: + try: + confidence = mapping.get_confidence() + except ValueError: + continue + if confidence >= cutoff: + yield mapping diff --git a/tests/test_api.py b/tests/test_api.py index c6fe7d2..edd1fa7 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -2,6 +2,7 @@ import unittest +from semra import api from semra.api import ( BROAD_MATCH, DB_XREF, @@ -298,6 +299,15 @@ def test_get_many_to_many(self): m4 = Mapping(s=a3, p=EXACT_MATCH, o=b2) self.assert_same_triples([m3, m4], get_many_to_many([m2, m3, m4])) + def test_filter_confidence(self): + """Test filtering by confidence.""" + (a1, a2) = _get_references(2, prefix="a") + (b1, b2) = _get_references(2, prefix="b") + m1 = Mapping(s=a1, p=DB_XREF, o=b1, evidence=[SimpleEvidence(confidence=0.95, mapping_set=MS)]) + m2 = Mapping(s=a1, p=DB_XREF, o=b1, evidence=[SimpleEvidence(confidence=0.65, mapping_set=MS)]) + mmm = list(api.filter_minimum_confidence([m1, m2], cutoff=0.7)) + self.assertEqual([m1], mmm) + class TestUpgrades(unittest.TestCase): """Test inferring mutations."""