diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6cb2580..5e83b4c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -70,6 +70,9 @@ jobs: - name: Test with pytest and generate coverage file run: tox run -e py-pydantic${{ matrix.pydantic }} + - name: Doctests + run: + tox run -e doctests - name: Upload coverage report to codecov uses: codecov/codecov-action@v1 if: success() diff --git a/MANIFEST.in b/MANIFEST.in index e17692c..6c3ce2a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -10,6 +10,7 @@ prune docs/source/api recursive-include docs/source *.py recursive-include docs/source *.rst recursive-include docs/source *.png +recursive-include docs/source *.svg global-exclude *.py[cod] __pycache__ *.so *.dylib .DS_Store *.gpickle diff --git a/README.md b/README.md index 894af60..bad9f5c 100644 --- a/README.md +++ b/README.md @@ -74,10 +74,50 @@ will return `GO:0032571` instead of `OBO:GO_0032571`. Full documentation is available at [curies.readthedocs.io](https://curies.readthedocs.io). +### Chaining + +This package implements a faultless chain operation `curies.chain` that is configurable for case +sensitivity and fully considers all synonyms. + +`chain()` prioritizes based on the order given. Therefore, if two prefix maps +having the same prefix but different URI prefixes are given, the first is retained. The second +is retained as a synonym: + +```python +from curies import Converter, chain + +c1 = Converter.from_prefix_map({"GO": "http://purl.obolibrary.org/obo/GO_"}) +c2 = Converter.from_prefix_map({"GO": "https://identifiers.org/go:"}) +converter = chain([c1, c2]) + +>>> converter.expand("GO:1234567") +'http://purl.obolibrary.org/obo/GO_1234567' +>>> converter.compress("http://purl.obolibrary.org/obo/GO_1234567") +'GO:1234567' +>>> converter.compress("https://identifiers.org/go:1234567") +'GO:1234567' +``` + + Chain is the perfect tool if you want to override parts of an existing extended + prefix map. 
For example, if you want to use most of the Bioregistry, but you + would like to specify a custom URI prefix (e.g., using Identifiers.org), you + can do the following: + +```python +from curies import Converter, chain, get_bioregistry_converter + +overrides = Converter.from_prefix_map({"pubmed": "https://identifiers.org/pubmed:"}) +bioregistry_converter = get_bioregistry_converter() +converter = chain([overrides, bioregistry_converter]) + +>>> converter.expand("pubmed:1234") +'https://identifiers.org/pubmed:1234' +``` + ### Standardization The `curies.Converter` data structure supports prefix and URI prefix synonyms. -The following exampl demonstrates +The following example demonstrates using these synonyms to support standardizing prefixes, CURIEs, and URIs. Note below, the colloquial prefix `gomf`, sometimes used to represent the subspace in the [Gene Ontology (GO)](https://obofoundry.org/ontology/go) corresponding to molecular diff --git a/docs/source/struct.rst b/docs/source/struct.rst index a2ce74b..59a0028 100644 --- a/docs/source/struct.rst +++ b/docs/source/struct.rst @@ -1,4 +1,117 @@ Data Structures =============== -To do: add an explanation of prefix maps, bimaps, reverse prefix maps, extended prefix maps. -In the meantime, see https://cthoyt.com/2023/01/10/curies-package.html. +A *semantic space* is a collections of identifiers for concepts. For example, +the Chemical Entities of Biomedical Interest (ChEBI) has a semantic space +including identifiers for chemicals. Within ChEBI's semantic space, +`138488` corresponds to the chemical `alsterpaullone `_. + +.. warning:: + + `138488` is a *local unique identifier*. Other semantic spaces might use the same local + unique identifier to refer to a different concept in their respective domain. + +Therefore, local unique identifiers should be qualified with some additional information saying what semantic space +it comes from. 
The two common formalisms for doing this are Uniform Resource Identifiers (URIs) and +Compact URIs (CURIEs): + +.. image:: syntax_demo.svg + :alt: Demo of URI and CURIE for alsterpaullone. + +In many applications, it's important to be able to convert between CURIEs and URIs. +Therefore, we need a data structure that connects the CURIE prefixes like ``CHEBI`` +to the URI prefixes like ``http://purl.obolibrary.org/obo/CHEBI_``. + +Prefix Maps +----------- +A prefix map is a dictionary data structure where keys represent CURIE prefixes +and their associated values represent URI prefixes. Ideally, these are constrained +to be bijective (i.e., no duplicate keys, no duplicate values), but this is not always +done in practice. Here's an example prefix map containing information about semantic +spaces from a small selection of OBO Foundry ontologies: + +.. code-block:: json + + { + "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", + "MONDO": "http://purl.obolibrary.org/obo/MONDO_", + "GO": "http://purl.obolibrary.org/obo/GO_" + } + +Prefix maps have the benefit of being simple and straightforward. +They appear in many linked data applications, including: + +- the ``@prefix`` declarations at the top of Turtle (RDF) documents and SPARQL queries +- `JSON-LD `_ +- XML documents +- OWL ontologies + +.. note:: + + Prefix maps can be loaded using :meth:`curies.Converter.from_prefix_map`. + +*However*, prefix maps have the main limitation that they do not have first-class support for +synonyms of CURIE prefixes or URI prefixes. In practice, a variety of synonyms are used +for both. 
For example, the NCBI Taxonomy database appears with many different CURIE prefixes: + +============== ==================================== +CURIE Prefix Resource(s) +============== ==================================== +``taxonomy`` Identifiers.org, Name-to-Thing +``taxon`` Gene Ontology Registry +``NCBITaxon`` OBO Foundry, Prefix Commons, OntoBee +``NCBITAXON`` BioPortal +``NCBI_TaxID`` Cellosaurus +``ncbitaxon`` OLS +``P685`` Wikidata +``fj07xj`` FAIRsharing +============== ==================================== + +Similarly, many different URIs can be constructed for the same ChEBI local unique identifier. Using +alsterpaullone as an example, this includes (many omitted): + +==================================================== =================== +URI Prefix Provider +==================================================== =================== +``https://www.ebi.ac.uk/chebi/searchId.do?chebiId=`` ChEBI (first-party) +``https://identifiers.org/CHEBI:`` Identifiers.org +``https://identifiers.org/CHEBI/`` Identifiers.org +``http://identifiers.org/CHEBI:`` Identifiers.org +``http://identifiers.org/CHEBI/`` Identifiers.org +``http://purl.obolibrary.org/obo/CHEBI_`` OBO Foundry +``https://n2t.net/chebi:`` Name-to-thing +==================================================== =================== + +In practice, we need to be able to support the fact that there are many CURIE prefixes +and URI prefixes for most semantic spaces as well as specify which CURIE prefix and +URI prefix is the "preferred" one in a given context. Prefix maps, unfortunately, have no way to +address this. Therefore, we're going to introduce a new data structure. + +Extended Prefix Maps +-------------------- +Extended Prefix Maps (EPMs) address the issues with prefix maps by including explicit +fields for CURIE prefix synonyms and URI prefix synonyms while maintaining an explicit +field for the preferred CURIE prefix and URI prefix. 
An abbreviated example (just +containing an entry for ChEBI) looks like: + +.. code-block:: json + + [ + { + "prefix": "CHEBI", + "uri_prefix": "http://purl.obolibrary.org/obo/CHEBI_", + "prefix_synonyms": ["chebi"], + "uri_prefix_synonyms": [ + "https://identifiers.org/chebi:" + ] + } + ] + +EPMs have the benefit that they are still encoded in JSON and can easily be encoded in +YAML, TOML, RDF, and other schemata. + +.. note:: + + We are introducing this as a new standard in the :mod:`curies` package. They + can be loaded using :meth:`curies.Converter.from_extended_prefix_map`. + We provide a Pydantic model representing it. Later, we hope to have an external, stable definition + of this data schema. diff --git a/docs/source/syntax_demo.svg b/docs/source/syntax_demo.svg new file mode 100644 index 0000000..934fe12 --- /dev/null +++ b/docs/source/syntax_demo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/curies/api.py b/src/curies/api.py index 18e844e..fd7cc1a 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -145,6 +145,9 @@ def from_curie(cls, curie: str, sep: str = ":") -> "Reference": return cls(prefix=prefix, identifier=identifier) +RecordKey = Tuple[str, str, str, str] + + class Record(BaseModel): # type:ignore """A record of some prefixes and their associated URI prefixes.""" @@ -181,6 +184,16 @@ def _all_prefixes(self) -> List[str]: def _all_uri_prefixes(self) -> List[str]: return [self.uri_prefix, *self.uri_prefix_synonyms] + @property + def _key(self) -> RecordKey: + """Get a hashable key.""" + return ( + self.prefix, + self.uri_prefix, + ",".join(sorted(self.prefix_synonyms)), + ",".join(sorted(self.uri_prefix_synonyms)), + ) + class DuplicateValueError(ValueError): """An error raised with constructing a converter with data containing duplicate values.""" @@ -365,24 +378,73 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool self.reverse_prefix_map = _get_reverse_prefix_map(records) self.trie = 
    @property
    def bimap(self) -> Mapping[str, str]:
        """Get the bijective mapping between canonical CURIE prefixes and canonical URI prefixes.

        Synonyms are intentionally excluded, so this is safe to invert.
        """
        return {r.prefix: r.uri_prefix for r in self.records}

    def _match_record(
        self, external: Record, case_sensitive: bool = True
    ) -> Mapping[RecordKey, List[str]]:
        """Match the given record against the records already in this converter.

        :param external: The candidate record to compare against existing records.
        :param case_sensitive: Should prefixes and URI prefixes be compared
            case-sensitively? Defaults to True.
        :return: A mapping from the hashable key of each matched existing record
            to a list of human-readable reasons why it matched. An empty mapping
            means the candidate is entirely new; more than one entry means the
            candidate ambiguously overlaps several records.
        """
        rv: DefaultDict[RecordKey, List[str]] = defaultdict(list)
        for record in self.records:
            # Match CURIE prefixes: check the candidate's preferred prefix and
            # each of its prefix synonyms against both the existing record's
            # preferred prefix and its synonyms.
            if _eq(external.prefix, record.prefix, case_sensitive=case_sensitive):
                rv[record._key].append("prefix match")
            if _in(external.prefix, record.prefix_synonyms, case_sensitive=case_sensitive):
                rv[record._key].append("prefix match")
            for prefix_synonym in external.prefix_synonyms:
                if _eq(prefix_synonym, record.prefix, case_sensitive=case_sensitive):
                    rv[record._key].append("prefix match")
                if _in(prefix_synonym, record.prefix_synonyms, case_sensitive=case_sensitive):
                    rv[record._key].append("prefix match")

            # Match URI prefixes, symmetrically to the CURIE prefix checks above.
            if _eq(external.uri_prefix, record.uri_prefix, case_sensitive=case_sensitive):
                rv[record._key].append("URI prefix match")
            if _in(external.uri_prefix, record.uri_prefix_synonyms, case_sensitive=case_sensitive):
                rv[record._key].append("URI prefix match")
            for uri_prefix_synonym in external.uri_prefix_synonyms:
                if _eq(uri_prefix_synonym, record.uri_prefix, case_sensitive=case_sensitive):
                    rv[record._key].append("URI prefix match")
                if _in(
                    uri_prefix_synonym, record.uri_prefix_synonyms, case_sensitive=case_sensitive
                ):
                    rv[record._key].append("URI prefix match")
        return dict(rv)

    def add_record(self, record: Record, case_sensitive: bool = True, merge: bool = False) -> None:
        """Append a record to the converter.

        :param record: The new record to add.
        :param case_sensitive: Should prefixes and URI prefixes be compared
            case-sensitively when matching against existing records? Defaults to True.
        :param merge: If the record matches exactly one existing record, should it be
            merged into it? When false, any match raises an error. Defaults to False.
        :raises ValueError: If the record matches more than one existing record
            (ambiguous), or matches exactly one but ``merge`` is False.
        """
        matched = self._match_record(record, case_sensitive=case_sensitive)
        if len(matched) > 1:
            raise ValueError(f"new record has duplicates: {matched}")
        if len(matched) == 1:
            if not merge:
                raise ValueError(f"new record already exists and merge=False: {matched}")

            # Exactly one match: fold the new record's prefixes/URI prefixes into
            # the existing record, then re-index it so lookups see the additions.
            key = list(matched)[0]
            existing_record = next(r for r in self.records if r._key == key)
            self._merge(record, into=existing_record)
            self._index(existing_record)
        else:
            # Append a new record.
            # NOTE(review): this stores the caller's Record instance itself; a later
            # merge mutates it in place, which also mutates the caller's object —
            # consider storing a copy. Confirm whether callers rely on this aliasing.
            self.records.append(record)
            self._index(record)

    @staticmethod
    def _merge(record: Record, into: Record) -> None:
        """Merge ``record``'s prefix, URI prefix, and all synonyms into ``into`` in place.

        ``into``'s preferred prefix and URI prefix are kept; anything from ``record``
        not already present on ``into`` becomes a synonym. Synonym lists are re-sorted.
        """
        for prefix_synonym in itt.chain([record.prefix], record.prefix_synonyms):
            if prefix_synonym not in into._all_prefixes:
                into.prefix_synonyms.append(prefix_synonym)
        into.prefix_synonyms.sort()

        for uri_prefix_synonym in itt.chain([record.uri_prefix], record.uri_prefix_synonyms):
            if uri_prefix_synonym not in into._all_uri_prefixes:
                into.uri_prefix_synonyms.append(uri_prefix_synonym)
        into.uri_prefix_synonyms.sort()
+ merge: bool = False, ) -> None: """Append a prefix to the converter. @@ -413,6 +478,13 @@ def add_prefix( :param uri_prefix_synonyms: An optional collections of synonyms for the URI prefix such as ``https://bioregistry.io/go:``, ``http://www.informatics.jax.org/searches/GO.cgi?id=GO:``, etc. + :param case_sensitive: + Should prefixes and URI prefixes be compared in a case-sensitive manner when checking + for uniqueness? Defaults to True. + :param merge: + Should this record be merged into an existing record if it uniquely maps to a single + existing record? When false, will raise an error if one or more existing records can + be mapped. Defaults to false. This can be used to add missing namespaces on-the-fly to an existing converter: @@ -421,7 +493,7 @@ def add_prefix( >>> converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:") >>> converter.expand("hgnc:1234") 'https://bioregistry.io/hgnc:1234' - >>> converter.expand("GO:0032571 ") + >>> converter.expand("GO:0032571") 'http://purl.obolibrary.org/obo/GO_0032571' This can also be used to incrementally build up a converter from scratch: @@ -438,7 +510,7 @@ def add_prefix( prefix_synonyms=sorted(prefix_synonyms or []), uri_prefix_synonyms=sorted(uri_prefix_synonyms or []), ) - self.add_record(record) + self.add_record(record, case_sensitive=case_sensitive, merge=merge) @classmethod def from_extended_prefix_map( @@ -475,16 +547,24 @@ def from_extended_prefix_map( ... }, ... 
] >>> converter = Converter.from_extended_prefix_map(epm) - # Canonical prefix + + Expand using the preferred/canonical prefix: + >>> converter.expand("CHEBI:138488") 'http://purl.obolibrary.org/obo/CHEBI_138488' - # Prefix synoynm + + Expand using a prefix synonym: + >>> converter.expand("chebi:138488") 'http://purl.obolibrary.org/obo/CHEBI_138488' - # Canonical URI prefix + + Compress using the preferred/canonical URI prefix: + >>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488") 'CHEBI:138488' - # URI prefix synoynm + + Compressing using a URI prefix synonym: + >>> converter.compress("https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:138488") 'CHEBI:138488' @@ -616,7 +696,7 @@ def from_reverse_prefix_map( >>> url = "https://github.com/biopragmatics/bioregistry/raw/main/exports/contexts/bioregistry.rpm.json" >>> converter = Converter.from_reverse_prefix_map(url) - >>> "chebi" in Converter.prefix_map + >>> "chebi" in converter.prefix_map """ dd = defaultdict(list) for uri_prefix, prefix in _prepare(reverse_prefix_map).items(): @@ -770,7 +850,7 @@ def parse_uri(self, uri: str) -> Union[ReferenceTuple, Tuple[None, None]]: ... "GO": "http://purl.obolibrary.org/obo/GO_", ... 
}) >>> converter.parse_uri("http://purl.obolibrary.org/obo/CHEBI_138488") - ('CHEBI', '138488') + ReferenceTuple(prefix='CHEBI', identifier='138488') >>> converter.parse_uri("http://example.org/missing:0000000") (None, None) """ @@ -1126,11 +1206,20 @@ def get_record(self, prefix: str) -> Optional[Record]: return None -def _f(x: str) -> str: - return x +def _eq(a: str, b: str, case_sensitive: bool) -> bool: + if case_sensitive: + return a == b + return a.casefold() == b.casefold() + + +def _in(a: str, bs: Iterable[str], case_sensitive: bool) -> bool: + if case_sensitive: + return a in bs + nfa = a.casefold() + return any(nfa == b.casefold() for b in bs) -def chain(converters: Sequence[Converter], case_sensitive: bool = True) -> Converter: +def chain(converters: Sequence[Converter], *, case_sensitive: bool = True) -> Converter: """Chain several converters. :param converters: A list or tuple of converters @@ -1139,52 +1228,63 @@ def chain(converters: Sequence[Converter], case_sensitive: bool = True) -> Conve A converter that looks up one at a time in the other converters. :raises ValueError: If there are no converters + + Chain is the perfect tool if you want to override parts of an existing extended + prefix map. For example, if you want to use most of the Bioregistry, but you + would like to specify a custom URI prefix (e.g., using Identifiers.org), you + can do the following: + + >>> from curies import Converter, chain, get_bioregistry_converter + >>> overrides = Converter.from_prefix_map({"pubmed": "https://identifiers.org/pubmed:"}) + >>> bioregistry_converter = get_bioregistry_converter() + >>> converter = chain([overrides, bioregistry_converter]) + >>> converter.bimap["pubmed"] + 'https://identifiers.org/pubmed:' + + Similarly, this also works if you want to override a prefix. 
Keep in mind for this to work + with a simple prefix map, you need to make sure the URI prefix matches in each converter, + otherwise you will get duplicates: + + >>> from curies import Converter, chain, get_bioregistry_converter + >>> overrides = Converter.from_prefix_map({"PMID": "https://www.ncbi.nlm.nih.gov/pubmed/"}) + >>> bioregistry_converter = get_bioregistry_converter() + >>> converter = chain([overrides, bioregistry_converter]) + >>> converter.bimap["PMID"] + 'https://www.ncbi.nlm.nih.gov/pubmed/' + + A safer way is to specify your override using an extended prefix map, which can tie together + prefix synonyms and URI prefix synonyms: + + >>> from curies import Converter, chain, get_bioregistry_converter + >>> overrides = Converter.from_extended_prefix_map([ + ... { + ... "prefix": "PMID", + ... "prefix_synonyms": ["pubmed", "PubMed"], + ... "uri_prefix": "https://www.ncbi.nlm.nih.gov/pubmed/", + ... "uri_prefix_synonyms": [ + ... "https://identifiers.org/pubmed:", + ... "http://bio2rdf.org/pubmed:", + ... ], + ... }, + ... ]) + >>> converter = chain([overrides, bioregistry_converter]) + >>> converter.bimap["PMID"] + 'https://www.ncbi.nlm.nih.gov/pubmed/' + + Chain prioritizes based on the order given. 
Therefore, if two prefix maps + having the same prefix but different URI prefixes are given, the first is retained + + >>> from curies import Converter, chain + >>> c1 = Converter.from_prefix_map({"GO": "http://purl.obolibrary.org/obo/GO_"}) + >>> c2 = Converter.from_prefix_map({"GO": "https://identifiers.org/go:"}) + >>> c3 = chain([c1, c2]) + >>> c3.prefix_map["GO"] + 'http://purl.obolibrary.org/obo/GO_' """ if not converters: raise ValueError - - norm_func: Callable[[str], str] - if case_sensitive: - norm_func = _f - else: - norm_func = str.casefold - - key_to_pair: Dict[str, Tuple[str, str]] = {} - #: A mapping from the canonical key to the secondary URI expansions - uri_prefix_tails: DefaultDict[str, Set[str]] = defaultdict(set) - #: A mapping from the canonical key to the secondary prefixes - prefix_tails: DefaultDict[str, Set[str]] = defaultdict(set) + rv = Converter([]) for converter in converters: for record in converter.records: - key = norm_func(record.prefix) - if key not in key_to_pair: - key_to_pair[key] = record.prefix, record.uri_prefix - uri_prefix_tails[key].update(record.uri_prefix_synonyms) - prefix_tails[key].update(record.prefix_synonyms) - else: - uri_prefix_tails[key].add(record.uri_prefix) - uri_prefix_tails[key].update(record.uri_prefix_synonyms) - prefix_tails[key].add(record.prefix) - prefix_tails[key].update(record.prefix_synonyms) - - # clean up potential duplicates from merging - for key, uri_prefixes in uri_prefix_tails.items(): - uri_prefix = key_to_pair[key][1] - if uri_prefix in uri_prefixes: - uri_prefixes.remove(uri_prefix) - for key, prefixes in prefix_tails.items(): - prefix = key_to_pair[key][0] - if prefix in prefixes: - prefixes.remove(prefix) - - return Converter( - [ - Record( - prefix=prefix, - uri_prefix=uri_prefix, - prefix_synonyms=sorted(prefix_tails[key]), - uri_prefix_synonyms=sorted(uri_prefix_tails[key]), - ) - for key, (prefix, uri_prefix) in key_to_pair.items() - ] - ) + rv.add_record(record, 
case_sensitive=case_sensitive, merge=True) + return rv diff --git a/tests/constants.py b/tests/constants.py new file mode 100644 index 0000000..ad13ed8 --- /dev/null +++ b/tests/constants.py @@ -0,0 +1,6 @@ +"""Constants for testing.""" + +import unittest + +RUN_SLOW = True +SLOW = unittest.skipUnless(RUN_SLOW, reason="Skipping slow tests") diff --git a/tests/test_api.py b/tests/test_api.py index 291e999..c73012c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -30,6 +30,155 @@ get_obo_converter, ) from curies.version import get_version +from tests.constants import SLOW + +CHEBI_URI_PREFIX = "http://purl.obolibrary.org/obo/CHEBI_" +GO_URI_PREFIX = "http://purl.obolibrary.org/obo/GO_" + + +class TestAddRecord(unittest.TestCase): + """Test adding records.""" + + def setUp(self) -> None: + """Set up the test case.""" + self.prefix = "CHEBI" + self.uri_prefix = CHEBI_URI_PREFIX + self.prefix_synonym = "p" + self.uri_prefix_synonym = "u" + self.converter = Converter.from_extended_prefix_map( + [ + { + "prefix": self.prefix, + "prefix_synonyms": [self.prefix_synonym], + "uri_prefix": self.uri_prefix, + "uri_prefix_synonyms": [self.uri_prefix_synonym], + }, + ] + ) + + def test_duplicate_failure(self): + """Test failure caused by double matching.""" + self.converter.add_prefix("GO", GO_URI_PREFIX) + with self.assertRaises(ValueError): + self.converter.add_record(Record(prefix="GO", uri_prefix=CHEBI_URI_PREFIX)) + + def test_extend_on_prefix_match(self): + """Test adding a new prefix in merge mode.""" + s1, s2, s3 = "s1", "s2", "s3" + for record in [ + Record( + prefix="CHEBI", + prefix_synonyms=[s1], + uri_prefix=s2, + uri_prefix_synonyms=[s3], + ), + Record( + prefix=s1, + prefix_synonyms=["CHEBI"], + uri_prefix=s2, + uri_prefix_synonyms=[s3], + ), + ]: + with self.assertRaises(ValueError): + self.converter.add_record(record, merge=False) + self.converter.add_record(record, merge=True) + self.assertEqual(1, len(self.converter.records)) + record = 
self.converter.records[0] + self.assertEqual("CHEBI", record.prefix) + self.assertEqual({s1, self.prefix_synonym}, set(record.prefix_synonyms)) + self.assertEqual(CHEBI_URI_PREFIX, record.uri_prefix) + self.assertEqual({s2, s3, self.uri_prefix_synonym}, set(record.uri_prefix_synonyms)) + + def test_extend_on_uri_prefix_match(self): + """Test adding a new prefix in merge mode.""" + s1, s2, s3 = "s1", "s2", "s3" + for record in [ + Record( + prefix=s1, + prefix_synonyms=[s3], + uri_prefix=s2, + uri_prefix_synonyms=[CHEBI_URI_PREFIX], + ), + Record( + prefix=s1, + prefix_synonyms=[s3], + uri_prefix=CHEBI_URI_PREFIX, + uri_prefix_synonyms=[s2], + ), + ]: + with self.assertRaises(ValueError): + self.converter.add_record(record, merge=False) + self.converter.add_record(record, merge=True) + self.assertEqual(1, len(self.converter.records)) + record = self.converter.records[0] + self.assertEqual("CHEBI", record.prefix) + self.assertEqual({s1, s3, self.prefix_synonym}, set(record.prefix_synonyms)) + self.assertEqual(CHEBI_URI_PREFIX, record.uri_prefix) + self.assertEqual({s2, self.uri_prefix_synonym}, set(record.uri_prefix_synonyms)) + + def test_extend_on_prefix_synonym_match(self): + """Test adding a new prefix in merge mode.""" + s1, s2, s3 = "s1", "s2", "s3" + for record in [ + Record( + prefix=self.prefix_synonym, + prefix_synonyms=[s1], + uri_prefix=s2, + uri_prefix_synonyms=[s3], + ), + Record( + prefix=s1, + prefix_synonyms=[self.prefix_synonym], + uri_prefix=s2, + uri_prefix_synonyms=[s3], + ), + ]: + self.converter.add_record(record, merge=True) + self.assertEqual(1, len(self.converter.records)) + record = self.converter.records[0] + self.assertEqual("CHEBI", record.prefix) + self.assertEqual({s1, self.prefix_synonym}, set(record.prefix_synonyms)) + self.assertEqual(CHEBI_URI_PREFIX, record.uri_prefix) + self.assertEqual({s2, s3, self.uri_prefix_synonym}, set(record.uri_prefix_synonyms)) + + def test_extend_on_uri_prefix_synonym_match(self): + """Test adding a new 
prefix in merge mode.""" + s1, s2, s3 = "s1", "s2", "s3" + for record in [ + Record( + prefix=s1, + prefix_synonyms=[s2], + uri_prefix=self.uri_prefix_synonym, + uri_prefix_synonyms=[s3], + ), + Record( + prefix=s1, + prefix_synonyms=[s2], + uri_prefix=s3, + uri_prefix_synonyms=[self.uri_prefix_synonym], + ), + ]: + self.converter.add_record(record, merge=True) + self.assertEqual(1, len(self.converter.records)) + record = self.converter.records[0] + self.assertEqual("CHEBI", record.prefix) + self.assertEqual({s1, s2, self.prefix_synonym}, set(record.prefix_synonyms)) + self.assertEqual(CHEBI_URI_PREFIX, record.uri_prefix) + self.assertEqual({s3, self.uri_prefix_synonym}, set(record.uri_prefix_synonyms)) + + def test_extend_on_prefix_match_ci(self): + """Test adding a new prefix in merge mode.""" + s1, s2, s3 = "s1", "s2", "s3" + record = Record( + prefix="chebi", prefix_synonyms=[s1], uri_prefix=s2, uri_prefix_synonyms=[s3] + ) + self.converter.add_record(record, case_sensitive=False, merge=True) + self.assertEqual(1, len(self.converter.records)) + record = self.converter.records[0] + self.assertEqual("CHEBI", record.prefix) + self.assertEqual({"chebi", s1, self.prefix_synonym}, set(record.prefix_synonyms)) + self.assertEqual(CHEBI_URI_PREFIX, record.uri_prefix) + self.assertEqual({s2, s3, self.uri_prefix_synonym}, set(record.uri_prefix_synonyms)) class TestConverter(unittest.TestCase): @@ -107,8 +256,10 @@ def test_convert(self): def _assert_convert(self, converter: Converter): self.assertIn("GO", converter.prefix_map) + self.assertIn("GO", converter.bimap) self.assertIn("http://purl.obolibrary.org/obo/GO_", converter.reverse_prefix_map) self.assertIn("http://purl.obolibrary.org/obo/GO_", converter.trie) + self.assertIn("http://purl.obolibrary.org/obo/GO_", converter.bimap.values()) for curie, uri in [ ("CHEBI:1", "http://purl.obolibrary.org/obo/CHEBI_1"), ("OBO:unnamespaced", "http://purl.obolibrary.org/obo/unnamespaced"), @@ -134,6 +285,7 @@ def 
_assert_convert(self, converter: Converter): self.assertIsInstance(record, Record) self.assertEqual("GO", record.prefix) + @SLOW def test_bioregistry(self): """Test loading a remote JSON-LD context.""" for web in [True, False]: @@ -162,6 +314,7 @@ def test_jsonld(self): self.assertIn("hello", converter.prefix_map) self.assertIn("CHEBI", converter.prefix_map) + @SLOW def test_from_github(self): """Test getting a JSON-LD map from GitHub.""" with self.assertRaises(ValueError): @@ -173,18 +326,21 @@ def test_from_github(self): ) self.assertIn("rdf", semweb_converter.prefix_map) + @SLOW def test_obo(self): """Test the OBO converter.""" obo_converter = get_obo_converter() self.assertIn("CHEBI", obo_converter.prefix_map) self.assertNotIn("chebi", obo_converter.prefix_map) + @SLOW def test_monarch(self): """Test the Monarch converter.""" monarch_converter = get_monarch_converter() self.assertIn("CHEBI", monarch_converter.prefix_map) self.assertNotIn("chebi", monarch_converter.prefix_map) + @SLOW def test_go_registry(self): """Test the GO registry converter.""" go_converter = get_go_converter() @@ -202,8 +358,10 @@ def assert_bioregistry_converter(self, converter: Converter) -> None: self.assertIn("ChEBI", record.prefix_synonyms) self.assertIn("chebi", converter.prefix_map) + self.assertIn("chebi", converter.bimap) # Synonyms that are non-conflicting also get added self.assertIn("CHEBI", converter.prefix_map) + self.assertNotIn("CHEBI", converter.bimap) chebi_uri = converter.prefix_map["chebi"] self.assertIn(chebi_uri, converter.reverse_prefix_map) self.assertEqual("chebi", converter.reverse_prefix_map[chebi_uri]) @@ -299,6 +457,8 @@ def test_combine(self): } ) converter = chain([c1, c2], case_sensitive=True) + + self.assertEqual("CHEBI", converter.get_record("CHEBI").prefix) for url in [ "http://purl.obolibrary.org/obo/CHEBI_138488", "https://bioregistry.io/chebi:138488", @@ -306,16 +466,48 @@ def test_combine(self): 
"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=138488", ]: self.assertEqual("CHEBI:138488", converter.compress(url)) + + self.assertEqual("GO", converter.get_record("GO").prefix) self.assertEqual( "GO:0000001", converter.compress("http://purl.obolibrary.org/obo/GO_0000001"), ) + + self.assertEqual( + "http://purl.obolibrary.org/obo/CHEBI_", converter.get_record("CHEBI").uri_prefix + ) + self.assertIn("CHEBI", converter.prefix_map) + self.assertEqual("http://purl.obolibrary.org/obo/CHEBI_", converter.prefix_map["CHEBI"]) self.assertEqual( "http://purl.obolibrary.org/obo/CHEBI_138488", converter.expand("CHEBI:138488"), ) self.assertNotIn("nope", converter.get_prefixes()) + def test_combine_with_synonyms(self): + """Test combination with synonyms.""" + r1 = Record(prefix="GO", uri_prefix=GO_URI_PREFIX) + r2 = Record(prefix="go", prefix_synonyms=["GO"], uri_prefix="https://identifiers.org/go:") + + c1 = Converter([]) + c1.add_record(r1) + self.assertEqual(c1.records, Converter([r1]).records) + + c1.add_record(r2, merge=True) + self.assertEqual(1, len(c1.records)) + r = c1.records[0] + self.assertEqual("GO", r.prefix) + self.assertEqual({"go"}, set(r.prefix_synonyms)) + self.assertEqual("http://purl.obolibrary.org/obo/GO_", r.uri_prefix) + self.assertEqual({"https://identifiers.org/go:"}, set(r.uri_prefix_synonyms)) + + c3 = chain([Converter([r1]), Converter([r2])]) + self.assertEqual(1, len(c3.records)) + self.assertIn("GO", c3.prefix_map) + self.assertIn("go", c3.prefix_map, msg=f"PM: {c3.prefix_map}") + self.assertNotIn("go", c3.bimap) + self.assertIn("GO", c3.bimap) + def test_combine_ci(self): """Test combining case insensitive.""" c1 = Converter.from_priority_prefix_map( diff --git a/tests/test_mapping_service.py b/tests/test_mapping_service.py index da58772..d5ca72c 100644 --- a/tests/test_mapping_service.py +++ b/tests/test_mapping_service.py @@ -24,6 +24,7 @@ handle_header, sparql_service_available, ) +from tests.constants import SLOW VALID_CONTENT_TYPES = { 
*CONTENT_TYPE_TO_HANDLER, @@ -338,6 +339,7 @@ def test_post_service_query(self): class TestUtils(unittest.TestCase): """Test utilities.""" + @SLOW def test_availability(self): """Test sparql service availability check.""" self.assertTrue(