diff --git a/python/acl_anthology/anthology.py b/python/acl_anthology/anthology.py index 2f3135e42c..a38e7cf044 100644 --- a/python/acl_anthology/anthology.py +++ b/python/acl_anthology/anthology.py @@ -43,6 +43,7 @@ from .sigs import SIGIndex from .venues import VenueIndex +CacheDict: TypeAlias = dict[str, str | tuple[int, int]] NameSpecificationOrIter: TypeAlias = NameSpecification | Iterator[NameSpecification] PersonOrList: TypeAlias = Person | list[Person] @@ -95,6 +96,26 @@ def _check_schema_compatibility(self) -> None: if datadir_schema != expected_schema: warnings.warn(SchemaMismatchWarning()) + def _compute_cache_dict(self, depends_on: list[str]) -> CacheDict: + """Compute a dictionary of file stats for caching purposes. + + Arguments: + depends_on: A list of files or glob patterns in the Anthology's data directory that the cache depends on. + + Returns: + A dictionary containing {'datadir': self.datadir} plus an entry with stats for every file that matches the supplied glob patterns. + """ + cache_dict: CacheDict = {"datadir": str(self.datadir.resolve())} + for pattern in depends_on: + for path in self.datadir.glob(pattern): + if path.is_file(): + stat = path.stat() + cache_dict[str(path.relative_to(self.datadir))] = ( + stat.st_size, + int(stat.st_mtime), + ) + return cache_dict + @classmethod def from_repo( cls, diff --git a/python/acl_anthology/collections/collection.py b/python/acl_anthology/collections/collection.py index 29615e78f7..5937f24f81 100644 --- a/python/acl_anthology/collections/collection.py +++ b/python/acl_anthology/collections/collection.py @@ -249,7 +249,6 @@ def load(self) -> None: if self.is_data_loaded: return - log.debug(f"Parsing XML data file: {self.path}") current_volume = cast(Volume, None) # noqa: F841 for _, element in etree.iterparse( self.path, diff --git a/python/acl_anthology/config.py b/python/acl_anthology/config.py index 545acbaa11..cb03ca5421 100644 --- a/python/acl_anthology/config.py +++ b/python/acl_anthology/config.py @@ -16,9 +16,14 @@ from attrs import define from omegaconf import OmegaConf +from pathlib import Path from platformdirs import PlatformDirs +dirs = PlatformDirs("acl-anthology") +"""A [PlatformDirs instance](https://platformdirs.readthedocs.io/en/latest/api.html#platformdirs) that returns platform-specific directories for storing data.""" + + @define class DefaultConfig: url_prefix: str = "${oc.env:ANTHOLOGY_PREFIX,https://aclanthology.org}" @@ -51,9 +56,12 @@ class DefaultConfig: disable_gc: bool = True """If True, disables garbage collection while parsing XML files and building indices. This typically results in a considerable speed-up, but if it happens to cause problems, it can be disabled here.""" + cache_path: Path = dirs.user_cache_path + """Path where cache files should be saved/loaded.""" + + disable_caching: bool = False + """If True, disables both saving & loading of cache files.""" + config = OmegaConf.structured(DefaultConfig) """A [structured configuration instance](https://omegaconf.readthedocs.io/en/latest/structured_config.html) that is used by all `acl_anthology` classes.""" - -dirs = PlatformDirs("acl-anthology") -"""A [PlatformDirs instance](https://platformdirs.readthedocs.io/en/latest/api.html#platformdirs) that returns platform-specific directories for storing data.""" diff --git a/python/acl_anthology/people/index.py b/python/acl_anthology/people/index.py index 980909d464..310fbef21c 100644 --- a/python/acl_anthology/people/index.py +++ b/python/acl_anthology/people/index.py @@ -14,10 +14,12 @@ from __future__ import annotations +import attrs from attrs import define, field from collections.abc import Iterable from collections import Counter, defaultdict import itertools as it +import msgpack from pathlib import Path from rich.progress import track from scipy.cluster.hierarchy import DisjointSet # type: ignore @@ -30,6 +32,7 @@ except ImportError: # pragma: no cover from yaml import Loader, Dumper # type: ignore +from ..config import config from ..containers import SlottedDict from ..exceptions import ( AnthologyException, @@ -44,7 +47,7 @@ if TYPE_CHECKING: from _typeshed import StrPath - from ..anthology import Anthology + from ..anthology import Anthology, CacheDict from ..collections import Paper, Volume log = get_logger() @@ -93,6 +96,10 @@ class PersonIndex(SlottedDict[Person]): def _path(self) -> Path: return self.parent.datadir / Path(PEOPLE_INDEX_FILE) + @property + def _cache_file(self) -> Path: + return cast(Path, config.cache_path) / "PersonIndex.cache" + @property def by_orcid(self) -> dict[str, str]: if not self.is_data_loaded: @@ -208,10 +215,10 @@ def find_coauthors_counter( def load(self) -> None: """Loads or builds the index.""" - # This function exists so we can later add the option to read the index - # from a cache if it doesn't need re-building. if self.is_data_loaded: return + if not config.disable_caching and self._load_cache(): + return self.build(show_progress=self.verbose) def reset(self) -> None: @@ -273,6 +280,8 @@ def build(self, show_progress: bool = False) -> None: "An exception was raised while building PersonIndex; check the logger for details." ) # pragma: no cover self.is_data_loaded = True + if not config.disable_caching: + self._save_cache() def _load_people_index(self) -> None: """Load and parse the `people.yaml` file. @@ -621,3 +630,60 @@ def save(self, path: Optional[StrPath] = None) -> None: with open(path, "w", encoding="utf-8") as f: yaml.dump(data, f, allow_unicode=True, Dumper=Dumper) + + def _compute_cache_dict(self) -> CacheDict: + """Compute the cache dictionary for this index. + + If the return value is identical between a saved cache file and this instance, the data can be loaded from the cache. + """ + return self.parent._compute_cache_dict(depends_on=["xml/*", "yaml/people.yaml"]) + + def _save_cache(self) -> None: + """Save the entire PersonIndex to a cache file.""" + config.cache_path.mkdir(parents=True, exist_ok=True) + + with open(self._cache_file, "wb") as f: + # The first saved message is the cache key + msgpack.pack(self._compute_cache_dict(), f) + # We serialize each Person in the index as a single message + for person in self.values(): + msgpack.pack( + attrs.asdict( + person, + filter=lambda attr, value: value and attr.name != "parent", + value_serializer=lambda _, __, value: ( + value if not isinstance(value, NameLink) else value.value + ), + ), + f, + ) + + def _load_cache(self) -> bool: + """Load the entire PersonIndex from a cache file, if possible. + + Checks if the cache file exists and only loads it if its key is compatible with this Anthology instance (i.e. no files that this cache depends on appear to have changed). + + Returns: + True if the PersonIndex could be loaded from a cache file. + """ + if not self._cache_file.exists(): + return False + + with open(self._cache_file, "rb") as f: + unpacker = msgpack.Unpacker(f, use_list=False) + cache_key = next(unpacker) + if cache_key != self._compute_cache_dict(): + # Cache invalid + return False + + # Load from cache + self.reset() + print(f"Loading PersonIndex from cache file {self._cache_file}") + for data in unpacker: + data["names"] = ( + (Name.from_dict(x[0]), NameLink(x[1])) for x in data.pop("_names") + ) + self.add_person(Person(parent=self.parent, **data)) + + self.is_data_loaded = True + return True diff --git a/python/acl_anthology/people/name.py b/python/acl_anthology/people/name.py index b82acedc74..28fd88922f 100644 --- a/python/acl_anthology/people/name.py +++ b/python/acl_anthology/people/name.py @@ -132,7 +132,7 @@ def slugify(self) -> str: def from_dict(cls, name: dict[str, str]) -> Name: """ Parameters: - name: A dictionary with "first" and "last" keys. + name: A dictionary with "first", "last", and "script" keys. Only "last" is required. Returns: A corresponding Name object. @@ -140,6 +140,7 @@ def from_dict(cls, name: dict[str, str]) -> Name: return cls( name.get("first"), name["last"], + script=name.get("script"), ) @classmethod diff --git a/python/acl_anthology/people/person.py b/python/acl_anthology/people/person.py index 0b01bb3919..4d439cf1e0 100644 --- a/python/acl_anthology/people/person.py +++ b/python/acl_anthology/people/person.py @@ -96,7 +96,9 @@ class Person: factory=list, converter=_name_list_converter ) item_ids: list[AnthologyIDTuple] = field( - factory=list, repr=lambda x: f"" + factory=list, + converter=list, + repr=lambda x: f"", ) orcid: Optional[str] = field( default=None, @@ -104,7 +106,7 @@ class Person: ) # validator defined below comment: Optional[str] = field(default=None) degree: Optional[str] = field(default=None) - similar_ids: list[str] = field(factory=list) + similar_ids: list[str] = field(factory=list, converter=list) disable_name_matching: Optional[bool] = field(default=False, converter=bool) is_explicit: Optional[bool] = field(default=False, converter=bool) diff --git a/python/poetry.lock b/python/poetry.lock index 63a9848b18..64de70c9fc 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "aiofile" @@ -1285,6 +1285,91 @@ mkdocs-autorefs = ">=1.2" mkdocstrings = ">=0.28" typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} +[[package]] +name = "msgpack" +version = "1.1.1" +description = "MessagePack serializer" +optional = false +python-versions = ">=3.8" +groups = ["main", "dev"] +files = [ + {file = "msgpack-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:353b6fc0c36fde68b661a12949d7d49f8f51ff5fa019c1e47c87c4ff34b080ed"}, + {file = "msgpack-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:79c408fcf76a958491b4e3b103d1c417044544b68e96d06432a189b43d1215c8"}, + {file = "msgpack-1.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78426096939c2c7482bf31ef15ca219a9e24460289c00dd0b94411040bb73ad2"}, + {file = "msgpack-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b17ba27727a36cb73aabacaa44b13090feb88a01d012c0f4be70c00f75048b4"}, + {file = "msgpack-1.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a17ac1ea6ec3c7687d70201cfda3b1e8061466f28f686c24f627cae4ea8efd0"}, + {file = "msgpack-1.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:88d1e966c9235c1d4e2afac21ca83933ba59537e2e2727a999bf3f515ca2af26"}, + {file = "msgpack-1.1.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f6d58656842e1b2ddbe07f43f56b10a60f2ba5826164910968f5933e5178af75"}, + {file = "msgpack-1.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:96decdfc4adcbc087f5ea7ebdcfd3dee9a13358cae6e81d54be962efc38f6338"}, + {file = "msgpack-1.1.1-cp310-cp310-win32.whl", hash = "sha256:6640fd979ca9a212e4bcdf6eb74051ade2c690b862b679bfcb60ae46e6dc4bfd"}, + {file = "msgpack-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:8b65b53204fe1bd037c40c4148d00ef918eb2108d24c9aaa20bc31f9810ce0a8"}, + {file = "msgpack-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:71ef05c1726884e44f8b1d1773604ab5d4d17729d8491403a705e649116c9558"}, + {file = "msgpack-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:36043272c6aede309d29d56851f8841ba907a1a3d04435e43e8a19928e243c1d"}, + {file = "msgpack-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a32747b1b39c3ac27d0670122b57e6e57f28eefb725e0b625618d1b59bf9d1e0"}, + {file = "msgpack-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a8b10fdb84a43e50d38057b06901ec9da52baac6983d3f709d8507f3889d43f"}, + {file = "msgpack-1.1.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba0c325c3f485dc54ec298d8b024e134acf07c10d494ffa24373bea729acf704"}, + {file = "msgpack-1.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:88daaf7d146e48ec71212ce21109b66e06a98e5e44dca47d853cbfe171d6c8d2"}, + {file = "msgpack-1.1.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8b55ea20dc59b181d3f47103f113e6f28a5e1c89fd5b67b9140edb442ab67f2"}, + {file = "msgpack-1.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4a28e8072ae9779f20427af07f53bbb8b4aa81151054e882aee333b158da8752"}, + {file = "msgpack-1.1.1-cp311-cp311-win32.whl", hash = "sha256:7da8831f9a0fdb526621ba09a281fadc58ea12701bc709e7b8cbc362feabc295"}, + {file = "msgpack-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:5fd1b58e1431008a57247d6e7cc4faa41c3607e8e7d4aaf81f7c29ea013cb458"}, + {file = "msgpack-1.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ae497b11f4c21558d95de9f64fff7053544f4d1a17731c866143ed6bb4591238"}, + {file = "msgpack-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:33be9ab121df9b6b461ff91baac6f2731f83d9b27ed948c5b9d1978ae28bf157"}, + {file = "msgpack-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f64ae8fe7ffba251fecb8408540c34ee9df1c26674c50c4544d72dbf792e5ce"}, + {file = "msgpack-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a494554874691720ba5891c9b0b39474ba43ffb1aaf32a5dac874effb1619e1a"}, + {file = "msgpack-1.1.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb643284ab0ed26f6957d969fe0dd8bb17beb567beb8998140b5e38a90974f6c"}, + {file = "msgpack-1.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d275a9e3c81b1093c060c3837e580c37f47c51eca031f7b5fb76f7b8470f5f9b"}, + {file = "msgpack-1.1.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4fd6b577e4541676e0cc9ddc1709d25014d3ad9a66caa19962c4f5de30fc09ef"}, + {file = "msgpack-1.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb29aaa613c0a1c40d1af111abf025f1732cab333f96f285d6a93b934738a68a"}, + {file = "msgpack-1.1.1-cp312-cp312-win32.whl", hash = "sha256:870b9a626280c86cff9c576ec0d9cbcc54a1e5ebda9cd26dab12baf41fee218c"}, + {file = "msgpack-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:5692095123007180dca3e788bb4c399cc26626da51629a31d40207cb262e67f4"}, + {file = "msgpack-1.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3765afa6bd4832fc11c3749be4ba4b69a0e8d7b728f78e68120a157a4c5d41f0"}, + {file = "msgpack-1.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8ddb2bcfd1a8b9e431c8d6f4f7db0773084e107730ecf3472f1dfe9ad583f3d9"}, + {file = "msgpack-1.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:196a736f0526a03653d829d7d4c5500a97eea3648aebfd4b6743875f28aa2af8"}, + {file = "msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d592d06e3cc2f537ceeeb23d38799c6ad83255289bb84c2e5792e5a8dea268a"}, + {file = "msgpack-1.1.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4df2311b0ce24f06ba253fda361f938dfecd7b961576f9be3f3fbd60e87130ac"}, + {file = "msgpack-1.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e4141c5a32b5e37905b5940aacbc59739f036930367d7acce7a64e4dec1f5e0b"}, + {file = "msgpack-1.1.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b1ce7f41670c5a69e1389420436f41385b1aa2504c3b0c30620764b15dded2e7"}, + {file = "msgpack-1.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4147151acabb9caed4e474c3344181e91ff7a388b888f1e19ea04f7e73dc7ad5"}, + {file = "msgpack-1.1.1-cp313-cp313-win32.whl", hash = "sha256:500e85823a27d6d9bba1d057c871b4210c1dd6fb01fbb764e37e4e8847376323"}, + {file = "msgpack-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:6d489fba546295983abd142812bda76b57e33d0b9f5d5b71c09a583285506f69"}, + {file = "msgpack-1.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bba1be28247e68994355e028dcd668316db30c1f758d3241a7b903ac78dcd285"}, + {file = "msgpack-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8f93dcddb243159c9e4109c9750ba5b335ab8d48d9522c5308cd05d7e3ce600"}, + {file = "msgpack-1.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fbbc0b906a24038c9958a1ba7ae0918ad35b06cb449d398b76a7d08470b0ed9"}, + {file = "msgpack-1.1.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:61e35a55a546a1690d9d09effaa436c25ae6130573b6ee9829c37ef0f18d5e78"}, + {file = "msgpack-1.1.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:1abfc6e949b352dadf4bce0eb78023212ec5ac42f6abfd469ce91d783c149c2a"}, + {file = "msgpack-1.1.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:996f2609ddf0142daba4cefd767d6db26958aac8439ee41db9cc0db9f4c4c3a6"}, + {file = "msgpack-1.1.1-cp38-cp38-win32.whl", hash = "sha256:4d3237b224b930d58e9d83c81c0dba7aacc20fcc2f89c1e5423aa0529a4cd142"}, + {file = "msgpack-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:da8f41e602574ece93dbbda1fab24650d6bf2a24089f9e9dbb4f5730ec1e58ad"}, + {file = "msgpack-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f5be6b6bc52fad84d010cb45433720327ce886009d862f46b26d4d154001994b"}, + {file = "msgpack-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3a89cd8c087ea67e64844287ea52888239cbd2940884eafd2dcd25754fb72232"}, + {file = "msgpack-1.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d75f3807a9900a7d575d8d6674a3a47e9f227e8716256f35bc6f03fc597ffbf"}, + {file = "msgpack-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d182dac0221eb8faef2e6f44701812b467c02674a322c739355c39e94730cdbf"}, + {file = "msgpack-1.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b13fe0fb4aac1aa5320cd693b297fe6fdef0e7bea5518cbc2dd5299f873ae90"}, + {file = "msgpack-1.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:435807eeb1bc791ceb3247d13c79868deb22184e1fc4224808750f0d7d1affc1"}, + {file = "msgpack-1.1.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4835d17af722609a45e16037bb1d4d78b7bdf19d6c0128116d178956618c4e88"}, + {file = "msgpack-1.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a8ef6e342c137888ebbfb233e02b8fbd689bb5b5fcc59b34711ac47ebd504478"}, + {file = "msgpack-1.1.1-cp39-cp39-win32.whl", hash = "sha256:61abccf9de335d9efd149e2fff97ed5974f2481b3353772e8e2dd3402ba2bd57"}, + {file = "msgpack-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:40eae974c873b2992fd36424a5d9407f93e97656d999f43fca9d29f820899084"}, + {file = "msgpack-1.1.1.tar.gz", hash = "sha256:77b79ce34a2bdab2594f490c8e80dd62a02d650b91a75159a63ec413b8d104cd"}, +] + +[[package]] +name = "msgpack-types" +version = "0.5.0" +description = "Type stubs for msgpack" +optional = false +python-versions = "<4.0,>=3.8" +groups = ["dev"] +files = [ + {file = "msgpack_types-0.5.0-py3-none-any.whl", hash = "sha256:8b633ed75e495a555fa0615843de559a74b1d176828d59bb393d266e51f6bda7"}, + {file = "msgpack_types-0.5.0.tar.gz", hash = "sha256:aebd1b8da23f8f9966d66ebb1a43bd261b95751c6a267bd21a124d2ccac84201"}, +] + +[package.dependencies] +msgpack = ">=1.1.0,<1.2.0" +typing-extensions = ">=4.6.0" + [[package]] name = "mypy" version = "1.15.0" @@ -2205,7 +2290,6 @@ optional = false python-versions = "*" groups = ["main"] files = [ - {file = "TexSoup-0.3.1-py3-none-any.whl", hash = "sha256:ae8f08d17f86a905b7c2ce01c9f2da613fbca0bcea78c71d727719e896045bed"}, {file = "TexSoup-0.3.1.tar.gz", hash = "sha256:3f6b2ad0abe3688a6656f544c1ba04d0eb25f423f8c377b7369f9ce061ddb70b"}, ] @@ -2228,7 +2312,7 @@ description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_full_version <= \"3.11.0a6\"" +markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -2456,4 +2540,4 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.1" python-versions = ">=3.10, !=3.11.0, <3.14" -content-hash = "fd532227f8093a084d9e122e006454d460046ae84dafd7919bc8cdf57542c3a2" +content-hash = "c93bb9c05154cb2144a0e7b9df21da3092ff83873d7eadcc1bea29971c61fa74" diff --git a/python/pyproject.toml b/python/pyproject.toml index 965d39c201..6322510aca 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -89,6 +89,7 @@ dependencies = [ "pylatexenc (~=2.10)", "stop-words (>=2018.7.23)", "typing-extensions (>=4.6.0) ; python_version < '3.11'", + "msgpack (>=1.1.1,<2.0.0)", ] [tool.poetry.group.dev.dependencies] @@ -107,6 +108,7 @@ types-lxml = "^2023.3.28" pymdown-extensions = "^10.13" pytest-datadir = "^1.6.1" richbench = {git = "https://github.com/tonybaloney/rich-bench.git"} +msgpack-types = "^0.5.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/python/tests/anthology_test.py b/python/tests/anthology_test.py index 094019a652..43e8b53e25 100644 --- a/python/tests/anthology_test.py +++ b/python/tests/anthology_test.py @@ -176,3 +176,11 @@ def test_load_all(anthology): assert anthology.people.is_data_loaded assert anthology.sigs.is_data_loaded assert anthology.venues.is_data_loaded + + +def test_compute_cache_dict(anthology): + datadir = str(anthology.datadir.resolve()) + assert anthology._compute_cache_dict([]) == {"datadir": datadir} + cache_dict = anthology._compute_cache_dict(["yaml/*"]) + assert len(list(cache_dict.keys())) == 2 + assert "yaml/people.yaml" in cache_dict diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 2b8dd7d261..733df09462 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -20,7 +20,10 @@ pytest.register_assert_rewrite("acl_anthology.utils.xml") -from acl_anthology import Anthology # noqa: E402 +from acl_anthology import Anthology, config # noqa: E402 + +# Disable caching by default when testing +config.disable_caching = True class AnthologyStub: diff --git a/python/tests/people/personindex_test.py b/python/tests/people/personindex_test.py index 87b9731f16..0970d1adb8 100644 --- a/python/tests/people/personindex_test.py +++ b/python/tests/people/personindex_test.py @@ -13,6 +13,10 @@ # limitations under the License. import pytest +from copy import deepcopy +from unittest.mock import patch + +from acl_anthology.config import config from acl_anthology.exceptions import ( AnthologyInvalidIDError, NameSpecResolutionError, @@ -568,7 +572,7 @@ def test_ingest_namespec_returns_namespec(index): ############################################################################## -### Tests for saving people.yaml +### Tests for saving people.yaml & caching ############################################################################## @@ -664,3 +668,30 @@ def test_add_person_to_people_yaml_via_create_person(index, tmp_path): orcid: 0000-0002-3600-1510""" in out ) + + +def test_peopleindex_caching(index, tmp_path, monkeypatch): + monkeypatch.setattr(config, "cache_path", tmp_path / "cache") + monkeypatch.setattr(config, "disable_caching", False) + assert not index._cache_file.is_file() + + with patch.object(PersonIndex, "build", wraps=index.build) as mock: + index.load() + # Should have called build() + mock.assert_called_once() + + # Should have created the cache file + assert index._cache_file.is_file() + + # Store _by_name mapping and reset index + old_index_by_name = deepcopy(index._by_name) + index.reset() + assert not index._by_name + + with patch.object(PersonIndex, "build", wraps=index.build) as mock: + index.load() + # Should NOT have called build(), but used the cache file + mock.assert_not_called() + + # Should have the same _by_name mapping as before + assert old_index_by_name == index._by_name