From 79ecb0d480fd8834a17ecf7926ba6e7162eb22ed Mon Sep 17 00:00:00 2001
From: Tobias Markus
Date: Mon, 3 Jul 2023 23:03:48 +0200
Subject: [PATCH] feat: Cache build environments by content hash

Write an archive consisting of the build environment provided by pip,
and then extract it at a path containing the SHA256 hash of its
contents. This prevents unnecessary rebuilds triggered by CMake
thinking that packages provided by pip in the build environment have
been changed, when it's only the temporary path that changed.

Because the path contains the hash of its contents, this is perfectly
safe since a changed build environment will change the hash.

Signed-off-by: Tobias Markus
---
 src/scikit_build_core/builder/builder.py      | 148 +++++++++++++++++-
 .../settings/skbuild_model.py                 |   3 +
 2 files changed, 149 insertions(+), 2 deletions(-)

diff --git a/src/scikit_build_core/builder/builder.py b/src/scikit_build_core/builder/builder.py
index 73f4fc63..2c8a582c 100644
--- a/src/scikit_build_core/builder/builder.py
+++ b/src/scikit_build_core/builder/builder.py
@@ -1,11 +1,16 @@
 from __future__ import annotations
 
 import dataclasses
+import hashlib
+import os
 import re
 import sys
 import sysconfig
+import tarfile
+import tempfile
 from collections.abc import Iterable, Mapping, Sequence
 from pathlib import Path
+from typing import BinaryIO
 
 from packaging.version import Version
 
@@ -22,6 +27,7 @@
     get_python_library,
     get_soabi,
 )
+from .wheel_tag import WheelTag
 
 __all__: list[str] = ["Builder", "get_archs", "archs_to_tags"]
 
@@ -64,6 +70,105 @@ def archs_to_tags(archs: list[str]) -> list[str]:
     return archs
 
 
+@dataclasses.dataclass(init=False)
+class BuildEnvArchive:
+    """
+    Content-addressed tar archive of a pip build environment.
+
+    The directory tree is written to an anonymous temporary file and the
+    SHA256 digest of that file is stored in ``hash``. Call ``close()``, or
+    use the instance as a context manager, to release the temporary file.
+    """
+
+    _archive_file: BinaryIO
+    hash: hashlib._Hash
+
+    def __init__(self, env_dir: Path) -> None:
+        self._archive_file = tempfile.TemporaryFile(prefix="build-env-archive-", suffix=".tar")  # type: ignore[assignment]
+
+        # Rewrite environment path to be relative to its anchor
+        # Example:
+        #   /tmp/pip-build-env-pklovjqz/overlay/lib/python3.11/site-packages
+        # is rewritten into
+        #   tmp/pip-build-env-pklovjqz/overlay/lib/python3.11/site-packages
+        # The anchor (drive + root) is used rather than the root alone because
+        # tarfile strips both from member names, e.g. for C:\... on Windows.
+        prefix = Path(env_dir)
+        prefix = prefix.relative_to(prefix.anchor)
+
+        def ext_filter(ti: tarfile.TarInfo) -> tarfile.TarInfo | None:
+            pname = Path(ti.name)
+
+            # TarInfo.type is a bytes value: compare by equality, not identity
+            if ti.type == tarfile.LNKTYPE:
+                logger.warning(
+                    "Unexpected link inside build environment archive (path={})", pname
+                )
+            elif ti.type not in (tarfile.REGTYPE, tarfile.AREGTYPE, tarfile.DIRTYPE):
+                logger.warning(
+                    "Unexpected file type inside build environment archive (path={})",
+                    pname,
+                )
+
+            # Rewrite name to be relative to site-packages inside the build environment
+            # Tar member names always use forward slashes
+            ti.name = pname.relative_to(prefix).as_posix()
+
+            # FIXME: __pycache__ files don't have consistent hashes - why?
+            if "__pycache__" in ti.name:
+                return None
+
+            # Reset mtime to zero
+            # This is safe (regarding build tool out-of-date detection)
+            # since the resulting archive is content-addressed through its hash
+            ti.mtime = 0
+
+            return ti
+
+        with tarfile.open(
+            fileobj=self._archive_file, mode="x", dereference=True
+        ) as dir_tar:
+            dir_tar.add(env_dir, filter=ext_filter)
+
+        self._archive_file.flush()
+
+        archive_len = self._archive_file.tell()
+        self._archive_file.seek(0)
+
+        # hashlib.file_digest() requires Python 3.11+, so hash in chunks instead
+        self.hash = hashlib.sha256()
+        for chunk in iter(lambda: self._archive_file.read(1 << 20), b""):
+            self.hash.update(chunk)
+        self._archive_file.seek(0)
+
+        logger.debug(
+            "created build env archive len={} sha256={}",
+            archive_len,
+            self.hash.hexdigest(),
+        )
+
+    def extract(self, destination: Path) -> None:
+        """Unpack the archived build environment into ``destination``."""
+        self._archive_file.seek(0)
+        with tarfile.open(fileobj=self._archive_file, mode="r") as dir_tar:
+            dir_tar.extractall(path=destination)
+
+        # Reset atime/mtime of the destination directory
+        # Otherwise CMake would consider the directory out of date
+        # FIXME: Apparently not necessary?
+        # os.utime(destination, times=(0,0))
+
+    def close(self) -> None:
+        """Close (and thereby delete) the temporary archive file."""
+        self._archive_file.close()
+
+    def __enter__(self) -> BuildEnvArchive:
+        return self
+
+    def __exit__(self, *exc_info: object) -> None:
+        self.close()
+
+
 @dataclasses.dataclass
 class Builder:
     settings: ScikitBuildSettings
@@ -79,6 +184,33 @@ def get_cmake_args(self) -> list[str]:
 
         return [*self.settings.cmake.args, *env_cmake_args]
 
+    # FIXME: Proper setting for build env dir
+    def _build_dir(self) -> Path:
+        """Return the resolved build directory with its placeholders expanded."""
+        tags = WheelTag.compute_best(
+            archs_to_tags(get_archs(os.environ)),
+            self.settings.wheel.py_api,
+            expand_macos=self.settings.wheel.expand_macos_universal_tags,
+        )
+
+        assert self.settings.build_dir is not None
+        # NOTE: an unset (empty) build_dir resolves to the current directory here
+        build_dir = Path(
+            self.settings.build_dir.format(
+                cache_tag=sys.implementation.cache_tag,
+                wheel_tag=str(tags),
+            )
+        ).resolve()
+        logger.info("Build directory: {}", build_dir)
+
+        return build_dir
+
+    def _build_env_cache_dir(self, hash: hashlib._Hash) -> Path:
+        """Return the cache directory for the build environment with this hash."""
+        base_dir = self._build_dir()
+        base_dir = base_dir.with_name(base_dir.name + "-build-env-cache")
+        return base_dir / hash.hexdigest()
+
     def configure(
         self,
         *,
@@ -103,9 +235,21 @@ def configure(
         site_packages = Path(sysconfig.get_path("purelib"))
         self.config.prefix_dirs.append(site_packages)
         logger.debug("SITE_PACKAGES: {}", site_packages)
-        if site_packages != DIR.parent.parent:
+
+        if self.settings.cache_build_env:
+            if not self.settings.experimental:
+                msg = "Experimental features must be enabled to use build environment caching"
+                raise AssertionError(msg)
+
+            # Close the temporary archive as soon as it has been unpacked
+            with BuildEnvArchive(DIR.parent.parent) as archive:
+                targettree = self._build_env_cache_dir(archive.hash)
+                archive.extract(targettree)
+            self.config.prefix_dirs.append(targettree)
+
+        elif site_packages != DIR.parent.parent:
             self.config.prefix_dirs.append(DIR.parent.parent)
-            logger.debug("Extra SITE_PACKAGES: {}", site_packages)
+            logger.debug("Extra SITE_PACKAGES: {}", DIR.parent.parent)
 
         # Add the FindPython backport if needed
         fp_backport = self.settings.backport.find_python
diff --git a/src/scikit_build_core/settings/skbuild_model.py b/src/scikit_build_core/settings/skbuild_model.py
index ea87eb2a..232a28b3 100644
--- a/src/scikit_build_core/settings/skbuild_model.py
+++ b/src/scikit_build_core/settings/skbuild_model.py
@@ -148,3 +148,6 @@ class ScikitBuildSettings:
 
     #: The build directory. Defaults to a temporary directory, but can be set.
     build_dir: str = ""
+
+    #: Whether to cache build environments. Experimental feature.
+    cache_build_env: bool = False