diff --git a/micropip/_compat_in_pyodide.py b/micropip/_compat_in_pyodide.py
index 02a5068..976443e 100644
--- a/micropip/_compat_in_pyodide.py
+++ b/micropip/_compat_in_pyodide.py
@@ -1,5 +1,3 @@
-from io import BytesIO
-from typing import IO
 from urllib.parse import urlparse
 
 from pyodide._package_loader import get_dynlibs
@@ -20,7 +18,7 @@
     # Otherwise, this is pytest test collection so let it go.
 
 
-async def fetch_bytes(url: str, kwargs: dict[str, str]) -> IO[bytes]:
+async def fetch_bytes(url: str, kwargs: dict[str, str]) -> bytes:
     parsed_url = urlparse(url)
     if parsed_url.scheme == "emfs":
-        return open(parsed_url.path, "rb")
+        return open(parsed_url.path, "rb").read()
@@ -28,7 +26,7 @@ async def fetch_bytes(url: str, kwargs: dict[str, str]) -> IO[bytes]:
         result_bytes = (await loadBinaryFile(parsed_url.path)).to_bytes()
     else:
         result_bytes = await (await pyfetch(url, **kwargs)).bytes()
-    return BytesIO(result_bytes)
+    return result_bytes
 
 
 async def fetch_string_and_headers(
diff --git a/micropip/_compat_not_in_pyodide.py b/micropip/_compat_not_in_pyodide.py
index c229aa9..18effe0 100644
--- a/micropip/_compat_not_in_pyodide.py
+++ b/micropip/_compat_not_in_pyodide.py
@@ -1,5 +1,4 @@
 import re
-from io import BytesIO
 from pathlib import Path
 from typing import IO, Any
 
@@ -20,9 +19,9 @@ def _fetch(url: str, kwargs: dict[str, Any]) -> addinfourl:
     return urlopen(Request(url, **kwargs))
 
 
-async def fetch_bytes(url: str, kwargs: dict[str, Any]) -> IO[bytes]:
+async def fetch_bytes(url: str, kwargs: dict[str, Any]) -> bytes:
     response = _fetch(url, kwargs=kwargs)
-    return BytesIO(response.read())
+    return response.read()
 
 
 async def fetch_string_and_headers(
diff --git a/micropip/package_index.py b/micropip/package_index.py
index 2772b7b..19c6ab0 100644
--- a/micropip/package_index.py
+++ b/micropip/package_index.py
@@ -150,6 +150,9 @@ def _compatible_wheels(
         hashes = file["digests"] if "digests" in file else file["hashes"]
         sha256 = hashes.get("sha256")
 
+        # Check whether the index exposes a separate metadata file (PEP 658)
+        data_dist_info_metadata = file.get("data-dist-info-metadata")
+
         # Size of the file in bytes, if available (PEP 700)
         # This key is not available in the Simple API HTML response, so this field may be None
         size = file.get("size")
@@ -161,6 +164,7 @@
             version=version,
             sha256=sha256,
             size=size,
+            data_dist_info_metadata=data_dist_info_metadata,
         )
 
     releases_compatible = {
diff --git a/micropip/transaction.py b/micropip/transaction.py
index bbc91e2..641075c 100644
--- a/micropip/transaction.py
+++ b/micropip/transaction.py
@@ -231,9 +231,26 @@ async def add_wheel(
         logger.info(f"Collecting {wheel.name}{specifier}")
         logger.info(f"  Downloading {wheel.url.split('/')[-1]}")
 
-        await wheel.download(self.fetch_kwargs)
+        wheel_download_task = asyncio.create_task(wheel.download(self.fetch_kwargs))
         if self.deps:
-            await self.gather_requirements(wheel.requires(extras))
+            # Case 1) If the metadata file is available,
+            # we can gather requirements without waiting for the wheel to be downloaded.
+            if wheel.pep658_metadata_available():
+                try:
+                    await wheel.download_pep658_metadata(self.fetch_kwargs)
+                except OSError:
+                    # If something goes wrong while downloading the metadata,
+                    # we have to wait for the wheel to be downloaded.
+                    await wheel_download_task
+                await self.gather_requirements(wheel.requires(extras))
+
+            # Case 2) If the metadata file is not available,
+            # we have to wait for the wheel to be downloaded.
+            else:
+                await wheel_download_task
+                await self.gather_requirements(wheel.requires(extras))
 
+        # The wheel contents must be present before the wheel is handed to the installer.
+        await wheel_download_task
         self.wheels.append(wheel)
 
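A minimal standalone sketch (not part of the patch) of the overlap that `add_wheel` above relies on: the wheel download is started as a background task while the much smaller PEP 658 metadata is fetched first. The `fetch` helper, timings, and URL below are invented; only the asyncio pattern mirrors the change.

```python
import asyncio


async def fetch(url: str) -> bytes:
    # Stand-in for a network request; the PEP 658 metadata file is much
    # smaller than the wheel, so it resolves first.
    await asyncio.sleep(0.1 if url.endswith(".metadata") else 1.0)
    return b"payload of " + url.encode()


async def add_wheel(url: str, metadata_available: bool) -> None:
    # Start the (large) wheel download in the background immediately.
    wheel_task = asyncio.create_task(fetch(url))

    if metadata_available:
        try:
            # The small metadata file is enough to start resolving dependencies
            # before the wheel itself has finished downloading.
            print("metadata:", await fetch(url + ".metadata"))
        except OSError:
            # Fall back to reading metadata out of the downloaded wheel.
            await wheel_task
    else:
        await wheel_task

    # The wheel bytes must be present before installation.
    await wheel_task


asyncio.run(add_wheel("https://example.invalid/pkg-1.0-py3-none-any.whl", True))
```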
diff --git a/micropip/wheelinfo.py b/micropip/wheelinfo.py
index e831fdd..fadcd84 100644
--- a/micropip/wheelinfo.py
+++ b/micropip/wheelinfo.py
@@ -1,10 +1,11 @@
 import asyncio
 import hashlib
+import io
 import json
 import zipfile
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, Any
+from typing import Any
 from urllib.parse import ParseResult, urlparse
 
 from packaging.requirements import Requirement
@@ -36,10 +37,13 @@ class WheelInfo:
     parsed_url: ParseResult
     sha256: str | None = None
     size: int | None = None  # Size in bytes, if available (PEP 700)
+    data_dist_info_metadata: bool | dict[
+        str, str
+    ] | None = None  # Whether the package index exposes the wheel's metadata (PEP 658)
 
     # Fields below are only available after downloading the wheel, i.e. after calling `download()`.
-    _data: IO[bytes] | None = None  # Wheel file contents.
+    _data: bytes | None = None  # Wheel file contents.
     _metadata: Metadata | None = None  # Wheel metadata.
     _requires: list[Requirement] | None = None  # List of requirements.
 
@@ -77,6 +81,7 @@ def from_package_index(
         version: Version,
         sha256: str | None,
         size: int | None,
+        data_dist_info_metadata: bool | dict[str, str] | None = None,
     ) -> "WheelInfo":
         """Extract available metadata from response received from package index"""
         parsed_url = urlparse(url)
@@ -92,6 +97,7 @@ def from_package_index(
             parsed_url=parsed_url,
             sha256=sha256,
             size=size,
+            data_dist_info_metadata=data_dist_info_metadata,
         )
 
     async def install(self, target: Path) -> None:
@@ -109,7 +115,8 @@ async def install(self, target: Path) -> None:
             raise RuntimeError(
                 "Micropip internal error: attempted to install wheel before downloading it?"
             )
-        self._validate()
+        _validate_sha256_checksum(self._data, self.sha256)
+
         self._extract(target)
         await self._load_libraries(target)
         self._set_installer()
@@ -118,10 +125,44 @@ async def download(self, fetch_kwargs: dict[str, Any]):
         if self._data is not None:
             return
 
-        self._data = await self._fetch_bytes(fetch_kwargs)
-        with zipfile.ZipFile(self._data) as zf:
-            metadata_path = wheel_dist_info_dir(zf, self.name) + "/" + Metadata.PKG_INFO
-            self._metadata = Metadata(zipfile.Path(zf, metadata_path))
+        self._data = await self._fetch_bytes(self.url, fetch_kwargs)
+
+        if self._metadata is None:
+            with zipfile.ZipFile(io.BytesIO(self._data)) as zf:
+                metadata_path = (
+                    wheel_dist_info_dir(zf, self.name) + "/" + Metadata.PKG_INFO
+                )
+                self._metadata = Metadata(zipfile.Path(zf, metadata_path))
+
+    def pep658_metadata_available(self) -> bool:
+        """
+        Check if the wheel's metadata is exposed via PEP 658.
+        """
+        return self.data_dist_info_metadata is not None
+
+    async def download_pep658_metadata(
+        self, fetch_kwargs: dict[str, Any] | None = None
+    ) -> None:
+        """
+        Download the wheel's metadata exposed via PEP 658.
+        """
+        if fetch_kwargs is None:
+            fetch_kwargs = {}
+        if self.data_dist_info_metadata is None:
+            raise RuntimeError(
+                "Micropip internal error: the package index does not expose the wheel's metadata via PEP 658."
+            )
+
+        metadata_url = self.url + ".metadata"
+        data = await self._fetch_bytes(metadata_url, fetch_kwargs)
+
+        match self.data_dist_info_metadata:
+            case {"sha256": checksum}:  # sha256 checksum available
+                _validate_sha256_checksum(data, checksum)
+            case _:  # no checksum available
+                pass
+
+        self._metadata = Metadata(data)
 
     def requires(self, extras: set[str]) -> list[Requirement]:
         """
@@ -136,9 +177,9 @@ def requires(self, extras: set[str]) -> list[Requirement]:
         self._requires = requires
         return requires
 
-    async def _fetch_bytes(self, fetch_kwargs: dict[str, Any]):
+    async def _fetch_bytes(self, url: str, fetch_kwargs: dict[str, Any]):
         try:
-            return await fetch_bytes(self.url, fetch_kwargs)
+            return await fetch_bytes(url, fetch_kwargs)
         except OSError as e:
             if self.parsed_url.hostname in [
                 "files.pythonhosted.org",
@@ -153,20 +194,9 @@ async def _fetch_bytes(self, fetch_kwargs: dict[str, Any]):
                 "Check if the server is sending the correct 'Access-Control-Allow-Origin' header."
             ) from e
 
-    def _validate(self):
-        if self.sha256 is None:
-            # No checksums available, e.g. because installing
-            # from a different location than PyPI.
-            return
-
-        assert self._data
-        sha256_actual = _generate_package_hash(self._data)
-        if sha256_actual != self.sha256:
-            raise ValueError("Contents don't match hash")
-
     def _extract(self, target: Path) -> None:
         assert self._data
-        with zipfile.ZipFile(self._data) as zf:
+        with zipfile.ZipFile(io.BytesIO(self._data)) as zf:
             zf.extractall(target)
             self._dist_info = target / wheel_dist_info_dir(zf, self.name)
 
@@ -198,16 +228,22 @@ async def _load_libraries(self, target: Path) -> None:
         TODO: integrate with pyodide's dynamic library loading mechanism.
         """
         assert self._data
-        dynlibs = get_dynlibs(self._data, ".whl", target)
+        dynlibs = get_dynlibs(io.BytesIO(self._data), ".whl", target)
         await asyncio.gather(*map(lambda dynlib: loadDynlib(dynlib, False), dynlibs))
 
 
-def _generate_package_hash(data: IO[bytes]) -> str:
-    """
-    Generate a SHA256 hash of the package data.
-    """
-    sha256_hash = hashlib.sha256()
-    data.seek(0)
-    while chunk := data.read(4096):
-        sha256_hash.update(chunk)
-    return sha256_hash.hexdigest()
+def _validate_sha256_checksum(data: bytes, sha256_expected: str | None = None) -> None:
+    if sha256_expected is None:
+        # No checksum available, e.g. because we are installing
+        # from a location other than PyPI.
+        return
+
+    actual = _generate_package_hash(data)
+    if actual != sha256_expected:
+        raise RuntimeError(
+            f"Invalid checksum: expected {sha256_expected}, got {actual}"
+        )
+
+
+def _generate_package_hash(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
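A small self-contained sketch (not part of the patch) of the PEP 658 pieces the code above consumes. The index entry and metadata bytes are invented for illustration; the ".metadata" URL suffix and the true-or-hash-dict value of "data-dist-info-metadata" follow PEP 658, and the checksum check mirrors `_validate_sha256_checksum`.

```python
import hashlib

metadata_bytes = b"Metadata-Version: 2.1\nName: example\nVersion: 1.0\n"

# What a Simple JSON API file entry may look like; per PEP 658 the
# "data-dist-info-metadata" key is either `true` or a dict of hashes.
file_entry = {
    "url": "https://example.invalid/example-1.0-py3-none-any.whl",
    "data-dist-info-metadata": {"sha256": hashlib.sha256(metadata_bytes).hexdigest()},
}

# The metadata file lives next to the wheel, at the wheel URL plus ".metadata".
metadata_url = file_entry["url"] + ".metadata"

# Same check as _validate_sha256_checksum above.
expected = file_entry["data-dist-info-metadata"]["sha256"]
actual = hashlib.sha256(metadata_bytes).hexdigest()
if actual != expected:
    raise RuntimeError(f"Invalid checksum: expected {expected}, got {actual}")

print(metadata_url)
```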
diff --git a/tests/conftest.py b/tests/conftest.py
index cca0c4d..0900dc4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -257,7 +257,7 @@ def write_file(filename, contents):
 
     tmp.seek(0)
 
-    return tmp
+    return tmp.read()
 
 
 @pytest.fixture
diff --git a/tests/test_data/wheel/pytest-7.2.2-py3-none-any.whl.metadata.gz b/tests/test_data/wheel/pytest-7.2.2-py3-none-any.whl.metadata.gz
new file mode 100644
index 0000000..2948d2e
Binary files /dev/null and b/tests/test_data/wheel/pytest-7.2.2-py3-none-any.whl.metadata.gz differ
diff --git a/tests/test_install.py b/tests/test_install.py
index 12848d4..adcae86 100644
--- a/tests/test_install.py
+++ b/tests/test_install.py
@@ -370,8 +370,6 @@ async def run_test(selenium, url, name, version):
 
 @pytest.mark.asyncio
 async def test_custom_index_urls(mock_package_index_json_api, monkeypatch):
-    from io import BytesIO
-
     mock_server_fake_package = mock_package_index_json_api(
         pkgs=["fake-pkg-micropip-test"]
     )
@@ -381,7 +379,7 @@ async def test_custom_index_urls(mock_package_index_json_api, monkeypatch):
 
     async def _mock_fetch_bytes(url, *args):
         nonlocal _wheel_url
         _wheel_url = url
-        return BytesIO(b"fake wheel")
+        return b"fake wheel"
 
     from micropip import wheelinfo
diff --git a/tests/test_wheelinfo.py b/tests/test_wheelinfo.py
index a5af701..bcec6c6 100644
--- a/tests/test_wheelinfo.py
+++ b/tests/test_wheelinfo.py
@@ -1,7 +1,6 @@
-from io import BytesIO
-
 import pytest
-from conftest import PYTEST_WHEEL, TEST_WHEEL_DIR
+from conftest import PYTEST_WHEEL, TEST_WHEEL_DIR, _read_gzipped_testfile
+from packaging.utils import parse_wheel_filename
 
 from micropip.wheelinfo import WheelInfo
 
@@ -13,7 +12,7 @@ def dummy_wheel():
 
 @pytest.fixture
 def dummy_wheel_content():
-    yield BytesIO((TEST_WHEEL_DIR / PYTEST_WHEEL).read_bytes())
+    yield (TEST_WHEEL_DIR / PYTEST_WHEEL).read_bytes()
 
 
 @pytest.fixture
@@ -23,6 +22,11 @@ def dummy_wheel_url(httpserver):
         content_type="application/zip",
         headers={"Access-Control-Allow-Origin": "*"},
     )
+    httpserver.expect_request(f"/{PYTEST_WHEEL}.metadata").respond_with_data(
+        _read_gzipped_testfile(TEST_WHEEL_DIR / f"{PYTEST_WHEEL}.metadata.gz"),
+        content_type="application/zip",
+        headers={"Access-Control-Allow-Origin": "*"},
+    )
     return httpserver.url_for(f"/{PYTEST_WHEEL}")
 
 
@@ -56,25 +60,6 @@ def test_from_package_index():
     assert wheel.sha256 == sha256
 
 
-def test_validate(dummy_wheel):
-    import hashlib
-
-    dummy_wheel.sha256 = None
-    dummy_wheel._data = BytesIO(b"dummy-data")
-
-    # Should succeed when sha256 is None
-    dummy_wheel._validate()
-
-    # Should fail when checksum is different
-    dummy_wheel.sha256 = "dummy-sha256"
-    with pytest.raises(ValueError, match="Contents don't match hash"):
-        dummy_wheel._validate()
-
-    # Should succeed when checksum is the same
-    dummy_wheel.sha256 = hashlib.sha256(b"dummy-data").hexdigest()
-    dummy_wheel._validate()
-
-
 def test_extract(dummy_wheel, dummy_wheel_content, tmp_path):
     dummy_wheel._data = dummy_wheel_content
     dummy_wheel._extract(tmp_path)
@@ -124,3 +109,87 @@ async def test_requires(dummy_wheel_url, tmp_path):
     requirements_extra_testing = [str(r.name) for r in wheel.requires({"testing"})]
     assert "pluggy" in requirements_extra_testing
     assert "hypothesis" in requirements_extra_testing
+
+
+def test_pep658_metadata_available():
+    name = "dummy-module"
+    filename = "dummy_module-0.0.1-py3-none-any.whl"
+    url = "https://test.com/dummy_module-0.0.1-py3-none-any.whl"
+    version = "0.0.1"
+    sha256 = "dummy-sha256"
+    size = 1234
+
+    wheel = WheelInfo.from_package_index(
+        name, filename, url, version, sha256, size, data_dist_info_metadata=True
+    )
+    assert wheel.pep658_metadata_available()
+
+    wheel = WheelInfo.from_package_index(
+        name,
+        filename,
+        url,
+        version,
+        sha256,
+        size,
+        data_dist_info_metadata={"sha256": "dummy-sha256"},
+    )
+    assert wheel.pep658_metadata_available()
+
+    wheel = WheelInfo.from_url(url)
+    assert not wheel.pep658_metadata_available()
+
+
+@pytest.mark.asyncio
+async def test_download_pep658_metadata(dummy_wheel_url):
+    parsed = parse_wheel_filename(PYTEST_WHEEL)
+    name = str(parsed[0])
+    version = str(parsed[1])
+    filename = PYTEST_WHEEL
+    sha256 = "dummy-sha256"
+    size = 1234
+
+    wheel = WheelInfo.from_package_index(
+        name,
+        filename,
+        dummy_wheel_url,
+        version,
+        sha256,
+        size,
+        data_dist_info_metadata=True,
+    )
+    assert wheel.pep658_metadata_available()
+
+    assert wheel._metadata is None
+    await wheel.download_pep658_metadata()
+    assert wheel._metadata is not None
+
+    wheel = WheelInfo.from_package_index(
+        name,
+        filename,
+        dummy_wheel_url,
+        version,
+        sha256,
+        size,
+        data_dist_info_metadata={"sha256": "dummy-sha256"},
+    )
+    assert wheel.pep658_metadata_available()
+
+    assert wheel._metadata is None
+    with pytest.raises(RuntimeError, match="Invalid checksum: expected dummy-sha256"):
+        await wheel.download_pep658_metadata()
+
+    checksum = "62eb95408ccec185e7a3b8f354a1df1721cd8f463922f5a900c7bf4b69c5a4e8"  # TODO: calculate this from the file
+    wheel = WheelInfo.from_package_index(
+        name,
+        filename,
+        dummy_wheel_url,
+        version,
+        sha256,
+        size,
+        data_dist_info_metadata={"sha256": checksum},
+    )
+    assert wheel.pep658_metadata_available()
+
+    assert wheel._metadata is None
+    await wheel.download_pep658_metadata()
+    assert wheel._metadata is not None
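The new test module imports `_read_gzipped_testfile` from `conftest.py`, but the conftest hunk above does not show its definition. It is assumed to be (or to need to be) a small helper along these lines; the body below is a sketch, not code from the patch.

```python
import gzip
from pathlib import Path


def _read_gzipped_testfile(file: Path) -> bytes:
    # Test fixtures such as pytest-7.2.2-py3-none-any.whl.metadata.gz are stored
    # gzip-compressed; return the decompressed bytes for the test HTTP server to serve.
    return gzip.decompress(file.read_bytes())
```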