
Commit 4d20a84

begin bulk fetch from cache
1 parent 4b741cc commit 4d20a84

3 files changed: +123 -42 lines changed


conda_libmamba_solver/shards.py

Lines changed: 20 additions & 16 deletions
@@ -174,7 +174,7 @@ def build_repodata(self) -> RepodataDict:
         return repodata
 
 
-class Shards(ShardLike):
+class ShardsIndex(ShardLike):
     def __init__(self, shards_index: ShardsIndex, url: str, shards_cache: shards_cache.ShardCache):
         """
         Args:
@@ -185,13 +185,16 @@ def __init__(self, shards_index: ShardsIndex, url: str, shards_cache: shards_cac
         self.url = url
         self.shards_cache = shards_cache
 
-        self.session = get_session(self.base_url)
+        # can we share a session for multiple subdirs of the same channel, or
+        # any time self.shards_base_url is similar to another Shards() instance?
+        self.session = get_session(self.shards_base_url)
 
         self.repodata_no_packages = {
             k: v for k, v in self.shards_index.items() if k not in ("shards",)
         }
 
         # used to write out repodata subset
+        # not used in traversal algorithm
         self.visited: dict[str, Shard | None] = {}
 
     @property
@@ -203,7 +206,7 @@ def packages_index(self):
         return self.shards_index["shards"]
 
     @property
-    def base_url(self) -> str:
+    def shards_base_url(self) -> str:
         """
         Return self.url joined with shards_base_url.
         Note shards_base_url can be a relative or an absolute url.
@@ -218,7 +221,7 @@ def shard_url(self, package: str) -> str:
         """
         shard_name = f"{self.packages_index[package].hex()}.msgpack.zst"
         # "Individual shards are stored under the URL <shards_base_url><sha256>.msgpack.zst"
-        return urljoin(self.base_url, shard_name)
+        return urljoin(self.shards_base_url, shard_name)
 
     def fetch_shard(self, package: str) -> Shard:
         """
@@ -341,42 +344,43 @@ def repodata_shards(url, cache: RepodataCache) -> bytes:
     return response_bytes
 
 
-def fetch_shards(sd: SubdirData) -> Shards | None:
+def fetch_shards(
+    sd: SubdirData, cache: shards_cache.ShardCache | None = None
+) -> ShardsIndex | None:
     """
     Check a SubdirData's URL for shards.
     Return shards index bytes from cache or network.
     Return None if not found; caller should fetch normal repodata.
     """
 
     fetch = sd.repo_fetch
-    cache = fetch.repo_cache
+    repo_cache = fetch.repo_cache
     # cache.load_state() will clear the file on JSONDecodeError but cache.load()
     # will raise the exception
-    cache.load_state(binary=True)
-    cache_state = cache.state
+    repo_cache.load_state(binary=True)
+    cache_state = repo_cache.state
+
+    if cache is None:
+        cache = shards_cache.ShardCache(Path(conda.gateways.repodata.create_cache_dir()))
 
     if cache_state.should_check_format("shards"):
         try:
             # look for shards index
             shards_index_url = f"{sd.url_w_subdir}/repodata_shards.msgpack.zst"
-            found = repodata_shards(shards_index_url, cache)
+            found = repodata_shards(shards_index_url, repo_cache)
             cache_state.set_has_format("shards", True)
             # this will also set state["refresh_ns"] = time.time_ns(); we could
             # call cache.refresh() if we got a 304 instead:
-            cache.save(found)
+            repo_cache.save(found)
 
             # basic parse (move into caller?)
             shards_index: ShardsIndex = msgpack.loads(zstandard.decompress(found))  # type: ignore
-            shards = Shards(
-                shards_index,
-                shards_index_url,
-                shards_cache.ShardCache(Path(conda.gateways.repodata.create_cache_dir())),
-            )
+            shards = ShardsIndex(shards_index, shards_index_url, cache)
             return shards
 
         except (HTTPError, conda.gateways.repodata.RepodataIsEmpty):
             # fetch repodata.json / repodata.json.zst instead
             cache_state.set_has_format("shards", False)
-            cache.refresh(refresh_ns=1)  # expired but not falsy
+            repo_cache.refresh(refresh_ns=1)  # expired but not falsy
 
     return None
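
The base_url -> shards_base_url rename tracks the spec wording quoted in shard_url(): individual shards live at <shards_base_url><sha256>.msgpack.zst, and shards_base_url may be relative to the index URL or absolute. A standalone sketch of the urljoin semantics this relies on; the URLs are made up for illustration:

from urllib.parse import urljoin

index_url = "https://example.invalid/channel/linux-64/repodata_shards.msgpack.zst"

# a relative shards_base_url resolves next to the index file
base = urljoin(index_url, "shards/")
print(base)  # https://example.invalid/channel/linux-64/shards/

# an absolute shards_base_url (e.g. a CDN) replaces the base entirely
print(urljoin(index_url, "https://cdn.example.invalid/shards/"))
# https://cdn.example.invalid/shards/

# an individual shard: <shards_base_url><sha256>.msgpack.zst
print(urljoin(base, "0123abcd.msgpack.zst"))
# https://example.invalid/channel/linux-64/shards/0123abcd.msgpack.zst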

conda_libmamba_solver/shards_subset.py

Lines changed: 24 additions & 10 deletions
@@ -36,11 +36,15 @@
 import json
 import sys
 from dataclasses import dataclass
+from pathlib import Path
 
+import conda.gateways.repodata
 from conda.base.context import context
 from conda.core.subdir_data import SubdirData
 from conda.models.channel import Channel
 
+from conda_libmamba_solver import shards_cache
+
 from .shards import RepodataDict, ShardLike, fetch_shards, shard_mentioned_packages
 
 
@@ -100,6 +104,7 @@ def shortest(self, start_packages):
         self.nodes = {package: Node(0, package) for package in start_packages}
         unvisited = [(n.distance, n) for n in self.nodes.values()]
         while unvisited:
+            # parallel fetch all unvisited shards but don't mark as visited
             original_priority, node = heapq.heappop(unvisited)
             if (
                 original_priority != node.distance
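
The original_priority != node.distance guard is the standard lazy-deletion workaround for heapq having no decrease-key: when a node's distance improves, a fresh (distance, node) entry is pushed, and stale entries are skipped as they surface. A self-contained sketch of the idiom on a toy dependency graph (illustrative names, not this codebase):

import heapq

# toy graph: package -> direct dependencies
graph = {
    "twine": ["requests", "rich"],
    "requests": ["urllib3", "idna"],
    "rich": ["pygments"],
    "urllib3": [],
    "idna": [],
    "pygments": [],
}

distance = {"twine": 0}
heap = [(0, "twine")]
while heap:
    d, package = heapq.heappop(heap)
    if d != distance[package]:
        continue  # stale entry; this node was re-pushed with a smaller distance
    for dep in graph[package]:
        if distance.get(dep, float("inf")) > d + 1:
            distance[dep] = d + 1
            heapq.heappush(heap, (d + 1, dep))  # push anew instead of decrease-key

print(distance)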
@@ -116,16 +121,7 @@ def shortest(self, start_packages):
 
 
 def build_repodata_subset(tmp_path, root_packages, channels):
-    channel_data: dict[str, ShardLike] = {}
-    for channel in channels:
-        for channel_url in Channel(channel).urls(True, context.subdirs):
-            subdir_data = SubdirData(Channel(channel_url))
-            found = fetch_shards(subdir_data)
-            if not found:
-                repodata_json, _ = subdir_data.repo_fetch.fetch_latest_parsed()
-                repodata_json = RepodataDict(repodata_json)  # type: ignore
-                found = ShardLike(repodata_json, channel_url)
-            channel_data[channel_url] = found
+    channel_data = fetch_channels(channels)
 
     subset = RepodataSubset((*channel_data.values(),))
     subset.shortest(root_packages)
@@ -147,3 +143,21 @@ def build_repodata_subset(tmp_path, root_packages, channels):
         subset_paths[channel] = repodata_path
 
     return subset_paths, repodata_size
+
+
+def fetch_channels(channels):
+    channel_data: dict[str, ShardLike] = {}
+
+    # share single disk cache for all Shards() instances
+    cache = shards_cache.ShardCache(Path(conda.gateways.repodata.create_cache_dir()))
+
+    for channel in channels:
+        for channel_url in Channel(channel).urls(True, context.subdirs):
+            subdir_data = SubdirData(Channel(channel_url))
+            found = fetch_shards(subdir_data, cache)
+            if not found:
+                repodata_json, _ = subdir_data.repo_fetch.fetch_latest_parsed()
+                repodata_json = RepodataDict(repodata_json)  # type: ignore
+                found = ShardLike(repodata_json, channel_url)
+            channel_data[channel_url] = found
+    return channel_data
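
For reference, a minimal way to exercise the new helper; the channel names are illustrative. fetch_channels() returns a mapping of subdir URL to either a ShardsIndex (sharded channel) or a ShardLike wrapping full repodata, all sharing one disk cache:

from conda_libmamba_solver.shards import ShardsIndex
from conda_libmamba_solver.shards_subset import fetch_channels

channel_data = fetch_channels(["defaults", "conda-forge-sharded"])
for url, shardlike in channel_data.items():
    kind = "sharded" if isinstance(shardlike, ShardsIndex) else "repodata.json"
    print(url, kind)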

tests/test_shards.py

Lines changed: 79 additions & 16 deletions
@@ -13,6 +13,7 @@
 import random
 import time
 import urllib.parse
+from contextlib import contextmanager
 from hashlib import sha256
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -27,13 +28,12 @@
 from conda_libmamba_solver import shards, shards_cache
 from conda_libmamba_solver.index import LibMambaIndexHelper
 from conda_libmamba_solver.shards import (
-    RepodataDict,
     ShardLike,
-    Shards,
+    ShardsIndex,
     fetch_shards,
     shard_mentioned_packages,
 )
-from conda_libmamba_solver.shards_subset import build_repodata_subset
+from conda_libmamba_solver.shards_subset import Node, build_repodata_subset, fetch_channels
 from tests.channel_testing.helpers import _dummy_http_server
 
 if TYPE_CHECKING:
@@ -157,19 +157,10 @@ def test_fetch_shards(conda_no_token: None):
 
     channels.append(Channel("conda-forge-sharded"))
 
-    channel_data: dict[str, ShardLike] = {}
-    for channel in channels:
-        for channel_url in Channel(channel).urls(True, context.subdirs):
-            subdir_data = SubdirData(Channel(channel_url))
-            found = fetch_shards(subdir_data)
-            if not found:
-                repodata_json, _ = subdir_data.repo_fetch.fetch_latest_parsed()
-                repodata_json = RepodataDict(repodata_json)  # type: ignore
-                found = ShardLike(repodata_json, channel_url)
-            channel_data[channel_url] = found
+    channel_data = fetch_channels(channels)
 
     # at least one should be real shards, not repodata.json presented as shards.
-    assert any(isinstance(channel, Shards) for channel in channel_data.values())
+    assert any(isinstance(channel, ShardsIndex) for channel in channel_data.values())
 
 
 def test_shard_cache(tmp_path: Path):
@@ -317,6 +308,9 @@ def test_shardlike():
 
 
 def test_shardlike_repr():
+    """
+    Code coverage for ShardLike.__repr__()
+    """
     shardlike = ShardLike(
         {
             "packages": {},
@@ -325,7 +319,7 @@ def test_shardlike_repr():
         },
         "https://conda.anaconda.org/",
     )
-    cls, url, *rest = repr(shardlike).split()
+    cls, url, *_ = repr(shardlike).split()
     assert "ShardLike" in cls
     assert shardlike.url == url
 
@@ -361,7 +355,8 @@ def test_shardlike_repr():
 
 def test_traverse_shards_3(conda_no_token: None, tmp_path):
     """
-    Another go at the dependency traversal algorithm.
+    Build repodata subset using the third attempt at a dependency traversal
+    algorithm.
     """
 
     logging.basicConfig(level=logging.INFO)
@@ -390,6 +385,9 @@ def test_traverse_shards_3(conda_no_token: None, tmp_path):
 
 
 def test_shards_indexhelper(conda_no_token):
+    """
+    Load LibMambaIndexHelper with parameters that will enable sharded repodata.
+    """
     channels = [*context.default_channels, Channel("conda-forge-sharded")]
 
     class fake_in_state:
@@ -407,3 +405,68 @@ class fake_in_state:
     )
 
     print(helper.repos)
+
+
+@contextmanager
+def _timer(name: str):
+    begin = time.monotonic_ns()
+    yield
+    end = time.monotonic_ns()
+    print(f"{name} took {(end - begin) / 1e9:0.6f}s")
+
+
+def test_parallel_fetcherator(conda_no_token: None):
+    channels = [*context.default_channels, Channel("conda-forge-sharded")]
+    roots = [
+        Node(distance=0, package="ca-certificates", visited=False),
+        Node(distance=0, package="icu", visited=False),
+        Node(distance=0, package="expat", visited=False),
+        Node(distance=0, package="libexpat", visited=False),
+        Node(distance=0, package="libffi", visited=False),
+        Node(distance=0, package="libmpdec", visited=False),
+        Node(distance=0, package="libzlib", visited=False),
+        Node(distance=0, package="openssl", visited=False),
+        Node(distance=0, package="python", visited=False),
+        Node(distance=0, package="readline", visited=False),
+        Node(distance=0, package="liblzma", visited=False),
+        Node(distance=0, package="xz", visited=False),
+        Node(distance=0, package="libsqlite", visited=False),
+        Node(distance=0, package="tk", visited=False),
+        Node(distance=0, package="ncurses", visited=False),
+        Node(distance=0, package="zlib", visited=False),
+        Node(distance=0, package="pip", visited=False),
+        Node(distance=0, package="twine", visited=False),
+        Node(distance=0, package="python_abi", visited=False),
+        Node(distance=0, package="tzdata", visited=False),
+    ]
+
+    with _timer("repodata.json/shards index fetch"):
+        channel_data = fetch_channels(channels)
+
+    with _timer("Shard fetch"):
+        sharded = [
+            channel for channel in channel_data.values() if isinstance(channel, ShardsIndex)
+        ]
+        assert sharded, "No sharded repodata found"
+
+        wanted = []
+        for shard in sharded:
+            for root in roots:
+                if root.package in shard:
+                    wanted.append((shard, root.package, shard.shard_url(root.package)))
+
+        print(len(wanted), "shards to fetch")
+
+        shared_shard_cache = sharded[0].shards_cache
+        from_cache = shared_shard_cache.retrieve_multiple([shard_url for *_, shard_url in wanted])
+
+        for url, shard_or_none in from_cache.items():
+            if shard_or_none is not None:
+                print(f"Cache hit for {url}")
+
+        # add fetched Shard objects to Shards objects visited dict
+        for shard, package, shard_url in wanted:
+            if from_cache_shard := from_cache.get(shard_url):
+                shard.visited[package] = from_cache_shard
+
+    # XXX don't call everything Shard/Shards