parallelize OSV entry downloads for efficiency and add a skip_redownload option

crosleyzack · crosleyzack · commit 52af1d3eb0a8 · 2026-06-03T09:38:22.000-04:00
Signed-off-by: crosleyzack &lt;mail@crosleyzack.com&gt;
diff --git a/src/vunnel/providers/chainguard/__init__.py b/src/vunnel/providers/chainguard/__init__.py
@@ -27,6 +27,10 @@ class Config:
     osv_url: str = "https://packages.cgr.dev/chainguard/v2/osv/all.json"
     # Override with VUNNEL_PROVIDERS_CHAINGUARD_USE_OSV
     use_osv: bool = False
+    # Override with VUNNEL_PROVIDERS_CHAINGUARD_SKIP_REDOWNLOAD
+    skip_redownload: bool = False
+    # Override with VUNNEL_PROVIDERS_CHAINGUARD_OSV_MAX_WORKERS
+    osv_max_workers: int = 8
 
 
 class Provider(provider.Provider):
@@ -54,6 +58,8 @@ def __init__(self, root: str, config: Config | None = None):
                 namespace=self._namespace,
                 download_timeout=self.config.request_timeout,
                 logger=self.logger,
+                skip_redownload=self.config.skip_redownload,
+                max_workers=self.config.osv_max_workers,
             )
             self.schema = schema.OSVSchema(version="1.7.0")
         else:
@@ -63,6 +69,7 @@ def __init__(self, root: str, config: Config | None = None):
                 namespace=self._namespace,
                 download_timeout=self.config.request_timeout,
                 logger=self.logger,
+                skip_redownload=self.config.skip_redownload,
             )
             self.feed_url = self.config.secdb_url
             self.schema = schema.OSSchema()
diff --git a/src/vunnel/providers/wolfi/parser.py b/src/vunnel/providers/wolfi/parser.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import abc
+import concurrent.futures
 import copy
 import logging
 import os
@@ -34,6 +35,8 @@ def __init__(  # noqa: PLR0913
         download_timeout: int = 125,
         logger: logging.Logger | None = None,
         security_reference_url: str | None = None,
+        skip_redownload: bool = False,
+        max_workers: int = 8,
     ):
         if not fixdater:
             fixdater = fixdate.default_finder(workspace)
@@ -46,6 +49,8 @@ def __init__(  # noqa: PLR0913
         self.security_reference_url = (
             security_reference_url.strip("/") if security_reference_url else self._security_reference_url_
         )
+        self.skip_redownload = skip_redownload
+        self.max_workers = max_workers
 
         if not logger:
             logger = logging.getLogger(self.__class__.__name__)
@@ -105,9 +110,21 @@ def __init__(# noqa: PLR0913
         download_timeout: int = 125,
         logger: logging.Logger | None = None,
         security_reference_url: str | None = None,
+        skip_redownload: bool = False,
+        max_workers: int = 8,
     ):
         self._db_filename = self._extract_filename_from_url(url)
-        super().__init__(workspace, url, namespace, fixdater, download_timeout, logger, security_reference_url)
+        super().__init__(
+            workspace,
+            url,
+            namespace,
+            fixdater,
+            download_timeout,
+            logger,
+            security_reference_url,
+            skip_redownload=skip_redownload,
+            max_workers=max_workers,
+        )
 
     def _download(self) -> None:
         if not os.path.exists(self.input_dir_path):
@@ -119,6 +136,11 @@ def _download(self) -> None:
             self.logger.info(f"downloading {self.namespace} secdb {self.url}")
-            r = http.get(self.url, self.logger, stream=True, timeout=self.download_timeout)
             file_path = os.path.join(self.input_dir_path, self._db_filename)
+            # when skip_redownload is set and the file is already on disk, return before issuing the HTTP
+            # request; checking only after http.get() would still open a streamed response and never read it.
+            if self.skip_redownload and os.path.exists(file_path):
+                self.logger.info(f"skipping download of {self.namespace} secdb since file already exists at {file_path}")
+                return
+            r = http.get(self.url, self.logger, stream=True, timeout=self.download_timeout)
             with open(file_path, "wb") as fp:
                 for chunk in r.iter_content():
                     fp.write(chunk)
@@ -236,9 +258,14 @@ class OSVParser(Parser):
     _input_dir_ = "osv"
 
     def _download(self) -> None:
+        """
+        Download all OSV entry files based on the index file at self.url, which should point to the
+        top-level all.json file. For each entry in the index, we construct the URL for the individual
+        entry file and download it to the input directory.
+        """
         if not os.path.exists(self.input_dir_path):
             os.makedirs(self.input_dir_path, exist_ok=True)
-        
+
         self.fixdater.download()
 
         try:
@@ -249,25 +276,48 @@ def _download(self) -> None:
             index = orjson.loads(r.content)
 
             base_url = self.url.rsplit("/", 1)[0]
-            for entry in index:
-                # for each entry pointed to by the index, pull down the full JSON file
-                filename = f"{entry['id']}.json"
-                entry_url = f"{base_url}/{filename}"
-                r = http.get(self.url, self.logger, stream=True, timeout=self.download_timeout)
-                file_path = os.path.join(self.input_dir_path, filename)
-                with open(file_path, "wb") as fp:
-                    for chunk in r.iter_content():
-                        fp.write(chunk)
+            # Download all index entries concurrently with a thread pool (there are thousands of them). Each entry
+            # lives at {base_url}/{entry id}.json, e.g. https://packages.cgr.dev/chainguard/v2/osv/CGA-2255-2h2p-73q2.json
+            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                futures = [
+                    executor.submit(self._download_single_file, f"{base_url}/{entry['id']}.json", f"{entry['id']}.json")
+                    for entry in index
+                ]
+                done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
+                # not_done is only non-empty after a failure: cancel whatever is still queued, otherwise leaving
+                # the with-block would run every remaining download before the exception could propagate
+                for future in not_done:
+                    future.cancel()
+                for future in done:
+                    future.result()
         except Exception:
             self.logger.exception(f"ignoring error processing osv for {self.url}")
 
+    def _download_single_file(self, url: str, filename: str) -> None:
+        """Download the OSV entry at url and store it as filename inside the input directory."""
+        file_path = os.path.join(self.input_dir_path, filename)
+        # an entry that is already on disk is reused when skip_redownload is set, saving a request on later runs
+        if self.skip_redownload and os.path.exists(file_path):
+            self.logger.info(f"skipping download of {self.namespace} osv entry {filename} since file already exists")
+            return
+        self.logger.info(f"downloading {self.namespace} osv entry {filename}")
+        r = http.get(url, self.logger, stream=True, timeout=self.download_timeout)
+        # stream into a sibling .part file and rename once complete: an interrupted download must never leave
+        # a truncated .json behind, since skip_redownload would reuse it and _load would try to parse it
+        partial_path = f"{file_path}.part"
+        with open(partial_path, "wb") as fp:
+            for chunk in r.iter_content(chunk_size=64 * 1024):  # larger chunks than the 1-byte default
+                fp.write(chunk)
+        os.replace(partial_path, file_path)
+
     def _load(self) -> Generator[tuple[str, dict[str, Any]], None, None]:
         try:
             # for each file we have downloaded, which should be every json file in the index, load it
             # and yield the data for normalization
             for filename in os.listdir(self.input_dir_path):
                 if not filename.endswith(".json"):
                     continue
+                self.logger.info(f"loading {self.namespace} osv data from {filename}")
                 with open(os.path.join(self.input_dir_path, filename)) as fh:
                     data = orjson.loads(fh.read())
                     yield self._release_, data
@@ -289,4 +339,4 @@ def _normalize(self, release: str, data: dict[str, Any]) -> dict[str, Any]:  # n
         # we map the osv id to the osv data to keep consistency in the secdb parser, which
         # does this for ease of identifying the associated vulnerability when writing records.
         # IE: {"CGA-1234-5678-9abc": {<full osv record>}}
-        return {data["id"]: data}
+        return {data['id']: data}
diff --git a/tests/unit/providers/chainguard/test_chainguard.py b/tests/unit/providers/chainguard/test_chainguard.py
@@ -36,6 +36,32 @@ def test_parser_selection(
     assert p.schema.name == expected_schema_name
 
 
+@pytest.mark.parametrize(
+    ("use_osv", "expected_parser_cls"),
+    [(False, SecDBParser), (True, OSVParser)],
+)
+def test_config_propagates_to_parser(helpers, auto_fake_fixdate_finder, use_osv, expected_parser_cls):
+    """The provider forwards skip_redownload, and for OSV also osv_max_workers, to the parser it builds."""
+    workspace = helpers.provider_workspace_helper(name=Provider.name())
+    cfg = Config(use_osv=use_osv, skip_redownload=True, osv_max_workers=16)
+    cfg.runtime.result_store = result.StoreStrategy.FLAT_FILE
+
+    provider_under_test = Provider(root=workspace.root, config=cfg)
+    parser = provider_under_test.parser
+
+    assert isinstance(parser, expected_parser_cls)
+    assert parser.skip_redownload is True
+    # the worker count is only asserted for the OSV parser
+    if use_osv:
+        assert parser.max_workers == 16
+
+
+def test_config_defaults():
+    """A default Config leaves skip_redownload off and sizes the OSV worker pool at 8."""
+    defaults = Config()
+    assert defaults.skip_redownload is False and defaults.osv_max_workers == 8
+
+
 def test_provider_schema(helpers, disable_get_requests, auto_fake_fixdate_finder):
     workspace = helpers.provider_workspace_helper(name=Provider.name())