From 08686f3f54fa7f2ceb37f83c950dd9b54902d394 Mon Sep 17 00:00:00 2001 From: hiroTamada Date: Fri, 6 Feb 2026 11:56:33 -0500 Subject: [PATCH 1/4] fix: add image-manifest=true to BuildKit cache export for ephemeral VMs Without image-manifest=true, BuildKit's registry cache stores layer references pointing to external registries (e.g., docker.io) rather than copying the actual layer blobs into the cache image. This causes cache misses in ephemeral BuildKit instances (like our builder VMs) because the layers aren't available locally. With image-manifest=true, BuildKit creates a proper OCI image manifest with all layer blobs stored in the registry, enabling cache hits even in fresh BuildKit instances. This fixes the issue where the global cache (populated by admin builds) wasn't providing cache hits for tenant builds - the first deployment for each tenant was re-downloading all base image layers from Docker Hub. Co-authored-by: Cursor --- lib/builds/builder_agent/main.go | 7 +++++-- lib/builds/cache.go | 4 +++- lib/builds/cache_test.go | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/builds/builder_agent/main.go b/lib/builds/builder_agent/main.go index beb5b18..045b300 100644 --- a/lib/builds/builder_agent/main.go +++ b/lib/builds/builder_agent/main.go @@ -728,11 +728,14 @@ func runBuild(ctx context.Context, config *BuildConfig, logWriter io.Writer) (st } // Export cache based on build type + // Note: image-manifest=true ensures layer blobs are stored in the registry cache image + // rather than as references to external registries (e.g., docker.io). This is critical + // for cache hits in ephemeral BuildKit instances that don't have local layer storage. 
if config.IsAdminBuild { // Admin build: export to global cache if config.GlobalCacheKey != "" { globalCacheRef := fmt.Sprintf("%s/cache/global/%s", registryHost, config.GlobalCacheKey) - cacheOpts := "type=registry,ref=" + globalCacheRef + ",mode=max" + cacheOpts := "type=registry,ref=" + globalCacheRef + ",mode=max,image-manifest=true,oci-mediatypes=true" if useInsecureFlag { cacheOpts += ",registry.insecure=true" } @@ -743,7 +746,7 @@ func runBuild(ctx context.Context, config *BuildConfig, logWriter io.Writer) (st // Regular build: export to tenant cache if config.CacheScope != "" { tenantCacheRef := fmt.Sprintf("%s/cache/%s", registryHost, config.CacheScope) - cacheOpts := "type=registry,ref=" + tenantCacheRef + ",mode=max" + cacheOpts := "type=registry,ref=" + tenantCacheRef + ",mode=max,image-manifest=true,oci-mediatypes=true" if useInsecureFlag { cacheOpts += ",registry.insecure=true" } diff --git a/lib/builds/cache.go b/lib/builds/cache.go index ff3e26a..f47e331 100644 --- a/lib/builds/cache.go +++ b/lib/builds/cache.go @@ -100,8 +100,10 @@ func (k *CacheKey) ImportCacheArg() string { } // ExportCacheArg returns the BuildKit --export-cache argument +// Uses image-manifest=true to ensure layer blobs are stored in the cache image +// rather than as external references, enabling cache hits in ephemeral BuildKit instances. 
func (k *CacheKey) ExportCacheArg() string { - return fmt.Sprintf("type=registry,ref=%s,mode=max", k.Reference) + return fmt.Sprintf("type=registry,ref=%s,mode=max,image-manifest=true,oci-mediatypes=true", k.Reference) } // normalizeCacheScope normalizes a cache scope to only contain safe characters diff --git a/lib/builds/cache_test.go b/lib/builds/cache_test.go index d51fb7c..7f3637b 100644 --- a/lib/builds/cache_test.go +++ b/lib/builds/cache_test.go @@ -103,7 +103,7 @@ func TestCacheKey_Args(t *testing.T) { assert.Equal(t, "type=registry,ref=localhost:8080/cache/tenant/nodejs/abc123", importArg) exportArg := key.ExportCacheArg() - assert.Equal(t, "type=registry,ref=localhost:8080/cache/tenant/nodejs/abc123,mode=max", exportArg) + assert.Equal(t, "type=registry,ref=localhost:8080/cache/tenant/nodejs/abc123,mode=max,image-manifest=true,oci-mediatypes=true", exportArg) } func TestValidateCacheScope(t *testing.T) { From 195e0020c751babc8be9a0630a4184a4967fcf1a Mon Sep 17 00:00:00 2001 From: hiroTamada Date: Fri, 6 Feb 2026 15:13:53 -0500 Subject: [PATCH 2/4] test: add unit test reproducing BuildKit cache mediatype issue Adds a unit test that reproduces the production issue where hypeman fails to pre-pull BuildKit cache images. The test creates a mock OCI layout with BuildKit's cache config mediatype (application/vnd.buildkit.cacheconfig.v0) and verifies that unpackLayers fails with the expected error. This test documents the root cause: umoci expects standard OCI config mediatype but BuildKit cache exports use a custom mediatype. 
Co-authored-by: Cursor --- lib/images/oci_test.go | 190 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 lib/images/oci_test.go diff --git a/lib/images/oci_test.go b/lib/images/oci_test.go new file mode 100644 index 0000000..592da9a --- /dev/null +++ b/lib/images/oci_test.go @@ -0,0 +1,190 @@ +package images + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// BuildKit cache config mediatype - this is what BuildKit uses when exporting +// cache with image-manifest=true +const buildKitCacheConfigMediaType = "application/vnd.buildkit.cacheconfig.v0" + +// TestUnpackLayersFailsOnBuildKitCacheMediatype verifies that hypeman's image +// unpacker fails when encountering BuildKit cache images. This reproduces the +// production issue where global cache images exported by BuildKit cannot be +// pre-pulled by hypeman because they use a non-standard config mediatype. +// +// The error occurs because: +// 1. BuildKit exports cache with --export-cache type=registry,image-manifest=true +// 2. The exported manifest uses "application/vnd.buildkit.cacheconfig.v0" as config mediatype +// 3. hypeman's unpackLayers expects "application/vnd.oci.image.config.v1+json" +// 4. 
umoci.UnpackRootfs fails with "config blob is not correct mediatype" +func TestUnpackLayersFailsOnBuildKitCacheMediatype(t *testing.T) { + // Create a temp directory for the OCI layout + cacheDir := t.TempDir() + + // Create OCI layout structure with BuildKit cache mediatype + err := createBuildKitCacheLayout(cacheDir, "test-cache") + require.NoError(t, err, "failed to create mock BuildKit cache layout") + + // Create OCI client and try to unpack + client, err := newOCIClient(cacheDir) + require.NoError(t, err) + + targetDir := t.TempDir() + err = client.unpackLayers(context.Background(), "test-cache", targetDir) + + // This should fail with a mediatype error + require.Error(t, err, "unpackLayers should fail on BuildKit cache mediatype") + assert.Contains(t, err.Error(), "config", "error should mention config") + + t.Logf("Got expected error: %v", err) +} + +// TestExtractMetadataSucceedsOnBuildKitCache verifies that extractOCIMetadata +// does NOT fail on BuildKit cache images - it's go-containerregistry which is +// lenient about mediatypes. The failure only happens during unpackLayers when +// umoci tries to unpack the rootfs. 
+func TestExtractMetadataSucceedsOnBuildKitCache(t *testing.T) { + cacheDir := t.TempDir() + + err := createBuildKitCacheLayout(cacheDir, "test-cache") + require.NoError(t, err) + + client, err := newOCIClient(cacheDir) + require.NoError(t, err) + + // This succeeds because go-containerregistry doesn't validate config mediatype + // The failure only happens in unpackLayers when umoci validates the config + meta, err := client.extractOCIMetadata("test-cache") + require.NoError(t, err, "extractOCIMetadata succeeds - go-containerregistry is lenient") + + // But the metadata will be empty/invalid since it's not a real OCI config + t.Logf("Got metadata (likely empty): %+v", meta) +} + +// createBuildKitCacheLayout creates an OCI layout that mimics what BuildKit +// exports when using --export-cache type=registry,image-manifest=true +// +// Layout structure: +// cacheDir/ +// ├── oci-layout (OCI layout version marker) +// ├── index.json (points to manifest) +// └── blobs/sha256/ +// ├── {manifest-digest} (image manifest with buildkit config mediatype) +// ├── {config-digest} (buildkit cache config blob) +// └── {layer-digest} (dummy layer) +func createBuildKitCacheLayout(cacheDir, layoutTag string) error { + // Create directory structure + blobsDir := filepath.Join(cacheDir, "blobs", "sha256") + if err := os.MkdirAll(blobsDir, 0755); err != nil { + return err + } + + // 1. Create oci-layout file + ociLayout := map[string]string{"imageLayoutVersion": "1.0.0"} + ociLayoutBytes, _ := json.Marshal(ociLayout) + if err := os.WriteFile(filepath.Join(cacheDir, "oci-layout"), ociLayoutBytes, 0644); err != nil { + return err + } + + // 2. 
Create a dummy layer blob (an empty gzip stream standing in for a layer tarball) + // This is a minimal valid gzip stream with an empty payload + layerContent := []byte{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // empty deflate block + CRC32/ISIZE trailer + } + layerDigest := sha256Hash(layerContent) + if err := os.WriteFile(filepath.Join(blobsDir, layerDigest), layerContent, 0644); err != nil { + return err + } + + // 3. Create BuildKit cache config blob + // This is what BuildKit puts in the config - NOT a standard OCI config + cacheConfig := map[string]interface{}{ + "layers": []map[string]interface{}{ + { + "blob": "sha256:" + layerDigest, + "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip", + }, + }, + } + configBytes, _ := json.Marshal(cacheConfig) + configDigest := sha256Hash(configBytes) + if err := os.WriteFile(filepath.Join(blobsDir, configDigest), configBytes, 0644); err != nil { + return err + } + + // 4. Create image manifest with BuildKit's cache config mediatype + manifest := map[string]interface{}{ + "schemaVersion": 2, + "mediaType": "application/vnd.oci.image.manifest.v1+json", + "config": map[string]interface{}{ + "mediaType": buildKitCacheConfigMediaType, // This is the problem! + "digest": "sha256:" + configDigest, + "size": len(configBytes), + }, + "layers": []map[string]interface{}{ + { + "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip", + "digest": "sha256:" + layerDigest, + "size": len(layerContent), + }, + }, + } + manifestBytes, _ := json.Marshal(manifest) + manifestDigest := sha256Hash(manifestBytes) + if err := os.WriteFile(filepath.Join(blobsDir, manifestDigest), manifestBytes, 0644); err != nil { + return err + } + + // 5. 
Create index.json pointing to the manifest with our layout tag + index := map[string]interface{}{ + "schemaVersion": 2, + "mediaType": "application/vnd.oci.image.index.v1+json", + "manifests": []map[string]interface{}{ + { + "mediaType": "application/vnd.oci.image.manifest.v1+json", + "digest": "sha256:" + manifestDigest, + "size": len(manifestBytes), + "annotations": map[string]string{ + "org.opencontainers.image.ref.name": layoutTag, + }, + }, + }, + } + indexBytes, _ := json.Marshal(index) + if err := os.WriteFile(filepath.Join(cacheDir, "index.json"), indexBytes, 0644); err != nil { + return err + } + + return nil +} + +// sha256Hash computes the SHA256 hash of data and returns the hex string +func sha256Hash(data []byte) string { + h := sha256.Sum256(data) + return hex.EncodeToString(h[:]) +} + +// TestConvertToOCIMediaTypePassesThroughBuildKitType verifies that the +// mediatype conversion function doesn't handle BuildKit's cache config type, +// which is the root cause of the unpack failure. +func TestConvertToOCIMediaTypePassesThroughBuildKitType(t *testing.T) { + // Verify that BuildKit's mediatype passes through unchanged + result := convertToOCIMediaType(buildKitCacheConfigMediaType) + assert.Equal(t, buildKitCacheConfigMediaType, result, + "BuildKit cache config mediatype should pass through unchanged (this is the bug)") + + // Standard Docker types should be converted + assert.Equal(t, "application/vnd.oci.image.config.v1+json", + convertToOCIMediaType("application/vnd.docker.container.image.v1+json")) +} From 353aac75831c48830cbf7b66f1951784de37e9c6 Mon Sep 17 00:00:00 2001 From: hiroTamada Date: Fri, 6 Feb 2026 15:27:55 -0500 Subject: [PATCH 3/4] fix: skip conversion for BuildKit cache images BuildKit exports cache with a custom mediatype (application/vnd.buildkit.cacheconfig.v0) that can't be unpacked by standard OCI tools like umoci. 
This caused errors when pushing cache images to the registry: config blob is not correct mediatype application/vnd.oci.image.config.v1+json: application/vnd.buildkit.cacheconfig.v0 The fix skips the ext4 conversion step for cache/* repos since: 1. Cache images are not runnable containers 2. BuildKit imports them directly from the registry 3. There's no need to unpack or convert them locally Co-authored-by: Cursor --- lib/registry/registry.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/registry/registry.go b/lib/registry/registry.go index 44535f4..fed7274 100644 --- a/lib/registry/registry.go +++ b/lib/registry/registry.go @@ -138,7 +138,15 @@ func (w *responseWrapper) WriteHeader(code int) { } // triggerConversion queues the image for conversion to ext4 disk format. +// Skips BuildKit cache images (cache/*) since they're not runnable containers. func (r *Registry) triggerConversion(repo, reference, dockerDigest string) { + // Skip BuildKit cache images - they use a custom mediatype that can't be + // unpacked as a standard OCI image. BuildKit imports them directly from + // the registry without needing local conversion. + if strings.HasPrefix(repo, "cache/") { + return + } + imageRef := repo + ":" + reference if strings.HasPrefix(reference, "sha256:") { imageRef = repo + "@" + reference From 1460df94ce38190c69d343fbb5affe9a648371a1 Mon Sep 17 00:00:00 2001 From: hiroTamada Date: Fri, 6 Feb 2026 15:54:43 -0500 Subject: [PATCH 4/4] fix: handle host prefix in cache repo check The repo parameter passed to triggerConversion includes the Host header prefix (e.g., "10.102.0.1:8083/cache/global/node"). The previous check only used HasPrefix("cache/") which would never match. 
Now checks for both patterns: - HasPrefix("cache/") for edge case without host - Contains("/cache/") for normal case with host prefix Co-authored-by: Cursor --- lib/registry/registry.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/registry/registry.go b/lib/registry/registry.go index fed7274..651baf9 100644 --- a/lib/registry/registry.go +++ b/lib/registry/registry.go @@ -143,7 +143,8 @@ func (r *Registry) triggerConversion(repo, reference, dockerDigest string) { // Skip BuildKit cache images - they use a custom mediatype that can't be // unpacked as a standard OCI image. BuildKit imports them directly from // the registry without needing local conversion. - if strings.HasPrefix(repo, "cache/") { + // Note: repo may include host prefix (e.g., "10.102.0.1:8083/cache/global/node") + if strings.HasPrefix(repo, "cache/") || strings.Contains(repo, "/cache/") { return }