-
Notifications
You must be signed in to change notification settings - Fork 113
Blob storage artifact-data keys, binary sanitizer, and observability stripping #2744
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
8eea5bf
c7043e0
a96c949
4e1416a
37a910e
513ed69
393705c
04ecfa7
12558e9
4c9a209
f5f09e0
7dd41d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,215 @@ | ||
| import { beforeEach, describe, expect, it, vi } from 'vitest'; | ||
| import { | ||
| sanitizeArtifactBinaryData, | ||
| stripBinaryDataForObservability, | ||
| } from '../blob-storage/artifact-binary-sanitizer'; | ||
|
|
||
| vi.mock('../blob-storage/index', () => ({ | ||
| getBlobStorageProvider: vi.fn(), | ||
| isBlobUri: (s: string) => s.startsWith('blob://'), | ||
| toBlobUri: (key: string) => `blob://${key}`, | ||
| fromBlobUri: (uri: string) => uri.slice('blob://'.length), | ||
| BLOB_URI_PREFIX: 'blob://', | ||
| })); | ||
|
|
||
| vi.mock('../blob-storage/storage-keys', () => ({ | ||
| buildStorageKey: vi.fn( | ||
| (input: any) => | ||
| `v1/t_${input.tenantId}/artifact-data/p_${input.projectId}/a_${input.artifactId}/sha256-${input.contentHash}.${input.ext}` | ||
| ), | ||
| })); | ||
|
|
||
| const SAMPLE_BASE64 = Buffer.from('sample image bytes').toString('base64'); | ||
|
|
||
| const CTX = { tenantId: 'tenant-1', projectId: 'proj-1', artifactId: 'art-1' }; | ||
|
|
||
| describe('stripBinaryDataForObservability', () => { | ||
| it('replaces image part data with placeholder', () => { | ||
| const input = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.type).toBe('image'); | ||
| expect(result.data).toMatch(/^\[binary data ~\d+ bytes, mimeType: image\/png\]$/); | ||
| expect(result.mimeType).toBe('image/png'); | ||
| }); | ||
|
|
||
| it('replaces file part data with placeholder', () => { | ||
| const input = { type: 'file', data: SAMPLE_BASE64, mimeType: 'application/pdf' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toMatch(/^\[binary data ~\d+ bytes/); | ||
| }); | ||
|
|
||
| it('leaves already-blob-uri data untouched', () => { | ||
| const input = { type: 'image', data: 'blob://some/key', mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe('blob://some/key'); | ||
| }); | ||
|
|
||
| it('leaves http URLs untouched', () => { | ||
| const input = { type: 'image', data: 'https://example.com/img.png', mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe('https://example.com/img.png'); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Minor: Missing test for Issue: Line 30 of the implementation checks Why: Data URIs are a common format for inline images. Without a test, a refactor could accidentally remove the check, causing data URIs to be re-uploaded unnecessarily. Fix: Add test cases: it('leaves data: URIs untouched', () => {
const dataUri = 'data:image/png;base64,' + LARGE_BASE64;
const input = { type: 'image', data: dataUri, mimeType: 'image/png' };
const result = stripBinaryDataForObservability(input) as any;
expect(result.data).toBe(dataUri);
}); |
||
| }); | ||
|
|
||
| it('strips data: URIs', () => { | ||
| const input = { | ||
| type: 'image', | ||
| data: 'data:image/png;base64,iVBORw0KGgo=', | ||
| mimeType: 'image/png', | ||
| }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toMatch(/^\[binary data/); | ||
| }); | ||
|
|
||
| it('recursively strips nested binary parts', () => { | ||
| const input = { | ||
| toolResult: [ | ||
| { type: 'text', text: 'Ticket info' }, | ||
| { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/jpeg' }, | ||
| ], | ||
| }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket info' }); | ||
| expect(result.toolResult[1].data).toMatch(/^\[binary data/); | ||
| }); | ||
|
|
||
| it('handles arrays at top level', () => { | ||
| const input = [ | ||
| { type: 'text', text: 'hi' }, | ||
| { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }, | ||
| ]; | ||
| const result = stripBinaryDataForObservability(input) as any[]; | ||
| expect(result[0]).toEqual({ type: 'text', text: 'hi' }); | ||
| expect(result[1].data).toMatch(/^\[binary data/); | ||
| }); | ||
|
|
||
| it('passes through non-object primitives unchanged', () => { | ||
| expect(stripBinaryDataForObservability('hello')).toBe('hello'); | ||
| expect(stripBinaryDataForObservability(42)).toBe(42); | ||
| expect(stripBinaryDataForObservability(null)).toBeNull(); | ||
| }); | ||
|
|
||
| it('handles circular references safely', () => { | ||
| const input: Record<string, unknown> = { type: 'container' }; | ||
| input.self = input; | ||
|
|
||
| const result = stripBinaryDataForObservability(input) as Record<string, unknown>; | ||
| expect(result.type).toBe('container'); | ||
| expect(result.self).toBe('[Circular Reference]'); | ||
| }); | ||
| }); | ||
|
|
||
| describe('sanitizeArtifactBinaryData', () => { | ||
| let mockUpload: ReturnType<typeof vi.fn>; | ||
|
|
||
| beforeEach(async () => { | ||
| mockUpload = vi.fn().mockResolvedValue(undefined); | ||
| const { getBlobStorageProvider } = await import('../blob-storage/index'); | ||
| vi.mocked(getBlobStorageProvider).mockReturnValue({ | ||
| upload: mockUpload, | ||
| download: vi.fn(), | ||
| delete: vi.fn(), | ||
| }); | ||
| }); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Minor: Missing test for upload failure handling Issue: The test suite mocks Why: Given that Fix: Add a test case: it('propagates upload errors when blob storage fails', async () => {
mockUpload.mockRejectedValueOnce(new Error('Storage quota exceeded'));
const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
await expect(sanitizeArtifactBinaryData(input, CTX)).rejects.toThrow('Storage quota exceeded');
});Refs: |
||
|
|
||
| it('uploads an inline image part and replaces data with blob:// URI', async () => { | ||
| const input = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
|
|
||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| expect(result.type).toBe('image'); | ||
| expect(result.data).toMatch(/^blob:\/\//); | ||
| expect(result.mimeType).toBe('image/png'); | ||
| }); | ||
|
|
||
| it('preserves non-binary fields on the image part', async () => { | ||
| const input = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/jpeg', extra: 'keep' }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(result.extra).toBe('keep'); | ||
| }); | ||
|
|
||
| it('does not re-upload data that is already a blob:// URI', async () => { | ||
| const input = { type: 'image', data: 'blob://v1/t_x/artifact-data/p_y/a_z/sha256-abc.png' }; | ||
| await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(mockUpload).not.toHaveBeenCalled(); | ||
| }); | ||
|
|
||
| it('recursively sanitizes nested structures', async () => { | ||
| const input = { | ||
| toolResult: [ | ||
| { type: 'text', text: 'Ticket data' }, | ||
| { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }, | ||
| ], | ||
| toolName: 'get-zendesk-ticket', | ||
| }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
|
|
||
| expect(result.toolName).toBe('get-zendesk-ticket'); | ||
| expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket data' }); | ||
| expect(result.toolResult[1].data).toMatch(/^blob:\/\//); | ||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| }); | ||
|
|
||
| it('uploads multiple image parts independently', async () => { | ||
| const input = { | ||
| images: [ | ||
| { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }, | ||
| { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/jpeg' }, | ||
| ], | ||
| }; | ||
| await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(mockUpload).toHaveBeenCalledTimes(2); | ||
| }); | ||
|
|
||
| it('leaves non-binary values unchanged', async () => { | ||
| const input = { | ||
| toolName: 'search', | ||
| toolInput: { query: 'test' }, | ||
| count: 5, | ||
| flag: true, | ||
| }; | ||
| const result = await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(result).toEqual(input); | ||
| expect(mockUpload).not.toHaveBeenCalled(); | ||
| }); | ||
|
|
||
| it('produces a deterministic blob:// URI via content hash', async () => { | ||
| const input = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }; | ||
| const r1 = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| const r2 = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(r1.data).toBe(r2.data); | ||
| }); | ||
|
|
||
| it('handles circular references safely', async () => { | ||
| const input: Record<string, unknown> = { | ||
| toolResult: [{ type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }], | ||
| }; | ||
| input.self = input; | ||
|
|
||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as Record<string, unknown>; | ||
| expect(result.self).toBe('[Circular Reference]'); | ||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| }); | ||
|
|
||
| it('uploads data: URI parts, extracting base64 payload and mimeType from the prefix', async () => { | ||
| const base64Payload = Buffer.from('png bytes').toString('base64'); | ||
| const input = { | ||
| type: 'image', | ||
| data: `data:image/png;base64,${base64Payload}`, | ||
| mimeType: 'image/jpeg', | ||
| }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| const { data: uploadedData, contentType } = mockUpload.mock.calls[0][0]; | ||
| expect(uploadedData).toEqual(Buffer.from(base64Payload, 'base64')); | ||
| expect(contentType).toBe('image/png'); | ||
| expect(result.data).toMatch(/^blob:\/\//); | ||
| }); | ||
|
|
||
| it('returns original inline data when upload fails', async () => { | ||
| mockUpload.mockRejectedValueOnce(new Error('storage unavailable')); | ||
| const input = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(result.data).toBe(SAMPLE_BASE64); | ||
| expect(result.type).toBe('image'); | ||
| }); | ||
| }); | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,161 @@ | ||||||
| import { createHash } from 'node:crypto'; | ||||||
| import { getExtensionFromMimeType } from '@inkeep/agents-core/constants/allowed-file-formats'; | ||||||
| import { getLogger } from '../../../../logger'; | ||||||
| import { parseDataUri } from '../../utils/message-parts'; | ||||||
| import { getBlobStorageProvider, isBlobUri, toBlobUri } from './index'; | ||||||
| import { buildStorageKey } from './storage-keys'; | ||||||
|
|
||||||
| const logger = getLogger('artifact-binary-sanitizer'); | ||||||
|
|
||||||
| export interface ArtifactBinaryContext { | ||||||
| tenantId: string; | ||||||
| projectId: string; | ||||||
| artifactId: string; | ||||||
| } | ||||||
|
|
||||||
| type InlineBinaryPart = { | ||||||
| type: 'image' | 'file'; | ||||||
| data: string; | ||||||
| mimeType?: string; | ||||||
| [key: string]: unknown; | ||||||
| }; | ||||||
|
|
||||||
| const CIRCULAR_REFERENCE_PLACEHOLDER = '[Circular Reference]'; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Minor: Magic number threshold lacks documentation Issue: The Why: Without documentation, it's unclear whether this threshold was deliberately chosen or if a larger value (e.g., 1KB) would better distinguish binary content from short encoded strings. Fix:
Suggested change
Then use the constant in the check below. |
||||||
|
|
||||||
| function isInlineBinaryPart(value: unknown): value is InlineBinaryPart { | ||||||
| if (typeof value !== 'object' || value === null) return false; | ||||||
| const v = value as Record<string, unknown>; | ||||||
| return ( | ||||||
| (v.type === 'image' || v.type === 'file') && | ||||||
| typeof v.data === 'string' && | ||||||
| !isBlobUri(v.data) && | ||||||
| !v.data.startsWith('http') | ||||||
| ); | ||||||
| } | ||||||
|
|
||||||
| async function uploadInlinePart( | ||||||
| part: InlineBinaryPart, | ||||||
| ctx: ArtifactBinaryContext | ||||||
| ): Promise<InlineBinaryPart> { | ||||||
| const storage = getBlobStorageProvider(); | ||||||
| const parsed = parseDataUri(part.data); | ||||||
| const base64Data = parsed ? parsed.base64Data : part.data; | ||||||
| const mimeType = parsed?.mimeType ?? part.mimeType ?? 'application/octet-stream'; | ||||||
| const buffer = Buffer.from(base64Data, 'base64'); | ||||||
| const contentHash = createHash('sha256').update(buffer).digest('hex'); | ||||||
| const ext = getExtensionFromMimeType(mimeType); | ||||||
|
|
||||||
| const key = buildStorageKey({ | ||||||
| category: 'artifact-data', | ||||||
| tenantId: ctx.tenantId, | ||||||
| projectId: ctx.projectId, | ||||||
| artifactId: ctx.artifactId, | ||||||
| contentHash, | ||||||
| ext, | ||||||
| }); | ||||||
|
|
||||||
| try { | ||||||
| await storage.upload({ key, data: buffer, contentType: mimeType }); | ||||||
| } catch (error) { | ||||||
| logger.error( | ||||||
| { | ||||||
| error: error instanceof Error ? error.message : String(error), | ||||||
| tenantId: ctx.tenantId, | ||||||
| projectId: ctx.projectId, | ||||||
| artifactId: ctx.artifactId, | ||||||
| mimeType, | ||||||
| size: buffer.length, | ||||||
| }, | ||||||
| 'Failed to upload artifact binary data to blob storage, returning original inline data' | ||||||
| ); | ||||||
| return part; | ||||||
| } | ||||||
|
|
||||||
| return { ...part, data: toBlobUri(key) }; | ||||||
| } | ||||||
|
|
||||||
| export async function sanitizeArtifactBinaryData( | ||||||
| value: unknown, | ||||||
| ctx: ArtifactBinaryContext | ||||||
| ): Promise<unknown> { | ||||||
| const inStack = new WeakSet<object>(); | ||||||
|
|
||||||
| const visit = async (current: unknown): Promise<unknown> => { | ||||||
| if (isInlineBinaryPart(current)) { | ||||||
| return uploadInlinePart(current, ctx); | ||||||
| } | ||||||
| if (Array.isArray(current)) { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| return Promise.all(current.map((item) => visit(item))); | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| if (current !== null && typeof current === 'object') { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| const result: Record<string, unknown> = {}; | ||||||
| for (const [k, v] of Object.entries(current as Record<string, unknown>)) { | ||||||
| result[k] = await visit(v); | ||||||
| } | ||||||
| return result; | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| return current; | ||||||
| }; | ||||||
|
|
||||||
| return visit(value); | ||||||
| } | ||||||
|
|
||||||
| export function stripBinaryDataForObservability(value: unknown): unknown { | ||||||
| const inStack = new WeakSet<object>(); | ||||||
|
|
||||||
| const visit = (current: unknown): unknown => { | ||||||
| if (isInlineBinaryPart(current)) { | ||||||
| const part = current; | ||||||
| const approxBytes = Math.round(part.data.length * 0.75); | ||||||
| return { | ||||||
| ...part, | ||||||
| data: `[binary data ~${approxBytes} bytes, mimeType: ${part.mimeType ?? 'unknown'}]`, | ||||||
| }; | ||||||
| } | ||||||
| if (Array.isArray(current)) { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| return current.map(visit); | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| if (current !== null && typeof current === 'object') { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| const result: Record<string, unknown> = {}; | ||||||
| for (const [k, v] of Object.entries(current as Record<string, unknown>)) { | ||||||
| result[k] = visit(v); | ||||||
| } | ||||||
| return result; | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| return current; | ||||||
| }; | ||||||
|
|
||||||
| return visit(value); | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💭 Consider: Missing boundary test for threshold change
Issue: The threshold changed from
> 100to> 1, but no test verifies this boundary.SAMPLE_BASE64(~24 chars) passes both thresholds.Why: Without a boundary test, a future refactor could accidentally change
> 1to>= 1or back to> 100without test failure.Fix: Consider adding boundary tests: