-
Notifications
You must be signed in to change notification settings - Fork 116
Blob storage artifact-data keys, binary sanitizer, and observability stripping #2744
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 10 commits
8eea5bf
c7043e0
a96c949
4e1416a
37a910e
513ed69
393705c
04ecfa7
12558e9
4c9a209
f5f09e0
7dd41d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| import { beforeEach, describe, expect, it, vi } from 'vitest'; | ||
| import { | ||
| sanitizeArtifactBinaryData, | ||
| stripBinaryDataForObservability, | ||
| } from '../blob-storage/artifact-binary-sanitizer'; | ||
|
|
||
| vi.mock('../blob-storage/index', () => ({ | ||
| getBlobStorageProvider: vi.fn(), | ||
| isBlobUri: (s: string) => s.startsWith('blob://'), | ||
| toBlobUri: (key: string) => `blob://${key}`, | ||
| fromBlobUri: (uri: string) => uri.slice('blob://'.length), | ||
| BLOB_URI_PREFIX: 'blob://', | ||
| })); | ||
|
|
||
| vi.mock('../blob-storage/storage-keys', () => ({ | ||
| buildStorageKey: vi.fn( | ||
| (input: any) => | ||
| `v1/t_${input.tenantId}/artifact-data/p_${input.projectId}/a_${input.artifactId}/sha256-${input.contentHash}.${input.ext}` | ||
| ), | ||
| })); | ||
|
|
||
| const SMALL_BASE64 = 'aGVsbG8='; | ||
| const LARGE_BASE64 = Buffer.from('x'.repeat(200)).toString('base64'); | ||
|
|
||
| const CTX = { tenantId: 'tenant-1', projectId: 'proj-1', artifactId: 'art-1' }; | ||
|
|
||
| describe('stripBinaryDataForObservability', () => { | ||
| it('replaces image part data with placeholder', () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.type).toBe('image'); | ||
| expect(result.data).toMatch(/^\[binary data ~\d+ bytes, mimeType: image\/png\]$/); | ||
| expect(result.mimeType).toBe('image/png'); | ||
| }); | ||
|
|
||
| it('replaces file part data with placeholder', () => { | ||
| const input = { type: 'file', data: LARGE_BASE64, mimeType: 'application/pdf' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toMatch(/^\[binary data ~\d+ bytes/); | ||
| }); | ||
|
|
||
| it('leaves already-blob-uri data untouched', () => { | ||
| const input = { type: 'image', data: 'blob://some/key', mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe('blob://some/key'); | ||
| }); | ||
|
|
||
| it('leaves small strings untouched (below 100 char threshold)', () => { | ||
| const input = { type: 'image', data: SMALL_BASE64, mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe(SMALL_BASE64); | ||
| }); | ||
|
|
||
| it('leaves http URLs untouched', () => { | ||
| const input = { type: 'image', data: 'https://example.com/img.png', mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe('https://example.com/img.png'); | ||
| }); | ||
|
|
||
| it('recursively strips nested binary parts', () => { | ||
| const input = { | ||
| toolResult: [ | ||
| { type: 'text', text: 'Ticket info' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg' }, | ||
| ], | ||
| }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket info' }); | ||
| expect(result.toolResult[1].data).toMatch(/^\[binary data/); | ||
| }); | ||
|
|
||
| it('handles arrays at top level', () => { | ||
| const input = [ | ||
| { type: 'text', text: 'hi' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }, | ||
| ]; | ||
| const result = stripBinaryDataForObservability(input) as any[]; | ||
| expect(result[0]).toEqual({ type: 'text', text: 'hi' }); | ||
| expect(result[1].data).toMatch(/^\[binary data/); | ||
| }); | ||
|
|
||
| it('passes through non-object primitives unchanged', () => { | ||
| expect(stripBinaryDataForObservability('hello')).toBe('hello'); | ||
| expect(stripBinaryDataForObservability(42)).toBe(42); | ||
| expect(stripBinaryDataForObservability(null)).toBeNull(); | ||
| }); | ||
|
|
||
| it('handles circular references safely', () => { | ||
| const input: Record<string, unknown> = { type: 'container' }; | ||
| input.self = input; | ||
|
|
||
| const result = stripBinaryDataForObservability(input) as Record<string, unknown>; | ||
| expect(result.type).toBe('container'); | ||
| expect(result.self).toBe('[Circular Reference]'); | ||
| }); | ||
| }); | ||
|
|
||
| describe('sanitizeArtifactBinaryData', () => { | ||
| let mockUpload: ReturnType<typeof vi.fn>; | ||
|
|
||
| beforeEach(async () => { | ||
| mockUpload = vi.fn().mockResolvedValue(undefined); | ||
| const { getBlobStorageProvider } = await import('../blob-storage/index'); | ||
| vi.mocked(getBlobStorageProvider).mockReturnValue({ | ||
| upload: mockUpload, | ||
| download: vi.fn(), | ||
| delete: vi.fn(), | ||
| }); | ||
| }); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Minor: Missing test for upload failure handling Issue: The test suite mocks Why: Given that Fix: Add a test case: it('propagates upload errors when blob storage fails', async () => {
mockUpload.mockRejectedValueOnce(new Error('Storage quota exceeded'));
const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
await expect(sanitizeArtifactBinaryData(input, CTX)).rejects.toThrow('Storage quota exceeded');
});Refs: |
||
|
|
||
| it('uploads an inline image part and replaces data with blob:// URI', async () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
|
|
||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| expect(result.type).toBe('image'); | ||
| expect(result.data).toMatch(/^blob:\/\//); | ||
| expect(result.mimeType).toBe('image/png'); | ||
| }); | ||
|
|
||
| it('preserves non-binary fields on the image part', async () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg', extra: 'keep' }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(result.extra).toBe('keep'); | ||
| }); | ||
|
|
||
| it('does not re-upload data that is already a blob:// URI', async () => { | ||
| const input = { type: 'image', data: 'blob://v1/t_x/artifact-data/p_y/a_z/sha256-abc.png' }; | ||
| await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(mockUpload).not.toHaveBeenCalled(); | ||
| }); | ||
|
|
||
| it('recursively sanitizes nested structures', async () => { | ||
| const input = { | ||
| toolResult: [ | ||
| { type: 'text', text: 'Ticket data' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }, | ||
| ], | ||
| toolName: 'get-zendesk-ticket', | ||
| }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
|
|
||
| expect(result.toolName).toBe('get-zendesk-ticket'); | ||
| expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket data' }); | ||
| expect(result.toolResult[1].data).toMatch(/^blob:\/\//); | ||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| }); | ||
|
|
||
| it('uploads multiple image parts independently', async () => { | ||
| const input = { | ||
| images: [ | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg' }, | ||
| ], | ||
| }; | ||
| await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(mockUpload).toHaveBeenCalledTimes(2); | ||
| }); | ||
|
|
||
| it('leaves non-binary values unchanged', async () => { | ||
| const input = { | ||
| toolName: 'search', | ||
| toolInput: { query: 'test' }, | ||
| count: 5, | ||
| flag: true, | ||
| }; | ||
| const result = await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(result).toEqual(input); | ||
| expect(mockUpload).not.toHaveBeenCalled(); | ||
| }); | ||
|
|
||
| it('produces a deterministic blob:// URI via content hash', async () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }; | ||
| const r1 = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| const r2 = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(r1.data).toBe(r2.data); | ||
| }); | ||
|
|
||
| it('handles circular references safely', async () => { | ||
| const input: Record<string, unknown> = { | ||
| toolResult: [{ type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }], | ||
| }; | ||
| input.self = input; | ||
|
|
||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as Record<string, unknown>; | ||
| expect(result.self).toBe('[Circular Reference]'); | ||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| }); | ||
| }); | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,142 @@ | ||||||
| import { createHash } from 'node:crypto'; | ||||||
| import { getExtensionFromMimeType } from '@inkeep/agents-core/constants/allowed-file-formats'; | ||||||
| import { getBlobStorageProvider, isBlobUri, toBlobUri } from './index'; | ||||||
| import { buildStorageKey } from './storage-keys'; | ||||||
|
|
||||||
| export interface ArtifactBinaryContext { | ||||||
| tenantId: string; | ||||||
| projectId: string; | ||||||
| artifactId: string; | ||||||
| } | ||||||
|
|
||||||
| type InlineBinaryPart = { | ||||||
| type: 'image' | 'file'; | ||||||
| data: string; | ||||||
| mimeType?: string; | ||||||
| [key: string]: unknown; | ||||||
| }; | ||||||
|
|
||||||
| const CIRCULAR_REFERENCE_PLACEHOLDER = '[Circular Reference]'; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Minor: Magic number threshold lacks documentation Issue: The Why: Without documentation, it's unclear whether this threshold was deliberately chosen or if a larger value (e.g., 1KB) would better distinguish binary content from short encoded strings. Fix:
Suggested change
Then use the constant in the check below. |
||||||
|
|
||||||
| function isInlineBinaryPart(value: unknown): value is InlineBinaryPart { | ||||||
| if (typeof value !== 'object' || value === null) return false; | ||||||
| const v = value as Record<string, unknown>; | ||||||
| return ( | ||||||
| (v.type === 'image' || v.type === 'file') && | ||||||
| typeof v.data === 'string' && | ||||||
| v.data.length > 100 && | ||||||
| !isBlobUri(v.data) && | ||||||
| !v.data.startsWith('http') && | ||||||
| !v.data.startsWith('data:') | ||||||
| ); | ||||||
| } | ||||||
|
|
||||||
| async function uploadInlinePart( | ||||||
| part: InlineBinaryPart, | ||||||
| ctx: ArtifactBinaryContext | ||||||
| ): Promise<InlineBinaryPart> { | ||||||
| const storage = getBlobStorageProvider(); | ||||||
| const buffer = Buffer.from(part.data, 'base64'); | ||||||
| const mimeType = part.mimeType ?? 'application/octet-stream'; | ||||||
| const contentHash = createHash('sha256').update(buffer).digest('hex'); | ||||||
| const ext = getExtensionFromMimeType(mimeType); | ||||||
|
|
||||||
| const key = buildStorageKey({ | ||||||
| category: 'artifact-data', | ||||||
| tenantId: ctx.tenantId, | ||||||
| projectId: ctx.projectId, | ||||||
| artifactId: ctx.artifactId, | ||||||
| contentHash, | ||||||
| ext, | ||||||
| }); | ||||||
|
|
||||||
| await storage.upload({ key, data: buffer, contentType: mimeType }); | ||||||
|
||||||
|
|
||||||
| return { ...part, data: toBlobUri(key) }; | ||||||
| } | ||||||
|
|
||||||
| export async function sanitizeArtifactBinaryData( | ||||||
| value: unknown, | ||||||
| ctx: ArtifactBinaryContext | ||||||
| ): Promise<unknown> { | ||||||
| const inStack = new WeakSet<object>(); | ||||||
|
|
||||||
| const visit = async (current: unknown): Promise<unknown> => { | ||||||
| if (isInlineBinaryPart(current)) { | ||||||
| return uploadInlinePart(current, ctx); | ||||||
| } | ||||||
| if (Array.isArray(current)) { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| return Promise.all(current.map((item) => visit(item))); | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| if (current !== null && typeof current === 'object') { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| const result: Record<string, unknown> = {}; | ||||||
| for (const [k, v] of Object.entries(current as Record<string, unknown>)) { | ||||||
| result[k] = await visit(v); | ||||||
| } | ||||||
| return result; | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| return current; | ||||||
| }; | ||||||
|
|
||||||
| return visit(value); | ||||||
| } | ||||||
|
|
||||||
| export function stripBinaryDataForObservability(value: unknown): unknown { | ||||||
| const inStack = new WeakSet<object>(); | ||||||
|
|
||||||
| const visit = (current: unknown): unknown => { | ||||||
| if (isInlineBinaryPart(current)) { | ||||||
| const part = current as InlineBinaryPart; | ||||||
| const approxBytes = Math.round(part.data.length * 0.75); | ||||||
| return { | ||||||
| ...part, | ||||||
| data: `[binary data ~${approxBytes} bytes, mimeType: ${part.mimeType ?? 'unknown'}]`, | ||||||
| }; | ||||||
| } | ||||||
| if (Array.isArray(current)) { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| return current.map(visit); | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| if (current !== null && typeof current === 'object') { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| const result: Record<string, unknown> = {}; | ||||||
| for (const [k, v] of Object.entries(current as Record<string, unknown>)) { | ||||||
| result[k] = visit(v); | ||||||
| } | ||||||
| return result; | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| return current; | ||||||
| }; | ||||||
|
|
||||||
| return visit(value); | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 Minor: Missing test for
data:URI handlingIssue: Line 30 of the implementation checks
!v.data.startsWith('data:')but there's no explicit test verifying data URIs are left untouched.Why: Data URIs are a common format for inline images. Without a test, a refactor could accidentally remove the check, causing data URIs to be re-uploaded unnecessarily.
Fix: Add test cases: