-
Notifications
You must be signed in to change notification settings - Fork 111
Blob storage artifact-data keys, binary sanitizer, and observability stripping #2744
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: stack/history_shape
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| import { beforeEach, describe, expect, it, vi } from 'vitest'; | ||
| import { | ||
| sanitizeArtifactBinaryData, | ||
| stripBinaryDataForObservability, | ||
| } from '../blob-storage/artifact-binary-sanitizer'; | ||
|
|
||
| vi.mock('../blob-storage/index', () => ({ | ||
| getBlobStorageProvider: vi.fn(), | ||
| isBlobUri: (s: string) => s.startsWith('blob://'), | ||
| toBlobUri: (key: string) => `blob://${key}`, | ||
| fromBlobUri: (uri: string) => uri.slice('blob://'.length), | ||
| BLOB_URI_PREFIX: 'blob://', | ||
| })); | ||
|
|
||
| vi.mock('../blob-storage/storage-keys', () => ({ | ||
| buildStorageKey: vi.fn( | ||
| (input: any) => | ||
| `v1/t_${input.tenantId}/artifact-data/p_${input.projectId}/a_${input.artifactId}/sha256-${input.contentHash}.${input.ext}` | ||
| ), | ||
| })); | ||
|
|
||
| const SMALL_BASE64 = 'aGVsbG8='; | ||
| const LARGE_BASE64 = Buffer.from('x'.repeat(200)).toString('base64'); | ||
|
|
||
| const CTX = { tenantId: 'tenant-1', projectId: 'proj-1', artifactId: 'art-1' }; | ||
|
|
||
| describe('stripBinaryDataForObservability', () => { | ||
| it('replaces image part data with placeholder', () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.type).toBe('image'); | ||
| expect(result.data).toMatch(/^\[binary data ~\d+ bytes, mimeType: image\/png\]$/); | ||
| expect(result.mimeType).toBe('image/png'); | ||
| }); | ||
|
|
||
| it('replaces file part data with placeholder', () => { | ||
| const input = { type: 'file', data: LARGE_BASE64, mimeType: 'application/pdf' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toMatch(/^\[binary data ~\d+ bytes/); | ||
| }); | ||
|
|
||
| it('leaves already-blob-uri data untouched', () => { | ||
| const input = { type: 'image', data: 'blob://some/key', mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe('blob://some/key'); | ||
| }); | ||
|
|
||
| it('leaves small strings untouched (below 100 char threshold)', () => { | ||
| const input = { type: 'image', data: SMALL_BASE64, mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe(SMALL_BASE64); | ||
| }); | ||
|
|
||
| it('leaves http URLs untouched', () => { | ||
| const input = { type: 'image', data: 'https://example.com/img.png', mimeType: 'image/png' }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.data).toBe('https://example.com/img.png'); | ||
| }); | ||
|
|
||
| it('recursively strips nested binary parts', () => { | ||
| const input = { | ||
| toolResult: [ | ||
| { type: 'text', text: 'Ticket info' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg' }, | ||
| ], | ||
| }; | ||
| const result = stripBinaryDataForObservability(input) as any; | ||
| expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket info' }); | ||
| expect(result.toolResult[1].data).toMatch(/^\[binary data/); | ||
| }); | ||
|
|
||
| it('handles arrays at top level', () => { | ||
| const input = [ | ||
| { type: 'text', text: 'hi' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }, | ||
| ]; | ||
| const result = stripBinaryDataForObservability(input) as any[]; | ||
| expect(result[0]).toEqual({ type: 'text', text: 'hi' }); | ||
| expect(result[1].data).toMatch(/^\[binary data/); | ||
| }); | ||
|
|
||
| it('passes through non-object primitives unchanged', () => { | ||
| expect(stripBinaryDataForObservability('hello')).toBe('hello'); | ||
| expect(stripBinaryDataForObservability(42)).toBe(42); | ||
| expect(stripBinaryDataForObservability(null)).toBeNull(); | ||
| }); | ||
|
|
||
| it('handles circular references safely', () => { | ||
| const input: Record<string, unknown> = { type: 'container' }; | ||
| input.self = input; | ||
|
|
||
| const result = stripBinaryDataForObservability(input) as Record<string, unknown>; | ||
| expect(result.type).toBe('container'); | ||
| expect(result.self).toBe('[Circular Reference]'); | ||
| }); | ||
| }); | ||
|
|
||
| describe('sanitizeArtifactBinaryData', () => { | ||
| let mockUpload: ReturnType<typeof vi.fn>; | ||
|
|
||
| beforeEach(async () => { | ||
| mockUpload = vi.fn().mockResolvedValue(undefined); | ||
| const { getBlobStorageProvider } = await import('../blob-storage/index'); | ||
| vi.mocked(getBlobStorageProvider).mockReturnValue({ | ||
| upload: mockUpload, | ||
| download: vi.fn(), | ||
| delete: vi.fn(), | ||
| }); | ||
| }); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Minor: Missing test for upload failure handling Issue: The test suite mocks Why: Given that Fix: Add a test case: it('propagates upload errors when blob storage fails', async () => {
mockUpload.mockRejectedValueOnce(new Error('Storage quota exceeded'));
const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
await expect(sanitizeArtifactBinaryData(input, CTX)).rejects.toThrow('Storage quota exceeded');
});Refs: |
||
|
|
||
| it('uploads an inline image part and replaces data with blob:// URI', async () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
|
|
||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| expect(result.type).toBe('image'); | ||
| expect(result.data).toMatch(/^blob:\/\//); | ||
| expect(result.mimeType).toBe('image/png'); | ||
| }); | ||
|
|
||
| it('preserves non-binary fields on the image part', async () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg', extra: 'keep' }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(result.extra).toBe('keep'); | ||
| }); | ||
|
|
||
| it('does not re-upload data that is already a blob:// URI', async () => { | ||
| const input = { type: 'image', data: 'blob://v1/t_x/artifact-data/p_y/a_z/sha256-abc.png' }; | ||
| await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(mockUpload).not.toHaveBeenCalled(); | ||
| }); | ||
|
|
||
| it('recursively sanitizes nested structures', async () => { | ||
| const input = { | ||
| toolResult: [ | ||
| { type: 'text', text: 'Ticket data' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }, | ||
| ], | ||
| toolName: 'get-zendesk-ticket', | ||
| }; | ||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
|
|
||
| expect(result.toolName).toBe('get-zendesk-ticket'); | ||
| expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket data' }); | ||
| expect(result.toolResult[1].data).toMatch(/^blob:\/\//); | ||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| }); | ||
|
|
||
| it('uploads multiple image parts independently', async () => { | ||
| const input = { | ||
| images: [ | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }, | ||
| { type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg' }, | ||
| ], | ||
| }; | ||
| await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(mockUpload).toHaveBeenCalledTimes(2); | ||
| }); | ||
|
|
||
| it('leaves non-binary values unchanged', async () => { | ||
| const input = { | ||
| toolName: 'search', | ||
| toolInput: { query: 'test' }, | ||
| count: 5, | ||
| flag: true, | ||
| }; | ||
| const result = await sanitizeArtifactBinaryData(input, CTX); | ||
| expect(result).toEqual(input); | ||
| expect(mockUpload).not.toHaveBeenCalled(); | ||
| }); | ||
|
|
||
| it('produces a deterministic blob:// URI via content hash', async () => { | ||
| const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }; | ||
| const r1 = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| const r2 = (await sanitizeArtifactBinaryData(input, CTX)) as any; | ||
| expect(r1.data).toBe(r2.data); | ||
| }); | ||
|
|
||
| it('handles circular references safely', async () => { | ||
| const input: Record<string, unknown> = { | ||
| toolResult: [{ type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }], | ||
| }; | ||
| input.self = input; | ||
|
|
||
| const result = (await sanitizeArtifactBinaryData(input, CTX)) as Record<string, unknown>; | ||
| expect(result.self).toBe('[Circular Reference]'); | ||
| expect(mockUpload).toHaveBeenCalledOnce(); | ||
| }); | ||
| }); | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,142 @@ | ||||||
| import { createHash } from 'node:crypto'; | ||||||
| import { getExtensionFromMimeType } from '@inkeep/agents-core/constants/allowed-image-formats'; | ||||||
| import { getBlobStorageProvider, isBlobUri, toBlobUri } from './index'; | ||||||
| import { buildStorageKey } from './storage-keys'; | ||||||
|
|
||||||
| export interface ArtifactBinaryContext { | ||||||
| tenantId: string; | ||||||
| projectId: string; | ||||||
| artifactId: string; | ||||||
| } | ||||||
|
|
||||||
| type InlineBinaryPart = { | ||||||
| type: 'image' | 'file'; | ||||||
| data: string; | ||||||
| mimeType?: string; | ||||||
| [key: string]: unknown; | ||||||
| }; | ||||||
|
|
||||||
| const CIRCULAR_REFERENCE_PLACEHOLDER = '[Circular Reference]'; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Minor: Magic number threshold lacks documentation Issue: The Why: Without documentation, it's unclear whether this threshold was deliberately chosen or if a larger value (e.g., 1KB) would better distinguish binary content from short encoded strings. Fix:
Suggested change
Then use the constant in the check below. |
||||||
|
|
||||||
| function isInlineBinaryPart(value: unknown): value is InlineBinaryPart { | ||||||
| if (typeof value !== 'object' || value === null) return false; | ||||||
| const v = value as Record<string, unknown>; | ||||||
| return ( | ||||||
| (v.type === 'image' || v.type === 'file') && | ||||||
| typeof v.data === 'string' && | ||||||
| v.data.length > 100 && | ||||||
| !isBlobUri(v.data) && | ||||||
| !v.data.startsWith('http') && | ||||||
| !v.data.startsWith('data:') | ||||||
| ); | ||||||
| } | ||||||
|
|
||||||
| async function uploadInlinePart( | ||||||
| part: InlineBinaryPart, | ||||||
| ctx: ArtifactBinaryContext | ||||||
| ): Promise<InlineBinaryPart> { | ||||||
| const storage = getBlobStorageProvider(); | ||||||
| const buffer = Buffer.from(part.data, 'base64'); | ||||||
| const mimeType = part.mimeType ?? 'application/octet-stream'; | ||||||
| const contentHash = createHash('sha256').update(buffer).digest('hex'); | ||||||
| const ext = getExtensionFromMimeType(mimeType); | ||||||
|
|
||||||
| const key = buildStorageKey({ | ||||||
| category: 'artifact-data', | ||||||
| tenantId: ctx.tenantId, | ||||||
| projectId: ctx.projectId, | ||||||
| artifactId: ctx.artifactId, | ||||||
| contentHash, | ||||||
| ext, | ||||||
| }); | ||||||
|
|
||||||
| await storage.upload({ key, data: buffer, contentType: mimeType }); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟠 MAJOR: Missing error handling for storage upload failures Issue: The Why: The existing pattern in Fix: Consider wrapping the upload in a try-catch: try {
await storage.upload({ key, data: buffer, contentType: mimeType });
return { ...part, data: toBlobUri(key) };
} catch (error) {
// Option 1: Return original part (fallback to base64)
return part;
// Option 2: Return placeholder
// return { ...part, data: `[upload failed, binary ~${buffer.length} bytes]` };
}Refs: |
||||||
|
|
||||||
| return { ...part, data: toBlobUri(key) }; | ||||||
| } | ||||||
|
|
||||||
| export async function sanitizeArtifactBinaryData( | ||||||
| value: unknown, | ||||||
| ctx: ArtifactBinaryContext | ||||||
| ): Promise<unknown> { | ||||||
| const inStack = new WeakSet<object>(); | ||||||
|
|
||||||
| const visit = async (current: unknown): Promise<unknown> => { | ||||||
| if (isInlineBinaryPart(current)) { | ||||||
| return uploadInlinePart(current, ctx); | ||||||
| } | ||||||
| if (Array.isArray(current)) { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| return Promise.all(current.map((item) => visit(item))); | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| if (current !== null && typeof current === 'object') { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| const result: Record<string, unknown> = {}; | ||||||
| for (const [k, v] of Object.entries(current as Record<string, unknown>)) { | ||||||
| result[k] = await visit(v); | ||||||
| } | ||||||
| return result; | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| return current; | ||||||
| }; | ||||||
|
|
||||||
| return visit(value); | ||||||
| } | ||||||
|
|
||||||
| export function stripBinaryDataForObservability(value: unknown): unknown { | ||||||
| const inStack = new WeakSet<object>(); | ||||||
|
|
||||||
| const visit = (current: unknown): unknown => { | ||||||
| if (isInlineBinaryPart(current)) { | ||||||
| const part = current as InlineBinaryPart; | ||||||
| const approxBytes = Math.round(part.data.length * 0.75); | ||||||
| return { | ||||||
| ...part, | ||||||
| data: `[binary data ~${approxBytes} bytes, mimeType: ${part.mimeType ?? 'unknown'}]`, | ||||||
| }; | ||||||
| } | ||||||
| if (Array.isArray(current)) { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| return current.map(visit); | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| if (current !== null && typeof current === 'object') { | ||||||
| if (inStack.has(current)) { | ||||||
| return CIRCULAR_REFERENCE_PLACEHOLDER; | ||||||
| } | ||||||
| inStack.add(current); | ||||||
| try { | ||||||
| const result: Record<string, unknown> = {}; | ||||||
| for (const [k, v] of Object.entries(current as Record<string, unknown>)) { | ||||||
| result[k] = visit(v); | ||||||
| } | ||||||
| return result; | ||||||
| } finally { | ||||||
| inStack.delete(current); | ||||||
| } | ||||||
| } | ||||||
| return current; | ||||||
| }; | ||||||
|
|
||||||
| return visit(value); | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 Minor: Missing test for
data:URI handlingIssue: Line 30 of the implementation checks
!v.data.startsWith('data:')but there's no explicit test verifying data URIs are left untouched.Why: Data URIs are a common format for inline images. Without a test, a refactor could accidentally remove the check, causing data URIs to be re-uploaded unnecessarily.
Fix: Add test cases: