Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import {
sanitizeArtifactBinaryData,
stripBinaryDataForObservability,
} from '../blob-storage/artifact-binary-sanitizer';

// Stub the blob-storage module so no test ever touches a real provider;
// the URI helpers mirror the real blob:// prefix handling.
vi.mock('../blob-storage/index', () => ({
  getBlobStorageProvider: vi.fn(),
  isBlobUri: (value: string) => value.startsWith('blob://'),
  toBlobUri: (storageKey: string) => `blob://${storageKey}`,
  fromBlobUri: (blobUri: string) => blobUri.slice('blob://'.length),
  BLOB_URI_PREFIX: 'blob://',
}));

// Deterministic key builder reproducing the real v1 artifact-data layout.
vi.mock('../blob-storage/storage-keys', () => ({
  buildStorageKey: vi.fn(
    (opts: any) =>
      `v1/t_${opts.tenantId}/artifact-data/p_${opts.projectId}/a_${opts.artifactId}/sha256-${opts.contentHash}.${opts.ext}`
  ),
}));

// Small base64 payload standing in for inline binary data throughout the suite.
const SAMPLE_BASE64 = Buffer.from('sample image bytes').toString('base64');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💭 Consider: Missing boundary test for threshold change

Issue: The threshold changed from > 100 to > 1, but no test verifies this boundary. SAMPLE_BASE64 (~24 chars) passes both thresholds.

Why: Without a boundary test, a future refactor could accidentally change > 1 to >= 1 or back to > 100 without test failure.

Fix: Consider adding boundary tests:

it('treats 2-char data string as binary part', () => {
  const input = { type: 'image', data: 'AA', mimeType: 'image/png' };
  const result = stripBinaryDataForObservability(input) as any;
  expect(result.data).toMatch(/^\[binary data/);
});

it('does not treat 1-char data string as binary part', () => {
  const input = { type: 'image', data: 'A', mimeType: 'image/png' };
  const result = stripBinaryDataForObservability(input) as any;
  expect(result.data).toBe('A');
});


// Tenant/project/artifact identifiers passed as the upload context in every test.
const CTX = { tenantId: 'tenant-1', projectId: 'proj-1', artifactId: 'art-1' };

describe('stripBinaryDataForObservability', () => {
it('replaces image part data with placeholder', () => {
  const imagePart = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' };
  const stripped = stripBinaryDataForObservability(imagePart) as any;
  expect(stripped.type).toBe('image');
  expect(stripped.data).toMatch(/^\[binary data ~\d+ bytes, mimeType: image\/png\]$/);
  expect(stripped.mimeType).toBe('image/png');
});

it('replaces file part data with placeholder', () => {
  const filePart = { type: 'file', data: SAMPLE_BASE64, mimeType: 'application/pdf' };
  const stripped = stripBinaryDataForObservability(filePart) as any;
  expect(stripped.data).toMatch(/^\[binary data ~\d+ bytes/);
});

it('leaves already-blob-uri data untouched', () => {
  const blobPart = { type: 'image', data: 'blob://some/key', mimeType: 'image/png' };
  const stripped = stripBinaryDataForObservability(blobPart) as any;
  expect(stripped.data).toBe('blob://some/key');
});

it('leaves http URLs untouched', () => {
const input = { type: 'image', data: 'https://example.com/img.png', mimeType: 'image/png' };
const result = stripBinaryDataForObservability(input) as any;
expect(result.data).toBe('https://example.com/img.png');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Minor: Missing test for data: URI handling

Issue: Line 30 of the implementation checks !v.data.startsWith('data:') but there's no explicit test verifying data URIs are left untouched.

Why: Data URIs are a common format for inline images. Without a test, a refactor could accidentally remove the check, causing data URIs to be re-uploaded unnecessarily.

Fix: Add test cases:

it('leaves data: URIs untouched', () => {
  const dataUri = 'data:image/png;base64,' + LARGE_BASE64;
  const input = { type: 'image', data: dataUri, mimeType: 'image/png' };
  const result = stripBinaryDataForObservability(input) as any;
  expect(result.data).toBe(dataUri);
});

});

it('strips data: URIs', () => {
  const dataUriPart = {
    type: 'image',
    data: 'data:image/png;base64,iVBORw0KGgo=',
    mimeType: 'image/png',
  };
  const stripped = stripBinaryDataForObservability(dataUriPart) as any;
  expect(stripped.data).toMatch(/^\[binary data/);
});

it('recursively strips nested binary parts', () => {
  const payload = {
    toolResult: [
      { type: 'text', text: 'Ticket info' },
      { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/jpeg' },
    ],
  };
  const stripped = stripBinaryDataForObservability(payload) as any;
  expect(stripped.toolResult[0]).toEqual({ type: 'text', text: 'Ticket info' });
  expect(stripped.toolResult[1].data).toMatch(/^\[binary data/);
});

it('handles arrays at top level', () => {
  const parts = [
    { type: 'text', text: 'hi' },
    { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' },
  ];
  const stripped = stripBinaryDataForObservability(parts) as any[];
  expect(stripped[0]).toEqual({ type: 'text', text: 'hi' });
  expect(stripped[1].data).toMatch(/^\[binary data/);
});

it('passes through non-object primitives unchanged', () => {
  expect(stripBinaryDataForObservability('hello')).toBe('hello');
  expect(stripBinaryDataForObservability(42)).toBe(42);
  expect(stripBinaryDataForObservability(null)).toBeNull();
});

it('handles circular references safely', () => {
  const container: Record<string, unknown> = { type: 'container' };
  container.self = container;

  const stripped = stripBinaryDataForObservability(container) as Record<string, unknown>;
  expect(stripped.type).toBe('container');
  expect(stripped.self).toBe('[Circular Reference]');
});
});

describe('sanitizeArtifactBinaryData', () => {
let mockUpload: ReturnType<typeof vi.fn>;

// Fresh provider mock per test: upload resolves by default so each test can
// inspect call counts or override with a rejection.
beforeEach(async () => {
  mockUpload = vi.fn().mockResolvedValue(undefined);
  const blobStorage = await import('../blob-storage/index');
  vi.mocked(blobStorage.getBlobStorageProvider).mockReturnValue({
    upload: mockUpload,
    download: vi.fn(),
    delete: vi.fn(),
  });
});
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Minor: Missing test for upload failure handling

Issue: The test suite mocks upload to always succeed but never tests the failure path.

Why: Given that image-upload.test.ts has a test for 'drops file part when storage.upload throws' (line 148), this gap masks the missing error handling in the implementation.

Fix: Add a test case:

it('propagates upload errors when blob storage fails', async () => {
  mockUpload.mockRejectedValueOnce(new Error('Storage quota exceeded'));
  const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
  await expect(sanitizeArtifactBinaryData(input, CTX)).rejects.toThrow('Storage quota exceeded');
});

Refs:


it('uploads an inline image part and replaces data with blob:// URI', async () => {
  const part = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' };
  const sanitized = (await sanitizeArtifactBinaryData(part, CTX)) as any;

  expect(mockUpload).toHaveBeenCalledOnce();
  expect(sanitized.type).toBe('image');
  expect(sanitized.data).toMatch(/^blob:\/\//);
  expect(sanitized.mimeType).toBe('image/png');
});

it('preserves non-binary fields on the image part', async () => {
  const part = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/jpeg', extra: 'keep' };
  const sanitized = (await sanitizeArtifactBinaryData(part, CTX)) as any;
  expect(sanitized.extra).toBe('keep');
});

it('does not re-upload data that is already a blob:// URI', async () => {
  const part = { type: 'image', data: 'blob://v1/t_x/artifact-data/p_y/a_z/sha256-abc.png' };
  await sanitizeArtifactBinaryData(part, CTX);
  expect(mockUpload).not.toHaveBeenCalled();
});

it('recursively sanitizes nested structures', async () => {
  const payload = {
    toolResult: [
      { type: 'text', text: 'Ticket data' },
      { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' },
    ],
    toolName: 'get-zendesk-ticket',
  };
  const sanitized = (await sanitizeArtifactBinaryData(payload, CTX)) as any;

  expect(sanitized.toolName).toBe('get-zendesk-ticket');
  expect(sanitized.toolResult[0]).toEqual({ type: 'text', text: 'Ticket data' });
  expect(sanitized.toolResult[1].data).toMatch(/^blob:\/\//);
  expect(mockUpload).toHaveBeenCalledOnce();
});

it('uploads multiple image parts independently', async () => {
  const payload = {
    images: [
      { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' },
      { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/jpeg' },
    ],
  };
  await sanitizeArtifactBinaryData(payload, CTX);
  expect(mockUpload).toHaveBeenCalledTimes(2);
});

it('leaves non-binary values unchanged', async () => {
  const payload = {
    toolName: 'search',
    toolInput: { query: 'test' },
    count: 5,
    flag: true,
  };
  const sanitized = await sanitizeArtifactBinaryData(payload, CTX);
  expect(sanitized).toEqual(payload);
  expect(mockUpload).not.toHaveBeenCalled();
});

it('produces a deterministic blob:// URI via content hash', async () => {
  const part = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' };
  const first = (await sanitizeArtifactBinaryData(part, CTX)) as any;
  const second = (await sanitizeArtifactBinaryData(part, CTX)) as any;
  expect(first.data).toBe(second.data);
});

it('handles circular references safely', async () => {
  const payload: Record<string, unknown> = {
    toolResult: [{ type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' }],
  };
  payload.self = payload;

  const sanitized = (await sanitizeArtifactBinaryData(payload, CTX)) as Record<string, unknown>;
  expect(sanitized.self).toBe('[Circular Reference]');
  expect(mockUpload).toHaveBeenCalledOnce();
});

it('uploads data: URI parts, extracting base64 payload and mimeType from the prefix', async () => {
  const payload = Buffer.from('png bytes').toString('base64');
  const part = {
    type: 'image',
    data: `data:image/png;base64,${payload}`,
    mimeType: 'image/jpeg',
  };
  const sanitized = (await sanitizeArtifactBinaryData(part, CTX)) as any;
  expect(mockUpload).toHaveBeenCalledOnce();
  // The data: URI's own mime type (image/png) must win over part.mimeType.
  const uploadArgs = mockUpload.mock.calls[0][0];
  expect(uploadArgs.data).toEqual(Buffer.from(payload, 'base64'));
  expect(uploadArgs.contentType).toBe('image/png');
  expect(sanitized.data).toMatch(/^blob:\/\//);
});

it('returns original inline data when upload fails', async () => {
  mockUpload.mockRejectedValueOnce(new Error('storage unavailable'));
  const part = { type: 'image', data: SAMPLE_BASE64, mimeType: 'image/png' };
  const sanitized = (await sanitizeArtifactBinaryData(part, CTX)) as any;
  expect(sanitized.data).toBe(SAMPLE_BASE64);
  expect(sanitized.type).toBe('image');
});
});
13 changes: 13 additions & 0 deletions agents-api/src/domains/run/services/__tests__/storage-keys.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ describe('storage-keys', () => {
).toBeNull();
});

it('builds versioned artifact-data storage key', () => {
const key = buildStorageKey({
category: 'artifact-data',
tenantId: 'tenant-1',
projectId: 'project-1',
artifactId: 'artifact-1',
contentHash: 'abc123',
ext: 'png',
});

expect(key).toBe('v1/t_tenant-1/artifact-data/p_project-1/a_artifact-1/sha256-abc123.png');
});

it('builds media storage key prefix', () => {
expect(
buildMediaStorageKeyPrefix({
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import { createHash } from 'node:crypto';
import { getExtensionFromMimeType } from '@inkeep/agents-core/constants/allowed-file-formats';
import { getLogger } from '../../../../logger';
import { parseDataUri } from '../../utils/message-parts';
import { getBlobStorageProvider, isBlobUri, toBlobUri } from './index';
import { buildStorageKey } from './storage-keys';

const logger = getLogger('artifact-binary-sanitizer');

// Identifiers scoping an uploaded blob to its tenant/project/artifact;
// threaded into buildStorageKey when constructing the storage key.
export interface ArtifactBinaryContext {
tenantId: string;
projectId: string;
artifactId: string;
}

// A message part carrying inline binary content: `data` holds the payload
// (raw base64 or a data: URI). The index signature preserves any extra
// fields so rewrites can spread them through untouched.
type InlineBinaryPart = {
type: 'image' | 'file';
data: string;
mimeType?: string;
[key: string]: unknown;
};

// Placeholder substituted wherever a traversal re-encounters an object
// that is already on the current visit stack (i.e. a reference cycle).
const CIRCULAR_REFERENCE_PLACEHOLDER = '[Circular Reference]';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Minor: Magic number threshold lacks documentation

Issue: The 100 character threshold for detecting binary data is hardcoded without explanation. This represents only ~75 decoded bytes, which may be smaller than intended.

Why: Without documentation, it's unclear whether this threshold was deliberately chosen or if a larger value (e.g., 1KB) would better distinguish binary content from short encoded strings.

Fix:

Suggested change
const CIRCULAR_REFERENCE_PLACEHOLDER = '[Circular Reference]';
const BINARY_DATA_MIN_LENGTH = 100; // ~75 decoded bytes; data below this is likely metadata, not binary payload

Then use the constant in the check below.


/**
 * True when `value` is an image/file message part whose `data` field still
 * holds inline content, i.e. a string that is neither a blob:// URI nor an
 * http(s) URL.
 *
 * Note: `data:` URIs deliberately count as inline binary — callers either
 * upload them (sanitizeArtifactBinaryData) or redact them
 * (stripBinaryDataForObservability).
 */
function isInlineBinaryPart(value: unknown): value is InlineBinaryPart {
  if (typeof value !== 'object' || value === null) return false;
  const v = value as Record<string, unknown>;
  return (
    (v.type === 'image' || v.type === 'file') &&
    typeof v.data === 'string' &&
    !isBlobUri(v.data) &&
    // Match only real URLs: a bare startsWith('http') would also skip valid
    // base64 payloads that happen to begin with the letters "http"
    // ('h', 't', 'p' are all in the base64 alphabet).
    !/^https?:\/\//i.test(v.data)
  );
}

async function uploadInlinePart(
part: InlineBinaryPart,
ctx: ArtifactBinaryContext
): Promise<InlineBinaryPart> {
const storage = getBlobStorageProvider();
const parsed = parseDataUri(part.data);
const base64Data = parsed ? parsed.base64Data : part.data;
const mimeType = parsed?.mimeType ?? part.mimeType ?? 'application/octet-stream';
const buffer = Buffer.from(base64Data, 'base64');
const contentHash = createHash('sha256').update(buffer).digest('hex');
const ext = getExtensionFromMimeType(mimeType);

const key = buildStorageKey({
category: 'artifact-data',
tenantId: ctx.tenantId,
projectId: ctx.projectId,
artifactId: ctx.artifactId,
contentHash,
ext,
});

try {
await storage.upload({ key, data: buffer, contentType: mimeType });
} catch (error) {
logger.error(
{
error: error instanceof Error ? error.message : String(error),
tenantId: ctx.tenantId,
projectId: ctx.projectId,
artifactId: ctx.artifactId,
mimeType,
size: buffer.length,
},
'Failed to upload artifact binary data to blob storage, returning original inline data'
);
return part;
}

return { ...part, data: toBlobUri(key) };
}

/**
 * Recursively walks `value`, uploading every inline image/file part to blob
 * storage (scoped by `ctx`) and replacing the part's `data` with the
 * resulting blob:// URI. All other values are deep-copied; reference cycles
 * are replaced with CIRCULAR_REFERENCE_PLACEHOLDER.
 *
 * @param value Arbitrary tool-result / message structure to sanitize.
 * @param ctx   Tenant/project/artifact identifiers used to build storage keys.
 * @returns A deep copy of `value` with inline binary data swapped for blob:// URIs.
 */
export async function sanitizeArtifactBinaryData(
  value: unknown,
  ctx: ArtifactBinaryContext
): Promise<unknown> {
  // Tracks objects on the current traversal path so cycles are cut off.
  const inStack = new WeakSet<object>();

  const visit = async (current: unknown): Promise<unknown> => {
    if (isInlineBinaryPart(current)) {
      return uploadInlinePart(current, ctx);
    }
    if (Array.isArray(current)) {
      if (inStack.has(current)) {
        return CIRCULAR_REFERENCE_PLACEHOLDER;
      }
      inStack.add(current);
      try {
        // `return await` (not a bare `return`) is essential here: with a bare
        // `return`, the `finally` below would remove this array from
        // `inStack` as soon as the Promise.all was *created*, before the
        // children's async work (uploads) finished. A cycle reached after an
        // await — e.g. a child object pointing back at this array — would
        // then escape detection and be re-traversed with a misplaced
        // circular placeholder.
        return await Promise.all(current.map((item) => visit(item)));
      } finally {
        inStack.delete(current);
      }
    }
    if (current !== null && typeof current === 'object') {
      if (inStack.has(current)) {
        return CIRCULAR_REFERENCE_PLACEHOLDER;
      }
      inStack.add(current);
      try {
        const result: Record<string, unknown> = {};
        for (const [k, v] of Object.entries(current as Record<string, unknown>)) {
          result[k] = await visit(v);
        }
        return result;
      } finally {
        inStack.delete(current);
      }
    }
    // Primitives (string/number/boolean/null/undefined) pass through as-is.
    return current;
  };

  return visit(value);
}

/**
 * Returns a copy of `value` safe for logging/tracing: every inline
 * image/file part has its `data` replaced with a short placeholder
 * describing the approximate decoded size and mime type. Reference cycles
 * become CIRCULAR_REFERENCE_PLACEHOLDER; everything else is deep-copied.
 */
export function stripBinaryDataForObservability(value: unknown): unknown {
  // Objects currently on the traversal path, for cycle detection.
  const visiting = new WeakSet<object>();

  function walk(node: unknown): unknown {
    if (isInlineBinaryPart(node)) {
      // Base64 inflates bytes by ~4/3, so length * 0.75 approximates the
      // decoded size without actually decoding the payload.
      const approxBytes = Math.round(node.data.length * 0.75);
      return {
        ...node,
        data: `[binary data ~${approxBytes} bytes, mimeType: ${node.mimeType ?? 'unknown'}]`,
      };
    }
    if (node === null || typeof node !== 'object') {
      return node;
    }
    if (visiting.has(node)) {
      return CIRCULAR_REFERENCE_PLACEHOLDER;
    }
    visiting.add(node);
    try {
      if (Array.isArray(node)) {
        return node.map((item) => walk(item));
      }
      const copy: Record<string, unknown> = {};
      for (const [key, child] of Object.entries(node as Record<string, unknown>)) {
        copy[key] = walk(child);
      }
      return copy;
    } finally {
      visiting.delete(node);
    }
  }

  return walk(value);
}
Loading
Loading