Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import {
  sanitizeArtifactBinaryData,
  stripBinaryDataForObservability,
} from '../blob-storage/artifact-binary-sanitizer';

// Mock the blob-storage entry point so tests never touch a real provider.
// getBlobStorageProvider is a vi.fn() so each test can install its own
// provider via vi.mocked(...); the URI helpers are simple pure functions
// mirroring the real "blob://" scheme.
vi.mock('../blob-storage/index', () => ({
  getBlobStorageProvider: vi.fn(),
  isBlobUri: (s: string) => s.startsWith('blob://'),
  toBlobUri: (key: string) => `blob://${key}`,
  fromBlobUri: (uri: string) => uri.slice('blob://'.length),
  BLOB_URI_PREFIX: 'blob://',
}));

// Deterministic storage-key builder so assertions can rely on a stable key layout.
vi.mock('../blob-storage/storage-keys', () => ({
  buildStorageKey: vi.fn(
    (input: any) =>
      `v1/t_${input.tenantId}/artifact-data/p_${input.projectId}/a_${input.artifactId}/sha256-${input.contentHash}.${input.ext}`
  ),
}));

// "hello" base64-encoded — 8 chars, below the 100-char binary-size threshold.
const SMALL_BASE64 = 'aGVsbG8=';
// 200 bytes encoded to 268 base64 chars — above the threshold, treated as binary.
const LARGE_BASE64 = Buffer.from('x'.repeat(200)).toString('base64');

// Tenant/project/artifact context required by sanitizeArtifactBinaryData.
const CTX = { tenantId: 'tenant-1', projectId: 'proj-1', artifactId: 'art-1' };

describe('stripBinaryDataForObservability', () => {
it('replaces image part data with placeholder', () => {
const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
const result = stripBinaryDataForObservability(input) as any;
expect(result.type).toBe('image');
expect(result.data).toMatch(/^\[binary data ~\d+ bytes, mimeType: image\/png\]$/);
expect(result.mimeType).toBe('image/png');
});

it('replaces file part data with placeholder', () => {
const input = { type: 'file', data: LARGE_BASE64, mimeType: 'application/pdf' };
const result = stripBinaryDataForObservability(input) as any;
expect(result.data).toMatch(/^\[binary data ~\d+ bytes/);
});

it('leaves already-blob-uri data untouched', () => {
const input = { type: 'image', data: 'blob://some/key', mimeType: 'image/png' };
const result = stripBinaryDataForObservability(input) as any;
expect(result.data).toBe('blob://some/key');
});

it('leaves small strings untouched (below 100 char threshold)', () => {
const input = { type: 'image', data: SMALL_BASE64, mimeType: 'image/png' };
const result = stripBinaryDataForObservability(input) as any;
expect(result.data).toBe(SMALL_BASE64);
});

it('leaves http URLs untouched', () => {
const input = { type: 'image', data: 'https://example.com/img.png', mimeType: 'image/png' };
const result = stripBinaryDataForObservability(input) as any;
expect(result.data).toBe('https://example.com/img.png');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Minor: Missing test for data: URI handling

Issue: Line 30 of the implementation checks !v.data.startsWith('data:') but there's no explicit test verifying data URIs are left untouched.

Why: Data URIs are a common format for inline images. Without a test, a refactor could accidentally remove the check, causing data URIs to be re-uploaded unnecessarily.

Fix: Add test cases:

it('leaves data: URIs untouched', () => {
  const dataUri = 'data:image/png;base64,' + LARGE_BASE64;
  const input = { type: 'image', data: dataUri, mimeType: 'image/png' };
  const result = stripBinaryDataForObservability(input) as any;
  expect(result.data).toBe(dataUri);
});

});

it('recursively strips nested binary parts', () => {
const input = {
toolResult: [
{ type: 'text', text: 'Ticket info' },
{ type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg' },
],
};
const result = stripBinaryDataForObservability(input) as any;
expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket info' });
expect(result.toolResult[1].data).toMatch(/^\[binary data/);
});

it('handles arrays at top level', () => {
const input = [
{ type: 'text', text: 'hi' },
{ type: 'image', data: LARGE_BASE64, mimeType: 'image/png' },
];
const result = stripBinaryDataForObservability(input) as any[];
expect(result[0]).toEqual({ type: 'text', text: 'hi' });
expect(result[1].data).toMatch(/^\[binary data/);
});

it('passes through non-object primitives unchanged', () => {
expect(stripBinaryDataForObservability('hello')).toBe('hello');
expect(stripBinaryDataForObservability(42)).toBe(42);
expect(stripBinaryDataForObservability(null)).toBeNull();
});

it('handles circular references safely', () => {
const input: Record<string, unknown> = { type: 'container' };
input.self = input;

const result = stripBinaryDataForObservability(input) as Record<string, unknown>;
expect(result.type).toBe('container');
expect(result.self).toBe('[Circular Reference]');
});
});

describe('sanitizeArtifactBinaryData', () => {
let mockUpload: ReturnType<typeof vi.fn>;

beforeEach(async () => {
mockUpload = vi.fn().mockResolvedValue(undefined);
const { getBlobStorageProvider } = await import('../blob-storage/index');
vi.mocked(getBlobStorageProvider).mockReturnValue({
upload: mockUpload,
download: vi.fn(),
delete: vi.fn(),
});
});
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Minor: Missing test for upload failure handling

Issue: The test suite mocks upload to always succeed but never tests the failure path.

Why: Given that image-upload.test.ts has a test for 'drops file part when storage.upload throws' (line 148), this gap masks the missing error handling in the implementation.

Fix: Add a test case:

it('propagates upload errors when blob storage fails', async () => {
  mockUpload.mockRejectedValueOnce(new Error('Storage quota exceeded'));
  const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
  await expect(sanitizeArtifactBinaryData(input, CTX)).rejects.toThrow('Storage quota exceeded');
});

Refs:


it('uploads an inline image part and replaces data with blob:// URI', async () => {
const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
const result = (await sanitizeArtifactBinaryData(input, CTX)) as any;

expect(mockUpload).toHaveBeenCalledOnce();
expect(result.type).toBe('image');
expect(result.data).toMatch(/^blob:\/\//);
expect(result.mimeType).toBe('image/png');
});

it('preserves non-binary fields on the image part', async () => {
const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg', extra: 'keep' };
const result = (await sanitizeArtifactBinaryData(input, CTX)) as any;
expect(result.extra).toBe('keep');
});

it('does not re-upload data that is already a blob:// URI', async () => {
const input = { type: 'image', data: 'blob://v1/t_x/artifact-data/p_y/a_z/sha256-abc.png' };
await sanitizeArtifactBinaryData(input, CTX);
expect(mockUpload).not.toHaveBeenCalled();
});

it('recursively sanitizes nested structures', async () => {
const input = {
toolResult: [
{ type: 'text', text: 'Ticket data' },
{ type: 'image', data: LARGE_BASE64, mimeType: 'image/png' },
],
toolName: 'get-zendesk-ticket',
};
const result = (await sanitizeArtifactBinaryData(input, CTX)) as any;

expect(result.toolName).toBe('get-zendesk-ticket');
expect(result.toolResult[0]).toEqual({ type: 'text', text: 'Ticket data' });
expect(result.toolResult[1].data).toMatch(/^blob:\/\//);
expect(mockUpload).toHaveBeenCalledOnce();
});

it('uploads multiple image parts independently', async () => {
const input = {
images: [
{ type: 'image', data: LARGE_BASE64, mimeType: 'image/png' },
{ type: 'image', data: LARGE_BASE64, mimeType: 'image/jpeg' },
],
};
await sanitizeArtifactBinaryData(input, CTX);
expect(mockUpload).toHaveBeenCalledTimes(2);
});

it('leaves non-binary values unchanged', async () => {
const input = {
toolName: 'search',
toolInput: { query: 'test' },
count: 5,
flag: true,
};
const result = await sanitizeArtifactBinaryData(input, CTX);
expect(result).toEqual(input);
expect(mockUpload).not.toHaveBeenCalled();
});

it('produces a deterministic blob:// URI via content hash', async () => {
const input = { type: 'image', data: LARGE_BASE64, mimeType: 'image/png' };
const r1 = (await sanitizeArtifactBinaryData(input, CTX)) as any;
const r2 = (await sanitizeArtifactBinaryData(input, CTX)) as any;
expect(r1.data).toBe(r2.data);
});

it('handles circular references safely', async () => {
const input: Record<string, unknown> = {
toolResult: [{ type: 'image', data: LARGE_BASE64, mimeType: 'image/png' }],
};
input.self = input;

const result = (await sanitizeArtifactBinaryData(input, CTX)) as Record<string, unknown>;
expect(result.self).toBe('[Circular Reference]');
expect(mockUpload).toHaveBeenCalledOnce();
});
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { createHash } from 'node:crypto';
import { getExtensionFromMimeType } from '@inkeep/agents-core/constants/allowed-image-formats';
import { getBlobStorageProvider, isBlobUri, toBlobUri } from './index';
import { buildStorageKey } from './storage-keys';

export interface ArtifactBinaryContext {
tenantId: string;
projectId: string;
artifactId: string;
}

type InlineBinaryPart = {
type: 'image' | 'file';
data: string;
mimeType?: string;
[key: string]: unknown;
};

const CIRCULAR_REFERENCE_PLACEHOLDER = '[Circular Reference]';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Minor: Magic number threshold lacks documentation

Issue: The 100 character threshold for detecting binary data is hardcoded without explanation. This represents only ~75 decoded bytes, which may be smaller than intended.

Why: Without documentation, it's unclear whether this threshold was deliberately chosen or if a larger value (e.g., 1KB) would better distinguish binary content from short encoded strings.

Fix:

Suggested change
const CIRCULAR_REFERENCE_PLACEHOLDER = '[Circular Reference]';
const BINARY_DATA_MIN_LENGTH = 100; // ~75 decoded bytes; data below this is likely metadata, not binary payload

Then use the constant in the check below.


function isInlineBinaryPart(value: unknown): value is InlineBinaryPart {
if (typeof value !== 'object' || value === null) return false;
const v = value as Record<string, unknown>;
return (
(v.type === 'image' || v.type === 'file') &&
typeof v.data === 'string' &&
v.data.length > 100 &&
!isBlobUri(v.data) &&
!v.data.startsWith('http') &&
!v.data.startsWith('data:')
);
}

async function uploadInlinePart(
part: InlineBinaryPart,
ctx: ArtifactBinaryContext
): Promise<InlineBinaryPart> {
const storage = getBlobStorageProvider();
const buffer = Buffer.from(part.data, 'base64');
const mimeType = part.mimeType ?? 'application/octet-stream';
const contentHash = createHash('sha256').update(buffer).digest('hex');
const ext = getExtensionFromMimeType(mimeType);

const key = buildStorageKey({
category: 'artifact-data',
tenantId: ctx.tenantId,
projectId: ctx.projectId,
artifactId: ctx.artifactId,
contentHash,
ext,
});

await storage.upload({ key, data: buffer, contentType: mimeType });
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 MAJOR: Missing error handling for storage upload failures

Issue: The uploadInlinePart function calls storage.upload() without a try-catch. If the upload fails (network error, quota exceeded, permission denied), the error propagates and could fail the entire artifact processing pipeline.

Why: The existing pattern in image-upload.ts (lines 90-98) catches upload failures gracefully. This inconsistency could cause unhandled errors and partial artifact state when storage is temporarily unavailable.

Fix: Consider wrapping the upload in a try-catch:

try {
  await storage.upload({ key, data: buffer, contentType: mimeType });
  return { ...part, data: toBlobUri(key) };
} catch (error) {
  // Option 1: Return original part (fallback to base64)
  return part;
  // Option 2: Return placeholder
  // return { ...part, data: `[upload failed, binary ~${buffer.length} bytes]` };
}

Refs:


return { ...part, data: toBlobUri(key) };
}

/**
 * Recursively walk `value`, uploading any inline base64 image/file parts to
 * blob storage and replacing their `data` with a blob:// URI. Everything else
 * is returned as a structural copy; cycles are replaced with the
 * '[Circular Reference]' placeholder.
 *
 * @param value Arbitrary JSON-ish structure (tool results, message parts, …).
 * @param ctx   Tenant/project/artifact identifiers used to build storage keys.
 * @returns A deep copy of `value` with inline binary payloads offloaded.
 */
export async function sanitizeArtifactBinaryData(
  value: unknown,
  ctx: ArtifactBinaryContext
): Promise<unknown> {
  // Tracks objects on the *current traversal path* (not all visited nodes),
  // so shared-but-acyclic references are still cloned normally.
  const inStack = new WeakSet<object>();

  const visit = async (current: unknown): Promise<unknown> => {
    if (isInlineBinaryPart(current)) {
      return uploadInlinePart(current, ctx);
    }
    if (Array.isArray(current)) {
      if (inStack.has(current)) {
        return CIRCULAR_REFERENCE_PLACEHOLDER;
      }
      inStack.add(current);
      try {
        // `return await` (not bare `return`): with a bare return the finally
        // block would run as soon as the Promise.all promise is created,
        // removing the array from `inStack` before its async children finish
        // — an element that references the array after an await would then
        // bypass the circular check. Awaiting keeps the array marked as
        // on-path until every element has resolved.
        return await Promise.all(current.map((item) => visit(item)));
      } finally {
        inStack.delete(current);
      }
    }
    if (current !== null && typeof current === 'object') {
      if (inStack.has(current)) {
        return CIRCULAR_REFERENCE_PLACEHOLDER;
      }
      inStack.add(current);
      try {
        const result: Record<string, unknown> = {};
        for (const [k, v] of Object.entries(current as Record<string, unknown>)) {
          result[k] = await visit(v);
        }
        return result;
      } finally {
        inStack.delete(current);
      }
    }
    // Primitives (string, number, boolean, null, undefined, …) pass through.
    return current;
  };

  return visit(value);
}

export function stripBinaryDataForObservability(value: unknown): unknown {
const inStack = new WeakSet<object>();

const visit = (current: unknown): unknown => {
if (isInlineBinaryPart(current)) {
const part = current as InlineBinaryPart;
const approxBytes = Math.round(part.data.length * 0.75);
return {
...part,
data: `[binary data ~${approxBytes} bytes, mimeType: ${part.mimeType ?? 'unknown'}]`,
};
}
if (Array.isArray(current)) {
if (inStack.has(current)) {
return CIRCULAR_REFERENCE_PLACEHOLDER;
}
inStack.add(current);
try {
return current.map(visit);
} finally {
inStack.delete(current);
}
}
if (current !== null && typeof current === 'object') {
if (inStack.has(current)) {
return CIRCULAR_REFERENCE_PLACEHOLDER;
}
inStack.add(current);
try {
const result: Record<string, unknown> = {};
for (const [k, v] of Object.entries(current as Record<string, unknown>)) {
result[k] = visit(v);
}
return result;
} finally {
inStack.delete(current);
}
}
return current;
};

return visit(value);
}
20 changes: 19 additions & 1 deletion agents-api/src/domains/run/services/blob-storage/storage-keys.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,16 @@ export type ConversationMediaKeyInput = {
ext: string;
};

export type StorageKeyInput = ConversationMediaKeyInput;
// Key input for artifact binary payloads offloaded by the artifact sanitizer.
export type ArtifactDataKeyInput = {
  category: 'artifact-data';
  tenantId: string;
  projectId: string;
  artifactId: string;
  contentHash: string; // sha256 hex digest of the decoded payload bytes
  ext: string; // file extension derived from the payload's mime type
};

// Union of all key shapes understood by buildStorageKey, discriminated on `category`.
export type StorageKeyInput = ConversationMediaKeyInput | ArtifactDataKeyInput;

export function buildStorageKey(input: StorageKeyInput): string {
switch (input.category) {
Expand All @@ -25,6 +34,15 @@ export function buildStorageKey(input: StorageKeyInput): string {
`m_${input.messageId}`,
`sha256-${input.contentHash}.${input.ext}`,
].join('/');
case 'artifact-data':
return [
STORAGE_KEY_VERSION,
`t_${input.tenantId}`,
input.category,
`p_${input.projectId}`,
`a_${input.artifactId}`,
`sha256-${input.contentHash}.${input.ext}`,
].join('/');
}
}

Expand Down
Loading
Loading