Skip to content

Commit e31465a

Browse files
Export image support for JSON and JSONL (#4359)
* export image support for json and jsonl
* add tests and cleanup functionality
* add test for convertTo prepare function
* comment

---------

Co-authored-by: timothycarambat <[email protected]>
1 parent bb7d65f commit e31465a

File tree

2 files changed

+294
-8
lines changed

2 files changed

+294
-8
lines changed
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
/* eslint-env jest */
2+
const { prepareChatsForExport } = require("../../../utils/helpers/chat/convertTo");
3+
4+
// Mock the database models
5+
jest.mock("../../../models/workspaceChats");
6+
jest.mock("../../../models/embedChats");
7+
8+
const { WorkspaceChats } = require("../../../models/workspaceChats");
9+
const { EmbedChats } = require("../../../models/embedChats");
10+
11+
/**
 * Builds a workspace chat fixture shaped like the rows the mocked
 * `WorkspaceChats.whereWithData` resolves with in this test file.
 *
 * The fixture carries a concrete `workspaceId` (mirrored on `workspace.id`)
 * so exports that group chats per workspace are keyed on a real value rather
 * than on the string "undefined". Attachment `contentString` values are
 * already complete data URLs, so the exporter is expected to pass them
 * through unchanged.
 *
 * @param {boolean} [withImages=false] - When true, the serialized response
 *   contains two image attachments; otherwise `attachments` is an empty array.
 * @returns {Object} Chat fixture whose `response` field is a JSON string.
 */
const mockChat = (withImages = false) => {
  const workspaceId = 1;
  const attachments = withImages
    ? [
        {
          mime: "image/png",
          name: "image.png",
          contentString: "data:image/png;base64,iVBORw0KGgo=",
        },
        {
          mime: "image/jpeg",
          name: "image2.jpeg",
          contentString: "data:image/jpeg;base64,/9j/4AAQSkZJRg==",
        },
      ]
    : [];

  return {
    id: 1,
    workspaceId,
    prompt: "Test prompt",
    response: JSON.stringify({
      text: "Test response",
      attachments,
      sources: [],
      metrics: {},
    }),
    createdAt: new Date(),
    workspace: {
      id: workspaceId,
      name: "Test Workspace",
      openAiPrompt: "Test OpenAI Prompt",
    },
    user: { username: "testuser" },
    feedbackScore: 1,
  };
};
30+
31+
// Test suite for prepareChatsForExport(format, chatType). The database models
// are mocked, so every test controls exactly which chat rows are "stored".
describe("prepareChatsForExport", () => {
  beforeEach(() => {
    jest.clearAllMocks();
    // Default to an empty result set; individual tests override as needed.
    WorkspaceChats.whereWithData = jest.fn().mockResolvedValue([]);
    EmbedChats.whereWithEmbedAndWorkspace = jest.fn().mockResolvedValue([]);
  });

  test("should throw error for invalid chat type", async () => {
    await expect(prepareChatsForExport("json", "invalid"))
      .rejects
      .toThrow("Invalid chat type: invalid");
  });

  test("should throw error for invalid export type", async () => {
    await expect(prepareChatsForExport("invalid", "workspace"))
      .rejects
      .toThrow("Invalid export type: invalid");
  });

  // CSV and JSON share the same base row shape; JSON additionally carries an
  // `attachments` array (empty when the chat has no attachments).
  test("should return prepared data in json format for workspace chat type", async () => {
    const chatExample = mockChat();
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("json", "workspace");

    const responseJson = JSON.parse(chatExample.response);
    expect(result).toBeDefined();
    expect(result).toEqual([{
      id: chatExample.id,
      prompt: chatExample.prompt,
      response: responseJson.text,
      sent_at: chatExample.createdAt,
      rating: chatExample.feedbackScore ? "GOOD" : "BAD",
      username: chatExample.user.username,
      workspace: chatExample.workspace.name,
      attachments: [],
    }]);
  });

  test("Should handle attachments for workspace chat type when json format is selected", async () => {
    const chatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("json", "workspace");

    const responseJson = JSON.parse(chatExample.response);
    expect(result).toBeDefined();
    expect(result).toEqual([{
      id: chatExample.id,
      prompt: chatExample.prompt,
      response: responseJson.text,
      sent_at: chatExample.createdAt,
      rating: chatExample.feedbackScore ? "GOOD" : "BAD",
      username: chatExample.user.username,
      workspace: chatExample.workspace.name,
      attachments: [
        {
          type: "image",
          image: responseJson.attachments[0].contentString,
        },
        {
          type: "image",
          image: responseJson.attachments[1].contentString,
        },
      ],
    }]);
  });

  test("Should ignore attachments for workspace chat type when csv format is selected", async () => {
    const chatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("csv", "workspace");

    const responseJson = JSON.parse(chatExample.response);
    expect(result).toBeDefined();
    // `result` is an array of rows, so the attachments check must look at the
    // row itself. `toEqual` below ignores properties whose value is undefined,
    // which makes this explicit assertion the one that guards the CSV shape.
    expect(result).toHaveLength(1);
    expect(result[0].attachments).not.toBeDefined();
    expect(result).toEqual([{
      id: chatExample.id,
      prompt: chatExample.prompt,
      response: responseJson.text,
      sent_at: chatExample.createdAt,
      rating: chatExample.feedbackScore ? "GOOD" : "BAD",
      username: chatExample.user.username,
      workspace: chatExample.workspace.name,
    }]);
  });

  test("should return prepared data in jsonAlpaca format for workspace chat type", async () => {
    const chatExample = mockChat();
    const imageChatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample, imageChatExample]);
    const result = await prepareChatsForExport("jsonAlpaca", "workspace");

    const responseJson1 = JSON.parse(chatExample.response);
    const responseJson2 = JSON.parse(imageChatExample.response);
    expect(result).toBeDefined();

    // Alpaca format does not support attachments - so they are not included
    expect(result[0].attachments).not.toBeDefined();
    expect(result[1].attachments).not.toBeDefined();
    expect(result).toEqual([
      {
        instruction: chatExample.workspace.openAiPrompt,
        input: chatExample.prompt,
        output: responseJson1.text,
      },
      {
        instruction: imageChatExample.workspace.openAiPrompt,
        input: imageChatExample.prompt,
        output: responseJson2.text,
      },
    ]);
  });

  test("should return prepared data in jsonl format for workspace chat type", async () => {
    const chatExample = mockChat();
    const responseJson = JSON.parse(chatExample.response);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("jsonl", "workspace");
    expect(result).toBeDefined();
    expect(result).toEqual({
      // The exporter groups messages by the chat's `workspaceId` field.
      [chatExample.workspaceId]: {
        messages: [
          {
            role: "system",
            content: [{
              type: "text",
              text: chatExample.workspace.openAiPrompt,
            }],
          },
          {
            role: "user",
            content: [{
              type: "text",
              text: chatExample.prompt,
            }],
          },
          {
            role: "assistant",
            content: [{
              type: "text",
              text: responseJson.text,
            }],
          },
        ],
      },
    });
  });

  test("should return prepared data in jsonl format for workspace chat type with attachments", async () => {
    const chatExample = mockChat();
    const imageChatExample = mockChat(true);
    const responseJson = JSON.parse(chatExample.response);
    const imageResponseJson = JSON.parse(imageChatExample.response);

    WorkspaceChats.whereWithData.mockResolvedValue([chatExample, imageChatExample]);
    const result = await prepareChatsForExport("jsonl", "workspace");
    expect(result).toBeDefined();
    expect(result).toEqual({
      // Both fixtures share one workspace, so they land in a single
      // conversation: one system message followed by two user/assistant turns.
      [chatExample.workspaceId]: {
        messages: [
          {
            role: "system",
            content: [{
              type: "text",
              text: chatExample.workspace.openAiPrompt,
            }],
          },
          {
            role: "user",
            content: [{
              type: "text",
              text: chatExample.prompt,
            }],
          },
          {
            role: "assistant",
            content: [{
              type: "text",
              text: responseJson.text,
            }],
          },
          {
            role: "user",
            // Images are appended to the user turn after the text part.
            content: [{
              type: "text",
              text: imageChatExample.prompt,
            }, {
              type: "image",
              image: imageResponseJson.attachments[0].contentString,
            }, {
              type: "image",
              image: imageResponseJson.attachments[1].contentString,
            }],
          },
          {
            role: "assistant",
            content: [{
              type: "text",
              text: imageResponseJson.text,
            }],
          },
        ],
      },
    });
  });
});

server/utils/helpers/chat/convertTo.js

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ async function convertToJSONAlpaca(preparedData) {
3434
return JSON.stringify(preparedData, null, 4);
3535
}
3636

37+
// You can validate JSONL outputs on https://jsonlines.org/validator/
3738
async function convertToJSONL(workspaceChatsMap) {
3839
return Object.values(workspaceChatsMap)
3940
.map((workspaceChats) => JSON.stringify(workspaceChats))
@@ -64,12 +65,24 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") {
6465

6566
if (format === "csv" || format === "json") {
6667
const preparedData = chats.map((chat) => {
67-
const responseJson = JSON.parse(chat.response);
68+
const responseJson = safeJsonParse(chat.response, {});
6869
const baseData = {
6970
id: chat.id,
7071
prompt: chat.prompt,
7172
response: responseJson.text,
7273
sent_at: chat.createdAt,
74+
// Only add attachments to the json format since we cannot arrange attachments in csv format
75+
...(format === "json"
76+
? {
77+
attachments:
78+
responseJson.attachments?.length > 0
79+
? responseJson.attachments.map((attachment) => ({
80+
type: "image",
81+
image: attachmentToDataUrl(attachment),
82+
}))
83+
: [],
84+
}
85+
: {}),
7386
};
7487

7588
if (chatType === "embed") {
@@ -101,9 +114,10 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") {
101114
return preparedData;
102115
}
103116

117+
// jsonAlpaca format does not support array outputs
104118
if (format === "jsonAlpaca") {
105119
const preparedData = chats.map((chat) => {
106-
const responseJson = JSON.parse(chat.response);
120+
const responseJson = safeJsonParse(chat.response, {});
107121
return {
108122
instruction: buildSystemPrompt(
109123
chat,
@@ -117,18 +131,25 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") {
117131
return preparedData;
118132
}
119133

134+
// Export to JSONL format (recommended for fine-tuning)
120135
const workspaceChatsMap = chats.reduce((acc, chat) => {
121136
const { prompt, response, workspaceId } = chat;
122-
const responseJson = JSON.parse(response);
137+
const responseJson = safeJsonParse(response, { attachments: [] });
138+
const attachments = responseJson.attachments;
123139

124140
if (!acc[workspaceId]) {
125141
acc[workspaceId] = {
126142
messages: [
127143
{
128144
role: "system",
129-
content:
130-
chat.workspace?.openAiPrompt ||
131-
"Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
145+
content: [
146+
{
147+
type: "text",
148+
text:
149+
chat.workspace?.openAiPrompt ||
150+
"Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
151+
},
152+
],
132153
},
133154
],
134155
};
@@ -137,11 +158,27 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") {
137158
acc[workspaceId].messages.push(
138159
{
139160
role: "user",
140-
content: prompt,
161+
content: [
162+
{
163+
type: "text",
164+
text: prompt,
165+
},
166+
...(attachments?.length > 0
167+
? attachments.map((attachment) => ({
168+
type: "image",
169+
image: attachmentToDataUrl(attachment),
170+
}))
171+
: []),
172+
],
141173
},
142174
{
143175
role: "assistant",
144-
content: responseJson.text,
176+
content: [
177+
{
178+
type: "text",
179+
text: responseJson.text,
180+
},
181+
],
145182
}
146183
);
147184

@@ -203,6 +240,17 @@ function buildSystemPrompt(chat, prompt = null) {
203240
return `${prompt ?? STANDARD_PROMPT}${context}`;
204241
}
205242

243+
/**
 * Converts an attachment's content string to a proper data URL format if needed.
 *
 * A content string that already starts with "data:" is returned untouched.
 * Anything else is treated as a raw base64 payload and wrapped using the
 * attachment's mime type.
 *
 * Malformed attachments do not throw, so a single bad record cannot abort a
 * whole export: a missing or non-string `contentString` yields an empty
 * string, and a missing `mime` falls back to "application/octet-stream"
 * instead of producing "data:undefined;base64,...".
 *
 * @param {Object} attachment - The attachment object containing contentString and mime type
 * @param {string} [attachment.contentString] - Data URL or raw base64 payload
 * @param {string} [attachment.mime] - Mime type used when wrapping a raw payload
 * @returns {string} The properly formatted data URL, or "" when there is no usable content
 */
function attachmentToDataUrl(attachment) {
  const contentString = attachment?.contentString;
  if (typeof contentString !== "string") return "";
  if (contentString.startsWith("data:")) return contentString;

  const mime = attachment.mime || "application/octet-stream";
  return `data:${mime};base64,${contentString}`;
}
253+
206254
module.exports = {
207255
prepareChatsForExport,
208256
exportChatsAsType,

0 commit comments

Comments
 (0)