Skip to content

Commit de0c943

Browse files
Josue Gomezfuthgar
authored andcommitted
fix(onboard): clean up build context temp dir on sandbox creation failure
Register a process 'exit' handler to guarantee the build context temp directory is removed even when run() calls process.exit() on command failure (which bypasses try/finally). Wrap the sandbox creation block in try/finally for immediate cleanup on sync exceptions and on the success path. Signed-off-by: Josue Gomez <josue@guatulab.com>
1 parent eb4ba8c commit de0c943

2 files changed

Lines changed: 186 additions & 103 deletions

File tree

bin/lib/onboard.js

Lines changed: 114 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -1760,117 +1760,128 @@ async function createSandbox(gpu, model, provider, preferredInferenceApi = null,
17601760
registry.removeSandbox(sandboxName);
17611761
}
17621762

1763-
// Stage build context
1763+
// Stage build context.
1764+
// The build context contains source code, scripts, and potentially API keys
1765+
// in env args, so it must not persist in /tmp after a failed sandbox create.
1766+
// run() calls process.exit() on failure (bypassing try/finally), so we
1767+
// register a process 'exit' handler to guarantee cleanup in all cases.
17641768
const buildCtx = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-build-"));
1765-
const stagedDockerfile = path.join(buildCtx, "Dockerfile");
1766-
fs.copyFileSync(path.join(ROOT, "Dockerfile"), stagedDockerfile);
1767-
copyBuildContextDir(path.join(ROOT, "nemoclaw"), path.join(buildCtx, "nemoclaw"));
1768-
copyBuildContextDir(path.join(ROOT, "nemoclaw-blueprint"), path.join(buildCtx, "nemoclaw-blueprint"));
1769-
copyBuildContextDir(path.join(ROOT, "scripts"), path.join(buildCtx, "scripts"));
1770-
1771-
// Create sandbox (use -- echo to avoid dropping into interactive shell)
1772-
// Pass the base policy so sandbox starts in proxy mode (required for policy updates later)
1773-
const basePolicyPath = path.join(ROOT, "nemoclaw-blueprint", "policies", "openclaw-sandbox.yaml");
1774-
const createArgs = [
1775-
"--from", `${buildCtx}/Dockerfile`,
1776-
"--name", sandboxName,
1777-
"--policy", basePolicyPath,
1778-
];
1779-
// --gpu is intentionally omitted. See comment in startGateway().
1780-
1781-
console.log(` Creating sandbox '${sandboxName}' (this takes a few minutes on first run)...`);
1782-
const chatUiUrl = process.env.CHAT_UI_URL || "http://127.0.0.1:18789";
1783-
patchStagedDockerfile(stagedDockerfile, model, chatUiUrl, String(Date.now()), provider, preferredInferenceApi);
1784-
// Only pass non-sensitive env vars to the sandbox. NVIDIA_API_KEY is NOT
1785-
// needed inside the sandbox — inference is proxied through the OpenShell
1786-
// gateway which injects the stored credential server-side. The gateway
1787-
// also strips any Authorization headers sent by the sandbox client.
1788-
// See: crates/openshell-sandbox/src/proxy.rs (header stripping),
1789-
// crates/openshell-router/src/backend.rs (server-side auth injection).
1790-
const envArgs = [formatEnvAssignment("CHAT_UI_URL", chatUiUrl)];
1791-
const sandboxEnv = { ...process.env };
1792-
delete sandboxEnv.NVIDIA_API_KEY;
1793-
const discordToken = getCredential("DISCORD_BOT_TOKEN") || process.env.DISCORD_BOT_TOKEN;
1794-
if (discordToken) {
1795-
sandboxEnv.DISCORD_BOT_TOKEN = discordToken;
1796-
}
1797-
const slackToken = getCredential("SLACK_BOT_TOKEN") || process.env.SLACK_BOT_TOKEN;
1798-
if (slackToken) {
1799-
sandboxEnv.SLACK_BOT_TOKEN = slackToken;
1800-
}
1801-
1802-
// Run without piping through awk — the pipe masked non-zero exit codes
1803-
// from openshell because bash returns the status of the last pipeline
1804-
// command (awk, always 0) unless pipefail is set. Removing the pipe
1805-
// lets the real exit code flow through to run().
1806-
const createCommand = `${openshellShellCommand([
1807-
"sandbox",
1808-
"create",
1809-
...createArgs,
1810-
"--",
1811-
"env",
1812-
...envArgs,
1813-
"nemoclaw-start",
1814-
])} 2>&1`;
1815-
const createResult = await streamSandboxCreate(createCommand, sandboxEnv, {
1816-
readyCheck: () => {
1817-
const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true });
1818-
return isSandboxReady(list, sandboxName);
1819-
},
1820-
});
1769+
const cleanupBuildCtx = () => {
1770+
try { fs.rmSync(buildCtx, { recursive: true, force: true }); } catch {}
1771+
};
1772+
process.on("exit", cleanupBuildCtx);
18211773

1822-
// Clean up build context regardless of outcome
1823-
run(`rm -rf "${buildCtx}"`, { ignoreError: true });
1774+
try {
1775+
const stagedDockerfile = path.join(buildCtx, "Dockerfile");
1776+
fs.copyFileSync(path.join(ROOT, "Dockerfile"), stagedDockerfile);
1777+
copyBuildContextDir(path.join(ROOT, "nemoclaw"), path.join(buildCtx, "nemoclaw"));
1778+
copyBuildContextDir(path.join(ROOT, "nemoclaw-blueprint"), path.join(buildCtx, "nemoclaw-blueprint"));
1779+
copyBuildContextDir(path.join(ROOT, "scripts"), path.join(buildCtx, "scripts"));
1780+
1781+
// Create sandbox (use -- echo to avoid dropping into interactive shell)
1782+
// Pass the base policy so sandbox starts in proxy mode (required for policy updates later)
1783+
const basePolicyPath = path.join(ROOT, "nemoclaw-blueprint", "policies", "openclaw-sandbox.yaml");
1784+
const createArgs = [
1785+
"--from", `${buildCtx}/Dockerfile`,
1786+
"--name", sandboxName,
1787+
"--policy", basePolicyPath,
1788+
];
1789+
// --gpu is intentionally omitted. See comment in startGateway().
1790+
1791+
console.log(` Creating sandbox '${sandboxName}' (this takes a few minutes on first run)...`);
1792+
const chatUiUrl = process.env.CHAT_UI_URL || "http://127.0.0.1:18789";
1793+
patchStagedDockerfile(stagedDockerfile, model, chatUiUrl, String(Date.now()), provider, preferredInferenceApi);
1794+
// Only pass non-sensitive env vars to the sandbox. NVIDIA_API_KEY is NOT
1795+
// needed inside the sandbox — inference is proxied through the OpenShell
1796+
// gateway which injects the stored credential server-side. The gateway
1797+
// also strips any Authorization headers sent by the sandbox client.
1798+
// See: crates/openshell-sandbox/src/proxy.rs (header stripping),
1799+
// crates/openshell-router/src/backend.rs (server-side auth injection).
1800+
const envArgs = [formatEnvAssignment("CHAT_UI_URL", chatUiUrl)];
1801+
const sandboxEnv = { ...process.env };
1802+
delete sandboxEnv.NVIDIA_API_KEY;
1803+
const discordToken = getCredential("DISCORD_BOT_TOKEN") || process.env.DISCORD_BOT_TOKEN;
1804+
if (discordToken) {
1805+
sandboxEnv.DISCORD_BOT_TOKEN = discordToken;
1806+
}
1807+
const slackToken = getCredential("SLACK_BOT_TOKEN") || process.env.SLACK_BOT_TOKEN;
1808+
if (slackToken) {
1809+
sandboxEnv.SLACK_BOT_TOKEN = slackToken;
1810+
}
1811+
1812+
// Run without piping through awk — the pipe masked non-zero exit codes
1813+
// from openshell because bash returns the status of the last pipeline
1814+
// command (awk, always 0) unless pipefail is set. Removing the pipe
1815+
// lets the real exit code flow through to run().
1816+
const createCommand = `${openshellShellCommand([
1817+
"sandbox",
1818+
"create",
1819+
...createArgs,
1820+
"--",
1821+
"env",
1822+
...envArgs,
1823+
"nemoclaw-start",
1824+
])} 2>&1`;
1825+
const createResult = await streamSandboxCreate(createCommand, sandboxEnv, {
1826+
readyCheck: () => {
1827+
const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true });
1828+
return isSandboxReady(list, sandboxName);
1829+
},
1830+
});
18241831

1825-
if (createResult.status !== 0) {
1826-
console.error("");
1827-
console.error(` Sandbox creation failed (exit ${createResult.status}).`);
1828-
if (createResult.output) {
1832+
if (createResult.status !== 0) {
18291833
console.error("");
1830-
console.error(createResult.output);
1831-
}
1832-
console.error(" Try: openshell sandbox list # check gateway state");
1833-
printSandboxCreateRecoveryHints(createResult.output);
1834-
process.exit(createResult.status || 1);
1835-
}
1836-
1837-
// Wait for sandbox to reach Ready state in k3s before registering.
1838-
// On WSL2 + Docker Desktop the pod can take longer to initialize;
1839-
// without this gate, NemoClaw registers a phantom sandbox that
1840-
// causes "sandbox not found" on every subsequent connect/status call.
1841-
console.log(" Waiting for sandbox to become ready...");
1842-
let ready = false;
1843-
for (let i = 0; i < 30; i++) {
1844-
const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true });
1845-
if (isSandboxReady(list, sandboxName)) {
1846-
ready = true;
1847-
break;
1834+
console.error(` Sandbox creation failed (exit ${createResult.status}).`);
1835+
if (createResult.output) {
1836+
console.error("");
1837+
console.error(createResult.output);
1838+
}
1839+
console.error(" Try: openshell sandbox list # check gateway state");
1840+
printSandboxCreateRecoveryHints(createResult.output);
1841+
process.exit(createResult.status || 1);
1842+
}
1843+
1844+
// Wait for sandbox to reach Ready state in k3s before registering.
1845+
// On WSL2 + Docker Desktop the pod can take longer to initialize;
1846+
// without this gate, NemoClaw registers a phantom sandbox that
1847+
// causes "sandbox not found" on every subsequent connect/status call.
1848+
console.log(" Waiting for sandbox to become ready...");
1849+
let ready = false;
1850+
for (let i = 0; i < 30; i++) {
1851+
const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true });
1852+
if (isSandboxReady(list, sandboxName)) {
1853+
ready = true;
1854+
break;
1855+
}
1856+
require("child_process").spawnSync("sleep", ["2"]);
18481857
}
1849-
require("child_process").spawnSync("sleep", ["2"]);
1850-
}
18511858

1852-
if (!ready) {
1853-
// Clean up the orphaned sandbox so the next onboard retry with the same
1854-
// name doesn't fail on "sandbox already exists".
1855-
const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
1856-
console.error("");
1857-
console.error(` Sandbox '${sandboxName}' was created but did not become ready within 60s.`);
1858-
if (delResult.status === 0) {
1859-
console.error(" The orphaned sandbox has been removed — you can safely retry.");
1860-
} else {
1861-
console.error(` Could not remove the orphaned sandbox. Manual cleanup:`);
1862-
console.error(` openshell sandbox delete "${sandboxName}"`);
1859+
if (!ready) {
1860+
// Clean up the orphaned sandbox so the next onboard retry with the same
1861+
// name doesn't fail on "sandbox already exists".
1862+
const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
1863+
console.error("");
1864+
console.error(` Sandbox '${sandboxName}' was created but did not become ready within 60s.`);
1865+
if (delResult.status === 0) {
1866+
console.error(" The orphaned sandbox has been removed — you can safely retry.");
1867+
} else {
1868+
console.error(` Could not remove the orphaned sandbox. Manual cleanup:`);
1869+
console.error(` openshell sandbox delete "${sandboxName}"`);
1870+
}
1871+
console.error(" Retry: nemoclaw onboard");
1872+
process.exit(1);
18631873
}
1864-
console.error(" Retry: nemoclaw onboard");
1865-
process.exit(1);
1866-
}
18671874

1868-
// Release any stale forward on port 18789 before claiming it for the new sandbox.
1869-
// A previous onboard run may have left the port forwarded to a different sandbox,
1870-
// which would silently prevent the new sandbox's dashboard from being reachable.
1871-
runOpenshell(["forward", "stop", "18789"], { ignoreError: true });
1872-
// Forward dashboard port to the new sandbox
1873-
runOpenshell(["forward", "start", "--background", "18789", sandboxName], { ignoreError: true });
1875+
// Release any stale forward on port 18789 before claiming it for the new sandbox.
1876+
// A previous onboard run may have left the port forwarded to a different sandbox,
1877+
// which would silently prevent the new sandbox's dashboard from being reachable.
1878+
runOpenshell(["forward", "stop", "18789"], { ignoreError: true });
1879+
// Forward dashboard port to the new sandbox
1880+
runOpenshell(["forward", "start", "--background", "18789", sandboxName], { ignoreError: true });
1881+
} finally {
1882+
cleanupBuildCtx();
1883+
process.removeListener("exit", cleanupBuildCtx);
1884+
}
18741885

18751886
// Register only after confirmed ready — prevents phantom entries
18761887
registry.registerSandbox({

test/onboard-build-cleanup.test.js

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
import { describe, it, expect } from "vitest";
5+
import fs from "node:fs";
6+
import os from "node:os";
7+
import path from "node:path";
8+
import { spawnSync } from "node:child_process";
9+
10+
describe("onboard build context cleanup", () => {
11+
it("removes the build context temp dir when a command fails mid-build", () => {
12+
// Simulate the pattern used in createSandbox: register a process 'exit'
13+
// handler to clean up the temp dir, then exit non-zero (as run() does
14+
// via process.exit on command failure). The handler must still fire.
15+
const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cleanup-test-"));
16+
const marker = path.join(tmp, "sentinel.txt");
17+
fs.writeFileSync(marker, "build context contents");
18+
19+
const result = spawnSync(
20+
"node",
21+
[
22+
"-e",
23+
`
24+
const fs = require("fs");
25+
const buildCtx = ${JSON.stringify(tmp)};
26+
const cleanup = () => {
27+
try { fs.rmSync(buildCtx, { recursive: true, force: true }); } catch {}
28+
};
29+
process.on("exit", cleanup);
30+
// Simulate run() calling process.exit() on command failure
31+
process.exit(1);
32+
`,
33+
],
34+
{ encoding: "utf-8", timeout: 5000 },
35+
);
36+
37+
expect(result.status).toBe(1);
38+
expect(fs.existsSync(tmp)).toBe(false);
39+
});
40+
41+
it("removes the build context on success and deregisters the handler", () => {
42+
const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cleanup-test-"));
43+
fs.writeFileSync(path.join(tmp, "sentinel.txt"), "build context contents");
44+
45+
const result = spawnSync(
46+
"node",
47+
[
48+
"-e",
49+
`
50+
const fs = require("fs");
51+
const buildCtx = ${JSON.stringify(tmp)};
52+
const cleanup = () => {
53+
try { fs.rmSync(buildCtx, { recursive: true, force: true }); } catch {}
54+
};
55+
process.on("exit", cleanup);
56+
// Simulate successful path: explicit cleanup + deregister
57+
cleanup();
58+
process.removeListener("exit", cleanup);
59+
// Verify the specific handler was deregistered
60+
if (process.listeners("exit").includes(cleanup)) {
61+
console.error("exit handler was not deregistered");
62+
process.exit(2);
63+
}
64+
`,
65+
],
66+
{ encoding: "utf-8", timeout: 5000 },
67+
);
68+
69+
expect(result.status).toBe(0);
70+
expect(fs.existsSync(tmp)).toBe(false);
71+
});
72+
});

0 commit comments

Comments
 (0)