diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index 37e78ecd7..e74d49921 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -185,6 +185,28 @@ function hasStaleGateway(gwInfoOutput) { return typeof gwInfoOutput === "string" && gwInfoOutput.length > 0 && gwInfoOutput.includes(GATEWAY_NAME); } +const ANSI_ESCAPE = String.fromCharCode(27); +const ANSI_REGEX = new RegExp(`${ANSI_ESCAPE}\\[[0-9;]*[A-Za-z]`, "g"); + +function stripAnsi(value = "") { + return value.replace(ANSI_REGEX, ""); +} + +function getActiveGatewayName(statusOutput = "") { + if (typeof statusOutput !== "string" || statusOutput.length === 0) { + return ""; + } + const match = stripAnsi(statusOutput) + .match(/^\s*Gateway:\s+(.+?)\s*$/m); + return match ? match[1].trim() : ""; +} + +function isGatewayHealthy(statusOutput = "", gwInfoOutput = "") { + const connected = typeof statusOutput === "string" && statusOutput.includes("Connected"); + const activeGateway = getActiveGatewayName(statusOutput); + return connected && activeGateway === GATEWAY_NAME && hasStaleGateway(gwInfoOutput); +} + function streamSandboxCreate(command, env = process.env, options = {}) { const child = spawn("bash", ["-lc", command], { cwd: ROOT, @@ -1237,8 +1259,16 @@ async function preflight() { // A previous onboard run may have left the gateway container and port // forward running. If a NemoClaw-owned gateway is still present, tear // it down so the port check below doesn't fail on our own leftovers. 
+ const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { ignoreError: true }); - if (hasStaleGateway(gwInfo)) { + const healthyGateway = isGatewayHealthy(gatewayStatus, gwInfo); + if (healthyGateway) { + console.log(" Reusing existing NemoClaw gateway..."); + runOpenshell(["forward", "stop", "18789"], { ignoreError: true }); + runOpenshell(["gateway", "select", GATEWAY_NAME], { ignoreError: true }); + process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; + console.log(" ✓ Existing gateway selected"); + } else if (hasStaleGateway(gwInfo)) { console.log(" Cleaning up previous NemoClaw session..."); runOpenshell(["forward", "stop", "18789"], { ignoreError: true }); runOpenshell(["gateway", "destroy", "-g", GATEWAY_NAME], { ignoreError: true }); @@ -1251,6 +1281,10 @@ async function preflight() { { port: 18789, label: "NemoClaw dashboard" }, ]; for (const { port, label } of requiredPorts) { + if (port === 8080 && healthyGateway) { + console.log(` ✓ Port ${port} already in use by active NemoClaw gateway (${label})`); + continue; + } const portCheck = await checkPortAvailable(port); if (!portCheck.ok) { console.error(""); @@ -1305,11 +1339,21 @@ function destroyGateway() { // ── Step 2: Gateway ────────────────────────────────────────────── -async function startGateway(_gpu) { +async function startGatewayWithOptions(_gpu, { exitOnFailure = true } = {}) { step(3, 7, "Starting OpenShell gateway"); - // Clean up any previous gateway and its Docker volumes - destroyGateway(); + const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); + const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { ignoreError: true }); + if (isGatewayHealthy(gatewayStatus, gwInfo)) { + console.log(" ✓ Reusing existing gateway"); + runOpenshell(["gateway", "select", GATEWAY_NAME], { ignoreError: true }); + process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; + return; + } 
+ + if (hasStaleGateway(gwInfo)) { + runOpenshell(["gateway", "destroy", "-g", GATEWAY_NAME], { ignoreError: true }); + } const gwArgs = ["--name", GATEWAY_NAME]; // Do NOT pass --gpu here. On DGX Spark (and most GPU hosts), inference is @@ -1332,22 +1376,29 @@ async function startGateway(_gpu) { if (startResult.status !== 0) { console.error(" Gateway failed to start. Cleaning up stale state..."); destroyGateway(); - console.error(" Stale state removed. Please rerun: nemoclaw onboard"); - process.exit(1); + if (exitOnFailure) { + console.error(" Stale state removed. Please rerun: nemoclaw onboard"); + process.exit(1); + } + throw new Error("Gateway failed to start"); } // Verify health for (let i = 0; i < 5; i++) { const status = runCaptureOpenshell(["status"], { ignoreError: true }); - if (status.includes("Connected")) { + const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { ignoreError: true }); + if (isGatewayHealthy(status, gwInfo)) { console.log(" ✓ Gateway is healthy"); break; } if (i === 4) { console.error(" Gateway health check failed. Cleaning up stale state..."); destroyGateway(); - console.error(" Stale state removed. Please rerun: nemoclaw onboard"); - process.exit(1); + if (exitOnFailure) { + console.error(" Stale state removed. 
Please rerun: nemoclaw onboard"); + process.exit(1); + } + throw new Error("Gateway failed to start"); } sleep(2); } @@ -1364,6 +1415,14 @@ async function startGateway(_gpu) { process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; } +async function startGateway(_gpu) { + return startGatewayWithOptions(_gpu, { exitOnFailure: true }); +} + +async function startGatewayForRecovery(_gpu) { + return startGatewayWithOptions(_gpu, { exitOnFailure: false }); +} + // ── Step 3: Sandbox ────────────────────────────────────────────── async function createSandbox(gpu, model, provider, preferredInferenceApi = null) { @@ -1415,6 +1474,7 @@ async function createSandbox(gpu, model, provider, preferredInferenceApi = null) run(`cp -r "${path.join(ROOT, "nemoclaw-blueprint")}" "${buildCtx}/nemoclaw-blueprint"`); run(`cp -r "${path.join(ROOT, "scripts")}" "${buildCtx}/scripts"`); run(`rm -rf "${buildCtx}/nemoclaw/node_modules"`, { ignoreError: true }); + run(`bash "${buildCtx}/scripts/clean-staged-tree.sh" "${buildCtx}/nemoclaw-blueprint"`, { ignoreError: true }); // Create sandbox (use -- echo to avoid dropping into interactive shell) // Pass the base policy so sandbox starts in proxy mode (required for policy updates later) @@ -1551,9 +1611,7 @@ async function setupNim(gpu) { const options = []; options.push({ key: "build", - label: - "NVIDIA Endpoints" + - (!ollamaRunning && !(EXPERIMENTAL && vllmRunning) ? 
" (recommended)" : ""), + label: "NVIDIA Endpoints", }); options.push({ key: "openai", label: "OpenAI" }); options.push({ key: "custom", label: "Other OpenAI-compatible endpoint" }); @@ -2155,7 +2213,7 @@ async function setupPolicies(sandboxName) { // ── Dashboard ──────────────────────────────────────────────────── const CONTROL_UI_PORT = 18789; -const CONTROL_UI_CHAT_PATH = "/chat?session=main"; +const CONTROL_UI_PATH = "/"; function findOpenclawJsonPath(dir) { if (!fs.existsSync(dir)) return null; @@ -2201,17 +2259,13 @@ function fetchGatewayAuthTokenFromSandbox(sandboxName) { } } -function buildControlUiChatUrls(token) { +function buildControlUiUrls(token) { const hash = token ? `#token=${token}` : ""; - const pathChat = `${CONTROL_UI_CHAT_PATH}${hash}`; - const bases = [ - `http://127.0.0.1:${CONTROL_UI_PORT}`, - `http://localhost:${CONTROL_UI_PORT}`, - ]; + const baseUrl = `http://127.0.0.1:${CONTROL_UI_PORT}`; + const urls = [`${baseUrl}${CONTROL_UI_PATH}${hash}`]; const chatUi = (process.env.CHAT_UI_URL || "").trim().replace(/\/$/, ""); - const urls = bases.map((b) => `${b}${pathChat}`); - if (chatUi && /^https?:\/\//i.test(chatUi) && !bases.includes(chatUi)) { - urls.push(`${chatUi}${pathChat}`); + if (chatUi && /^https?:\/\//i.test(chatUi) && chatUi !== baseUrl) { + urls.push(`${chatUi}${CONTROL_UI_PATH}${hash}`); } return [...new Set(urls)]; } @@ -2239,22 +2293,26 @@ function printDashboard(sandboxName, model, provider, nimContainer = null) { console.log(` Model ${model} (${providerLabel})`); console.log(` NIM ${nimLabel}`); console.log(` ${"─".repeat(50)}`); - console.log(` Next:`); + console.log(` Run: nemoclaw ${sandboxName} connect`); + console.log(` Status: nemoclaw ${sandboxName} status`); + console.log(` Logs: nemoclaw ${sandboxName} logs --follow`); + console.log(""); if (token) { - note(" URLs below embed the gateway token — treat them like a password."); - console.log(` Control UI: copy one line into your browser (port ${CONTROL_UI_PORT} must 
be forwarded):`); - for (const u of buildControlUiChatUrls(token)) { - console.log(` ${u}`); + console.log(" OpenClaw UI (tokenized URL; treat it like a password)"); + console.log(` Port ${CONTROL_UI_PORT} must be forwarded before opening this URL.`); + for (const url of buildControlUiUrls(token)) { + console.log(` ${url}`); } } else { note(" Could not read gateway token from the sandbox (download failed)."); - console.log(` Control UI: http://127.0.0.1:${CONTROL_UI_PORT}${CONTROL_UI_CHAT_PATH}`); + console.log(" OpenClaw UI"); + console.log(` Port ${CONTROL_UI_PORT} must be forwarded before opening this URL.`); + for (const url of buildControlUiUrls()) { + console.log(` ${url}`); + } console.log(` Token: nemoclaw ${sandboxName} connect → jq -r '.gateway.auth.token' /sandbox/.openclaw/openclaw.json`); console.log(` append #token= to the URL, or see /tmp/gateway.log inside the sandbox.`); } - console.log(` Run: nemoclaw ${sandboxName} connect`); - console.log(` Status: nemoclaw ${sandboxName} status`); - console.log(` Logs: nemoclaw ${sandboxName} logs --follow`); console.log(` ${"─".repeat(50)}`); console.log(""); } @@ -2297,12 +2355,16 @@ module.exports = { getInstalledOpenshellVersion, getStableGatewayImageRef, hasStaleGateway, + isGatewayHealthy, isSandboxReady, onboard, + preflight, pruneStaleSandboxEntry, runCaptureOpenshell, setupInference, setupNim, + startGateway, + startGatewayForRecovery, writeSandboxConfigSyncFile, patchStagedDockerfile, }; diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 868b00b83..3d82ffa96 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -20,7 +20,9 @@ const R = _useColor ? "\x1b[0m" : ""; const _RD = _useColor ? "\x1b[1;31m" : ""; const YW = _useColor ? 
"\x1b[1;33m" : ""; -const { ROOT, SCRIPTS, run, runCapture, runInteractive, shellQuote, validateName } = require("./lib/runner"); +const { ROOT, SCRIPTS, run, runCapture: _runCapture, runInteractive, shellQuote, validateName } = require("./lib/runner"); +const { resolveOpenshell } = require("./lib/resolve-openshell"); +const { startGatewayForRecovery } = require("./lib/onboard"); const { ensureApiKey, ensureGithubToken, @@ -41,6 +43,263 @@ const GLOBAL_COMMANDS = new Set([ ]); const REMOTE_UNINSTALL_URL = "https://raw.githubusercontent.com/NVIDIA/NemoClaw/refs/heads/main/uninstall.sh"; +let OPENSHELL_BIN = null; + +function getOpenshellBinary() { + if (!OPENSHELL_BIN) { + OPENSHELL_BIN = resolveOpenshell(); + } + if (!OPENSHELL_BIN) { + console.error("openshell CLI not found. Install OpenShell before using sandbox commands."); + process.exit(1); + } + return OPENSHELL_BIN; +} + +function runOpenshell(args, opts = {}) { + const result = spawnSync(getOpenshellBinary(), args, { + cwd: ROOT, + env: { ...process.env, ...opts.env }, + encoding: "utf-8", + stdio: opts.stdio ?? "inherit", + }); + if (result.status !== 0 && !opts.ignoreError) { + console.error(` Command failed (exit ${result.status}): openshell ${args.join(" ")}`); + process.exit(result.status || 1); + } + return result; +} + +function captureOpenshell(args, opts = {}) { + const result = spawnSync(getOpenshellBinary(), args, { + cwd: ROOT, + env: { ...process.env, ...opts.env }, + encoding: "utf-8", + stdio: ["ignore", "pipe", "pipe"], + }); + return { + status: result.status ?? 1, + output: `${result.stdout || ""}${opts.ignoreError ? 
"" : result.stderr || ""}`.trim(), + }; +} + +function stripAnsi(value = "") { + // eslint-disable-next-line no-control-regex + return String(value).replace(/\x1b\[[0-9;]*m/g, ""); +} + +function hasNamedGateway(output = "") { + return stripAnsi(output).includes("Gateway: nemoclaw"); +} + +function getActiveGatewayName(output = "") { + const match = stripAnsi(output).match(/^\s*Gateway:\s+(.+?)\s*$/m); + return match ? match[1].trim() : ""; +} + +function getNamedGatewayLifecycleState() { + const status = captureOpenshell(["status"]); + const gatewayInfo = captureOpenshell(["gateway", "info", "-g", "nemoclaw"]); + const cleanStatus = stripAnsi(status.output); + const activeGateway = getActiveGatewayName(status.output); + const connected = /^\s*Status:\s*Connected\b/im.test(cleanStatus); + const named = hasNamedGateway(gatewayInfo.output); + const refusing = /Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanStatus); + if (connected && activeGateway === "nemoclaw" && named) { + return { state: "healthy_named", status: status.output, gatewayInfo: gatewayInfo.output }; + } + if (activeGateway === "nemoclaw" && named && refusing) { + return { state: "named_unreachable", status: status.output, gatewayInfo: gatewayInfo.output }; + } + if (activeGateway === "nemoclaw" && named) { + return { state: "named_unhealthy", status: status.output, gatewayInfo: gatewayInfo.output }; + } + if (connected) { + return { state: "connected_other", status: status.output, gatewayInfo: gatewayInfo.output }; + } + return { state: "missing_named", status: status.output, gatewayInfo: gatewayInfo.output }; +} + +async function recoverNamedGatewayRuntime() { + const before = getNamedGatewayLifecycleState(); + if (before.state === "healthy_named") { + return { recovered: true, before, after: before, attempted: false }; + } + + runOpenshell(["gateway", "select", "nemoclaw"], { ignoreError: true }); + let after = getNamedGatewayLifecycleState(); + if (after.state === 
"healthy_named") { + process.env.OPENSHELL_GATEWAY = "nemoclaw"; + return { recovered: true, before, after, attempted: true, via: "select" }; + } + + const shouldStartGateway = [before.state, after.state].some((state) => + ["named_unhealthy", "named_unreachable", "connected_other"].includes(state) + ); + + if (shouldStartGateway) { + try { + await startGatewayForRecovery(); + } catch { + // Fall through to the lifecycle re-check below so we preserve the + // existing recovery result shape and emit the correct classification. + } + runOpenshell(["gateway", "select", "nemoclaw"], { ignoreError: true }); + after = getNamedGatewayLifecycleState(); + if (after.state === "healthy_named") { + process.env.OPENSHELL_GATEWAY = "nemoclaw"; + return { recovered: true, before, after, attempted: true, via: "start" }; + } + } + + return { recovered: false, before, after, attempted: true }; +} + +function getSandboxGatewayState(sandboxName) { + const result = captureOpenshell(["sandbox", "get", sandboxName]); + const output = result.output; + if (result.status === 0) { + return { state: "present", output }; + } + if (/NotFound|sandbox not found/i.test(output)) { + return { state: "missing", output }; + } + if (/transport error|Connection refused|handshake verification failed|Missing gateway auth token|device identity required/i.test(output)) { + return { state: "gateway_error", output }; + } + return { state: "unknown_error", output }; +} + +function printGatewayLifecycleHint(output = "", sandboxName = "", writer = console.error) { + const cleanOutput = stripAnsi(output); + if (/No gateway configured/i.test(cleanOutput)) { + writer(" The selected NemoClaw gateway is no longer configured or its metadata/runtime has been lost."); + writer(" Start the gateway again with `openshell gateway start --name nemoclaw` before expecting existing sandboxes to reconnect."); + writer(" If the gateway has to be rebuilt from scratch, recreate the affected sandbox afterward."); + return; + } + if 
(/Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanOutput) && /Gateway:\s+nemoclaw/i.test(cleanOutput)) { + writer(" The selected NemoClaw gateway exists in metadata, but its API is refusing connections after restart."); + writer(" This usually means the gateway runtime did not come back cleanly after the restart."); + writer(" Retry `openshell gateway start --name nemoclaw`; if it stays in this state, rebuild the gateway before expecting existing sandboxes to reconnect."); + return; + } + if (/handshake verification failed/i.test(cleanOutput)) { + writer(" This looks like gateway identity drift after restart."); + writer(" Existing sandboxes may still be recorded locally, but the current gateway no longer trusts their prior connection state."); + writer(" Try re-establishing the NemoClaw gateway/runtime first. If the sandbox is still unreachable, recreate just that sandbox with `nemoclaw onboard`."); + return; + } + if (/Connection refused|transport error/i.test(cleanOutput)) { + writer(` The sandbox '${sandboxName}' may still exist, but the current gateway/runtime is not reachable.`); + writer(" Check `openshell status`, verify the active gateway, and retry."); + return; + } + if (/Missing gateway auth token|device identity required/i.test(cleanOutput)) { + writer(" The gateway is reachable, but the current auth or device identity state is not usable."); + writer(" Verify the active gateway and retry after re-establishing the runtime."); + } +} + +async function getReconciledSandboxGatewayState(sandboxName) { + let lookup = getSandboxGatewayState(sandboxName); + if (lookup.state === "present") { + return lookup; + } + if (lookup.state === "missing") { + return lookup; + } + + if (lookup.state === "gateway_error") { + const recovery = await recoverNamedGatewayRuntime(); + if (recovery.recovered) { + const retried = getSandboxGatewayState(sandboxName); + if (retried.state === "present" || retried.state === "missing") { + return { 
...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; + } + if (/handshake verification failed/i.test(retried.output)) { + return { + state: "identity_drift", + output: retried.output, + recoveredGateway: true, + recoveryVia: recovery.via || null, + }; + } + return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; + } + const latestLifecycle = getNamedGatewayLifecycleState(); + const latestStatus = stripAnsi(latestLifecycle.status || ""); + if (/No gateway configured/i.test(latestStatus)) { + return { + state: "gateway_missing_after_restart", + output: latestLifecycle.status || lookup.output, + }; + } + if (/Connection refused|client error \(Connect\)|tcp connect error/i.test(latestStatus) && /Gateway:\s+nemoclaw/i.test(latestStatus)) { + return { + state: "gateway_unreachable_after_restart", + output: latestLifecycle.status || lookup.output, + }; + } + if (recovery.after?.state === "named_unreachable" || recovery.before?.state === "named_unreachable") { + return { + state: "gateway_unreachable_after_restart", + output: recovery.after?.status || recovery.before?.status || lookup.output, + }; + } + return { ...lookup, gatewayRecoveryFailed: true }; + } + + return lookup; +} + +async function ensureLiveSandboxOrExit(sandboxName) { + const lookup = await getReconciledSandboxGatewayState(sandboxName); + if (lookup.state === "present") { + return lookup; + } + if (lookup.state === "missing") { + registry.removeSandbox(sandboxName); + console.error(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); + console.error(" Removed stale local registry entry."); + console.error(" Run `nemoclaw list` to confirm the remaining sandboxes, or `nemoclaw onboard` to create a new one."); + process.exit(1); + } + if (lookup.state === "identity_drift") { + console.error(` Sandbox '${sandboxName}' is recorded locally, but the gateway trust material rotated after restart.`); + if (lookup.output) { + 
console.error(lookup.output); + } + console.error(" Existing sandbox connections cannot be reattached safely after this gateway identity change."); + console.error(" Recreate this sandbox with `nemoclaw onboard` once the gateway runtime is stable."); + process.exit(1); + } + if (lookup.state === "gateway_unreachable_after_restart") { + console.error(` Sandbox '${sandboxName}' may still exist, but the selected NemoClaw gateway is still refusing connections after restart.`); + if (lookup.output) { + console.error(lookup.output); + } + console.error(" Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting."); + console.error(" If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox."); + process.exit(1); + } + if (lookup.state === "gateway_missing_after_restart") { + console.error(` Sandbox '${sandboxName}' may still exist locally, but the NemoClaw gateway is no longer configured after restart/rebuild.`); + if (lookup.output) { + console.error(lookup.output); + } + console.error(" Start the gateway again with `openshell gateway start --name nemoclaw` before retrying."); + console.error(" If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward."); + process.exit(1); + } + console.error(` Unable to verify sandbox '${sandboxName}' against the live OpenShell gateway.`); + if (lookup.output) { + console.error(lookup.output); + } + printGatewayLifecycleHint(lookup.output, sandboxName); + console.error(" Check `openshell status` and the active gateway, then retry."); + process.exit(1); +} function resolveUninstallScript() { const candidates = [ @@ -298,17 +557,22 @@ function listSandboxes() { // ── Sandbox-scoped actions ─────────────────────────────────────── -function sandboxConnect(sandboxName) { - const qn = shellQuote(sandboxName); +async function sandboxConnect(sandboxName) { + await ensureLiveSandboxOrExit(sandboxName); // Ensure port forward 
is alive before connecting - run(`openshell forward start --background 18789 ${qn} 2>/dev/null || true`, { ignoreError: true }); - runInteractive(`openshell sandbox connect ${qn}`); + runOpenshell(["forward", "start", "--background", "18789", sandboxName], { ignoreError: true }); + const result = spawnSync(getOpenshellBinary(), ["sandbox", "connect", sandboxName], { + stdio: "inherit", + cwd: ROOT, + env: process.env, + }); + exitWithSpawnResult(result); } -function sandboxStatus(sandboxName) { +async function sandboxStatus(sandboxName) { const sb = registry.getSandbox(sandboxName); const live = parseGatewayInference( - runCapture("openshell inference get 2>/dev/null", { ignoreError: true }) + captureOpenshell(["inference", "get"], { ignoreError: true }).output ); if (sb) { console.log(""); @@ -319,8 +583,51 @@ function sandboxStatus(sandboxName) { console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); } - // openshell info - run(`openshell sandbox get ${shellQuote(sandboxName)} 2>/dev/null || true`, { ignoreError: true }); + const lookup = await getReconciledSandboxGatewayState(sandboxName); + if (lookup.state === "present") { + console.log(""); + if (lookup.recoveredGateway) { + console.log(` Recovered NemoClaw gateway runtime via ${lookup.recoveryVia || "gateway reattach"}.`); + console.log(""); + } + console.log(lookup.output); + } else if (lookup.state === "missing") { + registry.removeSandbox(sandboxName); + console.log(""); + console.log(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); + console.log(" Removed stale local registry entry."); + } else if (lookup.state === "identity_drift") { + console.log(""); + console.log(` Sandbox '${sandboxName}' is recorded locally, but the gateway trust material rotated after restart.`); + if (lookup.output) { + console.log(lookup.output); + } + console.log(" Existing sandbox connections cannot be reattached safely after this gateway identity change."); + console.log(" Recreate 
this sandbox with `nemoclaw onboard` once the gateway runtime is stable."); + } else if (lookup.state === "gateway_unreachable_after_restart") { + console.log(""); + console.log(` Sandbox '${sandboxName}' may still exist, but the selected NemoClaw gateway is still refusing connections after restart.`); + if (lookup.output) { + console.log(lookup.output); + } + console.log(" Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting."); + console.log(" If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox."); + } else if (lookup.state === "gateway_missing_after_restart") { + console.log(""); + console.log(` Sandbox '${sandboxName}' may still exist locally, but the NemoClaw gateway is no longer configured after restart/rebuild.`); + if (lookup.output) { + console.log(lookup.output); + } + console.log(" Start the gateway again with `openshell gateway start --name nemoclaw` before retrying."); + console.log(" If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward."); + } else { + console.log(""); + console.log(` Could not verify sandbox '${sandboxName}' against the live OpenShell gateway.`); + if (lookup.output) { + console.log(lookup.output); + } + printGatewayLifecycleHint(lookup.output, sandboxName, console.log); + } // NIM health const nimStat = sb && sb.nimContainer ? nim.nimStatusByName(sb.nimContainer) : nim.nimStatus(sandboxName); @@ -332,8 +639,9 @@ function sandboxStatus(sandboxName) { } function sandboxLogs(sandboxName, follow) { - const followFlag = follow ? 
" --tail" : ""; - run(`openshell logs ${shellQuote(sandboxName)}${followFlag}`); + const args = ["logs", sandboxName]; + if (follow) args.push("--follow"); + runOpenshell(args); } async function sandboxPolicyAdd(sandboxName) { @@ -390,7 +698,7 @@ async function sandboxDestroy(sandboxName, args = []) { else nim.stopNimContainer(sandboxName); console.log(` Deleting sandbox '${sandboxName}'...`); - run(`openshell sandbox delete ${shellQuote(sandboxName)} 2>/dev/null || true`, { ignoreError: true }); + runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); registry.removeSandbox(sandboxName); console.log(` ${G}✓${R} Sandbox '${sandboxName}' destroyed`); @@ -488,8 +796,8 @@ const [cmd, ...args] = process.argv.slice(2); const actionArgs = args.slice(1); switch (action) { - case "connect": sandboxConnect(cmd); break; - case "status": sandboxStatus(cmd); break; + case "connect": await sandboxConnect(cmd); break; + case "status": await sandboxStatus(cmd); break; case "logs": sandboxLogs(cmd, actionArgs.includes("--follow")); break; case "policy-add": await sandboxPolicyAdd(cmd); break; case "policy-list": sandboxPolicyList(cmd); break; diff --git a/scripts/clean-staged-tree.sh b/scripts/clean-staged-tree.sh new file mode 100755 index 000000000..93a550e21 --- /dev/null +++ b/scripts/clean-staged-tree.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +target_dir="${1:-}" + +if [ -z "$target_dir" ]; then + echo "usage: $0 <target-dir>" >&2 + exit 1 +fi + +rm -rf "$target_dir/.venv" "$target_dir/.pytest_cache" +find "$target_dir" -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true diff --git a/scripts/nemoclaw-start.sh b/scripts/nemoclaw-start.sh index 518ef8555..203794ed7 100755 --- a/scripts/nemoclaw-start.sh +++ b/scripts/nemoclaw-start.sh @@ -18,12 +18,12 @@ set -euo pipefail # Harden: limit process count to prevent fork bombs (ref: #809) # Best-effort: some container runtimes (e.g., brev) restrict ulimit # modification, returning "Invalid argument". Warn but don't block startup. -if ! ulimit -Hu 512 2>/dev/null; then - echo "[SECURITY] Could not set hard nproc limit (container runtime may restrict ulimit)" >&2 -fi if ! ulimit -Su 512 2>/dev/null; then echo "[SECURITY] Could not set soft nproc limit (container runtime may restrict ulimit)" >&2 fi +if ! ulimit -Hu 512 2>/dev/null; then + echo "[SECURITY] Could not set hard nproc limit (container runtime may restrict ulimit)" >&2 +fi # SECURITY: Lock down PATH so the agent cannot inject malicious binaries # into commands executed by the entrypoint or auto-pair watcher. diff --git a/scripts/setup.sh b/scripts/setup.sh index 99cd40f2f..81bd7a2c2 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -201,6 +201,7 @@ cp -r "$REPO_DIR/nemoclaw" "$BUILD_CTX/nemoclaw" cp -r "$REPO_DIR/nemoclaw-blueprint" "$BUILD_CTX/nemoclaw-blueprint" cp -r "$REPO_DIR/scripts" "$BUILD_CTX/scripts" rm -rf "$BUILD_CTX/nemoclaw/node_modules" +bash "$BUILD_CTX/scripts/clean-staged-tree.sh" "$BUILD_CTX/nemoclaw-blueprint" 2>/dev/null || true # Capture full output to a temp file so we can filter for display but still # detect failures. The raw log is kept on failure for debugging. 
diff --git a/test/cli.test.js b/test/cli.test.js index 82dd5ee64..7cfb06e0d 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -3,16 +3,22 @@ import { describe, it, expect } from "vitest"; import { execSync } from "node:child_process"; +import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; const CLI = path.join(import.meta.dirname, "..", "bin", "nemoclaw.js"); function run(args) { + return runWithEnv(args); +} + +function runWithEnv(args, env = {}, timeout = 10000) { try { const out = execSync(`node "${CLI}" ${args}`, { encoding: "utf-8", - timeout: 10000, - env: { ...process.env, HOME: "/tmp/nemoclaw-cli-test-" + Date.now() }, + timeout, + env: { ...process.env, HOME: "/tmp/nemoclaw-cli-test-" + Date.now(), ...env }, }); return { code: 0, out }; } catch (err) { @@ -90,4 +96,588 @@ describe("CLI dispatch", () => { expect(r.out.includes("Troubleshooting")).toBeTruthy(); expect(r.out.includes("nemoclaw debug")).toBeTruthy(); }); + + it("passes --follow through to openshell logs", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-logs-follow-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + const markerFile = path.join(home, "logs-args"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + "printf '%s ' \"$@\" > \"$marker_file\"", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha logs --follow", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + 
expect(r.code).toBe(0); + expect(fs.readFileSync(markerFile, "utf8")).toContain("logs alpha --follow"); + }); + + it("removes stale registry entries when connect targets a missing live sandbox", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-stale-connect-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: status: NotFound, message: \"sandbox not found\"' >&2", + " exit 1", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha connect", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect(r.out.includes("Removed stale local registry entry")).toBeTruthy(); + const saved = JSON.parse(fs.readFileSync(path.join(registryDir, "sandboxes.json"), "utf8")); + expect(saved.sandboxes.alpha).toBeUndefined(); + }); + + it("keeps registry entries when status hits a gateway-level transport error", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-gateway-error-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + 
gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: handshake verification failed' >&2", + " exit 1", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + + expect(r.code).toBe(0); + expect(r.out.includes("Could not verify sandbox 'alpha'")).toBeTruthy(); + expect(r.out.includes("gateway identity drift after restart")).toBeTruthy(); + const saved = JSON.parse(fs.readFileSync(path.join(registryDir, "sandboxes.json"), "utf8")); + expect(saved.sandboxes.alpha).toBeTruthy(); + }, 25000); + + it("recovers status after gateway runtime is reattached", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-recover-status-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + const stateFile = path.join(home, "sandbox-get-count"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `state_file=${JSON.stringify(stateFile)}`, + "count=$(cat \"$state_file\" 2>/dev/null || echo 0)", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " count=$((count + 1))", + " echo \"$count\" > \"$state_file\"", + " if [ \"$count\" -eq 1 ]; then", + " echo 'Error: transport error: Connection refused' >&2", + 
" exit 1", + " fi", + " echo 'Sandbox: alpha'", + " exit 0", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: nemoclaw'", + " echo ' Status: Connected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out.includes("Recovered NemoClaw gateway runtime")).toBeTruthy(); + expect(r.out.includes("Sandbox: alpha")).toBeTruthy(); + }); + + it("does not treat a different connected gateway as a healthy nemoclaw gateway", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-mixed-gateway-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: Connection refused' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: openshell'", + " echo ' Status: Connected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' 
Gateway: nemoclaw'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"start\" ] && [ \"$3\" = \"--name\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"inference\" ] && [ \"$2\" = \"get\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + + expect(r.code).toBe(0); + expect(r.out.includes("Recovered NemoClaw gateway runtime")).toBeFalsy(); + expect(r.out.includes("Could not verify sandbox 'alpha'")).toBeTruthy(); + expect(r.out.includes("verify the active gateway")).toBeTruthy(); + }, 25000); + + it("matches ANSI-decorated gateway transport errors when printing lifecycle hints", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-ansi-transport-hint-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " printf '\\033[31mError: trans\\033[0mport error: Connec\\033[33mtion refused\\033[0m\\n' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: openshell'", + " echo ' Status: Disconnected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ 
\"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " printf 'Gateway Info\\n\\n Gateway: openshell\\n'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + + expect(r.code).toBe(0); + expect(r.out.includes("current gateway/runtime is not reachable")).toBeTruthy(); + }, 25000); + + it("matches ANSI-decorated gateway auth errors when printing lifecycle hints", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-ansi-auth-hint-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " printf '\\033[31mMissing gateway auth\\033[0m token\\n' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: openshell'", + " echo ' Status: Disconnected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " printf 'Gateway Info\\n\\n Gateway: openshell\\n'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = 
runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + + expect(r.code).toBe(0); + expect(r.out.includes("Verify the active gateway and retry after re-establishing the runtime.")).toBeTruthy(); + }, 25000); + + it("explains unrecoverable gateway trust rotation after restart", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-identity-drift-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: handshake verification failed' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: nemoclaw'", + " echo ' Status: Connected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const statusResult = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + expect(statusResult.code).toBe(0); + expect(statusResult.out.includes("gateway trust material rotated after restart")).toBeTruthy(); + expect(statusResult.out.includes("cannot be reattached safely")).toBeTruthy(); + + const connectResult = runWithEnv("alpha connect", { + HOME: home, + 
PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(connectResult.code).toBe(1); + expect(connectResult.out.includes("gateway trust material rotated after restart")).toBeTruthy(); + expect(connectResult.out.includes("Recreate this sandbox")).toBeTruthy(); + }); + + it("explains when gateway metadata exists but the restarted API is still refusing connections", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-gateway-unreachable-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: Connection refused' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: nemoclaw'", + " echo ' Server: https://127.0.0.1:8080'", + " echo 'Error: client error (Connect)' >&2", + " echo 'Connection refused (os error 111)' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' Gateway: nemoclaw'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"start\" ] && [ \"$3\" = \"--name\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const 
statusResult = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + expect(statusResult.code).toBe(0); + expect(statusResult.out.includes("gateway is still refusing connections after restart")).toBeTruthy(); + expect(statusResult.out.includes("Retry `openshell gateway start --name nemoclaw`")).toBeTruthy(); + + const connectResult = runWithEnv("alpha connect", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(connectResult.code).toBe(1); + expect(connectResult.out.includes("gateway is still refusing connections after restart")).toBeTruthy(); + expect(connectResult.out.includes("If the gateway never becomes healthy")).toBeTruthy(); + }, 25000); + + it("explains when the named gateway is no longer configured after restart or rebuild", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-gateway-missing-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: Connection refused' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Gateway Status'", + " echo", + " echo ' Status: No gateway configured.'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 1", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" 
= \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"start\" ] && [ \"$3\" = \"--name\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 1", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const statusResult = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(statusResult.code).toBe(0); + expect(statusResult.out.includes("gateway is no longer configured after restart/rebuild")).toBeTruthy(); + expect(statusResult.out.includes("Start the gateway again")).toBeTruthy(); + }, 25000); }); diff --git a/test/e2e/test-double-onboard.sh b/test/e2e/test-double-onboard.sh index f70d6533e..da2f4a065 100755 --- a/test/e2e/test-double-onboard.sh +++ b/test/e2e/test-double-onboard.sh @@ -2,28 +2,25 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Double onboard: verify that consecutive `nemoclaw onboard` runs recover -# automatically from stale state (gateway, port forward, registry entries) -# left behind by a previous run. +# Double onboard / lifecycle recovery: +# - prove repeat onboard reuses the healthy shared NemoClaw gateway +# - prove onboarding a second sandbox does not destroy the first sandbox +# - prove stale registry entries are reconciled against live OpenShell state +# - prove gateway rebuilds surface the expected lifecycle guidance # -# Regression test for issues #21, #22, #140, #152, #397. -# -# Key insight: running onboard without NVIDIA_API_KEY in non-interactive -# mode causes process.exit(1) at step 4, but steps 1-3 (preflight, -# gateway, sandbox) complete first — naturally simulating an unclean exit. 
-# -# Prerequisites: -# - Docker running -# - openshell CLI installed -# - nemoclaw CLI installed -# - NVIDIA_API_KEY must NOT be set -# -# Usage: -# unset NVIDIA_API_KEY -# bash test/e2e/test-double-onboard.sh +# This script intentionally uses a local fake OpenAI-compatible endpoint so it +# matches the current onboarding flow. Older versions of this test relied on a +# missing/invalid NVIDIA_API_KEY causing a late failure after sandbox creation; +# that no longer reflects current non-interactive onboarding behavior. set -uo pipefail +if [ -z "${NEMOCLAW_E2E_NO_TIMEOUT:-}" ]; then + export NEMOCLAW_E2E_NO_TIMEOUT=1 + TIMEOUT_SECONDS="${NEMOCLAW_E2E_TIMEOUT_SECONDS:-900}" + exec timeout -s TERM "$TIMEOUT_SECONDS" "$0" "$@" +fi + PASS=0 FAIL=0 TOTAL=0 @@ -44,22 +41,144 @@ section() { } info() { printf '\033[1;34m [info]\033[0m %s\n' "$1"; } +registry_has() { + local sandbox_name="$1" + [ -f "$REGISTRY" ] && grep -q "$sandbox_name" "$REGISTRY" +} + SANDBOX_A="e2e-double-a" SANDBOX_B="e2e-double-b" REGISTRY="$HOME/.nemoclaw/sandboxes.json" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +FAKE_HOST="127.0.0.1" +FAKE_PORT="${NEMOCLAW_FAKE_PORT:-18080}" +FAKE_BASE_URL="http://${FAKE_HOST}:${FAKE_PORT}/v1" +FAKE_LOG="$(mktemp)" +FAKE_PID="" + +if command -v node >/dev/null 2>&1 && [ -f "$REPO_ROOT/bin/nemoclaw.js" ]; then + NEMOCLAW_CMD=(node "$REPO_ROOT/bin/nemoclaw.js") +else + NEMOCLAW_CMD=(nemoclaw) +fi + +# shellcheck disable=SC2329 +cleanup() { + if [ -n "$FAKE_PID" ] && kill -0 "$FAKE_PID" 2>/dev/null; then + kill "$FAKE_PID" 2>/dev/null || true + wait "$FAKE_PID" 2>/dev/null || true + fi + rm -f "$FAKE_LOG" +} +trap cleanup EXIT + +start_fake_openai() { + python3 - "$FAKE_HOST" "$FAKE_PORT" >"$FAKE_LOG" 2>&1 <<'PY' & +import json +import sys +from http.server import BaseHTTPRequestHandler, HTTPServer + +HOST = sys.argv[1] +PORT = int(sys.argv[2]) + + +class Handler(BaseHTTPRequestHandler): + def _send(self, status, payload): + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): + return + + def do_GET(self): + if self.path in ("/v1/models", "/models"): + self._send(200, {"data": [{"id": "test-model", "object": "model"}]}) + return + self._send(404, {"error": {"message": "not found"}}) + + def do_POST(self): + length = int(self.headers.get("Content-Length", "0")) + if length: + self.rfile.read(length) + if self.path in ("/v1/chat/completions", "/chat/completions"): + self._send( + 200, + { + "id": "chatcmpl-test", + "object": "chat.completion", + "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}], + }, + ) + return + if self.path in ("/v1/responses", "/responses"): + self._send( + 200, + { + "id": "resp-test", + "object": "response", + "output": [{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "ok"}]}], + }, + ) + return + 
self._send(404, {"error": {"message": "not found"}}) + + +HTTPServer((HOST, PORT), Handler).serve_forever() +PY + FAKE_PID=$! + + for _ in $(seq 1 20); do + if curl -sf "${FAKE_BASE_URL}/models" >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + + return 1 +} + +run_onboard() { + local sandbox_name="$1" + local recreate="${2:-0}" + local log_file + log_file="$(mktemp)" + + local -a env_args=( + "COMPATIBLE_API_KEY=dummy" + "NEMOCLAW_NON_INTERACTIVE=1" + "NEMOCLAW_PROVIDER=custom" + "NEMOCLAW_ENDPOINT_URL=${FAKE_BASE_URL}" + "NEMOCLAW_MODEL=test-model" + "NEMOCLAW_SANDBOX_NAME=${sandbox_name}" + "NEMOCLAW_POLICY_MODE=skip" + ) + if [ "$recreate" = "1" ]; then + env_args+=("NEMOCLAW_RECREATE_SANDBOX=1") + fi + + env "${env_args[@]}" "${NEMOCLAW_CMD[@]}" onboard --non-interactive >"$log_file" 2>&1 + RUN_ONBOARD_EXIT=$? + RUN_ONBOARD_OUTPUT="$(cat "$log_file")" + rm -f "$log_file" +} + +run_nemoclaw() { + "${NEMOCLAW_CMD[@]}" "$@" +} # ══════════════════════════════════════════════════════════════════ # Phase 0: Pre-cleanup # ══════════════════════════════════════════════════════════════════ section "Phase 0: Pre-cleanup" info "Destroying any leftover test sandboxes/gateway from previous runs..." -# Use nemoclaw destroy (not just openshell sandbox delete) to also clean -# the nemoclaw registry at ~/.nemoclaw/sandboxes.json. Stale registry -# entries from a previous run would cause Phase 2 to exit with -# "Sandbox already exists" before the test even starts. 
-if command -v nemoclaw >/dev/null 2>&1; then - nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true - nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true +if [ -x "$REPO_ROOT/bin/nemoclaw.js" ] || command -v nemoclaw >/dev/null 2>&1; then + run_nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true + run_nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true fi openshell sandbox delete "$SANDBOX_A" 2>/dev/null || true openshell sandbox delete "$SANDBOX_B" 2>/dev/null || true @@ -68,7 +187,7 @@ openshell gateway destroy -g nemoclaw 2>/dev/null || true pass "Pre-cleanup complete" # ══════════════════════════════════════════════════════════════════ -# Phase 1: Prerequisites +# Phase 1: Prerequisites + fake endpoint # ══════════════════════════════════════════════════════════════════ section "Phase 1: Prerequisites" @@ -86,51 +205,53 @@ else exit 1 fi -if command -v nemoclaw >/dev/null 2>&1; then - pass "nemoclaw CLI installed" +if [ -x "$REPO_ROOT/bin/nemoclaw.js" ] || command -v nemoclaw >/dev/null 2>&1; then + pass "nemoclaw CLI available" else fail "nemoclaw CLI not found — cannot continue" exit 1 fi -if [ -n "${NVIDIA_API_KEY:-}" ]; then - fail "NVIDIA_API_KEY is set — this test requires it UNSET (unset NVIDIA_API_KEY)" +if command -v python3 >/dev/null 2>&1; then + pass "python3 installed" +else + fail "python3 not found — cannot continue" exit 1 +fi + +if start_fake_openai; then + pass "Fake OpenAI-compatible endpoint started at ${FAKE_BASE_URL}" else - pass "NVIDIA_API_KEY is not set (required for controlled step-4 exit)" + fail "Failed to start fake OpenAI-compatible endpoint" + info "Fake server log:" + sed 's/^/ /' "$FAKE_LOG" + exit 1 fi # ══════════════════════════════════════════════════════════════════ -# Phase 2: First onboard (e2e-double-a) — leaves stale state +# Phase 2: First onboard (e2e-double-a) # ══════════════════════════════════════════════════════════════════ section "Phase 2: First onboard ($SANDBOX_A)" -info "Running nemoclaw 
onboard — expect exit 1 (no API key)..." +info "Running successful non-interactive onboard against local compatible endpoint..." -# Write to temp file to avoid openshell FD inheritance blocking $() -ONBOARD_LOG="$(mktemp)" -NEMOCLAW_NON_INTERACTIVE=1 \ - NEMOCLAW_SANDBOX_NAME="$SANDBOX_A" \ - NEMOCLAW_POLICY_MODE=skip \ - nemoclaw onboard --non-interactive >"$ONBOARD_LOG" 2>&1 -exit1=$? -output1="$(cat "$ONBOARD_LOG")" -rm -f "$ONBOARD_LOG" +run_onboard "$SANDBOX_A" +output1="$RUN_ONBOARD_OUTPUT" +exit1="$RUN_ONBOARD_EXIT" -if [ $exit1 -eq 1 ]; then - pass "First onboard exited 1 (step 4 failed as expected)" +if [ "$exit1" -eq 0 ]; then + pass "First onboard completed successfully" else - fail "First onboard exited $exit1 (expected 1)" + fail "First onboard exited $exit1 (expected 0)" fi if grep -q "Sandbox '${SANDBOX_A}' created" <<<"$output1"; then - pass "Sandbox '$SANDBOX_A' created (step 3 completed)" + pass "Sandbox '$SANDBOX_A' created" else - fail "Sandbox creation not confirmed in output" + fail "Sandbox '$SANDBOX_A' creation not confirmed in output" fi -# Verify stale state was left behind if openshell gateway info -g nemoclaw 2>/dev/null | grep -q "nemoclaw"; then - pass "Gateway is still running (stale state)" + pass "Gateway is running after first onboard" else fail "Gateway is not running after first onboard" fi @@ -141,96 +262,76 @@ else fail "Sandbox '$SANDBOX_A' not found in openshell" fi -if [ -f "$REGISTRY" ] && grep -q "$SANDBOX_A" "$REGISTRY"; then +if registry_has "$SANDBOX_A"; then pass "Registry contains '$SANDBOX_A'" else fail "Registry does not contain '$SANDBOX_A'" fi -info "Stale state confirmed — NOT cleaning up before next onboard" - # ══════════════════════════════════════════════════════════════════ -# Phase 3: Second onboard — SAME name (e2e-double-a) +# Phase 3: Second onboard — SAME name (recreate) # ══════════════════════════════════════════════════════════════════ -section "Phase 3: Second onboard ($SANDBOX_A — same name, stale 
state)" +section "Phase 3: Second onboard ($SANDBOX_A — same name, recreate)" info "Running nemoclaw onboard with NEMOCLAW_RECREATE_SANDBOX=1..." -ONBOARD_LOG="$(mktemp)" -NEMOCLAW_NON_INTERACTIVE=1 \ - NEMOCLAW_SANDBOX_NAME="$SANDBOX_A" \ - NEMOCLAW_RECREATE_SANDBOX=1 \ - NEMOCLAW_POLICY_MODE=skip \ - nemoclaw onboard --non-interactive >"$ONBOARD_LOG" 2>&1 -exit2=$? -output2="$(cat "$ONBOARD_LOG")" -rm -f "$ONBOARD_LOG" +run_onboard "$SANDBOX_A" "1" +output2="$RUN_ONBOARD_OUTPUT" +exit2="$RUN_ONBOARD_EXIT" -# Step 4 still fails (no API key), but steps 1-3 should succeed -if [ $exit2 -eq 1 ]; then - pass "Second onboard exited 1 (step 4 failed as expected)" +if [ "$exit2" -eq 0 ]; then + pass "Second onboard completed successfully" else - fail "Second onboard exited $exit2 (expected 1)" + fail "Second onboard exited $exit2 (expected 0)" fi -if grep -q "Cleaning up previous NemoClaw session" <<<"$output2"; then - pass "Stale session cleanup fired on second onboard" +if grep -q "Reusing existing NemoClaw gateway" <<<"$output2"; then + pass "Healthy gateway reused on second onboard" else - fail "Stale session cleanup did NOT fire (regression: #397)" + fail "Healthy gateway was not reused on second onboard" fi if grep -q "Port 8080 is not available" <<<"$output2"; then - fail "Port 8080 conflict detected (regression: #21)" + fail "Port 8080 conflict detected (regression)" else - pass "No port 8080 conflict" + pass "No port 8080 conflict on second onboard" fi if grep -q "Port 18789 is not available" <<<"$output2"; then - fail "Port 18789 conflict detected" + fail "Port 18789 conflict detected on second onboard" else - pass "No port 18789 conflict" + pass "No port 18789 conflict on second onboard" fi -if grep -q "Sandbox '${SANDBOX_A}' created" <<<"$output2"; then - pass "Sandbox '$SANDBOX_A' recreated" -else - fail "Sandbox '$SANDBOX_A' was not recreated" -fi - -if openshell gateway info -g nemoclaw 2>/dev/null | grep -q "nemoclaw"; then - pass "Gateway running after 
second onboard" +if openshell sandbox get "$SANDBOX_A" >/dev/null 2>&1; then + pass "Sandbox '$SANDBOX_A' still exists after recreate" else - fail "Gateway not running after second onboard" + fail "Sandbox '$SANDBOX_A' missing after recreate" fi # ══════════════════════════════════════════════════════════════════ -# Phase 4: Third onboard — DIFFERENT name (e2e-double-b) +# Phase 4: Third onboard — DIFFERENT name # ══════════════════════════════════════════════════════════════════ -section "Phase 4: Third onboard ($SANDBOX_B — different name, stale state)" +section "Phase 4: Third onboard ($SANDBOX_B — different name)" info "Running nemoclaw onboard with new sandbox name..." -ONBOARD_LOG="$(mktemp)" -NEMOCLAW_NON_INTERACTIVE=1 \ - NEMOCLAW_SANDBOX_NAME="$SANDBOX_B" \ - NEMOCLAW_POLICY_MODE=skip \ - nemoclaw onboard --non-interactive >"$ONBOARD_LOG" 2>&1 -exit3=$? -output3="$(cat "$ONBOARD_LOG")" -rm -f "$ONBOARD_LOG" +run_onboard "$SANDBOX_B" +output3="$RUN_ONBOARD_OUTPUT" +exit3="$RUN_ONBOARD_EXIT" -if [ $exit3 -eq 1 ]; then - pass "Third onboard exited 1 (step 4 failed as expected)" +if [ "$exit3" -eq 0 ]; then + pass "Third onboard completed successfully" else - fail "Third onboard exited $exit3 (expected 1)" + fail "Third onboard exited $exit3 (expected 0)" fi -if grep -q "Cleaning up previous NemoClaw session" <<<"$output3"; then - pass "Stale session cleanup fired on third onboard" +if grep -q "Reusing existing NemoClaw gateway" <<<"$output3"; then + pass "Healthy gateway reused on third onboard" else - fail "Stale session cleanup did NOT fire on third onboard" + fail "Healthy gateway was not reused on third onboard" fi if grep -q "Port 8080 is not available" <<<"$output3"; then - fail "Port 8080 conflict on third onboard (regression)" + fail "Port 8080 conflict on third onboard" else pass "No port 8080 conflict on third onboard" fi @@ -241,19 +342,100 @@ else pass "No port 18789 conflict on third onboard" fi -if grep -q "Sandbox '${SANDBOX_B}' created" 
<<<"$output3"; then +if openshell sandbox get "$SANDBOX_B" >/dev/null 2>&1; then pass "Sandbox '$SANDBOX_B' created" else fail "Sandbox '$SANDBOX_B' was not created" fi +if openshell sandbox get "$SANDBOX_A" >/dev/null 2>&1; then + pass "First sandbox '$SANDBOX_A' still exists after creating '$SANDBOX_B'" +else + fail "First sandbox '$SANDBOX_A' disappeared after creating '$SANDBOX_B' (regression: #849)" +fi + +# ══════════════════════════════════════════════════════════════════ +# Phase 5: Stale registry reconciliation +# ══════════════════════════════════════════════════════════════════ +section "Phase 5: Stale registry reconciliation" +info "Deleting '$SANDBOX_A' directly in OpenShell to leave a stale NemoClaw registry entry..." + +openshell sandbox delete "$SANDBOX_A" 2>/dev/null || true + +if registry_has "$SANDBOX_A"; then + pass "Registry still contains stale '$SANDBOX_A' entry" +else + fail "Registry was unexpectedly cleaned before status reconciliation" +fi + +STATUS_LOG="$(mktemp)" +run_nemoclaw "$SANDBOX_A" status >"$STATUS_LOG" 2>&1 +status_exit=$? 
+status_output="$(cat "$STATUS_LOG")" +rm -f "$STATUS_LOG" + +if [ "$status_exit" -eq 0 ]; then + pass "Stale sandbox status exited 0" +else + fail "Stale sandbox status exited $status_exit (expected 0)" +fi + +if grep -q "Removed stale local registry entry" <<<"$status_output"; then + pass "Stale registry entry was reconciled during status" +else + fail "Stale registry reconciliation message missing" +fi + +if registry_has "$SANDBOX_A"; then + fail "Registry still contains '$SANDBOX_A' after status reconciliation" +else + pass "Registry entry for '$SANDBOX_A' removed after status reconciliation" +fi + # ══════════════════════════════════════════════════════════════════ -# Phase 5: Final cleanup +# Phase 6: Gateway lifecycle response # ══════════════════════════════════════════════════════════════════ -section "Phase 5: Final cleanup" +section "Phase 6: Gateway lifecycle response" +info "Stopping the NemoClaw gateway runtime to verify current lifecycle behavior..." -nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true -nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true +openshell forward stop 18789 2>/dev/null || true +openshell gateway stop -g nemoclaw 2>/dev/null || true + +GATEWAY_LOG="$(mktemp)" +run_nemoclaw "$SANDBOX_B" status >"$GATEWAY_LOG" 2>&1 +gateway_status_exit=$? 
+gateway_status_output="$(cat "$GATEWAY_LOG")" +rm -f "$GATEWAY_LOG" + +if [ "$gateway_status_exit" -eq 0 ]; then + pass "Post-stop status exited 0" +else + fail "Post-stop status exited $gateway_status_exit (expected 0)" +fi + +if grep -qE \ + "Recovered NemoClaw gateway runtime|gateway is no longer configured after restart/rebuild|gateway is still refusing connections after restart|gateway trust material rotated after restart" \ + <<<"$gateway_status_output"; then + pass "Gateway lifecycle response was explicit after gateway stop" +else + fail "Gateway lifecycle response was not explicit after gateway stop" + info "Observed status output:" + printf '%s\n' "$gateway_status_output" | sed 's/^/ /' +fi + +if registry_has "$SANDBOX_B"; then + pass "Registry still contains '$SANDBOX_B' after gateway stop" +else + fail "Registry is missing '$SANDBOX_B' after gateway stop" +fi + +# ══════════════════════════════════════════════════════════════════ +# Phase 7: Final cleanup +# ══════════════════════════════════════════════════════════════════ +section "Phase 7: Final cleanup" + +run_nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true +run_nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true openshell sandbox delete "$SANDBOX_A" 2>/dev/null || true openshell sandbox delete "$SANDBOX_B" 2>/dev/null || true openshell forward stop 18789 2>/dev/null || true @@ -279,9 +461,6 @@ fi pass "Final cleanup complete" -# ══════════════════════════════════════════════════════════════════ -# Summary -# ══════════════════════════════════════════════════════════════════ echo "" echo "========================================" echo " Double Onboard E2E Results:" @@ -291,7 +470,7 @@ echo " Total: $TOTAL" echo "========================================" if [ "$FAIL" -eq 0 ]; then - printf '\n\033[1;32m Double onboard PASSED — stale state recovery verified.\033[0m\n' + printf '\n\033[1;32m Double onboard and lifecycle recovery PASSED.\033[0m\n' exit 0 else printf '\n\033[1;31m %d test(s) 
failed.\033[0m\n' "$FAIL" diff --git a/test/gateway-cleanup.test.js b/test/gateway-cleanup.test.js index 5043a2373..799680048 100644 --- a/test/gateway-cleanup.test.js +++ b/test/gateway-cleanup.test.js @@ -23,16 +23,17 @@ describe("gateway cleanup: Docker volumes removed on failure (#17)", () => { it("onboard.js: volume cleanup runs on gateway start failure", () => { const content = fs.readFileSync(path.join(ROOT, "bin/lib/onboard.js"), "utf-8"); - // The startGateway function should call destroyGateway after a failed start - const startGwBlock = content.match(/async function startGateway[\s\S]*?^}/m); + const startGwBlock = content.match(/async function startGatewayWithOptions[\s\S]*?^}/m); expect(startGwBlock).toBeTruthy(); - // Count calls to destroyGateway — should be at least 3: - // 1. pre-cleanup before start - // 2. after start failure - // 3. after health check failure - const calls = (startGwBlock[0].match(/destroyGateway\(\)/g) || []).length; - expect(calls).toBeGreaterThanOrEqual(3); + // Current behavior: + // 1. stale gateway metadata is destroyed directly before start, if present + // 2. destroyGateway() runs after start failure + // 3. 
destroyGateway() runs after health check failure + expect(startGwBlock[0].includes('if (hasStaleGateway(gwInfo))')).toBe(true); + expect(startGwBlock[0].includes('runOpenshell(["gateway", "destroy", "-g", GATEWAY_NAME]')).toBe(true); + const destroyCalls = (startGwBlock[0].match(/destroyGateway\(\)/g) || []).length; + expect(destroyCalls).toBeGreaterThanOrEqual(2); }); it("uninstall.sh: includes Docker volume cleanup", () => { diff --git a/test/onboard.test.js b/test/onboard.test.js index f1240a9ed..8a8046b52 100644 --- a/test/onboard.test.js +++ b/test/onboard.test.js @@ -12,6 +12,7 @@ import { buildSandboxConfigSyncScript, getFutureShellPathHint, getInstalledOpenshellVersion, + isGatewayHealthy, getSandboxInferenceConfig, getStableGatewayImageRef, patchStagedDockerfile, @@ -152,6 +153,33 @@ describe("onboard helpers", () => { expect(getStableGatewayImageRef("bogus")).toBe(null); }); + it("recognizes only a connected named NemoClaw gateway as healthy", () => { + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: nemoclaw\n Status: Connected", + "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080" + ) + ).toBe(true); + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: openshell\n Status: Connected", + "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080" + ) + ).toBe(false); + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: openshell\n Status: Connected", + "Error: no gateway metadata found" + ) + ).toBe(false); + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: nemoclaw\n Status: Disconnected", + "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080" + ) + ).toBe(false); + }); + it("returns a future-shell PATH hint for user-local openshell installs", () => { expect(getFutureShellPathHint("/home/test/.local/bin", "/usr/local/bin:/usr/bin")).toBe( 'export PATH="/home/test/.local/bin:$PATH"' @@ -436,6 +464,68 @@ console.log(JSON.stringify({ liveExists, 
sandbox: registry.getSandbox("my-assist assert.equal(payload.sandbox, null); }); + it("reuses an existing healthy gateway instead of destroying it", () => { + const repoRoot = path.join(import.meta.dirname, ".."); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-gateway-reuse-")); + const fakeBin = path.join(tmpDir, "bin"); + const scriptPath = path.join(tmpDir, "gateway-reuse-check.js"); + const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js")); + const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js")); + + fs.mkdirSync(fakeBin, { recursive: true }); + fs.writeFileSync(path.join(fakeBin, "openshell"), "#!/usr/bin/env bash\nexit 0\n", { mode: 0o755 }); + + const script = String.raw` +const runner = require(${runnerPath}); +const commands = []; + +runner.run = (command, opts = {}) => { + commands.push(command); + return { status: 0 }; +}; +runner.runCapture = (command) => { + if (command.includes("'status'")) { + return "Server Status\n\n Gateway: nemoclaw\n Status: Connected"; + } + if (command.includes("'gateway' 'info' '-g' 'nemoclaw'")) { + return "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080"; + } + if (command.includes("'--version'")) { + return "openshell 0.0.12"; + } + return ""; +}; + +const { startGateway } = require(${onboardPath}); + +(async () => { + await startGateway(null); + console.log(JSON.stringify(commands)); +})().catch((error) => { + console.error(error); + process.exit(1); +}); +`; + fs.writeFileSync(scriptPath, script); + + const result = spawnSync(process.execPath, [scriptPath], { + cwd: repoRoot, + encoding: "utf-8", + env: { + ...process.env, + HOME: tmpDir, + PATH: `${fakeBin}:${process.env.PATH || ""}`, + }, + }); + + assert.equal(result.status, 0, result.stderr); + const commands = JSON.parse(result.stdout.trim().split("\n").pop()); + assert.equal(commands.length, 1); + assert.match(commands[0], /gateway' 'select' 'nemoclaw'/); 
+ assert.doesNotMatch(commands[0], /gateway' 'destroy'/); + assert.doesNotMatch(commands[0], /gateway' 'start'/); + }); + it("builds the sandbox without uploading an external OpenClaw config file", async () => { const repoRoot = path.join(import.meta.dirname, ".."); const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-create-sandbox-"));