Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
36574ff
fix: harden installer and onboard resiliency
kjw3 Mar 24, 2026
c53bb20
Merge origin/main into fix/446-installer-resiliency
kjw3 Mar 25, 2026
7fe4389
fix: improve onboard recovery after stale sandbox state
kjw3 Mar 25, 2026
2acf5a3
Merge remote-tracking branch 'origin/main' into fix/446-installer-res…
kjw3 Mar 25, 2026
8721b13
fix: recover gateway runtime after restart drift
kjw3 Mar 25, 2026
6ebb41d
fix: reuse healthy sandboxes on repeat onboard
kjw3 Mar 25, 2026
beffc78
fix: classify sandbox image transfer failures
kjw3 Mar 25, 2026
cb0428b
test: cover sandbox image transfer recovery hints
kjw3 Mar 25, 2026
0f4c0d3
fix: redact onboard session failure details
kjw3 Mar 25, 2026
a4e2821
Merge branch 'main' into fix/446-installer-resiliency
kjw3 Mar 25, 2026
9c73c7f
test: satisfy shellcheck for resiliency scripts
kjw3 Mar 25, 2026
8552e01
fix: address onboard review follow-ups
kjw3 Mar 25, 2026
22b2797
fix: handle ansi gateway metadata in onboarding
kjw3 Mar 25, 2026
b5d4f85
Merge branch 'main' into fix/446-installer-resiliency
kjw3 Mar 25, 2026
f8f7b96
fix: use openshell wrappers in onboard recovery
kjw3 Mar 25, 2026
09e2121
Merge branch 'main' into fix/446-installer-resiliency
kjw3 Mar 25, 2026
a8c64c7
fix(onboard): restore resiliency recovery flows
kjw3 Mar 25, 2026
11b311d
test(e2e): align resiliency flows with onboard order
kjw3 Mar 25, 2026
f90c693
fix(onboard): reuse healthy gateways during reruns
kjw3 Mar 25, 2026
359f7c9
fix(onboard): allow healthy gateway port reuse
kjw3 Mar 25, 2026
44acebd
fix(onboard): allow healthy dashboard forward reuse
kjw3 Mar 25, 2026
98e9b6d
test(e2e): refresh resiliency expectations
kjw3 Mar 25, 2026
c9ba0bb
fix: serialize onboard runs and correct resume step order
kjw3 Mar 25, 2026
0152363
Merge origin/main into fix/446-installer-resiliency
kjw3 Mar 25, 2026
0b3a598
fix: classify foreign gateway reuse conservatively
kjw3 Mar 25, 2026
aa05263
fix: restore legacy installer wrapper behavior
kjw3 Mar 25, 2026
b1dbe9d
Merge origin/main into fix/446-installer-resiliency
kjw3 Mar 26, 2026
0125332
fix: simplify printed OpenClaw UI URLs in onboard output
kjw3 Mar 26, 2026
e289249
fix: restore gateway recovery coverage paths
kjw3 Mar 26, 2026
fee98f2
fix: align onboard control ui output with main
kjw3 Mar 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
384 changes: 384 additions & 0 deletions bin/lib/onboard-session.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,384 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

const fs = require("fs");
const path = require("path");

const SESSION_VERSION = 1;
const SESSION_DIR = path.join(process.env.HOME || "/tmp", ".nemoclaw");
const SESSION_FILE = path.join(SESSION_DIR, "onboard-session.json");
const LOCK_FILE = path.join(SESSION_DIR, "onboard.lock");
const VALID_STEP_STATES = new Set(["pending", "in_progress", "complete", "failed", "skipped"]);

function ensureSessionDir() {
fs.mkdirSync(SESSION_DIR, { recursive: true, mode: 0o700 });
}

function sessionPath() {
return SESSION_FILE;
}

function lockPath() {
return LOCK_FILE;
}

function defaultSteps() {
return {
preflight: { status: "pending", startedAt: null, completedAt: null, error: null },
gateway: { status: "pending", startedAt: null, completedAt: null, error: null },
sandbox: { status: "pending", startedAt: null, completedAt: null, error: null },
provider_selection: { status: "pending", startedAt: null, completedAt: null, error: null },
inference: { status: "pending", startedAt: null, completedAt: null, error: null },
openclaw: { status: "pending", startedAt: null, completedAt: null, error: null },
policies: { status: "pending", startedAt: null, completedAt: null, error: null },
};
}

function createSession(overrides = {}) {
const now = new Date().toISOString();
return {
version: SESSION_VERSION,
sessionId: overrides.sessionId || `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`,
resumable: true,
status: "in_progress",
mode: overrides.mode || "interactive",
startedAt: overrides.startedAt || now,
updatedAt: overrides.updatedAt || now,
lastStepStarted: overrides.lastStepStarted || null,
lastCompletedStep: overrides.lastCompletedStep || null,
failure: overrides.failure || null,
sandboxName: overrides.sandboxName || null,
provider: overrides.provider || null,
model: overrides.model || null,
endpointUrl: overrides.endpointUrl || null,
credentialEnv: overrides.credentialEnv || null,
preferredInferenceApi: overrides.preferredInferenceApi || null,
nimContainer: overrides.nimContainer || null,
metadata: {
gatewayName: overrides.metadata?.gatewayName || "nemoclaw",
},
steps: {
...defaultSteps(),
...(overrides.steps || {}),
},
};
}

function isObject(value) {
return typeof value === "object" && value !== null && !Array.isArray(value);
}

function redactSensitiveText(value) {
if (typeof value !== "string") return null;
return value
.replace(/(NVIDIA_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GEMINI_API_KEY|COMPATIBLE_API_KEY|COMPATIBLE_ANTHROPIC_API_KEY)=\S+/gi, "$1=<REDACTED>")
.replace(/Bearer\s+\S+/gi, "Bearer <REDACTED>")
.replace(/nvapi-[A-Za-z0-9_-]{10,}/g, "<REDACTED>")
.replace(/ghp_[A-Za-z0-9]{20,}/g, "<REDACTED>")
.replace(/sk-[A-Za-z0-9_-]{10,}/g, "<REDACTED>")
.slice(0, 240);
}

function sanitizeFailure(input) {
if (!input) return null;
const step = typeof input.step === "string" ? input.step : null;
const message = redactSensitiveText(input.message);
const recordedAt = typeof input.recordedAt === "string" ? input.recordedAt : new Date().toISOString();
return step || message ? { step, message, recordedAt } : null;
}

function validateStep(step) {
if (!isObject(step)) return false;
if (!VALID_STEP_STATES.has(step.status)) return false;
return true;
}

function normalizeSession(data) {
if (!isObject(data) || data.version !== SESSION_VERSION) return null;
const normalized = createSession({
sessionId: typeof data.sessionId === "string" ? data.sessionId : undefined,
mode: typeof data.mode === "string" ? data.mode : undefined,
startedAt: typeof data.startedAt === "string" ? data.startedAt : undefined,
updatedAt: typeof data.updatedAt === "string" ? data.updatedAt : undefined,
sandboxName: typeof data.sandboxName === "string" ? data.sandboxName : null,
provider: typeof data.provider === "string" ? data.provider : null,
model: typeof data.model === "string" ? data.model : null,
endpointUrl: typeof data.endpointUrl === "string" ? data.endpointUrl : null,
credentialEnv: typeof data.credentialEnv === "string" ? data.credentialEnv : null,
preferredInferenceApi: typeof data.preferredInferenceApi === "string" ? data.preferredInferenceApi : null,
nimContainer: typeof data.nimContainer === "string" ? data.nimContainer : null,
lastStepStarted: typeof data.lastStepStarted === "string" ? data.lastStepStarted : null,
lastCompletedStep: typeof data.lastCompletedStep === "string" ? data.lastCompletedStep : null,
failure: sanitizeFailure(data.failure),
metadata: isObject(data.metadata) ? data.metadata : undefined,
});
normalized.resumable = data.resumable !== false;
normalized.status = typeof data.status === "string" ? data.status : normalized.status;

if (isObject(data.steps)) {
for (const [name, step] of Object.entries(data.steps)) {
if (Object.prototype.hasOwnProperty.call(normalized.steps, name) && validateStep(step)) {
normalized.steps[name] = {
status: step.status,
startedAt: typeof step.startedAt === "string" ? step.startedAt : null,
completedAt: typeof step.completedAt === "string" ? step.completedAt : null,
error: redactSensitiveText(step.error),
};
}
}
}

return normalized;
}

function loadSession() {
try {
if (!fs.existsSync(SESSION_FILE)) {
return null;
}
const parsed = JSON.parse(fs.readFileSync(SESSION_FILE, "utf-8"));
return normalizeSession(parsed);
} catch {
return null;
}
}

function saveSession(session) {
const normalized = normalizeSession(session) || createSession();
normalized.updatedAt = new Date().toISOString();
ensureSessionDir();
const tmpFile = path.join(
SESSION_DIR,
`.onboard-session.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}.tmp`
);
fs.writeFileSync(tmpFile, JSON.stringify(normalized, null, 2), { mode: 0o600 });
fs.renameSync(tmpFile, SESSION_FILE);
return normalized;
}

function clearSession() {
try {
if (fs.existsSync(SESSION_FILE)) {
fs.unlinkSync(SESSION_FILE);
}
} catch {
return;
}
}

function parseLockFile(contents) {
try {
const parsed = JSON.parse(contents);
if (typeof parsed?.pid !== "number") return null;
return {
pid: parsed.pid,
startedAt: typeof parsed.startedAt === "string" ? parsed.startedAt : null,
command: typeof parsed.command === "string" ? parsed.command : null,
};
} catch {
return null;
}
}

function isProcessAlive(pid) {
if (!Number.isInteger(pid) || pid <= 0) return false;
try {
process.kill(pid, 0);
return true;
} catch (error) {
return error?.code === "EPERM";
}
}

function acquireOnboardLock(command = null) {
ensureSessionDir();
const payload = JSON.stringify(
{
pid: process.pid,
startedAt: new Date().toISOString(),
command: typeof command === "string" ? command : null,
},
null,
2
);

for (let attempt = 0; attempt < 2; attempt++) {
try {
const fd = fs.openSync(LOCK_FILE, "wx", 0o600);
fs.writeFileSync(fd, payload);
fs.closeSync(fd);
return { acquired: true, lockFile: LOCK_FILE, stale: false };
} catch (error) {
if (error?.code !== "EEXIST") {
throw error;
}

const existing = parseLockFile(fs.readFileSync(LOCK_FILE, "utf8"));
if (existing && isProcessAlive(existing.pid)) {
return {
acquired: false,
lockFile: LOCK_FILE,
stale: false,
holderPid: existing.pid,
holderStartedAt: existing.startedAt,
holderCommand: existing.command,
};
}

try {
fs.unlinkSync(LOCK_FILE);
} catch (unlinkError) {
if (unlinkError?.code !== "ENOENT") {
throw unlinkError;
}
}
}
}

return { acquired: false, lockFile: LOCK_FILE, stale: true };
}

function releaseOnboardLock() {
try {
if (!fs.existsSync(LOCK_FILE)) return;
const existing = parseLockFile(fs.readFileSync(LOCK_FILE, "utf8"));
if (existing && existing.pid !== process.pid) return;
fs.unlinkSync(LOCK_FILE);
} catch {
return;
}
}

function updateSession(mutator) {
const current = loadSession() || createSession();
const next = typeof mutator === "function" ? mutator(current) || current : current;
return saveSession(next);
}

function markStepStarted(stepName) {
return updateSession((session) => {
const step = session.steps[stepName];
if (!step) return session;
step.status = "in_progress";
step.startedAt = new Date().toISOString();
step.error = null;
session.lastStepStarted = stepName;
session.failure = null;
session.status = "in_progress";
return session;
});
}

function markStepComplete(stepName, updates = {}) {
return updateSession((session) => {
const step = session.steps[stepName];
if (!step) return session;
step.status = "complete";
step.completedAt = new Date().toISOString();
step.error = null;
session.lastCompletedStep = stepName;
session.failure = null;
Object.assign(session, filterSafeUpdates(updates));
return session;
});
}

function markStepFailed(stepName, message = null) {
return updateSession((session) => {
const step = session.steps[stepName];
if (!step) return session;
step.status = "failed";
step.error = redactSensitiveText(message);
session.failure = sanitizeFailure({
step: stepName,
message,
recordedAt: new Date().toISOString(),
});
session.status = "failed";
return session;
});
}

function completeSession(updates = {}) {
return updateSession((session) => {
Object.assign(session, filterSafeUpdates(updates));
session.status = "complete";
session.failure = null;
return session;
});
}

function filterSafeUpdates(updates) {
const safe = {};
if (!isObject(updates)) return safe;
if (typeof updates.sandboxName === "string") safe.sandboxName = updates.sandboxName;
if (typeof updates.provider === "string") safe.provider = updates.provider;
if (typeof updates.model === "string") safe.model = updates.model;
if (typeof updates.endpointUrl === "string") safe.endpointUrl = updates.endpointUrl;
if (typeof updates.credentialEnv === "string") safe.credentialEnv = updates.credentialEnv;
if (typeof updates.preferredInferenceApi === "string") safe.preferredInferenceApi = updates.preferredInferenceApi;
if (typeof updates.nimContainer === "string") safe.nimContainer = updates.nimContainer;
if (isObject(updates.metadata)) {
safe.metadata = {};
if (typeof updates.metadata.gatewayName === "string") {
safe.metadata.gatewayName = updates.metadata.gatewayName;
}
}
return safe;
}

function summarizeForDebug(session = loadSession()) {
if (!session) return null;
return {
version: session.version,
sessionId: session.sessionId,
status: session.status,
resumable: session.resumable,
mode: session.mode,
startedAt: session.startedAt,
updatedAt: session.updatedAt,
sandboxName: session.sandboxName,
provider: session.provider,
model: session.model,
endpointUrl: session.endpointUrl,
credentialEnv: session.credentialEnv,
preferredInferenceApi: session.preferredInferenceApi,
nimContainer: session.nimContainer,
lastStepStarted: session.lastStepStarted,
lastCompletedStep: session.lastCompletedStep,
failure: session.failure,
steps: Object.fromEntries(
Object.entries(session.steps).map(([name, step]) => [
name,
{
status: step.status,
startedAt: step.startedAt,
completedAt: step.completedAt,
error: step.error,
},
])
),
};
}

module.exports = {
LOCK_FILE,
SESSION_DIR,
SESSION_FILE,
SESSION_VERSION,
acquireOnboardLock,
clearSession,
completeSession,
createSession,
loadSession,
markStepComplete,
markStepFailed,
markStepStarted,
lockPath,
saveSession,
releaseOnboardLock,
sessionPath,
redactSensitiveText,
summarizeForDebug,
updateSession,
};
Loading
Loading