Skip to content
35 changes: 25 additions & 10 deletions bin/lib/nim.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

const { run, runCapture, shellQuote } = require("./runner");
const nimImages = require("./nim-images.json");
const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"];

function containerName(sandboxName) {
return `nemoclaw-nim-${sandboxName}`;
Expand All @@ -23,6 +24,10 @@ function listModels() {
}));
}

/**
 * Report whether any NIM model listed in `nimImages.models` fits in the
 * given amount of memory.
 *
 * @param {number} totalMemoryMB - Available memory, in MB.
 * @returns {boolean} True when at least one model's `minGpuMemoryMB` is
 *   less than or equal to `totalMemoryMB`; false otherwise.
 */
function canRunNimWithMemory(totalMemoryMB) {
  for (const model of nimImages.models) {
    if (model.minGpuMemoryMB <= totalMemoryMB) {
      return true;
    }
  }
  return false;
}

function detectGpu() {
// Try NVIDIA first — query VRAM
try {
Expand All @@ -34,42 +39,51 @@ function detectGpu() {
const perGpuMB = lines.map((l) => parseInt(l.trim(), 10)).filter((n) => !isNaN(n));
if (perGpuMB.length > 0) {
const totalMemoryMB = perGpuMB.reduce((a, b) => a + b, 0);
// Only mark nimCapable if at least one NIM model fits in GPU VRAM
const canRunNim = nimImages.models.some((m) => m.minGpuMemoryMB <= totalMemoryMB);
return {
type: "nvidia",
count: perGpuMB.length,
totalMemoryMB,
perGpuMB: perGpuMB[0],
nimCapable: canRunNim,
nimCapable: canRunNimWithMemory(totalMemoryMB),
};
}
}
} catch {
/* ignored */
}

// Fallback: DGX Spark (GB10) — VRAM not queryable due to unified memory architecture
// Fallback: unified-memory NVIDIA devices where discrete VRAM is not queryable.
try {
const nameOutput = runCapture("nvidia-smi --query-gpu=name --format=csv,noheader,nounits", {
ignoreError: true,
});
if (nameOutput && nameOutput.includes("GB10")) {
// GB10 has 128GB unified memory shared with Grace CPU — use system RAM
const gpuNames = nameOutput
.split("\n")
.map((line) => line.trim())
.filter(Boolean);
const unifiedGpuNames = gpuNames.filter((name) =>
UNIFIED_MEMORY_GPU_TAGS.some((tag) => new RegExp(tag, "i").test(name)),
);
if (unifiedGpuNames.length > 0) {
let totalMemoryMB = 0;
try {
const memLine = runCapture("free -m | awk '/Mem:/ {print $2}'", { ignoreError: true });
if (memLine) totalMemoryMB = parseInt(memLine.trim(), 10) || 0;
} catch {
/* ignored */
}
const count = unifiedGpuNames.length;
const perGpuMB = count > 0 ? Math.floor(totalMemoryMB / count) : totalMemoryMB;
const isSpark = unifiedGpuNames.some((name) => /GB10/i.test(name));
return {
type: "nvidia",
count: 1,
name: unifiedGpuNames[0],
count,
totalMemoryMB,
perGpuMB: totalMemoryMB,
nimCapable: true,
spark: true,
perGpuMB: perGpuMB || totalMemoryMB,
nimCapable: canRunNimWithMemory(totalMemoryMB),
unifiedMemory: true,
spark: isSpark,
};
}
} catch {
Expand Down Expand Up @@ -232,6 +246,7 @@ module.exports = {
containerName,
getImageForModel,
listModels,
canRunNimWithMemory,
detectGpu,
pullNimImage,
startNimContainer,
Expand Down
25 changes: 16 additions & 9 deletions bin/lib/policies.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ const { ROOT, run, runCapture, shellQuote } = require("./runner");
const registry = require("./registry");

const PRESETS_DIR = path.join(ROOT, "nemoclaw-blueprint", "policies", "presets");

function getOpenshellCommand() {
const binary = process.env.NEMOCLAW_OPENSHELL_BIN;
if (!binary) return "openshell";
Expand Down Expand Up @@ -76,8 +75,15 @@ function extractPresetEntries(presetContent) {
function parseCurrentPolicy(raw) {
  // Purpose: pull the policy YAML body out of raw command output and return
  // "" whenever the text does not look like a policy document.
  //
  // raw: string (or falsy) — presumably the captured output of the policy
  //      "get" command; verify against the caller.
  // Returns: the trimmed YAML body, or "" when the input is empty, is a
  //          diagnostic message, or has no top-level `key:` line.
  if (!raw) return "";

  // Anything up to and including the first "---" is treated as a header and
  // dropped; without a separator the whole text is the candidate.
  const sep = raw.indexOf("---");
  const candidate = (sep === -1 ? raw : raw.slice(sep + 3)).trim();
  if (!candidate) return "";

  // Text that opens with a diagnostic keyword is a CLI message, not a policy.
  // Returning it would let callers merge presets into an error string.
  if (/^(error|failed|invalid|warning|status)\b/i.test(candidate)) {
    return "";
  }

  // Require at least one line beginning with a lowercase `key:` so arbitrary
  // prose is never mistaken for YAML.
  if (!/^[a-z_][a-z0-9_]*\s*:/m.test(candidate)) {
    return "";
  }
  return candidate;
}

/**
Expand All @@ -104,16 +110,17 @@ function buildPolicyGetCommand(sandboxName) {
* @returns {string} Merged YAML with version header when missing
*/
function mergePresetIntoPolicy(currentPolicy, presetEntries) {
const normalizedCurrentPolicy = parseCurrentPolicy(currentPolicy);
if (!presetEntries) {
return currentPolicy || "version: 1\n\nnetwork_policies:\n";
return normalizedCurrentPolicy || "version: 1\n\nnetwork_policies:\n";
}
if (!currentPolicy) {
if (!normalizedCurrentPolicy) {
return "version: 1\n\nnetwork_policies:\n" + presetEntries;
}

let merged;
if (/^network_policies\s*:/m.test(currentPolicy)) {
const lines = currentPolicy.split("\n");
if (/^network_policies\s*:/m.test(normalizedCurrentPolicy)) {
const lines = normalizedCurrentPolicy.split("\n");
const result = [];
let inNetworkPolicies = false;
let inserted = false;
Expand Down Expand Up @@ -142,11 +149,11 @@ function mergePresetIntoPolicy(currentPolicy, presetEntries) {

merged = result.join("\n");
} else {
merged = currentPolicy.trimEnd() + "\n\nnetwork_policies:\n" + presetEntries;
merged = normalizedCurrentPolicy.trimEnd() + "\n\nnetwork_policies:\n" + presetEntries;
}

if (!merged.trimStart().startsWith("version:")) {
merged = "version: 1\n" + merged;
merged = "version: 1\n\n" + merged;
}
return merged;
}
Expand Down
16 changes: 14 additions & 2 deletions bin/nemoclaw.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ const REMOTE_UNINSTALL_URL =
"https://raw.githubusercontent.com/NVIDIA/NemoClaw/refs/heads/main/uninstall.sh";
let OPENSHELL_BIN = null;
const MIN_LOGS_OPENSHELL_VERSION = "0.0.7";
const NEMOCLAW_GATEWAY_NAME = "nemoclaw";
const DASHBOARD_FORWARD_PORT = "18789";

function getOpenshellBinary() {
if (!OPENSHELL_BIN) {
Expand Down Expand Up @@ -108,6 +110,15 @@ function captureOpenshell(args, opts = {}) {
};
}

/**
 * Tear down the shared gateway resources once no sandboxes remain.
 *
 * Runs three steps in order, each invoked with `ignoreError: true`
 * (presumably so a failing step does not abort the rest — confirm against
 * the `run` / `runOpenshell` helpers):
 *   1. stop the openshell forward on DASHBOARD_FORWARD_PORT,
 *   2. destroy the gateway named NEMOCLAW_GATEWAY_NAME,
 *   3. remove the Docker volumes matched by the
 *      `openshell-cluster-<gateway>` name filter.
 */
function cleanupGatewayAfterLastSandbox() {
  const listClusterVolumes = `docker volume ls -q --filter "name=openshell-cluster-${NEMOCLAW_GATEWAY_NAME}"`;

  runOpenshell(["forward", "stop", DASHBOARD_FORWARD_PORT], { ignoreError: true });
  runOpenshell(["gateway", "destroy", "-g", NEMOCLAW_GATEWAY_NAME], { ignoreError: true });

  // `grep .` gates the removal so `docker volume rm` only runs when the
  // listing is non-empty; the trailing `|| true` keeps the pipeline's exit
  // status at zero either way.
  const removeClusterVolumes = `${listClusterVolumes} | grep . && ${listClusterVolumes} | xargs docker volume rm || true`;
  run(removeClusterVolumes, { ignoreError: true });
}

function parseVersionFromText(value = "") {
const match = String(value || "").match(/([0-9]+\.[0-9]+\.[0-9]+)/);
return match ? match[1] : null;
Expand Down Expand Up @@ -748,7 +759,6 @@ async function deploy(instanceName) {
}

async function start() {
await ensureApiKey();
const { defaultSandbox } = registry.listSandboxes();
const safeName =
defaultSandbox && /^[a-zA-Z0-9._-]+$/.test(defaultSandbox) ? defaultSandbox : null;
Expand Down Expand Up @@ -1090,7 +1100,9 @@ async function sandboxDestroy(sandboxName, args = []) {
console.log(` Deleting sandbox '${sandboxName}'...`);
runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });

registry.removeSandbox(sandboxName);
if (registry.removeSandbox(sandboxName) && registry.listSandboxes().sandboxes.length === 0) {
cleanupGatewayAfterLastSandbox();
}
console.log(` ${G}✓${R} Sandbox '${sandboxName}' destroyed`);
}

Expand Down
7 changes: 4 additions & 3 deletions scripts/start-services.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,12 @@ do_stop() {
}

do_start() {
[ -n "${NVIDIA_API_KEY:-}" ] || fail "NVIDIA_API_KEY required"

if [ -z "${TELEGRAM_BOT_TOKEN:-}" ]; then
warn "TELEGRAM_BOT_TOKEN not set — Telegram bridge will not start."
warn "Create a bot via @BotFather on Telegram and set the token."
elif [ -z "${NVIDIA_API_KEY:-}" ]; then
warn "NVIDIA_API_KEY not set — Telegram bridge will not start."
warn "Set NVIDIA_API_KEY if you want Telegram requests to reach inference."
fi

command -v node >/dev/null || fail "node not found. Install Node.js first."
Expand All @@ -151,7 +152,7 @@ do_start() {
mkdir -p "$PIDDIR"

# Telegram bridge (only if token provided)
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ]; then
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${NVIDIA_API_KEY:-}" ]; then
SANDBOX_NAME="$SANDBOX_NAME" start_service telegram-bridge \
node "$REPO_DIR/scripts/telegram-bridge.js"
fi
Expand Down
167 changes: 167 additions & 0 deletions test/cli.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,52 @@ describe("CLI dispatch", () => {
expect(r.out.includes("No sandboxes")).toBeTruthy();
});

it("start does not prompt for NVIDIA_API_KEY before launching local services", () => {
  // Throwaway HOME holding a one-sandbox registry and a private bin directory.
  const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-start-no-key-"));
  const localBin = path.join(home, "bin");
  const registryDir = path.join(home, ".nemoclaw");
  const markerFile = path.join(home, "start-args");
  for (const dir of [localBin, registryDir]) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const registryState = {
    sandboxes: {
      alpha: {
        name: "alpha",
        model: "test-model",
        provider: "nvidia-prod",
        gpuEnabled: false,
        policies: [],
      },
    },
    defaultSandbox: "alpha",
  };
  fs.writeFileSync(path.join(registryDir, "sandboxes.json"), JSON.stringify(registryState), {
    mode: 0o600,
  });

  // Stub `bash` that records its arguments (one per line) in markerFile, so
  // the test can see which script the CLI tried to launch.
  const bashStub = [
    "#!/bin/sh",
    `marker_file=${JSON.stringify(markerFile)}`,
    'printf \'%s\\n\' "$@" > "$marker_file"',
    "exit 0",
  ].join("\n");
  fs.writeFileSync(path.join(localBin, "bash"), bashStub, { mode: 0o755 });

  // Both credentials are blank: `start` must still succeed without prompting.
  const env = {
    HOME: home,
    PATH: `${localBin}:${process.env.PATH || ""}`,
    NVIDIA_API_KEY: "",
    TELEGRAM_BOT_TOKEN: "",
  };
  const r = runWithEnv("start", env);

  expect(r.code).toBe(0);
  expect(r.out).not.toContain("NVIDIA API Key required");
  const recordedArgs = fs.readFileSync(markerFile, "utf8");
  expect(recordedArgs).toContain("start-services.sh");
});

it("unknown onboard option exits 1", () => {
const r = run("onboard --non-interactiv");
expect(r.code).toBe(1);
Expand Down Expand Up @@ -148,6 +194,127 @@ describe("CLI dispatch", () => {
expect(fs.readFileSync(markerFile, "utf8")).not.toContain("--follow");
});

it("destroys the gateway runtime when the last sandbox is removed", () => {
  // Throwaway HOME whose registry contains exactly one sandbox ("alpha").
  const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-destroy-last-"));
  const localBin = path.join(home, "bin");
  const registryDir = path.join(home, ".nemoclaw");
  const openshellLog = path.join(home, "openshell.log");
  const bashLog = path.join(home, "bash.log");
  for (const dir of [localBin, registryDir]) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const registryState = {
    sandboxes: {
      alpha: {
        name: "alpha",
        model: "test-model",
        provider: "nvidia-prod",
        gpuEnabled: false,
        policies: [],
      },
    },
    defaultSandbox: "alpha",
  };
  fs.writeFileSync(path.join(registryDir, "sandboxes.json"), JSON.stringify(registryState), {
    mode: 0o600,
  });

  // Install a stub executable that appends its argument string to logFile
  // and exits 0, so every invocation can be inspected afterwards.
  const writeLoggingStub = (name, logFile) => {
    const script = [
      "#!/bin/sh",
      `log_file=${JSON.stringify(logFile)}`,
      'printf \'%s\\n\' "$*" >> "$log_file"',
      "exit 0",
    ].join("\n");
    fs.writeFileSync(path.join(localBin, name), script, { mode: 0o755 });
  };
  writeLoggingStub("openshell", openshellLog);
  writeLoggingStub("bash", bashLog);

  const r = runWithEnv("alpha destroy --yes", {
    HOME: home,
    PATH: `${localBin}:${process.env.PATH || ""}`,
  });

  expect(r.code).toBe(0);

  // Removing the only sandbox must also stop the forward, destroy the
  // gateway, and issue the docker volume cleanup.
  const openshellCalls = fs.readFileSync(openshellLog, "utf8");
  expect(openshellCalls).toContain("sandbox delete alpha");
  expect(openshellCalls).toContain("forward stop 18789");
  expect(openshellCalls).toContain("gateway destroy -g nemoclaw");

  const bashCalls = fs.readFileSync(bashLog, "utf8");
  expect(bashCalls).toContain("docker volume ls -q --filter");
});

it("keeps the gateway runtime when other sandboxes still exist", () => {
  // Throwaway HOME whose registry contains two sandboxes, so destroying
  // "alpha" still leaves "beta" registered.
  const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-destroy-shared-"));
  const localBin = path.join(home, "bin");
  const registryDir = path.join(home, ".nemoclaw");
  const openshellLog = path.join(home, "openshell.log");
  const bashLog = path.join(home, "bash.log");
  fs.mkdirSync(localBin, { recursive: true });
  fs.mkdirSync(registryDir, { recursive: true });
  fs.writeFileSync(
    path.join(registryDir, "sandboxes.json"),
    JSON.stringify({
      sandboxes: {
        alpha: {
          name: "alpha",
          model: "test-model",
          provider: "nvidia-prod",
          gpuEnabled: false,
          policies: [],
        },
        beta: {
          name: "beta",
          model: "test-model",
          provider: "nvidia-prod",
          gpuEnabled: false,
          policies: [],
        },
      },
      defaultSandbox: "alpha",
    }),
    { mode: 0o600 },
  );

  // Stub `openshell` and `bash`: each appends its argument string to a log
  // file and exits 0.
  fs.writeFileSync(
    path.join(localBin, "openshell"),
    [
      "#!/bin/sh",
      `log_file=${JSON.stringify(openshellLog)}`,
      'printf \'%s\\n\' "$*" >> "$log_file"',
      "exit 0",
    ].join("\n"),
    { mode: 0o755 },
  );
  fs.writeFileSync(
    path.join(localBin, "bash"),
    [
      "#!/bin/sh",
      `log_file=${JSON.stringify(bashLog)}`,
      'printf \'%s\\n\' "$*" >> "$log_file"',
      "exit 0",
    ].join("\n"),
    { mode: 0o755 },
  );

  const r = runWithEnv("alpha destroy --yes", {
    HOME: home,
    PATH: `${localBin}:${process.env.PATH || ""}`,
  });

  expect(r.code).toBe(0);

  const openshellCalls = fs.readFileSync(openshellLog, "utf8");
  expect(openshellCalls).toContain("sandbox delete alpha");
  expect(openshellCalls).not.toContain("forward stop 18789");
  expect(openshellCalls).not.toContain("gateway destroy -g nemoclaw");

  // The stub only creates bash.log when it is invoked. If nothing ran through
  // bash, a missing log is the strongest form of "no volume cleanup happened",
  // so treat it as empty instead of letting readFileSync throw ENOENT.
  const bashCalls = fs.existsSync(bashLog) ? fs.readFileSync(bashLog, "utf8") : "";
  expect(bashCalls).not.toContain("docker volume ls -q --filter");
});

it("passes plain logs through without the tail flag", () => {
const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-logs-plain-"));
const localBin = path.join(home, "bin");
Expand Down
Loading
Loading