Skip to content
35 changes: 25 additions & 10 deletions bin/lib/nim.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

const { run, runCapture, shellQuote } = require("./runner");
const nimImages = require("./nim-images.json");
// GPU name fragments that detectGpu treats as unified-memory NVIDIA devices.
// Each tag is matched case-insensitively against the names reported by
// `nvidia-smi --query-gpu=name`.
const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"];

function containerName(sandboxName) {
return `nemoclaw-nim-${sandboxName}`;
Expand All @@ -23,6 +24,10 @@ function listModels() {
}));
}

/**
 * Report whether at least one model listed in `nimImages.models` fits in the
 * given amount of memory.
 *
 * @param {number} totalMemoryMB - Memory budget in MB.
 * @returns {boolean} True when some model's `minGpuMemoryMB` is less than or
 *   equal to `totalMemoryMB`; false otherwise (including an empty model list).
 */
function canRunNimWithMemory(totalMemoryMB) {
  for (const model of nimImages.models) {
    if (model.minGpuMemoryMB <= totalMemoryMB) {
      return true;
    }
  }
  return false;
}

function detectGpu() {
// Try NVIDIA first — query VRAM
try {
Expand All @@ -34,42 +39,51 @@ function detectGpu() {
const perGpuMB = lines.map((l) => parseInt(l.trim(), 10)).filter((n) => !isNaN(n));
if (perGpuMB.length > 0) {
const totalMemoryMB = perGpuMB.reduce((a, b) => a + b, 0);
// Only mark nimCapable if at least one NIM model fits in GPU VRAM
const canRunNim = nimImages.models.some((m) => m.minGpuMemoryMB <= totalMemoryMB);
return {
type: "nvidia",
count: perGpuMB.length,
totalMemoryMB,
perGpuMB: perGpuMB[0],
nimCapable: canRunNim,
nimCapable: canRunNimWithMemory(totalMemoryMB),
};
}
}
} catch {
/* ignored */
}

// Fallback: DGX Spark (GB10) — VRAM not queryable due to unified memory architecture
// Fallback: unified-memory NVIDIA devices where discrete VRAM is not queryable.
try {
const nameOutput = runCapture("nvidia-smi --query-gpu=name --format=csv,noheader,nounits", {
ignoreError: true,
});
if (nameOutput && nameOutput.includes("GB10")) {
// GB10 has 128GB unified memory shared with Grace CPU — use system RAM
const gpuNames = nameOutput
.split("\n")
.map((line) => line.trim())
.filter(Boolean);
const unifiedGpuNames = gpuNames.filter((name) =>
UNIFIED_MEMORY_GPU_TAGS.some((tag) => new RegExp(tag, "i").test(name)),
);
if (unifiedGpuNames.length > 0) {
let totalMemoryMB = 0;
try {
const memLine = runCapture("free -m | awk '/Mem:/ {print $2}'", { ignoreError: true });
if (memLine) totalMemoryMB = parseInt(memLine.trim(), 10) || 0;
} catch {
/* ignored */
}
const count = unifiedGpuNames.length;
const perGpuMB = count > 0 ? Math.floor(totalMemoryMB / count) : totalMemoryMB;
const isSpark = unifiedGpuNames.some((name) => /GB10/i.test(name));
return {
type: "nvidia",
count: 1,
name: unifiedGpuNames[0],
count,
totalMemoryMB,
perGpuMB: totalMemoryMB,
nimCapable: true,
spark: true,
perGpuMB: perGpuMB || totalMemoryMB,
nimCapable: canRunNimWithMemory(totalMemoryMB),
unifiedMemory: true,
spark: isSpark,
};
}
} catch {
Expand Down Expand Up @@ -232,6 +246,7 @@ module.exports = {
containerName,
getImageForModel,
listModels,
canRunNimWithMemory,
detectGpu,
pullNimImage,
startNimContainer,
Expand Down
34 changes: 25 additions & 9 deletions bin/lib/policies.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
const fs = require("fs");
const path = require("path");
const os = require("os");
const YAML = require("yaml");
const { ROOT, run, runCapture, shellQuote } = require("./runner");
const registry = require("./registry");

// Path of the `nemoclaw-blueprint/policies/presets` directory under ROOT.
// NOTE(review): presumably where the policy preset files are read from — confirm against callers.
const PRESETS_DIR = path.join(ROOT, "nemoclaw-blueprint", "policies", "presets");

function getOpenshellCommand() {
const binary = process.env.NEMOCLAW_OPENSHELL_BIN;
if (!binary) return "openshell";
Expand Down Expand Up @@ -76,8 +76,23 @@ function extractPresetEntries(presetContent) {
/**
 * Extract a usable policy YAML document from raw command output.
 *
 * Everything up to and including the first `---` is dropped; when no `---`
 * is present the whole input is considered. The remaining text is accepted
 * only if it looks like, and parses as, a YAML mapping. Anything else
 * (empty output, error/status messages, scalars, lists, unparsable text)
 * yields an empty string so callers can fall back to a fresh policy.
 *
 * @param {string} raw - Raw output that may contain a metadata header
 *   followed by `---` and the policy YAML.
 * @returns {string} The trimmed policy YAML, or "" when no valid policy
 *   mapping could be extracted.
 */
function parseCurrentPolicy(raw) {
  if (!raw) return "";
  const sep = raw.indexOf("---");
  // Validate the candidate in every case; returning the text unchecked would
  // let error output be treated as a policy document.
  const candidate = (sep === -1 ? raw : raw.slice(sep + 3)).trim();
  if (!candidate) return "";
  // Output that begins with a diagnostic keyword is a message, not a policy.
  if (/^(error|failed|invalid|warning|status)\b/i.test(candidate)) {
    return "";
  }
  // Require at least one line that starts with a `key:` style entry.
  if (!/^[a-z_][a-z0-9_]*\s*:/m.test(candidate)) {
    return "";
  }
  try {
    const parsed = YAML.parse(candidate);
    // Only a mapping is a valid policy root; reject scalars, null and lists.
    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
      return "";
    }
  } catch {
    // Unparsable YAML is treated the same as "no current policy".
    return "";
  }
  return candidate;
}

/**
Expand All @@ -104,16 +119,17 @@ function buildPolicyGetCommand(sandboxName) {
* @returns {string} Merged YAML with version header when missing
*/
function mergePresetIntoPolicy(currentPolicy, presetEntries) {
const normalizedCurrentPolicy = parseCurrentPolicy(currentPolicy);
if (!presetEntries) {
return currentPolicy || "version: 1\n\nnetwork_policies:\n";
return normalizedCurrentPolicy || "version: 1\n\nnetwork_policies:\n";
}
if (!currentPolicy) {
if (!normalizedCurrentPolicy) {
return "version: 1\n\nnetwork_policies:\n" + presetEntries;
}

let merged;
if (/^network_policies\s*:/m.test(currentPolicy)) {
const lines = currentPolicy.split("\n");
if (/^network_policies\s*:/m.test(normalizedCurrentPolicy)) {
const lines = normalizedCurrentPolicy.split("\n");
const result = [];
let inNetworkPolicies = false;
let inserted = false;
Expand Down Expand Up @@ -142,11 +158,11 @@ function mergePresetIntoPolicy(currentPolicy, presetEntries) {

merged = result.join("\n");
} else {
merged = currentPolicy.trimEnd() + "\n\nnetwork_policies:\n" + presetEntries;
merged = normalizedCurrentPolicy.trimEnd() + "\n\nnetwork_policies:\n" + presetEntries;
}

if (!merged.trimStart().startsWith("version:")) {
merged = "version: 1\n" + merged;
merged = "version: 1\n\n" + merged;
}
return merged;
}
Expand Down
32 changes: 29 additions & 3 deletions bin/nemoclaw.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ const REMOTE_UNINSTALL_URL =
"https://raw.githubusercontent.com/NVIDIA/NemoClaw/refs/heads/main/uninstall.sh";
// Starts unset; getOpenshellBinary() checks this before doing its lookup.
// NOTE(review): presumably a cache of the resolved openshell binary path — confirm.
let OPENSHELL_BIN = null;
// NOTE(review): presumably the minimum openshell version required for log support — confirm against usage.
const MIN_LOGS_OPENSHELL_VERSION = "0.0.7";
// Gateway name passed to `openshell gateway destroy -g` and used in the
// `openshell-cluster-<name>` docker volume filter during cleanup.
const NEMOCLAW_GATEWAY_NAME = "nemoclaw";
// Port passed to `openshell forward stop`; a string because it is used as a command argument.
const DASHBOARD_FORWARD_PORT = "18789";

function getOpenshellBinary() {
if (!OPENSHELL_BIN) {
Expand Down Expand Up @@ -108,6 +110,23 @@ function captureOpenshell(args, opts = {}) {
};
}

/**
 * Best-effort teardown of the shared gateway resources.
 *
 * Runs, in order: stop the dashboard port forward, destroy the gateway, and
 * remove any docker volumes whose name matches the gateway's cluster prefix.
 * Every step is invoked with `ignoreError: true`.
 */
function cleanupGatewayAfterLastSandbox() {
  const stopForwardArgs = ["forward", "stop", DASHBOARD_FORWARD_PORT];
  runOpenshell(stopForwardArgs, { ignoreError: true });

  const destroyGatewayArgs = ["gateway", "destroy", "-g", NEMOCLAW_GATEWAY_NAME];
  runOpenshell(destroyGatewayArgs, { ignoreError: true });

  // The volume listing is used twice: once to test for any match (`grep .`)
  // and once to feed the matches to `docker volume rm`.
  const listVolumes = `docker volume ls -q --filter "name=openshell-cluster-${NEMOCLAW_GATEWAY_NAME}"`;
  const removeVolumes = `${listVolumes} | grep . && ${listVolumes} | xargs docker volume rm || true`;
  run(removeVolumes, { ignoreError: true });
}

/**
 * Check whether `openshell sandbox list` reports zero sandboxes.
 *
 * @returns {boolean} True only when the list command exits with status 0 and
 *   the parsed set of sandbox names is empty. A failed list command returns
 *   false, so a failure is never read as "no sandboxes left".
 */
function hasNoLiveSandboxes() {
  const listing = captureOpenshell(["sandbox", "list"], { ignoreError: true });
  const listedOk = listing.status === 0;
  return listedOk ? parseLiveSandboxNames(listing.output).size === 0 : false;
}

function parseVersionFromText(value = "") {
const match = String(value || "").match(/([0-9]+\.[0-9]+\.[0-9]+)/);
return match ? match[1] : null;
Expand Down Expand Up @@ -748,7 +767,6 @@ async function deploy(instanceName) {
}

async function start() {
await ensureApiKey();
const { defaultSandbox } = registry.listSandboxes();
const safeName =
defaultSandbox && /^[a-zA-Z0-9._-]+$/.test(defaultSandbox) ? defaultSandbox : null;
Expand Down Expand Up @@ -1088,9 +1106,17 @@ async function sandboxDestroy(sandboxName, args = []) {
else nim.stopNimContainer(sandboxName);

console.log(` Deleting sandbox '${sandboxName}'...`);
runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
const deleteResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });

registry.removeSandbox(sandboxName);
const removed = registry.removeSandbox(sandboxName);
if (
deleteResult.status === 0 &&
removed &&
registry.listSandboxes().sandboxes.length === 0 &&
hasNoLiveSandboxes()
) {
cleanupGatewayAfterLastSandbox();
}
console.log(` ${G}✓${R} Sandbox '${sandboxName}' destroyed`);
}

Expand Down
7 changes: 4 additions & 3 deletions scripts/start-services.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,12 @@ do_stop() {
}

do_start() {
[ -n "${NVIDIA_API_KEY:-}" ] || fail "NVIDIA_API_KEY required"

if [ -z "${TELEGRAM_BOT_TOKEN:-}" ]; then
warn "TELEGRAM_BOT_TOKEN not set — Telegram bridge will not start."
warn "Create a bot via @BotFather on Telegram and set the token."
elif [ -z "${NVIDIA_API_KEY:-}" ]; then
warn "NVIDIA_API_KEY not set — Telegram bridge will not start."
warn "Set NVIDIA_API_KEY if you want Telegram requests to reach inference."
fi

command -v node >/dev/null || fail "node not found. Install Node.js first."
Expand All @@ -151,7 +152,7 @@ do_start() {
mkdir -p "$PIDDIR"

# Telegram bridge (only if token provided)
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ]; then
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${NVIDIA_API_KEY:-}" ]; then
SANDBOX_NAME="$SANDBOX_NAME" start_service telegram-bridge \
node "$REPO_DIR/scripts/telegram-bridge.js"
fi
Expand Down
Loading
Loading