Skip to content

Commit 6acf889

Browse files
kjw3laitingsheng
authored andcommitted
fix: address core blocker lifecycle regressions (#1208)
## Summary - stop requiring `NVIDIA_API_KEY` for local-only `nemoclaw start` and only gate the Telegram bridge when that bridge actually needs the key - clean up the dashboard forward, `nemoclaw` gateway, and `openshell-cluster-nemoclaw` Docker volumes when the last sandbox is destroyed - broaden unified-memory NVIDIA GPU detection beyond `GB10` while keeping `spark: true` specific to GB10 - harden policy merge/retry behavior so truncated or error-like current-policy reads rebuild from a clean `version: 1` scaffold instead of producing malformed YAML ## Issue Mapping Fixes #1191 Fixes #1160 Fixes #1182 Fixes #1162 Related #991 ## Notes - `#1188` was investigated but is not included in this PR. - The current evidence still points to a deeper runtime / proxy reachability problem on macOS + Colima rather than a bounded NemoClaw-only fix. - Keeping it out of this branch avoids speculative networking changes without strong reproduction and cross-platform coverage. ## Validation ```bash npx vitest run npx eslint bin/nemoclaw.js bin/lib/nim.js bin/lib/policies.js test/cli.test.js test/nim.test.js test/policies.test.js test/service-env.test.js npx tsc -p jsconfig.json --noEmit ``` ## References Reviewed - #1106 - #308 - #95 - #770 Signed-off-by: Kevin Jones <kejones@nvidia.com> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** - Core services can start without an NVIDIA API key. - Enhanced unified‑memory GPU detection with more accurate capability reporting. * **Bug Fixes** - Gateway and forwarded‑port cleanup only runs when the last sandbox is removed and no live sandboxes remain. - Telegram bridge now starts only when both required tokens are present; clearer startup warnings. - Policy parsing/merge more robust for metadata‑only or malformed inputs; consistent version header formatting. * **Tests** - Added tests covering GPU detection, policy parsing/merge, CLI sandbox/gateway flows, and service startup. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 35a708f commit 6acf889

File tree

8 files changed

+468
-26
lines changed

8 files changed

+468
-26
lines changed

bin/lib/nim.js

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
const { run, runCapture, shellQuote } = require("./runner");
77
const nimImages = require("./nim-images.json");
8+
const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"];
89

910
function containerName(sandboxName) {
1011
return `nemoclaw-nim-${sandboxName}`;
@@ -23,6 +24,10 @@ function listModels() {
2324
}));
2425
}
2526

27+
function canRunNimWithMemory(totalMemoryMB) {
28+
return nimImages.models.some((m) => m.minGpuMemoryMB <= totalMemoryMB);
29+
}
30+
2631
function detectGpu() {
2732
// Try NVIDIA first — query VRAM
2833
try {
@@ -34,42 +39,51 @@ function detectGpu() {
3439
const perGpuMB = lines.map((l) => parseInt(l.trim(), 10)).filter((n) => !isNaN(n));
3540
if (perGpuMB.length > 0) {
3641
const totalMemoryMB = perGpuMB.reduce((a, b) => a + b, 0);
37-
// Only mark nimCapable if at least one NIM model fits in GPU VRAM
38-
const canRunNim = nimImages.models.some((m) => m.minGpuMemoryMB <= totalMemoryMB);
3942
return {
4043
type: "nvidia",
4144
count: perGpuMB.length,
4245
totalMemoryMB,
4346
perGpuMB: perGpuMB[0],
44-
nimCapable: canRunNim,
47+
nimCapable: canRunNimWithMemory(totalMemoryMB),
4548
};
4649
}
4750
}
4851
} catch {
4952
/* ignored */
5053
}
5154

52-
// Fallback: DGX Spark (GB10) — VRAM not queryable due to unified memory architecture
55+
// Fallback: unified-memory NVIDIA devices where discrete VRAM is not queryable.
5356
try {
5457
const nameOutput = runCapture("nvidia-smi --query-gpu=name --format=csv,noheader,nounits", {
5558
ignoreError: true,
5659
});
57-
if (nameOutput && nameOutput.includes("GB10")) {
58-
// GB10 has 128GB unified memory shared with Grace CPU — use system RAM
60+
const gpuNames = nameOutput
61+
.split("\n")
62+
.map((line) => line.trim())
63+
.filter(Boolean);
64+
const unifiedGpuNames = gpuNames.filter((name) =>
65+
UNIFIED_MEMORY_GPU_TAGS.some((tag) => new RegExp(tag, "i").test(name)),
66+
);
67+
if (unifiedGpuNames.length > 0) {
5968
let totalMemoryMB = 0;
6069
try {
6170
const memLine = runCapture("free -m | awk '/Mem:/ {print $2}'", { ignoreError: true });
6271
if (memLine) totalMemoryMB = parseInt(memLine.trim(), 10) || 0;
6372
} catch {
6473
/* ignored */
6574
}
75+
const count = unifiedGpuNames.length;
76+
const perGpuMB = count > 0 ? Math.floor(totalMemoryMB / count) : totalMemoryMB;
77+
const isSpark = unifiedGpuNames.some((name) => /GB10/i.test(name));
6678
return {
6779
type: "nvidia",
68-
count: 1,
80+
name: unifiedGpuNames[0],
81+
count,
6982
totalMemoryMB,
70-
perGpuMB: totalMemoryMB,
71-
nimCapable: true,
72-
spark: true,
83+
perGpuMB: perGpuMB || totalMemoryMB,
84+
nimCapable: canRunNimWithMemory(totalMemoryMB),
85+
unifiedMemory: true,
86+
spark: isSpark,
7387
};
7488
}
7589
} catch {
@@ -232,6 +246,7 @@ module.exports = {
232246
containerName,
233247
getImageForModel,
234248
listModels,
249+
canRunNimWithMemory,
235250
detectGpu,
236251
pullNimImage,
237252
startNimContainer,

bin/lib/policies.js

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@
66
const fs = require("fs");
77
const path = require("path");
88
const os = require("os");
9+
const YAML = require("yaml");
910
const { ROOT, run, runCapture, shellQuote } = require("./runner");
1011
const registry = require("./registry");
1112

1213
const PRESETS_DIR = path.join(ROOT, "nemoclaw-blueprint", "policies", "presets");
13-
1414
function getOpenshellCommand() {
1515
const binary = process.env.NEMOCLAW_OPENSHELL_BIN;
1616
if (!binary) return "openshell";
@@ -76,8 +76,23 @@ function extractPresetEntries(presetContent) {
7676
function parseCurrentPolicy(raw) {
7777
if (!raw) return "";
7878
const sep = raw.indexOf("---");
79-
if (sep === -1) return raw;
80-
return raw.slice(sep + 3).trim();
79+
const candidate = (sep === -1 ? raw : raw.slice(sep + 3)).trim();
80+
if (!candidate) return "";
81+
if (/^(error|failed|invalid|warning|status)\b/i.test(candidate)) {
82+
return "";
83+
}
84+
if (!/^[a-z_][a-z0-9_]*\s*:/m.test(candidate)) {
85+
return "";
86+
}
87+
try {
88+
const parsed = YAML.parse(candidate);
89+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
90+
return "";
91+
}
92+
} catch {
93+
return "";
94+
}
95+
return candidate;
8196
}
8297

8398
/**
@@ -104,16 +119,17 @@ function buildPolicyGetCommand(sandboxName) {
104119
* @returns {string} Merged YAML with version header when missing
105120
*/
106121
function mergePresetIntoPolicy(currentPolicy, presetEntries) {
122+
const normalizedCurrentPolicy = parseCurrentPolicy(currentPolicy);
107123
if (!presetEntries) {
108-
return currentPolicy || "version: 1\n\nnetwork_policies:\n";
124+
return normalizedCurrentPolicy || "version: 1\n\nnetwork_policies:\n";
109125
}
110-
if (!currentPolicy) {
126+
if (!normalizedCurrentPolicy) {
111127
return "version: 1\n\nnetwork_policies:\n" + presetEntries;
112128
}
113129

114130
let merged;
115-
if (/^network_policies\s*:/m.test(currentPolicy)) {
116-
const lines = currentPolicy.split("\n");
131+
if (/^network_policies\s*:/m.test(normalizedCurrentPolicy)) {
132+
const lines = normalizedCurrentPolicy.split("\n");
117133
const result = [];
118134
let inNetworkPolicies = false;
119135
let inserted = false;
@@ -142,11 +158,11 @@ function mergePresetIntoPolicy(currentPolicy, presetEntries) {
142158

143159
merged = result.join("\n");
144160
} else {
145-
merged = currentPolicy.trimEnd() + "\n\nnetwork_policies:\n" + presetEntries;
161+
merged = normalizedCurrentPolicy.trimEnd() + "\n\nnetwork_policies:\n" + presetEntries;
146162
}
147163

148164
if (!merged.trimStart().startsWith("version:")) {
149-
merged = "version: 1\n" + merged;
165+
merged = "version: 1\n\n" + merged;
150166
}
151167
return merged;
152168
}

bin/nemoclaw.js

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ const REMOTE_UNINSTALL_URL =
6969
"https://raw.githubusercontent.com/NVIDIA/NemoClaw/refs/heads/main/uninstall.sh";
7070
let OPENSHELL_BIN = null;
7171
const MIN_LOGS_OPENSHELL_VERSION = "0.0.7";
72+
const NEMOCLAW_GATEWAY_NAME = "nemoclaw";
73+
const DASHBOARD_FORWARD_PORT = "18789";
7274

7375
function getOpenshellBinary() {
7476
if (!OPENSHELL_BIN) {
@@ -108,6 +110,23 @@ function captureOpenshell(args, opts = {}) {
108110
};
109111
}
110112

113+
function cleanupGatewayAfterLastSandbox() {
114+
runOpenshell(["forward", "stop", DASHBOARD_FORWARD_PORT], { ignoreError: true });
115+
runOpenshell(["gateway", "destroy", "-g", NEMOCLAW_GATEWAY_NAME], { ignoreError: true });
116+
run(
117+
`docker volume ls -q --filter "name=openshell-cluster-${NEMOCLAW_GATEWAY_NAME}" | grep . && docker volume ls -q --filter "name=openshell-cluster-${NEMOCLAW_GATEWAY_NAME}" | xargs docker volume rm || true`,
118+
{ ignoreError: true },
119+
);
120+
}
121+
122+
function hasNoLiveSandboxes() {
123+
const liveList = captureOpenshell(["sandbox", "list"], { ignoreError: true });
124+
if (liveList.status !== 0) {
125+
return false;
126+
}
127+
return parseLiveSandboxNames(liveList.output).size === 0;
128+
}
129+
111130
function parseVersionFromText(value = "") {
112131
const match = String(value || "").match(/([0-9]+\.[0-9]+\.[0-9]+)/);
113132
return match ? match[1] : null;
@@ -748,7 +767,6 @@ async function deploy(instanceName) {
748767
}
749768

750769
async function start() {
751-
await ensureApiKey();
752770
const { defaultSandbox } = registry.listSandboxes();
753771
const safeName =
754772
defaultSandbox && /^[a-zA-Z0-9._-]+$/.test(defaultSandbox) ? defaultSandbox : null;
@@ -1088,9 +1106,17 @@ async function sandboxDestroy(sandboxName, args = []) {
10881106
else nim.stopNimContainer(sandboxName);
10891107

10901108
console.log(` Deleting sandbox '${sandboxName}'...`);
1091-
runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
1109+
const deleteResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
10921110

1093-
registry.removeSandbox(sandboxName);
1111+
const removed = registry.removeSandbox(sandboxName);
1112+
if (
1113+
deleteResult.status === 0 &&
1114+
removed &&
1115+
registry.listSandboxes().sandboxes.length === 0 &&
1116+
hasNoLiveSandboxes()
1117+
) {
1118+
cleanupGatewayAfterLastSandbox();
1119+
}
10941120
console.log(` ${G}${R} Sandbox '${sandboxName}' destroyed`);
10951121
}
10961122

scripts/start-services.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,12 @@ do_stop() {
123123
}
124124

125125
do_start() {
126-
[ -n "${NVIDIA_API_KEY:-}" ] || fail "NVIDIA_API_KEY required"
127-
128126
if [ -z "${TELEGRAM_BOT_TOKEN:-}" ]; then
129127
warn "TELEGRAM_BOT_TOKEN not set — Telegram bridge will not start."
130128
warn "Create a bot via @BotFather on Telegram and set the token."
129+
elif [ -z "${NVIDIA_API_KEY:-}" ]; then
130+
warn "NVIDIA_API_KEY not set — Telegram bridge will not start."
131+
warn "Set NVIDIA_API_KEY if you want Telegram requests to reach inference."
131132
fi
132133

133134
command -v node >/dev/null || fail "node not found. Install Node.js first."
@@ -151,7 +152,7 @@ do_start() {
151152
mkdir -p "$PIDDIR"
152153

153154
# Telegram bridge (only if token provided)
154-
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ]; then
155+
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${NVIDIA_API_KEY:-}" ]; then
155156
SANDBOX_NAME="$SANDBOX_NAME" start_service telegram-bridge \
156157
node "$REPO_DIR/scripts/telegram-bridge.js"
157158
fi

0 commit comments

Comments
 (0)