From d92dc54ee69e576de8bc7e2cf1dca728fa1d71a0 Mon Sep 17 00:00:00 2001 From: Julie Yaunches Date: Wed, 1 Apr 2026 15:33:45 -0400 Subject: [PATCH 1/2] fix(e2e): harden Brev E2E infrastructure for CPU-only instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shared E2E infrastructure improvements that make Brev-based CI reliable on CPU-only instances: brev-setup.sh: - Extract HAS_GPU flag for consistent GPU detection — nvidia-smi must run successfully, not just exist on PATH (Brev GPU images ship the binary even on CPU instances) - All GPU gates (container toolkit, Docker runtime reset, vLLM) use the single HAS_GPU flag - Replace cloud-init || true with proper error check + warning - Add timeout (300s) and quiet window (5s) to apt wait loop to prevent indefinite hangs and races - Add retry loops (5 attempts, 30s backoff) for Node.js and Docker apt-get installs - Unsuppress NodeSource installer output for debugging - Reset Docker default runtime to runc on CPU-only instances where Brev pre-configures nvidia as default setup.sh: - Check nvidia-smi exit code instead of just PATH presence for GPU gateway flag brev-e2e.test.js: - Use known GCP instance type (n2-standard-4) instead of cpu search - Add shellQuote() for safe secret interpolation in SSH commands - Delete leftover instances before creating new ones - Wait for cloud-init before running bootstrap --- scripts/brev-setup.sh | 99 ++++++++++++++++++++++++++++++++++++--- scripts/setup.sh | 3 +- test/e2e/brev-e2e.test.js | 47 +++++++++++-------- 3 files changed, 122 insertions(+), 27 deletions(-) diff --git a/scripts/brev-setup.sh b/scripts/brev-setup.sh index 1bf5d9de4..c2a6a7c12 100755 --- a/scripts/brev-setup.sh +++ b/scripts/brev-setup.sh @@ -37,6 +37,64 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" export NEEDRESTART_MODE=a export DEBIAN_FRONTEND=noninteractive +# ── GPU detection ───────────────────────────────────────────────── +# Brev GPU images ship the nvidia-smi binary even on CPU-only instances. +# We need nvidia-smi to actually *run* (driver loaded) to treat this as +# a GPU host. Export a flag so every later section uses the same check. +if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then + HAS_GPU=true +else + HAS_GPU=false +fi +info "GPU detected: $HAS_GPU" + +# ── Acquire exclusive apt access ────────────────────────────────── +# Cloud VMs run multiple apt processes on boot: cloud-init, apt-daily, +# and unattended-upgrades. We must wait for cloud-init AND disable the +# background services before we can safely use apt ourselves. +APT_WAIT_MAX=300 # seconds — fail if apt is still busy after 5 min +APT_QUIET_WINDOW=5 # seconds — apt must be idle this long before we proceed + +if command -v cloud-init >/dev/null 2>&1; then + info "Waiting for cloud-init to finish..." + if ! cloud-init status --wait >/dev/null 2>&1; then + warn "cloud-init exited with an error — proceeding, but the VM may be in an inconsistent state" + fi + info "cloud-init done" +fi +# Stop background apt services that run independently of cloud-init +info "Disabling background apt services..." +sudo systemctl stop apt-daily.timer apt-daily-upgrade.timer unattended-upgrades 2>/dev/null || true +sudo systemctl disable apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true +# Kill any straggler apt/dpkg processes +sudo pkill -9 -x apt-get 2>/dev/null || true +sudo pkill -9 -x apt 2>/dev/null || true +sudo pkill -9 -x dpkg 2>/dev/null || true +# Release any stale locks left by killed processes +sudo rm -f /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock 2>/dev/null || true +sudo dpkg --configure -a 2>/dev/null || true + +# Wait until apt/dpkg are truly idle for $APT_QUIET_WINDOW consecutive seconds. +apt_waited=0 +apt_quiet=0 +while [ "$apt_quiet" -lt "$APT_QUIET_WINDOW" ]; do + if [ "$apt_waited" -ge "$APT_WAIT_MAX" ]; then + fail "apt/dpkg still busy after ${APT_WAIT_MAX}s — cannot proceed" + fi + if pgrep -Ex "apt-get|apt|dpkg" >/dev/null 2>&1 \ + || fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock 2>/dev/null; then + info "Waiting for apt/dpkg processes to finish... (${apt_waited}s elapsed)" + apt_quiet=0 + sleep 2 + apt_waited=$((apt_waited + 2)) + else + apt_quiet=$((apt_quiet + 1)) + sleep 1 + apt_waited=$((apt_waited + 1)) + fi +done +info "apt is now exclusively ours" + # --- 0. Node.js (needed for services) --- if ! command -v node >/dev/null 2>&1; then info "Installing Node.js..." @@ -55,9 +113,16 @@ if ! command -v node >/dev/null 2>&1; then else fail "No SHA-256 verification tool found (need sha256sum or shasum)" fi - sudo -E bash "$tmpdir/setup_node.sh" >/dev/null 2>&1 + sudo -E bash "$tmpdir/setup_node.sh" ) - sudo apt-get install -y -qq nodejs >/dev/null 2>&1 + for attempt in 1 2 3 4 5; do + if sudo apt-get install -y -qq nodejs; then + break + fi + [ "$attempt" -eq 5 ] && fail "Node.js install failed after 5 attempts" + info "Node.js install failed (attempt $attempt/5), retrying in 30s..." + sleep 30 + done info "Node.js $(node --version) installed" else info "Node.js already installed: $(node --version)" @@ -66,8 +131,14 @@ fi # --- 1. Docker --- if ! command -v docker >/dev/null 2>&1; then info "Installing Docker..." - sudo apt-get update -qq >/dev/null 2>&1 - sudo apt-get install -y -qq docker.io >/dev/null 2>&1 + for attempt in 1 2 3 4 5; do + if sudo apt-get update -qq && sudo apt-get install -y -qq docker.io; then + break + fi + [ "$attempt" -eq 5 ] && fail "Docker install failed after 5 attempts" + info "Docker install failed (attempt $attempt/5), retrying in 30s..." + sleep 30 + done sudo usermod -aG docker "$(whoami)" info "Docker installed" else @@ -75,7 +146,7 @@ else fi # --- 2. NVIDIA Container Toolkit (if GPU present) --- -if command -v nvidia-smi >/dev/null 2>&1; then +if [ "$HAS_GPU" = true ]; then if ! dpkg -s nvidia-container-toolkit >/dev/null 2>&1; then info "Installing NVIDIA Container Toolkit..." curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ @@ -91,6 +162,22 @@ if command -v nvidia-smi >/dev/null 2>&1; then else info "NVIDIA Container Toolkit already installed" fi +else + # CPU-only instance: ensure Docker uses runc (not nvidia) as default runtime. + # Brev GPU images pre-configure nvidia as the default Docker runtime even on + # CPU instances, causing "nvidia-container-cli: nvml error: driver not loaded" + # when starting containers. + if grep -q '"default-runtime".*nvidia' /etc/docker/daemon.json 2>/dev/null; then + info "Resetting Docker default runtime to runc (no GPU detected)..." + sudo python3 -c " +import json +with open('/etc/docker/daemon.json') as f: cfg = json.load(f) +cfg.pop('default-runtime', None) +with open('/etc/docker/daemon.json', 'w') as f: json.dump(cfg, f, indent=2) +" + sudo systemctl restart docker + info "Docker runtime reset to runc" + fi fi # --- 3. openshell CLI (binary release, not pip) --- @@ -139,7 +226,7 @@ fi VLLM_MODEL="nvidia/nemotron-3-nano-30b-a3b" if [ "${SKIP_VLLM:-}" = "1" ]; then info "Skipping vLLM install (SKIP_VLLM=1)" -elif command -v nvidia-smi >/dev/null 2>&1; then +elif [ "$HAS_GPU" = true ]; then if ! python3 -c "import vllm" 2>/dev/null; then info "Installing vLLM..." if ! command -v pip3 >/dev/null 2>&1; then diff --git a/scripts/setup.sh b/scripts/setup.sh index 4ba7f13dd..b503fb09c 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -112,7 +112,8 @@ info "Starting OpenShell gateway..." openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | grep . && docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs docker volume rm || true GATEWAY_ARGS=(--name nemoclaw) -command -v nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu) +# Only enable GPU if nvidia-smi actually works (driver loaded), not just present on PATH +nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu) if ! openshell gateway start "${GATEWAY_ARGS[@]}" 2>&1 | grep -E "Gateway|✓|Error|error"; then warn "Gateway start failed. Cleaning up stale state..." openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true diff --git a/test/e2e/brev-e2e.test.js b/test/e2e/brev-e2e.test.js index cd298ddd7..8f251f67e 100644 --- a/test/e2e/brev-e2e.test.js +++ b/test/e2e/brev-e2e.test.js @@ -17,8 +17,7 @@ * * Optional env vars: * TEST_SUITE — which test to run: full (default), credential-sanitization, telegram-injection, all - * BREV_MIN_VCPU — Minimum vCPUs for CPU instance (default: 4) - * BREV_MIN_RAM — Minimum RAM in GB for CPU instance (default: 16) + * BREV_INSTANCE_TYPE — Brev/GCP instance type (default: n2-standard-4) */ import { describe, it, expect, beforeAll, afterAll } from "vitest"; @@ -28,8 +27,8 @@ import { homedir } from "node:os"; import path from "node:path"; // CPU instance specs: min vCPUs and RAM for the instance search -const BREV_MIN_VCPU = parseInt(process.env.BREV_MIN_VCPU || "4", 10); -const BREV_MIN_RAM = parseInt(process.env.BREV_MIN_RAM || "16", 10); +// Use a known CPU-only GCP instance type to avoid GPU images with broken nvidia runtime +const BREV_INSTANCE_TYPE = process.env.BREV_INSTANCE_TYPE || "n2-standard-4"; const INSTANCE_NAME = process.env.INSTANCE_NAME; const TEST_SUITE = process.env.TEST_SUITE || "full"; const REPO_DIR = path.resolve(import.meta.dirname, "../.."); @@ -58,19 +57,15 @@ function ssh(cmd, { timeout = 120_000, stream = false } = {}) { return stream ? "" : result.trim(); } -/** - * Escape a value for safe inclusion in a single-quoted shell string. - * Replaces single quotes with the shell-safe sequence: '\'' - */ -function shellEscape(value) { - return String(value).replace(/'/g, "'\\''"); +function shellQuote(value) { + return `'${String(value).replace(/'/g, "'\\''")}'`; } /** Run a command on the remote VM with env vars set for NemoClaw. */ function sshEnv(cmd, { timeout = 600_000, stream = false } = {}) { const envPrefix = [ - `export NVIDIA_API_KEY='${shellEscape(process.env.NVIDIA_API_KEY)}'`, - `export GITHUB_TOKEN='${shellEscape(process.env.GITHUB_TOKEN)}'`, + `export NVIDIA_API_KEY=${shellQuote(process.env.NVIDIA_API_KEY)}`, + `export GITHUB_TOKEN=${shellQuote(process.env.GITHUB_TOKEN)}`, `export NEMOCLAW_NON_INTERACTIVE=1`, `export NEMOCLAW_SANDBOX_NAME=e2e-test`, ].join(" && "); @@ -131,14 +126,21 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => { ); brev("login", "--token", process.env.BREV_API_TOKEN); - // Create bare CPU instance via brev search cpu | brev create - console.log(`[${elapsed()}] Creating CPU instance via brev search cpu | brev create...`); - console.log(`[${elapsed()}] min-vcpu: ${BREV_MIN_VCPU}, min-ram: ${BREV_MIN_RAM}GB`); - execSync( - `brev search cpu --min-vcpu ${BREV_MIN_VCPU} --min-ram ${BREV_MIN_RAM} --sort price | ` + - `brev create ${INSTANCE_NAME} --detached`, - { encoding: "utf-8", timeout: 180_000, stdio: ["pipe", "inherit", "inherit"] }, - ); + // Delete any leftover instance from a previous failed run + try { + brev("delete", INSTANCE_NAME); + console.log(`[${elapsed()}] Deleted leftover instance "${INSTANCE_NAME}"`); + } catch { + // Expected — no leftover instance + } + + // Create CPU instance with a known GCP instance type + console.log(`[${elapsed()}] Creating CPU instance (type: ${BREV_INSTANCE_TYPE})...`); + execSync(`brev create --type ${BREV_INSTANCE_TYPE} ${INSTANCE_NAME} --detached`, { + encoding: "utf-8", + timeout: 180_000, + stdio: ["pipe", "inherit", "inherit"], + }); instanceCreated = true; console.log(`[${elapsed()}] brev create returned (instance provisioning in background)`); @@ -161,6 +163,11 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => { ); console.log(`[${elapsed()}] Code synced`); + // Wait for cloud-init to finish — Brev instances run apt provisioning on boot + console.log(`[${elapsed()}] Waiting for cloud-init to finish...`); + ssh(`cloud-init status --wait 2>/dev/null || true`, { timeout: 600_000, stream: true }); + console.log(`[${elapsed()}] cloud-init done`); + // Bootstrap VM — stream output to CI log so we can see progress console.log(`[${elapsed()}] Running brev-setup.sh (bootstrap)...`); sshEnv(`cd ${remoteDir} && SKIP_VLLM=1 bash scripts/brev-setup.sh`, { From dafcaba7bd0015720e7a044578c3a9b2bdd59fa2 Mon Sep 17 00:00:00 2001 From: Julie Yaunches Date: Wed, 1 Apr 2026 15:55:18 -0400 Subject: [PATCH 2/2] fix(e2e): use execFileSync to prevent shell injection in brev create Replaces execSync with execFileSync for the brev create call, passing args as an array instead of interpolating into a shell string. Avoids shell injection risk if BREV_INSTANCE_TYPE or INSTANCE_NAME ever contain spaces or metacharacters. Addresses CodeRabbit review feedback on PR #1266. --- test/e2e/brev-e2e.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/brev-e2e.test.js b/test/e2e/brev-e2e.test.js index 8f251f67e..45688ff3c 100644 --- a/test/e2e/brev-e2e.test.js +++ b/test/e2e/brev-e2e.test.js @@ -136,7 +136,7 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => { // Create CPU instance with a known GCP instance type console.log(`[${elapsed()}] Creating CPU instance (type: ${BREV_INSTANCE_TYPE})...`); - execSync(`brev create --type ${BREV_INSTANCE_TYPE} ${INSTANCE_NAME} --detached`, { + execFileSync("brev", ["create", "--type", BREV_INSTANCE_TYPE, INSTANCE_NAME, "--detached"], { encoding: "utf-8", timeout: 180_000, stdio: ["pipe", "inherit", "inherit"],