From d92dc54ee69e576de8bc7e2cf1dca728fa1d71a0 Mon Sep 17 00:00:00 2001
From: Julie Yaunches <jyaunches@nvidia.com>
Date: Wed, 1 Apr 2026 15:33:45 -0400
Subject: [PATCH 1/2] fix(e2e): harden Brev E2E infrastructure for CPU-only
 instances
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Shared E2E infrastructure improvements that make Brev-based CI reliable
on CPU-only instances:

brev-setup.sh:
- Extract HAS_GPU flag for consistent GPU detection — nvidia-smi must
  run successfully, not just exist on PATH (Brev GPU images ship the
  binary even on CPU instances)
- All GPU gates (container toolkit, Docker runtime reset, vLLM) use
  the single HAS_GPU flag
- Replace cloud-init || true with proper error check + warning
- Add timeout (300s) and quiet window (5s) to apt wait loop to prevent
  indefinite hangs and races
- Add retry loops (5 attempts, 30s backoff) for Node.js and Docker
  apt-get installs
- Unsuppress NodeSource installer output for debugging
- Reset Docker default runtime to runc on CPU-only instances where
  Brev pre-configures nvidia as default

setup.sh:
- Check nvidia-smi exit code instead of just PATH presence for GPU
  gateway flag

brev-e2e.test.js:
- Use known GCP instance type (n2-standard-4) instead of cpu search
- Add shellQuote() for safe secret interpolation in SSH commands
- Delete leftover instances before creating new ones
- Wait for cloud-init before running bootstrap
---
 scripts/brev-setup.sh     | 99 ++++++++++++++++++++++++++++++++++++---
 scripts/setup.sh          |  3 +-
 test/e2e/brev-e2e.test.js | 47 +++++++++++--------
 3 files changed, 122 insertions(+), 27 deletions(-)

diff --git a/scripts/brev-setup.sh b/scripts/brev-setup.sh
index 1bf5d9de4..c2a6a7c12 100755
--- a/scripts/brev-setup.sh
+++ b/scripts/brev-setup.sh
@@ -37,6 +37,64 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 export NEEDRESTART_MODE=a
 export DEBIAN_FRONTEND=noninteractive
 
+# ── GPU detection ─────────────────────────────────────────────────
+# Brev GPU images ship the nvidia-smi binary even on CPU-only instances.
+# We need nvidia-smi to actually *run* (driver loaded) to treat this as
+# a GPU host.  Export a flag so every later section uses the same check.
+if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+  HAS_GPU=true
+else
+  HAS_GPU=false
+fi
+info "GPU detected: $HAS_GPU"
+
+# ── Acquire exclusive apt access ──────────────────────────────────
+# Cloud VMs run multiple apt processes on boot: cloud-init, apt-daily,
+# and unattended-upgrades. We must wait for cloud-init AND disable the
+# background services before we can safely use apt ourselves.
+APT_WAIT_MAX=300   # seconds — fail if apt is still busy after 5 min
+APT_QUIET_WINDOW=5 # seconds — apt must be idle this long before we proceed
+
+if command -v cloud-init >/dev/null 2>&1; then
+  info "Waiting for cloud-init to finish..."
+  if ! cloud-init status --wait >/dev/null 2>&1; then
+    warn "cloud-init exited with an error — proceeding, but the VM may be in an inconsistent state"
+  fi
+  info "cloud-init done"
+fi
+# Stop background apt services that run independently of cloud-init
+info "Disabling background apt services..."
+sudo systemctl stop apt-daily.timer apt-daily-upgrade.timer unattended-upgrades 2>/dev/null || true
+sudo systemctl disable apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true
+# Kill any straggler apt/dpkg processes
+sudo pkill -9 -x apt-get 2>/dev/null || true
+sudo pkill -9 -x apt 2>/dev/null || true
+sudo pkill -9 -x dpkg 2>/dev/null || true
+# Release any stale locks left by killed processes
+sudo rm -f /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock 2>/dev/null || true
+sudo dpkg --configure -a 2>/dev/null || true
+
+# Wait until apt/dpkg are truly idle for $APT_QUIET_WINDOW consecutive seconds.
+apt_waited=0
+apt_quiet=0
+while [ "$apt_quiet" -lt "$APT_QUIET_WINDOW" ]; do
+  if [ "$apt_waited" -ge "$APT_WAIT_MAX" ]; then
+    fail "apt/dpkg still busy after ${APT_WAIT_MAX}s — cannot proceed"
+  fi
+  if pgrep -Ex "apt-get|apt|dpkg" >/dev/null 2>&1 \
+    || fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock 2>/dev/null; then
+    info "Waiting for apt/dpkg processes to finish... (${apt_waited}s elapsed)"
+    apt_quiet=0
+    sleep 2
+    apt_waited=$((apt_waited + 2))
+  else
+    apt_quiet=$((apt_quiet + 1))
+    sleep 1
+    apt_waited=$((apt_waited + 1))
+  fi
+done
+info "apt is now exclusively ours"
+
 # --- 0. Node.js (needed for services) ---
 if ! command -v node >/dev/null 2>&1; then
   info "Installing Node.js..."
@@ -55,9 +113,16 @@ if ! command -v node >/dev/null 2>&1; then
     else
       fail "No SHA-256 verification tool found (need sha256sum or shasum)"
     fi
-    sudo -E bash "$tmpdir/setup_node.sh" >/dev/null 2>&1
+    sudo -E bash "$tmpdir/setup_node.sh"
   )
-  sudo apt-get install -y -qq nodejs >/dev/null 2>&1
+  for attempt in 1 2 3 4 5; do
+    if sudo apt-get install -y -qq nodejs; then
+      break
+    fi
+    [ "$attempt" -eq 5 ] && fail "Node.js install failed after 5 attempts"
+    info "Node.js install failed (attempt $attempt/5), retrying in 30s..."
+    sleep 30
+  done
   info "Node.js $(node --version) installed"
 else
   info "Node.js already installed: $(node --version)"
@@ -66,8 +131,14 @@ fi
 # --- 1. Docker ---
 if ! command -v docker >/dev/null 2>&1; then
   info "Installing Docker..."
-  sudo apt-get update -qq >/dev/null 2>&1
-  sudo apt-get install -y -qq docker.io >/dev/null 2>&1
+  for attempt in 1 2 3 4 5; do
+    if sudo apt-get update -qq && sudo apt-get install -y -qq docker.io; then
+      break
+    fi
+    [ "$attempt" -eq 5 ] && fail "Docker install failed after 5 attempts"
+    info "Docker install failed (attempt $attempt/5), retrying in 30s..."
+    sleep 30
+  done
   sudo usermod -aG docker "$(whoami)"
   info "Docker installed"
 else
@@ -75,7 +146,7 @@ else
 fi
 
 # --- 2. NVIDIA Container Toolkit (if GPU present) ---
-if command -v nvidia-smi >/dev/null 2>&1; then
+if [ "$HAS_GPU" = true ]; then
   if ! dpkg -s nvidia-container-toolkit >/dev/null 2>&1; then
     info "Installing NVIDIA Container Toolkit..."
     curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
@@ -91,6 +162,22 @@ if command -v nvidia-smi >/dev/null 2>&1; then
   else
     info "NVIDIA Container Toolkit already installed"
   fi
+else
+  # CPU-only instance: ensure Docker uses runc (not nvidia) as default runtime.
+  # Brev GPU images pre-configure nvidia as the default Docker runtime even on
+  # CPU instances, causing "nvidia-container-cli: nvml error: driver not loaded"
+  # when starting containers.
+  if grep -q '"default-runtime".*nvidia' /etc/docker/daemon.json 2>/dev/null; then
+    info "Resetting Docker default runtime to runc (no GPU detected)..."
+    sudo python3 -c "
+import json
+with open('/etc/docker/daemon.json') as f: cfg = json.load(f)
+cfg.pop('default-runtime', None)
+with open('/etc/docker/daemon.json', 'w') as f: json.dump(cfg, f, indent=2)
+"
+    sudo systemctl restart docker
+    info "Docker runtime reset to runc"
+  fi
 fi
 
 # --- 3. openshell CLI (binary release, not pip) ---
@@ -139,7 +226,7 @@ fi
 VLLM_MODEL="nvidia/nemotron-3-nano-30b-a3b"
 if [ "${SKIP_VLLM:-}" = "1" ]; then
   info "Skipping vLLM install (SKIP_VLLM=1)"
-elif command -v nvidia-smi >/dev/null 2>&1; then
+elif [ "$HAS_GPU" = true ]; then
   if ! python3 -c "import vllm" 2>/dev/null; then
     info "Installing vLLM..."
     if ! command -v pip3 >/dev/null 2>&1; then
diff --git a/scripts/setup.sh b/scripts/setup.sh
index 4ba7f13dd..b503fb09c 100755
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -112,7 +112,8 @@ info "Starting OpenShell gateway..."
 openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
 docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | grep . && docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs docker volume rm || true
 GATEWAY_ARGS=(--name nemoclaw)
-command -v nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
+# Only enable GPU if nvidia-smi actually works (driver loaded), not just present on PATH
+nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
 if ! openshell gateway start "${GATEWAY_ARGS[@]}" 2>&1 | grep -E "Gateway|✓|Error|error"; then
   warn "Gateway start failed. Cleaning up stale state..."
   openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
diff --git a/test/e2e/brev-e2e.test.js b/test/e2e/brev-e2e.test.js
index cd298ddd7..8f251f67e 100644
--- a/test/e2e/brev-e2e.test.js
+++ b/test/e2e/brev-e2e.test.js
@@ -17,8 +17,7 @@
  *
  * Optional env vars:
  *   TEST_SUITE       — which test to run: full (default), credential-sanitization, telegram-injection, all
- *   BREV_MIN_VCPU    — Minimum vCPUs for CPU instance (default: 4)
- *   BREV_MIN_RAM     — Minimum RAM in GB for CPU instance (default: 16)
+ *   BREV_INSTANCE_TYPE — Brev/GCP instance type (default: n2-standard-4)
  */
 
 import { describe, it, expect, beforeAll, afterAll } from "vitest";
@@ -28,8 +27,8 @@ import { homedir } from "node:os";
 import path from "node:path";
 
 // CPU instance specs: min vCPUs and RAM for the instance search
-const BREV_MIN_VCPU = parseInt(process.env.BREV_MIN_VCPU || "4", 10);
-const BREV_MIN_RAM = parseInt(process.env.BREV_MIN_RAM || "16", 10);
+// Use a known CPU-only GCP instance type to avoid GPU images with broken nvidia runtime
+const BREV_INSTANCE_TYPE = process.env.BREV_INSTANCE_TYPE || "n2-standard-4";
 const INSTANCE_NAME = process.env.INSTANCE_NAME;
 const TEST_SUITE = process.env.TEST_SUITE || "full";
 const REPO_DIR = path.resolve(import.meta.dirname, "../..");
@@ -58,19 +57,15 @@ function ssh(cmd, { timeout = 120_000, stream = false } = {}) {
   return stream ? "" : result.trim();
 }
 
-/**
- * Escape a value for safe inclusion in a single-quoted shell string.
- * Replaces single quotes with the shell-safe sequence: '\''
- */
-function shellEscape(value) {
-  return String(value).replace(/'/g, "'\\''");
+function shellQuote(value) {
+  return `'${String(value).replace(/'/g, "'\\''")}'`;
 }
 
 /** Run a command on the remote VM with env vars set for NemoClaw. */
 function sshEnv(cmd, { timeout = 600_000, stream = false } = {}) {
   const envPrefix = [
-    `export NVIDIA_API_KEY='${shellEscape(process.env.NVIDIA_API_KEY)}'`,
-    `export GITHUB_TOKEN='${shellEscape(process.env.GITHUB_TOKEN)}'`,
+    `export NVIDIA_API_KEY=${shellQuote(process.env.NVIDIA_API_KEY)}`,
+    `export GITHUB_TOKEN=${shellQuote(process.env.GITHUB_TOKEN)}`,
     `export NEMOCLAW_NON_INTERACTIVE=1`,
     `export NEMOCLAW_SANDBOX_NAME=e2e-test`,
   ].join(" && ");
@@ -131,14 +126,21 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
     );
     brev("login", "--token", process.env.BREV_API_TOKEN);
 
-    // Create bare CPU instance via brev search cpu | brev create
-    console.log(`[${elapsed()}] Creating CPU instance via brev search cpu | brev create...`);
-    console.log(`[${elapsed()}]   min-vcpu: ${BREV_MIN_VCPU}, min-ram: ${BREV_MIN_RAM}GB`);
-    execSync(
-      `brev search cpu --min-vcpu ${BREV_MIN_VCPU} --min-ram ${BREV_MIN_RAM} --sort price | ` +
-        `brev create ${INSTANCE_NAME} --detached`,
-      { encoding: "utf-8", timeout: 180_000, stdio: ["pipe", "inherit", "inherit"] },
-    );
+    // Delete any leftover instance from a previous failed run
+    try {
+      brev("delete", INSTANCE_NAME);
+      console.log(`[${elapsed()}] Deleted leftover instance "${INSTANCE_NAME}"`);
+    } catch {
+      // Expected — no leftover instance
+    }
+
+    // Create CPU instance with a known GCP instance type
+    console.log(`[${elapsed()}] Creating CPU instance (type: ${BREV_INSTANCE_TYPE})...`);
+    execSync(`brev create --type ${BREV_INSTANCE_TYPE} ${INSTANCE_NAME} --detached`, {
+      encoding: "utf-8",
+      timeout: 180_000,
+      stdio: ["pipe", "inherit", "inherit"],
+    });
     instanceCreated = true;
     console.log(`[${elapsed()}] brev create returned (instance provisioning in background)`);
 
@@ -161,6 +163,11 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
     );
     console.log(`[${elapsed()}] Code synced`);
 
+    // Wait for cloud-init to finish — Brev instances run apt provisioning on boot
+    console.log(`[${elapsed()}] Waiting for cloud-init to finish...`);
+    ssh(`cloud-init status --wait 2>/dev/null || true`, { timeout: 600_000, stream: true });
+    console.log(`[${elapsed()}] cloud-init done`);
+
     // Bootstrap VM — stream output to CI log so we can see progress
     console.log(`[${elapsed()}] Running brev-setup.sh (bootstrap)...`);
     sshEnv(`cd ${remoteDir} && SKIP_VLLM=1 bash scripts/brev-setup.sh`, {

From 77e0862516ee1fcd5069f28b4e93b2bead75c376 Mon Sep 17 00:00:00 2001
From: Julie Yaunches <jyaunches@nvidia.com>
Date: Wed, 1 Apr 2026 15:34:35 -0400
Subject: [PATCH 2/2] test: add real runAgentInSandbox() E2E tests for telegram
 injection

Add Phases 7-11 to test-telegram-injection.sh addressing PR #1092
feedback from @cv: exercise real production code paths instead of
ad-hoc SSH commands.

New test phases:
- Phase 7: Real runAgentInSandbox() with $(command) and backtick
  injection payloads (T9-T11)
- Phase 8: Multi-line message handling with embedded injection (T12-T13)
- Phase 9: Session ID option injection prevention via sanitizeSessionId,
  including negative Telegram chat IDs (T14-T15)
- Phase 10: Empty and special-character-only session IDs (T16-T17)
- Phase 11: Cleanup of marker files and temp scripts

These tests are expected to FAIL on unfixed code (runAgentInSandbox is
not exported, signature doesn't accept options), proving they detect the
vulnerability. They will pass once PR #119 is merged.

Refs: #118, PR #1092
---
 test/e2e/test-telegram-injection.sh | 385 ++++++++++++++++++++++++++++
 1 file changed, 385 insertions(+)

diff --git a/test/e2e/test-telegram-injection.sh b/test/e2e/test-telegram-injection.sh
index 64ae41efb..0c671b22d 100755
--- a/test/e2e/test-telegram-injection.sh
+++ b/test/e2e/test-telegram-injection.sh
@@ -450,6 +450,391 @@ else
   fail "T8b: Message with special characters caused empty/error response"
 fi
 
+# ══════════════════════════════════════════════════════════════════
+# Phase 7: Real runAgentInSandbox() Invocation
+#
+# PR #1092 feedback from @cv: The tests above use ad-hoc SSH commands
+# that bypass the actual code path in telegram-bridge.js. This phase
+# exercises the real runAgentInSandbox() function via Node.js to ensure
+# the production code is validated.
+# ══════════════════════════════════════════════════════════════════
+section "Phase 7: Real runAgentInSandbox() Code Path"
+
+info "T9: Testing real runAgentInSandbox() with injection payload..."
+
+# Create a test script that invokes the actual runAgentInSandbox function
+cat >/tmp/test-real-bridge.js <<'TESTSCRIPT'
+// Test script to invoke the real runAgentInSandbox() function.
+//
+// NOTE: On unfixed code, this will fail because:
+//   1. runAgentInSandbox is not exported (no require.main guard)
+//   2. The function signature doesn't accept an options parameter
+//   3. initConfig() runs at require-time and fails without TELEGRAM_BOT_TOKEN
+// These failures are expected and prove the tests detect the missing fix.
+// After the fix PR is merged, this script will work correctly.
+const path = require('path');
+
+// Mock resolveOpenshell to use the system openshell
+const resolveOpenshellPath = require.resolve(path.join(process.cwd(), 'bin/lib/resolve-openshell'));
+require.cache[resolveOpenshellPath] = {
+  exports: { resolveOpenshell: () => process.env.OPENSHELL_PATH || 'openshell' }
+};
+
+const bridge = require(path.join(process.cwd(), 'scripts/telegram-bridge.js'));
+
+async function test() {
+  const message = process.argv[2] || 'Hello';
+  const sessionId = process.argv[3] || 'e2e-test';
+  const apiKey = process.env.NVIDIA_API_KEY;
+  const sandbox = process.env.NEMOCLAW_SANDBOX_NAME || 'e2e-test';
+  const openshell = process.env.OPENSHELL_PATH || 'openshell';
+
+  if (!apiKey) {
+    console.error('NVIDIA_API_KEY required');
+    process.exit(1);
+  }
+
+  try {
+    // Call the real runAgentInSandbox with options override for testing
+    const result = await bridge.runAgentInSandbox(message, sessionId, {
+      apiKey,
+      sandbox,
+      openshell,
+    });
+    console.log('RESULT_START');
+    console.log(result);
+    console.log('RESULT_END');
+  } catch (err) {
+    console.error('ERROR:', err.message);
+    process.exit(1);
+  }
+}
+
+test();
+TESTSCRIPT
+
+# Find openshell path
+OPENSHELL_PATH=$(command -v openshell) || OPENSHELL_PATH=""
+if [ -z "$OPENSHELL_PATH" ]; then
+  skip "T9: openshell not found, cannot test real runAgentInSandbox()"
+else
+  # Clean up any previous injection markers
+  sandbox_exec "rm -f /tmp/injection-proof-t9" >/dev/null 2>&1
+
+  # Test with injection payload through real runAgentInSandbox
+  t9_result=$(cd "$REPO" && OPENSHELL_PATH="$OPENSHELL_PATH" \
+    NEMOCLAW_SANDBOX_NAME="$SANDBOX_NAME" \
+    timeout 120 node /tmp/test-real-bridge.js \
+    '$(touch /tmp/injection-proof-t9 && echo INJECTED)' \
+    'e2e-injection-t9' 2>&1) || true
+
+  # Check if injection file was created
+  injection_check_t9=$(sandbox_exec "test -f /tmp/injection-proof-t9 && echo EXPLOITED || echo SAFE")
+  if echo "$injection_check_t9" | grep -q "SAFE"; then
+    pass "T9: Real runAgentInSandbox() prevented \$(command) injection"
+  else
+    fail "T9: Real runAgentInSandbox() EXECUTED injection — vulnerability in production code!"
+  fi
+fi
+
+# T10: Test backticks through real code path
+info "T10: Testing real runAgentInSandbox() with backtick payload..."
+
+if [ -z "$OPENSHELL_PATH" ]; then
+  skip "T10: openshell not found, cannot test real runAgentInSandbox()"
+else
+  sandbox_exec "rm -f /tmp/injection-proof-t10" >/dev/null 2>&1
+
+  t10_result=$(cd "$REPO" && OPENSHELL_PATH="$OPENSHELL_PATH" \
+    NEMOCLAW_SANDBOX_NAME="$SANDBOX_NAME" \
+    timeout 120 node /tmp/test-real-bridge.js \
+    '`touch /tmp/injection-proof-t10`' \
+    'e2e-injection-t10' 2>&1) || true
+
+  injection_check_t10=$(sandbox_exec "test -f /tmp/injection-proof-t10 && echo EXPLOITED || echo SAFE")
+  if echo "$injection_check_t10" | grep -q "SAFE"; then
+    pass "T10: Real runAgentInSandbox() prevented backtick injection"
+  else
+    fail "T10: Real runAgentInSandbox() EXECUTED backtick injection — vulnerability in production code!"
+  fi
+fi
+
+# T11: Test API key not exposed through real code path
+info "T11: Verifying API key not in process arguments (real code path)..."
+
+if [ -z "$OPENSHELL_PATH" ]; then
+  skip "T11: openshell not found, cannot test real runAgentInSandbox()"
+else
+  # Start the bridge test in background and check ps
+  cd "$REPO" && OPENSHELL_PATH="$OPENSHELL_PATH" \
+    NEMOCLAW_SANDBOX_NAME="$SANDBOX_NAME" \
+    timeout 60 node /tmp/test-real-bridge.js 'Hello world' 'e2e-ps-check' &
+  BRIDGE_PID=$!
+
+  # Poll the process list multiple times to catch short-lived SSH invocations
+  API_KEY_PREFIX="${NVIDIA_API_KEY:0:15}"
+  KEY_LEAKED=false
+  for _poll in $(seq 1 10); do
+    sleep 1
+    # shellcheck disable=SC2009 # pgrep doesn't support the output format we need
+    ps_output=$(ps aux 2>/dev/null | grep -v grep | grep -v "test-telegram-injection" || true)
+    if echo "$ps_output" | grep -qF "$API_KEY_PREFIX"; then
+      KEY_LEAKED=true
+      break
+    fi
+  done
+
+  if [ "$KEY_LEAKED" = true ]; then
+    fail "T11: API key found in process arguments via real runAgentInSandbox()"
+  else
+    pass "T11: API key NOT visible in process arguments (real code path)"
+  fi
+
+  # Clean up background process
+  kill $BRIDGE_PID 2>/dev/null || true
+  wait $BRIDGE_PID 2>/dev/null || true
+fi
+
+# ══════════════════════════════════════════════════════════════════
+# Phase 8: Multi-line Message Handling
+# ══════════════════════════════════════════════════════════════════
+section "Phase 8: Multi-line Message Handling"
+
+# T12: Multi-line message with injection on second line
+info "T12: Testing multi-line message with injection payload..."
+
+if [ -z "$OPENSHELL_PATH" ]; then
+  skip "T12: openshell not found, cannot test multi-line messages"
+else
+  sandbox_exec "rm -f /tmp/injection-proof-t12" >/dev/null 2>&1
+
+  # Multi-line message with injection attempt on line 2
+  MULTILINE_PAYLOAD=$'Hello\n$(touch /tmp/injection-proof-t12)\nWorld'
+
+  t12_result=$(cd "$REPO" && OPENSHELL_PATH="$OPENSHELL_PATH" \
+    NEMOCLAW_SANDBOX_NAME="$SANDBOX_NAME" \
+    timeout 120 node /tmp/test-real-bridge.js \
+    "$MULTILINE_PAYLOAD" \
+    'e2e-multiline-t12' 2>&1) || true
+
+  injection_check_t12=$(sandbox_exec "test -f /tmp/injection-proof-t12 && echo EXPLOITED || echo SAFE")
+  if echo "$injection_check_t12" | grep -q "SAFE"; then
+    pass "T12: Multi-line message injection was NOT executed"
+  else
+    fail "T12: Multi-line message injection was EXECUTED — vulnerability!"
+  fi
+fi
+
+# T13: Message with embedded newlines and backticks
+info "T13: Testing message with newlines and backticks..."
+
+ssh_config_t13="$(mktemp)"
+openshell sandbox ssh-config "$SANDBOX_NAME" >"$ssh_config_t13" 2>/dev/null
+sandbox_exec "rm -f /tmp/injection-proof-t13" >/dev/null 2>&1
+
+PAYLOAD_MULTILINE_BT=$'First line\n`touch /tmp/injection-proof-t13`\nThird line'
+
+timeout 30 ssh -F "$ssh_config_t13" \
+  -o StrictHostKeyChecking=no \
+  -o UserKnownHostsFile=/dev/null \
+  -o LogLevel=ERROR \
+  "openshell-${SANDBOX_NAME}" \
+  'MSG=$(cat) && echo "$MSG"' \
+  <<<"$PAYLOAD_MULTILINE_BT" >/dev/null 2>&1 || true
+rm -f "$ssh_config_t13"
+
+injection_check_t13=$(sandbox_exec "test -f /tmp/injection-proof-t13 && echo EXPLOITED || echo SAFE")
+if echo "$injection_check_t13" | grep -q "SAFE"; then
+  pass "T13: Multi-line backtick injection was NOT executed"
+else
+  fail "T13: Multi-line backtick injection was EXECUTED"
+fi
+
+# ══════════════════════════════════════════════════════════════════
+# Phase 9: Session ID Option Injection Prevention
+# ══════════════════════════════════════════════════════════════════
+section "Phase 9: Session ID Option Injection"
+
+# T14: Leading hyphen session IDs should be sanitized
+info "T14: Testing sanitizeSessionId strips leading hyphens..."
+
+t14_result=$(cd "$REPO" && node -e "
+  const path = require('path');
+  // Mock resolveOpenshell to avoid PATH dependency
+  const resolveOpenshellPath = require.resolve('./bin/lib/resolve-openshell');
+  require.cache[resolveOpenshellPath] = {
+    exports: { resolveOpenshell: () => '/mock/openshell' }
+  };
+  const bridge = require(path.join(process.cwd(), 'scripts/telegram-bridge.js'));
+
+  // Test cases for option injection prevention
+  const tests = [
+    { input: '--help', expected: 'help' },
+    { input: '-v', expected: 'v' },
+    { input: '---test', expected: 'test' },
+    { input: '-123456789', expected: '123456789' },
+    { input: '--', expected: null },
+    { input: '-', expected: null },
+    { input: 'valid-id', expected: 'valid-id' },
+    { input: '-abc-123', expected: 'abc-123' },
+  ];
+
+  let passed = 0;
+  let failed = 0;
+  for (const t of tests) {
+    const result = bridge.sanitizeSessionId(t.input);
+    if (result === t.expected) {
+      passed++;
+    } else {
+      console.log('FAIL: sanitizeSessionId(' + JSON.stringify(t.input) + ') = ' + JSON.stringify(result) + ', expected ' + JSON.stringify(t.expected));
+      failed++;
+    }
+  }
+  console.log('SANITIZE_RESULT: ' + passed + ' passed, ' + failed + ' failed');
+  process.exit(failed > 0 ? 1 : 0);
+" 2>&1)
+
+if echo "$t14_result" | grep -q "SANITIZE_RESULT:.*0 failed"; then
+  pass "T14: sanitizeSessionId correctly strips leading hyphens"
+else
+  fail "T14: sanitizeSessionId failed to strip leading hyphens: $t14_result"
+fi
+
+# T15: Verify leading hyphen session ID doesn't cause option injection in real call
+info "T15: Testing real runAgentInSandbox with leading-hyphen session ID..."
+
+if [ -z "$OPENSHELL_PATH" ]; then
+  skip "T15: openshell not found, cannot test session ID sanitization"
+else
+  # Create a fresh test script that checks the actual session ID used
+  cat >/tmp/test-session-id.js <<'TESTSCRIPT'
+const path = require('path');
+const resolveOpenshellPath = require.resolve(path.join(process.cwd(), 'bin/lib/resolve-openshell'));
+require.cache[resolveOpenshellPath] = {
+  exports: { resolveOpenshell: () => process.env.OPENSHELL_PATH || 'openshell' }
+};
+const bridge = require(path.join(process.cwd(), 'scripts/telegram-bridge.js'));
+
+// Just test the sanitization with a negative chat ID (like Telegram groups)
+const sessionId = process.argv[2] || '--help';
+const sanitized = bridge.sanitizeSessionId(sessionId);
+console.log('INPUT:', sessionId);
+console.log('SANITIZED:', sanitized);
+if (sanitized && sanitized.startsWith('-')) {
+  console.log('ERROR: Sanitized ID starts with hyphen — option injection possible!');
+  process.exit(1);
+} else {
+  console.log('OK: No leading hyphen in sanitized ID');
+  process.exit(0);
+}
+TESTSCRIPT
+
+  t15_result=$(cd "$REPO" && OPENSHELL_PATH="$OPENSHELL_PATH" \
+    node /tmp/test-session-id.js '--help' 2>&1) || true
+
+  if echo "$t15_result" | grep -q "OK: No leading hyphen"; then
+    pass "T15: Session ID '--help' sanitized to prevent option injection"
+  else
+    fail "T15: Session ID sanitization failed: $t15_result"
+  fi
+
+  # Also test with negative number (Telegram group chat ID)
+  t15b_result=$(cd "$REPO" && OPENSHELL_PATH="$OPENSHELL_PATH" \
+    node /tmp/test-session-id.js '-123456789' 2>&1) || true
+
+  if echo "$t15b_result" | grep -q "OK: No leading hyphen"; then
+    pass "T15b: Negative chat ID '-123456789' sanitized correctly"
+  else
+    fail "T15b: Negative chat ID sanitization failed: $t15b_result"
+  fi
+
+  rm -f /tmp/test-session-id.js
+fi
+
+# ══════════════════════════════════════════════════════════════════
+# Phase 10: Empty Message Handling
+# ══════════════════════════════════════════════════════════════════
+section "Phase 10: Empty and Edge Case Messages"
+
+# T16: Empty session ID should return error
+info "T16: Testing empty session ID handling..."
+
+t16_result=$(cd "$REPO" && node -e "
+  const path = require('path');
+  const resolveOpenshellPath = require.resolve('./bin/lib/resolve-openshell');
+  require.cache[resolveOpenshellPath] = {
+    exports: { resolveOpenshell: () => '/mock/openshell' }
+  };
+  const bridge = require(path.join(process.cwd(), 'scripts/telegram-bridge.js'));
+
+  const result = bridge.sanitizeSessionId('');
+  if (result === null) {
+    console.log('OK: Empty session ID returns null');
+    process.exit(0);
+  } else {
+    console.log('FAIL: Empty session ID returned:', result);
+    process.exit(1);
+  }
+" 2>&1)
+
+if echo "$t16_result" | grep -q "OK:"; then
+  pass "T16: Empty session ID correctly returns null"
+else
+  fail "T16: Empty session ID handling failed: $t16_result"
+fi
+
+# T17: Session ID with only special characters
+info "T17: Testing session ID with only special characters..."
+
+t17_result=$(cd "$REPO" && node -e "
+  const path = require('path');
+  const resolveOpenshellPath = require.resolve('./bin/lib/resolve-openshell');
+  require.cache[resolveOpenshellPath] = {
+    exports: { resolveOpenshell: () => '/mock/openshell' }
+  };
+  const bridge = require(path.join(process.cwd(), 'scripts/telegram-bridge.js'));
+
+  const testCases = [
+    { input: ';;;', expected: null },
+    { input: '\$()', expected: null },
+    { input: '---', expected: null },
+    { input: '!@#\$%', expected: null },
+  ];
+
+  let ok = true;
+  for (const tc of testCases) {
+    const result = bridge.sanitizeSessionId(tc.input);
+    if (result !== tc.expected) {
+      console.log('FAIL:', JSON.stringify(tc.input), '→', result, 'expected', tc.expected);
+      ok = false;
+    }
+  }
+  if (ok) {
+    console.log('OK: All special-character session IDs return null');
+  }
+  process.exit(ok ? 0 : 1);
+" 2>&1)
+
+if echo "$t17_result" | grep -q "OK:"; then
+  pass "T17: Special-character-only session IDs correctly return null"
+else
+  fail "T17: Special character handling failed: $t17_result"
+fi
+
+# ══════════════════════════════════════════════════════════════════
+# Phase 11: Cleanup
+# ══════════════════════════════════════════════════════════════════
+section "Phase 11: Cleanup"
+
+info "Cleaning up injection marker files in sandbox..."
+sandbox_exec "rm -f /tmp/injection-proof-t1 /tmp/injection-proof-t2 /tmp/injection-proof-t3 \
+  /tmp/injection-proof-t9 /tmp/injection-proof-t10 /tmp/injection-proof-t12 /tmp/injection-proof-t13" >/dev/null 2>&1
+
+info "Cleaning up local temp files..."
+rm -f /tmp/test-real-bridge.js /tmp/test-session-id.js 2>/dev/null || true
+
+pass "Cleanup completed"
+
 # ══════════════════════════════════════════════════════════════════
 # Summary
 # ══════════════════════════════════════════════════════════════════