Skip to content

Commit ab1ceff

Browse files
committed
update build-image at nvidia
Signed-off-by: Ceng23333 <441651826@qq.com>
1 parent 495da3e commit ab1ceff

3 files changed

Lines changed: 45 additions & 3 deletions

File tree

deployment/cases/9g_8b/validate.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,47 @@ REGISTRY_URL="http://${REGISTRY_IP}:${REGISTRY_PORT}"
3131
ROUTER_URL="http://${REGISTRY_IP}:${ROUTER_PORT}"
3232
BABYSITTER_URL="http://${REGISTRY_IP}:${BABYSITTER_HEALTH_PORT}"
3333

34+
# Wait for server to be ready: (1) registry, then (2) model loaded (/models has 9g_8b_thinking)
#
# Readiness-wait settings (both overridable from the environment):
#   VALIDATE_WAIT_TIMEOUT_SEC  - wait budget in whole seconds, counted from
#                                WAIT_START (default 300)
#   VALIDATE_WAIT_INTERVAL_SEC - pause between polls, handed to sleep (default 5)
VALIDATE_WAIT_TIMEOUT_SEC="${VALIDATE_WAIT_TIMEOUT_SEC:-300}"
VALIDATE_WAIT_INTERVAL_SEC="${VALIDATE_WAIT_INTERVAL_SEC:-5}"

# The timeout is compared with `[ ... -ge ... ]`, which fails (and therefore
# never reports "timed out") when an operand is not an integer. A malformed
# override would make the wait loops spin forever, so fall back to the default.
case "${VALIDATE_WAIT_TIMEOUT_SEC}" in
  *[!0-9]*)
    echo "WARNING: VALIDATE_WAIT_TIMEOUT_SEC='${VALIDATE_WAIT_TIMEOUT_SEC}' is not a non-negative integer; using 300" >&2
    VALIDATE_WAIT_TIMEOUT_SEC=300
    ;;
esac

# Epoch seconds at which waiting began; elapsed time is measured against this.
WAIT_START=$(date +%s)
echo "Waiting for server to be ready (timeout: ${VALIDATE_WAIT_TIMEOUT_SEC}s, interval: ${VALIDATE_WAIT_INTERVAL_SEC}s)..."
39+
40+
# Phase 1: wait for registry
# Polls ${REGISTRY_URL}/health until it answers successfully or
# VALIDATE_WAIT_TIMEOUT_SEC seconds have passed since WAIT_START. On timeout the
# loop only prints a message and execution continues with the next step.
while true; do
  # -f: treat an HTTP error status (>= 400) as "not ready yet".
  # --max-time: bound the whole request; --connect-timeout alone does not cover
  # a server that accepts the connection but never sends a response.
  if curl -sf --connect-timeout 3 --max-time 10 "${REGISTRY_URL}/health" >/dev/null 2>&1; then
    ELAPSED=$(($(date +%s) - WAIT_START))
    echo " -> Registry ready after ${ELAPSED}s"
    break
  fi
  ELAPSED=$(($(date +%s) - WAIT_START))
  if [ "${ELAPSED}" -ge "${VALIDATE_WAIT_TIMEOUT_SEC}" ]; then
    echo " -> Timeout after ${ELAPSED}s (registry not responding)"
    break
  fi
  echo " -> Waiting for registry... (${ELAPSED}s elapsed)"
  sleep "${VALIDATE_WAIT_INTERVAL_SEC}"
done
55+
56+
# Phase 2: wait for model loaded (router /models output mentions the model name)
# Polls ${ROUTER_URL}/models until the response body contains the model name or
# VALIDATE_WAIT_TIMEOUT_SEC seconds have passed since WAIT_START. On timeout the
# loop only prints a message and execution continues with the next step.
#
# VALIDATE_MODEL_NAME: name to look for; override from the environment when the
# deployed model is not the default 9g_8b_thinking.
VALIDATE_MODEL_NAME="${VALIDATE_MODEL_NAME:-9g_8b_thinking}"
echo " -> Waiting for model (${VALIDATE_MODEL_NAME}) to load..."
while true; do
  # -f: an HTTP error response must not count, even if its body happens to
  # mention the model name. --max-time keeps a stalled response from blocking
  # the loop beyond the deadline. Any curl failure is treated as "no models".
  MODELS=$(curl -sf --connect-timeout 5 --max-time 15 "${ROUTER_URL}/models" 2>/dev/null) || MODELS="{}"
  # Literal substring match done in the shell: no regex interpretation of the
  # name and no pipeline whose exit status could be affected by pipefail.
  case "${MODELS}" in
    *"${VALIDATE_MODEL_NAME}"*)
      ELAPSED=$(($(date +%s) - WAIT_START))
      echo " -> Model ready after ${ELAPSED}s"
      break
      ;;
  esac
  ELAPSED=$(($(date +%s) - WAIT_START))
  if [ "${ELAPSED}" -ge "${VALIDATE_WAIT_TIMEOUT_SEC}" ]; then
    echo " -> Timeout after ${ELAPSED}s (model not loaded)"
    break
  fi
  echo " -> Waiting for model... (${ELAPSED}s elapsed)"
  sleep "${VALIDATE_WAIT_INTERVAL_SEC}"
done
echo ""
74+
3475
FAILED=0
3576

3677
echo "=========================================="

docker/nvidia/build-image.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
2020

2121
DEFAULT_BASE_IMAGE="nvcr.io/nvidia/pytorch:25.12-py3"
2222
BASE_IMAGE="${BASE_IMAGE:-${DEFAULT_BASE_IMAGE}}"
23-
IMAGE_TAG="${IMAGE_TAG:-infinilm-svc:nvidia}"
23+
# Default: unique tag with timestamp; also tag as infinilm-svc:nvidia for deps reuse
24+
IMAGE_TAG="${IMAGE_TAG:-infinilm-svc:nvidia-$(date +%Y%m%d-%H%M%S)}"
2425
NO_CACHE=""
2526
# Accept a --phase flag (for compatibility with metax build script),
2627
# but this simple NVIDIA builder is effectively single-phase.

script/launch_babysitter_9g8b.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ RESTART_DELAY=5
2424
HEARTBEAT_INTERVAL=30
2525

2626
# InfiniLM Server Configuration (for SERVICE_TYPE="InfiniLM")
27-
MODEL_PATH="/models/9g_8b_thinking" # Required for InfiniLM
27+
MODEL_PATH="${MODEL_PATH:-/models/9g_8b_thinking}" # Required for InfiniLM; override via env
2828
MODEL_NAME="" # Model name for /models endpoint (leave empty to use directory name from MODEL_PATH, like vLLM/llama.cpp)
2929
LAUNCH_SCRIPT="" # Path to launch_server.py (leave empty for auto-detect)
30-
DEV="metax" # Device type: nvidia, metax, etc.
30+
DEV="${DEV:-metax}" # Device type: nvidia, metax, etc.; override via env
3131
NDEV=1 # Number of devices
3232
MAX_BATCH=16 # Max batch size
3333
MAX_TOKENS="" # Optional, leave empty for default

0 commit comments

Comments
 (0)