flagos-ai · Darryl233 · Feb 22, 2026 · Feb 22, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
@@ -7,6 +7,8 @@ display_name: "CUDA Tests"
 
 # Docker image for this hardware
 ci_image: localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.1-time2510131515
+ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12-20260224162355
+ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12-20260225234600
 
 # Runner labels for this hardware
 runner_labels:

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
@@ -15,7 +15,9 @@ jobs:
         shell: bash
     runs-on: ubuntu-latest
     outputs:
-      ci_image: ${{ steps.config.outputs.ci_image }}
+      # ci_image: ${{ steps.config.outputs.ci_image }}
+      ci_train_image: ${{ steps.config.outputs.ci_train_image }}
+      ci_inference_image: ${{ steps.config.outputs.ci_inference_image }}
       runs_on: ${{ steps.config.outputs.runs_on }}
       container_volumes: ${{ steps.config.outputs.container_volumes }}
       container_options: ${{ steps.config.outputs.container_options }}
@@ -108,7 +110,7 @@ jobs:
     with:
       platform: ${{ inputs.platform }}
       device: ${{ matrix.device }}
-      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
       runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
       container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
@@ -126,7 +128,7 @@ jobs:
     with:
       platform: ${{ inputs.platform }}
       test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }}
-      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
       runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
       container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
@@ -144,7 +146,7 @@ jobs:
     with:
       platform: ${{ inputs.platform }}
       test_matrix: ${{ needs.checkout_and_config.outputs.hetero_train_test_matrix }}
-      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
       runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
       container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
@@ -154,41 +156,41 @@ jobs:
       env_path: ${{ needs.checkout_and_config.outputs.env_path }}
 
-  # NOTE: Inference, serve, and rl functional tests are temporarily disabled
+  # NOTE: RL functional tests are temporarily disabled
-  # functional_tests_inference:
-  #   needs:
-  #     - checkout_and_config
-  #     - unit_tests
-  #   if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null
-  #   uses: ./.github/workflows/functional_tests_inference.yml
-  #   with:
-  #     platform: ${{ inputs.platform }}
-  #     test_matrix: ${{ needs.checkout_and_config.outputs.inference_test_matrix }}
-  #     image: ${{ needs.checkout_and_config.outputs.ci_image }}
-  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
-  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
-  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
-  #     source_artifact: flagscale-source-${{ github.sha }}
-  #     pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
-  #     env_name: ${{ needs.checkout_and_config.outputs.env_name_inference }}
-  #     env_path: ${{ needs.checkout_and_config.outputs.env_path }}
+  functional_tests_inference:
+    needs:
+      - checkout_and_config
+      - unit_tests
+    if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null
+    uses: ./.github/workflows/functional_tests_inference.yml
+    with:
+      platform: ${{ inputs.platform }}
+      test_matrix: ${{ needs.checkout_and_config.outputs.inference_test_matrix }}
+      image: ${{ needs.checkout_and_config.outputs.ci_inference_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      source_artifact: flagscale-source-${{ github.sha }}
+      pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
+      env_name: ${{ needs.checkout_and_config.outputs.env_name_inference }}
+      env_path: ${{ needs.checkout_and_config.outputs.env_path }}
 
-  # functional_tests_serve:
-  #   needs:
-  #     - checkout_and_config
-  #     - unit_tests
-  #   if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null
-  #   uses: ./.github/workflows/functional_tests_serve.yml
-  #   with:
-  #     platform: ${{ inputs.platform }}
-  #     test_matrix: ${{ needs.checkout_and_config.outputs.serve_test_matrix }}
-  #     image: ${{ needs.checkout_and_config.outputs.ci_image }}
-  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
-  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
-  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
-  #     source_artifact: flagscale-source-${{ github.sha }}
-  #     pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
-  #     env_name: ${{ needs.checkout_and_config.outputs.env_name_serve }}
-  #     env_path: ${{ needs.checkout_and_config.outputs.env_path }}
+  functional_tests_serve:
+    needs:
+      - checkout_and_config
+      - unit_tests
+    if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null
+    uses: ./.github/workflows/functional_tests_serve.yml
+    with:
+      platform: ${{ inputs.platform }}
+      test_matrix: ${{ needs.checkout_and_config.outputs.serve_test_matrix }}
+      image: ${{ needs.checkout_and_config.outputs.ci_inference_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      source_artifact: flagscale-source-${{ github.sha }}
+      pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
+      env_name: ${{ needs.checkout_and_config.outputs.env_name_serve }}
+      env_path: ${{ needs.checkout_and_config.outputs.env_path }}
 
   # functional_tests_rl:
   #   needs:

diff --git a/.github/workflows/functional_tests_hetero_train.yml b/.github/workflows/functional_tests_hetero_train.yml
@@ -237,4 +237,13 @@ jobs:
 
           echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
           exit $exit_code
-        timeout-minutes: 15
+        timeout-minutes: 30
+
+      - name: Upload Functional Test Logs
+        if: always() && steps.functional_test.outcome == 'failure'
+        uses: actions/upload-artifact@v4
+        with:
+          name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
+          path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
+          retention-days: 7
+          if-no-files-found: warn
diff --git a/.github/workflows/functional_tests_inference.yml b/.github/workflows/functional_tests_inference.yml
@@ -52,7 +52,7 @@ jobs:
       run:
         shell: bash
     env:
-      PROJECT_ROOT: /workspace/FlagScale
+      PROJECT_ROOT: /tmp/FlagScale
     runs-on: ${{ fromJson(inputs.runs_on) }}
     strategy:
       fail-fast: false
@@ -231,6 +231,6 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
-          path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/results_test
+          path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
           retention-days: 7
           if-no-files-found: warn
diff --git a/.github/workflows/functional_tests_rl.yml b/.github/workflows/functional_tests_rl.yml
@@ -225,3 +225,12 @@ jobs:
           echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
           exit $exit_code
         timeout-minutes: 15
+
+      - name: Upload Functional Test Logs
+        if: always() && steps.functional_test.outcome == 'failure'
+        uses: actions/upload-artifact@v4
+        with:
+          name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
+          path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
+          retention-days: 7
+          if-no-files-found: warn
diff --git a/.github/workflows/functional_tests_serve.yml b/.github/workflows/functional_tests_serve.yml
@@ -52,7 +52,7 @@ jobs:
       run:
         shell: bash
     env:
-      PROJECT_ROOT: /workspace/FlagScale
+      PROJECT_ROOT: ${{ github.workspace }}
     runs-on: ${{ fromJson(inputs.runs_on) }}
     strategy:
       fail-fast: false
@@ -231,6 +231,6 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
-          path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/results_test
+          path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
           retention-days: 7
           if-no-files-found: warn
diff --git a/.github/workflows/functional_tests_train.yml b/.github/workflows/functional_tests_train.yml
@@ -147,26 +147,25 @@ jobs:
 
           echo "Python location: $(which python)"
           echo "Python version: $(python --version)"
-
-          # Install task source dependencies (pip deps are pre-installed in the env)
-          echo "Installing task source dependencies..."
-
-          # Derive install-dir from env_path (e.g., /root/miniconda3 -> /root)
-          INSTALL_DIR=""
-          if [ "$PKG_MGR" = "conda" ] && [ -n "$ENV_PATH" ]; then
-            INSTALL_DIR=$(dirname "$ENV_PATH")
-          fi
-
-          # Only install Megatron-LM source dep (pip deps are pre-installed in Docker image)
-          ./tools/install/install.sh \
-            --platform ${{ inputs.platform }} \
-            --task train \
-            --pkg-mgr "$PKG_MGR" \
-            ${ENV_NAME:+--env-name "$ENV_NAME"} \
-            ${INSTALL_DIR:+--install-dir "$INSTALL_DIR"} \
-            --no-system --no-dev --no-base --no-task \
-            --src-deps megatron-lm \
-            --retry-count 3
+          pip install sentencepiece==0.2.1 transformers==4.57.6 tiktoken==0.12.0
+          # # Install task source dependencies (pip deps are pre-installed in the env)
+          # echo "Installing task source dependencies..."
+
+          # # Derive install-dir from env_path (e.g., /root/miniconda3 -> /root)
+          # INSTALL_DIR=""
+          # if [ "$PKG_MGR" = "conda" ] && [ -n "$ENV_PATH" ]; then
+          #   INSTALL_DIR=$(dirname "$ENV_PATH")
+          # fi
+
+          # # Only install Megatron-LM source dep (pip deps are pre-installed in Docker image)
+          # ./tools/install/install.sh \
+          #   --platform ${{ inputs.platform }} \
+          #   --task train \
+          #   --pkg-mgr "$PKG_MGR" \
+          #   ${ENV_NAME:+--env-name "$ENV_NAME"} \
+          #   ${INSTALL_DIR:+--install-dir "$INSTALL_DIR"} \
+          #   --no-system --no-dev --no-base --no-task \
+          #   --retry-count 3
         timeout-minutes: 30
 
       - name: Run functional tests
@@ -237,4 +236,13 @@ jobs:
 
           echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
           exit $exit_code
-        timeout-minutes: 15
+        timeout-minutes: 30
+
+      - name: Upload Functional Test Logs
+        if: always() && steps.functional_test.outcome == 'failure'
+        uses: actions/upload-artifact@v4
+        with:
+          name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
+          path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
+          retention-days: 7
+          if-no-files-found: warn
diff --git a/docker/build.sh b/docker/build.sh
@@ -10,6 +10,8 @@
 #   ./docker/build.sh --platform cuda
 #   ./docker/build.sh --platform cuda --task train
 #   ./docker/build.sh --platform cuda --task train --target dev
+#   ./docker/build.sh --platform cuda --task train --target dev --build-arg PKG_MGR=conda
+
 
 set -euo pipefail
 
@@ -44,6 +46,7 @@ TASK=""
 TARGET="dev"
 TAG_PREFIX="flagscale"
 NO_CACHE=false
+BUILD_ARGS=()
 
 # PyPI index URLs (optional, for custom mirrors)
 PIP_INDEX_URL="${PIP_INDEX_URL:-}"
@@ -105,6 +108,7 @@ OPTIONS:
     --tag-prefix PREFIX  Image tag prefix (default: flagscale)
     --index-url URL      PyPI index URL (for custom mirrors)
     --extra-index-url URL  Extra PyPI index URL
+    --build-arg K=V      Pass build-arg to docker (can be repeated)
     --no-cache           Build without cache
     --help               Show this help message
 
@@ -119,6 +123,7 @@ EXAMPLES:
     $0 --platform cuda
     $0 --platform cuda --task train
     $0 --platform cuda --task train --target dev
+    $0 --platform cuda --task train --target dev --build-arg PKG_MGR=conda
     CUDA_VERSION=12.4.0 $0 --platform cuda --task train
 
 EOF
@@ -136,6 +141,7 @@ parse_args() {
             --tag-prefix)       TAG_PREFIX="$2"; shift 2 ;;
             --index-url)        PIP_INDEX_URL="$2"; shift 2 ;;
             --extra-index-url)  PIP_EXTRA_INDEX_URL="$2"; shift 2 ;;
+            --build-arg)        BUILD_ARGS+=("$2"); shift 2 ;;
             --no-cache)         NO_CACHE=true; shift ;;
             --help|-h)          usage; exit 0 ;;
             *)
@@ -165,6 +171,9 @@ get_image_tag() {
     # Add python version
     tag="${tag}-py${PYTHON_VERSION}"
 
+    # Add timestamp
+    tag="${tag}-$(date +%Y%m%d%H%M%S)"
+
     echo "$tag"
 }
 
@@ -222,6 +231,10 @@ build_image() {
     fi
 
     [ "$NO_CACHE" = true ] && build_cmd="$build_cmd --no-cache"
+    for arg in ${BUILD_ARGS[@]+"${BUILD_ARGS[@]}"}; do  # guard: empty array + `set -u` aborts on bash < 4.4
+        log_info "Build-arg: $arg"
+        build_cmd="$build_cmd --build-arg \"$arg\""
+    done
     build_cmd="$build_cmd $PROJECT_ROOT"
 
     log_info "Running: $build_cmd"

diff --git a/docker/cuda/Dockerfile.all b/docker/cuda/Dockerfile.all
@@ -90,10 +90,39 @@ ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/l
 
 WORKDIR /workspace
 
+# =============================================================================
+# ENV STAGE - Ensure uv/conda/pip environments
+# =============================================================================
+FROM base AS env
+
+ARG PKG_MGR
+ARG FLAGSCALE_HOME=/opt/flagscale
+ARG PYTHON_VERSION=3.12
+
+RUN . /etc/profile.d/flagscale-env.sh && \
+    if [ "$PKG_MGR" = "uv" ]; then \
+        if [ ! -f "${FLAGSCALE_HOME}/venv/bin/python3" ]; then \
+            echo "[ENV] Creating uv venv at ${FLAGSCALE_HOME}/venv (python=${PYTHON_VERSION})"; \
+            "$HOME/.local/bin/uv" venv "${FLAGSCALE_HOME}/venv" --python "${PYTHON_VERSION}"; \
+        else \
+            echo "[ENV] Found uv venv at ${FLAGSCALE_HOME}/venv"; \
+        fi; \
+    elif [ "$PKG_MGR" = "conda" ]; then \
+        CONDA_ENV_NAME=flagscale-all; \
+        if [ -d "${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}" ]; then \
+            echo "[ENV] Found conda env ${CONDA_ENV_NAME} at ${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}"; \
+        else \
+            echo "[ENV] Creating conda env ${CONDA_ENV_NAME} (python=${PYTHON_VERSION})"; \
+            env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" create -y -n "${CONDA_ENV_NAME}" "python=${PYTHON_VERSION}"; \
+        fi; \
+    else \
+        echo "[ENV] Using system pip (no dedicated env to create)"; \
+    fi
+
 # =============================================================================
 # DEPS STAGE - Install all dependencies using install folder
 # =============================================================================
-FROM base AS deps
+FROM env AS deps
 
 ARG PYTORCH_INDEX
 ARG PKG_MGR
@@ -153,9 +182,9 @@ WORKDIR /workspace/FlagScale
 CMD ["/bin/bash"]
 
 # =============================================================================
-# RELEASE STAGE - Production image (uses same base for consistency)
+# RELEASE STAGE - Production image (uses same env for consistency)
 # =============================================================================
-FROM base AS release
+FROM env AS release
 
 ARG PYTORCH_INDEX
 ARG PKG_MGR