diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml index b47552106..98cdd0f6e 100644 --- a/.github/configs/cuda.yml +++ b/.github/configs/cuda.yml @@ -26,3 +26,32 @@ container_volumes: # Container options (hardware-specific settings) container_options: "--gpus all --shm-size=500g --hostname flagscale_cicd --user root --ulimit nofile=65535:65535" + +# ============================================================================= +# Package Manager Configuration +# ============================================================================= +# Supported package managers: pip, uv, conda +# - pip: Use pip directly (standard Python) +# - uv: Use uv pip (fast, modern package manager) +# - conda: Use conda environment with pip for PyPI packages +# +# Unified environment parameters: +# - env_names: Per-task conda environment names (for conda only) +# - env_path: Environment path (venv path for uv, conda installation path for conda) +# +# To transition to uv in the future: +# 1. Change pkg_mgr to "uv" +# 2. Ensure uv is installed in the Docker image +# 3. 
Set env_path to the virtual environment path (e.g., "/opt/venv") +# +pkg_mgr: "conda" # Current: conda for CI/CD compatibility + +# Environment path (venv path for uv, conda installation path for conda) +env_path: "/root/miniconda3" + +# Conda environment name (for conda only) +env_names: + train: "flagscale-train" + hetero_train: "flagscale-train" + inference: "flagscale-inference" + rl: "flagscale-rl" diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index 5c17a0413..cc8e0eb69 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -25,6 +25,12 @@ jobs: inference_test_matrix: ${{ steps.config.outputs.inference_test_matrix }} serve_test_matrix: ${{ steps.config.outputs.serve_test_matrix }} rl_test_matrix: ${{ steps.config.outputs.rl_test_matrix }} + pkg_mgr: ${{ steps.config.outputs.pkg_mgr }} + env_path: ${{ steps.config.outputs.env_path }} + env_name_train: ${{ steps.config.outputs.env_name_train }} + env_name_inference: ${{ steps.config.outputs.env_name_inference }} + env_name_serve: ${{ steps.config.outputs.env_name_serve }} + env_name_rl: ${{ steps.config.outputs.env_name_rl }} steps: - name: Checkout code uses: actions/checkout@v4 @@ -107,8 +113,9 @@ jobs: container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} container_options: ${{ needs.checkout_and_config.outputs.container_options }} source_artifact: flagscale-source-${{ github.sha }} - conda_env: flagscale-train # Optional: can be empty for non-conda environments - conda_path: "/root/miniconda3" # Optional: specify custom conda path, empty for auto-detection + pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} + env_path: ${{ needs.checkout_and_config.outputs.env_path }} functional_tests_train: needs: @@ -124,8 +131,9 @@ jobs: container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} container_options: 
${{ needs.checkout_and_config.outputs.container_options }} source_artifact: flagscale-source-${{ github.sha }} - conda_env: flagscale-train # Optional: can be empty for non-conda environments - conda_path: "/root/miniconda3" # Optional: specify custom conda path, empty for auto-detection + pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} + env_path: ${{ needs.checkout_and_config.outputs.env_path }} functional_tests_hetero_train: needs: @@ -141,59 +149,64 @@ jobs: container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} container_options: ${{ needs.checkout_and_config.outputs.container_options }} source_artifact: flagscale-source-${{ github.sha }} - conda_env: flagscale-train # Optional: can be empty for non-conda environments - conda_path: "/root/miniconda3" # Optional: specify custom conda path, empty for auto-detection + pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} + env_path: ${{ needs.checkout_and_config.outputs.env_path }} - functional_tests_inference: - needs: - - checkout_and_config - - unit_tests - if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null - uses: ./.github/workflows/functional_tests_inference.yml - with: - platform: ${{ inputs.platform }} - test_matrix: ${{ needs.checkout_and_config.outputs.inference_test_matrix }} - image: ${{ needs.checkout_and_config.outputs.ci_image }} - runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} - container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} - container_options: ${{ needs.checkout_and_config.outputs.container_options }} - source_artifact: flagscale-source-${{ github.sha }} - conda_env: flagscale-inference # Optional: can be empty for non-conda environments - conda_path: "/root/miniconda3" # Optional: specify custom conda path, empty for auto-detection + # NOTE: Inference, 
serve, and rl functional tests are temporarily disabled + # functional_tests_inference: + # needs: + # - checkout_and_config + # - unit_tests + # if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null + # uses: ./.github/workflows/functional_tests_inference.yml + # with: + # platform: ${{ inputs.platform }} + # test_matrix: ${{ needs.checkout_and_config.outputs.inference_test_matrix }} + # image: ${{ needs.checkout_and_config.outputs.ci_image }} + # runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} + # container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} + # container_options: ${{ needs.checkout_and_config.outputs.container_options }} + # source_artifact: flagscale-source-${{ github.sha }} + # pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + # env_name: ${{ needs.checkout_and_config.outputs.env_name_inference }} + # env_path: ${{ needs.checkout_and_config.outputs.env_path }} - functional_tests_serve: - needs: - - checkout_and_config - - unit_tests - if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null - uses: ./.github/workflows/functional_tests_serve.yml - with: - platform: ${{ inputs.platform }} - test_matrix: ${{ needs.checkout_and_config.outputs.serve_test_matrix }} - image: ${{ needs.checkout_and_config.outputs.ci_image }} - runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} - container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} - container_options: ${{ needs.checkout_and_config.outputs.container_options }} - source_artifact: flagscale-source-${{ github.sha }} - conda_env: flagscale-inference # Optional: can be empty for non-conda environments - conda_path: "/root/miniconda3" # Optional: specify custom conda path, empty for auto-detection + # functional_tests_serve: + # needs: + # - checkout_and_config + # - unit_tests + # if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null + # uses: 
./.github/workflows/functional_tests_serve.yml + # with: + # platform: ${{ inputs.platform }} + # test_matrix: ${{ needs.checkout_and_config.outputs.serve_test_matrix }} + # image: ${{ needs.checkout_and_config.outputs.ci_image }} + # runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} + # container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} + # container_options: ${{ needs.checkout_and_config.outputs.container_options }} + # source_artifact: flagscale-source-${{ github.sha }} + # pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + # env_name: ${{ needs.checkout_and_config.outputs.env_name_serve }} + # env_path: ${{ needs.checkout_and_config.outputs.env_path }} - functional_tests_rl: - needs: - - checkout_and_config - - unit_tests - if: fromJson(needs.checkout_and_config.outputs.rl_test_matrix)[0] != null - uses: ./.github/workflows/functional_tests_rl.yml - with: - platform: ${{ inputs.platform }} - test_matrix: ${{ needs.checkout_and_config.outputs.rl_test_matrix }} - image: ${{ needs.checkout_and_config.outputs.ci_image }} - runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} - container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} - container_options: ${{ needs.checkout_and_config.outputs.container_options }} - source_artifact: flagscale-source-${{ github.sha }} - conda_env: flagscale-RL # Optional: can be empty for non-conda environments - conda_path: "/root/miniconda3" # Optional: specify custom conda path, empty for auto-detection + # functional_tests_rl: + # needs: + # - checkout_and_config + # - unit_tests + # if: fromJson(needs.checkout_and_config.outputs.rl_test_matrix)[0] != null + # uses: ./.github/workflows/functional_tests_rl.yml + # with: + # platform: ${{ inputs.platform }} + # test_matrix: ${{ needs.checkout_and_config.outputs.rl_test_matrix }} + # image: ${{ needs.checkout_and_config.outputs.ci_image }} + # runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} + # 
container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} + # container_options: ${{ needs.checkout_and_config.outputs.container_options }} + # source_artifact: flagscale-source-${{ github.sha }} + # pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + # env_name: ${{ needs.checkout_and_config.outputs.env_name_rl }} + # env_path: ${{ needs.checkout_and_config.outputs.env_path }} all_tests_complete: defaults: @@ -204,9 +217,10 @@ jobs: - unit_tests - functional_tests_train - functional_tests_hetero_train - - functional_tests_inference - - functional_tests_serve - - functional_tests_rl + # NOTE: Disabled tests removed from needs + # - functional_tests_inference + # - functional_tests_serve + # - functional_tests_rl runs-on: ubuntu-latest if: always() steps: @@ -233,23 +247,24 @@ jobs: failed=true fi - if [ "${{ needs.functional_tests_inference.result }}" != "success" ] && \ - [ "${{ needs.functional_tests_inference.result }}" != "skipped" ]; then - echo "❌ Inference functional tests failed" - failed=true - fi + # NOTE: Inference, serve, and rl checks disabled + # if [ "${{ needs.functional_tests_inference.result }}" != "success" ] && \ + # [ "${{ needs.functional_tests_inference.result }}" != "skipped" ]; then + # echo "❌ Inference functional tests failed" + # failed=true + # fi - if [ "${{ needs.functional_tests_serve.result }}" != "success" ] && \ - [ "${{ needs.functional_tests_serve.result }}" != "skipped" ]; then - echo "❌ RL functional tests failed" - failed=true - fi + # if [ "${{ needs.functional_tests_serve.result }}" != "success" ] && \ + # [ "${{ needs.functional_tests_serve.result }}" != "skipped" ]; then + # echo "❌ Serve functional tests failed" + # failed=true + # fi - if [ "${{ needs.functional_tests_rl.result }}" != "success" ] && \ - [ "${{ needs.functional_tests_rl.result }}" != "skipped" ]; then - echo "❌ RL functional tests failed" - failed=true - fi + # if [ "${{ needs.functional_tests_rl.result }}" != "success" ] && \ 
+ # [ "${{ needs.functional_tests_rl.result }}" != "skipped" ]; then + # echo "❌ RL functional tests failed" + # failed=true + # fi if [ "$failed" = "true" ]; then exit 1 diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index de2fa6018..0b3487220 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -1,4 +1,4 @@ -name: CUDA Tests Validation +name: cuda_tests on: push: @@ -12,6 +12,7 @@ concurrency: jobs: run_tests: + # Package manager and environment settings are read from .github/configs/cuda.yml uses: ./.github/workflows/all_tests_common.yml with: platform: cuda diff --git a/.github/workflows/functional_tests_common.yml b/.github/workflows/functional_tests_common.yml deleted file mode 100644 index 984cda9c6..000000000 --- a/.github/workflows/functional_tests_common.yml +++ /dev/null @@ -1,206 +0,0 @@ -name: Common Functional Tests - -# NOTE: This workflow is a TEMPLATE/REFERENCE for functional test workflows -# Task-specific workflows (functional_tests_train.yml, etc.) 
are preferred for production use -# This template demonstrates the standard pattern for running functional tests with install script - -on: - workflow_call: - inputs: - platform: - required: true - type: string - description: Platform name (e.g., cuda, default) - device: - required: true - type: string - description: Device type (e.g., a100, a800, h100, generic) - task: - required: true - type: string - description: Task name (e.g., train, hetero_train, inference, rl) - model: - required: true - type: string - description: Model name (e.g., aquila, mixtral, deepseek) - case: - required: false - type: string - description: Specific test case (e.g., tp2_pp2, tp4_pp2) - conda_env: - required: false - type: string - description: Conda environment name (optional, skips conda if not provided) - default: "" - conda_path: - required: false - type: string - description: Conda installation path (optional, auto-detects if not provided) - default: "/root/miniconda3" - image: - required: true - type: string - runs_on: - required: true - type: string - container_volumes: - required: true - type: string - container_options: - required: true - type: string - source_artifact: - required: true - type: string - description: Name of the artifact containing source code - -jobs: - functional_test: - defaults: - run: - shell: bash - env: - PROJECT_ROOT: /workspace/FlagScale - runs-on: ${{ fromJson(inputs.runs_on) }} - container: - image: ${{ inputs.image }} - ports: - - 80 - volumes: ${{ fromJson(inputs.container_volumes) }} - options: ${{ inputs.container_options }} - - steps: - - name: Download source code artifact (attempt 1) - uses: actions/download-artifact@v4 - continue-on-error: true - id: download_attempt_1 - with: - name: ${{ inputs.source_artifact }} - path: /tmp - - - name: Download source code artifact (attempt 2) - if: steps.download_attempt_1.outcome == 'failure' - uses: actions/download-artifact@v4 - continue-on-error: true - id: download_attempt_2 - with: - name: ${{ 
inputs.source_artifact }} - path: /tmp - - - name: Download source code artifact (attempt 3) - if: steps.download_attempt_2.outcome == 'failure' - uses: actions/download-artifact@v4 - id: download_attempt_3 - with: - name: ${{ inputs.source_artifact }} - path: /tmp - - - name: Verify artifact download - run: | - if [ "${{ steps.download_attempt_1.outcome }}" == "success" ]; then - echo "✅ Artifact downloaded successfully on attempt 1" - elif [ "${{ steps.download_attempt_2.outcome }}" == "success" ]; then - echo "✅ Artifact downloaded successfully on attempt 2 (retried once)" - elif [ "${{ steps.download_attempt_3.outcome }}" == "success" ]; then - echo "✅ Artifact downloaded successfully on attempt 3 (retried twice)" - else - echo "❌ Error: All 3 download attempts failed" - echo "Artifact name: ${{ inputs.source_artifact }}" - exit 1 - fi - - - name: Extract source code - run: | - mkdir -p $PROJECT_ROOT - tar -xzf /tmp/flagscale-source.tar.gz -C $PROJECT_ROOT - - - name: Set safe directory - run: | - git config --global --add safe.directory $PROJECT_ROOT - - - name: Install dependencies for task - run: | - set -euo pipefail - cd $PROJECT_ROOT - - PLATFORM='${{ inputs.platform }}' - TASK='${{ inputs.task }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' - - echo "Installing dependencies for task: $TASK" - echo "Platform: $PLATFORM" - - # Build install command with optional conda parameters - INSTALL_CMD="./tools/install/install.sh --platform $PLATFORM --task $TASK --retry-count 3" - if [ -n "$CONDA_ENV" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-env $CONDA_ENV" - if [ -n "$CONDA_PATH" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-path $CONDA_PATH" - fi - fi - - # Run install script (handles conda activation internally if needed) - eval $INSTALL_CMD - timeout-minutes: 30 - - - name: Run functional tests - id: functional_test - run: | - set -euo pipefail - cd $PROJECT_ROOT - - PLATFORM='${{ inputs.platform }}' - DEVICE='${{ 
inputs.device }}' - TASK='${{ inputs.task }}' - MODEL='${{ inputs.model }}' - CASE='${{ inputs.case }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' - - echo "Running functional tests for $TASK" - echo "Platform: $PLATFORM" - echo "Device: $DEVICE" - echo "Task: $TASK" - echo "Model: $MODEL" - echo "Case: ${CASE:-all}" - echo "Project root: $PROJECT_ROOT" - - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh - - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." - fi - else - echo "ℹ️ Running tests with system Python" - fi - - # Display Python environment info - echo "Python location: $(which python)" - echo "Python version: $(python --version)" - - # Run functional tests using run_tests.sh - bash "$PROJECT_ROOT/tests/test_utils/runners/run_tests.sh" \ - --platform "$PLATFORM" \ - --device "$DEVICE" \ - --type functional \ - --task "$TASK" \ - --model "$MODEL" \ - --list "$CASE" - exit_code=$? 
- - if [ $exit_code -eq 0 ]; then - echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE" - else - echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)" - fi - - echo "exit_code=$exit_code" >> $GITHUB_OUTPUT - exit $exit_code - timeout-minutes: 15 diff --git a/.github/workflows/functional_tests_hetero_train.yml b/.github/workflows/functional_tests_hetero_train.yml index 45f5fdea1..56dc08918 100644 --- a/.github/workflows/functional_tests_hetero_train.yml +++ b/.github/workflows/functional_tests_hetero_train.yml @@ -27,16 +27,21 @@ on: required: true type: string description: Name of the artifact containing source code - conda_env: + pkg_mgr: required: false type: string - description: Conda environment name (optional, skips conda if not provided) + description: Package manager (pip, uv, conda). Default uv. + default: "uv" + env_name: + required: false + type: string + description: Conda environment name (for conda only) default: "" - conda_path: + env_path: required: false type: string - description: Conda installation path (optional, auto-detects if not provided) - default: "/root/miniconda3" + description: Environment path (venv path for uv, conda installation path for conda) + default: "/opt/venv" jobs: functional_test_hetero_train: @@ -111,20 +116,57 @@ jobs: set -euo pipefail cd $PROJECT_ROOT - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' + + echo "Installing dependencies for heterogeneous training" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" + + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ] && [ -n "$ENV_PATH" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || { 
echo "❌ Conda activation failed"; exit 1; } + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || { echo "❌ UV activation failed"; exit 1; } + fi + ;; + pip) + echo "Using system Python with pip" + ;; + esac + + echo "Python location: $(which python)" + echo "Python version: $(python --version)" + + # Install task source dependencies (pip deps are pre-installed in the env) + echo "Installing task source dependencies..." - # Build install command with optional conda parameters - INSTALL_CMD="./tools/install/install.sh --platform ${{ inputs.platform }} --task hetero_train --retry-count 3" - if [ -n "$CONDA_ENV" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-env $CONDA_ENV" - if [ -n "$CONDA_PATH" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-path $CONDA_PATH" - fi + # Derive install-dir from env_path (e.g., /root/miniconda3 -> /root) + INSTALL_DIR="" + if [ "$PKG_MGR" = "conda" ] && [ -n "$ENV_PATH" ]; then + INSTALL_DIR=$(dirname "$ENV_PATH") fi - # Run install script (handles conda activation internally if needed) - eval $INSTALL_CMD + # Only install Megatron-LM source dep (pip deps are pre-installed in Docker image) + ./tools/install/install.sh \ + --platform ${{ inputs.platform }} \ + --task train \ + --pkg-mgr "$PKG_MGR" \ + ${ENV_NAME:+--env-name "$ENV_NAME"} \ + ${INSTALL_DIR:+--install-dir "$INSTALL_DIR"} \ + --no-system --no-dev --no-base --no-task \ + --src-deps megatron-lm \ + --retry-count 3 timeout-minutes: 30 - name: Run functional tests @@ -138,8 +180,9 @@ jobs: TASK='${{ matrix.test_config.task }}' MODEL='${{ matrix.test_config.model }}' CASE='${{ matrix.test_config.case }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' echo "Running functional tests for heterogeneous training" echo "Platform: $PLATFORM" @@ -147,22 +190,30 @@ jobs: echo "Task: $TASK" echo "Model: $MODEL" 
echo "Case: ${CASE:-all}" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" echo "Project root: $PROJECT_ROOT" - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh - - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." - fi - else - echo "ℹ️ Running tests with system Python" - fi + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || echo "⚠️ Conda activation failed" + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || echo "⚠️ UV activation failed" + fi + ;; + pip) + echo "ℹ️ Running tests with pip/system Python" + ;; + esac # Display Python environment info echo "Python location: $(which python)" diff --git a/.github/workflows/functional_tests_inference.yml b/.github/workflows/functional_tests_inference.yml index 6b56027c4..7319735ec 100644 --- a/.github/workflows/functional_tests_inference.yml +++ b/.github/workflows/functional_tests_inference.yml @@ -30,16 +30,21 @@ on: required: true type: string description: Name of the artifact containing source code - conda_env: + pkg_mgr: required: false type: string - description: Conda environment name (optional, skips conda if not provided) + description: Package manager (pip, uv, conda). Default uv. 
+ default: "uv" + env_name: + required: false + type: string + description: Conda environment name (for conda only) default: "" - conda_path: + env_path: required: false type: string - description: Conda installation path (optional, auto-detects if not provided) - default: "/root/miniconda3" + description: Environment path (venv path for uv, conda installation path for conda) + default: "/opt/venv" jobs: functional_test_inference: @@ -109,81 +114,47 @@ jobs: run: | git config --global --add safe.directory $PROJECT_ROOT - - name: Check sccache installation and get path - id: get_sccache_path - timeout-minutes: 15 - run: | - set -euo pipefail - cd $PROJECT_ROOT - - if ! command -v sccache &> /dev/null; then - echo "❌ sccache not found in PATH, running auto-installer..." - source ./tools/install/utils/install-sccache.sh - fi - - SCCACHE_PATH=$(command -v sccache) - echo "sccache_path=${SCCACHE_PATH}" >> $GITHUB_OUTPUT - - echo "Found sccache at: ${SCCACHE_PATH}" - - - name: Install dependencies for inference - env: - SCCACHE_CPP2: 1 - CUDA_CCACHE: 1 - SCCACHE_VERBOSE: 0 - SCCACHE_CACHE_SIZE: 20G - SCCACHE_DIR: /github/home/.cache/sccache - CMAKE_C_COMPILER_LAUNCHER: ${{ steps.get_sccache_path.outputs.sccache_path }} - CMAKE_CXX_COMPILER_LAUNCHER: ${{ steps.get_sccache_path.outputs.sccache_path }} - CMAKE_CUDA_COMPILER_LAUNCHER: ${{ steps.get_sccache_path.outputs.sccache_path }} - run: | - set -euo pipefail - cd $PROJECT_ROOT - - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' - - # Build install command with optional conda parameters - INSTALL_CMD="./tools/install/install.sh --platform ${{ inputs.platform }} --task inference --retry-count 3" - if [ -n "$CONDA_ENV" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-env $CONDA_ENV" - if [ -n "$CONDA_PATH" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-path $CONDA_PATH" - fi - fi - - # Run install script (handles conda activation internally if needed) - eval $INSTALL_CMD - timeout-minutes: 90 - - - 
name: Set up flash-attn for inference(robobrain2) - if: ${{ matrix.test_config.model == 'robobrain2' }} - timeout-minutes: 15 + - name: Setup environment for inference run: | set -euo pipefail cd $PROJECT_ROOT - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' - - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' + + echo "Setting up environment for inference" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" + + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ] && [ -n "$ENV_PATH" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || { echo "❌ Conda activation failed"; exit 1; } + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || { echo "❌ UV activation failed"; exit 1; } + fi + ;; + pip) + echo "Using system Python with pip" + ;; + esac - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." 
- fi - else - echo "ℹ️ Running tests with system Python" - fi + echo "Python location: $(which python)" + echo "Python version: $(python --version)" - pip uninstall -y flash_attn - pip install flash-attn==2.6.3 --no-build-isolation - python -c "import flash_attn; import flash_attn.layers; print('FlashAttention loaded successfully, version:', flash_attn.__version__)" + # For inference task: all dependencies are pre-installed in the env + # No additional installation needed + echo "✅ Environment ready for inference tests" + timeout-minutes: 5 - name: Run functional tests id: functional_test @@ -196,8 +167,9 @@ jobs: TASK='${{ matrix.test_config.task }}' MODEL='${{ matrix.test_config.model }}' CASE='${{ matrix.test_config.case }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' echo "Running functional tests for inference" echo "Platform: $PLATFORM" @@ -205,22 +177,30 @@ jobs: echo "Task: $TASK" echo "Model: $MODEL" echo "Case: ${CASE:-all}" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" echo "Project root: $PROJECT_ROOT" - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh - - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." 
- fi - else - echo "ℹ️ Running tests with system Python" - fi + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || echo "⚠️ Conda activation failed" + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || echo "⚠️ UV activation failed" + fi + ;; + pip) + echo "ℹ️ Running tests with pip/system Python" + ;; + esac # Display Python environment info echo "Python location: $(which python)" diff --git a/.github/workflows/functional_tests_rl.yml b/.github/workflows/functional_tests_rl.yml index 427c483ba..f66fd1f8e 100644 --- a/.github/workflows/functional_tests_rl.yml +++ b/.github/workflows/functional_tests_rl.yml @@ -30,16 +30,21 @@ on: required: true type: string description: Name of the artifact containing source code - conda_env: + pkg_mgr: required: false type: string - description: Conda environment name (optional, skips conda if not provided) + description: Package manager (pip, uv, conda). Default uv. 
+ default: "uv" + env_name: + required: false + type: string + description: Conda environment name (for conda only) default: "" - conda_path: + env_path: required: false type: string - description: Conda installation path (optional, auto-detects if not provided) - default: "/root/miniconda3" + description: Environment path (venv path for uv, conda installation path for conda) + default: "/opt/venv" jobs: functional_test_rl: @@ -109,26 +114,47 @@ jobs: run: | git config --global --add safe.directory $PROJECT_ROOT - - name: Install dependencies for rl + - name: Setup environment for rl run: | set -euo pipefail cd $PROJECT_ROOT - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' + + echo "Setting up environment for RL" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" + + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ] && [ -n "$ENV_PATH" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || { echo "❌ Conda activation failed"; exit 1; } + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || { echo "❌ UV activation failed"; exit 1; } + fi + ;; + pip) + echo "Using system Python with pip" + ;; + esac - # Build install command with optional conda parameters - INSTALL_CMD="./tools/install/install.sh --platform ${{ inputs.platform }} --task rl --retry-count 3" - if [ -n "$CONDA_ENV" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-env $CONDA_ENV" - if [ -n "$CONDA_PATH" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-path $CONDA_PATH" - fi - fi + echo "Python location: $(which python)" + echo "Python version: $(python --version)" - # Run install script (handles conda activation internally if needed) - eval $INSTALL_CMD 
- timeout-minutes: 30 + # For RL task: all dependencies are pre-installed in the env + # No additional installation needed + echo "✅ Environment ready for RL tests" + timeout-minutes: 5 - name: Run functional tests id: functional_test @@ -141,8 +167,9 @@ jobs: TASK='${{ matrix.test_config.task }}' MODEL='${{ matrix.test_config.model }}' CASE='${{ matrix.test_config.case }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' echo "Running functional tests for rl" echo "Platform: $PLATFORM" @@ -150,22 +177,30 @@ jobs: echo "Task: $TASK" echo "Model: $MODEL" echo "Case: ${CASE:-all}" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" echo "Project root: $PROJECT_ROOT" - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh - - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." 
- fi - else - echo "ℹ️ Running tests with system Python" - fi + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || echo "⚠️ Conda activation failed" + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || echo "⚠️ UV activation failed" + fi + ;; + pip) + echo "ℹ️ Running tests with pip/system Python" + ;; + esac # Display Python environment info echo "Python location: $(which python)" diff --git a/.github/workflows/functional_tests_serve.yml b/.github/workflows/functional_tests_serve.yml index 2d49a96e2..5c7adf2c7 100644 --- a/.github/workflows/functional_tests_serve.yml +++ b/.github/workflows/functional_tests_serve.yml @@ -30,16 +30,21 @@ on: required: true type: string description: Name of the artifact containing source code - conda_env: + pkg_mgr: required: false type: string - description: Conda environment name (optional, skips conda if not provided) + description: Package manager (pip, uv, conda). Default uv. 
+ default: "uv" + env_name: + required: false + type: string + description: Conda environment name (for conda only) default: "" - conda_path: + env_path: required: false type: string - description: Conda installation path (optional, auto-detects if not provided) - default: "/root/miniconda3" + description: Environment path (venv path for uv, conda installation path for conda) + default: "/opt/venv" jobs: functional_test_serve: @@ -89,13 +94,13 @@ jobs: - name: Verify artifact download run: | if [ "${{ steps.download_attempt_1.outcome }}" == "success" ]; then - echo "✅ Artifact downloaded successfully on attempt 1" + echo "Artifact downloaded successfully on attempt 1" elif [ "${{ steps.download_attempt_2.outcome }}" == "success" ]; then - echo "✅ Artifact downloaded successfully on attempt 2 (retried once)" + echo "Artifact downloaded successfully on attempt 2 (retried once)" elif [ "${{ steps.download_attempt_3.outcome }}" == "success" ]; then - echo "✅ Artifact downloaded successfully on attempt 3 (retried twice)" + echo "Artifact downloaded successfully on attempt 3 (retried twice)" else - echo "❌ Error: All 3 download attempts failed" + echo "Error: All 3 download attempts failed" echo "Artifact name: ${{ inputs.source_artifact }}" exit 1 fi @@ -109,52 +114,47 @@ jobs: run: | git config --global --add safe.directory $PROJECT_ROOT - - name: Check sccache installation and get path - id: get_sccache_path - timeout-minutes: 15 - run: | - set -euo pipefail - cd $PROJECT_ROOT - - if ! command -v sccache &> /dev/null; then - echo "❌ sccache not found in PATH, running auto-installer..." 
- source ./tools/install/utils/install-sccache.sh - fi - - SCCACHE_PATH=$(command -v sccache) - echo "sccache_path=${SCCACHE_PATH}" >> $GITHUB_OUTPUT - - echo "Found sccache at: ${SCCACHE_PATH}" - - name: Install dependencies for serve - env: - SCCACHE_CPP2: 1 - CUDA_CCACHE: 1 - SCCACHE_VERBOSE: 0 - SCCACHE_CACHE_SIZE: 20G - SCCACHE_DIR: /github/home/.cache/sccache - CMAKE_C_COMPILER_LAUNCHER: ${{ steps.get_sccache_path.outputs.sccache_path }} - CMAKE_CXX_COMPILER_LAUNCHER: ${{ steps.get_sccache_path.outputs.sccache_path }} - CMAKE_CUDA_COMPILER_LAUNCHER: ${{ steps.get_sccache_path.outputs.sccache_path }} run: | set -euo pipefail cd $PROJECT_ROOT - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' + + echo "Installing dependencies for serve" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" + + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ] && [ -n "$ENV_PATH" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || { echo "Conda activation failed"; exit 1; } + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || { echo "UV activation failed"; exit 1; } + fi + ;; + pip) + echo "Using system Python with pip" + ;; + esac - # Build install command with optional conda parameters - INSTALL_CMD="./tools/install/install.sh --platform ${{ inputs.platform }} --task serve --retry-count 3" - if [ -n "$CONDA_ENV" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-env $CONDA_ENV" - if [ -n "$CONDA_PATH" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-path $CONDA_PATH" - fi - fi + echo "Python location: $(which python)" + echo "Python version: $(python --version)" - # Run install script (handles conda activation internally if 
needed) - eval $INSTALL_CMD - timeout-minutes: 90 + # For serve task: all dependencies are pre-installed in the env + # No additional installation needed + echo "Environment ready for serve tests" + timeout-minutes: 5 - name: Run functional tests id: functional_test @@ -167,8 +167,9 @@ jobs: TASK='${{ matrix.test_config.task }}' MODEL='${{ matrix.test_config.model }}' CASE='${{ matrix.test_config.case }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' echo "Running functional tests for serve" echo "Platform: $PLATFORM" @@ -176,22 +177,30 @@ jobs: echo "Task: $TASK" echo "Model: $MODEL" echo "Case: ${CASE:-all}" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" echo "Project root: $PROJECT_ROOT" - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh - - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." - fi - else - echo "ℹ️ Running tests with system Python" - fi + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || echo "Conda activation failed" + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || echo "UV activation failed" + fi + ;; + pip) + echo "Running tests with pip/system Python" + ;; + esac # Display Python environment info echo "Python location: $(which python)" @@ -208,9 +217,9 @@ jobs: exit_code=$? 
if [ $exit_code -eq 0 ]; then - echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE" + echo "Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE" else - echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)" + echo "Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)" fi echo "exit_code=$exit_code" >> $GITHUB_OUTPUT diff --git a/.github/workflows/functional_tests_train.yml b/.github/workflows/functional_tests_train.yml index f994bc589..2144393b6 100644 --- a/.github/workflows/functional_tests_train.yml +++ b/.github/workflows/functional_tests_train.yml @@ -27,16 +27,21 @@ on: required: true type: string description: Name of the artifact containing source code - conda_env: + pkg_mgr: required: false type: string - description: Conda environment name (optional, skips conda if not provided) + description: Package manager (pip, uv, conda). Default uv. + default: "uv" + env_name: + required: false + type: string + description: Conda environment name (for conda only) default: "" - conda_path: + env_path: required: false type: string - description: Conda installation path (optional, auto-detects if not provided) - default: "/root/miniconda3" + description: Environment path (venv path for uv, conda installation path for conda) + default: "/opt/venv" jobs: functional_test_train: @@ -111,20 +116,57 @@ jobs: set -euo pipefail cd $PROJECT_ROOT - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' + + echo "Installing dependencies for training" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" + + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ] && 
[ -n "$ENV_PATH" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || { echo "❌ Conda activation failed"; exit 1; } + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || { echo "❌ UV activation failed"; exit 1; } + fi + ;; + pip) + echo "Using system Python with pip" + ;; + esac + + echo "Python location: $(which python)" + echo "Python version: $(python --version)" + + # Install task source dependencies (pip deps are pre-installed in the env) + echo "Installing task source dependencies..." - # Build install command with optional conda parameters - INSTALL_CMD="./tools/install/install.sh --platform ${{ inputs.platform }} --task train --retry-count 3" - if [ -n "$CONDA_ENV" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-env $CONDA_ENV" - if [ -n "$CONDA_PATH" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-path $CONDA_PATH" - fi + # Derive install-dir from env_path (e.g., /root/miniconda3 -> /root) + INSTALL_DIR="" + if [ "$PKG_MGR" = "conda" ] && [ -n "$ENV_PATH" ]; then + INSTALL_DIR=$(dirname "$ENV_PATH") fi - # Run install script (handles conda activation internally if needed) - eval $INSTALL_CMD + # Only install Megatron-LM source dep (pip deps are pre-installed in Docker image) + ./tools/install/install.sh \ + --platform ${{ inputs.platform }} \ + --task train \ + --pkg-mgr "$PKG_MGR" \ + ${ENV_NAME:+--env-name "$ENV_NAME"} \ + ${INSTALL_DIR:+--install-dir "$INSTALL_DIR"} \ + --no-system --no-dev --no-base --no-task \ + --src-deps megatron-lm \ + --retry-count 3 timeout-minutes: 30 - name: Run functional tests @@ -138,8 +180,9 @@ jobs: TASK='${{ matrix.test_config.task }}' MODEL='${{ matrix.test_config.model }}' CASE='${{ matrix.test_config.case }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' echo "Running functional tests for training" echo "Platform: $PLATFORM" @@ -147,22 +190,30 @@ 
jobs: echo "Task: $TASK" echo "Model: $MODEL" echo "Case: ${CASE:-all}" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" echo "Project root: $PROJECT_ROOT" - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh - - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." - fi - else - echo "ℹ️ Running tests with system Python" - fi + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || echo "⚠️ Conda activation failed" + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || echo "⚠️ UV activation failed" + fi + ;; + pip) + echo "ℹ️ Running tests with pip/system Python" + ;; + esac # Display Python environment info echo "Python location: $(which python)" diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml index f1e26d46d..9a7de91fb 100644 --- a/.github/workflows/unit_tests_common.yml +++ b/.github/workflows/unit_tests_common.yml @@ -27,16 +27,21 @@ on: required: true type: string description: Name of the artifact containing source code - conda_env: + pkg_mgr: required: false type: string - description: Conda environment name (optional, skips conda if not provided) + description: Package manager (pip, uv, conda). Default uv. 
+ default: "uv" + env_name: + required: false + type: string + description: Conda environment name (for conda only) default: "" - conda_path: + env_path: required: false type: string - description: Conda installation path (optional, auto-detects if not provided) - default: "/root/miniconda3" + description: Environment path (venv path for uv, conda installation path for conda) + default: "/opt/venv" jobs: unit_test: @@ -107,20 +112,57 @@ jobs: set -euo pipefail cd $PROJECT_ROOT - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' + + echo "Installing dependencies for unit tests" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" + + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ] && [ -n "$ENV_PATH" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || { echo "❌ Conda activation failed"; exit 1; } + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || { echo "❌ UV activation failed"; exit 1; } + fi + ;; + pip) + echo "Using system Python with pip" + ;; + esac + + echo "Python location: $(which python)" + echo "Python version: $(python --version)" + + # Install task source dependencies (pip deps are pre-installed in the env) + echo "Installing task source dependencies..." 
- # Build install command with optional conda parameters - INSTALL_CMD="./tools/install/install.sh --platform ${{ inputs.platform }} --task train --retry-count 3" - if [ -n "$CONDA_ENV" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-env $CONDA_ENV" - if [ -n "$CONDA_PATH" ]; then - INSTALL_CMD="$INSTALL_CMD --conda-path $CONDA_PATH" - fi + # Derive install-dir from env_path (e.g., /root/miniconda3 -> /root) + INSTALL_DIR="" + if [ "$PKG_MGR" = "conda" ] && [ -n "$ENV_PATH" ]; then + INSTALL_DIR=$(dirname "$ENV_PATH") fi - # Run install script (handles conda activation internally if needed) - eval $INSTALL_CMD + # Only install Megatron-LM source dep (pip deps are pre-installed in Docker image) + ./tools/install/install.sh \ + --platform ${{ inputs.platform }} \ + --task train \ + --pkg-mgr "$PKG_MGR" \ + ${ENV_NAME:+--env-name "$ENV_NAME"} \ + ${INSTALL_DIR:+--install-dir "$INSTALL_DIR"} \ + --no-system --no-dev --no-base --no-task \ + --src-deps megatron-lm \ + --retry-count 3 # Copy test data (keep existing logic) mkdir -p /opt/data @@ -136,28 +178,37 @@ jobs: PLATFORM='${{ inputs.platform }}' DEVICE='${{ inputs.device }}' - CONDA_ENV='${{ inputs.conda_env }}' - CONDA_PATH='${{ inputs.conda_path }}' + PKG_MGR='${{ inputs.pkg_mgr }}' + ENV_NAME='${{ inputs.env_name }}' + ENV_PATH='${{ inputs.env_path }}' echo "Running unit tests" echo "Platform: $PLATFORM" echo "Device: $DEVICE" + echo "Package Manager: $PKG_MGR" + echo "Environment Name: $ENV_NAME" + echo "Environment Path: $ENV_PATH" echo "Project root: $PROJECT_ROOT" - # Source conda utilities for reusable functions - source ./tools/install/utils/conda_utils.sh - - # Conditionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - echo "⚠️ Conda environment requested but conda not found" - echo "Continuing without conda activation..." 
- fi - else - echo "ℹ️ Running tests with system Python" - fi + # Source environment utilities + source ./tools/install/utils/pyenv_utils.sh + + # Activate environment based on package manager + case "$PKG_MGR" in + conda) + if [ -n "$ENV_NAME" ]; then + activate_conda "$ENV_NAME" "$ENV_PATH" || echo "⚠️ Conda activation failed" + fi + ;; + uv) + if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then + activate_uv_env "$ENV_PATH" || echo "⚠️ UV activation failed" + fi + ;; + pip) + echo "ℹ️ Running tests with pip/system Python" + ;; + esac # Display Python environment info echo "Python location: $(which python)" diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 000000000..d0c13ff38 --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,257 @@ +#!/bin/bash +# FlagScale Docker Build Script +# +# NOTE: This script is experimental and requires further testing. +# Please report issues at https://github.com/FlagOpen/FlagScale/issues +# +# Usage: ./docker/build.sh [OPTIONS] +# +# Examples: +# ./docker/build.sh --platform cuda +# ./docker/build.sh --platform cuda --task train +# ./docker/build.sh --platform cuda --task train --target dev + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +# ============================================================================= +# Logging functions +# ============================================================================= +log_info() { + echo "[INFO] $*" +} + +log_error() { + echo "[ERROR] $*" >&2 +} + +# ============================================================================= +# Default versions (same as tools/install, override via environment variables) +# ============================================================================= +PYTHON_VERSION="${PYTHON_VERSION:-3.12}" +UV_VERSION="${UV_VERSION:-0.7.2}" +OPENMPI_VERSION="${OPENMPI_VERSION:-4.1.6}" +CUDA_VERSION="${CUDA_VERSION:-12.8.1}" +UBUNTU_VERSION="${UBUNTU_VERSION:-22.04}" + +# ============================================================================= +# Default values +# ============================================================================= +PLATFORM="cuda" +TASK="" +TARGET="dev" +TAG_PREFIX="flagscale" +NO_CACHE=false + +# PyPI index URLs (optional, for custom mirrors) +PIP_INDEX_URL="${PIP_INDEX_URL:-}" +PIP_EXTRA_INDEX_URL="${PIP_EXTRA_INDEX_URL:-}" + +# ============================================================================= +# Platform and task discovery +# ============================================================================= + +# Get available tasks by scanning Dockerfile.* files +get_platform_tasks() { + local platform=$1 + local platform_dir="$SCRIPT_DIR/$platform" + if [ -d "$platform_dir" ]; then + ls "$platform_dir"/Dockerfile.* 2>/dev/null | xargs -n1 basename | sed 's/Dockerfile\.//' || true + fi +} + +# Get first task as default +get_default_task() { + local platform=$1 + get_platform_tasks "$platform" | head -1 +} + +# Validate platform exists +validate_platform() { + local platform=$1 + if [ ! 
-d "$SCRIPT_DIR/$platform" ]; then + log_error "Platform directory not found: $SCRIPT_DIR/$platform" + exit 1 + fi +} + +# Validate task exists for platform +validate_task() { + local platform=$1 + local task=$2 + local dockerfile="$SCRIPT_DIR/$platform/Dockerfile.${task}" + if [ ! -f "$dockerfile" ]; then + log_error "Task '$task' not found for platform '$platform'" + log_error "Available tasks: $(get_platform_tasks "$platform" | tr '\n' ' ')" + exit 1 + fi +} + +# ============================================================================= +# Usage +# ============================================================================= +usage() { + cat << EOF +Usage: $0 [OPTIONS] + +Build FlagScale Docker images. + +OPTIONS: + --platform PLATFORM Platform to build (default: cuda) + --task TASK Task to build (default: first task in platform) + --target TARGET Build target: dev, release (default: dev) + --tag-prefix PREFIX Image tag prefix (default: flagscale) + --index-url URL PyPI index URL (for custom mirrors) + --extra-index-url URL Extra PyPI index URL + --no-cache Build without cache + --help Show this help message + +VERSIONS (override via environment variables): + PYTHON_VERSION Python version (default: ${PYTHON_VERSION}) + UV_VERSION uv version (default: ${UV_VERSION}) + OPENMPI_VERSION OpenMPI version (default: ${OPENMPI_VERSION}) + CUDA_VERSION CUDA version (default: ${CUDA_VERSION}) + UBUNTU_VERSION Ubuntu version (default: ${UBUNTU_VERSION}) + +EXAMPLES: + $0 --platform cuda + $0 --platform cuda --task train + $0 --platform cuda --task train --target dev + CUDA_VERSION=12.4.0 $0 --platform cuda --task train + +EOF +} + +# ============================================================================= +# Argument parsing +# ============================================================================= +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --platform) PLATFORM="$2"; shift 2 ;; + --task) TASK="$2"; shift 2 ;; + --target) TARGET="$2"; shift 2 ;; 
+ --tag-prefix) TAG_PREFIX="$2"; shift 2 ;; + --index-url) PIP_INDEX_URL="$2"; shift 2 ;; + --extra-index-url) PIP_EXTRA_INDEX_URL="$2"; shift 2 ;; + --no-cache) NO_CACHE=true; shift ;; + --help|-h) usage; exit 0 ;; + *) + log_error "Unknown option: $1" + usage + exit 1 + ;; + esac + done +} + +# ============================================================================= +# Image tag generation +# ============================================================================= +get_image_tag() { + local platform=$1 + local task=$2 + local tag="${TAG_PREFIX}-${task}:${TARGET}" + + # Add CUDA version suffix for cuda platform + if [ "$platform" = "cuda" ]; then + local cuda_major=$(echo "$CUDA_VERSION" | cut -d. -f1) + local cuda_minor=$(echo "$CUDA_VERSION" | cut -d. -f2) + tag="${tag}-cu${cuda_major}${cuda_minor}" + fi + + # Add python version + tag="${tag}-py${PYTHON_VERSION}" + + echo "$tag" +} + +# ============================================================================= +# Build image +# ============================================================================= +build_image() { + local platform=$PLATFORM + local task=$TASK + local dockerfile="$SCRIPT_DIR/$platform/Dockerfile.${task}" + + local image_tag=$(get_image_tag "$platform" "$task") + + log_info "Building image: $image_tag" + log_info "Dockerfile: $dockerfile" + log_info "Platform: $platform" + log_info "Task: $task" + log_info "Target: $TARGET" + + # Build command + local build_cmd="docker build -f $dockerfile --target $TARGET -t $image_tag" + + # Add version build args + log_info "PYTHON_VERSION: $PYTHON_VERSION" + log_info "UV_VERSION: $UV_VERSION" + log_info "OPENMPI_VERSION: $OPENMPI_VERSION" + build_cmd="$build_cmd --build-arg PYTHON_VERSION=$PYTHON_VERSION" + build_cmd="$build_cmd --build-arg UV_VERSION=$UV_VERSION" + build_cmd="$build_cmd --build-arg OPENMPI_VERSION=$OPENMPI_VERSION" + + # Compute and add derived values for CUDA platform + if [ "$platform" = "cuda" ]; then + local 
cuda_major=$(echo "$CUDA_VERSION" | cut -d. -f1) + local cuda_minor=$(echo "$CUDA_VERSION" | cut -d. -f2) + + local base_image="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" + local pytorch_index="https://download.pytorch.org/whl/cu${cuda_major}${cuda_minor}" + + log_info "CUDA_VERSION: $CUDA_VERSION" + log_info "UBUNTU_VERSION: $UBUNTU_VERSION" + log_info "BASE_IMAGE: $base_image" + log_info "PYTORCH_INDEX: $pytorch_index" + build_cmd="$build_cmd --build-arg BASE_IMAGE=$base_image" + build_cmd="$build_cmd --build-arg PYTORCH_INDEX=$pytorch_index" + fi + + # Add PyPI index URLs if specified + if [ -n "$PIP_INDEX_URL" ]; then + log_info "PIP_INDEX_URL: $PIP_INDEX_URL" + build_cmd="$build_cmd --build-arg PIP_INDEX_URL=$PIP_INDEX_URL" + fi + if [ -n "$PIP_EXTRA_INDEX_URL" ]; then + log_info "PIP_EXTRA_INDEX_URL: $PIP_EXTRA_INDEX_URL" + build_cmd="$build_cmd --build-arg PIP_EXTRA_INDEX_URL=$PIP_EXTRA_INDEX_URL" + fi + + [ "$NO_CACHE" = true ] && build_cmd="$build_cmd --no-cache" + build_cmd="$build_cmd $PROJECT_ROOT" + + log_info "Running: $build_cmd" + eval "$build_cmd" + + log_info "Successfully built: $image_tag" +} + +# ============================================================================= +# Main +# ============================================================================= +main() { + parse_args "$@" + + # Validate platform + validate_platform "$PLATFORM" + + # Set default task if not specified + if [ -z "$TASK" ]; then + TASK=$(get_default_task "$PLATFORM") + log_info "No task specified, using default: $TASK" + fi + + # Validate task + validate_task "$PLATFORM" "$TASK" + + log_info "FlagScale Docker Build" + log_info "======================" + + build_image +} + +main "$@" diff --git a/docker/cuda/Dockerfile.all b/docker/cuda/Dockerfile.all new file mode 100644 index 000000000..543a3f5b0 --- /dev/null +++ b/docker/cuda/Dockerfile.all @@ -0,0 +1,204 @@ +# FlagScale All-in-One Dockerfile for CUDA Platform +# +# NOTE: This Dockerfile is 
experimental and requires further testing. +# Please report issues at https://github.com/FlagOpen/FlagScale/issues +# +# Comprehensive image with all task dependencies (train, inference, rl, hetero_train) +# +# Multi-stage build producing: +# - dev: Development image with all tools and dependencies +# - release: Production image with all dependencies +# +# Build examples: +# ./docker/build.sh --platform cuda --task all --target dev +# ./docker/build.sh --platform cuda --task all --target release +# ./docker/build.sh --platform cuda --task all --target dev --build-arg PKG_MGR=conda + +# ============================================================================= +# BUILD ARGUMENTS +# ============================================================================= +# Base image versions +ARG CUDA_VERSION=12.8.1 +ARG UBUNTU_VERSION=22.04 +ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +# Tool versions +ARG PYTHON_VERSION=3.12 +ARG UV_VERSION=0.7.2 + +# Package manager: pip, uv, or conda (default: uv) +ARG PKG_MGR=uv + +# PyPI index URLs (for custom mirrors) +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL=${PIP_INDEX_URL} +ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} + +# PyTorch wheel index (derived from CUDA version) +ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 + +# ============================================================================= +# BASE STAGE - System dependencies +# ============================================================================= +FROM ${BASE_IMAGE} AS base + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG UV_VERSION +ARG PKG_MGR + +# Root installation directory (single source of truth) +ARG FLAGSCALE_HOME=/opt/flagscale + +# Build-time only environment variables +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai + +# Copy install scripts for system dependencies +COPY tools/install /tmp/tools/install + +# Install system dependencies (common for all tasks) +# Uses: install.sh --no-dev 
--no-base --no-task to only run system phase +RUN chmod +x /tmp/tools/install/*.sh && \ + chmod +x /tmp/tools/install/utils/*.sh && \ + chmod +x /tmp/tools/install/cuda/*.sh && \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + /tmp/tools/install/install.sh \ + --platform cuda \ + --task all \ + --pkg-mgr ${PKG_MGR} \ + --no-dev --no-base --no-task && \ + rm -rf /tmp/tools + +# Runtime environment variables (mirrors env.sh for non-interactive shell compatibility) +ENV FLAGSCALE_HOME=${FLAGSCALE_HOME} +ENV UV_PROJECT_ENVIRONMENT=${FLAGSCALE_HOME}/venv +ENV VIRTUAL_ENV=${FLAGSCALE_HOME}/venv +ENV FLAGSCALE_CONDA=${FLAGSCALE_HOME}/miniconda3 +ENV FLAGSCALE_DEPS=${FLAGSCALE_HOME}/deps +# UV configuration +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE=copy +# System paths +ENV MPI_HOME=/usr/local/mpi +ENV CUDA_HOME=/usr/local/cuda +# Combined PATH (includes both uv venv and conda paths for flexibility) +ENV PATH="${FLAGSCALE_HOME}/venv/bin:${FLAGSCALE_HOME}/miniconda3/bin:/root/.local/bin:/usr/local/mpi/bin:${CUDA_HOME}/bin:$PATH" +ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH" + +WORKDIR /workspace + +# ============================================================================= +# DEPS STAGE - Install all dependencies using install folder +# ============================================================================= +FROM base AS deps + +ARG PYTORCH_INDEX +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# PyPI index URLs (re-declare to use in this stage) +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL +ARG UV_EXTRA_INDEX_URL +ENV PIP_INDEX_URL=${PIP_INDEX_URL} +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_URL=${UV_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL} + +# Copy install scripts and requirements +COPY tools/install /workspace/tools/install +COPY requirements /workspace/requirements + +# Install all task dependencies based 
on package manager +# Note: Source env.sh to ensure environment is available in non-interactive RUN +# Cache mounts: uv cache, pip cache, conda pkgs cache +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=${FLAGSCALE_HOME}/miniconda3/pkgs \ + . /etc/profile.d/flagscale-env.sh && \ + chmod +x /workspace/tools/install/*.sh && \ + chmod +x /workspace/tools/install/utils/*.sh && \ + chmod +x /workspace/tools/install/cuda/*.sh && \ + cd /workspace && \ + if [ "$PKG_MGR" = "uv" ]; then \ + UV_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task all --pkg-mgr uv --no-system; \ + elif [ "$PKG_MGR" = "conda" ]; then \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task all --pkg-mgr conda --env-name flagscale-all --no-system; \ + else \ + PIP_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task all --pkg-mgr pip --no-system; \ + fi + +# ============================================================================= +# DEV STAGE - Full development image +# ============================================================================= +FROM deps AS dev + +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# Dev dependencies are already installed by default (use --no-dev to skip) +# Copy FlagScale source +COPY . 
/workspace/FlagScale +WORKDIR /workspace/FlagScale + +CMD ["/bin/bash"] + +# ============================================================================= +# RELEASE STAGE - Production image (uses same base for consistency) +# ============================================================================= +FROM base AS release + +ARG PYTORCH_INDEX +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# PyPI index URLs +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL +ARG UV_EXTRA_INDEX_URL +ENV PIP_INDEX_URL=${PIP_INDEX_URL} +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_URL=${UV_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL} + +# Copy install scripts and requirements +COPY tools/install /workspace/tools/install +COPY requirements /workspace/requirements + +# Install all task dependencies without dev tools +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=${FLAGSCALE_HOME}/miniconda3/pkgs \ + . /etc/profile.d/flagscale-env.sh && \ + chmod +x /workspace/tools/install/*.sh && \ + chmod +x /workspace/tools/install/utils/*.sh && \ + chmod +x /workspace/tools/install/cuda/*.sh && \ + cd /workspace && \ + if [ "$PKG_MGR" = "uv" ]; then \ + UV_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task all --pkg-mgr uv --no-system --no-dev; \ + elif [ "$PKG_MGR" = "conda" ]; then \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task all --pkg-mgr conda --env-name flagscale-all --no-system --no-dev; \ + else \ + PIP_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task all --pkg-mgr pip --no-system --no-dev; \ + fi + +# Copy FlagScale source +COPY . 
/workspace/FlagScale +WORKDIR /workspace/FlagScale + +CMD ["/bin/bash"] diff --git a/docker/cuda/Dockerfile.inference b/docker/cuda/Dockerfile.inference new file mode 100644 index 000000000..def736acd --- /dev/null +++ b/docker/cuda/Dockerfile.inference @@ -0,0 +1,202 @@ +# FlagScale Inference Dockerfile for CUDA Platform +# +# NOTE: This Dockerfile is experimental and requires further testing. +# Please report issues at https://github.com/FlagOpen/FlagScale/issues +# +# Multi-stage build producing: +# - dev: Development image with all tools +# - release: Production image for serving +# +# Build examples: +# ./docker/build.sh --platform cuda --task inference --target dev +# ./docker/build.sh --platform cuda --task inference --target release +# ./docker/build.sh --platform cuda --task inference --target dev --build-arg PKG_MGR=conda + +# ============================================================================= +# BUILD ARGUMENTS +# ============================================================================= +# Base image versions +ARG CUDA_VERSION=12.8.1 +ARG UBUNTU_VERSION=22.04 +ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +# Tool versions +ARG PYTHON_VERSION=3.12 +ARG UV_VERSION=0.7.2 + +# Package manager: pip, uv, or conda (default: uv) +ARG PKG_MGR=uv + +# PyPI index URLs (for custom mirrors) +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL=${PIP_INDEX_URL} +ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} + +# PyTorch wheel index (derived from CUDA version) +ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 + +# ============================================================================= +# BASE STAGE - System dependencies +# ============================================================================= +FROM ${BASE_IMAGE} AS base + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG UV_VERSION +ARG PKG_MGR + +# Root installation directory (single source of truth) +ARG FLAGSCALE_HOME=/opt/flagscale + +# Build-time 
only environment variables +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai + +# Copy install scripts for system dependencies +COPY tools/install /tmp/tools/install + +# Install system dependencies (common for all tasks) +# Uses: install.sh --no-dev --no-base --no-task to only run system phase +RUN chmod +x /tmp/tools/install/*.sh && \ + chmod +x /tmp/tools/install/utils/*.sh && \ + chmod +x /tmp/tools/install/cuda/*.sh && \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + /tmp/tools/install/install.sh \ + --platform cuda \ + --task inference \ + --pkg-mgr ${PKG_MGR} \ + --no-dev --no-base --no-task && \ + rm -rf /tmp/tools + +# Runtime environment variables (mirrors env.sh for non-interactive shell compatibility) +ENV FLAGSCALE_HOME=${FLAGSCALE_HOME} +ENV UV_PROJECT_ENVIRONMENT=${FLAGSCALE_HOME}/venv +ENV VIRTUAL_ENV=${FLAGSCALE_HOME}/venv +ENV FLAGSCALE_CONDA=${FLAGSCALE_HOME}/miniconda3 +ENV FLAGSCALE_DEPS=${FLAGSCALE_HOME}/deps +# UV configuration +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE=copy +# System paths +ENV MPI_HOME=/usr/local/mpi +ENV CUDA_HOME=/usr/local/cuda +# Combined PATH (includes both uv venv and conda paths for flexibility) +ENV PATH="${FLAGSCALE_HOME}/venv/bin:${FLAGSCALE_HOME}/miniconda3/bin:/root/.local/bin:/usr/local/mpi/bin:${CUDA_HOME}/bin:$PATH" +ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH" + +WORKDIR /workspace + +# ============================================================================= +# DEPS STAGE - Install dependencies using install folder +# ============================================================================= +FROM base AS deps + +ARG PYTORCH_INDEX +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# PyPI index URLs (re-declare to use in this stage) +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL +ARG UV_EXTRA_INDEX_URL +ENV PIP_INDEX_URL=${PIP_INDEX_URL} +ENV 
PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_URL=${UV_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL} + +# Copy install scripts and requirements +COPY tools/install /workspace/tools/install +COPY requirements /workspace/requirements + +# Install task dependencies based on package manager +# Note: Source env.sh to ensure environment is available in non-interactive RUN +# Cache mounts: uv cache, pip cache, conda pkgs cache +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=${FLAGSCALE_HOME}/miniconda3/pkgs \ + . /etc/profile.d/flagscale-env.sh && \ + chmod +x /workspace/tools/install/*.sh && \ + chmod +x /workspace/tools/install/utils/*.sh && \ + chmod +x /workspace/tools/install/cuda/*.sh && \ + cd /workspace && \ + if [ "$PKG_MGR" = "uv" ]; then \ + UV_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task inference --pkg-mgr uv --no-system; \ + elif [ "$PKG_MGR" = "conda" ]; then \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task inference --pkg-mgr conda --env-name flagscale-inference --no-system; \ + else \ + PIP_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task inference --pkg-mgr pip --no-system; \ + fi + +# ============================================================================= +# DEV STAGE - Development image +# ============================================================================= +FROM deps AS dev + +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# Dev dependencies are already installed by default (use --no-dev to skip) +# Copy FlagScale source +COPY . 
/workspace/FlagScale +WORKDIR /workspace/FlagScale + +CMD ["/bin/bash"] + +# ============================================================================= +# RELEASE STAGE - Production image (uses same base for consistency) +# ============================================================================= +FROM base AS release + +ARG PYTORCH_INDEX +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# PyPI index URLs +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL +ARG UV_EXTRA_INDEX_URL +ENV PIP_INDEX_URL=${PIP_INDEX_URL} +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_URL=${UV_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL} + +# Copy install scripts and requirements +COPY tools/install /workspace/tools/install +COPY requirements /workspace/requirements + +# Install task dependencies without dev tools +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=${FLAGSCALE_HOME}/miniconda3/pkgs \ + . /etc/profile.d/flagscale-env.sh && \ + chmod +x /workspace/tools/install/*.sh && \ + chmod +x /workspace/tools/install/utils/*.sh && \ + chmod +x /workspace/tools/install/cuda/*.sh && \ + cd /workspace && \ + if [ "$PKG_MGR" = "uv" ]; then \ + UV_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task inference --pkg-mgr uv --no-system --no-dev; \ + elif [ "$PKG_MGR" = "conda" ]; then \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task inference --pkg-mgr conda --env-name flagscale-inference --no-system --no-dev; \ + else \ + PIP_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task inference --pkg-mgr pip --no-system --no-dev; \ + fi + +# Copy FlagScale source +COPY . 
/workspace/FlagScale +WORKDIR /workspace/FlagScale + +CMD ["/bin/bash"] diff --git a/docker/cuda/Dockerfile.train b/docker/cuda/Dockerfile.train new file mode 100644 index 000000000..ec6a5d198 --- /dev/null +++ b/docker/cuda/Dockerfile.train @@ -0,0 +1,202 @@ +# FlagScale Training Dockerfile for CUDA Platform +# +# NOTE: This Dockerfile is experimental and requires further testing. +# Please report issues at https://github.com/FlagOpen/FlagScale/issues +# +# Multi-stage build producing: +# - dev: Development image with all tools +# - release: Production image +# +# Build examples: +# ./docker/build.sh --platform cuda --task train --target dev +# ./docker/build.sh --platform cuda --task train --target release +# ./docker/build.sh --platform cuda --task train --target dev --build-arg PKG_MGR=conda + +# ============================================================================= +# BUILD ARGUMENTS +# ============================================================================= +# Base image versions +ARG CUDA_VERSION=12.8.1 +ARG UBUNTU_VERSION=22.04 +ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +# Tool versions +ARG PYTHON_VERSION=3.12 +ARG UV_VERSION=0.7.2 + +# Package manager: pip, uv, or conda (default: uv) +ARG PKG_MGR=uv + +# PyPI index URLs (for custom mirrors) +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL=${PIP_INDEX_URL} +ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} + +# PyTorch wheel index (derived from CUDA version) +ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 + +# ============================================================================= +# BASE STAGE - System dependencies +# ============================================================================= +FROM ${BASE_IMAGE} AS base + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG UV_VERSION +ARG PKG_MGR + +# Root installation directory (single source of truth) +ARG FLAGSCALE_HOME=/opt/flagscale + +# Build-time only environment variables +ENV 
DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai + +# Copy install scripts for system dependencies +COPY tools/install /tmp/tools/install + +# Install system dependencies (common for all tasks) +# Uses: install.sh --no-dev --no-base --no-task to only run system phase +RUN chmod +x /tmp/tools/install/*.sh && \ + chmod +x /tmp/tools/install/utils/*.sh && \ + chmod +x /tmp/tools/install/cuda/*.sh && \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + /tmp/tools/install/install.sh \ + --platform cuda \ + --task train \ + --pkg-mgr ${PKG_MGR} \ + --no-dev --no-base --no-task && \ + rm -rf /tmp/tools + +# Runtime environment variables (mirrors env.sh for non-interactive shell compatibility) +ENV FLAGSCALE_HOME=${FLAGSCALE_HOME} +ENV UV_PROJECT_ENVIRONMENT=${FLAGSCALE_HOME}/venv +ENV VIRTUAL_ENV=${FLAGSCALE_HOME}/venv +ENV FLAGSCALE_CONDA=${FLAGSCALE_HOME}/miniconda3 +ENV FLAGSCALE_DEPS=${FLAGSCALE_HOME}/deps +# UV configuration +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE=copy +# System paths +ENV MPI_HOME=/usr/local/mpi +ENV CUDA_HOME=/usr/local/cuda +# Combined PATH (includes both uv venv and conda paths for flexibility) +ENV PATH="${FLAGSCALE_HOME}/venv/bin:${FLAGSCALE_HOME}/miniconda3/bin:/root/.local/bin:/usr/local/mpi/bin:${CUDA_HOME}/bin:$PATH" +ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH" + +WORKDIR /workspace + +# ============================================================================= +# DEPS STAGE - Install dependencies using install folder +# ============================================================================= +FROM base AS deps + +ARG PYTORCH_INDEX +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# PyPI index URLs (re-declare to use in this stage) +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL +ARG UV_EXTRA_INDEX_URL +ENV PIP_INDEX_URL=${PIP_INDEX_URL} +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV 
UV_INDEX_URL=${UV_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL} + +# Copy install scripts and requirements +COPY tools/install /workspace/tools/install +COPY requirements /workspace/requirements + +# Install task dependencies based on package manager +# Note: Source env.sh to ensure environment is available in non-interactive RUN +# Cache mounts: uv cache, pip cache, conda pkgs cache +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=${FLAGSCALE_HOME}/miniconda3/pkgs \ + . /etc/profile.d/flagscale-env.sh && \ + chmod +x /workspace/tools/install/*.sh && \ + chmod +x /workspace/tools/install/utils/*.sh && \ + chmod +x /workspace/tools/install/cuda/*.sh && \ + cd /workspace && \ + if [ "$PKG_MGR" = "uv" ]; then \ + UV_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task train --pkg-mgr uv --no-system; \ + elif [ "$PKG_MGR" = "conda" ]; then \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task train --pkg-mgr conda --env-name flagscale-train --no-system; \ + else \ + PIP_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task train --pkg-mgr pip --no-system; \ + fi + +# ============================================================================= +# DEV STAGE - Development image +# ============================================================================= +FROM deps AS dev + +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# Dev dependencies are already installed by default (use --no-dev to skip) +# Copy FlagScale source +COPY . 
/workspace/FlagScale +WORKDIR /workspace/FlagScale + +CMD ["/bin/bash"] + +# ============================================================================= +# RELEASE STAGE - Production image (uses same base for consistency) +# ============================================================================= +FROM base AS release + +ARG PYTORCH_INDEX +ARG PKG_MGR +ARG FLAGSCALE_HOME=/opt/flagscale + +# PyPI index URLs +ARG PIP_INDEX_URL +ARG PIP_EXTRA_INDEX_URL +ARG UV_INDEX_URL +ARG UV_EXTRA_INDEX_URL +ENV PIP_INDEX_URL=${PIP_INDEX_URL} +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_URL=${UV_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL} + +# Copy install scripts and requirements +COPY tools/install /workspace/tools/install +COPY requirements /workspace/requirements + +# Install task dependencies without dev tools +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=${FLAGSCALE_HOME}/miniconda3/pkgs \ + . /etc/profile.d/flagscale-env.sh && \ + chmod +x /workspace/tools/install/*.sh && \ + chmod +x /workspace/tools/install/utils/*.sh && \ + chmod +x /workspace/tools/install/cuda/*.sh && \ + cd /workspace && \ + if [ "$PKG_MGR" = "uv" ]; then \ + UV_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task train --pkg-mgr uv --no-system --no-dev; \ + elif [ "$PKG_MGR" = "conda" ]; then \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task train --pkg-mgr conda --env-name flagscale-train --no-system --no-dev; \ + else \ + PIP_EXTRA_INDEX_URL=${PYTORCH_INDEX} \ + FLAGSCALE_HOME=${FLAGSCALE_HOME} \ + ./tools/install/install.sh --platform cuda --task train --pkg-mgr pip --no-system --no-dev; \ + fi + +# Copy FlagScale source +COPY . 
/workspace/FlagScale +WORKDIR /workspace/FlagScale + +CMD ["/bin/bash"] diff --git a/requirements/build.txt b/requirements/build.txt deleted file mode 100644 index 45486e3f4..000000000 --- a/requirements/build.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Build dependencies - should be mirrored in pyproject.toml [build-system] -setuptools>=77.0 -wheel>=0.45.1 -pip>=25.3 diff --git a/requirements/common.txt b/requirements/common.txt index b484ad36a..c23812a8f 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,5 +1,19 @@ # Common dependencies used by all tasks # Platform-agnostic packages including configuration management and experiment tracking -hydra-core -wandb +# Configuration and experiment tracking +hydra-core==1.3.2 +wandb==0.19.1 + +# Data science essentials (common across deepspeed, vllm, sglang, megatron) +numpy==1.26.4 +scipy==1.14.1 +pandas==2.2.3 +matplotlib==3.9.4 + +# Utilities +tqdm==4.67.1 +requests==2.32.3 +aiohttp==3.11.11 +pyyaml==6.0.2 +regex==2025.10.22 diff --git a/requirements/cuda/all.txt b/requirements/cuda/all.txt index e69de29bb..2918a57f0 100644 --- a/requirements/cuda/all.txt +++ b/requirements/cuda/all.txt @@ -0,0 +1,7 @@ +# All task dependencies combined +# Includes train, inference, rl, hetero_train + +-r ./train.txt +-r ./inference.txt +-r ./rl.txt +-r ./hetero_train.txt diff --git a/requirements/cuda/all_dev.txt b/requirements/cuda/all_dev.txt deleted file mode 100644 index 4dcc988c3..000000000 --- a/requirements/cuda/all_dev.txt +++ /dev/null @@ -1,4 +0,0 @@ --r ../build.txt --r ../lint.txt --r ../test.txt --r ./all.txt diff --git a/requirements/cuda/base.txt b/requirements/cuda/base.txt index 236b1ae9d..7bd2ea205 100644 --- a/requirements/cuda/base.txt +++ b/requirements/cuda/base.txt @@ -2,3 +2,8 @@ # Includes PyTorch with CUDA support and common dependencies -r ../common.txt + +# PyTorch +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 diff --git a/requirements/cuda/hetero_train_dev.txt 
b/requirements/cuda/hetero_train_dev.txt deleted file mode 100644 index 837a32f65..000000000 --- a/requirements/cuda/hetero_train_dev.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Heterogeneous training development dependencies -# Includes build tools, linting, testing, and heterogeneous training packages - --r ../build.txt --r ../lint.txt --r ../test.txt --r ./hetero_train.txt diff --git a/requirements/cuda/inference_dev.txt b/requirements/cuda/inference_dev.txt deleted file mode 100644 index b140f4d6a..000000000 --- a/requirements/cuda/inference_dev.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Inference development dependencies -# Includes build tools, linting, testing, and inference packages - --r ../build.txt --r ../lint.txt --r ../test.txt --r ./inference.txt diff --git a/requirements/cuda/rl_dev.txt b/requirements/cuda/rl_dev.txt deleted file mode 100644 index 8f0288564..000000000 --- a/requirements/cuda/rl_dev.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Reinforcement Learning development dependencies -# Includes build tools, linting, testing, and RL packages - --r ../build.txt --r ../lint.txt --r ../test.txt --r ./rl.txt diff --git a/requirements/cuda/train_dev.txt b/requirements/cuda/train_dev.txt deleted file mode 100644 index 96e399f37..000000000 --- a/requirements/cuda/train_dev.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Training development dependencies -# Includes build tools, linting, testing, and training packages - --r ../build.txt --r ../lint.txt --r ../test.txt --r ./train.txt diff --git a/requirements/dev.txt b/requirements/dev.txt new file mode 100644 index 000000000..a5a3110e6 --- /dev/null +++ b/requirements/dev.txt @@ -0,0 +1,24 @@ +# Development dependencies +# Includes build tools, linting, testing utilities +# Install with: ./tools/install/install.sh --dev ... 
+ +# Build dependencies +setuptools==77.0.3 +wheel==0.45.1 +pip==25.0.1 +pybind11==3.0.1 + +# Linting and formatting +pre-commit==4.2.0 +black==24.4.2 +isort==5.13.2 +flake8==7.1.0 +pylint==3.2.6 + +# Testing +coverage==7.6.10 +pytest==8.3.5 +pytest-asyncio==0.25.3 +pytest-cov==6.0.0 +pytest-mock==3.14.0 +pytest-random-order==1.1.1 diff --git a/requirements/lint.txt b/requirements/lint.txt deleted file mode 100644 index 5ae98c338..000000000 --- a/requirements/lint.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Linting and formatting dependencies -# Code quality tools including formatters (black, isort) and linters (flake8, pylint) - -pre-commit==4.2.0 -black==24.4.2 -isort==5.13.2 -flake8==7.1.0 -pylint==3.2.6 diff --git a/requirements/test.txt b/requirements/test.txt deleted file mode 100644 index 8dd7ed466..000000000 --- a/requirements/test.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Testing dependencies -# Includes pytest framework, coverage tools, and testing utilities - -coverage -pytest -pytest_asyncio -pytest-cov -pytest_mock -pytest-random-order diff --git a/tests/test_utils/config/platforms/cuda.yaml b/tests/test_utils/config/platforms/cuda.yaml index 60265f0be..1cfeb44fc 100644 --- a/tests/test_utils/config/platforms/cuda.yaml +++ b/tests/test_utils/config/platforms/cuda.yaml @@ -25,18 +25,6 @@ a100: mixtral: ["tp2_pp1_ep2", "tp4_pp1_ep2"] hetero_train: aquila: ["tp2pp1_tp4pp1_tp2pp1", "tp2dp1pp1_tp2dp2pp1_tp1dp2pp1", "dp2dp4_shared_embedding"] - inference: - deepseek_r1_distill_qwen: ["7b-tp2"] - # deepseek_r1_distill_qwen-flaggems: ["7b-tp2"] # TODO: test need fix - qwen3: ["4b-tp2"] - # qwen3-flaggems: ["4b-tp2"] # TODO: test need fix - robobrain2: ["7b-tp2"] - # robobrain2-flaggems: ["7b-tp2"] # TODO: test need fix - serve: - qwen2_5: ["0.5b", "0.5b_multiple_instance"] - # base: ["multiple_model"] # TODO: test need fix - # rl: # TODO: test need fix - # qwen2_5: ["0_5b"] unit: # Include patterns: "*" for all, or list specific paths include: "*" diff --git 
a/tests/test_utils/runners/run_unit_tests.sh b/tests/test_utils/runners/run_unit_tests.sh index 6dc415ab3..ae255571c 100755 --- a/tests/test_utils/runners/run_unit_tests.sh +++ b/tests/test_utils/runners/run_unit_tests.sh @@ -75,7 +75,7 @@ run_unit_tests_for_device() { INCLUDE=$(echo "$PATTERN_OUTPUT" | grep "^INCLUDE=" | cut -d= -f2-) EXCLUDE=$(echo "$PATTERN_OUTPUT" | grep "^EXCLUDE=" | cut -d= -f2-) - # Build pytest command + # Build pytest command with torchrun for distributed test support PYTEST_CMD="torchrun --nproc_per_node=8 -m pytest tests/unit_tests/ -v --tb=short" # Apply exclude patterns if any diff --git a/tools/install/cuda/env.sh b/tools/install/cuda/env.sh new file mode 100755 index 000000000..7d934479b --- /dev/null +++ b/tools/install/cuda/env.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# ============================================================================= +# FlagScale CUDA Environment Variables +# ============================================================================= +# +# Self-contained environment setup for CUDA platform. +# Includes all common + CUDA-specific variables. +# +# Usage: +# - Development: source tools/install/cuda/env.sh +# - Docker: Sourced via /etc/profile.d/flagscale-env.sh +# +# Variables can be overridden by setting them before sourcing this file. +# +# FLAGSCALE_HOME is the root directory for all FlagScale installations: +# - $FLAGSCALE_HOME/miniconda3 - Conda installation +# - $FLAGSCALE_HOME/venv - UV virtual environment +# - $FLAGSCALE_HOME/deps - Source dependencies (Megatron, etc.) +# - $FLAGSCALE_HOME/downloads - Cached downloads (miniconda, etc.) 
+# ============================================================================= + +# ----------------------------------------------------------------------------- +# Root Installation Directory (single source of truth) +# ----------------------------------------------------------------------------- +: "${FLAGSCALE_HOME:=/opt/flagscale}" + +# ----------------------------------------------------------------------------- +# Derived Paths (from FLAGSCALE_HOME) +# ----------------------------------------------------------------------------- +: "${UV_PROJECT_ENVIRONMENT:=$FLAGSCALE_HOME/venv}" +: "${FLAGSCALE_CONDA:=$FLAGSCALE_HOME/miniconda3}" +: "${FLAGSCALE_DEPS:=$FLAGSCALE_HOME/deps}" +: "${FLAGSCALE_DOWNLOADS:=$FLAGSCALE_HOME/downloads}" +: "${MPI_HOME:=/usr/local/mpi}" + +# ----------------------------------------------------------------------------- +# CUDA Configuration +# ----------------------------------------------------------------------------- +: "${CUDA_HOME:=/usr/local/cuda}" + +# ----------------------------------------------------------------------------- +# UV Configuration +# ----------------------------------------------------------------------------- +: "${UV_HTTP_TIMEOUT:=500}" +: "${UV_INDEX_STRATEGY:=unsafe-best-match}" +: "${UV_LINK_MODE:=copy}" + +# ----------------------------------------------------------------------------- +# Export Variables +# ----------------------------------------------------------------------------- +export FLAGSCALE_HOME FLAGSCALE_CONDA FLAGSCALE_DEPS FLAGSCALE_DOWNLOADS +export UV_PROJECT_ENVIRONMENT MPI_HOME CUDA_HOME +export UV_HTTP_TIMEOUT UV_INDEX_STRATEGY UV_LINK_MODE +export VIRTUAL_ENV="$UV_PROJECT_ENVIRONMENT" + +# ----------------------------------------------------------------------------- +# PATH Configuration +# ----------------------------------------------------------------------------- +export PATH="$UV_PROJECT_ENVIRONMENT/bin:$FLAGSCALE_CONDA/bin:$HOME/.local/bin:$MPI_HOME/bin:$CUDA_HOME/bin:$PATH" 
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$MPI_HOME/lib64:$MPI_HOME/lib:/usr/local/lib:$LD_LIBRARY_PATH" diff --git a/tools/install/cuda/install_base.sh b/tools/install/cuda/install_base.sh index 809a80a8b..d79d073f8 100755 --- a/tools/install/cuda/install_base.sh +++ b/tools/install/cuda/install_base.sh @@ -1,39 +1,49 @@ #!/bin/bash -# Base dependency installation script for CUDA platform -# Installs: common.txt + cuda/base.txt -# -# This script is called by install.sh and inherits its environment. -# It can also be run standalone for testing. - -set -euo pipefail +# Base phase (CUDA): apt packages + requirements/cuda/base.txt SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/../utils/utils.sh" +source "$SCRIPT_DIR/../utils/pkg_utils.sh" source "$SCRIPT_DIR/../utils/retry_utils.sh" -# Use inherited values or defaults for standalone execution -PROJECT_ROOT="${PROJECT_ROOT:-$(get_project_root)}" -PLATFORM="${PLATFORM:-cuda}" -RETRY_COUNT="${RETRY_COUNT:-3}" +PROJECT_ROOT=$(get_project_root) +DEBUG="${FLAGSCALE_DEBUG:-false}" +RETRY_COUNT="${FLAGSCALE_RETRY_COUNT:-3}" +REQ_FILE="$PROJECT_ROOT/requirements/cuda/base.txt" -main() { - log_step "Installing base dependencies for $PLATFORM" +APT_PACKAGES="libcudnn9-dev-cuda-12" - # Install platform-agnostic common requirements - local common_file="$PROJECT_ROOT/requirements/common.txt" - if [ -f "$common_file" ]; then - log_info "Installing common requirements" - retry_pip_install "$common_file" "$RETRY_COUNT" - fi +while [[ $# -gt 0 ]]; do + case $1 in --debug) DEBUG=true; shift ;; *) shift ;; esac +done - # Install platform-specific base requirements - local base_file="$PROJECT_ROOT/requirements/$PLATFORM/base.txt" - if [ -f "$base_file" ]; then - log_info "Installing $PLATFORM base requirements" - retry_pip_install "$base_file" "$RETRY_COUNT" +install_apt() { + is_phase_enabled base || return 0 + set_step "Installing CUDA apt packages" + run_cmd -d $DEBUG apt-get install -y 
--no-install-recommends $APT_PACKAGES || return 1 + log_success "CUDA apt packages installed" +} + +install_pip() { + if is_phase_enabled base; then + # Phase enabled: install full requirements + [ ! -f "$REQ_FILE" ] && { log_info "base.txt not found"; return 0; } + set_step "Installing base requirements" + retry_pip_install -d $DEBUG "$REQ_FILE" "$RETRY_COUNT" || return 1 + log_success "Base requirements installed" + else + # Phase disabled: install only matching pip-deps + local pkgs=$(get_pip_deps_for_requirements "$REQ_FILE") + [ -z "$pkgs" ] && return 0 + set_step "Installing base pip packages (override)" + run_cmd -d $DEBUG $(get_pip_cmd) install --root-user-action=ignore $pkgs || return 1 + log_success "Base pip packages installed" fi +} - log_success "Base dependencies installed" +main() { + install_apt || die "CUDA apt packages failed" + install_pip || die "Base pip failed" } -main "$@" +main diff --git a/tools/install/cuda/install_hetero_train.sh b/tools/install/cuda/install_hetero_train.sh index ce5f16c08..78279f256 100755 --- a/tools/install/cuda/install_hetero_train.sh +++ b/tools/install/cuda/install_hetero_train.sh @@ -1,40 +1,6 @@ #!/bin/bash -# Source dependencies for hetero_train task (CUDA platform) -# Installs: Megatron-LM-FL from git -# -# This script is called by install.sh after base and pip requirements. -# It only handles source dependencies (git repos, etc.) 
- -set -euo pipefail +# Source dependencies for hetero_train task (same as train) +# Delegates to install_train.sh since both tasks use Megatron-LM-FL SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/../utils/utils.sh" -source "$SCRIPT_DIR/../utils/retry_utils.sh" - -# Use inherited values or defaults for standalone execution -PROJECT_ROOT="${PROJECT_ROOT:-$(get_project_root)}" -RETRY_COUNT="${RETRY_COUNT:-3}" - -install_megatron_lm() { - local megatron_dir="$PROJECT_ROOT/Megatron-LM-FL" - local megatron_url="https://github.com/flagos-ai/Megatron-LM-FL.git" - - log_info "Installing Megatron-LM-FL" - - # Clone repository - retry_git_clone "$megatron_url" "$megatron_dir" "$RETRY_COUNT" - - # Install from source - cd "$megatron_dir" - retry "$RETRY_COUNT" "pip install --no-build-isolation . -vvv" - cd "$PROJECT_ROOT" - - log_success "Megatron-LM-FL installed" -} - -main() { - log_step "Installing source dependencies for hetero_train task" - install_megatron_lm -} - -main "$@" +exec "$SCRIPT_DIR/install_train.sh" "$@" diff --git a/tools/install/cuda/install_inference.sh b/tools/install/cuda/install_inference.sh index 67f54342e..11fd74b76 100755 --- a/tools/install/cuda/install_inference.sh +++ b/tools/install/cuda/install_inference.sh @@ -1,41 +1,39 @@ #!/bin/bash -# Source dependencies for inference task (CUDA platform) -# -# This script is called by install.sh after base and pip requirements. -# It only handles source dependencies (git repos, etc.) -# -# Currently a placeholder - add source dependencies here when needed. 
- -set -euo pipefail +# Inference task (CUDA): requirements/cuda/inference.txt SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/../utils/utils.sh" +source "$SCRIPT_DIR/../utils/pkg_utils.sh" source "$SCRIPT_DIR/../utils/retry_utils.sh" -# Use inherited values or defaults for standalone execution -PROJECT_ROOT="${PROJECT_ROOT:-$(get_project_root)}" -RETRY_COUNT="${RETRY_COUNT:-3}" - -install_vllm_lm() { - local vllm_dir="$PROJECT_ROOT/vllm-FL" - local vllm_url="https://github.com/flagos-ai/vllm-FL.git" - - log_info "Installing vllm-FL" - - # Clone repository - retry_git_clone "$vllm_url" "$vllm_dir" "$RETRY_COUNT" - - # Install from source - cd "$vllm_dir" - retry "$RETRY_COUNT" "pip install . -vvv" - cd "$PROJECT_ROOT" - - log_success "vllm-FL installed" +PROJECT_ROOT=$(get_project_root) +DEBUG="${FLAGSCALE_DEBUG:-false}" +RETRY_COUNT="${FLAGSCALE_RETRY_COUNT:-3}" +REQ_FILE="$PROJECT_ROOT/requirements/cuda/inference.txt" + +while [[ $# -gt 0 ]]; do + case $1 in --debug) DEBUG=true; shift ;; *) shift ;; esac +done + +install_pip() { + if is_phase_enabled task; then + [ ! 
-f "$REQ_FILE" ] && { log_info "inference.txt not found"; return 0; } + set_step "Installing inference requirements" + retry_pip_install -d $DEBUG "$REQ_FILE" "$RETRY_COUNT" || return 1 + log_success "Inference requirements installed" + else + local pkgs=$(get_pip_deps_for_requirements "$REQ_FILE") + [ -z "$pkgs" ] && return 0 + set_step "Installing inference pip packages (override)" + run_cmd -d $DEBUG $(get_pip_cmd) install --root-user-action=ignore $pkgs || return 1 + log_success "Inference pip packages installed" + fi } main() { - log_step "Installing source dependencies for inference task" - install_vllm_lm + install_pip || die "Inference pip failed" + # No source deps for inference task yet + # To add: SRC_DEPS_LIST="dep1 dep2" and install_src function } -main "$@" +main diff --git a/tools/install/cuda/install_rl.sh b/tools/install/cuda/install_rl.sh index 450d16ade..4b981cc02 100755 --- a/tools/install/cuda/install_rl.sh +++ b/tools/install/cuda/install_rl.sh @@ -1,24 +1,39 @@ #!/bin/bash -# Source dependencies for rl task (CUDA platform) -# -# This script is called by install.sh after base and pip requirements. -# It only handles source dependencies (git repos, etc.) -# -# Currently a placeholder - add source dependencies here when needed. - -set -euo pipefail +# RL task (CUDA): requirements/cuda/rl.txt SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/../utils/utils.sh" +source "$SCRIPT_DIR/../utils/pkg_utils.sh" +source "$SCRIPT_DIR/../utils/retry_utils.sh" + +PROJECT_ROOT=$(get_project_root) +DEBUG="${FLAGSCALE_DEBUG:-false}" +RETRY_COUNT="${FLAGSCALE_RETRY_COUNT:-3}" +REQ_FILE="$PROJECT_ROOT/requirements/cuda/rl.txt" + +while [[ $# -gt 0 ]]; do + case $1 in --debug) DEBUG=true; shift ;; *) shift ;; esac +done + +install_pip() { + if is_phase_enabled task; then + [ ! 
-f "$REQ_FILE" ] && { log_info "rl.txt not found"; return 0; } + set_step "Installing rl requirements" + retry_pip_install -d $DEBUG "$REQ_FILE" "$RETRY_COUNT" || return 1 + log_success "RL requirements installed" + else + local pkgs=$(get_pip_deps_for_requirements "$REQ_FILE") + [ -z "$pkgs" ] && return 0 + set_step "Installing rl pip packages (override)" + run_cmd -d $DEBUG $(get_pip_cmd) install --root-user-action=ignore $pkgs || return 1 + log_success "RL pip packages installed" + fi +} main() { - log_info "No source dependencies for rl task (placeholder)" - # Add source dependency installations here when needed - # Example: - # source "$SCRIPT_DIR/../utils/retry_utils.sh" - # PROJECT_ROOT="${PROJECT_ROOT:-$(get_project_root)}" - # RETRY_COUNT="${RETRY_COUNT:-3}" - # retry_git_clone "https://github.com/..." "$PROJECT_ROOT/..." "$RETRY_COUNT" + install_pip || die "RL pip failed" + # No source deps for rl task yet + # To add: SRC_DEPS_LIST="dep1 dep2" and install_src function } -main "$@" +main diff --git a/tools/install/cuda/install_serve.sh b/tools/install/cuda/install_serve.sh index 1f2afab57..3c239ba62 100755 --- a/tools/install/cuda/install_serve.sh +++ b/tools/install/cuda/install_serve.sh @@ -1,41 +1,67 @@ #!/bin/bash -# Source dependencies for serve task (CUDA platform) -# -# This script is called by install.sh after base and pip requirements. -# It only handles source dependencies (git repos, etc.) -# -# Currently a placeholder - add source dependencies here when needed. 
- -set -euo pipefail +# Serve task (CUDA): requirements/cuda/serve.txt + source deps SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/../utils/utils.sh" +source "$SCRIPT_DIR/../utils/pkg_utils.sh" source "$SCRIPT_DIR/../utils/retry_utils.sh" -# Use inherited values or defaults for standalone execution -PROJECT_ROOT="${PROJECT_ROOT:-$(get_project_root)}" -RETRY_COUNT="${RETRY_COUNT:-3}" +PROJECT_ROOT=$(get_project_root) +DEBUG="${FLAGSCALE_DEBUG:-false}" +RETRY_COUNT="${FLAGSCALE_RETRY_COUNT:-3}" +FLAGSCALE_HOME="${FLAGSCALE_HOME:-/opt/flagscale}" +FLAGSCALE_DEPS="${FLAGSCALE_DEPS:-$FLAGSCALE_HOME/deps}" +REQ_FILE="$PROJECT_ROOT/requirements/cuda/serve.txt" + +# Source deps available for this task +SRC_DEPS_LIST="vllm" -install_vllm_lm() { - local vllm_dir="$PROJECT_ROOT/vllm-FL" - local vllm_url="https://github.com/flagos-ai/vllm-FL.git" +while [[ $# -gt 0 ]]; do + case $1 in --debug) DEBUG=true; shift ;; *) shift ;; esac +done - log_info "Installing vllm-FL" +# ============================================================================= +# Pip Installation +# ============================================================================= +install_pip() { + if is_phase_enabled task; then + [ ! 
-f "$REQ_FILE" ] && { log_info "serve.txt not found"; return 0; } + set_step "Installing serve requirements" + retry_pip_install -d $DEBUG "$REQ_FILE" "$RETRY_COUNT" || return 1 + log_success "Serve requirements installed" + else + local pkgs=$(get_pip_deps_for_requirements "$REQ_FILE") + [ -z "$pkgs" ] && return 0 + set_step "Installing serve pip packages (override)" + run_cmd -d $DEBUG $(get_pip_cmd) install --root-user-action=ignore $pkgs || return 1 + log_success "Serve pip packages installed" + fi +} - # Clone repository - retry_git_clone "$vllm_url" "$vllm_dir" "$RETRY_COUNT" +# ============================================================================= +# Source Dependencies +# ============================================================================= +install_vllm() { + should_build_package "vllm" || return 0 + set_step "Installing vLLM-FL" + mkdir -p "$FLAGSCALE_DEPS" + retry_git_clone -d $DEBUG "https://github.com/flagos-ai/vllm-FL.git" "$FLAGSCALE_DEPS/vllm-FL" "$RETRY_COUNT" || return 1 + local pip_cmd=$(get_pip_cmd) + run_cmd -d $DEBUG bash -c "cd '$FLAGSCALE_DEPS/vllm-FL' && \ + $pip_cmd install --root-user-action=ignore --no-build-isolation . -vvv" || return 1 + log_success "vLLM-FL ready" +} - # Install from source - cd "$vllm_dir" - retry "$RETRY_COUNT" "pip install . 
-vvv" - cd "$PROJECT_ROOT" +install_src() { + # Skip if phase disabled and no matching src-deps + is_phase_enabled task || has_src_deps_for_phase $SRC_DEPS_LIST || return 0 - log_success "vllm-FL installed" + should_install_src task "vllm" && { install_vllm || die "vLLM failed"; } } main() { - log_step "Installing source dependencies for serve task" - install_vllm_lm + install_pip || die "Serve pip failed" + install_src } -main "$@" +main diff --git a/tools/install/cuda/install_train.sh b/tools/install/cuda/install_train.sh index 10471bb52..5b571aa33 100755 --- a/tools/install/cuda/install_train.sh +++ b/tools/install/cuda/install_train.sh @@ -1,40 +1,109 @@ #!/bin/bash -# Source dependencies for train task (CUDA platform) -# Installs: Megatron-LM-FL from git -# -# This script is called by install.sh after base and pip requirements. -# It only handles source dependencies (git repos, etc.) - -set -euo pipefail +# Train task (CUDA): requirements/cuda/train.txt + source deps SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/../utils/utils.sh" +source "$SCRIPT_DIR/../utils/pkg_utils.sh" source "$SCRIPT_DIR/../utils/retry_utils.sh" -# Use inherited values or defaults for standalone execution -PROJECT_ROOT="${PROJECT_ROOT:-$(get_project_root)}" -RETRY_COUNT="${RETRY_COUNT:-3}" +PROJECT_ROOT=$(get_project_root) +DEBUG="${FLAGSCALE_DEBUG:-false}" +RETRY_COUNT="${FLAGSCALE_RETRY_COUNT:-3}" +FLAGSCALE_HOME="${FLAGSCALE_HOME:-/opt/flagscale}" +FLAGSCALE_DEPS="${FLAGSCALE_DEPS:-$FLAGSCALE_HOME/deps}" +REQ_FILE="$PROJECT_ROOT/requirements/cuda/train.txt" -install_megatron_lm() { - local megatron_dir="$PROJECT_ROOT/Megatron-LM-FL" - local megatron_url="https://github.com/flagos-ai/Megatron-LM-FL.git" +# Source deps available for this task +SRC_DEPS_LIST="apex flash-attn transformer-engine megatron-lm" + +while [[ $# -gt 0 ]]; do + case $1 in --debug) DEBUG=true; shift ;; *) shift ;; esac +done + +# 
============================================================================= +# Pip Installation +# ============================================================================= +install_pip() { + if is_phase_enabled task; then + [ ! -f "$REQ_FILE" ] && { log_info "train.txt not found"; return 0; } + set_step "Installing train requirements" + retry_pip_install -d $DEBUG "$REQ_FILE" "$RETRY_COUNT" || return 1 + log_success "Train requirements installed" + else + local pkgs=$(get_pip_deps_for_requirements "$REQ_FILE") + [ -z "$pkgs" ] && return 0 + set_step "Installing train pip packages (override)" + run_cmd -d $DEBUG $(get_pip_cmd) install --root-user-action=ignore $pkgs || return 1 + log_success "Train pip packages installed" + fi +} + +# ============================================================================= +# Source Dependencies +# ============================================================================= +install_apex() { + should_build_package "apex" || return 0 + set_step "Installing NVIDIA Apex" + mkdir -p "$FLAGSCALE_DEPS" + retry_git_clone -d $DEBUG "https://github.com/NVIDIA/apex.git" "$FLAGSCALE_DEPS/apex" "$RETRY_COUNT" || return 1 + local pip_cmd=$(get_pip_cmd) + run_cmd -d $DEBUG bash -c "cd '$FLAGSCALE_DEPS/apex' && \ + NVCC_APPEND_FLAGS='--threads 4' APEX_PARALLEL_BUILD=8 APEX_CPP_EXT=1 APEX_CUDA_EXT=1 \ + $pip_cmd install --root-user-action=ignore --no-build-isolation . 
-v" || return 1 + log_success "NVIDIA Apex ready" +} + +install_flash_attn() { + should_build_package "flash_attn" || return 0 + set_step "Installing Flash-Attention 2" + local version="${FLASH_ATTN_VERSION:-2.8.1}" + mkdir -p "$FLAGSCALE_DEPS" + retry_git_clone -d $DEBUG --branch "v${version}" --depth 1 \ + "https://github.com/Dao-AILab/flash-attention.git" "$FLAGSCALE_DEPS/flash-attention" "$RETRY_COUNT" || return 1 + local pip_cmd=$(get_pip_cmd) + run_cmd -d $DEBUG bash -c "cd '$FLAGSCALE_DEPS/flash-attention' && \ + FLASH_ATTENTION_FORCE_BUILD=TRUE MAX_JOBS=4 \ + $pip_cmd install --root-user-action=ignore --no-build-isolation . -vvv" || return 1 + log_success "Flash-Attention 2 ready" +} - log_info "Installing Megatron-LM-FL" +install_transformer_engine() { + should_build_package "transformer_engine" || return 0 + set_step "Installing TransformerEngine" + local pip_cmd=$(get_pip_cmd) + run_cmd -d $DEBUG $pip_cmd install --root-user-action=ignore nvidia-mathdx --extra-index-url https://pypi.nvidia.com || return 1 + mkdir -p "$FLAGSCALE_DEPS" + retry_git_clone -d $DEBUG --recursive \ + "https://github.com/NVIDIA/TransformerEngine.git" "$FLAGSCALE_DEPS/TransformerEngine" "$RETRY_COUNT" || return 1 + run_cmd -d $DEBUG bash -c "cd '$FLAGSCALE_DEPS/TransformerEngine' && \ + NVTE_FRAMEWORK=pytorch $pip_cmd install --root-user-action=ignore --no-build-isolation . -vvv" || return 1 + log_success "TransformerEngine ready" +} - # Clone repository - retry_git_clone "$megatron_url" "$megatron_dir" "$RETRY_COUNT" +install_megatron_lm() { + should_build_package "megatron-core" || return 0 + set_step "Installing Megatron-LM-FL" + mkdir -p "$FLAGSCALE_DEPS" + retry_git_clone -d $DEBUG "https://github.com/flagos-ai/Megatron-LM-FL.git" "$FLAGSCALE_DEPS/Megatron-LM-FL" "$RETRY_COUNT" || return 1 + local pip_cmd=$(get_pip_cmd) + run_cmd -d $DEBUG bash -c "cd '$FLAGSCALE_DEPS/Megatron-LM-FL' && \ + $pip_cmd install --root-user-action=ignore --no-build-isolation . 
-vvv" || return 1 + log_success "Megatron-LM-FL ready" +} - # Install from source - cd "$megatron_dir" - retry "$RETRY_COUNT" "pip install --no-build-isolation . -vvv" - cd "$PROJECT_ROOT" +install_src() { + # Skip if phase disabled and no matching src-deps + is_phase_enabled task || has_src_deps_for_phase $SRC_DEPS_LIST || return 0 - log_success "Megatron-LM-FL installed" + should_install_src task "apex" && { install_apex || die "Apex failed"; } + should_install_src task "flash-attn" && { install_flash_attn || die "Flash-Attention failed"; } + should_install_src task "transformer-engine" && { install_transformer_engine || die "TransformerEngine failed"; } + should_install_src task "megatron-lm" && { install_megatron_lm || die "Megatron-LM failed"; } } main() { - log_step "Installing source dependencies for train task" - install_megatron_lm + install_pip || die "Train pip failed" + install_src } -main "$@" +main diff --git a/tools/install/install.sh b/tools/install/install.sh index 02540cae2..064446b8c 100755 --- a/tools/install/install.sh +++ b/tools/install/install.sh @@ -1,379 +1,260 @@ #!/bin/bash -# Master installation orchestrator script -# Delegates to task-specific install scripts +# ============================================================================= +# FlagScale Dependency Installation +# ============================================================================= # -# Task Discovery: -# Valid tasks are discovered from platform configuration files -# (tests/test_utils/config/platforms/*.yaml) which define supported -# tasks under the functional tests section. Install scripts serve -# as a fallback to ensure all tasks with implementations are recognized. - -set -euo pipefail +# Main entry point for installing FlagScale dependencies. +# Orchestrates four installation phases: system, dev, base, task. 
+# +# Usage: +# ./install.sh --platform PLATFORM --task TASK [OPTIONS] +# +# Examples: +# ./install.sh --platform cuda --task train # Full installation +# ./install.sh --platform cuda --task train --no-system # Skip system phase +# ./install.sh --platform cuda --task train --src-deps megatron-lm # Only megatron-lm +# ============================================================================= SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/utils/utils.sh" -source "$SCRIPT_DIR/utils/retry_utils.sh" -source "$SCRIPT_DIR/utils/conda_utils.sh" +source "$SCRIPT_DIR/utils/pkg_utils.sh" -# Get project root PROJECT_ROOT=$(get_project_root) -# Default values +# ============================================================================= +# Configuration (defaults) +# ============================================================================= TASK="" -PLATFORM="cuda" # Default to CUDA platform -RETRY_COUNT="3" -CONDA_ENV="" # Optional: conda environment to activate -CONDA_PATH="" # Optional: custom conda installation path -DEV_MODE="false" # Install development dependencies (build, lint, test) - -# Dynamically discover valid tasks from platform configuration -discover_valid_tasks() { +PLATFORM="" +PKG_MGR="uv" +ENV_NAME="" +DEBUG=false +RETRY_COUNT=3 +FORCE_BUILD=false +PYTHON_VERSION="${PYTHON_VERSION:-3.12}" +FLAGSCALE_HOME="${FLAGSCALE_HOME:-/opt/flagscale}" + +# Phase flags (default: install all) +INSTALL_SYSTEM=true +INSTALL_DEV=true +INSTALL_BASE=true +INSTALL_TASK=true + +# Override flags (selective installation) +SRC_DEPS="" +PIP_DEPS="" + +# PyPI index URLs +INDEX_URL="${PIP_INDEX_URL:-}" +EXTRA_INDEX_URL="${PIP_EXTRA_INDEX_URL:-}" + +# ============================================================================= +# Helper Functions +# ============================================================================= +get_valid_tasks() { local tasks=() - local parse_config="$PROJECT_ROOT/tests/test_utils/runners/parse_config.py" - - # 
Primary method: Get tasks from platform configuration - # This is the source of truth for which tasks are supported on the platform - if [ -f "$parse_config" ] && command -v python >/dev/null 2>&1; then - # Use parse_config.py to get functional tests from platform config - # Extract task names (top-level keys) from the JSON output - while IFS= read -r task; do - if [ -n "$task" ]; then - tasks+=("$task") - fi - done < <(python "$parse_config" --platform "$PLATFORM" --type functional 2>/dev/null | \ - python -c "import sys, json; data = json.load(sys.stdin); print('\\n'.join(data.keys()))" 2>/dev/null || true) - fi - - # Fallback method: Get tasks from install scripts that exist - # This ensures tasks with install scripts but no tests yet are still valid - if [ -d "$SCRIPT_DIR/$PLATFORM" ]; then + if [ -n "$PLATFORM" ] && [ -d "$SCRIPT_DIR/$PLATFORM" ]; then for script in "$SCRIPT_DIR/$PLATFORM"/install_*.sh; do - if [ -f "$script" ]; then - task=$(basename "$script" | sed 's/^install_//' | sed 's/\.sh$//') - if [ "$task" != "base" ]; then - # Add task if not already in array - if [[ ! 
" ${tasks[@]} " =~ " ${task} " ]]; then - tasks+=("$task") - fi - fi - fi + [ -f "$script" ] || continue + local task=$(basename "$script" | sed 's/^install_//' | sed 's/\.sh$//') + [ "$task" != "base" ] && tasks+=("$task") done fi - - # Always add 'all' as a valid task for installing all dependencies tasks+=("all") - - # Return space-separated list echo "${tasks[@]}" } -# Dynamically discover valid platforms from test config -discover_valid_platforms() { - local platforms=() - - # Get platforms from test_utils/config/platforms/*.yaml files - local config_dir="$PROJECT_ROOT/tests/test_utils/config/platforms" - if [ -d "$config_dir" ]; then - for config_file in "$config_dir"/*.yaml; do - if [ -f "$config_file" ]; then - platform=$(basename "$config_file" .yaml) - # Skip template files - if [ "$platform" != "template" ]; then - platforms+=("$platform") - fi - fi - done - fi - - # Return space-separated list - echo "${platforms[@]}" +# Export all configuration as environment variables for phase scripts +export_config() { + # Paths + export FLAGSCALE_HOME + export FLAGSCALE_CONDA="$FLAGSCALE_HOME/miniconda3" + export FLAGSCALE_DEPS="$FLAGSCALE_HOME/deps" + export FLAGSCALE_DOWNLOADS="$FLAGSCALE_HOME/downloads" + export UV_PROJECT_ENVIRONMENT="$FLAGSCALE_HOME/venv" + + # Phase flags (for should_install functions) + export FLAGSCALE_INSTALL_SYSTEM="$INSTALL_SYSTEM" + export FLAGSCALE_INSTALL_DEV="$INSTALL_DEV" + export FLAGSCALE_INSTALL_BASE="$INSTALL_BASE" + export FLAGSCALE_INSTALL_TASK="$INSTALL_TASK" + + # Override flags + export FLAGSCALE_SRC_DEPS="$SRC_DEPS" + export FLAGSCALE_PIP_DEPS="$PIP_DEPS" + export FLAGSCALE_FORCE_BUILD="$FORCE_BUILD" + + # Other config + export FLAGSCALE_PLATFORM="$PLATFORM" + export FLAGSCALE_TASK="$TASK" + export FLAGSCALE_PKG_MGR="$PKG_MGR" + export FLAGSCALE_ENV_NAME="$ENV_NAME" + export FLAGSCALE_DEBUG="$DEBUG" + export FLAGSCALE_RETRY_COUNT="$RETRY_COUNT" + + # PyPI index + [ -n "$INDEX_URL" ] && { export PIP_INDEX_URL="$INDEX_URL" 
UV_INDEX_URL="$INDEX_URL"; } + [ -n "$EXTRA_INDEX_URL" ] && { export PIP_EXTRA_INDEX_URL="$EXTRA_INDEX_URL" UV_EXTRA_INDEX_URL="$EXTRA_INDEX_URL"; } } -# Arrays to hold valid tasks and platforms (populated after parse_args) -VALID_TASKS=() -VALID_PLATFORMS=() +# ============================================================================= +# Phase Execution +# ============================================================================= +run_phase() { + local phase="$1" + local script="$2" + local args="${3:-}" + + # Check if phase should run (enabled OR has overrides) + local phase_enabled + case "$phase" in + system) phase_enabled="$INSTALL_SYSTEM" ;; + dev) phase_enabled="$INSTALL_DEV" ;; + base) phase_enabled="$INSTALL_BASE" ;; + task) phase_enabled="$INSTALL_TASK" ;; + esac + + # Skip if phase disabled and no relevant overrides + if [ "$phase_enabled" = false ]; then + case "$phase" in + dev|base|task) + # These phases can have pip/src overrides + [ -z "$PIP_DEPS" ] && [ -z "$SRC_DEPS" ] && { log_info "Skipping $phase phase"; return 0; } + ;; + system) + # System phase has no overrides currently + log_info "Skipping $phase phase" + return 0 + ;; + esac + fi -# Initialize valid platforms (can be done early as it doesn't depend on user input) -init_valid_platforms() { - VALID_PLATFORMS=($(discover_valid_platforms)) -} + # Run the phase script + [ ! 
-f "$script" ] && { log_warn "Phase script not found: $script"; return 0; } -# Initialize valid tasks (must be done after PLATFORM is known) -init_valid_tasks() { - VALID_TASKS=($(discover_valid_tasks)) + print_header "${phase^} Phase" + [ "$DEBUG" = true ] && args="$args --debug" + "$script" $args || die "${phase^} phase failed" } +# ============================================================================= +# Main +# ============================================================================= usage() { - # Ensure platforms are discovered for help display - if [ ${#VALID_PLATFORMS[@]} -eq 0 ]; then - init_valid_platforms - fi - # Discover tasks for the current platform (default or specified) - if [ ${#VALID_TASKS[@]} -eq 0 ]; then - init_valid_tasks - fi - cat << EOF -Usage: $0 [OPTIONS] - -Master installation script for FlagScale dependencies. +Usage: $0 --platform PLATFORM --task TASK [OPTIONS] OPTIONS: - --task TASK Task type (required, see discovered tasks below) - --platform PLATFORM Platform: ${VALID_PLATFORMS[*]} (default: cuda) - --retry-count N Number of retry attempts (default: 3) - --conda-env ENV Optional: activate conda environment before install - --conda-path PATH Optional: custom conda installation path - --dev Install development dependencies (build, lint, test) - --help Show this help message + --platform NAME Platform (required, e.g., cuda) + --task TASK Task (required, e.g., train, serve, inference, rl, all) + + Phase Control (default: install all): + --no-system Skip system phase (apt, python, openmpi) + --no-dev Skip dev phase (dev requirements) + --no-base Skip base phase (base requirements + source) + --no-task Skip task phase (task requirements + source) + + Selective Installation (overrides --no-* for specific packages): + --pip-deps PKGS Install specific pip packages (comma-separated) + --src-deps DEPS Install specific source deps (comma-separated) + dev: sccache + train: apex,flash-attn,transformer-engine,megatron-lm + serve: vllm + 
+ Environment: + --pkg-mgr MGR Package manager: pip, uv, conda (default: uv) + --env-name NAME Conda environment name + --install-dir DIR Root installation directory (default: /opt/flagscale) + --index-url URL PyPI index URL + --extra-index-url URL Extra PyPI index URL + + Other: + --retry-count N Retry attempts (default: 3) + --force-build Force rebuild source deps + --debug Dry-run mode + --help Show this help EXAMPLES: - # Install training dependencies for CUDA platform - $0 --task train --platform cuda - - # Install hetero_train dependencies (defaults to CUDA) - $0 --task hetero_train - - # Install all task dependencies - $0 --task all --platform cuda - - # Install with development dependencies (includes build, lint, test) - $0 --task train --dev - - # Install with custom retry count - $0 --task train --retry-count 5 - -TASK DISCOVERY: - Tasks are discovered from platform configuration files: - - Primary: tests/test_utils/config/platforms/\${PLATFORM}.yaml - - Fallback: install/\${PLATFORM}/install_*.sh scripts - -DISCOVERED VALID TASKS (for platform: $PLATFORM): -$(printf ' %s\n' "${VALID_TASKS[@]}") - -DISCOVERED VALID PLATFORMS: -$(printf ' %s\n' "${VALID_PLATFORMS[@]}") - + $0 --platform cuda --task train # Full installation + $0 --platform cuda --task train --no-system # Skip system phase + $0 --platform cuda --task train --no-system --no-dev --no-base --no-task --src-deps megatron-lm EOF } parse_args() { while [[ $# -gt 0 ]]; do case $1 in - --task) - TASK="$2" - shift 2 - ;; - --platform) - PLATFORM="$2" - shift 2 - ;; - --retry-count) - RETRY_COUNT="$2" - shift 2 - ;; - --conda-env) - CONDA_ENV="$2" - shift 2 - ;; - --conda-path) - CONDA_PATH="$2" - shift 2 - ;; - --dev) - DEV_MODE="true" - shift - ;; - --help|-h) - usage - exit 0 - ;; - *) - log_error "Unknown option: $1" - usage - exit 1 - ;; + --task) TASK="$2"; shift 2 ;; + --platform) PLATFORM="$2"; shift 2 ;; + --no-system) INSTALL_SYSTEM=false; shift ;; + --no-dev) INSTALL_DEV=false; shift ;; + 
--no-base) INSTALL_BASE=false; shift ;; + --no-task) INSTALL_TASK=false; shift ;; + --pkg-mgr) PKG_MGR="$2"; shift 2 ;; + --env-name) ENV_NAME="$2"; shift 2 ;; + --install-dir) FLAGSCALE_HOME="$2"; shift 2 ;; + --index-url) INDEX_URL="$2"; shift 2 ;; + --extra-index-url) EXTRA_INDEX_URL="$2"; shift 2 ;; + --retry-count) RETRY_COUNT="$2"; shift 2 ;; + --force-build) FORCE_BUILD=true; shift ;; + --src-deps) SRC_DEPS="$2"; shift 2 ;; + --pip-deps) PIP_DEPS="$2"; shift 2 ;; + --debug) DEBUG=true; shift ;; + --help|-h) usage; exit 0 ;; + *) log_error "Unknown option: $1"; usage; exit 1 ;; esac done } validate_inputs() { - # Initialize valid platforms first - init_valid_platforms + [ -z "$PLATFORM" ] && { log_error "Platform required (use --platform)"; usage; exit 1; } + [ ! -d "$SCRIPT_DIR/$PLATFORM" ] && { log_error "Invalid platform: $PLATFORM"; exit 1; } + [ -z "$TASK" ] && { log_error "Task required (use --task)"; usage; exit 1; } - # Check if platform is valid (must validate platform before discovering tasks) + local valid_tasks=($(get_valid_tasks)) local valid=false - for valid_platform in "${VALID_PLATFORMS[@]}"; do - if [ "$PLATFORM" = "$valid_platform" ]; then - valid=true - break - fi - done - - if [ "$valid" = "false" ]; then - log_error "Invalid platform: $PLATFORM" - log_error "Valid platforms: ${VALID_PLATFORMS[*]}" - exit 1 - fi - - # Now discover valid tasks for the specified platform - init_valid_tasks - - # Check if task is specified - if [ -z "$TASK" ]; then - log_error "Task not specified. Use --task to specify a task." 
- usage - exit 1 - fi - - # Check if task is valid - valid=false - for valid_task in "${VALID_TASKS[@]}"; do - if [ "$TASK" = "$valid_task" ]; then - valid=true - break - fi + for t in "${valid_tasks[@]}"; do + [ "$TASK" = "$t" ] && valid=true && break done - - if [ "$valid" = "false" ]; then - log_error "Invalid task: $TASK" - log_error "Valid tasks for platform '$PLATFORM': ${VALID_TASKS[*]}" - exit 1 - fi - - # Validate retry count - if ! [[ "$RETRY_COUNT" =~ ^[0-9]+$ ]] || [ "$RETRY_COUNT" -lt 1 ]; then - log_error "Invalid retry count: $RETRY_COUNT (must be positive integer)" - exit 1 - fi - - log_success "Input validation passed" -} - -# Install platform-specific base dependencies -install_base_dependencies() { - local base_script="$SCRIPT_DIR/$PLATFORM/install_base.sh" - - if [ ! -f "$base_script" ]; then - log_warn "Base install script not found: $base_script (skipping)" - return 0 - fi - - log_step "Installing base dependencies for platform: $PLATFORM" - chmod +x "$base_script" 2>/dev/null || true - "$base_script" -} - -# Install task-specific pip requirements -install_task_requirements() { - local task=$1 - local requirements_file - - # Use _dev.txt if --dev flag is set, otherwise use regular .txt - if [ "$DEV_MODE" = "true" ]; then - requirements_file="$PROJECT_ROOT/requirements/$PLATFORM/${task}_dev.txt" - if [ ! -f "$requirements_file" ]; then - log_warn "Dev requirements not found: $requirements_file, falling back to regular" - requirements_file="$PROJECT_ROOT/requirements/$PLATFORM/${task}.txt" - fi - else - requirements_file="$PROJECT_ROOT/requirements/$PLATFORM/${task}.txt" - fi - - if [ ! 
-f "$requirements_file" ]; then - log_info "No task requirements file: $requirements_file (skipping)" - return 0 - fi - - log_step "Installing pip requirements for task: $task" - if [ "$DEV_MODE" = "true" ]; then - log_info "Installing development dependencies (includes build, lint, test)" - fi - retry_pip_install "$requirements_file" "$RETRY_COUNT" -} - -# Install task-specific source dependencies (git repos, etc.) -install_source_dependencies() { - local task=$1 - local source_script="$SCRIPT_DIR/$PLATFORM/install_${task}.sh" - - if [ ! -f "$source_script" ]; then - log_info "No source dependency script for task: $task (skipping)" - return 0 - fi - - log_step "Installing source dependencies for task: $task" - chmod +x "$source_script" 2>/dev/null || true - "$source_script" -} - -# Install all dependencies for a task -install_task() { - local task=$1 - - print_header "Installing Dependencies for Task: $task ($PLATFORM)" - - # 1. Install base dependencies (platform-specific) - install_base_dependencies - - # 2. Install task pip requirements - install_task_requirements "$task" - - # 3. Install task source dependencies (git repos, etc.) - install_source_dependencies "$task" - - log_success "Task '$task' installation complete" + [ "$valid" = false ] && { log_error "Invalid task: $TASK. 
Valid: ${valid_tasks[*]}"; exit 1; } } main() { - print_header "FlagScale Dependency Installation" - - # Parse command line arguments parse_args "$@" - - # Validate inputs validate_inputs - # Optionally activate conda environment if specified - if [ -n "$CONDA_ENV" ]; then - log_step "Activating conda environment: $CONDA_ENV" - if activate_conda "$CONDA_ENV" "$CONDA_PATH"; then - : # Success message already displayed by activate_conda - else - log_warn "Conda activation failed, continuing with current environment" - fi - fi + [ "$DEBUG" = true ] && log_info "Dry-run mode" - # Display current environment - log_info "Current conda environment: $(get_conda_env)" - check_python_version || log_warn "Python version check failed (continuing anyway)" + print_header "FlagScale Installation" + log_info "Platform: $PLATFORM | Task: $TASK | Pkg: $PKG_MGR" + [ -n "$SRC_DEPS" ] && log_info "Source deps override: $SRC_DEPS" + [ -n "$PIP_DEPS" ] && log_info "Pip deps override: $PIP_DEPS" + log_info "Install dir: $FLAGSCALE_HOME" - # Display dev mode status - if [ "$DEV_MODE" = "true" ]; then - log_info "Development mode: ENABLED (will install build, lint, test deps)" - fi + export_config + + # Phase 1: System (apt, python, openmpi) + run_phase system "$SCRIPT_DIR/install_system.sh" "--platform $PLATFORM --pkg-mgr $PKG_MGR" + + # Phase 2: Dev (dev requirements) + run_phase dev "$SCRIPT_DIR/install_dev.sh" - # Install dependencies based on task + # Phase 3: Base (base requirements + source for platform) + run_phase base "$SCRIPT_DIR/$PLATFORM/install_base.sh" + + # Phase 4: Task (task requirements + source) if [ "$TASK" = "all" ]; then - log_info "Installing dependencies for all tasks" - # Install all valid tasks except 'all' itself - for task in "${VALID_TASKS[@]}"; do - if [ "$task" != "all" ]; then - print_separator - install_task "$task" - fi + for task in $(get_valid_tasks); do + [ "$task" = "all" ] && continue + FLAGSCALE_TASK="$task" + export FLAGSCALE_TASK + run_phase task 
"$SCRIPT_DIR/$PLATFORM/install_${task}.sh" done else - install_task "$TASK" + run_phase task "$SCRIPT_DIR/$PLATFORM/install_${TASK}.sh" fi print_header "Installation Complete" - log_success "All dependencies installed successfully for task: $TASK" } -# Make all install scripts executable -chmod +x "$SCRIPT_DIR"/*/install_*.sh 2>/dev/null || true -chmod +x "$SCRIPT_DIR"/utils/*.sh 2>/dev/null || true - -# Run main function main "$@" diff --git a/tools/install/install_dev.sh b/tools/install/install_dev.sh new file mode 100755 index 000000000..749a9caf1 --- /dev/null +++ b/tools/install/install_dev.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Dev phase: requirements/dev.txt + dev tools (sccache) + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/utils/utils.sh" +source "$SCRIPT_DIR/utils/pkg_utils.sh" +source "$SCRIPT_DIR/utils/retry_utils.sh" + +PROJECT_ROOT=$(get_project_root) +DEBUG="${FLAGSCALE_DEBUG:-false}" +RETRY_COUNT="${FLAGSCALE_RETRY_COUNT:-3}" +REQ_FILE="$PROJECT_ROOT/requirements/dev.txt" + +# Source deps available for dev phase +SRC_DEPS_LIST="sccache" + +# Default versions (override via environment variables) +SCCACHE_VERSION="${SCCACHE_VERSION:-0.8.1}" + +while [[ $# -gt 0 ]]; do + case $1 in --debug) DEBUG=true; shift ;; *) shift ;; esac +done + +# ============================================================================= +# Pip Installation +# ============================================================================= +install_pip() { + if is_phase_enabled dev; then + [ ! 
-f "$REQ_FILE" ] && { log_warn "dev.txt not found"; return 0; } + set_step "Installing dev requirements" + retry_pip_install -d $DEBUG "$REQ_FILE" "$RETRY_COUNT" || return 1 + log_success "Dev requirements installed" + else + local pkgs=$(get_pip_deps_for_requirements "$REQ_FILE") + [ -z "$pkgs" ] && return 0 + set_step "Installing dev pip packages (override)" + run_cmd -d $DEBUG $(get_pip_cmd) install --root-user-action=ignore $pkgs || return 1 + log_success "Dev pip packages installed" + fi +} + +# ============================================================================= +# Source Dependencies +# ============================================================================= +install_sccache() { + # Check if already installed + if command -v sccache &>/dev/null; then + local ver=$(sccache --version 2>/dev/null | head -n1 | awk '{print $2}') + [ "$ver" = "$SCCACHE_VERSION" ] && { log_info "sccache $SCCACHE_VERSION already installed"; return 0; } + fi + + # Check dependencies + command -v curl &>/dev/null || { log_error "curl not found"; return 1; } + command -v tar &>/dev/null || { log_error "tar not found"; return 1; } + + # Detect architecture + local arch + case "$(uname -m)" in + x86_64) arch="x86_64-unknown-linux-musl" ;; + aarch64) arch="aarch64-unknown-linux-musl" ;; + *) log_error "Unsupported architecture: $(uname -m)"; return 1 ;; + esac + + local url="https://github.com/mozilla/sccache/releases/download/v${SCCACHE_VERSION}/sccache-v${SCCACHE_VERSION}-${arch}.tar.gz" + local tmp_dir="sccache-v${SCCACHE_VERSION}-${arch}" + + set_step "Installing sccache v${SCCACHE_VERSION}" + + if [ "$DEBUG" = true ]; then + log_info "[DRY-RUN] curl -L $url | tar xz" + return 0 + fi + + curl --connect-timeout 120 --max-time 600 --retry 5 --retry-delay 60 -L "$url" | tar xz || { + log_error "Failed to download sccache" + [ -d "$tmp_dir" ] && rm -rf "$tmp_dir" + return 1 + } + + [ ! 
-f "$tmp_dir/sccache" ] && { log_error "sccache binary not found"; rm -rf "$tmp_dir"; return 1; } + mv "$tmp_dir/sccache" /usr/bin/sccache + chmod 755 /usr/bin/sccache + rm -rf "$tmp_dir" + + # Configure for GitHub Actions + [ -n "${GITHUB_ENV:-}" ] && { + echo "SCCACHE_DIR=/root/.cache/sccache" >> "$GITHUB_ENV" + echo "RUSTC_WRAPPER=$(which sccache)" >> "$GITHUB_ENV" + } + + log_success "sccache v${SCCACHE_VERSION} installed" +} + +install_src() { + # Skip if phase disabled and no matching src-deps + is_phase_enabled dev || has_src_deps_for_phase $SRC_DEPS_LIST || return 0 + + should_install_src dev "sccache" && { install_sccache || die "sccache failed"; } +} + +main() { + install_pip || die "Dev pip failed" + install_src +} + +main diff --git a/tools/install/install_system.sh b/tools/install/install_system.sh new file mode 100755 index 000000000..86ad041e8 --- /dev/null +++ b/tools/install/install_system.sh @@ -0,0 +1,332 @@ +#!/bin/bash +# ============================================================================= +# FlagScale System Dependencies Installation +# ============================================================================= +# +# Installs system-level dependencies: apt packages, OpenMPI, Python environment +# Supports multiple package managers: pip, uv (default), conda +# +# Usage: +# ./install_system.sh --platform PLATFORM [OPTIONS] +# +# Examples: +# ./install_system.sh --platform cuda # Basic installation (uv) +# ./install_system.sh --platform cuda --pkg-mgr uv # Use uv package manager +# ./install_system.sh --platform cuda --pkg-mgr conda # Use conda package manager +# ./install_system.sh --platform cuda --pkg-mgr pip # Use pip (system Python) +# ./install_system.sh --platform cuda --no-dev # Skip dev tools +# ============================================================================= + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/utils/utils.sh" + +# 
============================================================================= +# Configuration +# ============================================================================= +INSTALL_DEV=true # Install dev packages by default (use --no-dev to skip) +PLATFORM="${PLATFORM:-}" # Required: use --platform to specify +PKG_MGR="${PKG_MGR:-uv}" # pip, uv, conda (default: uv) +DEBUG=false + +# Default versions (override via environment variables) +PYTHON_VERSION="${PYTHON_VERSION:-3.12}" +UV_VERSION="${UV_VERSION:-0.7.2}" +OPENMPI_VERSION="${OPENMPI_VERSION:-4.1.6}" + +# Root installation directory (single source of truth) +FLAGSCALE_HOME="${FLAGSCALE_HOME:-/opt/flagscale}" + +# Derived paths from FLAGSCALE_HOME +UV_PROJECT_ENVIRONMENT="${UV_PROJECT_ENVIRONMENT:-$FLAGSCALE_HOME/venv}" +FLAGSCALE_CONDA="${FLAGSCALE_CONDA:-$FLAGSCALE_HOME/miniconda3}" +FLAGSCALE_DOWNLOADS="${FLAGSCALE_DOWNLOADS:-$FLAGSCALE_HOME/downloads}" + +# ============================================================================= +# Package Lists +# ============================================================================= +# Core system packages (common across deepspeed, vllm, sglang, megatron) +# Note: Python is installed via install_python (supports uv, conda, pip) +BASE_PACKAGES=" + software-properties-common ca-certificates curl wget sudo + git git-lfs unzip tzdata locales gettext + build-essential cmake ninja-build perl pkg-config file gfortran libopenblas-dev + openssh-client openssh-server + rsync lsof kmod netcat-openbsd psmisc uuid-runtime + net-tools iputils-ping +" + +# InfiniBand/RDMA packages (common for distributed training) +RDMA_PACKAGES=" + libibverbs-dev libibverbs1 librdmacm1 rdma-core + ibverbs-providers infiniband-diags perftest + libnuma-dev libnuma1 numactl +" + +# Libraries for ML frameworks (image, audio, async IO) +# Note: Platform-specific packages (e.g., libcupti-dev) should be in platform install scripts +ML_PACKAGES=" + ffmpeg libsm6 libxext6 libgl1 + libsndfile-dev 
libjpeg-dev libpng-dev + libaio-dev libssl-dev libcurl4-openssl-dev + ccache patchelf +" + +DEV_PACKAGES="vim tmux screen htop iftop iotop gdb less tree" + +# ============================================================================= +# Installation Functions +# ============================================================================= + +# Configure timezone non-interactively to avoid tzdata prompts +configure_timezone() { + local tz="${TZ:-Asia/Shanghai}" + set_step "Configuring timezone ($tz)" + + # Set environment variables for non-interactive installation + export DEBIAN_FRONTEND=noninteractive + export TZ="$tz" + + if [ "$DEBUG" = true ]; then + echo " [dry-run] ln -sf /usr/share/zoneinfo/$tz /etc/localtime" >&2 + echo " [dry-run] echo $tz > /etc/timezone" >&2 + return 0 + fi + + # Pre-configure timezone before apt install + ln -sf "/usr/share/zoneinfo/$tz" /etc/localtime 2>/dev/null || true + echo "$tz" > /etc/timezone 2>/dev/null || true + log_success "Timezone configured" +} + +install_apt_packages() { + set_step "Installing apt packages" + + local packages="$BASE_PACKAGES $RDMA_PACKAGES $ML_PACKAGES" + [ "$INSTALL_DEV" = true ] && packages="$packages $DEV_PACKAGES" + run_cmd -d $DEBUG -m "Updating package lists..." apt-get update + # shellcheck disable=SC2086 + run_cmd -d $DEBUG -m "Installing packages..." apt-get install -y --no-install-recommends $packages + # run_cmd -d $DEBUG -m "Cleaning up..." apt-get clean + # run_cmd -d $DEBUG rm -rf /var/lib/apt/lists/* + log_success "Apt packages done" +} + +install_python_uv() { + set_step "Installing Python ${PYTHON_VERSION} (uv ${UV_VERSION})" + + run_cmd -d $DEBUG -m "Installing uv ${UV_VERSION}..." \ + bash -c "curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh" + run_cmd -d $DEBUG -m "Creating venv at ${UV_PROJECT_ENVIRONMENT}..." \ + "$HOME/.local/bin/uv" venv "${UV_PROJECT_ENVIRONMENT}" --python "${PYTHON_VERSION}" + run_cmd -d $DEBUG -m "Symlink python3..." 
ln -sf "${UV_PROJECT_ENVIRONMENT}/bin/python3" /usr/bin/python3 + run_cmd -d $DEBUG ln -sf "${UV_PROJECT_ENVIRONMENT}/bin/python3-config" /usr/bin/python3-config + run_cmd -d $DEBUG ln -sf "${UV_PROJECT_ENVIRONMENT}/bin/pip" /usr/bin/pip + run_cmd -d $DEBUG ln -sf /usr/bin/python3 /usr/bin/python + log_success "Python ready: ${UV_PROJECT_ENVIRONMENT}" +} + +install_python_conda() { + set_step "Installing Python ${PYTHON_VERSION} (conda)" + + local env_name="${FLAGSCALE_ENV_NAME:-}" + + # Skip if conda already installed at FLAGSCALE_CONDA + if [ -f "${FLAGSCALE_CONDA}/bin/conda" ]; then + log_info "Conda already installed at ${FLAGSCALE_CONDA}" + else + # Create download directory + mkdir -p "$FLAGSCALE_DOWNLOADS" + local conda_installer="$FLAGSCALE_DOWNLOADS/miniconda.sh" + # Download miniconda if not present (cached for future use) + if [ ! -f "$conda_installer" ]; then + run_cmd -d $DEBUG -m "Downloading Miniconda to $FLAGSCALE_DOWNLOADS..." \ + wget -q "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O "$conda_installer" + else + log_info "Using cached miniconda installer" + fi + # Silent install with TOS acceptance + run_cmd -d $DEBUG -m "Installing Miniconda to ${FLAGSCALE_CONDA}..." \ + env ANACONDA_ACCEPT_TOS=yes bash "$conda_installer" -b -u -p "${FLAGSCALE_CONDA}" + fi + + log_info "Configuring conda..." 
+ run_cmd -d $DEBUG env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" init bash + run_cmd -d $DEBUG env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" config --set auto_activate_base false + run_cmd -d $DEBUG env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" config --set channel_priority flexible + run_cmd -d $DEBUG env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" config --set solver classic + + # Create named environment if specified, otherwise install to base + if [ -n "$env_name" ]; then + run_cmd -d $DEBUG -m "Creating conda env: $env_name (python=${PYTHON_VERSION})..." \ + env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" create -y -n "$env_name" "python=${PYTHON_VERSION}" + log_info "Setting up symlinks to $env_name env..." + run_cmd -d $DEBUG ln -sf "${FLAGSCALE_CONDA}/envs/${env_name}/bin/python3" /usr/bin/python3 + run_cmd -d $DEBUG ln -sf "${FLAGSCALE_CONDA}/envs/${env_name}/bin/python3-config" /usr/bin/python3-config + run_cmd -d $DEBUG ln -sf "${FLAGSCALE_CONDA}/envs/${env_name}/bin/pip" /usr/bin/pip + log_success "Conda env '$env_name' ready: ${FLAGSCALE_CONDA}/envs/${env_name}" + else + run_cmd -d $DEBUG -m "Installing Python ${PYTHON_VERSION} to base..." \ + env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" install -y python="${PYTHON_VERSION}" + log_info "Setting up symlinks to base..." 
+ run_cmd -d $DEBUG ln -sf "${FLAGSCALE_CONDA}/bin/python3" /usr/bin/python3 + run_cmd -d $DEBUG ln -sf "${FLAGSCALE_CONDA}/bin/python3-config" /usr/bin/python3-config + run_cmd -d $DEBUG ln -sf "${FLAGSCALE_CONDA}/bin/pip" /usr/bin/pip + log_success "Conda base ready: ${FLAGSCALE_CONDA}" + fi + run_cmd -d $DEBUG ln -sf /usr/bin/python3 /usr/bin/python +} + +install_python_pip() { + set_step "Installing Python ${PYTHON_VERSION} (system pip)" + + run_cmd -d $DEBUG -m "Adding deadsnakes PPA..." add-apt-repository -y ppa:deadsnakes/ppa + run_cmd -d $DEBUG -m "Updating package lists..." apt-get update + run_cmd -d $DEBUG -m "Installing Python ${PYTHON_VERSION}..." apt-get install -y --no-install-recommends \ + "python${PYTHON_VERSION}" "python${PYTHON_VERSION}-dev" \ + "python${PYTHON_VERSION}-venv" python3-pip + log_info "Configuring alternatives..." + run_cmd -d $DEBUG update-alternatives --install /usr/bin/python3 python3 "/usr/bin/python${PYTHON_VERSION}" 1 + run_cmd -d $DEBUG update-alternatives --set python3 "/usr/bin/python${PYTHON_VERSION}" + run_cmd -d $DEBUG ln -sf "/usr/bin/python${PYTHON_VERSION}-config" /usr/bin/python3-config + run_cmd -d $DEBUG ln -sf /usr/bin/python3 /usr/bin/python + run_cmd -d $DEBUG -m "Upgrading pip..." 
python3 -m pip install --root-user-action=ignore --upgrade pip + log_success "System Python ready" +} + +install_python() { + case "$PKG_MGR" in + uv) install_python_uv ;; + conda) install_python_conda ;; + pip) install_python_pip ;; + *) log_error "Unknown pkg manager: $PKG_MGR"; exit 1 ;; + esac +} + +install_openmpi() { + set_step "Installing OpenMPI ${OPENMPI_VERSION}" + + local version="$OPENMPI_VERSION" + local base_version="${version%.*}" + local prefix="/usr/local/openmpi-${version}" + local tarball_url="https://download.open-mpi.org/release/open-mpi/v${base_version}/openmpi-${version}.tar.gz" + + # Download tarball to FLAGSCALE_DOWNLOADS (cached for future use) + mkdir -p "$FLAGSCALE_DOWNLOADS" + local tarball="$FLAGSCALE_DOWNLOADS/openmpi-${version}.tar.gz" + if [ ! -f "$tarball" ]; then + run_cmd -d $DEBUG -m "Downloading OpenMPI ${version} to $FLAGSCALE_DOWNLOADS..." \ + wget -q "$tarball_url" -O "$tarball" + else + log_info "Using cached OpenMPI tarball" + fi + + # Extract and build + run_cmd -d $DEBUG -m "Extracting OpenMPI..." \ + bash -c "cd /tmp && tar xzf '$tarball'" + run_cmd -d $DEBUG -m "Configuring OpenMPI..." \ + bash -c "cd /tmp/openmpi-${version} && ./configure --prefix=${prefix} --quiet" + run_cmd -d $DEBUG -m "Building OpenMPI (may take a while)..." \ + bash -c "cd /tmp/openmpi-${version} && make -j\$(nproc) install" + run_cmd -d $DEBUG -m "Creating symlink..." ln -sf "${prefix}" /usr/local/mpi + run_cmd -d $DEBUG -m "Setting up mpirun wrapper..." 
\ + bash -c "mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && cat > /usr/local/mpi/bin/mpirun << 'WRAPPER' +#!/bin/bash +exec mpirun.real --allow-run-as-root --prefix /usr/local/mpi \"\\\$@\" +WRAPPER +chmod +x /usr/local/mpi/bin/mpirun" + run_cmd -d $DEBUG rm -rf /tmp/openmpi-${version} + log_success "OpenMPI done" +} + +install_env_scripts() { + set_step "Installing environment scripts" + + local profile_dir="/etc/profile.d" + local env_script="$SCRIPT_DIR/$PLATFORM/env.sh" + + if [ -z "$PLATFORM" ]; then + log_error "PLATFORM not set. Use --platform to specify (e.g., --platform cuda)" + exit 1 + fi + + if [ ! -f "$env_script" ]; then + log_error "Environment script not found: $env_script" + exit 1 + fi + + run_cmd -d $DEBUG -m "Installing ${PLATFORM} env..." cp "$env_script" "$profile_dir/flagscale-env.sh" + + run_cmd -d $DEBUG -m "Configuring bash.bashrc..." \ + bash -c 'grep -q "flagscale-env.sh" /etc/bash.bashrc 2>/dev/null || cat >> /etc/bash.bashrc << "BASHRC" + +# FlagScale environment +[ -f /etc/profile.d/flagscale-env.sh ] && . /etc/profile.d/flagscale-env.sh +BASHRC' + log_success "Env scripts done" +} + +# ============================================================================= +# Main +# ============================================================================= +usage() { + cat << EOF +Usage: $0 --platform PLATFORM [OPTIONS] + +Options: + --platform NAME Platform for env scripts (required, e.g., cuda) + --no-dev Skip development tools (vim, tmux, htop, etc.) 
+ --pkg-mgr MGR Package manager: pip, uv, conda (default: uv) + --debug Debug mode: print commands without executing (dry-run) + --help Show this help + +Package Managers: + uv - Fast, modern package manager with venv (default) + conda - Miniconda installation + pip - System Python with pip + +Versions (override via environment variables): + PYTHON_VERSION Python version (default: ${PYTHON_VERSION}) + UV_VERSION uv version (default: ${UV_VERSION}) + OPENMPI_VERSION OpenMPI version (default: ${OPENMPI_VERSION}) + +Environment paths (derived from FLAGSCALE_HOME, override via environment variables): + FLAGSCALE_HOME Root installation directory (default: /opt/flagscale) + UV_PROJECT_ENVIRONMENT uv venv path (default: \$FLAGSCALE_HOME/venv) + FLAGSCALE_CONDA Miniconda path (default: \$FLAGSCALE_HOME/miniconda3) + FLAGSCALE_DOWNLOADS Downloads directory (default: \$FLAGSCALE_HOME/downloads) +EOF +} + +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --no-dev) INSTALL_DEV=false; shift ;; + --platform) PLATFORM="$2"; shift 2 ;; + --pkg-mgr) PKG_MGR="$2"; shift 2 ;; + --debug) DEBUG=true; shift ;; + --help|-h) usage; exit 0 ;; + *) log_error "Unknown option: $1"; exit 1 ;; + esac + done +} + +main() { + parse_args "$@" + + # Validate required parameters + [ -z "$PLATFORM" ] && { log_error "Platform required (use --platform, e.g., --platform cuda)"; usage; exit 1; } + + [ "$DEBUG" = true ] && log_info "Dry-run mode: commands printed, not executed" + + log_info "Python ${PYTHON_VERSION} | ${PKG_MGR} | OpenMPI ${OPENMPI_VERSION}" + + configure_timezone || die "Timezone configuration failed" + install_apt_packages || die "Apt packages installation failed" + install_python || die "Python installation failed" + install_openmpi || die "OpenMPI installation failed" + install_env_scripts || die "Environment scripts installation failed" + + log_success "System setup complete" +} + +main "$@" diff --git a/tools/install/utils/conda_utils.sh b/tools/install/utils/conda_utils.sh 
deleted file mode 100755 index 996132213..000000000 --- a/tools/install/utils/conda_utils.sh +++ /dev/null @@ -1,201 +0,0 @@ -#!/bin/bash -# Conda environment management utilities - -# Source utils for logging -_UTILS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$_UTILS_DIR/utils.sh" - -# Advanced conda activation with auto-detection of installation locations -# This function tries multiple methods to find and activate conda -# Usage: activate_conda [conda_custom_path] -# Returns: 0 on success, 1 on failure -# -# Priority order: -# 0. Use explicitly provided conda path (if provided) -# 1. Check if conda is already in PATH -# 2. Search common conda installation locations -# 3. Use 'which' to find conda dynamically -activate_conda() { - local env_name=$1 - local conda_custom_path=${2:-""} - - # Method 0: Use explicitly provided conda path if available - if [ -n "$conda_custom_path" ]; then - if [ -f "$conda_custom_path/bin/activate" ]; then - echo "🐍 Using provided conda path: $conda_custom_path" - source "$conda_custom_path/bin/activate" "$env_name" - if [ $? -eq 0 ]; then - echo "✅ Successfully activated conda environment: $env_name" - return 0 - fi - else - echo "⚠️ Provided conda path not valid: $conda_custom_path" - echo "Falling back to auto-detection..." - fi - fi - - # Method 1: Check if conda command is already available - if command -v conda &> /dev/null; then - echo "🐍 Found conda in PATH, activating environment: $env_name" - eval "$(conda shell.bash hook)" - conda activate "$env_name" - if [ $? 
-eq 0 ]; then - echo "✅ Successfully activated conda environment: $env_name" - return 0 - fi - fi - - # Method 2: Check common conda installation locations - local conda_paths=( - "/root/miniconda3" - "/root/anaconda3" - "$HOME/miniconda3" - "$HOME/anaconda3" - "/opt/conda" - "/usr/local/miniconda3" - "/usr/local/anaconda3" - ) - - for conda_path in "${conda_paths[@]}"; do - if [ -f "$conda_path/bin/activate" ]; then - echo "🐍 Found conda at $conda_path, activating environment: $env_name" - source "$conda_path/bin/activate" "$env_name" - if [ $? -eq 0 ]; then - echo "✅ Successfully activated conda environment: $env_name" - return 0 - fi - fi - done - - # Method 3: Try to find conda using which - local conda_exe conda_base - if conda_exe=$(which conda 2>/dev/null); then - conda_base=$(dirname "$(dirname "$conda_exe")") - if [ -f "$conda_base/bin/activate" ]; then - echo "🐍 Found conda via which at $conda_base, activating environment: $env_name" - source "$conda_base/bin/activate" "$env_name" - if [ $? 
-eq 0 ]; then - echo "✅ Successfully activated conda environment: $env_name" - return 0 - fi - fi - fi - - echo "❌ Failed to find and activate conda environment: $env_name" - return 1 -} - -# Display conda and Python environment information -# Usage: display_python_info -display_python_info() { - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo " Python Environment Information" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - - if command -v python &> /dev/null; then - echo "Python location: $(which python)" - echo "Python version: $(python --version 2>&1)" - else - echo "⚠️ Python not found in PATH" - fi - - if [ -n "$CONDA_DEFAULT_ENV" ]; then - echo "Conda environment: $CONDA_DEFAULT_ENV" - if command -v conda &> /dev/null; then - local conda_prefix=$(conda info --base 2>/dev/null) - if [ -n "$conda_prefix" ]; then - echo "Conda prefix: $conda_prefix" - fi - fi - else - echo "Conda environment: none" - fi - - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -} - -# Create a new conda environment -# Usage: create_conda_env [python_version] -create_conda_env() { - local env_name=$1 - local python_version=${2:-3.12} - - if ! 
command_exists conda; then - log_error "Conda not found in PATH" - return 1 - fi - - # Check if environment already exists - if conda env list | grep -q "^${env_name} "; then - log_info "Conda environment '$env_name' already exists" - return 0 - fi - - log_step "Creating conda environment: $env_name (Python $python_version)" - if conda create -n "$env_name" python="$python_version" -y; then - log_success "Conda environment '$env_name' created successfully" - return 0 - else - log_error "Failed to create conda environment '$env_name'" - return 1 - fi -} - -# Activate a conda environment (legacy function, kept for backwards compatibility) -# Usage: activate_conda_env -# Note: Use activate_conda for better auto-detection capabilities -activate_conda_env() { - local env_name=$1 - - if ! command_exists conda; then - log_error "Conda not found in PATH" - return 1 - fi - - # Get conda base directory - local conda_base=$(conda info --base) - - if [ ! -f "$conda_base/bin/activate" ]; then - log_error "Conda activate script not found at $conda_base/bin/activate" - return 1 - fi - - log_step "Activating conda environment: $env_name" - source "$conda_base/bin/activate" "$env_name" - - if [ $? -eq 0 ]; then - log_success "Conda environment '$env_name' activated" - log_info "Current environment: $(get_conda_env)" - return 0 - else - log_error "Failed to activate conda environment '$env_name'" - return 1 - fi -} - -# Check if a conda environment exists -# Usage: conda_env_exists -conda_env_exists() { - local env_name=$1 - - if ! command_exists conda; then - return 1 - fi - - if conda env list | grep -q "^${env_name} "; then - return 0 - else - return 1 - fi -} - -# List all conda environments -list_conda_envs() { - if ! 
command_exists conda; then - log_error "Conda not found in PATH" - return 1 - fi - - log_info "Available conda environments:" - conda env list -} diff --git a/tools/install/utils/install-sccache.sh b/tools/install/utils/install-sccache.sh deleted file mode 100644 index cc9fc6a58..000000000 --- a/tools/install/utils/install-sccache.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -# Automate downloading, extracting, and installing sccache - -# Configuration Variables - Editable -# sccache version number; check for latest releases at: -# https://github.com/mozilla/sccache/releases -SCCACHE_VERSION="0.8.1" -SCCACHE_ARCH="x86_64-unknown-linux-musl" -SCCACHE_DOWNLOAD_URL="https://github.com/mozilla/sccache/releases/download/v${SCCACHE_VERSION}/sccache-v${SCCACHE_VERSION}-${SCCACHE_ARCH}.tar.gz" -SCCACHE_INSTALL_PATH="/usr/bin/sccache" - -CURL_OPTS="--connect-timeout 120 --max-time 600 --retry 5 --retry-delay 60 -L" - -SCCACHE_TMP_DIR="sccache-v${SCCACHE_VERSION}-${SCCACHE_ARCH}" - -# Source utils for logging functions -_RETRY_UTILS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$_RETRY_UTILS_DIR/utils.sh" - -if ! command -v curl &> /dev/null; then - log_error "Error: curl command not found. Please install curl first!" - log_step "Installation command reference: Debian/Ubuntu: apt install curl -y ; CentOS/RHEL: yum install curl -y ; Alpine: apk add curl" - exit 1 -fi - -if ! command -v tar &> /dev/null; then - log_error "Error: tar command not found. Please install tar first!" - log_step "Installation command reference: Debian/Ubuntu: apt install tar -y ; CentOS/RHEL: yum install tar -y ; Alpine: apk add tar" - exit 1 -fi - -log_info "Downloading sccache v${SCCACHE_VERSION} (arch: ${SCCACHE_ARCH})..." -# Download archive via curl and pipe directly to tar for extraction -if ! curl ${CURL_OPTS} "${SCCACHE_DOWNLOAD_URL}" | tar xz; then - log_error "Error: Failed to download or extract sccache! Please check your network or download URL." 
- # Clean up partial extraction directory if it exists - [ -d "${SCCACHE_TMP_DIR}" ] && rm -rf "${SCCACHE_TMP_DIR}" - exit 1 -fi - -# Verify that the extracted binary exists -if [ ! -f "${SCCACHE_TMP_DIR}/sccache" ]; then - log_error "Error: Extraction succeeded, but executable ${SCCACHE_TMP_DIR}/sccache not found!" - rm -rf "${SCCACHE_TMP_DIR}" - exit 1 -fi - -log_info "Installing sccache to ${SCCACHE_INSTALL_PATH}..." -mv "${SCCACHE_TMP_DIR}/sccache" "${SCCACHE_INSTALL_PATH}" - -# Set standard executable permissions: rwxr-xr-x -chmod 755 "${SCCACHE_INSTALL_PATH}" - -log_step "Cleaning up temporary extraction directory..." -rm -rf "${SCCACHE_TMP_DIR}" - -log_step "Configuring sccache environment variables..." -# Apply environment variables if running in GitHub Actions -if [ -n "${GITHUB_ENV:-}" ]; then - echo "SCCACHE_DIR=/root/.cache/sccache" >> "${GITHUB_ENV}" - echo "RUSTC_WRAPPER=$(which sccache)" >> "${GITHUB_ENV}" -fi - -log_step "Installation complete! sccache version: " -sccache --version | head -n1 diff --git a/tools/install/utils/load_platform_config.sh b/tools/install/utils/load_platform_config.sh index 1b68537e8..6fec35614 100755 --- a/tools/install/utils/load_platform_config.sh +++ b/tools/install/utils/load_platform_config.sh @@ -23,6 +23,18 @@ load_platform_config() { VOLUMES=$(/usr/local/bin/yq -o=json -I=0 '.container_volumes' "$CONFIG_FILE") CONTAINER_OPTIONS=$(/usr/local/bin/yq -r '.container_options' "$CONFIG_FILE") + # Extract package manager configuration + PKG_MGR=$(/usr/local/bin/yq -r '.pkg_mgr // "uv"' "$CONFIG_FILE") + ENV_PATH=$(/usr/local/bin/yq -r '.env_path // "/opt/venv"' "$CONFIG_FILE") + ENV_NAME_TRAIN=$(/usr/local/bin/yq -r '.env_names.train // "flagscale-train"' "$CONFIG_FILE") + ENV_NAME_INFERENCE=$(/usr/local/bin/yq -r '.env_names.inference // "flagscale-inference"' "$CONFIG_FILE") + ENV_NAME_SERVE=$(/usr/local/bin/yq -r '.env_names.serve // "flagscale-serve"' "$CONFIG_FILE") + ENV_NAME_RL=$(/usr/local/bin/yq -r '.env_names.rl 
// "flagscale-rl"' "$CONFIG_FILE") + + echo "Package manager: $PKG_MGR" + echo "Environment path: $ENV_PATH" + echo "Environment names: train=$ENV_NAME_TRAIN, inference=$ENV_NAME_INFERENCE, serve=$ENV_NAME_SERVE, rl=$ENV_NAME_RL" + # Validate required fields if [ -z "$CI_IMAGE" ] || [ "$CI_IMAGE" = "null" ]; then echo "❌ Error: ci_image not found in $CONFIG_FILE" @@ -85,4 +97,12 @@ load_platform_config() { { echo 'inference_test_matrix<> $GITHUB_OUTPUT { echo 'serve_test_matrix<> $GITHUB_OUTPUT { echo 'rl_test_matrix<> $GITHUB_OUTPUT + + # Output package manager configuration + echo "pkg_mgr=$PKG_MGR" >> $GITHUB_OUTPUT + echo "env_path=$ENV_PATH" >> $GITHUB_OUTPUT + echo "env_name_train=$ENV_NAME_TRAIN" >> $GITHUB_OUTPUT + echo "env_name_inference=$ENV_NAME_INFERENCE" >> $GITHUB_OUTPUT + echo "env_name_serve=$ENV_NAME_SERVE" >> $GITHUB_OUTPUT + echo "env_name_rl=$ENV_NAME_RL" >> $GITHUB_OUTPUT } diff --git a/tools/install/utils/pkg_utils.sh b/tools/install/utils/pkg_utils.sh new file mode 100644 index 000000000..8ee69e2f0 --- /dev/null +++ b/tools/install/utils/pkg_utils.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# ============================================================================= +# Package Manager Utilities +# ============================================================================= +# +# Unified interface for pip/uv/conda package installation. 
+# +# Environment: +# FLAGSCALE_PKG_MGR - "uv", "pip", or "conda" (default: uv) +# FLAGSCALE_CONDA - path to conda installation +# FLAGSCALE_ENV_NAME - conda environment name (optional) +# ============================================================================= + +_PKG_UTILS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$_PKG_UTILS_DIR/utils.sh" + +# ============================================================================= +# Package Manager +# ============================================================================= + +get_pkg_manager() { + echo "${FLAGSCALE_PKG_MGR:-uv}" +} + +# Get the pip command for the current package manager +# Returns the full path to pip for conda environments +get_pip_cmd() { + local manager=$(get_pkg_manager) + case "$manager" in + conda) + local conda_path="${FLAGSCALE_CONDA:-/opt/flagscale/miniconda3}" + local env_name="${FLAGSCALE_ENV_NAME:-}" + if [ -n "$env_name" ]; then + echo "$conda_path/envs/$env_name/bin/pip" + else + echo "$conda_path/bin/pip" + fi + ;; + *) + echo "pip" + ;; + esac +} + +# ============================================================================= +# Package Checks +# ============================================================================= + +is_package_installed() { + local package=$1 + local normalized=$(echo "$package" | tr '-' '_') + local pip_cmd=$(get_pip_cmd) + $pip_cmd show "$normalized" &>/dev/null || $pip_cmd show "$package" &>/dev/null +} + +get_package_version() { + local package=$1 + local normalized=$(echo "$package" | tr '-' '_') + local pip_cmd=$(get_pip_cmd) + $pip_cmd show "$normalized" 2>/dev/null | grep -i "^Version:" | awk '{print $2}' || \ + $pip_cmd show "$package" 2>/dev/null | grep -i "^Version:" | awk '{print $2}' +} + +# Check if should build from source (not installed or FLAGSCALE_FORCE_BUILD=true) +should_build_package() { + local package=$1 + + if [ "${FLAGSCALE_FORCE_BUILD:-false}" = true ]; then + log_info "Force build enabled, will build 
$package" + return 0 + fi + + if is_package_installed "$package"; then + local version=$(get_package_version "$package") + log_info "$package already installed (version: ${version:-unknown}), skipping" + return 1 + fi + return 0 +} + +# ============================================================================= +# Phase Control +# ============================================================================= +# Environment variables (from install.sh): +# FLAGSCALE_INSTALL_SYSTEM/DEV/BASE/TASK - true/false +# FLAGSCALE_PIP_DEPS - comma-separated pip packages +# FLAGSCALE_SRC_DEPS - comma-separated source deps + +is_phase_enabled() { + local phase="$1" + case "$phase" in + system) [ "${FLAGSCALE_INSTALL_SYSTEM:-true}" = true ] ;; + dev) [ "${FLAGSCALE_INSTALL_DEV:-true}" = true ] ;; + base) [ "${FLAGSCALE_INSTALL_BASE:-true}" = true ] ;; + task) [ "${FLAGSCALE_INSTALL_TASK:-true}" = true ] ;; + *) return 1 ;; + esac +} + +is_in_override() { + local type="$1" item="$2" list="" + case "$type" in + pip) list="${FLAGSCALE_PIP_DEPS:-}" ;; + src) list="${FLAGSCALE_SRC_DEPS:-}" ;; + *) return 1 ;; + esac + [ -n "$list" ] && echo ",$list," | grep -q ",$item," +} + +# Should install source dep? +# Usage: should_install_src +should_install_src() { + local phase="$1" item="$2" + is_phase_enabled "$phase" && return 0 + is_in_override src "$item" && return 0 + return 1 +} + +# ============================================================================= +# Phase-Scoped Filtering +# ============================================================================= + +# Get pip-deps that match a requirements file +get_pip_deps_for_requirements() { + local req_file="$1" + local pip_deps="${FLAGSCALE_PIP_DEPS:-}" + local matched="" + + [ -z "$pip_deps" ] || [ ! 
-f "$req_file" ] && return 0 + + for pkg in $(echo "$pip_deps" | tr ',' ' '); do + grep -qiE "^${pkg}([=<>!~\[]|$)" "$req_file" 2>/dev/null && matched="$matched $pkg" + done + echo "$matched" | xargs +} + +# Check if any src-deps match the valid list +has_src_deps_for_phase() { + local valid_deps="$*" + local src_deps="${FLAGSCALE_SRC_DEPS:-}" + [ -z "$src_deps" ] && return 1 + + for dep in $(echo "$src_deps" | tr ',' ' '); do + for valid in $valid_deps; do + [ "$dep" = "$valid" ] && return 0 + done + done + return 1 +} diff --git a/tools/install/utils/pyenv_utils.sh b/tools/install/utils/pyenv_utils.sh new file mode 100644 index 000000000..3fe086bfb --- /dev/null +++ b/tools/install/utils/pyenv_utils.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# ============================================================================= +# Python Environment Utilities +# ============================================================================= +# +# Environment activation for conda and uv. +# Used by GitHub workflows to activate pre-installed environments. 
+# +# Usage: +# source pyenv_utils.sh +# activate_conda "env_name" "/path/to/conda" +# activate_uv_env "/path/to/venv" +# +# With debug mode (optional): +# activate_conda -d true "env_name" "/path/to/conda" +# activate_uv_env -d true "/path/to/venv" +# ============================================================================= + +_PYENV_UTILS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$_PYENV_UTILS_DIR/utils.sh" + +# ============================================================================= +# UV Environment +# ============================================================================= + +# Activate uv virtual environment +# Usage: activate_uv_env [-d debug] [venv_path] +activate_uv_env() { + local debug=false + [[ "$1" == "-d" ]] && { debug="$2"; shift 2; } + + local venv_path=${1:-${UV_PROJECT_ENVIRONMENT:-"/opt/venv"}} + + if [ "$debug" = true ]; then + log_info "[dry-run] Activate UV env: $venv_path" + return 0 + fi + + [ ! -d "$venv_path" ] && { log_error "Venv not found: $venv_path"; return 1; } + [ ! 
-f "$venv_path/bin/activate" ] && { log_error "Invalid venv: $venv_path"; return 1; } + + source "$venv_path/bin/activate" + export UV_PROJECT_ENVIRONMENT="$venv_path" + log_info "Activated UV env: $venv_path" + return 0 +} + +# ============================================================================= +# Conda Environment +# ============================================================================= + +# Check if conda environment exists +conda_env_exists() { + local env_name=$1 + local conda_path=$2 + CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "$conda_path/bin/conda" env list 2>/dev/null | grep -q "^${env_name} " || \ + [ -d "$conda_path/envs/$env_name" ] +} + +# Create conda environment if it doesn't exist +# Usage: create_conda_env [-d debug] [python_version] +create_conda_env() { + local debug=false + [[ "$1" == "-d" ]] && { debug="$2"; shift 2; } + + local env_name=$1 + local conda_path=$2 + local python_version=${3:-"3.12"} + + if [ "$debug" = true ]; then + log_info "[dry-run] Create conda env: $env_name (python=$python_version)" + return 0 + fi + + if conda_env_exists "$env_name" "$conda_path"; then + log_info "Conda env '$env_name' already exists" + return 0 + fi + + # Configure solver for non-interactive use + CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "$conda_path/bin/conda" config --set solver classic >/dev/null 2>&1 || true + + log_info "Creating conda env: $env_name (python=$python_version)" + CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "$conda_path/bin/conda" create -y -n "$env_name" "python=$python_version" || { + log_error "Failed to create conda env: $env_name" + return 1 + } + log_success "Conda env '$env_name' created" + return 0 +} + +# Activate conda environment (creates if doesn't exist) +# Usage: activate_conda [-d debug] [python_version] +activate_conda() { + local debug=false + [[ "$1" == "-d" ]] && { debug="$2"; shift 2; } + + local env_name=$1 + local conda_path=${2:-""} + local python_version=${3:-"3.12"} + + [ -z 
"$conda_path" ] && { log_error "conda_path required"; return 1; } + + if [ "$debug" = true ]; then + log_info "[dry-run] Activate conda env: $env_name at $conda_path" + return 0 + fi + + [ ! -f "$conda_path/etc/profile.d/conda.sh" ] && { log_error "Invalid conda: $conda_path"; return 1; } + + source "$conda_path/etc/profile.d/conda.sh" + create_conda_env "$env_name" "$conda_path" "$python_version" || return 1 + + log_info "Activating conda env: $env_name" + conda activate "$env_name" || { log_error "Failed: conda activate $env_name"; return 1; } + return 0 +} diff --git a/tools/install/utils/retry_utils.sh b/tools/install/utils/retry_utils.sh index 0a1930d0d..3534baa42 100755 --- a/tools/install/utils/retry_utils.sh +++ b/tools/install/utils/retry_utils.sh @@ -1,81 +1,90 @@ #!/bin/bash -# Retry utilities for network-dependent operations -# Extracted from .github/workflows/scripts/retry_functions.sh +# ============================================================================= +# Retry Utilities +# ============================================================================= +# +# Retry wrappers for network-dependent operations (pip install, git clone). 
+# =============================================================================
 
-# Source utils for logging functions
+# Source utils for logging functions and package manager
 _RETRY_UTILS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "$_RETRY_UTILS_DIR/utils.sh"
+source "$_RETRY_UTILS_DIR/pkg_utils.sh"
 
-# Retry a single command with a specified number of attempts
-# Usage: retry <retry_count> <command>
+# Retry command with specified attempts
+# Usage: retry -d <debug> <retries> <command>
 retry() {
+    local debug=false
+    if [[ "$1" == "-d" ]]; then
+        debug="$2"; shift 2
+    fi
+
     local retries=$1
     shift
     local cmd="$*"
     local count=0
 
+    if [ "$debug" = true ]; then
+        echo "  [dry-run] $cmd" >&2
+        return 0
+    fi
+
     until eval "$cmd"; do
         count=$((count + 1))
-        if [ $count -ge $retries ]; then
-            log_error "Command failed after $retries retries: $cmd"
-            return 1
-        fi
-        log_warn "Command failed (attempt $count/$retries), retrying in 5 seconds..."
+        [ $count -ge $retries ] && { log_error "Failed after $retries attempts"; return 1; }
+        log_warn "Retry $count/$retries in 5s..."
         sleep 5
     done
-
-    if [ $count -gt 0 ]; then
-        log_success "Command succeeded after $count retries: $cmd"
-    fi
-
-    return 0
-}
-
-# Retry a batch of commands sequentially
-# Usage: retry_commands <retry_count> <cmd1> <cmd2> ...
-retry_commands() {
-    local retries=$1
-    shift
-    local -a cmds=("$@")
-
-    log_info "Retry config: max retries = $retries"
-    log_info "Total commands to execute: ${#cmds[@]}"
-
-    for cmd in "${cmds[@]}"; do
-        log_info "Executing command: $cmd"
-        retry $retries "$cmd"
-        local cmd_exit_code=$?
-        if [ $cmd_exit_code -ne 0 ]; then
-            log_error "Batch commands failed at: $cmd"
-            return $cmd_exit_code
-        fi
-    done
-
-    log_success "All batch commands executed successfully!"
     return 0
 }
 
-# Retry pip install with a requirements file
-# Usage: retry_pip_install <requirements_file> [retry_count]
+# Retry pip/uv install from requirements file
+# Usage: retry_pip_install -d <debug> <requirements_file> [retries]
 retry_pip_install() {
+    local debug=false
+    if [[ "$1" == "-d" ]]; then
+        debug="$2"; shift 2
+    fi
+
     local requirements_file=$1
     local retries=${2:-3}
+    local manager=$(get_pkg_manager)
 
-    if [ ! -f "$requirements_file" ]; then
-        log_error "Requirements file not found: $requirements_file"
-        return 1
-    fi
+    [ ! -f "$requirements_file" ] && [ "$debug" != true ] && { log_error "Not found: $requirements_file"; return 1; }
 
-    log_info "Installing from $requirements_file with $retries retries"
-    retry $retries "pip install -r '$requirements_file'"
+    log_info "Installing $(basename "$requirements_file")..."
+    local pip_cmd=$(get_pip_cmd)
+    case "$manager" in
+        uv) retry -d $debug $retries "uv pip install -r '$requirements_file'" ;;
+        *)  retry -d $debug $retries "$pip_cmd install --root-user-action=ignore -r '$requirements_file'" ;;
+    esac
 }
 
-# Retry git clone operation
-# Usage: retry_git_clone <repo_url> <target_dir> [retry_count]
+# Retry git clone with options
+# Usage: retry_git_clone -d <debug> [--branch BRANCH] [--depth N] [--recursive] <repo_url> <target_dir> [retries]
 retry_git_clone() {
+    local debug=false branch="" depth="" recursive=""
+
+    while [[ "$1" == -* ]]; do
+        case "$1" in
+            -d) debug="$2"; shift 2 ;;
+            --branch) branch="$2"; shift 2 ;;
+            --depth) depth="$2"; shift 2 ;;
+            --recursive) recursive="--recursive"; shift ;;
+            *) break ;;
+        esac
+    done
+
     local repo_url=$1
     local target_dir=$2
    local retries=${3:-3}
 
-    log_info "Cloning $repo_url to $target_dir with $retries retries"
-    retry $retries "rm -rf '$target_dir' && git clone '$repo_url' '$target_dir'"
+    # Build clone options
+    local opts=""
+    [ -n "$branch" ] && opts="$opts --branch $branch"
+    [ -n "$depth" ] && opts="$opts --depth $depth"
+    [ -n "$recursive" ] && opts="$opts $recursive"
+
+    log_info "Cloning $(basename "$repo_url" .git)"
+    retry -d $debug $retries "rm -rf '$target_dir' && git clone$opts '$repo_url' '$target_dir'"
 }
diff --git a/tools/install/utils/utils.sh b/tools/install/utils/utils.sh
index 0d0d3a3ef..e435dfa32 100755
--- a/tools/install/utils/utils.sh
+++ b/tools/install/utils/utils.sh
@@ -1,102 +1,86 @@
 #!/bin/bash
-# Common utility functions for install scripts
-
-# Logging functions with emojis for better visibility
-log_info() {
-    echo -e "\033[36m🔍 [INFO] $*\033[0m"
+# =============================================================================
+# Common Utilities
+# =============================================================================
+#
+# Core utility functions: logging, error handling, command execution.
+# =============================================================================
+
+# =============================================================================
+# Error Handling
+# =============================================================================
+
+CURRENT_STEP=""
+
+# Print error message and exit
+die() {
+    local msg="$1"
+    local code="${2:-1}"
+
+    echo "" >&2
+    echo "══════════════════════════════════════════════════════════════════" >&2
+    echo "  ✗ INSTALLATION FAILED" >&2
+    echo "══════════════════════════════════════════════════════════════════" >&2
+    [ -n "$CURRENT_STEP" ] && echo "  Step: $CURRENT_STEP" >&2
+    echo "  Error: $msg" >&2
+    echo "  Exit code: $code" >&2
+    echo "══════════════════════════════════════════════════════════════════" >&2
+    exit "$code"
 }
 
-log_warn() {
-    echo -e "\033[33m⚠️ [WARN] $*\033[0m" >&2
+set_step() {
+    CURRENT_STEP="$1"
+    log_step "$1"
 }
 
-log_error() {
-    echo -e "\033[31m❌ [ERROR] $*\033[0m" >&2
+# =============================================================================
+# Command Execution
+# =============================================================================
+
+# Run command or print in debug mode
+# Usage: run_cmd -d <debug> [-m "message"] command args...
+run_cmd() { + local msg="" debug="false" + while [[ "$1" == -* ]]; do + case "$1" in + -m) msg="$2"; shift 2 ;; + -d) debug="$2"; shift 2 ;; + *) break ;; + esac + done + + [ -n "$msg" ] && log_info "$msg" + + if [ "$debug" = true ]; then + echo " [dry-run] $*" >&2 + return 0 + fi + "$@" } -log_success() { - echo -e "\033[32m✅ [SUCCESS] $*\033[0m" -} +# ============================================================================= +# Logging +# ============================================================================= -log_step() { - echo -e "\033[35m🔧 [STEP] $*\033[0m" +log_info() { echo " · $*" >&2; } +log_warn() { echo " ! $*" >&2; } +log_error() { echo " ✗ $*" >&2; } +log_success() { echo " ✓ $*" >&2; } +log_step() { echo "→ $*" >&2; } + +print_header() { + echo "" >&2 + echo "══════════════════════════════════════════════════════════════════" >&2 + echo " $*" >&2 + echo "══════════════════════════════════════════════════════════════════" >&2 } -# Get the project root directory +# ============================================================================= +# Helpers +# ============================================================================= + get_project_root() { local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$script_dir/../../.." pwd } - -# Check if Python version meets minimum requirement -# Usage: check_python_version [min_version] -# Example: check_python_version 3.12 -check_python_version() { - local min_version=${1:-"3.10"} - - if ! command -v python &> /dev/null; then - log_error "Python not found in PATH" - return 1 - fi - - local python_version - python_version=$(python --version 2>&1 | awk '{print $2}') - log_info "Found Python version: $python_version" - - # Parse version components - local py_major py_minor min_major min_minor - py_major=$(echo "$python_version" | cut -d. -f1) - py_minor=$(echo "$python_version" | cut -d. -f2) - min_major=$(echo "$min_version" | cut -d. -f1) - min_minor=$(echo "$min_version" | cut -d. 
-f2) - - # Compare versions: major must match or exceed, then check minor - if [ "$py_major" -lt "$min_major" ]; then - log_error "Python $min_version+ required, found $python_version" - return 1 - elif [ "$py_major" -eq "$min_major" ] && [ "$py_minor" -lt "$min_minor" ]; then - log_error "Python $min_version+ required, found $python_version" - return 1 - fi - - log_success "Python version check passed (>= $min_version)" - return 0 -} - -# Check if we're in a conda environment -is_conda_env() { - if [ -n "$CONDA_DEFAULT_ENV" ]; then - return 0 - else - return 1 - fi -} - -# Get current conda environment name -get_conda_env() { - if is_conda_env; then - echo "$CONDA_DEFAULT_ENV" - else - echo "none" - fi -} - -# Check if a command exists -command_exists() { - command -v "$1" &> /dev/null -} - -# Print a separator line for better output formatting -print_separator() { - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -} - -# Print a section header -print_header() { - echo "" - print_separator - echo " $*" - print_separator - echo "" -}