Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/configs/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ display_name: "CUDA Tests"

# Docker image for this hardware
ci_image: localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.1-time2510131515
ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12-20260224162355
ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12-20260225234600

# Runner labels for this hardware
runner_labels:
Expand Down
78 changes: 40 additions & 38 deletions .github/workflows/all_tests_common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ jobs:
shell: bash
runs-on: ubuntu-latest
outputs:
ci_image: ${{ steps.config.outputs.ci_image }}
# ci_image: ${{ steps.config.outputs.ci_image }}
ci_train_image: ${{ steps.config.outputs.ci_train_image }}
ci_inference_image: ${{ steps.config.outputs.ci_inference_image }}
runs_on: ${{ steps.config.outputs.runs_on }}
container_volumes: ${{ steps.config.outputs.container_volumes }}
container_options: ${{ steps.config.outputs.container_options }}
Expand Down Expand Up @@ -108,7 +110,7 @@ jobs:
with:
platform: ${{ inputs.platform }}
device: ${{ matrix.device }}
image: ${{ needs.checkout_and_config.outputs.ci_image }}
image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
container_options: ${{ needs.checkout_and_config.outputs.container_options }}
Expand All @@ -126,7 +128,7 @@ jobs:
with:
platform: ${{ inputs.platform }}
test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }}
image: ${{ needs.checkout_and_config.outputs.ci_image }}
image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
container_options: ${{ needs.checkout_and_config.outputs.container_options }}
Expand All @@ -144,7 +146,7 @@ jobs:
with:
platform: ${{ inputs.platform }}
test_matrix: ${{ needs.checkout_and_config.outputs.hetero_train_test_matrix }}
image: ${{ needs.checkout_and_config.outputs.ci_image }}
image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
container_options: ${{ needs.checkout_and_config.outputs.container_options }}
Expand All @@ -154,41 +156,41 @@ jobs:
env_path: ${{ needs.checkout_and_config.outputs.env_path }}

# NOTE: The rl functional tests are temporarily disabled (inference and serve are enabled below)
# functional_tests_inference:
# needs:
# - checkout_and_config
# - unit_tests
# if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null
# uses: ./.github/workflows/functional_tests_inference.yml
# with:
# platform: ${{ inputs.platform }}
# test_matrix: ${{ needs.checkout_and_config.outputs.inference_test_matrix }}
# image: ${{ needs.checkout_and_config.outputs.ci_image }}
# runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
# container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
# container_options: ${{ needs.checkout_and_config.outputs.container_options }}
# source_artifact: flagscale-source-${{ github.sha }}
# pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
# env_name: ${{ needs.checkout_and_config.outputs.env_name_inference }}
# env_path: ${{ needs.checkout_and_config.outputs.env_path }}
functional_tests_inference:
needs:
- checkout_and_config
- unit_tests
if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null
uses: ./.github/workflows/functional_tests_inference.yml
with:
platform: ${{ inputs.platform }}
test_matrix: ${{ needs.checkout_and_config.outputs.inference_test_matrix }}
image: ${{ needs.checkout_and_config.outputs.ci_inference_image }}
runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
container_options: ${{ needs.checkout_and_config.outputs.container_options }}
source_artifact: flagscale-source-${{ github.sha }}
pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
env_name: ${{ needs.checkout_and_config.outputs.env_name_inference }}
env_path: ${{ needs.checkout_and_config.outputs.env_path }}

# functional_tests_serve:
# needs:
# - checkout_and_config
# - unit_tests
# if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null
# uses: ./.github/workflows/functional_tests_serve.yml
# with:
# platform: ${{ inputs.platform }}
# test_matrix: ${{ needs.checkout_and_config.outputs.serve_test_matrix }}
# image: ${{ needs.checkout_and_config.outputs.ci_image }}
# runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
# container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
# container_options: ${{ needs.checkout_and_config.outputs.container_options }}
# source_artifact: flagscale-source-${{ github.sha }}
# pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
# env_name: ${{ needs.checkout_and_config.outputs.env_name_serve }}
# env_path: ${{ needs.checkout_and_config.outputs.env_path }}
functional_tests_serve:
needs:
- checkout_and_config
- unit_tests
if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null
uses: ./.github/workflows/functional_tests_serve.yml
with:
platform: ${{ inputs.platform }}
test_matrix: ${{ needs.checkout_and_config.outputs.serve_test_matrix }}
image: ${{ needs.checkout_and_config.outputs.ci_inference_image }}
runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
container_options: ${{ needs.checkout_and_config.outputs.container_options }}
source_artifact: flagscale-source-${{ github.sha }}
pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
env_name: ${{ needs.checkout_and_config.outputs.env_name_serve }}
env_path: ${{ needs.checkout_and_config.outputs.env_path }}

# functional_tests_rl:
# needs:
Expand Down
11 changes: 10 additions & 1 deletion .github/workflows/functional_tests_hetero_train.yml
Original file line number Diff line number Diff line change
Expand Up @@ -237,4 +237,13 @@ jobs:

echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
exit $exit_code
timeout-minutes: 15
timeout-minutes: 30

- name: Upload Functional Test Logs
if: always() && steps.functional_test.outcome == 'failure'
uses: actions/upload-artifact@v4
with:
name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
retention-days: 7
if-no-files-found: warn
4 changes: 2 additions & 2 deletions .github/workflows/functional_tests_inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
run:
shell: bash
env:
PROJECT_ROOT: /workspace/FlagScale
PROJECT_ROOT: /tmp/FlagScale
runs-on: ${{ fromJson(inputs.runs_on) }}
strategy:
fail-fast: false
Expand Down Expand Up @@ -231,6 +231,6 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/results_test
path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
retention-days: 7
if-no-files-found: warn
9 changes: 9 additions & 0 deletions .github/workflows/functional_tests_rl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,12 @@ jobs:
echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
exit $exit_code
timeout-minutes: 15

- name: Upload Functional Test Logs
if: always() && steps.functional_test.outcome == 'failure'
uses: actions/upload-artifact@v4
with:
name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
retention-days: 7
if-no-files-found: warn
4 changes: 2 additions & 2 deletions .github/workflows/functional_tests_serve.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
run:
shell: bash
env:
PROJECT_ROOT: /workspace/FlagScale
PROJECT_ROOT: ${{ github.workspace }}
runs-on: ${{ fromJson(inputs.runs_on) }}
strategy:
fail-fast: false
Expand Down Expand Up @@ -231,6 +231,6 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/results_test
path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
retention-days: 7
if-no-files-found: warn
50 changes: 29 additions & 21 deletions .github/workflows/functional_tests_train.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,26 +147,25 @@ jobs:

echo "Python location: $(which python)"
echo "Python version: $(python --version)"

# Install task source dependencies (pip deps are pre-installed in the env)
echo "Installing task source dependencies..."

# Derive install-dir from env_path (e.g., /root/miniconda3 -> /root)
INSTALL_DIR=""
if [ "$PKG_MGR" = "conda" ] && [ -n "$ENV_PATH" ]; then
INSTALL_DIR=$(dirname "$ENV_PATH")
fi

# Only install Megatron-LM source dep (pip deps are pre-installed in Docker image)
./tools/install/install.sh \
--platform ${{ inputs.platform }} \
--task train \
--pkg-mgr "$PKG_MGR" \
${ENV_NAME:+--env-name "$ENV_NAME"} \
${INSTALL_DIR:+--install-dir "$INSTALL_DIR"} \
--no-system --no-dev --no-base --no-task \
--src-deps megatron-lm \
--retry-count 3
pip install sentencepiece==0.2.1 transformers==4.57.6 tiktoken==0.12.0
# # Install task source dependencies (pip deps are pre-installed in the env)
# echo "Installing task source dependencies..."

# # Derive install-dir from env_path (e.g., /root/miniconda3 -> /root)
# INSTALL_DIR=""
# if [ "$PKG_MGR" = "conda" ] && [ -n "$ENV_PATH" ]; then
# INSTALL_DIR=$(dirname "$ENV_PATH")
# fi

# # Only install Megatron-LM source dep (pip deps are pre-installed in Docker image)
# ./tools/install/install.sh \
# --platform ${{ inputs.platform }} \
# --task train \
# --pkg-mgr "$PKG_MGR" \
# ${ENV_NAME:+--env-name "$ENV_NAME"} \
# ${INSTALL_DIR:+--install-dir "$INSTALL_DIR"} \
# --no-system --no-dev --no-base --no-task \
# --retry-count 3
timeout-minutes: 30

- name: Run functional tests
Expand Down Expand Up @@ -237,4 +236,13 @@ jobs:

echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
exit $exit_code
timeout-minutes: 15
timeout-minutes: 30

- name: Upload Functional Test Logs
if: always() && steps.functional_test.outcome == 'failure'
uses: actions/upload-artifact@v4
with:
name: functional_tests-logs-${{ github.run_id }}-${{ matrix.test_config.task }}-${{ matrix.test_config.model }}
path: tests/functional_tests/${{ matrix.test_config.task }}/${{ matrix.test_config.model }}/test_results
retention-days: 7
if-no-files-found: warn
13 changes: 13 additions & 0 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
# ./docker/build.sh --platform cuda
# ./docker/build.sh --platform cuda --task train
# ./docker/build.sh --platform cuda --task train --target dev
# ./docker/build.sh --platform cuda --task train --target dev --build-arg PKG_MGR=conda


set -euo pipefail

Expand Down Expand Up @@ -44,6 +46,7 @@ TASK=""
TARGET="dev"
TAG_PREFIX="flagscale"
NO_CACHE=false
BUILD_ARGS=()

# PyPI index URLs (optional, for custom mirrors)
PIP_INDEX_URL="${PIP_INDEX_URL:-}"
Expand Down Expand Up @@ -105,6 +108,7 @@ OPTIONS:
--tag-prefix PREFIX Image tag prefix (default: flagscale)
--index-url URL PyPI index URL (for custom mirrors)
--extra-index-url URL Extra PyPI index URL
--build-arg K=V Pass build-arg to docker (can be repeated)
--no-cache Build without cache
--help Show this help message

Expand All @@ -119,6 +123,7 @@ EXAMPLES:
$0 --platform cuda
$0 --platform cuda --task train
$0 --platform cuda --task train --target dev
$0 --platform cuda --task train --target dev --build-arg PKG_MGR=conda
CUDA_VERSION=12.4.0 $0 --platform cuda --task train

EOF
Expand All @@ -136,6 +141,7 @@ parse_args() {
--tag-prefix) TAG_PREFIX="$2"; shift 2 ;;
--index-url) PIP_INDEX_URL="$2"; shift 2 ;;
--extra-index-url) PIP_EXTRA_INDEX_URL="$2"; shift 2 ;;
--build-arg) BUILD_ARGS+=("$2"); shift 2 ;;
--no-cache) NO_CACHE=true; shift ;;
--help|-h) usage; exit 0 ;;
*)
Expand Down Expand Up @@ -165,6 +171,9 @@ get_image_tag() {
# Add python version
tag="${tag}-py${PYTHON_VERSION}"

# Add timestamp
tag="${tag}-$(date +%Y%m%d%H%M%S)"

echo "$tag"
}

Expand Down Expand Up @@ -222,6 +231,10 @@ build_image() {
fi

[ "$NO_CACHE" = true ] && build_cmd="$build_cmd --no-cache"
for arg in "${BUILD_ARGS[@]}"; do
log_info "Build-arg: $arg"
build_cmd="$build_cmd --build-arg \"$arg\""
done
build_cmd="$build_cmd $PROJECT_ROOT"

log_info "Running: $build_cmd"
Expand Down
35 changes: 32 additions & 3 deletions docker/cuda/Dockerfile.all
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,39 @@ ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/l

WORKDIR /workspace

# =============================================================================
# ENV STAGE - Create the uv venv or conda env if it does not exist yet (system pip needs none)
# =============================================================================
FROM base AS env

ARG PKG_MGR
ARG FLAGSCALE_HOME=/opt/flagscale
ARG PYTHON_VERSION=3.12

RUN . /etc/profile.d/flagscale-env.sh && \
if [ "$PKG_MGR" = "uv" ]; then \
if [ ! -f "${FLAGSCALE_HOME}/venv/bin/python3" ]; then \
echo "[ENV] Creating uv venv at ${FLAGSCALE_HOME}/venv (python=${PYTHON_VERSION})"; \
"$HOME/.local/bin/uv" venv "${FLAGSCALE_HOME}/venv" --python "${PYTHON_VERSION}"; \
else \
echo "[ENV] Found uv venv at ${FLAGSCALE_HOME}/venv"; \
fi; \
elif [ "$PKG_MGR" = "conda" ]; then \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the point of having conda or miniconda inside a container?
For miniconda, the base installation is approximately 250 MB to 900 MB, while for venv, the footprint is typically 5-10 MB.
Are we planning to install multiple Python versions in the same container image or something?

CONDA_ENV_NAME=flagscale-all; \
if [ -d "${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}" ]; then \
echo "[ENV] Found conda env ${CONDA_ENV_NAME} at ${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}"; \
else \
echo "[ENV] Creating conda env ${CONDA_ENV_NAME} (python=${PYTHON_VERSION})"; \
env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" create -y -n "${CONDA_ENV_NAME}" "python=${PYTHON_VERSION}"; \
fi; \
else \
echo "[ENV] Using system pip (no dedicated env to create)"; \
fi

# =============================================================================
# DEPS STAGE - Install all dependencies using install folder
# =============================================================================
FROM base AS deps
FROM env AS deps

ARG PYTORCH_INDEX
ARG PKG_MGR
Expand Down Expand Up @@ -153,9 +182,9 @@ WORKDIR /workspace/FlagScale
CMD ["/bin/bash"]

# =============================================================================
# RELEASE STAGE - Production image (uses same base for consistency)
# RELEASE STAGE - Production image (uses same env for consistency)
# =============================================================================
FROM base AS release
FROM env AS release

ARG PYTORCH_INDEX
ARG PKG_MGR
Expand Down
Loading
Loading