Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
# ./docker/build.sh --platform cuda
# ./docker/build.sh --platform cuda --task train
# ./docker/build.sh --platform cuda --task train --target dev
# ./docker/build.sh --platform cuda --task train --target dev --build-arg PKG_MGR=conda


set -euo pipefail

Expand Down Expand Up @@ -44,6 +46,7 @@ TASK=""
TARGET="dev"
TAG_PREFIX="flagscale"
NO_CACHE=false
BUILD_ARGS=()

# PyPI index URLs (optional, for custom mirrors)
PIP_INDEX_URL="${PIP_INDEX_URL:-}"
Expand Down Expand Up @@ -105,6 +108,7 @@ OPTIONS:
--tag-prefix PREFIX Image tag prefix (default: flagscale)
--index-url URL PyPI index URL (for custom mirrors)
--extra-index-url URL Extra PyPI index URL
--build-arg K=V Pass build-arg to docker (can be repeated)
--no-cache Build without cache
--help Show this help message

Expand All @@ -119,6 +123,7 @@ EXAMPLES:
$0 --platform cuda
$0 --platform cuda --task train
$0 --platform cuda --task train --target dev
$0 --platform cuda --task train --target dev --build-arg PKG_MGR=conda
CUDA_VERSION=12.4.0 $0 --platform cuda --task train

EOF
Expand All @@ -136,6 +141,7 @@ parse_args() {
--tag-prefix) TAG_PREFIX="$2"; shift 2 ;;
--index-url) PIP_INDEX_URL="$2"; shift 2 ;;
--extra-index-url) PIP_EXTRA_INDEX_URL="$2"; shift 2 ;;
--build-arg) BUILD_ARGS+=("$2"); shift 2 ;;
--no-cache) NO_CACHE=true; shift ;;
--help|-h) usage; exit 0 ;;
*)
Expand Down Expand Up @@ -165,6 +171,9 @@ get_image_tag() {
# Add python version
tag="${tag}-py${PYTHON_VERSION}"

# Add timestamp
tag="${tag}-$(date +%Y%m%d%H%M%S)"

echo "$tag"
}

Expand Down Expand Up @@ -222,6 +231,10 @@ build_image() {
fi

[ "$NO_CACHE" = true ] && build_cmd="$build_cmd --no-cache"
for arg in "${BUILD_ARGS[@]}"; do
log_info "Build-arg: $arg"
build_cmd="$build_cmd --build-arg \"$arg\""
done
build_cmd="$build_cmd $PROJECT_ROOT"

log_info "Running: $build_cmd"
Expand Down
35 changes: 32 additions & 3 deletions docker/cuda/Dockerfile.all
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,39 @@ ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/l

WORKDIR /workspace

# =============================================================================
# ENV STAGE - Ensure uv/conda/pip environments
# =============================================================================
FROM base AS env

ARG PKG_MGR
ARG FLAGSCALE_HOME=/opt/flagscale
ARG PYTHON_VERSION=3.12

RUN . /etc/profile.d/flagscale-env.sh && \
if [ "$PKG_MGR" = "uv" ]; then \
if [ ! -f "${FLAGSCALE_HOME}/venv/bin/python3" ]; then \
echo "[ENV] Creating uv venv at ${FLAGSCALE_HOME}/venv (python=${PYTHON_VERSION})"; \
"$HOME/.local/bin/uv" venv "${FLAGSCALE_HOME}/venv" --python "${PYTHON_VERSION}"; \
else \
echo "[ENV] Found uv venv at ${FLAGSCALE_HOME}/venv"; \
fi; \
elif [ "$PKG_MGR" = "conda" ]; then \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the point of having conda or miniconda inside a container?
For miniconda, the base installation is approximately 250 MB to 900 MB, while for venv, the footprint is typically 5-10 MB.
Are we planning to install multiple Python versions in the same container image or something?

CONDA_ENV_NAME=flagscale-all; \
if [ -d "${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}" ]; then \
echo "[ENV] Found conda env ${CONDA_ENV_NAME} at ${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}"; \
else \
echo "[ENV] Creating conda env ${CONDA_ENV_NAME} (python=${PYTHON_VERSION})"; \
env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" create -y -n "${CONDA_ENV_NAME}" "python=${PYTHON_VERSION}"; \
fi; \
else \
echo "[ENV] Using system pip (no dedicated env to create)"; \
fi

# =============================================================================
# DEPS STAGE - Install all dependencies using install folder
# =============================================================================
FROM base AS deps
FROM env AS deps

ARG PYTORCH_INDEX
ARG PKG_MGR
Expand Down Expand Up @@ -153,9 +182,9 @@ WORKDIR /workspace/FlagScale
CMD ["/bin/bash"]

# =============================================================================
# RELEASE STAGE - Production image (uses same base for consistency)
# RELEASE STAGE - Production image (uses same env for consistency)
# =============================================================================
FROM base AS release
FROM env AS release

ARG PYTORCH_INDEX
ARG PKG_MGR
Expand Down
35 changes: 32 additions & 3 deletions docker/cuda/Dockerfile.inference
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,39 @@ ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/l

WORKDIR /workspace

# =============================================================================
# ENV STAGE - Ensure uv/conda/pip environments
# =============================================================================
FROM base AS env

# Package-manager selector: "uv", "conda", or anything else (falls through to
# system pip). Supplied via --build-arg PKG_MGR=... from docker/build.sh.
ARG PKG_MGR
ARG FLAGSCALE_HOME=/opt/flagscale
ARG PYTHON_VERSION=3.12

# Create the Python environment for the selected package manager. Creation is
# skipped when the environment already exists, so the stage is idempotent
# across cached rebuilds.
# NOTE(review): relies on /etc/profile.d/flagscale-env.sh to define
# FLAGSCALE_CONDA (conda install prefix) — confirm the base stage provides it.
# NOTE(review): assumes uv was installed to $HOME/.local/bin by the base
# image — verify against the base stage's installer step.
RUN . /etc/profile.d/flagscale-env.sh && \
if [ "$PKG_MGR" = "uv" ]; then \
if [ ! -f "${FLAGSCALE_HOME}/venv/bin/python3" ]; then \
echo "[ENV] Creating uv venv at ${FLAGSCALE_HOME}/venv (python=${PYTHON_VERSION})"; \
"$HOME/.local/bin/uv" venv "${FLAGSCALE_HOME}/venv" --python "${PYTHON_VERSION}"; \
else \
echo "[ENV] Found uv venv at ${FLAGSCALE_HOME}/venv"; \
fi; \
elif [ "$PKG_MGR" = "conda" ]; then \
CONDA_ENV_NAME=flagscale-inference; \
if [ -d "${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}" ]; then \
echo "[ENV] Found conda env ${CONDA_ENV_NAME} at ${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}"; \
else \
echo "[ENV] Creating conda env ${CONDA_ENV_NAME} (python=${PYTHON_VERSION})"; \
env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" create -y -n "${CONDA_ENV_NAME}" "python=${PYTHON_VERSION}"; \
fi; \
else \
echo "[ENV] Using system pip (no dedicated env to create)"; \
fi

# =============================================================================
# DEPS STAGE - Install dependencies using install folder
# =============================================================================
FROM base AS deps
FROM env AS deps

ARG PYTORCH_INDEX
ARG PKG_MGR
Expand Down Expand Up @@ -151,9 +180,9 @@ WORKDIR /workspace/FlagScale
CMD ["/bin/bash"]

# =============================================================================
# RELEASE STAGE - Production image (uses same base for consistency)
# RELEASE STAGE - Production image (uses same env for consistency)
# =============================================================================
FROM base AS release
FROM env AS release

ARG PYTORCH_INDEX
ARG PKG_MGR
Expand Down
39 changes: 36 additions & 3 deletions docker/cuda/Dockerfile.train
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,44 @@ ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/l

WORKDIR /workspace

# =============================================================================
# ENV STAGE - Ensure uv/conda/pip environments
# =============================================================================
FROM base AS env

# Package-manager selector: "uv", "conda", or anything else (falls through to
# system pip). Supplied via --build-arg PKG_MGR=... from docker/build.sh.
ARG PKG_MGR
ARG FLAGSCALE_HOME=/opt/flagscale
ARG PYTHON_VERSION=3.12

# Create the Python environment for the selected package manager. Creation is
# skipped when the environment already exists, so the stage is idempotent
# across cached rebuilds.
# NOTE(review): relies on /etc/profile.d/flagscale-env.sh to define
# FLAGSCALE_CONDA (conda install prefix) — confirm the base stage provides it.
# NOTE(review): assumes uv was installed to $HOME/.local/bin by the base
# image — verify against the base stage's installer step.
RUN . /etc/profile.d/flagscale-env.sh && \
if [ "$PKG_MGR" = "uv" ]; then \
if [ ! -f "${FLAGSCALE_HOME}/venv/bin/python3" ]; then \
echo "[ENV] Creating uv venv at ${FLAGSCALE_HOME}/venv (python=${PYTHON_VERSION})"; \
"$HOME/.local/bin/uv" venv "${FLAGSCALE_HOME}/venv" --python "${PYTHON_VERSION}"; \
else \
echo "[ENV] Found uv venv at ${FLAGSCALE_HOME}/venv"; \
fi; \
elif [ "$PKG_MGR" = "conda" ]; then \
CONDA_ENV_NAME=flagscale-train; \
if [ -d "${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}" ]; then \
echo "[ENV] Found conda env ${CONDA_ENV_NAME} at ${FLAGSCALE_CONDA}/envs/${CONDA_ENV_NAME}"; \
else \
echo "[ENV] Creating conda env ${CONDA_ENV_NAME} (python=${PYTHON_VERSION})"; \
env CONDA_NO_PLUGINS=true ANACONDA_ACCEPT_TOS=yes "${FLAGSCALE_CONDA}/bin/conda" create -y -n "${CONDA_ENV_NAME}" "python=${PYTHON_VERSION}"; \
fi; \
else \
echo "[ENV] Using system pip (no dedicated env to create)"; \
fi

# =============================================================================
# DEPS STAGE - Install dependencies using install folder
# =============================================================================
FROM base AS deps
FROM env AS deps

ARG PYTORCH_INDEX
ARG PKG_MGR
ARG FLAGSCALE_HOME=/opt/flagscale
ARG PYTHON_VERSION=3.12
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually not configurable, because the packages we are installing later on are all about python3.12. Right?


# PyPI index URLs (re-declare to use in this stage)
ARG PIP_INDEX_URL
Expand Down Expand Up @@ -151,13 +181,14 @@ WORKDIR /workspace/FlagScale
CMD ["/bin/bash"]

# =============================================================================
# RELEASE STAGE - Production image (uses same base for consistency)
# RELEASE STAGE - Production image (uses same env for consistency)
# =============================================================================
FROM base AS release
FROM env AS release

ARG PYTORCH_INDEX
ARG PKG_MGR
ARG FLAGSCALE_HOME=/opt/flagscale
ARG PYTHON_VERSION=3.12

# PyPI index URLs
ARG PIP_INDEX_URL
Expand All @@ -181,6 +212,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
chmod +x /workspace/tools/install/*.sh && \
chmod +x /workspace/tools/install/utils/*.sh && \
chmod +x /workspace/tools/install/cuda/*.sh && \
( [ "$PKG_MGR" != "uv" ] || [ -f "${FLAGSCALE_HOME}/venv/bin/python3" ] ) || \
( mkdir -p "${FLAGSCALE_HOME}" && "$HOME/.local/bin/uv" venv "${FLAGSCALE_HOME}/venv" --python "${PYTHON_VERSION}" ) && \
cd /workspace && \
if [ "$PKG_MGR" = "uv" ]; then \
UV_EXTRA_INDEX_URL=${PYTORCH_INDEX} \
Expand Down
6 changes: 3 additions & 3 deletions requirements/cuda/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
-r ../common.txt

# PyTorch
torch==2.9.1
torchaudio==2.9.1
torchvision==0.24.1
torch==2.8.0
torchaudio==2.8.0
torchvision==0.23.0
2 changes: 2 additions & 0 deletions requirements/cuda/inference.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Inference-specific dependencies

-r ./base.txt
vllm @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/vllm/0.11.0%2Bfl.0.1.cu124/vllm-0.11.0%2Bfl.0.1.cu124-cp312-cp312-linux_x86_64.whl
transformers==4.57.6
1 change: 1 addition & 0 deletions requirements/cuda/serve.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# serve-specific dependencies

-r ./base.txt
vllm @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/vllm/0.11.0%2Bfl.0.1.cu124/vllm-0.11.0%2Bfl.0.1.cu124-cp312-cp312-linux_x86_64.whl

# support 0.5b_multiple_instance ci test
ray==2.49.1
Expand Down
1 change: 1 addition & 0 deletions requirements/cuda/serve_dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# serve development dependencies
# Includes build tools, linting, testing, and serve packages
vllm @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/vllm/0.11.0%2Bfl.0.1.cu124/vllm-0.11.0%2Bfl.0.1.cu124-cp312-cp312-linux_x86_64.whl

-r ../build.txt
-r ../lint.txt
Expand Down
1 change: 1 addition & 0 deletions requirements/cuda/train.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Training-specific dependencies

-r ./base.txt
megatron_core @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/megatron-core/0.16.0rc0/megatron_core-0.16.0rc0-cp312-cp312-linux_x86_64.whl
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where does this megatron_core come from?
The official release of megatron core is 0.15.3.

4 changes: 4 additions & 0 deletions tests/test_utils/config/platforms/cuda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ a100:
mixtral: ["tp2_pp1_ep2", "tp4_pp1_ep2"]
hetero_train:
aquila: ["tp2pp1_tp4pp1_tp2pp1", "tp2dp1pp1_tp2dp2pp1_tp1dp2pp1", "dp2dp4_shared_embedding"]
inference:
qwen3: ["4b-tp2"]
serve:
qwen2_5: ["0.5b"]
unit:
# Include patterns: "*" for all, or list specific paths
include: "*"
Expand Down