Skip to content

Commit

Permalink
Switch to multistage dockerfile (foundation-model-stack#154)
Browse files Browse the repository at this point in the history
* Multi-stage build

Signed-off-by: Thara Palanivel <[email protected]>

* Tiny fix for app dir

Signed-off-by: Thara Palanivel <[email protected]>

* Fix training_logs_filename import error

Signed-off-by: Thara Palanivel <[email protected]>

* Update flashattn

Signed-off-by: Thara Palanivel <[email protected]>

* Improve python dependencies management

Signed-off-by: ted chang <[email protected]>

---------

Signed-off-by: Thara Palanivel <[email protected]>
Signed-off-by: ted chang <[email protected]>
Co-authored-by: ted chang <[email protected]>
  • Loading branch information
tharapalanivel and tedhtchang authored May 31, 2024
1 parent 3d0c4f3 commit 54466e6
Showing 1 changed file with 91 additions and 85 deletions.
176 changes: 91 additions & 85 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,57 +12,59 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Wheel stage: leaves an fms-hf-tuning wheel in /tmp together with
# /tmp/bdist_name, a file that holds the wheel's path.
FROM registry.access.redhat.com/ubi9/python-311 as wheel

# Empty (the default) builds the wheel from the copied sources; a non-empty
# value is the exact fms-hf-tuning version to download instead.
ARG WHEEL_VERSION=""
USER root
# Upgrade pip and install the `build` package; pip's cache directory is a
# BuildKit cache mount.
RUN --mount=type=cache,target=/root/.cache/pip \
python -m pip install --upgrade pip && \
python -m pip install build
COPY tuning tuning
# presumably the git metadata is needed to derive the package version while
# building the wheel -- TODO confirm against pyproject.toml
COPY .git .git
COPY pyproject.toml pyproject.toml
# Build the wheel if WHEEL_VERSION is empty, else download that version from PyPI
# (binary wheel only, without its dependencies). Either way the wheel's path is
# then written to /tmp/bdist_name.
RUN if [[ -z "${WHEEL_VERSION}" ]]; \
then python -m build --wheel --outdir /tmp; \
else pip download fms-hf-tuning==${WHEEL_VERSION} --dest /tmp --only-binary=:all: --no-deps; \
fi && \
ls /tmp/*.whl >/tmp/bdist_name


# NOTE(review): this listing appears to interleave removed and added lines of a
# diff; the FROM and CUDA_VERSION lines below look like the pre-change side --
# confirm against the repository before relying on them.
FROM registry.access.redhat.com/ubi9/ubi AS release

# Referenced by the `ENV CUDA_VERSION=$CUDA_VERSION` line further down.
ARG CUDA_VERSION=11.8.0
## Global Args #################################################################
# Tag of the ubi9/ubi image that the base stage starts from.
ARG BASE_UBI_IMAGE_TAG=latest
# Name and UID of the account created with useradd in the base stage.
ARG USER=tuning
ARG USER_UID=1000
# Not referenced as a build arg in the visible stages; the final stage sets an
# ENV of the same name to the literal "True".
ARG SET_NUM_PROCESSES_TO_NUM_GPUS=True
# Selects the python${PYTHON_VERSION} dnf package and the site-packages path
# used for PYTHONPATH in the final stage.
ARG PYTHON_VERSION=3.11
# Empty builds the wheel from source; otherwise the fms-hf-tuning version to
# download.
ARG WHEEL_VERSION=""

## Base Layer ##################################################################
# UBI 9 image with python${PYTHON_VERSION}, pip, and the ${USER} account.
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} as base

USER root
# Redeclared without values; presumably these pick up the defaults from the
# "Global Args" section -- confirm against the complete Dockerfile.
ARG PYTHON_VERSION
ARG USER
ARG USER_UID

# One layer: remove subscription-manager and python3.11-requests, install make,
# procps and python${PYTHON_VERSION}, link the interpreter to /bin/python,
# bootstrap and upgrade pip, update all packages, then clean the dnf caches.
RUN dnf remove -y --disableplugin=subscription-manager \
subscription-manager \
# we install newer version of requests via pip
python3.11-requests \
&& dnf install -y make \
# to help with debugging
procps \
&& dnf install -y python${PYTHON_VERSION} procps \
&& ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
&& python -m ensurepip --upgrade \
&& python -m pip install --upgrade pip \
&& dnf update -y \
&& dnf clean all

ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# NOTE(review): the two ENV lines below end in a line continuation and run
# straight into the useradd RUN -- they look like leftover pre-change diff
# lines; confirm against the repository.
# The useradd command creates ${USER} with UID ${USER_UID}, a home directory
# (-m) and primary group 0; chmod then gives the group read/search access to
# that home directory.
ENV CUDA_VERSION=$CUDA_VERSION \
NV_CUDA_LIB_VERSION=11.8.0-1 \
RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \
chmod g+rx /home/${USER}

## Base of the release stage: strips unrelated packages to reduce CVE exposure
FROM base as release-base

# Removes the installed python3-* packages (the python3.9 code) to eliminate
# possible CVEs. Also removes dnf, python3, yum and crypto-policies-scripts.
# The glob is quoted so that dnf, not the shell, expands it; the command
# substitution stays unquoted on purpose so each package name becomes its own
# argument to rpm.
RUN rpm -e $(dnf repoquery 'python3-*' -q --installed) dnf python3 yum crypto-policies-scripts


## CUDA Base ###################################################################
# Adds the CUDA runtime (cudart) and compat packages on top of the base stage.
FROM base as cuda-base

# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
# NOTE(review): NV_CUDA_CUDART_VERSION and NV_CUDA_COMPAT_VERSION appear twice
# below (11.8-era and 12.1-era values), and the install lists both
# cuda-*-11-8 and cuda-*-12-1 packages -- this looks like interleaved diff
# lines; confirm which set is intended.
ENV CUDA_VERSION=12.1.0 \
NV_CUDA_LIB_VERSION=12.1.0-1 \
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
NV_CUDA_CUDART_VERSION=11.8.89-1 \
NV_CUDA_COMPAT_VERSION=520.61.05-1
NV_CUDA_CUDART_VERSION=12.1.55-1 \
NV_CUDA_COMPAT_VERSION=530.30.02-1

# Register NVIDIA's CUDA repo for RHEL 9 (x86_64), install the cudart and
# compat packages at the pinned versions, append /usr/local/nvidia/lib and
# /usr/local/nvidia/lib64 to /etc/ld.so.conf.d/nvidia.conf, then clean the dnf
# caches.
RUN dnf config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
&& dnf install -y \
cuda-cudart-11-8-${NV_CUDA_CUDART_VERSION} \
cuda-compat-11-8-${NV_CUDA_COMPAT_VERSION} \
cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
&& echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
&& echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
&& dnf clean all
Expand All @@ -71,92 +73,96 @@ ENV CUDA_HOME="/usr/local/cuda" \
PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"

## CUDA Development ############################################################
# Adds CUDA library and development packages (including NCCL) on top of
# cuda-base.
# NOTE(review): 11.8-era and 12.1-era ENV values and package names both appear
# in this stage -- this looks like interleaved diff lines; confirm which set is
# intended.
FROM cuda-base as cuda-devel

# Pinned versions consumed by the first dnf install below.
ENV NV_NVTX_VERSION=11.8.86-1 \
NV_LIBNPP_VERSION=11.8.0.86-1 \
NV_LIBCUBLAS_VERSION=11.11.3.6-1 \
NV_LIBNCCL_PACKAGE_VERSION=2.15.5-1+cuda11.8
# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
# Pinned versions consumed by the *-devel packages in the second dnf install.
ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
NV_NVML_DEV_VERSION=12.1.55-1 \
NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1

# Register NVIDIA's CUDA repo for RHEL 9 and install the CUDA libraries, NVTX,
# NPP, cuBLAS and NCCL packages, then clean the dnf caches.
RUN dnf config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
&& dnf install -y \
cuda-libraries-11-8-${NV_CUDA_LIB_VERSION} \
cuda-nvtx-11-8-${NV_NVTX_VERSION} \
libnpp-11-8-${NV_LIBNPP_VERSION} \
libcublas-11-8-${NV_LIBCUBLAS_VERSION} \
libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
&& dnf clean all

# Second definition of the *_DEV_VERSION variables (11.8-era values); being the
# later ENV, these are the values in effect for the install that follows.
ENV NV_CUDA_CUDART_DEV_VERSION=11.8.89-1 \
NV_NVML_DEV_VERSION=11.8.86-1 \
NV_LIBCUBLAS_DEV_VERSION=11.11.3.6-1 \
NV_LIBNPP_DEV_VERSION=11.8.0.86-1 \
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.15.5-1+cuda11.8

# Install the command-line tools, minimal build tools and the *-devel packages
# for cudart, NVML, cuBLAS, NPP and NCCL, then clean the dnf caches.
RUN dnf config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
&& dnf install -y \
cuda-command-line-tools-11-8-${NV_CUDA_LIB_VERSION} \
cuda-libraries-devel-11-8-${NV_CUDA_LIB_VERSION} \
cuda-minimal-build-11-8-${NV_CUDA_LIB_VERSION} \
cuda-cudart-devel-11-8-${NV_CUDA_CUDART_DEV_VERSION} \
cuda-nvml-devel-11-8-${NV_NVML_DEV_VERSION} \
libcublas-devel-11-8-${NV_LIBCUBLAS_DEV_VERSION} \
libnpp-devel-11-8-${NV_LIBNPP_DEV_VERSION} \
cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
&& dnf clean all

# Points LIBRARY_PATH at the CUDA stubs directory; presumably so builds can
# link against the stub libraries -- TODO confirm.
ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"

# Install python3.11 and git, link the interpreter to /bin/python, bootstrap
# pip, update all packages, then clean the dnf caches.
RUN dnf install -y python3.11 git \
&& ln -s /usr/bin/python3.11 /bin/python \
&& python -m ensurepip --upgrade \
&& dnf update -y \
&& dnf clean all
# Python installations stage: builds or downloads the fms-hf-tuning wheel and
# installs it with `pip install --user`; the final stage later copies
# /home/${USER}/.local out of this stage.
# NOTE(review): two variants of the install chain (one using the `wheel` stage
# and a root pip cache, one building the wheel here with --user installs) are
# interleaved below -- this looks like mixed diff lines; confirm which is
# intended.
FROM cuda-devel as python-installations

# Removes the example private key to avoid high severity vulnerability warning
RUN rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem

# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf
RUN rpm -e $(dnf repoquery python3-* -q --installed) dnf python3 yum crypto-policies-scripts
ARG WHEEL_VERSION
ARG USER
ARG USER_UID

# Install git, delete the example key that arrives with its dependencies, and
# clean the dnf caches in the same layer.
RUN dnf install -y git && \
# perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
# Twistlock detects it as H severity: Private keys stored in image
rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem && \
dnf clean all
# Everything from here on runs as the unprivileged ${USER} account in /tmp.
USER ${USER}
WORKDIR /tmp
COPY --from=wheel /tmp/*.whl /tmp/bdist_name /tmp/
RUN --mount=type=cache,target=/root/.cache/pip \
python -m pip install --upgrade pip && \
python -m pip install wheel && \
python -m pip install "$(head bdist_name)" && \
# Due to FIPS tolerance issues, removing aim at this time
#python -m pip install "$(head bdist_name)[aim]" && \
python -m pip install "$(head bdist_name)[flash-attn]" && \
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
python -m pip install --user build
COPY --chown=${USER}:root tuning tuning
COPY .git .git
COPY pyproject.toml pyproject.toml

# Build a wheel if WHEEL_VERSION is empty, else download that version from PyPI.
# Either way the wheel's path is then written to /tmp/bdist_name.
RUN if [[ -z "${WHEEL_VERSION}" ]]; \
then python -m build --wheel --outdir /tmp; \
else pip download fms-hf-tuning==${WHEEL_VERSION} --dest /tmp --only-binary=:all: --no-deps; \
fi && \
ls /tmp/*.whl >/tmp/bdist_name

# Install from the wheel
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
python -m pip install --user wheel && \
python -m pip install --user "$(head bdist_name)" && \
python -m pip install --user "$(head bdist_name)[flash-attn]" && \
# Clean up the wheel module. It's only needed by flash-attn install
python -m pip uninstall wheel -y && \
python -m pip uninstall wheel build -y && \
# Cleanup the bdist whl file
rm $(head bdist_name) /tmp/bdist_name

## Final image ################################################
# Starts from release-base, adds the license, the launch scripts under /app
# and the Python packages copied from the python-installations stage.
# NOTE(review): `mkdir /app` appears twice and a second useradd appears below;
# this looks like interleaved diff lines -- confirm which lines are intended.
FROM release-base as release
ARG USER
ARG PYTHON_VERSION

RUN mkdir -p /licenses
COPY LICENSE /licenses/

RUN mkdir /app
# Create /app and make /app and /tmp owned by ${USER} (group 0) with group
# read/write (and execute where applicable) permissions.
RUN mkdir /app && \
chown -R $USER:0 /app /tmp && \
chmod -R g+rwX /app /tmp

# Copy scripts and default configs
COPY build/launch_training.py build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
COPY build/utils.py /app/build/
RUN chmod +x /app/launch_training.py /app/accelerate_launch.py

ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml"
ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"

# Hack: creates a world-writable /.aim_profile file and /.cache directory at
# the filesystem root. Need a better way to address this.
RUN touch /.aim_profile && \
chmod -R 777 /.aim_profile && \
mkdir /.cache && \
chmod -R 777 /.cache

# create tuning user and give ownership to dirs
# NOTE(review): $USER_UID is used here but this stage only declares ARG USER
# and ARG PYTHON_VERSION -- confirm whether this line belongs to the final file.
RUN useradd -u $USER_UID tuning -m -g 0 --system && \
chown -R $USER:0 /app /tmp && \
chmod -R g+rwX /app /tmp

WORKDIR /app
USER ${USER}
# Bring in the packages installed with `pip install --user` in the
# python-installations stage and put their site-packages on PYTHONPATH.
COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local
ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages"

# Default command: run the accelerate launch script with python.
CMD [ "python", "/app/accelerate_launch.py" ]

0 comments on commit 54466e6

Please sign in to comment.