From 7290e7726b9083974b4c88af76a922bf7c0552ba Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 17:50:33 -0400
Subject: [PATCH 01/14] WIP test out new dockerfile with more nvidia tools

---
Review notes (free-form text between the `---` marker and the first
`diff --git` line; not part of the commit message):
 * The series was stored with all newlines collapsed. Line structure has been
   restored so every mbox separator, header, diffstat entry and hunk line
   starts its own line. Blank context/added lines were re-derived from the
   hunk line counts. Indentation inside hunks (2-space YAML, 4-space shell
   continuations, setup.py nesting) is reconstructed by convention --
   verify against the target tree before applying.
 * beta.yml: the skip marker was '[skip docker]]' (stray ']'); it is now
   '[skip docker]'. Commit messages using the old spelling still match.
   The `index` blob id of beta.yml below predates this one-character fix.
 * NOTE(review): `github.event.commits` is presumably only populated for push
   events, so the skip check may never fire under the `pull_request` /
   `workflow_dispatch` triggers used here -- confirm.
 * NOTE(review): in PATCH 10 the removed and added
   `RUN pip install causal_conv1d && \` lines read identically here; the
   original difference was presumably whitespace-only and could not be
   recovered.

 .github/workflows/beta.yml | 36 ++++++++++++++++++++++++++++++++++
 docker/Dockerfile-base     |  6 ++++++
 docker/Dockerfile-beta     | 40 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 .github/workflows/beta.yml
 create mode 100644 docker/Dockerfile-beta

diff --git a/.github/workflows/beta.yml b/.github/workflows/beta.yml
new file mode 100644
index 000000000..a3255ea6c
--- /dev/null
+++ b/.github/workflows/beta.yml
@@ -0,0 +1,36 @@
+name: beta-docker-images
+
+on:
+  workflow_dispatch:
+  pull_request:
+
+jobs:
+  build-axolotl-beta:
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
+    strategy:
+      fail-fast: false
+    runs-on: axolotl-gpu-runner
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
+        with:
+          images: winglian/axolotl-beta
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
+      - name: Build and export to Docker
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./docker/Dockerfile-beta
+          tags: |
+            ${{ steps.metadata.outputs.tags }}
+          labels: ${{ steps.metadata.outputs.labels }}
diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base
index 1de5537da..1c9b9e04a 100644
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -35,3 +35,9 @@ RUN git lfs install --skip-repo && \
     pip3 install awscli && \
     # The base image ships with `pydantic==1.8.2` which is not working
     pip3 install -U --no-cache-dir pydantic==1.10.10
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
+
+WORKDIR /workspace/axolotl
diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
new file mode 100644
index 000000000..bd4d8c104
--- /dev/null
+++ b/docker/Dockerfile-beta
@@ -0,0 +1,40 @@
+FROM nvcr.io/nvidia/pytorch:24.03-py3
+
+RUN python3 -m pip install --upgrade pip
+
+RUN groupadd axolotl && useradd -m -g axolotl -s /bin/bash axolotl
+
+USER axolotl
+
+RUN mkdir -p /home/axolotl/venv
+
+RUN python -m venv /home/axolotl/venv/axolotl
+
+ENV PATH="/home/axolotl/venv/axolotl/bin:$PATH"
+
+RUN echo "source /home/axolotl/venv/axolotl/bin/activate" >> /home/axolotl/.bashrc
+
+RUN git lfs install --skip-repo && \
+    pip3 install awscli
+
+RUN pip install causal_conv1d && \
+    pip install -e .[deepspeed,flash-attn,mamba-ssm,galore]
+
+# So we can test the Docker image
+RUN pip install pytest
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
+
+
+ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
+ENV HF_HOME="/workspace/data/huggingface-cache/hub"
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+
+CMD ["sleep", "infinity"]

From 2ca30d1a97eac0d60d571217701f173df46f28b9 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 17:57:34 -0400
Subject: [PATCH 02/14] make sure to install axolotl :facepalm:

---
 docker/Dockerfile-beta | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index bd4d8c104..289452580 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -23,6 +23,12 @@ RUN pip install causal_conv1d && \
 # So we can test the Docker image
 RUN pip install pytest
 
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
+
+WORKDIR /workspace/axolotl
+
 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
     git config --get remote.origin.fetch

From 89086d3d1c67ba01eb17fad9fc4a6c8881f771ab Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 18:16:50 -0400
Subject: [PATCH 03/14] install venv from apt

---
 docker/Dockerfile-beta | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 289452580..29be8dd6e 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -1,5 +1,7 @@
 FROM nvcr.io/nvidia/pytorch:24.03-py3
 
+RUN apt update && apt install python3.10-venv
+
 RUN python3 -m pip install --upgrade pip
 
 RUN groupadd axolotl && useradd -m -g axolotl -s /bin/bash axolotl

From f07693371979e01c3098a993aea2e90a80e0313a Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 18:34:02 -0400
Subject: [PATCH 04/14] add -yes option to pip install

---
 docker/Dockerfile-beta | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 29be8dd6e..e47e269c7 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -1,6 +1,6 @@
 FROM nvcr.io/nvidia/pytorch:24.03-py3
 
-RUN apt update && apt install python3.10-venv
+RUN apt update && apt install -y python3.10-venv
 
 RUN python3 -m pip install --upgrade pip
 

From 493b6b9ae87bb0159d1ec0d7a10413025e7fd9ef Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 18:44:42 -0400
Subject: [PATCH 05/14] install git-lfs

---
 docker/Dockerfile-beta | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index e47e269c7..781e8544d 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -1,6 +1,6 @@
 FROM nvcr.io/nvidia/pytorch:24.03-py3
 
-RUN apt update && apt install -y python3.10-venv
+RUN apt update && apt install -y python3.10-venv git-lfs
 
 RUN python3 -m pip install --upgrade pip
 

From dd87c81e8861f4d2511862b0a1c90ca80207866d Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 18:58:17 -0400
Subject: [PATCH 06/14] make sure to install packaing too

---
 docker/Dockerfile-beta | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 781e8544d..0f1e2a931 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -2,7 +2,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3
 
 RUN apt update && apt install -y python3.10-venv git-lfs
 
-RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --upgrade pip && pip install packaging
 
 RUN groupadd axolotl && useradd -m -g axolotl -s /bin/bash axolotl
 

From b9211fc2e67cf8d709861a7768aa85efd34dcec0 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 19:19:23 -0400
Subject: [PATCH 07/14] use --system-site-packages to pickup global packaging pkg

---
 docker/Dockerfile-beta | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 0f1e2a931..f86c86fa2 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -10,7 +10,7 @@ USER axolotl
 
 RUN mkdir -p /home/axolotl/venv
 
-RUN python -m venv /home/axolotl/venv/axolotl
+RUN python -m venv --system-site-packages /home/axolotl/venv/axolotl
 
 ENV PATH="/home/axolotl/venv/axolotl/bin:$PATH"
 

From b8aba89f56c3ee4143707f63cffe19a3660d9f33 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 19:33:49 -0400
Subject: [PATCH 08/14] setup /workspace

---
 docker/Dockerfile-beta | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index f86c86fa2..41fb69a3c 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -4,7 +4,10 @@ RUN apt update && apt install -y python3.10-venv git-lfs
 
 RUN python3 -m pip install --upgrade pip && pip install packaging
 
-RUN groupadd axolotl && useradd -m -g axolotl -s /bin/bash axolotl
+RUN groupadd axolotl && \
+    useradd -m -g axolotl -s /bin/bash axolotl && \
+    mkdir /workspace && \
+    chown axolotl:axolotl /workspace
 
 USER axolotl
 

From 4be78e970729c09908026dffff66eda4428522b0 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 19:54:47 -0400
Subject: [PATCH 09/14] no need for mkdir

---
 docker/Dockerfile-beta | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 41fb69a3c..cd96a7c60 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -6,7 +6,6 @@ RUN python3 -m pip install --upgrade pip && pip install packaging
 
 RUN groupadd axolotl && \
     useradd -m -g axolotl -s /bin/bash axolotl && \
-    mkdir /workspace && \
     chown axolotl:axolotl /workspace
 
 USER axolotl

From b4df593deb7953e0ef92ca248c279b5b18462d02 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 20:09:26 -0400
Subject: [PATCH 10/14] make sure to cd to axolotl dir

---
 docker/Dockerfile-beta | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index cd96a7c60..d59636bdc 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -21,7 +21,8 @@ RUN echo "source /home/axolotl/venv/axolotl/bin/activate" >> /home/axolotl/.bash
 RUN git lfs install --skip-repo && \
     pip3 install awscli
 
-RUN pip install causal_conv1d && \
+RUN pip install causal_conv1d && \
+    cd /workspace/axolotl && \
     pip install -e .[deepspeed,flash-attn,mamba-ssm,galore]
 
 # So we can test the Docker image

From b5e282169670b0cf3856639c9e330f51da923292 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 21:02:21 -0400
Subject: [PATCH 11/14] reorder for fix

---
 docker/Dockerfile-beta | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index d59636bdc..825fbb282 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -21,6 +21,12 @@ RUN echo "source /home/axolotl/venv/axolotl/bin/activate" >> /home/axolotl/.bash
 RUN git lfs install --skip-repo && \
     pip3 install awscli
 
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
+
+WORKDIR /workspace/axolotl
+
 RUN pip install causal_conv1d && \
     cd /workspace/axolotl && \
     pip install -e .[deepspeed,flash-attn,mamba-ssm,galore]
@@ -28,12 +34,6 @@ RUN pip install causal_conv1d && \
 # So we can test the Docker image
 RUN pip install pytest
 
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
-
-WORKDIR /workspace/axolotl
-
 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
     git config --get remote.origin.fetch

From f807516bf688be46c8da61386044952f26ce5316 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 21:22:10 -0400
Subject: [PATCH 12/14] use torch 2.2.2

---
 docker/Dockerfile-beta | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 825fbb282..04f15f21c 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -2,7 +2,9 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3
 
 RUN apt update && apt install -y python3.10-venv git-lfs
 
-RUN python3 -m pip install --upgrade pip && pip install packaging
+RUN python3 -m pip install --upgrade pip && \
+    pip install packaging && \
+    pip install -U torch==2.2.2
 
 RUN groupadd axolotl && \
     useradd -m -g axolotl -s /bin/bash axolotl && \

From 8401b6ff493a7c233270fbc3089813b5fa28ceed Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 21:59:08 -0400
Subject: [PATCH 13/14] shuffle some dependency versions around

---
 docker/Dockerfile-beta | 7 ++++---
 setup.py               | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 04f15f21c..6b7a84b70 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -4,7 +4,9 @@ RUN apt update && apt install -y python3.10-venv git-lfs
 
 RUN python3 -m pip install --upgrade pip && \
     pip install packaging && \
-    pip install -U torch==2.2.2
+    pip uninstall torch-tensorrt && \
+    pip install -U torch==2.2.2 && \
+    pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
 
 RUN groupadd axolotl && \
     useradd -m -g axolotl -s /bin/bash axolotl && \
@@ -20,8 +22,7 @@ ENV PATH="/home/axolotl/venv/axolotl/bin:$PATH"
 
 RUN echo "source /home/axolotl/venv/axolotl/bin/activate" >> /home/axolotl/.bashrc
 
-RUN git lfs install --skip-repo && \
-    pip3 install awscli
+RUN git lfs install --skip-repo
 
 WORKDIR /workspace
 
diff --git a/setup.py b/setup.py
index fbca5a360..3d221cf5e 100644
--- a/setup.py
+++ b/setup.py
@@ -68,7 +68,7 @@ def parse_requirements():
     dependency_links=dependency_links,
     extras_require={
         "flash-attn": [
-            "flash-attn==2.5.5",
+            "flash-attn>=2.4.2",
         ],
         "fused-dense-lib": [
             "fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib",

From 95c2e5f313282154769ed56e73704660cfa3cdbf Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 21 Apr 2024 22:09:13 -0400
Subject: [PATCH 14/14] agree to uninstall

---
 docker/Dockerfile-beta | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile-beta b/docker/Dockerfile-beta
index 6b7a84b70..fd89bc2bd 100644
--- a/docker/Dockerfile-beta
+++ b/docker/Dockerfile-beta
@@ -4,7 +4,7 @@ RUN apt update && apt install -y python3.10-venv git-lfs
 
 RUN python3 -m pip install --upgrade pip && \
     pip install packaging && \
-    pip uninstall torch-tensorrt && \
+    pip uninstall -y torch-tensorrt && \
     pip install -U torch==2.2.2 && \
     pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
 