Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/perf-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- name: Install data-juicer
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-head bash -c 'uv pip install --system -e .\[all\]'

- name: Clean dataset cache
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/unit-test-partial.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ jobs:
- name: Install data-juicer
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-head bash -c 'uv pip install --system -e .\[all\]'

- name: Print Pip Dependency Tree
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system pipdeptree'
docker compose exec ray-head bash -c 'uv pip install --system pipdeptree'
docker compose exec ray-head bash -c 'pipdeptree'

- name: Clean dataset cache
Expand Down Expand Up @@ -90,8 +90,8 @@ jobs:
- name: Install data-juicer
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-worker bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-head bash -c 'uv pip install --system -e .\[all\]'
docker compose exec ray-worker bash -c 'uv pip install --system -e .\[all\]'

- name: Clean dataset cache
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
Expand Down Expand Up @@ -140,7 +140,7 @@ jobs:
- name: Install coverage
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system coverage'
docker compose exec ray-head bash -c 'uv pip install --system coverage'

- name: Download Coverage Report Standalone
uses: actions/download-artifact@v4
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/unit-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ jobs:
- name: Install data-juicer
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-worker bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-head bash -c 'uv pip install --system -e .\[all\]'
docker compose exec ray-worker bash -c 'uv pip install --system -e .\[all\]'

- name: Print Pip Dependency Tree
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system pipdeptree'
docker compose exec ray-head bash -c 'uv pip install --system pipdeptree'
docker compose exec ray-head bash -c 'pipdeptree'

- name: Clean dataset cache
Expand Down Expand Up @@ -87,8 +87,8 @@ jobs:
- name: Install data-juicer
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-worker bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
docker compose exec ray-head bash -c 'uv pip install --system -e .\[all\]'
docker compose exec ray-worker bash -c 'uv pip install --system -e .\[all\]'

- name: Clean dataset cache
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
Expand Down Expand Up @@ -139,7 +139,7 @@ jobs:
- name: Install coverage
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system coverage'
docker compose exec ray-head bash -c 'uv pip install --system coverage'

- name: Download Coverage Report Standalone
uses: actions/download-artifact@v4
Expand Down
97 changes: 74 additions & 23 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,45 +1,96 @@
# The data-juicer image includes all open-source contents of data-juicer,
# and it will be installed in editable mode.

FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04

# change to aliyun source
# avoid hanging on interactive installation
ENV DEBIAN_FRONTEND=noninteractive

# add aliyun apt source mirrors for faster download in China
RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list \
&& sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list

# install python 3.10
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y git curl vim wget python3.10 libpython3.10-dev python3-pip libgl1-mesa-glx libglib2.0-0 \
&& ln -sf /usr/bin/python3.10 /usr/bin/python3 \
&& ln -sf /usr/bin/python3.10 /usr/bin/python \
&& apt-get autoclean && rm -rf /var/lib/apt/lists/* \
&& pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
# install some basic system dependencies
RUN apt-get update && apt-get install -y \
git curl vim wget aria2 openssh-server gnupg build-essential cmake gfortran \
ffmpeg libsm6 libxext6 libgl1 libglx-mesa0 libglib2.0-0 libosmesa6-dev \
freeglut3-dev libglfw3-dev libgles2-mesa-dev vulkan-tools \
libopenblas-dev liblapack-dev postgresql postgresql-contrib libpq-dev \
software-properties-common \
&& rm -rf /var/lib/apt/lists/*

# install Git LFS
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | /bin/bash \
&& apt-get install -y git-lfs && git lfs install

# install gcc-11 and g++-11
RUN apt-get update && \
apt-get install -y gcc-11 g++-11 && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 200 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 200

# set up Vulkan for NVIDIA
ENV NVIDIA_VISIBLE_DEVICES=all NVIDIA_DRIVER_CAPABILITIES=all VK_DRIVER_FILES=/etc/vulkan/icd.d/nvidia_icd.json
RUN mkdir -p /etc/vulkan/icd.d /etc/vulkan/implicit_layer.d /usr/share/glvnd/egl_vendor.d
RUN wget https://pai-vision-data-sh.oss-cn-shanghai.aliyuncs.com/aigc-data/isaac/nb10/nvidia_icd.json -O /etc/vulkan/icd.d/nvidia_icd.json
RUN wget https://pai-vision-data-sh.oss-cn-shanghai.aliyuncs.com/aigc-data/isaac/nb10/nvidia_layers.json -O /etc/vulkan/implicit_layer.d/nvidia_layers.json
RUN wget https://pai-vision-data-sh.oss-cn-shanghai.aliyuncs.com/aigc-data/isaac/nb10/10_nvidia.json -O /usr/share/glvnd/egl_vendor.d/10_nvidia.json
RUN wget https://pai-vision-data-sh.oss-cn-shanghai.aliyuncs.com/aigc-data/isaac/nb10/50_mesa.json -O /usr/share/glvnd/egl_vendor.d/50_mesa.json

# install 3rd-party system dependencies
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libsm6 libxext6 software-properties-common build-essential cmake gfortran libopenblas-dev liblapack-dev postgresql postgresql-contrib libpq-dev
# install Python 3.11
RUN add-apt-repository -y ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y python3.11 python3.11-dev python3.11-venv python3.11-distutils && \
# set the default Python
update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
# install pip
curl https://bootstrap.pypa.io/get-pip.py | python3.11 && \
pip install --upgrade pip

# prepare the java env
# install uv
RUN pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple

# install java
WORKDIR /opt
# download jdk
RUN wget https://aka.ms/download-jdk/microsoft-jdk-17.0.9-linux-x64.tar.gz -O jdk.tar.gz \
RUN wget https://aka.ms/download-jdk/microsoft-jdk-17.0.9-linux-x64.tar.gz -O jdk.tar.gz \
&& tar -xzf jdk.tar.gz \
&& rm -rf jdk.tar.gz \
&& mv jdk-17.0.9+8 jdk

# set the environment variable
ENV JAVA_HOME=/opt/jdk
ENV PATH=$JAVA_HOME/bin:$PATH

WORKDIR /data-juicer
# install Isaac Sim
ENV UV_HTTP_TIMEOUT=300
RUN uv pip install isaacsim[all,extscache]==5.1.0 --extra-index-url https://pypi.nvidia.com --system

# install uv
RUN pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
# install Isaac Lab 2.3
ENV ACCEPT_EULA=Y
ENV OMNI_KIT_ACCEPT_EULA=Y
RUN mkdir -p /third-party
RUN uv pip install usd-core --system
# clone and install Isaac Lab
RUN cd /tmp && git clone https://github.com/isaac-sim/IsaacLab.git isaaclab && mv /tmp/isaaclab /third-party/isaaclab \
&& cd /third-party/isaaclab \
&& git checkout v2.3.0 \
&& ./isaaclab.sh --install

# set env vars for Isaac Lab
ENV ISAACLAB_ROOT_PATH=/third-party/isaaclab ISAACLAB_VERSION=2.3.0

# install requirements which need to be installed from source
# modify assets.py for customized assets
RUN wget https://pai-vision-data-sh.oss-cn-shanghai.aliyuncs.com/aigc-data/isaac/assets.py -O /third-party/isaaclab/source/isaaclab/isaaclab/utils/assets.py

WORKDIR /data-juicer

# install basic dependencies for Data-Juicer
RUN uv pip install --upgrade setuptools==69.5.1 setuptools_scm -i https://pypi.tuna.tsinghua.edu.cn/simple --system \
&& uv pip install http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/recognize-anything-main.zip -i https://pypi.tuna.tsinghua.edu.cn/simple --system
&& uv pip install git+https://github.com/HYLcool/recognize-anything.git -i https://pypi.tuna.tsinghua.edu.cn/simple --system

# install data-juicer then
# copy source code and install
COPY . .
RUN uv pip install -v -e .[all] -i https://pypi.tuna.tsinghua.edu.cn/simple --system \
&& python -c "import nltk; nltk.download('punkt_tab'); nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('averaged_perceptron_tagger_eng')"

# 最终入口配置
CMD ["/bin/bash"]