Skip to content

update gpt-neox image to gpt-neox v2.0 and combine with gpt-neox build #205

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,10 @@ build-pytorch10-tf27-rocm50:
-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.

DEEPSPEED_VERSION := 0.8.3
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
DEEPSPEED_VERSION := 0.9.2
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-gpt-neox-deepspeed$(GPU_SUFFIX)
export TORCH_PIP_DEEPSPEED_GPU := torch==1.12.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1

# This builds deepspeed environment off of upstream microsoft/DeepSpeed.
Expand Down Expand Up @@ -239,8 +239,8 @@ build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
--build-arg "$(NCCL_BUILD_ARG)" \
--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@eleuther_dai" \
--build-arg DET_BUILD_NCCL="" \
--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@determined2#egg=deepspeed" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
Expand Down
29 changes: 26 additions & 3 deletions dockerfile_scripts/install_deepspeed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,30 @@
set -e

DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
# Triton is needed to build deepspeed's sparse_attn operation.
python -m pip install triton==1.0.0
DS_BUILD_OPS=1 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
# Skip building the sparse attention op (DS_BUILD_SPARSE_ATTN=0): it depends on a very old version of triton.
DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
python -m deepspeed.env_report

if [[ "$DEEPSPEED_PIP" == *"determined2"* ]]; then
    # Build gpt-neox and its dependencies, but only when the gpt-neox
    # ("determined2") fork of deepspeed is the one being installed.

    # Single absolute checkout location. Every step below uses it, so the
    # block no longer depends on the script being run with / as the working
    # directory (previously the clone landed in the cwd while later steps
    # read from /gpt-neox).
    GPT_NEOX_DIR=/gpt-neox

    # Triton is needed for flash attn
    python -m pip install triton==2.0.0.dev20221202
    # This is a dependency of gpt-neox
    DEBIAN_FRONTEND=noninteractive apt-get install -y mpich
    # Need this to avoid `AttributeError: module 'distutils' has no attribute 'version'` when importing tensorboard. See https://github.com/pytorch/pytorch/issues/69894.
    # Use `python -m pip` (as the rest of this script does) so the package is
    # installed for the same interpreter that runs the build.
    python -m pip install setuptools==59.5.0

    # Install gpt-neox and dependencies
    git clone -b determined2 https://github.com/determined-ai/gpt-neox.git "$GPT_NEOX_DIR"
    python "$GPT_NEOX_DIR/megatron/fused_kernels/setup.py" install

    # Exclude DeeperSpeed reinstall since the version in requirements is not pinned.
    # The filtered list is written next to the original file and installed with
    # -r, so pip parses it as a requirements file (comments, environment
    # markers and option lines survive) instead of the shell word-splitting it
    # into command-line arguments.
    grep -ivE "DeeperSpeed" "$GPT_NEOX_DIR/requirements/requirements.txt" \
        > "$GPT_NEOX_DIR/requirements/requirements-no-deeperspeed.txt"
    python -m pip install -r "$GPT_NEOX_DIR/requirements/requirements-no-deeperspeed.txt"
    python -m pip install -r "$GPT_NEOX_DIR/requirements/requirements-flashattention.txt"

    # Download sample data.
    # Kept as two separate commands: in an `a && b` list a failure of `a` is
    # not caught by `set -e`, which would let a failed download go unnoticed.
    gsutil cp -r gs://determined-ai-public-datasets/text_data "$GPT_NEOX_DIR"
    mv "$GPT_NEOX_DIR/text_data" "$GPT_NEOX_DIR/data"

    # Modify permissions to enable example to run in nonroot mode
    chmod -R 777 "$GPT_NEOX_DIR"
    chmod -R 777 /tmp
    # The numeric mode above drops the sticky bit from /tmp; restore it so
    # users cannot delete each other's temporary files.
    chmod 1777 /tmp
fi