Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/blossom-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
# See LICENSE for license information.

# A workflow to trigger ci on hybrid infra (github + self hosted runner)

# DISABLED in FlagOS
name: Blossom-CI
on:
issue_comment:
types: [created]
types: [__disabled_do_not_remove__]
workflow_dispatch:
inputs:
platform:
Expand Down
88 changes: 14 additions & 74 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,90 +8,30 @@ on:
pull_request:
workflow_dispatch:
jobs:
core:
name: 'Core'
runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04
options: --user root
steps:
- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: 'Build'
run: pip install --no-build-isolation . -v
env:
NVTE_FRAMEWORK: none
MAX_JOBS: 1
- name: 'Sanity check'
run: python3 -c "import transformer_engine"
working-directory: /
pytorch:
name: 'PyTorch'
runs-on: ubuntu-latest
runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
defaults:
run:
shell: bash
container:
image: nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04
image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
options: --user root
steps:
- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: 'Build'
run: pip install --no-build-isolation . -v --no-deps
# Literal block scalar (`|`) keeps each command on its own line. As a plain
# multi-line scalar the three lines fold into a single command, so
# `conda activate` and `pip install` would only be arguments to `source`.
run: |
  source /opt/miniconda3/etc/profile.d/conda.sh
  conda activate flagscale-train
  pip install --no-build-isolation . -v --no-deps
env:
NVTE_FRAMEWORK: pytorch
MAX_JOBS: 1
- name: 'Sanity check'
run: python3 tests/pytorch/test_sanity_import.py
jax:
name: 'JAX'
runs-on: ubuntu-latest
container:
image: ghcr.io/nvidia/jax:jax
options: --user root
steps:
- name: 'Dependencies'
run: pip install pybind11[global] nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: 'Build'
run: pip install --no-build-isolation . -v
env:
NVTE_FRAMEWORK: jax
MAX_JOBS: 1
- name: 'Sanity check'
run: python3 tests/jax/test_sanity_import.py
all:
name: 'All'
runs-on: ubuntu-latest
container:
image: ghcr.io/nvidia/jax:jax
options: --user root
steps:
- name: 'Dependencies'
run: pip install torch pybind11[global] einops onnxscript nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: 'Build'
run: pip install --no-build-isolation . -v --no-deps
env:
NVTE_FRAMEWORK: all
MAX_JOBS: 1
TE_WITH_NCCL: 1
- name: 'Sanity check'
run: python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py
# Literal block scalar (`|`) keeps each command on its own line. As a plain
# multi-line scalar the lines fold into one command and the import test is
# never executed.
run: |
  source /opt/miniconda3/etc/profile.d/conda.sh
  conda activate flagscale-train
  python3 tests/pytorch/test_sanity_import.py
2 changes: 1 addition & 1 deletion .github/workflows/deploy_nightly_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
name: Deploy nightly docs
on:
push:
branches: [ "main" ]
branches: [ "__disabled_do_not_remove__" ]
jobs:
build:
uses: ./.github/workflows/docs.yml
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/license.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# A workflow to trigger the TE license check on GitHub
name: 'License'
on:
pull_request:
# An event key takes a mapping of filters, not a sequence. Restrict the trigger
# to a placeholder branch name so it never matches a real pull request.
pull_request:
  branches: [ "__disabled_do_not_remove__" ]
workflow_dispatch:
jobs:
check:
Expand Down
32 changes: 32 additions & 0 deletions .github/workflows/qa-format.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Runs the repository's formatting check (qa/format.sh) on pull requests
# that target main.
name: format_check

on:
  pull_request:
    branches:
      - "main"
    types:
      - opened
      - synchronize
      - reopened

jobs:
  format:
    runs-on: ubuntu-22.04
    env:
      # Pull-request number; used below to fetch the PR's merge ref.
      PRID: ${{ github.event.pull_request.number }}
      # Not referenced in this workflow; presumably read by qa/format.sh - confirm.
      BRANCH: main

    steps:
      # Start from the PR's base branch.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.base.ref }}

      # Fetch the PR's merge ref and check it out as local branch `test`.
      - name: Merge PR to sub-branch
        run: |
          git fetch origin pull/${PRID}/merge
          git checkout -b test FETCH_HEAD

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is not read as the float 3.1.
          python-version: "3.10"

      - name: Run pre-commit
        run: bash ./qa/format.sh
78 changes: 78 additions & 0 deletions .github/workflows/qa-l0-pytorch-wheel.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Runs the L0 PyTorch wheel QA script (qa/L0_pytorch_wheel/test.sh) inside the
# FlagScale CUDA/PyTorch container on a self-hosted 8-GPU runner.
name: QA Pytorch Wheel

# Both triggers are filtered to the placeholder branch name
# `__disabled_do_not_remove__`, so they only fire for a branch of that name.
on:
  push:
    branches:
      - __disabled_do_not_remove__
  pull_request:
    branches:
      - __disabled_do_not_remove__

# One run per workflow / PR-or-ref / actor; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true

jobs:
  qa-l0-pytorch-wheel:
    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
    defaults:
      run:
        shell: bash
    container:
      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted so the mapping is always read as a string.
        - "80:80"
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always

    steps:
      # Check out the PR head (repository and ref come from the pull_request
      # event payload) together with all submodules.
      - name: Checkout Code
        uses: actions/checkout@v6.0.1
        with:
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}
          ssh-strict: true
          ssh-user: git
          persist-credentials: true
          clean: true
          sparse-checkout-cone-mode: true
          fetch-tags: false
          show-progress: true
          lfs: false
          submodules: recursive
          set-safe-directory: true

      - name: L0 Pytorch Wheel
        # Referenced by the upload step's `if:` condition below.
        id: L0_pytorch_wheel
        # timeout-minutes: 50
        env:
          TE_PATH: .
          RUN_LOG: /logs/pytorch/wheel
        run: |
          echo "TE_PATH: ${TE_PATH}"
          # Patch the test script in place: the `cd transformer_engine/pytorch`
          # line becomes `pushd`, and the `cd $TE_PATH` on line 44 becomes `popd`.
          # NOTE(review): presumably needed because TE_PATH is the relative
          # path `.` - confirm. The second edit depends on the script's line
          # numbering staying stable.
          sed -i "s/^cd transformer_engine\/pytorch\s*$/pushd transformer_engine\/pytorch/" qa/L0_pytorch_wheel/test.sh
          sed -i '44 s/^cd \s*\$TE_PATH\s*$/popd/' qa/L0_pytorch_wheel/test.sh

          cat qa/L0_pytorch_wheel/test.sh
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          pip uninstall -y transformer_engine

          # Make sure the log directory exists before tee writes into it.
          mkdir -p "${RUN_LOG}"
          bash qa/L0_pytorch_wheel/test.sh | tee "${RUN_LOG}/pytorch_wheel-${{ github.run_id }}.log"

      # Upload the captured log only when the wheel step itself failed.
      - name: Upload Installation Logs
        if: always() && steps.L0_pytorch_wheel.outcome == 'failure'
        uses: actions/upload-artifact@v4
        with:
          name: L0-pytorch-logs-${{ github.run_id }}
          path: /logs/pytorch/wheel
          retention-days: 7
          if-no-files-found: warn
Loading