Skip to content

Commit 1532531

Browse files
booxter authored and RobotSail committed
ci: add 3.12 smoke workflow flavor
Yes, it's a lot of duplication. Until we have some way to generate workflows from a template, we have to have it: matrix doesn't apply because if used, it will use the same single ec2 runner for both 3.11 and 3.12 runs - and while it works, it slows down feedback almost by x2 because these are run sequentially. Signed-off-by: Ihar Hrachyshka <[email protected]>
1 parent 2d2da55 commit 1532531

File tree

3 files changed

+231
-63
lines changed

3 files changed

+231
-63
lines changed
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
name: 'Run smoke tests'
2+
description: 'Runs smoke tests'
3+
inputs:
4+
python-version:
5+
required: true
6+
description: >-
7+
Python version to use. Must be in the form of "3.xx".
8+
runs:
9+
using: "composite"
10+
steps:
11+
- name: "Harden runner"
12+
uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
13+
with:
14+
egress-policy: audit
15+
16+
- name: "Install packages"
17+
shell: bash
18+
run: |
19+
cat /etc/os-release
20+
sudo dnf install -y gcc gcc-c++ make git-core python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
21+
22+
- name: "Verify cuda environment is setup"
23+
shell: bash
24+
run: |
25+
export CUDA_HOME="/usr/local/cuda"
26+
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
27+
export PATH="${PATH}:${CUDA_HOME}/bin"
28+
nvidia-smi
29+
30+
# installs in $GITHUB_WORKSPACE/venv.
31+
# only has to install Tox because Tox will do the other virtual environment management.
32+
- name: "Setup Python virtual environment"
33+
shell: bash
34+
run: |
35+
python${{ inputs.python-version }} -m venv --upgrade-deps venv
36+
. venv/bin/activate
37+
pip install tox
38+
39+
# flash-attn has a bug in the setup.py that causes pip to attempt
40+
# installing it before torch is installed. This is a bug because their
41+
# setup.py depends on importing the module, so it should have been listed
42+
# in build_requires. Alas. See:
43+
# https://github.com/Dao-AILab/flash-attention/pull/958
44+
- name: "Install torch and other unlisted build dependencies for flash-attn"
45+
shell: bash
46+
run: |
47+
source venv/bin/activate
48+
# The list is taken from the pull request linked above
49+
pip install torch packaging setuptools wheel psutil ninja
50+
51+
- name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
52+
shell: bash
53+
run: |
54+
source venv/bin/activate
55+
pip install tox-current-env
56+
57+
- name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
58+
shell: bash
59+
run: |
60+
source venv/bin/activate
61+
tox -e py3-smoke --print-deps-to-file=./deps.txt
62+
pip install -r ./deps.txt --no-build-isolation
63+
pip install .
64+
65+
- name: "Show disk utilization BEFORE tests"
66+
shell: bash
67+
if: always()
68+
run: |
69+
df -h
70+
71+
- name: "Run smoke tests with Tox and Pytest"
72+
shell: bash
73+
run: |
74+
source venv/bin/activate
75+
tox --current-env -e py3-smoke
76+
77+
- name: "Show disk utilization AFTER tests"
78+
shell: bash
79+
if: always()
80+
run: |
81+
df -h

.github/workflows/smoke-py312.yaml

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
name: "Run smoke tests via Tox::pytest (python 3.12)"
4+
# These tests will be long running and require accelerated hardware.
5+
6+
on:
7+
workflow_dispatch:
8+
inputs:
9+
branch:
10+
type: string
11+
default: main
12+
# using this rather than pull_request because this workflow
13+
# needs to run in the context of the base branch (main) and
14+
# access the repo's secrets to start the AWS instances.
15+
pull_request_target:
16+
branches:
17+
- main
18+
- release-*
19+
paths:
20+
# note this should match the merging criteria in 'mergify.yml'
21+
- "**.py"
22+
- "tox.ini"
23+
- "pyproject.toml"
24+
- "requirements-dev.txt"
25+
- "requirements-cuda.txt"
26+
27+
permissions:
28+
contents: read
29+
30+
defaults:
31+
run:
32+
shell: bash
33+
34+
env:
35+
ec2_runner_variant: "g6e.12xlarge" # 4x L40s
36+
37+
jobs:
38+
start-large-ec2-runner:
39+
runs-on: ubuntu-latest
40+
outputs:
41+
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
42+
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
43+
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
44+
steps:
45+
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
46+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
47+
with:
48+
repository: instructlab/ci-actions
49+
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
50+
path: ci-actions
51+
ref: release-v0.1
52+
sparse-checkout: |
53+
actions/launch-ec2-runner-with-fallback
54+
55+
- name: Launch EC2 Runner with Fallback
56+
id: launch-ec2-instance-with-fallback
57+
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
58+
env:
59+
TMPDIR: "/tmp"
60+
with:
61+
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
62+
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
63+
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
64+
regions_config: >
65+
[
66+
{
67+
"region": "us-east-2",
68+
"subnets": {
69+
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
70+
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
71+
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
72+
},
73+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
74+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
75+
},
76+
{
77+
"region": "us-east-1",
78+
"subnets": {
79+
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
80+
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
81+
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
82+
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
83+
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
84+
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
85+
},
86+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
87+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
88+
}
89+
]
90+
try_spot_instance_first: false
91+
ec2_instance_type: ${{ env.ec2_runner_variant }}  # reuse the workflow-level env value (g6e.12xlarge) instead of repeating the literal
92+
aws_resource_tags: >
93+
[
94+
{"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
95+
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
96+
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
97+
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
98+
]
99+
100+
run-smoke-tests:
101+
needs:
102+
- start-large-ec2-runner
103+
runs-on: ${{needs.start-large-ec2-runner.outputs.label}}
104+
# It is important that this job has no write permissions and has
105+
# no access to any secrets. This part is where we are running
106+
# untrusted code from PRs.
107+
permissions: {}
108+
steps:
109+
- name: "Checkout code"
110+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
111+
with:
112+
fetch-depth: 0
113+
ref: ${{inputs.branch}}
114+
115+
- name: Run smoke tests
116+
uses: ./.github/actions/run-smoke
117+
with:
118+
python-version: "3.12"  # quoted: a bare 3.12 is a YAML float, and a version like 3.10 would collapse to 3.1
119+
120+
stop-large-ec2-runner:
121+
needs:
122+
- start-large-ec2-runner
123+
- run-smoke-tests
124+
runs-on: ubuntu-latest
125+
if: ${{ always() }}
126+
steps:
127+
- name: "Harden runner"
128+
uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
129+
with:
130+
egress-policy: audit
131+
132+
- name: "Configure AWS credentials"
133+
uses: "aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722" # v4.1.0
134+
with:
135+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
136+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
137+
aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
138+
139+
- name: "Stop EC2 runner"
140+
uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
141+
with:
142+
mode: stop
143+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
144+
label: ${{ needs.start-large-ec2-runner.outputs.label }}
145+
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

.github/workflows/smoke.yaml

Lines changed: 5 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22

3-
name: "Run smoke tests via Tox::pytest"
3+
name: "Run smoke tests via Tox::pytest (python 3.11)"
44
# These tests will be long running and require accelerated hardware.
55

66
on:
@@ -106,74 +106,16 @@ jobs:
106106
# untrusted code from PRs.
107107
permissions: {}
108108
steps:
109-
- name: "Harden runner"
110-
uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
111-
with:
112-
egress-policy: audit
113-
114-
- name: "Install packages"
115-
run: |
116-
cat /etc/os-release
117-
sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel
118-
119-
- name: "Verify cuda environment is setup"
120-
run: |
121-
export CUDA_HOME="/usr/local/cuda"
122-
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
123-
export PATH="${PATH}:${CUDA_HOME}/bin"
124-
nvidia-smi
125-
126109
- name: "Checkout code"
127110
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
128111
with:
129112
fetch-depth: 0
130113
ref: ${{inputs.branch}}
131114

132-
# installs in $GITHUB_WORKSPACE/venv.
133-
# only has to install Tox because Tox will do the other virtual environment management.
134-
- name: "Setup Python virtual environment"
135-
run: |
136-
python3.11 -m venv --upgrade-deps venv
137-
. venv/bin/activate
138-
pip install tox
139-
140-
# flash-attn has a bug in the setup.py that causes pip to attempt
141-
# installing it before torch is installed. This is a bug because their
142-
# setup.py depends on importing the module, so it should have been listed
143-
# in build_requires. Alas.
144-
# See: https://github.com/Dao-AILab/flash-attention/pull/958
145-
- name: "Install torch and other unlisted build dependencies for flash-attn"
146-
run: |
147-
source venv/bin/activate
148-
# The list is taken from the pull request linked above
149-
pip install torch packaging setuptools wheel psutil ninja
150-
151-
- name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
152-
run: |
153-
source venv/bin/activate
154-
pip install tox-current-env
155-
156-
- name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
157-
run: |
158-
source venv/bin/activate
159-
tox -e py3-smoke --print-deps-to-file=./deps.txt
160-
pip install -r ./deps.txt --no-build-isolation
161-
pip install .
162-
163-
- name: "Show disk utilization BEFORE tests"
164-
if: always()
165-
run: |
166-
df -h
167-
168-
- name: "Run smoke tests with Tox and Pytest"
169-
run: |
170-
source venv/bin/activate
171-
tox --current-env -e py3-smoke
172-
173-
- name: "Show disk utilization AFTER tests"
174-
if: always()
175-
run: |
176-
df -h
115+
- name: Run smoke tests
116+
uses: ./.github/actions/run-smoke
117+
with:
118+
python-version: "3.11"  # quoted: a bare 3.11 is a YAML float, and a version like 3.10 would collapse to 3.1
177119

178120
stop-large-ec2-runner:
179121
needs:

0 commit comments

Comments
 (0)