Skip to content

Commit

Permalink
[DRAFT] CI experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
alliepiper committed Apr 23, 2024
1 parent bfb8fe9 commit e86ef84
Show file tree
Hide file tree
Showing 15 changed files with 1,157 additions and 351 deletions.
10 changes: 6 additions & 4 deletions .devcontainer/make_devcontainers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ while [[ $# -gt 0 ]]; do
done

MATRIX_FILE="../ci/matrix.yaml"
COMPUTE_MATRIX="../ci/compute-matrix.py"

# Enable verbose mode if requested
if [ "$VERBOSE" = true ]; then
Expand All @@ -82,16 +83,17 @@ if [ "$VERBOSE" = true ]; then
fi

# Read matrix.yaml and convert it to json
matrix_json=$(yq -o json ${MATRIX_FILE})
matrix_json=$(python3 ${COMPUTE_MATRIX} ${MATRIX_FILE} --devcontainer-info)

# Exclude Windows environments
readonly matrix_json=$(echo "$matrix_json" | jq 'del(.pull_request.nvcc[] | select(.os | contains("windows")))')
if [ "$VERBOSE" = true ]; then
echo "$matrix_json"
fi

# Get the devcontainer image version and define image tag root
readonly DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version')

# Get unique combinations of cuda version, compiler name/version, and Ubuntu version
readonly combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_exe: .compiler.exe, compiler_version: .compiler.version, os: .os}] | unique | .[]')
readonly combinations=$(echo "$matrix_json" | jq -c '.combinations[]')

# Update the base devcontainer with the default values
# The root devcontainer.json file is used as the default container as well as a template for all
Expand Down
25 changes: 0 additions & 25 deletions .github/actions/compute-matrix/action.yml

This file was deleted.

82 changes: 0 additions & 82 deletions .github/actions/compute-matrix/compute-matrix.sh

This file was deleted.

45 changes: 45 additions & 0 deletions .github/workflows/ci-dispatch-group.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: "CI/Dispatch/Group"

# Reusable workflow that fans one group of jobs out to the per-job dispatch workflows.
#
# `inputs.jobs` is a JSON document passed as a string. Two of its keys are read here:
#   - `standalone`: list of matrix entries, each supplying `name`, `runner`, `image` and `command`.
#   - `two_stage`:  list of matrix entries, each supplying `producers` and `consumers`
#                   (re-serialized with toJSON and handed to the two-stage workflow).

defaults:
  run:
    shell: bash -euo pipefail {0}

on:
  workflow_call:
    inputs:
      name: {type: string, required: true}
      jobs: {type: string, required: true}

permissions:
  contents: read

jobs:
  # One ci-dispatch-job.yml invocation per entry of `jobs.standalone`.
  standalone-jobs:
    name: '[Standalone]'
    permissions:
      id-token: write
      contents: read
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        include: ${{fromJSON(inputs.jobs)['standalone']}}
    uses: ./.github/workflows/ci-dispatch-job.yml
    with:
      name: ${{ matrix.name }}
      runner: ${{ matrix.runner }}
      image: ${{ matrix.image }}
      command: ${{ matrix.command }}

  # One ci-dispatch-two-stage.yml invocation per entry of `jobs.two_stage`.
  two-stage-jobs:
    name: '[TwoStage]'
    permissions:
      id-token: write
      contents: read
    strategy:
      fail-fast: false
      matrix:
        include: ${{fromJSON(inputs.jobs)['two_stage']}}
    uses: ./.github/workflows/ci-dispatch-two-stage.yml
    with:
      # Workflow inputs are strings, so the nested lists are serialized back to JSON.
      producers: ${{ toJSON(matrix.producers) }}
      consumers: ${{ toJSON(matrix.consumers) }}
145 changes: 145 additions & 0 deletions .github/workflows/ci-dispatch-job.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
name: 'CI/Dispatch/Job'

# Caution when depending on this workflow: its `result` is reported as a failure even when the
# dispatched job succeeded.
#
# A call is routed to one of several internal jobs. The jobs that do not match the runner are
# given an empty matrix (instead of an `if`, so that skipped dispatch jobs stay out of the GHA UI),
# which can leave them in an error state. `continue-on-error` is set on those jobs with the
# intent of keeping such errors from failing the workflow, but that does not work in practice.
#
# Consequently, check the `success` output rather than the `result`:
#
# ```
# dependent_job:
#   needs: dispatch-job
#   if: ${{ !cancelled() && needs.dispatch-job.outputs.success }}
# ```

defaults:
  run:
    shell: bash -euo pipefail {0}

on:
  workflow_call:
    outputs:
      # 'true' as soon as any internal job published SUCCESS=true.
      success:
        value: ${{ contains(toJSON(jobs.*.outputs.success), 'true') }}
    inputs:
      name:
        type: string
        required: true
      image:
        type: string
        required: true
      runner:
        type: string
        required: true
      command:
        type: string
        required: true
      env:
        type: string
        required: false
      # Single-entry matrix handed to the internal job whose runner prefix matches;
      # the other internal jobs receive '[]' instead.
      dummy_matrix:
        type: string
        required: false
        default: '[{"valid": true}]'

permissions:
  contents: read

jobs:
  # Runs `inputs.command` inside the `inputs.image` container.
  # The matrix holds `inputs.dummy_matrix` when `inputs.runner` starts with 'linux' and is
  # empty ('[]') otherwise; `continue-on-error` is enabled in the non-matching case.
  linux:
    name: ${{inputs.name}}
    continue-on-error: ${{ ! startsWith(inputs.runner, 'linux') }}
    outputs:
      success: ${{ steps.done.outputs.SUCCESS }}
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix:
        include: ${{ fromJSON(startsWith(inputs.runner, 'linux') && inputs.dummy_matrix || '[]') }}
    runs-on: ${{inputs.runner}}
    container:
      options: -u root
      image: ${{inputs.image}}
      env:
        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
        with:
          path: cccl
          persist-credentials: false
      - name: Move files to coder user home directory
        run: |
          cp -R cccl /home/coder/cccl
          chown -R coder:coder /home/coder/
      - name: Add NVCC problem matcher
        run: |
          echo "::add-matcher::cccl/.github/problem-matchers/problem-matcher.json"
      - name: Configure credentials and environment variables for sccache
        uses: ./cccl/.github/actions/configure_cccl_sccache
      - name: Run command
        shell: su coder {0}
        # Inputs are passed through the environment instead of being spliced into the script
        # text with `${{ }}`, so quotes or `$(...)` inside them cannot alter the script itself.
        env:
          COMMAND: ${{inputs.command}}
          IMAGE: ${{inputs.image}}
        run: |
          set -eo pipefail
          cd ~/cccl
          echo -e "\e[1;34mRunning as 'coder' user in $(pwd):\e[0m"
          echo -e "\e[1;34m${COMMAND}\e[0m"
          # Capture the exit status instead of aborting so the reproduction hints can be printed.
          eval "${COMMAND}" || exit_code=$?
          if [ -n "${exit_code:-}" ]; then
            echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m"
            echo "::error:: To replicate this failure locally, follow the steps below:"
            echo "1. Clone the repository, and navigate to the correct branch and commit:"
            echo " git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
            echo ""
            echo "2. Run the failed command inside the same Docker container used by the CI:"
            echo " docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${IMAGE} ${COMMAND}"
            echo ""
            echo "For additional information, see:"
            echo " - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md"
            echo " - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md"
            echo "::endgroup::"
            exit $exit_code
          fi
      # Only reached when every previous step succeeded; feeds the workflow-level `success` output.
      - name: Mark job as successful
        id: done
        run: echo "SUCCESS=true" | tee -a "${GITHUB_OUTPUT}"

  # Runs `inputs.command` through `docker run ... powershell -c` on the runner.
  # The matrix holds `inputs.dummy_matrix` when `inputs.runner` starts with 'windows' and is
  # empty ('[]') otherwise; `continue-on-error` is enabled in the non-matching case.
  windows:
    name: ${{inputs.name}}
    continue-on-error: ${{ ! startsWith(inputs.runner, 'windows') }}
    outputs:
      success: ${{ steps.done.outputs.SUCCESS }}
    permissions:
      id-token: write
      contents: read
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJSON(startsWith(inputs.runner, 'windows') && inputs.dummy_matrix || '[]') }}
    runs-on: ${{inputs.runner}}
    # Environment values are strings; quoted so YAML does not read them as int/bool first.
    env:
      SCCACHE_BUCKET: rapids-sccache-devs
      SCCACHE_REGION: us-east-2
      SCCACHE_IDLE_TIMEOUT: "0"
      SCCACHE_S3_USE_SSL: "true"
      SCCACHE_S3_NO_CREDENTIALS: "false"
    steps:
      - name: Get AWS credentials for sccache bucket
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
          aws-region: us-east-2
          role-duration-seconds: 43200 # 12 hours
      - name: Fetch ${{ inputs.image }}
        run: docker pull ${{ inputs.image }}
      - name: Run Command
        # Literal block scalar: the SetEnvironmentVariable statements have no `;` separators,
        # so each one must stay on its own line inside the quoted PowerShell script.
        # NOTE(review): the `${{ }}` expressions below (including `github.ref_name` and
        # `inputs.command`) are expanded into the script text unescaped -- confirm the callers
        # only pass trusted values.
        run: |-
          docker run ${{ inputs.image }} powershell -c "[System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}')
          [System.Environment]::SetEnvironmentVariable('AWS_SECRET_ACCESS_KEY','${{env.AWS_SECRET_ACCESS_KEY}}')
          [System.Environment]::SetEnvironmentVariable('AWS_SESSION_TOKEN','${{env.AWS_SESSION_TOKEN }}')
          [System.Environment]::SetEnvironmentVariable('SCCACHE_BUCKET','${{env.SCCACHE_BUCKET}}')
          [System.Environment]::SetEnvironmentVariable('SCCACHE_REGION','${{env.SCCACHE_REGION}}')
          [System.Environment]::SetEnvironmentVariable('SCCACHE_IDLE_TIMEOUT','${{env.SCCACHE_IDLE_TIMEOUT}}')
          [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_USE_SSL','${{env.SCCACHE_S3_USE_SSL}}')
          [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_NO_CREDENTIALS','${{env.SCCACHE_S3_NO_CREDENTIALS}}')
          git clone https://github.com/NVIDIA/cccl.git;
          cd cccl;
          git fetch --all;
          git checkout ${{github.ref_name}};
          ${{inputs.command}}"
      # Only reached when every previous step succeeded; feeds the workflow-level `success` output.
      - name: Mark job as successful
        id: done
        run: echo "SUCCESS=true" | tee -a "${GITHUB_OUTPUT}"
54 changes: 54 additions & 0 deletions .github/workflows/ci-dispatch-two-stage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
name: 'CI/Dispatch/TwoStage'

# Reusable workflow that runs a set of producer jobs and then, if they succeeded,
# a set of consumer jobs. Both sets arrive as JSON lists serialized into strings.

defaults:
  run:
    shell: bash -euo pipefail {0}

on:
  workflow_call:
    inputs:
      producers:
        type: string
        required: true
      consumers:
        type: string
        required: true

permissions:
  contents: read

jobs:
  # Output variables cannot be accumulated across a matrix, and the `result` of the
  # dispatch-job workflow cannot be used to determine success (see the note in
  # ci-dispatch-job.yml). Because results from several producers cannot be combined,
  # only a single producer is supported for now.
  producers:
    name: '[Producer]'
    permissions:
      id-token: write
      contents: read
    strategy:
      fail-fast: false
      matrix:
        include: ${{fromJSON(inputs.producers)}}
    uses: ./.github/workflows/ci-dispatch-job.yml
    with:
      name: ${{ matrix.name }}
      runner: ${{ matrix.runner }}
      image: ${{ matrix.image }}
      command: ${{ matrix.command }}

  # Gated on the producers' `success` output: the dispatch-job `result` is always reported
  # as a failure, so it cannot be used here. See ci-dispatch-job.yml for details.
  consumers:
    name: '[Consumer]'
    needs: producers
    if: ${{ !cancelled() && fromJson(needs.producers.outputs.success) }}
    permissions:
      id-token: write
      contents: read
    strategy:
      fail-fast: false
      matrix:
        include: ${{fromJSON(inputs.consumers)}}
    uses: ./.github/workflows/ci-dispatch-job.yml
    with:
      name: ${{ matrix.name }}
      runner: ${{ matrix.runner }}
      image: ${{ matrix.image }}
      command: ${{ matrix.command }}
1 change: 1 addition & 0 deletions .github/workflows/dispatch-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
workflow_call:
inputs:
project_name: {type: string, required: true}
job_type: {type: string, required: true}
per_cuda_compiler_matrix: {type: string, required: true}
devcontainer_version: {type: string, required: true}
is_windows: {type: boolean, required: true}
Expand Down
Loading

0 comments on commit e86ef84

Please sign in to comment.