# Self-hosted runner with slow tests (scheduled) #1118
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Slow-test workflow: triggered by a daily schedule and by manual dispatch.
name: Self-hosted runner with slow tests (scheduled)

on:
  schedule:
    # Cron fields: minute 0, hour 2, every day of every month.
    - cron: '0 2 * * *'
  # Also allow the workflow to be started by hand.
  workflow_dispatch:

# Workflow-wide environment, shared by every job below.
env:
  IS_GITHUB_CI: '1'
  RUN_SLOW: 'yes'
  # To be able to run tests on CUDA 12.2
  NVIDIA_DISABLE_REQUIRE: '1'
  # Taken from the repository secret of the same purpose; never hard-code it.
  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

# Start from an empty permission set for the workflow token.
permissions: {}

jobs:
| run_all_tests_single_gpu: | |
| strategy: | |
| fail-fast: false | |
| runs-on: | |
| group: aws-g6-4xlarge-plus | |
| env: | |
| CUDA_VISIBLE_DEVICES: "0" | |
| TEST_TYPE: "single_gpu" | |
| container: | |
| image: huggingface/peft-gpu:latest | |
| options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true | |
| defaults: | |
| run: | |
| shell: bash | |
| steps: | |
| - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | |
| with: | |
| persist-credentials: false | |
| - name: Pip install | |
| run: | | |
| source activate peft | |
| pip install -e . --no-deps | |
| pip install pytest-reportlog | |
| - name: Run common tests on single GPU | |
| id: common_tests | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_common_gpu | |
| - name: Run examples on single GPU | |
| id: examples | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_examples_single_gpu | |
| - name: Run core tests on single GPU | |
| id: core_tests | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_core_single_gpu | |
| - name: Run regression tests on single GPU | |
| id: regression | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_regression | |
| - name: Generate Report | |
| if: always() | |
| run: | | |
| pip install slack_sdk tabulate | |
| python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY | |
| - name: Check for test failures | |
| if: | | |
| steps.common_tests.outcome == 'failure' || | |
| steps.examples.outcome == 'failure' || | |
| steps.core_tests.outcome == 'failure' || | |
| steps.regression.outcome == 'failure' | |
| run: | | |
| echo "One or more test suites failed. Check the logs above." | |
| exit 1 | |
| run_all_tests_multi_gpu: | |
| strategy: | |
| fail-fast: false | |
| runs-on: | |
| group: aws-g6-12xlarge-plus | |
| env: | |
| CUDA_VISIBLE_DEVICES: "0,1" | |
| TEST_TYPE: "multi_gpu" | |
| container: | |
| image: huggingface/peft-gpu:latest | |
| options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true | |
| defaults: | |
| run: | |
| shell: bash | |
| steps: | |
| - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | |
| with: | |
| persist-credentials: false | |
| - name: Pip install | |
| run: | | |
| source activate peft | |
| pip install -e . --no-deps | |
| pip install pytest-reportlog | |
| - name: Run common tests on multi GPU | |
| id: common_tests | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_common_gpu | |
| - name: Run examples on multi GPU | |
| id: examples | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_examples_multi_gpu | |
| - name: Run core tests on multi GPU | |
| id: core_tests | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_core_multi_gpu | |
| - name: Run training on multi GPU | |
| id: training | |
| continue-on-error: true | |
| run: | | |
| source activate peft | |
| make tests_training | |
| - name: Generate Report | |
| if: always() | |
| run: | | |
| pip install slack_sdk tabulate | |
| python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY | |
| - name: Check for test failures | |
| if: | | |
| steps.common_tests.outcome == 'failure' || | |
| steps.examples.outcome == 'failure' || | |
| steps.core_tests.outcome == 'failure' || | |
| steps.training.outcome == 'failure' | |
| run: | | |
| echo "One or more test suites failed. Check the logs above." | |
| exit 1 |