Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions .github/workflows/flakiness_detector.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
---
name: Flakiness Detector

# Runs after any CI workflow completes and can also be triggered manually for testing.
on:
  workflow_run:
    workflows: ["PR Validation", "Test Data Display"]
    types: [completed]
  workflow_dispatch:
    inputs:
      workflow_run_id:
        description: 'Workflow run ID to analyse (required for manual trigger)'
        required: true
        type: string
      repo:
        description: 'Repository in owner/repo format'
        required: false
        default: 'OWASP-BLT/BLT-Leaf'
        type: string
      pr_number:
        description: 'PR number (optional)'
        required: false
        type: string

# An explicit permissions block sets every UNLISTED scope to "none", so
# contents: read must be granted explicitly or actions/checkout cannot clone.
permissions:
  contents: read       # required by actions/checkout
  issues: write        # create / reopen / comment on flaky-test issues
  pull-requests: write # post flakiness summary comment on PRs
  actions: write       # trigger rerun-failed-jobs

jobs:
  detect:
    name: Detect & Record Flakiness
    runs-on: ubuntu-latest

    # Skip runs triggered by this workflow's own bot commits to prevent loops
    if: >
      github.event_name == 'workflow_dispatch' ||
      (github.event.workflow_run.conclusion != null &&
      !contains(github.event.workflow_run.head_commit.message, '[skip ci]'))

    steps:
      # -----------------------------------------------------------------------
      # 1. Checkout
      # -----------------------------------------------------------------------
      - name: Checkout repository
        uses: actions/checkout@v4

      # -----------------------------------------------------------------------
      # 2. Python + dependencies
      # -----------------------------------------------------------------------
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install Python dependencies
        run: pip install --quiet requests pyyaml

      # -----------------------------------------------------------------------
      # 3. Resolve run metadata (works for both trigger types)
      # -----------------------------------------------------------------------
      - name: Resolve workflow run metadata
        id: meta
        env:
          # Event payload values are passed through env vars rather than being
          # interpolated with ${{ }} inside the script body, so attacker-influenced
          # content (e.g. branch names inside the pull_requests JSON) cannot
          # inject shell code into this step.
          INPUT_RUN_ID: ${{ github.event.inputs.workflow_run_id }}
          INPUT_REPO: ${{ github.event.inputs.repo }}
          INPUT_PR_NUMBER: ${{ github.event.inputs.pr_number }}
          EVENT_RUN_ID: ${{ github.event.workflow_run.id }}
          EVENT_PRS_JSON: ${{ toJson(github.event.workflow_run.pull_requests) }}
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "run_id=${INPUT_RUN_ID}" >> "$GITHUB_OUTPUT"
            echo "repo=${INPUT_REPO}" >> "$GITHUB_OUTPUT"
            echo "pr_number=${INPUT_PR_NUMBER}" >> "$GITHUB_OUTPUT"
          else
            echo "run_id=${EVENT_RUN_ID}" >> "$GITHUB_OUTPUT"
            echo "repo=${{ github.repository }}" >> "$GITHUB_OUTPUT"
            # Extract PR number from workflow_run context (empty string if not a PR run)
            PR=$(printf '%s' "$EVENT_PRS_JSON" \
              | python3 -c "import json,sys; prs=json.load(sys.stdin); print(prs[0]['number'] if prs else '')")
            echo "pr_number=${PR}" >> "$GITHUB_OUTPUT"
          fi

      # -----------------------------------------------------------------------
      # 4. Collect CI results → writes ci_run_history rows to D1, outputs failed jobs
      # -----------------------------------------------------------------------
      - name: Collect CI results
        id: collect
        env:
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          CLOUDFLARE_D1_DATABASE_ID: ${{ secrets.CLOUDFLARE_D1_DATABASE_ID }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          RUN_ID: ${{ steps.meta.outputs.run_id }}
          REPO: ${{ steps.meta.outputs.repo }}
          PR_NUMBER: ${{ steps.meta.outputs.pr_number }}
        run: |
          # GH-Actions bash runs with -e but NOT pipefail, so without this a
          # failing collector would be masked by the succeeding `tee`.
          set -o pipefail

          PR_ARGS=()
          if [ -n "${PR_NUMBER}" ]; then
            PR_ARGS=(--pr-number "${PR_NUMBER}")
          fi

          python scripts/flakiness/collect_ci_results.py \
            --workflow-run-id "${RUN_ID}" \
            --repo "${REPO}" \
            --github-token "${GH_TOKEN}" \
            "${PR_ARGS[@]}" \
            | tee /tmp/collect_output.json

          # Expose whether any test failures were found (one-liner avoids the
          # leading-indentation pitfall of multi-line python3 -c in block scalars)
          FAILED=$(python3 -c "import json; d = json.load(open('/tmp/collect_output.json')); print('true' if d.get('failed_jobs') else 'false')")
          echo "has_failures=${FAILED}" >> "$GITHUB_OUTPUT"

      # -----------------------------------------------------------------------
      # 5. Retry failed jobs (only when there are first-attempt failures)
      # -----------------------------------------------------------------------
      - name: Retry failed jobs
        id: retry
        if: steps.collect.outputs.has_failures == 'true'
        env:
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          CLOUDFLARE_D1_DATABASE_ID: ${{ secrets.CLOUDFLARE_D1_DATABASE_ID }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          RUN_ID: ${{ steps.meta.outputs.run_id }}
          REPO: ${{ steps.meta.outputs.repo }}
        run: |
          set -o pipefail
          python scripts/flakiness/retry_failures.py \
            --workflow-run-id "${RUN_ID}" \
            --repo "${REPO}" \
            --github-token "${GH_TOKEN}" \
            --collect-output /tmp/collect_output.json \
            | tee /tmp/retry_output.json

      # -----------------------------------------------------------------------
      # 6. Re-collect after retry so flake_confirmed rows are recorded in D1
      # -----------------------------------------------------------------------
      - name: Re-collect after retry
        if: steps.collect.outputs.has_failures == 'true'
        env:
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          CLOUDFLARE_D1_DATABASE_ID: ${{ secrets.CLOUDFLARE_D1_DATABASE_ID }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          RUN_ID: ${{ steps.meta.outputs.run_id }}
          REPO: ${{ steps.meta.outputs.repo }}
          PR_NUMBER: ${{ steps.meta.outputs.pr_number }}
        run: |
          PR_ARGS=()
          if [ -n "${PR_NUMBER}" ]; then
            PR_ARGS=(--pr-number "${PR_NUMBER}")
          fi

          python scripts/flakiness/collect_ci_results.py \
            --workflow-run-id "${RUN_ID}" \
            --repo "${REPO}" \
            --github-token "${GH_TOKEN}" \
            "${PR_ARGS[@]}" \
            > /dev/null

      # -----------------------------------------------------------------------
      # 7. Analyse flakiness → upserts flakiness_scores in D1, outputs classification
      # -----------------------------------------------------------------------
      - name: Analyse flakiness
        id: analyze
        env:
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          CLOUDFLARE_D1_DATABASE_ID: ${{ secrets.CLOUDFLARE_D1_DATABASE_ID }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          REPO: ${{ steps.meta.outputs.repo }}
        run: |
          set -o pipefail
          python scripts/flakiness/analyze_flakiness.py \
            --repo "${REPO}" \
            | tee /tmp/flaky_report.json

          FLAKY_COUNT=$(python3 -c "import json; d = json.load(open('/tmp/flaky_report.json')); print(len(d.get('flaky', [])))")
          echo "flaky_count=${FLAKY_COUNT}" >> "$GITHUB_OUTPUT"

      # -----------------------------------------------------------------------
      # 8. Report flakiness to GitHub Issues / PR comment, write local files
      # -----------------------------------------------------------------------
      - name: Report flakiness
        env:
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          CLOUDFLARE_D1_DATABASE_ID: ${{ secrets.CLOUDFLARE_D1_DATABASE_ID }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ steps.meta.outputs.repo }}
          PR_NUMBER: ${{ steps.meta.outputs.pr_number }}
        run: |
          PR_ARGS=()
          if [ -n "${PR_NUMBER}" ]; then
            PR_ARGS=(--pr-number "${PR_NUMBER}")
          fi

          python scripts/flakiness/report_flakiness.py \
            --repo "${REPO}" \
            --github-token "${GH_TOKEN}" \
            "${PR_ARGS[@]}" \
            --flaky-report /tmp/flaky_report.json

      # -----------------------------------------------------------------------
      # 9. Upload report artifacts (flakiness_report.md + flakiness_metrics.json)
      # -----------------------------------------------------------------------
      - name: Upload flakiness artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: flakiness-report
          path: |
            data/flakiness_report.md
            data/flakiness_metrics.json
          if-no-files-found: ignore
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ dist/
*.pyc
__pycache__/
.pytest_cache/

# Flakiness pipeline artifacts
data/flakiness_report.md
data/flakiness_metrics.json
57 changes: 57 additions & 0 deletions migrations/0004_create_flakiness_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
-- Migration: Create flakiness detection tables
-- Created: 2026-03-11
-- Description: Add tables for CI run history, flakiness scores, and known infrastructure issue patterns

-- Per-run record for every CI job execution.
-- One row is inserted per (job, attempt); retries of the same workflow run
-- share workflow_run_id and are distinguished by run_attempt.
CREATE TABLE IF NOT EXISTS ci_run_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    check_name TEXT NOT NULL,               -- name of the CI check / job step
    job_name TEXT NOT NULL,                 -- GitHub Actions job name
    workflow_name TEXT NOT NULL,            -- workflow display name (e.g. "CI")
    workflow_run_id INTEGER NOT NULL,       -- github.run_id
    run_attempt INTEGER NOT NULL DEFAULT 1, -- 1 = first run, 2+ = retry
    status TEXT NOT NULL,                   -- 'pass' | 'fail' | 'skip'
    conclusion_category TEXT NOT NULL,      -- 'test_failure' | 'infrastructure' | 'flake_confirmed' | 'pass' | 'skip'
    commit_sha TEXT NOT NULL,
    pr_number INTEGER,                      -- NULL when not a PR-triggered run
    repo TEXT NOT NULL,                     -- "owner/repo" format
    timestamp TEXT NOT NULL DEFAULT (datetime('now'))
);

-- Covering index for the analyser's per-check history scans, ordered by time.
CREATE INDEX IF NOT EXISTS idx_ci_run_history_lookup
    ON ci_run_history(check_name, job_name, repo, timestamp);

-- Computed flakiness scores per (check_name, job_name) pair.
-- NOTE(review): ci_run_history is scoped by repo, but this table's primary key
-- omits repo — scores for identically-named jobs in different repos would
-- collide on upsert. Confirm the database is single-repo by design.
CREATE TABLE IF NOT EXISTS flakiness_scores (
    check_name TEXT NOT NULL,
    job_name TEXT NOT NULL,
    workflow_name TEXT NOT NULL,
    flakiness_score REAL NOT NULL DEFAULT 0.0,   -- 0.0 – 1.0
    severity TEXT NOT NULL DEFAULT 'stable',     -- 'stable' | 'low' | 'medium' | 'high' | 'deterministic'
    classification TEXT NOT NULL DEFAULT 'stable', -- 'stable' | 'flaky' | 'deterministic'
    total_runs INTEGER NOT NULL DEFAULT 0,
    failure_count INTEGER NOT NULL DEFAULT 0,
    flaky_failures INTEGER NOT NULL DEFAULT 0,   -- failures that passed on re-run
    consecutive_failures INTEGER NOT NULL DEFAULT 0, -- current streak of consecutive failures
    last_updated TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (check_name, job_name)
);

-- Seed patterns used to classify infrastructure failures separately from test flakiness.
-- NOTE(review): matching semantics (substring vs. regex, case sensitivity) are
-- defined by the consuming scripts, not by this schema — verify there before
-- adding patterns that rely on either behaviour.
CREATE TABLE IF NOT EXISTS known_infrastructure_issues (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    pattern TEXT NOT NULL UNIQUE,
    category TEXT NOT NULL DEFAULT 'infrastructure',
    description TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now'))
);

-- OR IGNORE keeps the migration idempotent: re-running it will not duplicate
-- or error on already-seeded patterns (pattern is UNIQUE).
INSERT OR IGNORE INTO known_infrastructure_issues (pattern, category, description) VALUES
    ('ECONNRESET', 'infrastructure', 'TCP connection reset — transient network issue'),
    ('timed_out', 'infrastructure', 'GitHub Actions step conclusion: timed_out'),
    ('timeout', 'infrastructure', 'Generic timeout — network or infrastructure issue'),
    ('rate limit', 'infrastructure', 'API or package registry rate limit hit'),
    ('ETIMEDOUT', 'infrastructure', 'TCP connection timed out'),
    ('fetch failed', 'infrastructure', 'Network fetch failure — transient'),
    ('network error', 'infrastructure', 'Generic network error'),
    ('Could not resolve host', 'infrastructure', 'DNS resolution failure');
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
"lint": "echo 'Linting not configured yet' && exit 0",
"format:check": "echo 'Format checking not configured yet' && exit 0",
"test": "node test-data-display.js",
"test:data-display": "node test-data-display.js"
"test:data-display": "node test-data-display.js",
"test:flakiness": "python -m unittest discover -s scripts/flakiness/tests -p \"test_*.py\" -v"
},
"keywords": [
"github",
Expand Down
Loading
Loading