# Captured from the GitHub Actions web UI ("Workflow file for this run"):
# Skip to content
# docs: use experiment-action label (#26) #77
# docs: use experiment-action label (#26)
# docs: use experiment-action label (#26) #77
# Workflow file for this run

---
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# Default token scope: read-only. Jobs that need more (e.g. e2e-langfuse)
# widen permissions locally.
permissions:
  contents: read

# One in-flight run per ref; a newer push cancels the older run.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

env:
  # Quoted so YAML keeps them as strings (version-looking scalars).
  NODE_VERSION: "24"
  PYTHON_VERSION: "3.14"
jobs:
  lint-typecheck-test:
    name: Lint, typecheck, unit tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - uses: pnpm/action-setup@903f9c1a6ebcba6cf41d87230be49611ac97822e # v6.0.3
        with:
          run_install: false
      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: pnpm
      - run: pnpm install --frozen-lockfile
      - run: pnpm run format:check
      - run: pnpm run lint
      - run: pnpm run typecheck
      - run: pnpm run test

  check-dist:
    name: Verify dist/ is up to date
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - uses: pnpm/action-setup@903f9c1a6ebcba6cf41d87230be49611ac97822e # v6.0.3
        with:
          run_install: false
      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: pnpm
      - run: pnpm install --frozen-lockfile
      - name: Rebuild bundle
        run: pnpm run build
      - name: Fail if dist/ differs from committed bundle
        run: |
          if ! git diff --exit-code -- dist; then
            echo "::error::dist/ is out of date — run 'pnpm build' and commit the result."
            exit 1
          fi

  check-schema:
    name: Verify schemas/ is up to date
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - uses: pnpm/action-setup@903f9c1a6ebcba6cf41d87230be49611ac97822e # v6.0.3
        with:
          run_install: false
      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: pnpm
      - run: pnpm install --frozen-lockfile
      - name: Regenerate JSON Schema
        run: pnpm run check:schema

  e2e-langfuse:
    name: E2E — real Langfuse server
    runs-on: ubuntu-latest
    needs: [lint-typecheck-test, check-dist, check-schema]
    permissions:
      contents: read
      # Let the action post experiment result comments when the workflow runs
      # on a pull_request event.
      pull-requests: write
      # Lets the action resolve the current job URL via the REST API so
      # "View run" links to the specific job rather than the workflow run.
      actions: read
    env:
      LANGFUSE_BASE_URL: http://localhost:3000
      LANGFUSE_PUBLIC_KEY: pk-lf-1234567890
      LANGFUSE_SECRET_KEY: sk-lf-1234567890
      E2E_DATASET_NAME: experiment-action-e2e-${{ github.run_id }}
      # "true" on pull_request, "false" elsewhere (push to main, schedule).
      COMMENT_ON_PR: ${{ github.event_name == 'pull_request' && 'true' || 'false' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - uses: pnpm/action-setup@903f9c1a6ebcba6cf41d87230be49611ac97822e # v6.0.3
        with:
          run_install: false
      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: pnpm
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - run: pnpm install --frozen-lockfile
      - name: Start Langfuse server
        run: pnpm run dev:up
      - name: Wait for Langfuse to be healthy
        run: pnpm run dev:wait
      - name: Install Python SDK for dataset setup
        run: python -m pip install --disable-pip-version-check --quiet langfuse
      - name: Create E2E dataset
        env:
          DATASET_NAME: ${{ env.E2E_DATASET_NAME }}
        run: |
          python <<'PY'
          from langfuse import Langfuse
          import os

          client = Langfuse(
              public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
              secret_key=os.environ["LANGFUSE_SECRET_KEY"],
              host=os.environ["LANGFUSE_BASE_URL"],
          )
          dataset_name = os.environ["DATASET_NAME"]
          client.create_dataset(
              name=dataset_name,
              description="experiment-action e2e dataset",
          )
          for item in [
              {"input": "hello", "expected_output": "HELLO"},
              {"input": "world", "expected_output": "WORLD"},
              {"input": "langfuse", "expected_output": "LANGFUSE"},
          ]:
              client.create_dataset_item(
                  dataset_name=dataset_name,
                  input=item["input"],
                  expected_output=item["expected_output"],
              )
          client.flush()
          PY
      # --- Scenario 1: single Python script ------------------------------
      - name: Run action — single Python experiment
        id: py
        uses: ./
        with:
          experiment_path: tests/fixtures/e2e/experiment.py
          langfuse_public_key: ${{ env.LANGFUSE_PUBLIC_KEY }}
          langfuse_secret_key: ${{ env.LANGFUSE_SECRET_KEY }}
          langfuse_base_url: ${{ env.LANGFUSE_BASE_URL }}
          dataset_name: ${{ env.E2E_DATASET_NAME }}
          github_token: ${{ github.token }}
          should_comment_on_pr: ${{ env.COMMENT_ON_PR }}
      - name: Assert single-python passed
        env:
          FAILED: ${{ steps.py.outputs.failed }}
          RESULT_JSON: ${{ steps.py.outputs.result_json }}
        run: |
          test "$FAILED" = "false"
          node --import tsx scripts/assert-result-shape.ts
          echo "$RESULT_JSON" | jq -e '.schema_version == "v1"'
          echo "$RESULT_JSON" | jq -e '.results | length == 1'
          echo "$RESULT_JSON" | jq -e '.results[0].runtime == "python"'
          echo "$RESULT_JSON" | jq -e '.results[0].status == "passed"'
          echo "$RESULT_JSON" | jq -e '.results[0].experiment_result.run_evaluations[0].name == "avg_accuracy"'
          echo "$RESULT_JSON" | jq -e '.results[0].experiment_result.run_evaluations[0].value == 1'
          echo "$RESULT_JSON" | jq -e '
            .results[0].experiment_result.item_results
            | map(.expected_output)
            | sort
            | . == ["HELLO", "LANGFUSE", "WORLD"]
          '
      # --- Scenario 2: single TypeScript script --------------------------
      - name: Run action — single TypeScript experiment
        id: ts
        uses: ./
        with:
          experiment_path: tests/fixtures/e2e/experiment.ts
          langfuse_public_key: ${{ env.LANGFUSE_PUBLIC_KEY }}
          langfuse_secret_key: ${{ env.LANGFUSE_SECRET_KEY }}
          langfuse_base_url: ${{ env.LANGFUSE_BASE_URL }}
          dataset_name: ${{ env.E2E_DATASET_NAME }}
          github_token: ${{ github.token }}
          should_comment_on_pr: ${{ env.COMMENT_ON_PR }}
      - name: Assert single-typescript passed
        env:
          FAILED: ${{ steps.ts.outputs.failed }}
          RESULT_JSON: ${{ steps.ts.outputs.result_json }}
        run: |
          test "$FAILED" = "false"
          node --import tsx scripts/assert-result-shape.ts
          echo "$RESULT_JSON" | jq -e '.results | length == 1'
          echo "$RESULT_JSON" | jq -e '.results[0].runtime == "node"'
          echo "$RESULT_JSON" | jq -e '.results[0].status == "passed"'
          echo "$RESULT_JSON" | jq -e '.results[0].experiment_result.run_evaluations[0].name == "avg_accuracy"'
          echo "$RESULT_JSON" | jq -e '.results[0].experiment_result.run_evaluations[0].value == 1'
          echo "$RESULT_JSON" | jq -e '
            .results[0].experiment_result.item_results
            | map(.expected_output)
            | sort
            | . == ["HELLO", "LANGFUSE", "WORLD"]
          '
      # --- Scenario 3: mixed-runtime directory ---------------------------
      - name: Run action — mixed Python+TS directory
        id: mixed
        uses: ./
        with:
          experiment_path: tests/fixtures/e2e/mixed
          langfuse_public_key: ${{ env.LANGFUSE_PUBLIC_KEY }}
          langfuse_secret_key: ${{ env.LANGFUSE_SECRET_KEY }}
          langfuse_base_url: ${{ env.LANGFUSE_BASE_URL }}
          dataset_name: ${{ env.E2E_DATASET_NAME }}
          github_token: ${{ github.token }}
          should_comment_on_pr: ${{ env.COMMENT_ON_PR }}
      - name: Assert mixed directory ran both scripts
        env:
          FAILED: ${{ steps.mixed.outputs.failed }}
          RESULT_JSON: ${{ steps.mixed.outputs.result_json }}
        run: |
          test "$FAILED" = "false"
          node --import tsx scripts/assert-result-shape.ts
          echo "$RESULT_JSON" | jq -e '.results | length == 2'
          echo "$RESULT_JSON" | jq -e '.results | map(.runtime) | sort | . == ["node", "python"]'
      # --- Scenario 4: regression path (non-fatal) -----------------------
      - name: Run action — regression fixture (should_fail_on_regression=false)
        id: regression
        uses: ./
        with:
          experiment_path: tests/fixtures/e2e/regression/experiment.py
          langfuse_public_key: ${{ env.LANGFUSE_PUBLIC_KEY }}
          langfuse_secret_key: ${{ env.LANGFUSE_SECRET_KEY }}
          langfuse_base_url: ${{ env.LANGFUSE_BASE_URL }}
          dataset_name: ${{ env.E2E_DATASET_NAME }}
          github_token: ${{ github.token }}
          should_comment_on_pr: ${{ env.COMMENT_ON_PR }}
          should_fail_on_regression: "false"
      - name: Assert regression was captured (without failing the job)
        env:
          FAILED: ${{ steps.regression.outputs.failed }}
          RESULT_JSON: ${{ steps.regression.outputs.result_json }}
        run: |
          test "$FAILED" = "true"
          node --import tsx scripts/assert-result-shape.ts
          echo "$RESULT_JSON" | jq -e '.results[0].status == "regression"'
          echo "$RESULT_JSON" | jq -e '.results[0].error.name == "RegressionError"'
          echo "$RESULT_JSON" | jq -e '.results[0].error.is_regression == true'
      - name: Tear down Langfuse
        if: always()
        run: pnpm run dev:down

  # Single required-status-check target for branch protection. Gets the
  # green check if every upstream job passed (or was intentionally skipped).
  all-tests-passed:
    name: All tests passed
    runs-on: ubuntu-latest
    needs: [lint-typecheck-test, check-dist, check-schema, e2e-langfuse]
    if: always()
    steps:
      - name: Succeed when no upstream failed
        if: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
        run: exit 0
      - name: Fail when any upstream failed
        if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
        run: exit 1