diff --git a/.github/actions/setup-environment/action.yml b/.github/actions/setup-environment/action.yml new file mode 100644 index 0000000..e22d51d --- /dev/null +++ b/.github/actions/setup-environment/action.yml @@ -0,0 +1,17 @@ +name: Setup Environment +description: Setup Python, uv, and install dev dependencies + +runs: + using: composite + steps: + - name: Setup uv + uses: astral-sh/setup-uv@v3 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Install dependencies + shell: bash + run: uv pip install --system -e ".[dev]" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 80d3aa3..c48eecb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,29 +6,34 @@ on: branches: [main] jobs: - pre-commit: + lint: runs-on: ubuntu-latest + env: + UV_NO_SYNC: 1 steps: - name: Checkout code uses: actions/checkout@v4 - - name: Setup uv - uses: astral-sh/setup-uv@v3 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.13" - - - name: Install dependencies - run: uv pip install --system -e ".[dev]" + - name: Setup environment + uses: ./.github/actions/setup-environment - name: Install pre-commit run: uv pip install --system pre-commit - - name: Run pre-commit + - name: Run linting run: SKIP=mypy pre-commit run --all-files + typecheck: + runs-on: ubuntu-latest + env: + UV_NO_SYNC: 1 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup environment + uses: ./.github/actions/setup-environment + - name: Run mypy run: uv run mypy src/ tests/ --exclude 'tests/benchmarks/appworld/' @@ -36,21 +41,14 @@ jobs: runs-on: ubuntu-latest env: CI: 1 + UV_NO_SYNC: 1 steps: - name: Checkout code uses: actions/checkout@v4 - - name: Setup uv - uses: astral-sh/setup-uv@v3 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.13" - - - name: Install dependencies - run: uv pip install --system -e ".[dev]" + - name: Setup environment + uses: ./.github/actions/setup-environment - name: Run unit tests run: uv run pytest tests/unit/ -v -p no:warnings diff --git a/docs/evals.md b/docs/evals.md index e840c07..db5a4e9 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -87,6 +87,21 @@ Validate existing logs without running new tests: .venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --validate-only --log-dir outputs/experiment1/raw ``` +### Parallel Execution + +Run tests in parallel using multiple workers: + +```bash +# Run with 4 workers +.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n 4 + +# Run with 8 workers +.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 8 + +# Auto-detect number of CPUs +.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n auto +``` + ## Further Reading ### BFCL diff --git a/pyproject.toml b/pyproject.toml index af17802..f32ecd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,8 @@ where = ["src"] dev = [ "pytest>=7.0", "pytest-asyncio>=0.21", + "pytest-xdist>=3.0", + "pytest-timeout>=2.0", "ruff", "mypy", ] @@ -86,6 +88,8 @@ filterwarnings = [ "ignore::sqlalchemy.exc.SADeprecationWarning:appworld.apps.lib.models.db", "ignore::pydantic.warnings.PydanticDeprecatedSince20:appworld.apps.*", ] +timeout = 0 +timeout_method = "thread" # Use thread-based timeout for better async compatibility [tool.mypy] warn_return_any = true diff --git a/tests/benchmarks/appworld/README.md b/tests/benchmarks/appworld/README.md index a85daae..2871f21 100644 --- a/tests/benchmarks/appworld/README.md +++ b/tests/benchmarks/appworld/README.md @@ -54,6 +54,15 @@ pytest tests/benchmarks/appworld/test_appworld.py --validate-only --temperature 0.001 # Temperature for sampling (default: 0.001) ``` +### Parallel Execution +```bash +-n 4 # Run with 4 workers +-n 8 # Run with 8 workers +-n auto # Auto-detect number of CPUs +``` + +Example: `pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 4` + ## File Structure ``` diff --git a/tests/benchmarks/appworld/test_appworld.py b/tests/benchmarks/appworld/test_appworld.py index d9de6e8..85d697a 100644 --- a/tests/benchmarks/appworld/test_appworld.py +++ b/tests/benchmarks/appworld/test_appworld.py @@ -50,6 +50,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @pytest.mark.asyncio +@pytest.mark.timeout(300) async def test_appworld( task_id: str, model: str, diff --git a/tests/conftest.py b/tests/conftest.py index b683612..1eafd22 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,7 +50,6 @@ def pytest_addoption(parser: pytest.Parser) -> None: parser.addoption("--output-dir", default="outputs", help="Output directory for results") parser.addoption("--validate-only", action="store_true", help="Only validate existing logs") parser.addoption("--log-dir", default="outputs/raw", help="Directory with logs (for validate mode)") - parser.addoption("--max-workers", default=4, type=int, help="Max concurrent tests (default: 4)") def pytest_configure(config: pytest.Config) -> None: