Bench (#843)
First iteration of adding benchmarks to guidance.
nopdive authored May 21, 2024
1 parent c08e830 commit 0f15e4b
Showing 13 changed files with 1,912 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/action_gpu_unit_tests.yml
@@ -48,14 +48,15 @@ jobs:
     run: |
       python -m pip install --upgrade pip
       pip install pytest
-      pip install -e .[schemas,test]
+      pip install -e .[schemas,test,bench]
       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
   - name: Other dependencies
     run: |
       pip install sentencepiece
   - name: GPU pip installs
     run: |
       pip install accelerate
       pip uninstall -y llama-cpp-python
       CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
   - name: Check GPU available
     run: |
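All three workflow changes in this commit do the same thing: add the new `bench` extra to the editable install. A minimal sketch of how such an extra is typically declared in a setuptools-based `setup.py`; the dependency names below are illustrative assumptions, not taken from this commit:

    # setup.py (sketch) -- the extras that `pip install -e .[schemas,test,bench]`
    # resolves against. Package lists are assumptions for illustration only.
    from setuptools import setup, find_packages

    setup(
        name="guidance",
        packages=find_packages(),
        extras_require={
            "schemas": ["jsonschema"],  # existing extra (contents assumed)
            "test": ["pytest"],         # existing extra (contents assumed)
            "bench": [                  # new extra introduced by this commit
                "powerlift",            # assumed: benchmark harness used by guidance/bench
                "langchain",            # assumed: needed for the langchain tasks
            ],
        },
    )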
3 changes: 2 additions & 1 deletion .github/workflows/action_plain_unit_tests.yml
@@ -33,11 +33,12 @@ jobs:
     run: |
       python -m pip install --upgrade pip
       pip install pytest
-      pip install -e .[schemas,test]
+      pip install -e .[schemas,test,bench]
       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
   - name: Install model-specific dependencies
     run: |
       pip install sentencepiece
       pip uninstall -y llama-cpp-python
       pip install "llama-cpp-python!=0.2.58"
   - name: Run tests (except server)
     shell: bash
2 changes: 1 addition & 1 deletion .github/workflows/action_server_tests.yml
@@ -25,7 +25,7 @@ jobs:
     run: |
       python -m pip install --upgrade pip
       pip install pytest
-      pip install -e .[all,test]
+      pip install -e .[all,test,bench]
       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
   - name: Run server tests
     shell: bash
30 changes: 30 additions & 0 deletions guidance/bench/__init__.py
@@ -0,0 +1,30 @@
"""Elementary benchmarking for `guidance` development purposes.
`guidance` lives in a fast paced LLM environment, has complex dependencies and is tricky to implement.
These benchmarks are designed to focus on key use cases, where regressions can create havoc.
General guidelines:
- Simplicity first, then customization - reproducibility by the community is encouraged
- Everything takes forever - allow a pathway to scale horizontally
- Goalposts shift - some of the code for benchmarking will change frequently and that's okay
Implementation:
The `bench` function is provided for no frills benchmarking that is designated for
automated testing.
For customization, we provide a notebook demonstration of how to run custom benchmarks
that are near mirror versions of what is available in the `bench` function provided.
Not implemented yet, but we intend to provide an avenue of running the benchmarks via
docker containers that have GPU resourcing to scale horizontally.
"""

from guidance.bench._powerlift import (
    retrieve_langchain,
    langchain_chat_extract_runner,
    langchain_chat_extract_filter_template,
)
from guidance.bench._api import bench

# TODO(nopdive): Enable docker containers to execute benchmarking easily
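
The re-exports above define the public surface of `guidance.bench`. A minimal sketch of how callers reach it (the import paths come from this file; the role comments are inferred from the names, not from signatures shown in this commit):

    # Public API re-exported by guidance/bench/__init__.py.
    from guidance.bench import (
        bench,                                    # no-frills benchmark entry point
        retrieve_langchain,                       # inferred: fetches the langchain dataset
        langchain_chat_extract_runner,            # inferred: runs the chat-extract task
        langchain_chat_extract_filter_template,   # inferred: filters / templates results
    )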
49 changes: 49 additions & 0 deletions guidance/bench/_api.py
@@ -0,0 +1,49 @@
"""User facing API for benchmarking."""

from typing import List, Tuple, Union
from pathlib import Path

from guidance.bench._utils import lib_bench_dir

"""Available models to run benchmark against."""
AVAILABLE_MODELS = [
"guidance-mistral-7b-instruct",
"base-mistral-7b-instruct",
"guidance-phi-3-mini-4k-instruct",
"base-phi-3-mini-4k-instruct",
"guidance-llama2-7b-32k-instruct",
"base-llama2-7b-32k-instruct",
]


def bench(
    db_url: str,
    experiment_name: str,
    models: List[str] = AVAILABLE_MODELS,
    force_recreate: bool = False,
    timeout: int = 3600,
    cache_dir: Union[str, Path] = lib_bench_dir() / "cache",
    debug_mode: bool = False,
) -> Tuple[object, object]:
"""Benchmarks guidance against preset tasks.
This runs on a single machine, one trial at a time.
To run this the first time you will need API_LANGCHAIN_KEY set as an environment variable.
Args:
db_url (str): Database connection string.
experiment_name (str): Name of experiment to create / run.
models (List[str], optional): Models to benchmark. Defaults to AVAILABLE_MODELS.
force_recreate (bool, optional): Recreate the database before benchmarking. Defaults to False.
timeout (int, optional): Max execution time per trial. Defaults to 3600.
cache_dir (Union[str, Path], optional): Cache to store external datasets. Defaults to lib_bench_dir() / "cache".
debug_mode (bool): Set this when you require a debugger to step line by line in the trial_runner.
Returns:
Tuple[object, object]: (status, results) data frames where status relates to trials, results are wide form aggregates of each model.
"""
    from guidance.bench._powerlift import bench as inner_bench

    status_df, result_df = inner_bench(
        db_url, experiment_name, models, force_recreate, timeout, cache_dir, debug_mode
    )
    return status_df, result_df
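
For context, a minimal usage sketch of the function above, under stated assumptions: the SQLite connection string, experiment name, and model subset are illustrative rather than part of this commit, and a first run needs the API key environment variable mentioned in the docstring.

    from guidance.bench import bench

    # Assumed: the backing store accepts a SQLAlchemy-style connection string,
    # so a local SQLite file keeps a first run self-contained.
    status_df, result_df = bench(
        db_url="sqlite:///guidance_bench.db",  # hypothetical database
        experiment_name="bench-smoke-test",    # hypothetical experiment name
        models=[
            "guidance-mistral-7b-instruct",    # subset of AVAILABLE_MODELS
            "base-mistral-7b-instruct",
        ],
    )

    # status_df tracks the trials; result_df holds the wide-form aggregates per model.
    print(result_df)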