First iteration of adding benchmarks to guidance.
Showing 13 changed files with 1,912 additions and 3 deletions.
@@ -0,0 +1,30 @@
```python
"""Elementary benchmarking for `guidance` development purposes.

`guidance` lives in a fast-paced LLM environment, has complex dependencies, and is tricky to implement.
These benchmarks are designed to focus on key use cases where regressions can create havoc.

General guidelines:
- Simplicity first, then customization - reproducibility by the community is encouraged.
- Everything takes forever - allow a pathway to scale horizontally.
- Goalposts shift - some of the benchmarking code will change frequently, and that's okay.

Implementation:
The `bench` function is provided for no-frills benchmarking intended for automated testing.

For customization, we provide a notebook demonstrating how to run custom benchmarks
that nearly mirror what the `bench` function provides.

Not implemented yet, but we intend to provide an avenue for running the benchmarks via
Docker containers with GPU resources, so they can scale horizontally.
"""

from guidance.bench._powerlift import (
    retrieve_langchain,
    langchain_chat_extract_runner,
    langchain_chat_extract_filter_template,
)
from guidance.bench._api import bench

# TODO(nopdive): Enable docker containers to execute benchmarking easily
```
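This first file appears to be the package `__init__` (judging by the re-export paths; the filename itself is not shown in the diff), so the names above become importable directly from `guidance.bench`. A minimal import sketch follows; the one-line comments are guesses from the names, not descriptions from the source:

```python
# Sketch of consumer-side imports, assuming the re-exports shown above.
from guidance.bench import (
    bench,                                   # user-facing benchmark entry point (defined below)
    retrieve_langchain,                      # presumably retrieves the LangChain-derived dataset
    langchain_chat_extract_runner,           # presumably runs the chat-extraction trials
    langchain_chat_extract_filter_template,  # presumably a filter/template helper for that task
)
```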
@@ -0,0 +1,49 @@
```python
"""User facing API for benchmarking."""

from typing import List, Tuple, Union
from pathlib import Path

from guidance.bench._utils import lib_bench_dir

# Available models to run benchmarks against.
AVAILABLE_MODELS = [
    "guidance-mistral-7b-instruct",
    "base-mistral-7b-instruct",
    "guidance-phi-3-mini-4k-instruct",
    "base-phi-3-mini-4k-instruct",
    "guidance-llama2-7b-32k-instruct",
    "base-llama2-7b-32k-instruct",
]


def bench(
    db_url: str,
    experiment_name: str,
    models: List[str] = AVAILABLE_MODELS,
    force_recreate: bool = False,
    timeout: int = 3600,
    cache_dir: Union[str, Path] = lib_bench_dir() / "cache",
    debug_mode: bool = False,
) -> Tuple[object, object]:
    """Benchmarks guidance against preset tasks.

    This runs on a single machine, one trial at a time.
    To run this the first time, you will need API_LANGCHAIN_KEY set as an environment variable.

    Args:
        db_url (str): Database connection string.
        experiment_name (str): Name of the experiment to create / run.
        models (List[str], optional): Models to benchmark. Defaults to AVAILABLE_MODELS.
        force_recreate (bool, optional): Recreate the database before benchmarking. Defaults to False.
        timeout (int, optional): Max execution time per trial, in seconds. Defaults to 3600.
        cache_dir (Union[str, Path], optional): Cache for storing external datasets. Defaults to lib_bench_dir() / "cache".
        debug_mode (bool, optional): Set this when you require a debugger to step line by line in the trial_runner. Defaults to False.

    Returns:
        Tuple[object, object]: (status, results) data frames, where status relates to trials and results are wide-form aggregates per model.
    """
    # Imported lazily so that importing this module stays cheap.
    from guidance.bench._powerlift import bench as inner_bench

    status_df, result_df = inner_bench(
        db_url, experiment_name, models, force_recreate, timeout, cache_dir, debug_mode
    )
    return status_df, result_df
```
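For context, here is a minimal usage sketch of the `bench` function defined above. The SQLite URL, experiment name, and model subset are illustrative assumptions, not values from this diff (Powerlift-backed tooling typically accepts a SQLAlchemy-style connection string, but the exact format is not shown here); only the keyword names come from the signature above:

```python
from guidance.bench import bench

# Assumed: a SQLAlchemy-style connection string; "bench.db" and the
# experiment name are hypothetical placeholders.
status_df, result_df = bench(
    db_url="sqlite:///bench.db",
    experiment_name="guidance-vs-base",
    models=[
        "guidance-mistral-7b-instruct",  # subset of AVAILABLE_MODELS
        "base-mistral-7b-instruct",
    ],
    timeout=3600,  # one hour per trial (the default)
)

print(status_df)  # per-trial status
print(result_df)  # wide-form aggregates per model
```

Running only a matched guidance/base pair, as sketched here, keeps a first run short while still producing a like-for-like comparison in `result_df`.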