First iteration of adding benchmarks to guidance.
Showing 13 changed files with 1,912 additions and 3 deletions.
@@ -0,0 +1,30 @@
```python
"""Elementary benchmarking for `guidance` development purposes.

`guidance` lives in a fast-paced LLM environment, has complex dependencies, and is tricky to implement.
These benchmarks are designed to focus on key use cases where regressions can create havoc.

General guidelines:
- Simplicity first, then customization - reproducibility by the community is encouraged.
- Everything takes forever - allow a pathway to scale horizontally.
- Goalposts shift - some of the benchmarking code will change frequently, and that's okay.

Implementation:
The `bench` function is provided for no-frills benchmarking intended for automated testing.

For customization, we provide a notebook demonstrating how to run custom benchmarks
that nearly mirror what the `bench` function provides.

Not implemented yet, but we intend to provide an avenue for running the benchmarks via
Docker containers with GPU resources, so they can scale horizontally.
"""

from guidance.bench._powerlift import (
    retrieve_langchain,
    langchain_chat_extract_runner,
    langchain_chat_extract_filter_template,
)
from guidance.bench._api import bench

# TODO(nopdive): Enable docker containers to execute benchmarking easily
```
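This first file appears to be the package `__init__` (judging by the re-export paths; the filename itself is not shown in the diff), so the names above become importable directly from `guidance.bench`. A minimal import sketch follows; the one-line comments are guesses from the names, not descriptions from the source:

```python
# Sketch of consumer-side imports, assuming the re-exports shown above.
from guidance.bench import (
    bench,                                   # user-facing benchmark entry point (defined below)
    retrieve_langchain,                      # presumably retrieves the LangChain-derived dataset
    langchain_chat_extract_runner,           # presumably runs the chat-extraction trials
    langchain_chat_extract_filter_template,  # presumably a filter/template helper for that task
)
```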
@@ -0,0 +1,49 @@
```python
"""User facing API for benchmarking."""

from typing import List, Tuple, Union
from pathlib import Path

from guidance.bench._utils import lib_bench_dir

# Available models to run benchmarks against.
AVAILABLE_MODELS = [
    "guidance-mistral-7b-instruct",
    "base-mistral-7b-instruct",
    "guidance-phi-3-mini-4k-instruct",
    "base-phi-3-mini-4k-instruct",
    "guidance-llama2-7b-32k-instruct",
    "base-llama2-7b-32k-instruct",
]


def bench(
    db_url: str,
    experiment_name: str,
    models: List[str] = AVAILABLE_MODELS,
    force_recreate: bool = False,
    timeout: int = 3600,
    cache_dir: Union[str, Path] = lib_bench_dir() / "cache",
    debug_mode: bool = False,
) -> Tuple[object, object]:
    """Benchmarks guidance against preset tasks.

    This runs on a single machine, one trial at a time.
    To run this the first time, you will need API_LANGCHAIN_KEY set as an environment variable.

    Args:
        db_url (str): Database connection string.
        experiment_name (str): Name of the experiment to create / run.
        models (List[str], optional): Models to benchmark. Defaults to AVAILABLE_MODELS.
        force_recreate (bool, optional): Recreate the database before benchmarking. Defaults to False.
        timeout (int, optional): Max execution time per trial, in seconds. Defaults to 3600.
        cache_dir (Union[str, Path], optional): Cache for storing external datasets. Defaults to lib_bench_dir() / "cache".
        debug_mode (bool, optional): Set this when you require a debugger to step line by line in the trial_runner. Defaults to False.

    Returns:
        Tuple[object, object]: (status, results) data frames, where status relates to trials and results are wide-form aggregates per model.
    """
    # Imported lazily so that importing this module stays cheap.
    from guidance.bench._powerlift import bench as inner_bench

    status_df, result_df = inner_bench(
        db_url, experiment_name, models, force_recreate, timeout, cache_dir, debug_mode
    )
    return status_df, result_df
```
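For context, here is a minimal usage sketch of the `bench` function defined above. The SQLite URL, experiment name, and model subset are illustrative assumptions, not values from this diff (Powerlift-backed tooling typically accepts a SQLAlchemy-style connection string, but the exact format is not shown here); only the keyword names come from the signature above:

```python
from guidance.bench import bench

# Assumed: a SQLAlchemy-style connection string; "bench.db" and the
# experiment name are hypothetical placeholders.
status_df, result_df = bench(
    db_url="sqlite:///bench.db",
    experiment_name="guidance-vs-base",
    models=[
        "guidance-mistral-7b-instruct",  # subset of AVAILABLE_MODELS
        "base-mistral-7b-instruct",
    ],
    timeout=3600,  # one hour per trial (the default)
)

print(status_df)  # per-trial status
print(result_df)  # wide-form aggregates per model
```

Running only a matched guidance/base pair, as sketched here, keeps a first run short while still producing a like-for-like comparison in `result_df`.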