feat(forge/llm): Add LlamafileProvider #7091
base: master
@@ -0,0 +1,3 @@ (new file)

```
*.llamafile
*.llamafile.exe
llamafile.exe
```
`autogpt/scripts/llamafile/serve.py` (new file, @@ -0,0 +1,160 @@):

```python
#!/usr/bin/env python3
"""
Use llamafile to serve a (quantized) mistral-7b-instruct-v0.2 model

Usage:
  cd <repo-root>/autogpt
  ./scripts/llamafile/serve.py
"""

import os
import platform
import subprocess
from pathlib import Path
from typing import Optional

import click

LLAMAFILE = Path("mistral-7b-instruct-v0.2.Q5_K_M.llamafile")
LLAMAFILE_URL = f"https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/{LLAMAFILE.name}"  # noqa
LLAMAFILE_EXE = Path("llamafile.exe")
LLAMAFILE_EXE_URL = "https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6"  # noqa


@click.command()
@click.option(
    "--llamafile",
    type=click.Path(dir_okay=False),
    help=f"Name of the llamafile to serve. Default: {LLAMAFILE.name}",
)
@click.option("--llamafile_url", help="Download URL for the llamafile you want to use")
@click.option(
    "--host", help="Specify the address for the llamafile server to listen on"
)
@click.option(
    "--port", type=int, help="Specify the port for the llamafile server to listen on"
)
@click.option(
    "--use-gpu", is_flag=True, help="Use an AMD or Nvidia GPU to speed up inference"
)
def main(
    llamafile: Optional[Path] = None,
    llamafile_url: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    use_gpu: bool = False,
):
    if not llamafile:
        if not llamafile_url:
            llamafile = LLAMAFILE
        else:
            llamafile = Path(llamafile_url.rsplit("/", 1)[1])
            if llamafile.suffix != ".llamafile":
                click.echo(
                    click.style(
                        "The given URL does not end with '.llamafile' -> "
                        "can't get filename from URL. "
                        "Specify the filename using --llamafile.",
                        fg="red",
                    ),
                    err=True,
                )
                return

    if llamafile == LLAMAFILE and not llamafile_url:
        llamafile_url = LLAMAFILE_URL
    elif llamafile_url != LLAMAFILE_URL:
        if not click.prompt(
            click.style(
                "You seem to have specified a different URL for the default model "
                f"({llamafile.name}). Are you sure this is correct? "
                "If you want to use a different model, also specify --llamafile.",
                fg="yellow",
            ),
            type=bool,
        ):
            return

    # Go to autogpt/scripts/llamafile/
    os.chdir(Path(__file__).resolve().parent)

    on_windows = platform.system() == "Windows"

    if not llamafile.is_file():
        if not llamafile_url:
            click.echo(
                click.style(
                    "Please use --llamafile_url to specify a download URL for "
                    f"'{llamafile.name}'. "
                    "This will only be necessary once, so we can download the model.",
                    fg="red",
                ),
                err=True,
            )
            return

        download_file(llamafile_url, llamafile)

        if not on_windows:
            llamafile.chmod(0o755)
            subprocess.run([llamafile, "--version"], check=True)

    if not on_windows:
        base_command = [f"./{llamafile}"]
    else:
        # Windows does not allow executables over 4GB, so we have to download a
        # model-less llamafile.exe and run that instead.
        if not LLAMAFILE_EXE.is_file():
            download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE)
            LLAMAFILE_EXE.chmod(0o755)
            subprocess.run([f".\\{LLAMAFILE_EXE}", "--version"], check=True)

        base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile]

    if host:
        base_command.extend(["--host", host])
    if port:
        base_command.extend(["--port", str(port)])
    if use_gpu:
        base_command.extend(["-ngl", "9999"])

    subprocess.run(
        [
            *base_command,
            "--server",
            "--nobrowser",
            "--ctx-size",
            "0",
            "--n-predict",
            "1024",
        ],
        check=True,
    )

    # note: --ctx-size 0 means the prompt context size will be set directly from the
    # underlying model configuration. This may cause slow response times or consume
    # a lot of memory.
```

Review comments on this part of the script:

> On the `--use-gpu` help text: nit: it works on Apple Silicon (ARM64) as well.

> On the confirmation prompt for a non-default URL:
> * "`llamafile.name` doesn't exist"
> * "Passing only […]"
> * "Doesn't support models larger than 50GB" ([about quantization formats](https://huggingface.co/Mozilla/Mixtral-8x22B-Instruct-v0.1-llamafile#about-quantization-formats-general-advice))
> * Author: "That's how I intended it, why don't you pass something with a […]"

> On the `llamafile.is_file()` check: "Running with […]"

> On `--ctx-size 0` (lines +126 to +127): I think context size should be parametrizable; it has impact on performance so it's important to have a way of limiting it.
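Responding to the context-size review comment, here is a minimal standalone sketch of how the option could be exposed. The option name, default, and the `./model.llamafile` path are assumptions for illustration, not part of the PR:

```python
import subprocess

import click


@click.command()
@click.option(
    "--ctx-size",
    type=int,
    default=0,
    help="Prompt context size; 0 takes the size from the model configuration",
)
def serve(ctx_size: int):
    # Pass the user-chosen context size through to the llamafile server
    # instead of hardcoding "--ctx-size 0". "./model.llamafile" is a placeholder.
    subprocess.run(
        ["./model.llamafile", "--server", "--nobrowser", "--ctx-size", str(ctx_size)],
        check=True,
    )


if __name__ == "__main__":
    serve()
```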
The rest of the script:

```python
def download_file(url: str, to_file: Path) -> None:
    print(f"Downloading {to_file.name}...")
    import urllib.request

    urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress)
    print()


def report_download_progress(chunk_number: int, chunk_size: int, total_size: int):
    if total_size != -1:
        downloaded_size = chunk_number * chunk_size
        percent = min(1, downloaded_size / total_size)
        bar = "#" * int(40 * percent)
        print(
            f"\rDownloading: [{bar:<40}] {percent:.0%}"
            f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB",
            end="",
        )


if __name__ == "__main__":
    main()
```
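Example invocations of the script; the custom download URL below is a placeholder:

```shell
# Serve the default model (downloads it on first run):
./scripts/llamafile/serve.py

# Serve on a specific address and port, with GPU offload:
./scripts/llamafile/serve.py --host 0.0.0.0 --port 8081 --use-gpu

# Serve a different model from a download URL (placeholder URL):
./scripts/llamafile/serve.py --llamafile_url https://example.com/my-model.llamafile
```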
Documentation (@@ -190,3 +190,66 @@, following "If you don't know which to choose, you can safely go with OpenAI*."):
[groq/api-keys]: https://console.groq.com/keys
[groq/models]: https://console.groq.com/docs/models

### Llamafile

With llamafile you can run models locally, which means there is no billing to set up
and data privacy is guaranteed.

For more information and in-depth documentation, check out the [llamafile documentation].

!!! warning
    At the moment, llamafile only serves one model at a time. This means you cannot
    set `SMART_LLM` and `FAST_LLM` to two different llamafile models.

!!! warning
    Due to the issues linked below, llamafiles don't work on WSL. To use a llamafile
    with AutoGPT in WSL, you will have to run the llamafile in Windows (outside WSL).

<details>
<summary>Instructions</summary>

1. Get the `llamafile/serve.py` script through one of these two ways:
    1. Clone the AutoGPT repo somewhere in your Windows environment,
       with the script located at `autogpt/scripts/llamafile/serve.py`
    2. Download just the [serve.py] script somewhere in your Windows environment
2. Make sure you have `click` installed: `pip install click`
3. Run `ip route | grep default | awk '{print $3}'` *inside WSL* to get the address
   of the WSL host machine
4. Run `python3 serve.py --host {WSL_HOST_ADDR}`, where `{WSL_HOST_ADDR}`
   is the address you found in step 3.
   If port 8080 is taken, also specify a different port using `--port {PORT}`.
5. In WSL, set `LLAMAFILE_API_BASE=http://{WSL_HOST_ADDR}:8080/v1` in your `.env`.
6. Follow the rest of the regular instructions below.

[serve.py]: https://github.com/Significant-Gravitas/AutoGPT/blob/master/autogpt/scripts/llamafile/serve.py
</details>

The aforementioned llamafile/WSL issues:

* [Mozilla-Ocho/llamafile#356](https://github.com/Mozilla-Ocho/llamafile/issues/356)
* [Mozilla-Ocho/llamafile#100](https://github.com/Mozilla-Ocho/llamafile/issues/100)

!!! note
    These instructions will download and use `mistral-7b-instruct-v0.2.Q5_K_M.llamafile`.
    `mistral-7b-instruct-v0.2` is currently the only tested and supported model.
    If you want to try other models, you'll have to add them to `LlamafileModelName` in
    [`llamafile.py`][forge/llamafile.py].
    For optimal results, you may also have to add some logic to adapt the message format,
    like `LlamafileProvider._adapt_chat_messages_for_mistral_instruct(..)` does.
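
For illustration, here is a rough sketch of what such message-format adaptation can look like. This is not the PR's actual implementation; the helper name and merging rules below are assumptions. The general idea is that Mistral's instruct template only accepts alternating user/assistant turns, so system messages have to be folded into a user message:

```python
# Hypothetical sketch, not the actual LlamafileProvider code:
# fold system messages into the first user message, because the
# mistral-instruct chat template only supports user/assistant roles.
def adapt_messages_for_mistral_instruct(messages: list[dict]) -> list[dict]:
    system_parts = [m["content"] for m in messages if m["role"] == "system"]
    adapted = [m for m in messages if m["role"] != "system"]
    if system_parts and adapted and adapted[0]["role"] == "user":
        merged = "\n\n".join(system_parts + [adapted[0]["content"]])
        adapted[0] = {"role": "user", "content": merged}
    return adapted
```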

1. Run the llamafile serve script:
   ```shell
   python3 ./scripts/llamafile/serve.py
   ```
   The first time this is run, it will download a file containing the model + runtime,
   which may take a while and a few gigabytes of disk space.

   To force GPU acceleration, add `--use-gpu` to the command.

> Review comment: This sounds like it'll attempt to use GPU but use CPU if not possible.

3. In `.env`, set `SMART_LLM`, `FAST_LLM`, or both to `mistral-7b-instruct-v0.2`.

4. If the server is running on a different address than `http://localhost:8080/v1`,
   set `LLAMAFILE_API_BASE` in `.env` to the right base URL.

[llamafile documentation]: https://github.com/Mozilla-Ocho/llamafile#readme
[forge/llamafile.py]: https://github.com/Significant-Gravitas/AutoGPT/blob/master/forge/forge/llm/providers/llamafile/llamafile.py
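
Putting the configuration together, a typical `.env` for this setup might contain the following; the values assume the default model and server address:

```shell
SMART_LLM=mistral-7b-instruct-v0.2
FAST_LLM=mistral-7b-instruct-v0.2
LLAMAFILE_API_BASE=http://localhost:8080/v1
```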

@@ -0,0 +1,36 @@ (new file)

# Llamafile Integration Notes

Tested with:

* Python 3.11
* Apple M2 Pro (32 GB), macOS 14.2.1
* quantized mistral-7b-instruct-v0.2

## Setup

Download a `mistral-7b-instruct-v0.2` llamafile:

```shell
wget -nc https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile
chmod +x mistral-7b-instruct-v0.2.Q5_K_M.llamafile
./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --version
```

Run the llamafile server:

```shell
LLAMAFILE="./mistral-7b-instruct-v0.2.Q5_K_M.llamafile"

"${LLAMAFILE}" \
--server \
--nobrowser \
--ctx-size 0 \
--n-predict 1024

# note: ctx-size=0 means the prompt context size will be set directly from the
# underlying model configuration. This may cause slow response times or consume
# a lot of memory.
```

## TODOs

* `SMART_LLM`/`FAST_LLM` configuration: Currently, the llamafile server only serves one model at a time. However, there's no reason you can't start multiple llamafile servers on different ports. To support using different models for `smart_llm` and `fast_llm`, you could implement config vars like `LLAMAFILE_SMART_LLM_URL` and `LLAMAFILE_FAST_LLM_URL` that point to different llamafile servers (one serving a 'big model' and one serving a 'fast model'); a sketch of such a setup follows this list.
* Authorization: the `serve.py` script does not set up any authorization for the llamafile server; this can be turned on by adding the `--api-key <some-key>` argument to the server startup command. However, I haven't tested whether the integration with AutoGPT works when this feature is turned on.
* Test with other models
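
A rough sketch of the two-server idea, assuming the hypothetical `LLAMAFILE_SMART_LLM_URL`/`LLAMAFILE_FAST_LLM_URL` config vars existed; the 'big model' filename below is a placeholder:

```shell
# Hypothetical: serve a 'smart' and a 'fast' model on different ports.
./mixtral-8x7b-instruct.llamafile --server --nobrowser --port 8080 &
./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --server --nobrowser --port 8081 &

# Each role would then point at its own server (hypothetical config vars):
# LLAMAFILE_SMART_LLM_URL=http://localhost:8080/v1
# LLAMAFILE_FAST_LLM_URL=http://localhost:8081/v1

# Authorization (untested with AutoGPT): add --api-key <some-key> to each command.
```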

@@ -0,0 +1,17 @@ (new file: the llamafile provider package's `__init__.py`)

```python
from .llamafile import (
    LLAMAFILE_CHAT_MODELS,
    LLAMAFILE_EMBEDDING_MODELS,
    LlamafileCredentials,
    LlamafileModelName,
    LlamafileProvider,
    LlamafileSettings,
)

__all__ = [
    "LLAMAFILE_CHAT_MODELS",
    "LLAMAFILE_EMBEDDING_MODELS",
    "LlamafileCredentials",
    "LlamafileModelName",
    "LlamafileProvider",
    "LlamafileSettings",
]
```
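
These re-exports make the provider importable from the package root. A hypothetical usage, with the import path inferred from the [forge/llamafile.py] docs link above:

```python
# Hypothetical import path, inferred from the repository layout:
from forge.llm.providers.llamafile import LlamafileModelName, LlamafileProvider
```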

> Review comment: Env var not added to `options.md`.