From 03d8e1e16929db0c076c0804261489d42240777f Mon Sep 17 00:00:00 2001 From: Kate Silverstein Date: Thu, 18 Apr 2024 13:10:37 -0400 Subject: [PATCH 01/40] Add minimal implementation of LlamafileProvider, a new ChatModelProvider for llamafiles. Currently it just extends OpenAIProvider and only overrides methods that are necessary to get the system to work at a basic level. Update ModelProviderName schema and config/configurator so that app startup using this provider is handled correctly. Add 'mistral-7b-instruct-v0' to OpenAIModelName/OPEN_AI_CHAT_MODELS registries. --- autogpts/autogpt/autogpt/app/configurator.py | 86 +++++++++--- autogpts/autogpt/autogpt/app/main.py | 16 ++- autogpts/autogpt/autogpt/config/config.py | 9 ++ .../resource/model_providers/llamafile.py | 125 ++++++++++++++++++ .../core/resource/model_providers/openai.py | 19 +++ .../core/resource/model_providers/schema.py | 1 + autogpts/autogpt/autogpt/llm/api_manager.py | 23 ++++ 7 files changed, 255 insertions(+), 24 deletions(-) create mode 100644 autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py diff --git a/autogpts/autogpt/autogpt/app/configurator.py b/autogpts/autogpt/autogpt/app/configurator.py index cd789002102..7e218a66069 100644 --- a/autogpts/autogpt/autogpt/app/configurator.py +++ b/autogpts/autogpt/autogpt/app/configurator.py @@ -16,6 +16,8 @@ from autogpt.logs.helpers import request_user_double_check from autogpt.memory.vector import get_supported_memory_backends +from autogpt.core.resource.model_providers.schema import ModelProviderName + if TYPE_CHECKING: from autogpt.core.resource.model_providers.openai import OpenAICredentials @@ -95,31 +97,43 @@ def apply_overrides_to_config( if speak: config.tts_config.speak_mode = True - # Set the default LLM models - if gpt3only: - # --gpt3only should always use gpt-3.5-turbo, despite user's FAST_LLM config - config.fast_llm = GPT_3_MODEL - config.smart_llm = GPT_3_MODEL - elif ( - gpt4only - and check_model( - GPT_4_MODEL, - model_type="smart_llm", - api_credentials=config.openai_credentials, - ) - == GPT_4_MODEL - ): - # --gpt4only should always use gpt-4, despite user's SMART_LLM config - config.fast_llm = GPT_4_MODEL - config.smart_llm = GPT_4_MODEL - else: - config.fast_llm = check_model( + if config.llm_provider == ModelProviderName.OPENAI: + # Set the default LLM models + if gpt3only: + # --gpt3only should always use gpt-3.5-turbo, despite user's FAST_LLM config + config.fast_llm = GPT_3_MODEL + config.smart_llm = GPT_3_MODEL + elif ( + gpt4only + and check_model( + GPT_4_MODEL, + model_type="smart_llm", + api_credentials=config.openai_credentials, + ) + == GPT_4_MODEL + ): + # --gpt4only should always use gpt-4, despite user's SMART_LLM config + config.fast_llm = GPT_4_MODEL + config.smart_llm = GPT_4_MODEL + else: + config.fast_llm = check_model( + config.fast_llm, "fast_llm", api_credentials=config.openai_credentials + ) + config.smart_llm = check_model( + config.smart_llm, "smart_llm", api_credentials=config.openai_credentials + ) + + elif config.llm_provider == ModelProviderName.LLAMAFILE: + config.fast_llm = check_model_llamafile( config.fast_llm, "fast_llm", api_credentials=config.openai_credentials ) - config.smart_llm = check_model( + config.smart_llm = check_model_llamafile( config.smart_llm, "smart_llm", api_credentials=config.openai_credentials ) + else: + raise NotImplementedError(f"llm_provider: {config.llm_provider} is not supported.") + if memory_type: supported_memory = get_supported_memory_backends() chosen = memory_type @@ 
-199,3 +213,35 @@ def check_model( f"You don't have access to {model_name}. Setting {model_type} to gpt-3.5-turbo." ) return "gpt-3.5-turbo" + + +def check_model_llamafile( + model_name: str, + model_type: Literal["smart_llm", "fast_llm"], + api_credentials: OpenAICredentials, +) -> str: + """ + Check if model is available for use. If not, raise exception. + """ + api_manager = ApiManager() + models = api_manager.get_models_llamafile(api_credentials) + # note: at the moment, llamafile only serves one model at a time (so this + # list will only ever have one value). however, in the future, llamafile + # may support multiple models, so leaving this method as-is for now. + + # clean up model names + # e.g. 'mistral-7b-instruct-v0.2.Q5_K_M.gguf' -> 'mistral-7b-instruct-v0.2' + # e.g. '/Users/kate/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf' -> 'mistral-7b-instruct-v0.2 + available_model_ids = [ + Path(model.id).name.split(".")[0] for model in models + ] + + if any(model_name == m for m in available_model_ids): + return model_name + + else: + # TODO: feels weird to show a 'secret value' here but is the api base_url really a secret? + raise ValueError( + f"llamafile server at {api_credentials.api_base.get_secret_value()} does not have access to {model_name}. Please configure {model_type} to use one of {available_model_ids} or use a different llamafile." + ) + diff --git a/autogpts/autogpt/autogpt/app/main.py b/autogpts/autogpt/autogpt/app/main.py index 6c428ca6e89..b1f9ee5c6da 100644 --- a/autogpts/autogpt/autogpt/app/main.py +++ b/autogpts/autogpt/autogpt/app/main.py @@ -37,6 +37,7 @@ assert_config_has_openai_api_key, ) from autogpt.core.resource.model_providers.openai import OpenAIProvider +from autogpt.core.resource.model_providers.llamafile import LlamafileProvider from autogpt.core.runner.client_lib.utils import coroutine from autogpt.file_storage import FileStorageBackendName, get_storage from autogpt.logs.config import configure_chat_plugins, configure_logging @@ -441,10 +442,17 @@ def _configure_openai_provider(config: Config) -> OpenAIProvider: openai_settings = OpenAIProvider.default_settings.copy(deep=True) openai_settings.credentials = config.openai_credentials - return OpenAIProvider( - settings=openai_settings, - logger=logging.getLogger("OpenAIProvider"), - ) + + if config.llm_provider == "llamafile": + return LlamafileProvider( + settings=openai_settings, + logger=logging.getLogger("LlamafileProvider"), + ) + else: + return OpenAIProvider( + settings=openai_settings, + logger=logging.getLogger("OpenAIProvider"), + ) def _get_cycle_budget(continuous_mode: bool, continuous_limit: int) -> int | float: diff --git a/autogpts/autogpt/autogpt/config/config.py b/autogpts/autogpt/autogpt/config/config.py index ed1e5f78c14..06a62129657 100644 --- a/autogpts/autogpt/autogpt/config/config.py +++ b/autogpts/autogpt/autogpt/config/config.py @@ -26,6 +26,7 @@ from autogpt.logs.config import LoggingConfig from autogpt.plugins.plugins_config import PluginsConfig from autogpt.speech import TTSConfig +from autogpt.core.resource.model_providers.schema import ModelProviderName logger = logging.getLogger(__name__) @@ -83,6 +84,14 @@ class Config(SystemSettings, arbitrary_types_allowed=True): ) # Model configuration + # llm_provider: str = UserConfigurable( + # default="openai", + # from_env=lambda: os.getenv("LLM_PROVIDER") + # ) + llm_provider: ModelProviderName = UserConfigurable( + default=ModelProviderName.OPENAI, + from_env=lambda: ModelProviderName(os.getenv("LLM_PROVIDER")) + ) fast_llm: str = 
UserConfigurable( default="gpt-3.5-turbo-0125", from_env=lambda: os.getenv("FAST_LLM"), diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py new file mode 100644 index 00000000000..c5f2e2c8e9e --- /dev/null +++ b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py @@ -0,0 +1,125 @@ +import logging +from typing import Any, Iterator, Optional, TypeVar + +from openai.types.chat import ( + ChatCompletionMessage, + ChatCompletionMessageParam, +) +from overrides import overrides + +from autogpt.core.resource.model_providers.openai import ( + OpenAIProvider, + OpenAIModelName, + _functions_compat_fix_kwargs +) +from autogpt.core.resource.model_providers.schema import ( + AssistantToolCall, + AssistantToolCallDict, + ChatMessage, + CompletionModelFunction, +) +from autogpt.core.utils.json_utils import json_loads + +_T = TypeVar("_T") + + +class LlamafileProvider(OpenAIProvider): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @overrides + def _get_chat_completion_args( + self, + model_prompt: list[ChatMessage], + model_name: OpenAIModelName, + functions: Optional[list[CompletionModelFunction]] = None, + **kwargs, + ) -> tuple[list[ChatCompletionMessageParam], dict[str, Any]]: + """Prepare chat completion arguments and keyword arguments for API call. + + Args: + model_prompt: List of ChatMessages. + model_name: The model to use. + functions: Optional list of functions available to the LLM. + kwargs: Additional keyword arguments. + + Returns: + list[ChatCompletionMessageParam]: Prompt messages for the OpenAI call + dict[str, Any]: Any other kwargs for the OpenAI call + """ + kwargs.update(self._credentials.get_model_access_kwargs(model_name)) + + if functions: + _functions_compat_fix_kwargs(functions, kwargs) + + if extra_headers := self._configuration.extra_request_headers: + kwargs["extra_headers"] = kwargs.get("extra_headers", {}) + kwargs["extra_headers"].update(extra_headers.copy()) + + if "messages" in kwargs: + model_prompt += kwargs["messages"] + del kwargs["messages"] + + openai_messages: list[ChatCompletionMessageParam] = [ + message.dict( + include={"role", "content", "tool_calls", "name"}, + exclude_none=True, + ) + for message in model_prompt + ] + + return openai_messages, kwargs + + @overrides + def _parse_assistant_tool_calls( + self, assistant_message: ChatCompletionMessage, compat_mode: bool = False + ): + if not assistant_message.content: + raise ValueError("Assistant message content is empty") + if not compat_mode: + raise ValueError("compat_mode must be enabled for LlamafileProvider") + + tool_calls: list[AssistantToolCall] = [] + parse_errors: list[Exception] = [] + + for tool_call in _tool_calls_compat_extract_calls(assistant_message.content): + tool_calls.append(tool_call) + + # try: + # tool_calls = list( + # _tool_calls_compat_extract_calls(assistant_message.content) + # ) + # except Exception as e: + # parse_errors.append(e) + + return tool_calls, parse_errors + + +def _tool_calls_compat_extract_calls(response: str) -> Iterator[AssistantToolCall]: + import re + import uuid + + logging.debug(f"Trying to extract tool calls from response:\n{response}") + + response = response.strip() # strip off any leading/trailing whitespace + if response.startswith("```"): + # attempt to remove any extraneous markdown artifacts like "```json" + response = response.strip("```") + if response.startswith("json"): + response = response.strip("json") 
+        response = response.strip()  # any remaining whitespace
+
+    if response[0] == "[":
+        tool_calls: list[AssistantToolCallDict] = json_loads(response)
+    else:
+        block = re.search(r"```(?:tool_calls)?\n(.*)\n```\s*$", response, re.DOTALL)
+        if not block:
+            raise ValueError("Could not find tool_calls block in response")
+        tool_calls: list[AssistantToolCallDict] = json_loads(block.group(1))
+
+    for t in tool_calls:
+        t["id"] = str(uuid.uuid4())
+        # t["function"]["arguments"] = str(t["function"]["arguments"])  # HACK
+
+        yield AssistantToolCall.parse_obj(t)
\ No newline at end of file
diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py b/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py
index cd01b496a0b..f5a6fb7c7b1 100644
--- a/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py
+++ b/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py
@@ -81,6 +81,9 @@ class OpenAIModelName(str, enum.Enum):
     GPT4 = GPT4_ROLLING
     GPT4_32k = GPT4_ROLLING_32k
 
+    # TODO: added here for convenience, maybe better to move this somewhere else though
+    LLAMAFILE_MISTRAL_7B_INSTRUCT = "mistral-7b-instruct-v0"
+
 
 OPEN_AI_EMBEDDING_MODELS = {
     info.name: info
@@ -179,6 +182,22 @@ class OpenAIModelName(str, enum.Enum):
             max_tokens=128000,
             has_function_call_api=True,
         ),
+        ChatModelInfo(
+            name=OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT,
+            service=ModelProviderService.CHAT,
+            provider_name=ModelProviderName.LLAMAFILE,
+            prompt_token_cost=0.0,
+            completion_token_cost=0.0,
+            # TODO: the actual mistral model token limit is 32768 but the
+            # llamafile server has its own max token limit that is configured
+            # when the server is started bc the full context window might not
+            # fit in memory depending on the hardware the llamafile is being
+            # run on. probably need to think about how to set/coordinate s.t.
+            # this value and the server's value match. I use 2048 here bc
+            # that's the value I've been using during testing.
+            max_tokens=2048,
+            has_function_call_api=True,
+        ),
     ]
 }
 # Copy entries for models with equivalent specs
diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/schema.py b/autogpts/autogpt/autogpt/core/resource/model_providers/schema.py
index cc0030995aa..442d5b033d4 100644
--- a/autogpts/autogpt/autogpt/core/resource/model_providers/schema.py
+++ b/autogpts/autogpt/autogpt/core/resource/model_providers/schema.py
@@ -37,6 +37,7 @@ class ModelProviderService(str, enum.Enum):
 
 class ModelProviderName(str, enum.Enum):
     OPENAI = "openai"
+    LLAMAFILE = "llamafile"
 
 
 class ChatMessage(BaseModel):
diff --git a/autogpts/autogpt/autogpt/llm/api_manager.py b/autogpts/autogpt/autogpt/llm/api_manager.py
index 1cfcdd755f7..74cd42df3aa 100644
--- a/autogpts/autogpt/autogpt/llm/api_manager.py
+++ b/autogpts/autogpt/autogpt/llm/api_manager.py
@@ -128,3 +128,26 @@ def get_models(self, openai_credentials: OpenAICredentials) -> List[Model]:
                 exit(1)
 
         return self.models
+
+    def get_models_llamafile(self, openai_credentials: OpenAICredentials) -> List[Model]:
+        """
+        Same as `get_models` but doesn't filter out non-'gpt' models.
+        TODO: No real reason for this to be a separate method, but I don't know
+        what effect removing the 'gpt-only' filter from the original method
+        would have on the OpenAI-related code.
+        """
+        if self.models is not None:
+            return self.models
+
+        try:
+            all_models = (
+                OpenAI(**openai_credentials.get_api_access_kwargs())
+                .models.list()
+                .data
+            )
+            self.models = [model for model in all_models]
+        except APIError as e:
+            logger.error(e.message)
+            exit(1)
+
+        return self.models

From ed1dfd0e8431c46dbfe17cecf3ddd49be849d448 Mon Sep 17 00:00:00 2001
From: Kate Silverstein
Date: Thu, 18 Apr 2024 14:49:14 -0400
Subject: [PATCH 02/40] Adapt model prompt message roles to be compatible with
 the Mistral-7b-Instruct chat template, which supports the 'user' &
 'assistant' roles but does not support the 'system' role.

---
 .../resource/model_providers/llamafile.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py
index c5f2e2c8e9e..41350136d58 100644
--- a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py
+++ b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py
@@ -61,6 +61,9 @@ def _get_chat_completion_args(
             model_prompt += kwargs["messages"]
             del kwargs["messages"]
 
+        if model_name == OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT:
+            model_prompt = self._adapt_chat_messages_for_mistral_instruct(model_prompt)
+
         openai_messages: list[ChatCompletionMessageParam] = [
             message.dict(
                 include={"role", "content", "tool_calls", "name"},
@@ -71,6 +74,43 @@ def _get_chat_completion_args(
 
         return openai_messages, kwargs
 
+    def _adapt_chat_messages_for_mistral_instruct(
+        self,
+        messages: list[ChatMessage]
+    ) -> list[ChatMessage]:
+        """
+        Munge the messages to be compatible with the mistral-7b-instruct chat
+        template, which:
+        - only supports 'user' and 'assistant' roles.
+        - expects messages to alternate between user/assistant roles.
+
+        See details here: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format
+
+        """
+        adapted_messages = []
+        for m in messages:
+            if m.role == ChatMessage.Role.SYSTEM:
+                # convert to 'user' role
+                adapted_messages.append(ChatMessage.user(m.content))
+            else:
+                adapted_messages.append(m)
+
+        # if there are multiple adjacent user messages, glom them together
+        # into a single user message
+        glommed = []
+        i = 0
+        while i < len(adapted_messages):
+            if len(glommed) == 0:
+                glommed.append(adapted_messages[i])
+            elif adapted_messages[i].role != glommed[-1].role:
+                glommed.append(adapted_messages[i])
+            else:
+                glommed[-1].content += " " + adapted_messages[i].content
+            i += 1
+
+        return glommed
+
     @overrides
     def _parse_assistant_tool_calls(
         self, assistant_message: ChatCompletionMessage, compat_mode: bool = False
     ):

From c56c290d98df65b04d94046c42229481d9a2200f Mon Sep 17 00:00:00 2001
From: Kate Silverstein
Date: Thu, 18 Apr 2024 17:06:45 -0400
Subject: [PATCH 03/40] In `OpenAIProvider`, change methods
 `count_message_tokens`, `count_tokens`, and `get_tokenizer` from classmethods
 to regular methods so I can override them in LlamafileProvider (and so I can
 access instance attributes from inside them). Implement class
 `LlamafileTokenizer` that calls the llamafile server's `/tokenize` API
 endpoint.
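
For reference, the `LlamafileTokenizer` just wraps the server's `/tokenize` and
`/detokenize` endpoints. A rough sketch of the round trip, assuming the default
`http://localhost:8080` base URL used by `serve.sh`:

```bash
# Token counts come from the llamafile server itself rather than tiktoken.
curl -s -H "Content-Type: application/json" \
  -d '{"content": "Tell me about Roman dodecahedrons."}' \
  http://localhost:8080/tokenize
# -> {"tokens": [...]}

curl -s -H "Content-Type: application/json" \
  -d '{"tokens": [1, 2, 3]}' \
  http://localhost:8080/detokenize
# -> {"content": "..."}
```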
--- .../resource/model_providers/llamafile.py | 105 ++++++++++++++++++ .../core/resource/model_providers/openai.py | 13 +-- 2 files changed, 110 insertions(+), 8 deletions(-) diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py index 41350136d58..fa96b43b708 100644 --- a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py +++ b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py @@ -1,13 +1,16 @@ import logging from typing import Any, Iterator, Optional, TypeVar +import requests from openai.types.chat import ( + ChatCompletion, ChatCompletionMessage, ChatCompletionMessageParam, ) from overrides import overrides from autogpt.core.resource.model_providers.openai import ( + OpenAICredentials, OpenAIProvider, OpenAIModelName, _functions_compat_fix_kwargs @@ -17,17 +20,100 @@ AssistantToolCallDict, ChatMessage, CompletionModelFunction, + ModelTokenizer, ) from autogpt.core.utils.json_utils import json_loads _T = TypeVar("_T") +class LlamafileTokenizer(ModelTokenizer): + + def __init__(self, credentials: OpenAICredentials): + self._credentials = credentials + + @property + def _tokenizer_base_url(self): + # The OpenAI-chat-compatible base url should look something like + # 'http://localhost:8080/v1' but the tokenizer endpoint is + # 'http://localhost:8080/tokenize'. So here we just strip off the '/v1'. + api_base = self._credentials.api_base.get_secret_value() + return api_base.strip("/v1") + + def encode(self, text: str) -> list[int]: + response = requests.post( + url=f"{self._tokenizer_base_url}/tokenize", + json={"content": text} + ) + response.raise_for_status() + return response.json()["tokens"] + + def decode(self, tokens: list[int]) -> str: + response = requests.post( + url=f"{self._tokenizer_base_url}/detokenize", + json={"tokens": tokens} + ) + response.raise_for_status() + return response.json()["content"] + + class LlamafileProvider(OpenAIProvider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + @overrides + def get_tokenizer(self, model_name: OpenAIModelName) -> ModelTokenizer: + return LlamafileTokenizer(self._credentials) + + @overrides + def count_tokens(self, text: str, model_name: OpenAIModelName) -> int: + return len(self.get_tokenizer(model_name).encode(text)) + + @overrides + def count_message_tokens( + self, + messages: ChatMessage | list[ChatMessage], + model_name: OpenAIModelName, + ) -> int: + if isinstance(messages, ChatMessage): + messages = [messages] + + if model_name == OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: + # For mistral-instruct, num added tokens depends on if the message + # is a prompt/instruction or an assistant-generated message. + # - prompt gets [INST], [/INST] added and the first instruction + # begins with '' ('beginning-of-sentence' token). 
+ # - assistant-generated messages get '' added + # see: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2 + # + prompt_added = 1 # one for '' token + assistant_num_added = 0 + ntokens = 0 + for message in messages: + # if message.role == ChatMessage.Role.SYSTEM: + # raise ValueError(f"{model_name} does not support 'system' role") + if ( + message.role == ChatMessage.Role.USER or + message.role == ChatMessage.Role.SYSTEM # note that 'system' messages will get converted to 'user' messages before being sent to the model + ): + # 5 tokens for [INST], [/INST], which actually get + # tokenized into "[, INST, ]" and "[, /, INST, ]" + # by the mistral tokenizer + prompt_added += 5 + elif message.role == ChatMessage.Role.ASSISTANT: + assistant_num_added += 1 # for + else: + raise ValueError(f"{model_name} does not support role: {message.role}") + + ntokens += self.count_tokens(message.content, model_name) + + total_token_count = prompt_added + assistant_num_added + ntokens + return total_token_count + + else: + raise NotImplementedError(f"count_message_tokens not implemented for model {model_name}") + @overrides def _get_chat_completion_args( self, @@ -110,6 +196,25 @@ def _adapt_chat_messages_for_mistral_instruct( return glommed + @overrides + async def _create_chat_completion( + self, + messages: list[ChatCompletionMessageParam], + model: OpenAIModelName, + *_, + **kwargs, + ) -> tuple[ChatCompletion, float, int, int]: + if model == OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: + # validate that all messages have roles that are supported by + # mistral-7b-instruct + for m in messages: + if m["role"] not in [ + ChatMessage.Role.USER, + ChatMessage.Role.ASSISTANT + ]: + raise ValueError(f"Role {m['role']} not supported by model {model}") + + return await super()._create_chat_completion(messages, model, **kwargs) @overrides def _parse_assistant_tool_calls( diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py b/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py index f5a6fb7c7b1..d2e75f47215 100644 --- a/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py +++ b/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py @@ -196,7 +196,7 @@ class OpenAIModelName(str, enum.Enum): # this value and the server's value match. I use 2048 here bc # that's the value I've been using during testing. 
max_tokens=2048, - has_function_call_api=True, + has_function_call_api=False, ), ] } @@ -368,18 +368,15 @@ def get_token_limit(self, model_name: str) -> int: """Get the token limit for a given model.""" return OPEN_AI_MODELS[model_name].max_tokens - @classmethod - def get_tokenizer(cls, model_name: OpenAIModelName) -> ModelTokenizer: + def get_tokenizer(self, model_name: OpenAIModelName) -> ModelTokenizer: return tiktoken.encoding_for_model(model_name) - @classmethod - def count_tokens(cls, text: str, model_name: OpenAIModelName) -> int: - encoding = cls.get_tokenizer(model_name) + def count_tokens(self, text: str, model_name: OpenAIModelName) -> int: + encoding = self.get_tokenizer(model_name) return len(encoding.encode(text)) - @classmethod def count_message_tokens( - cls, + self, messages: ChatMessage | list[ChatMessage], model_name: OpenAIModelName, ) -> int: From 234d05933cc75a559a68a6ef221722234adfe220 Mon Sep 17 00:00:00 2001 From: Kate Silverstein Date: Thu, 18 Apr 2024 18:38:45 -0400 Subject: [PATCH 04/40] misc cleanup --- .../resource/model_providers/llamafile.py | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py index fa96b43b708..c8c0e9f5804 100644 --- a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py +++ b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py @@ -91,8 +91,6 @@ def count_message_tokens( assistant_num_added = 0 ntokens = 0 for message in messages: - # if message.role == ChatMessage.Role.SYSTEM: - # raise ValueError(f"{model_name} does not support 'system' role") if ( message.role == ChatMessage.Role.USER or message.role == ChatMessage.Role.SYSTEM # note that 'system' messages will get converted to 'user' messages before being sent to the model @@ -220,23 +218,16 @@ async def _create_chat_completion( def _parse_assistant_tool_calls( self, assistant_message: ChatCompletionMessage, compat_mode: bool = False ): - if not assistant_message.content: - raise ValueError("Assistant message content is empty") - if not compat_mode: - raise ValueError("compat_mode must be enabled for LlamafileProvider") - tool_calls: list[AssistantToolCall] = [] parse_errors: list[Exception] = [] - for tool_call in _tool_calls_compat_extract_calls(assistant_message.content): - tool_calls.append(tool_call) - - # try: - # tool_calls = list( - # _tool_calls_compat_extract_calls(assistant_message.content) - # ) - # except Exception as e: - # parse_errors.append(e) + if compat_mode and assistant_message.content: + try: + tool_calls = list( + _tool_calls_compat_extract_calls(assistant_message.content) + ) + except Exception as e: + parse_errors.append(e) return tool_calls, parse_errors From 05d2b81e84c18c498c802847c6b08ab25e07f2bd Mon Sep 17 00:00:00 2001 From: Kate Silverstein Date: Thu, 18 Apr 2024 19:52:47 -0400 Subject: [PATCH 05/40] add README for llamafile integration including setup instruction + notes on the integration; add helper scripts for downloading/running a llamafile + example env file. 
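
In short, `env.llamafile.example` is a copy of `.env.template` whose
llamafile-specific overrides boil down to something like the following
(the placeholder API key only has to satisfy the format check; the llamafile
server ignores it):

```dotenv
OPENAI_API_KEY=sk-000000000000000000000000000000000000000000000000
OPENAI_API_BASE_URL=http://localhost:8080/v1
LLM_PROVIDER=llamafile
SMART_LLM=mistral-7b-instruct-v0
FAST_LLM=mistral-7b-instruct-v0
```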
--- .../autogpt/llamafile-integration/README.md | 109 ++++++++ .../env.llamafile.example | 247 ++++++++++++++++++ .../autogpt/llamafile-integration/serve.sh | 16 ++ .../llamafile-integration/setup-llamafile.sh | 8 + 4 files changed, 380 insertions(+) create mode 100644 autogpts/autogpt/llamafile-integration/README.md create mode 100644 autogpts/autogpt/llamafile-integration/env.llamafile.example create mode 100755 autogpts/autogpt/llamafile-integration/serve.sh create mode 100755 autogpts/autogpt/llamafile-integration/setup-llamafile.sh diff --git a/autogpts/autogpt/llamafile-integration/README.md b/autogpts/autogpt/llamafile-integration/README.md new file mode 100644 index 00000000000..bb28977ca6c --- /dev/null +++ b/autogpts/autogpt/llamafile-integration/README.md @@ -0,0 +1,109 @@ +# Llamafile/AutoGPT Integration Notes + +## Setup + +### AutoGPT setup + +```bash +git clone git@github.com:Mozilla-Ocho/AutoGPT.git +cd AutoGPT/autogpts/autogpt +pyenv local 3.11 +./setup +cp llamafile-integration/env.llamafile.example .env +``` + + +### llamafile setup + +Run the llamafile setup script: + +```bash +./llamafile-integration/setup-llamafile.sh +``` + +### Run AutoGPT + llamafile + +First, start the llamafile server: + +```bash +./llamafile-integration/serve.sh +``` + +Then, in a separate terminal, run AutoGPT: + +```bash +./autogpt.sh run +``` + +I tested everything using the task prompt: "Tell me about Roman dodecahedrons." + +```bash +Enter the task that you want AutoGPT to execute, with as much detail as possible: Tell me about Roman dodecahedrons. +``` + +## Implementation Notes + +Here's a brief summary of the issues I encountered & fixed while I was trying to get this integration to work. + +### Initial Setup + +Tested with: +* Python 3.11 +* Apple M2 Pro (32 GB), macOS 14.2.1 + +AutoGPT setup steps: + +starting commit: `7082e63b115d72440ee2dfe3f545fa3dcba490d5` + +```bash +git clone git@github.com:Mozilla-Ocho/AutoGPT.git +cd AutoGPT/autogpts/autogpt +pyenv local 3.11 +./setup +cp .env.template .env +``` + +then I edited `.env` to set: + +```dotenv +OPENAI_API_KEY=sk-noop +OPENAI_API_BASE_URL=http://localhost:8080/v1 +``` + +In a separate terminal window, I started the llamafile server: + +```bash +./llamafile-integration/setup.sh +./llamafile-integration/serve.sh +``` + +### Issue 1: Fix 'Error: Invalid OpenAI API key' + +Culprit: API key validation is baked in regardless of whether we actually need an API key or what format the API key is supposed to take. 
See: +- https://github.com/Mozilla-Ocho/AutoGPT/blob/262771a69c787814222e23d856f4438333256245/autogpts/autogpt/autogpt/app/main.py#L104 +- https://github.com/Mozilla-Ocho/AutoGPT/blob/028d2c319f3dcca6aa57fc4fdcd2e78a01926e3f/autogpts/autogpt/autogpt/config/config.py#L306 + +Temporary fix: In `.env`, changed `OPENAI_API_KEY` to something that passes the regex validator: + +```bash +## OPENAI_API_KEY - OpenAI API Key (Example: my-openai-api-key) +#OPENAI_API_KEY=sk-noop +OPENAI_API_KEY="sk-000000000000000000000000000000000000000000000000" +``` + +### Issue 2: Fix 'ValueError: LLM did not call `create_agent` function; agent profile creation failed' + +* Added new entry to `OPEN_AI_CHAT_MODELS` with `has_function_call_api=False` so that `tool_calls_compat_mode` will be triggered in the `create_chat_completion` (changes in `autogpt/core/resource/model_providers/openai.py`) +* Modified `_tool_calls_compat_extract_calls` to strip off whitespace and markdown syntax at the beginning/end of model responses (changes in `autogpt/core/resource/model_providers/llamafile.py`) +* Modified `_get_chat_completion_args` to adapt model prompt message roles to be compatible with the [Mistral-7b-Instruct chat template](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format), which supports the 'user' & 'assistant' roles but does not support the 'system' role (changes in `autogpt/core/resource/model_providers/llamafile.py`). + +### Issue 3: Fix: 'NotImplementedError: `count_message_tokens()` is not implemented for model' + +* In `OpenAIProvider`, change methods `count_message_tokens`, `count_tokens`, and `get_tokenizer` from classmethods to regular methods so a) I can override them in subclass `LlamafileProvider`, b) these methods can access instance attributes (this is required in my implementation of these methods in `LlamafileProvider`). +* Implement class `LlamafileTokenizer` that calls the llamafile server's `/tokenize` API endpoint. Implement methods `count_message_tokens`, `count_tokens`, and `get_tokenizer` in `LlamafileProvider` (changes in `autogpt/core/resource/model_providers/llamafile.py`). + +## Other TODOs + +* `SMART_LLM`/`FAST_LLM` configuration: Currently, the llamafile server only serves one model at a time. However, there's no reason you can't start multiple llamafile servers on different ports. To support using different models for `smart_llm` and `fast_llm`, you could implement config vars like `LLAMAFILE_SMART_LLM_URL` and `LLAMAFILE_FAST_LLM_URL` that point to different llamafile servers (one serving a 'big model' and one serving a 'fast model'). +* Authorization: the `serve.sh` script does not set up any authorization for the llamafile server; this can be turned on by adding arg `--api-key ` to the server startup command. However I haven't attempted to test whether the integration with autogpt works when this feature is turned on. 
+* Added a few TODOs inline in the code \ No newline at end of file diff --git a/autogpts/autogpt/llamafile-integration/env.llamafile.example b/autogpts/autogpt/llamafile-integration/env.llamafile.example new file mode 100644 index 00000000000..ed39b78942e --- /dev/null +++ b/autogpts/autogpt/llamafile-integration/env.llamafile.example @@ -0,0 +1,247 @@ +################################################################################ +### AutoGPT - GENERAL SETTINGS +################################################################################ + +## OPENAI_API_KEY - OpenAI API Key (Example: my-openai-api-key) +#OPENAI_API_KEY=your-openai-api-key +OPENAI_API_KEY=sk-000000000000000000000000000000000000000000000000 + +## TELEMETRY_OPT_IN - Share telemetry on errors and other issues with the AutoGPT team, e.g. through Sentry. +## This helps us to spot and solve problems earlier & faster. (Default: DISABLED) +TELEMETRY_OPT_IN=false + +## EXECUTE_LOCAL_COMMANDS - Allow local command execution (Default: False) +# EXECUTE_LOCAL_COMMANDS=False + +### Workspace ### + +## RESTRICT_TO_WORKSPACE - Restrict file operations to workspace ./data/agents//workspace (Default: True) +# RESTRICT_TO_WORKSPACE=True + +## DISABLED_COMMAND_CATEGORIES - The list of categories of commands that are disabled (Default: None) +# DISABLED_COMMAND_CATEGORIES= + +## FILE_STORAGE_BACKEND - Choose a storage backend for contents +## Options: local, gcs, s3 +# FILE_STORAGE_BACKEND=local + +## STORAGE_BUCKET - GCS/S3 Bucket to store contents in +# STORAGE_BUCKET=autogpt + +## GCS Credentials +# see https://cloud.google.com/storage/docs/authentication#libauth + +## AWS/S3 Credentials +# see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html + +## S3_ENDPOINT_URL - If you're using non-AWS S3, set your endpoint here. +# S3_ENDPOINT_URL= + +### Miscellaneous ### + +## USER_AGENT - Define the user-agent used by the requests library to browse website (string) +# USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36" + +## AI_SETTINGS_FILE - Specifies which AI Settings file to use, relative to the AutoGPT root directory. (defaults to ai_settings.yaml) +# AI_SETTINGS_FILE=ai_settings.yaml + +## PLUGINS_CONFIG_FILE - The path to the plugins_config.yaml file, relative to the AutoGPT root directory. (Default plugins_config.yaml) +# PLUGINS_CONFIG_FILE=plugins_config.yaml + +## PROMPT_SETTINGS_FILE - Specifies which Prompt Settings file to use, relative to the AutoGPT root directory. (defaults to prompt_settings.yaml) +# PROMPT_SETTINGS_FILE=prompt_settings.yaml + +## AUTHORISE COMMAND KEY - Key to authorise commands +# AUTHORISE_COMMAND_KEY=y + +## EXIT_KEY - Key to exit AutoGPT +# EXIT_KEY=n + +################################################################################ +### LLM PROVIDER +################################################################################ + +LLM_PROVIDER=llamafile + +## TEMPERATURE - Sets temperature in OpenAI (Default: 0) +# TEMPERATURE=0 + +## OPENAI_API_BASE_URL - Custom url for the OpenAI API, useful for connecting to custom backends. 
No effect if USE_AZURE is true, leave blank to keep the default url +# the following is an example: +# OPENAI_API_BASE_URL=http://localhost:443/v1 +OPENAI_API_BASE_URL=http://localhost:8080/v1 + +# OPENAI_API_TYPE= +# OPENAI_API_VERSION= + +## OPENAI_FUNCTIONS - Enables OpenAI functions: https://platform.openai.com/docs/guides/gpt/function-calling +## Note: this feature is only supported by OpenAI's newer models. +# OPENAI_FUNCTIONS=False + +## OPENAI_ORGANIZATION - Your OpenAI Organization key (Default: None) +# OPENAI_ORGANIZATION= + +## USE_AZURE - Use Azure OpenAI or not (Default: False) +# USE_AZURE=False + +## AZURE_CONFIG_FILE - The path to the azure.yaml file, relative to the folder containing this file. (Default: azure.yaml) +# AZURE_CONFIG_FILE=azure.yaml + +# AZURE_OPENAI_AD_TOKEN= +# AZURE_OPENAI_ENDPOINT= + +################################################################################ +### LLM MODELS +################################################################################ + +## SMART_LLM - Smart language model (Default: gpt-4-turbo-preview) +# SMART_LLM=gpt-4-turbo-preview +SMART_LLM=mistral-7b-instruct-v0 + +## FAST_LLM - Fast language model (Default: gpt-3.5-turbo-0125) +# FAST_LLM=gpt-3.5-turbo-0125 +FAST_LLM=mistral-7b-instruct-v0 + +## EMBEDDING_MODEL - Model to use for creating embeddings +# EMBEDDING_MODEL=text-embedding-3-small + +################################################################################ +### SHELL EXECUTION +################################################################################ + +## SHELL_COMMAND_CONTROL - Whether to use "allowlist" or "denylist" to determine what shell commands can be executed (Default: denylist) +# SHELL_COMMAND_CONTROL=denylist + +## ONLY if SHELL_COMMAND_CONTROL is set to denylist: +## SHELL_DENYLIST - List of shell commands that ARE NOT allowed to be executed by AutoGPT (Default: sudo,su) +# SHELL_DENYLIST=sudo,su + +## ONLY if SHELL_COMMAND_CONTROL is set to allowlist: +## SHELL_ALLOWLIST - List of shell commands that ARE allowed to be executed by AutoGPT (Default: None) +# SHELL_ALLOWLIST= + +################################################################################ +### IMAGE GENERATION PROVIDER +################################################################################ + +### Common + +## IMAGE_PROVIDER - Image provider (Default: dalle) +# IMAGE_PROVIDER=dalle + +## IMAGE_SIZE - Image size (Default: 256) +# IMAGE_SIZE=256 + +### Huggingface (IMAGE_PROVIDER=huggingface) + +## HUGGINGFACE_IMAGE_MODEL - Text-to-image model from Huggingface (Default: CompVis/stable-diffusion-v1-4) +# HUGGINGFACE_IMAGE_MODEL=CompVis/stable-diffusion-v1-4 + +## HUGGINGFACE_API_TOKEN - HuggingFace API token (Default: None) +# HUGGINGFACE_API_TOKEN= + +### Stable Diffusion (IMAGE_PROVIDER=sdwebui) + +## SD_WEBUI_AUTH - Stable Diffusion Web UI username:password pair (Default: None) +# SD_WEBUI_AUTH= + +## SD_WEBUI_URL - Stable Diffusion Web UI API URL (Default: http://localhost:7860) +# SD_WEBUI_URL=http://localhost:7860 + +################################################################################ +### AUDIO TO TEXT PROVIDER +################################################################################ + +## AUDIO_TO_TEXT_PROVIDER - Audio-to-text provider (Default: huggingface) +# AUDIO_TO_TEXT_PROVIDER=huggingface + +## HUGGINGFACE_AUDIO_TO_TEXT_MODEL - The model for HuggingFace to use (Default: CompVis/stable-diffusion-v1-4) +# HUGGINGFACE_AUDIO_TO_TEXT_MODEL=CompVis/stable-diffusion-v1-4 + 
+################################################################################ +### GITHUB +################################################################################ + +## GITHUB_API_KEY - Github API key / PAT (Default: None) +# GITHUB_API_KEY= + +## GITHUB_USERNAME - Github username (Default: None) +# GITHUB_USERNAME= + +################################################################################ +### WEB BROWSING +################################################################################ + +## HEADLESS_BROWSER - Whether to run the browser in headless mode (default: True) +# HEADLESS_BROWSER=True + +## USE_WEB_BROWSER - Sets the web-browser driver to use with selenium (default: chrome) +# USE_WEB_BROWSER=chrome +USE_WEB_BROWSER=firefox + +## BROWSE_CHUNK_MAX_LENGTH - When browsing website, define the length of chunks to summarize (Default: 3000) +# BROWSE_CHUNK_MAX_LENGTH=3000 + +## BROWSE_SPACY_LANGUAGE_MODEL - spaCy language model](https://spacy.io/usage/models) to use when creating chunks. (Default: en_core_web_sm) +# BROWSE_SPACY_LANGUAGE_MODEL=en_core_web_sm + +## GOOGLE_API_KEY - Google API key (Default: None) +# GOOGLE_API_KEY= + +## GOOGLE_CUSTOM_SEARCH_ENGINE_ID - Google custom search engine ID (Default: None) +# GOOGLE_CUSTOM_SEARCH_ENGINE_ID= + +################################################################################ +### TEXT TO SPEECH PROVIDER +################################################################################ + +## TEXT_TO_SPEECH_PROVIDER - Which Text to Speech provider to use (Default: gtts) +## Options: gtts, streamelements, elevenlabs, macos +# TEXT_TO_SPEECH_PROVIDER=gtts + +## STREAMELEMENTS_VOICE - Voice to use for StreamElements (Default: Brian) +# STREAMELEMENTS_VOICE=Brian + +## ELEVENLABS_API_KEY - Eleven Labs API key (Default: None) +# ELEVENLABS_API_KEY= + +## ELEVENLABS_VOICE_ID - Eleven Labs voice ID (Example: None) +# ELEVENLABS_VOICE_ID= + +################################################################################ +### CHAT MESSAGES +################################################################################ + +## CHAT_MESSAGES_ENABLED - Enable chat messages (Default: False) +# CHAT_MESSAGES_ENABLED=False + +################################################################################ +### LOGGING +################################################################################ + +## LOG_LEVEL - Set the minimum level to filter log output by. Setting this to DEBUG implies LOG_FORMAT=debug, unless LOG_FORMAT is set explicitly. +## Options: DEBUG, INFO, WARNING, ERROR, CRITICAL +# LOG_LEVEL=INFO +LOG_LEVEL=DEBUG + +## LOG_FORMAT - The format in which to log messages to the console (and log files). +## Options: simple, debug, structured_google_cloud +# LOG_FORMAT=simple + +## LOG_FILE_FORMAT - Normally follows the LOG_FORMAT setting, but can be set separately. +## Note: Log file output is disabled if LOG_FORMAT=structured_google_cloud. +# LOG_FILE_FORMAT=simple + +## PLAIN_OUTPUT - Disables animated typing and the spinner in the console output. (Default: False) +# PLAIN_OUTPUT=False + + +################################################################################ +### Agent Protocol Server Settings +################################################################################ +## AP_SERVER_PORT - Specifies what port the agent protocol server will listen on. 
(Default: 8000) +## AP_SERVER_DB_URL - Specifies what connection url the agent protocol database will connect to (Default: Internal SQLite) +## AP_SERVER_CORS_ALLOWED_ORIGINS - Comma separated list of allowed origins for CORS. (Default: http://localhost:{AP_SERVER_PORT}) +# AP_SERVER_PORT=8000 +# AP_SERVER_DB_URL=sqlite:///data/ap_server.db +# AP_SERVER_CORS_ALLOWED_ORIGINS= diff --git a/autogpts/autogpt/llamafile-integration/serve.sh b/autogpts/autogpt/llamafile-integration/serve.sh new file mode 100755 index 00000000000..91f273a6bd0 --- /dev/null +++ b/autogpts/autogpt/llamafile-integration/serve.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# +# Use llamafile to server a (quantized) mistral-7b-instruct-v0.2 model +# +# Usage: +# cd /autogpts/autogpt +# ./llamafile-integration/serve.sh +# + +LLAMAFILE="./llamafile-integration/mistral-7b-instruct-v0.2.Q5_K_M.llamafile" + +"${LLAMAFILE}" \ +--server \ +--nobrowser \ +--ctx-size 2048 \ +--n-predict 512 diff --git a/autogpts/autogpt/llamafile-integration/setup-llamafile.sh b/autogpts/autogpt/llamafile-integration/setup-llamafile.sh new file mode 100755 index 00000000000..ad2426a2324 --- /dev/null +++ b/autogpts/autogpt/llamafile-integration/setup-llamafile.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cd llamafile-integration + +# Download the mistral-7b-instruct llamafile from HuggingFace +wget -nc https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile +chmod +x mistral-7b-instruct-v0.2.Q5_K_M.llamafile +./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --version From 1cd3e8b17bf49e99dd1fe09af11b6a8e1f5aeca2 Mon Sep 17 00:00:00 2001 From: Kate Silverstein Date: Thu, 18 Apr 2024 22:50:18 -0400 Subject: [PATCH 06/40] simplify mistral message handling; set seed=0 in chat completion kwargs for reproducibility --- .../resource/model_providers/llamafile.py | 100 +++++------------- 1 file changed, 25 insertions(+), 75 deletions(-) diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py index c8c0e9f5804..8ea412dc145 100644 --- a/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py +++ b/autogpts/autogpt/autogpt/core/resource/model_providers/llamafile.py @@ -112,56 +112,10 @@ def count_message_tokens( else: raise NotImplementedError(f"count_message_tokens not implemented for model {model_name}") - @overrides - def _get_chat_completion_args( - self, - model_prompt: list[ChatMessage], - model_name: OpenAIModelName, - functions: Optional[list[CompletionModelFunction]] = None, - **kwargs, - ) -> tuple[list[ChatCompletionMessageParam], dict[str, Any]]: - """Prepare chat completion arguments and keyword arguments for API call. - - Args: - model_prompt: List of ChatMessages. - model_name: The model to use. - functions: Optional list of functions available to the LLM. - kwargs: Additional keyword arguments. 
- - Returns: - list[ChatCompletionMessageParam]: Prompt messages for the OpenAI call - dict[str, Any]: Any other kwargs for the OpenAI call - """ - kwargs.update(self._credentials.get_model_access_kwargs(model_name)) - - if functions: - _functions_compat_fix_kwargs(functions, kwargs) - - if extra_headers := self._configuration.extra_request_headers: - kwargs["extra_headers"] = kwargs.get("extra_headers", {}) - kwargs["extra_headers"].update(extra_headers.copy()) - - if "messages" in kwargs: - model_prompt += kwargs["messages"] - del kwargs["messages"] - - if model_name == OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: - model_prompt = self._adapt_chat_messages_for_mistral_instruct(model_prompt) - - openai_messages: list[ChatCompletionMessageParam] = [ - message.dict( - include={"role", "content", "tool_calls", "name"}, - exclude_none=True, - ) - for message in model_prompt - ] - - return openai_messages, kwargs - def _adapt_chat_messages_for_mistral_instruct( self, - messages: list[ChatMessage] - ) -> list[ChatMessage]: + messages: list[ChatCompletionMessageParam] + ) -> list[ChatCompletionMessageParam]: """ Munge the messages to be compatible with the mistral-7b-instruct chat template, which: @@ -172,27 +126,25 @@ def _adapt_chat_messages_for_mistral_instruct( """ adapted_messages = [] - for m in messages: - if m.role == ChatMessage.Role.SYSTEM: - # convert to 'user' role - adapted_messages.append(ChatMessage.user(m.content)) - else: - adapted_messages.append(m) - - # if there are multiple adjacent user messages, glom them together - # into a single user message - glommed = [] - i = 0 - while i < len(adapted_messages): - if len(glommed) == 0: - glommed.append(adapted_messages[i]) - elif adapted_messages[i].role != glommed[-1].role: - glommed.append(adapted_messages[i]) + for message in messages: + + # convert 'system' role to 'user' role as mistral-7b-instruct does + # not support 'system' + if message["role"] == ChatMessage.Role.SYSTEM: + message["role"] = ChatMessage.Role.USER + + if len(adapted_messages) == 0: + adapted_messages.append(message) + else: - glommed[-1].content += " " + adapted_messages[i].content - i += 1 + if message["role"] == adapted_messages[-1]["role"]: + # if the curr message has the same role as the previous one, + # concat the current message content to the prev message + adapted_messages[-1]["content"] += " " + message["content"] + else: + adapted_messages.append(message) - return glommed + return adapted_messages @overrides async def _create_chat_completion( @@ -203,14 +155,12 @@ async def _create_chat_completion( **kwargs, ) -> tuple[ChatCompletion, float, int, int]: if model == OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: - # validate that all messages have roles that are supported by - # mistral-7b-instruct - for m in messages: - if m["role"] not in [ - ChatMessage.Role.USER, - ChatMessage.Role.ASSISTANT - ]: - raise ValueError(f"Role {m['role']} not supported by model {model}") + messages = self._adapt_chat_messages_for_mistral_instruct(messages) + + if "seed" not in kwargs: + # TODO: temporarily hard-coded for reproducibility, instead the + # seed should be set from config + kwargs["seed"] = 0 return await super()._create_chat_completion(messages, model, **kwargs) From dc36c69684e8984d84b59028d897f04043c85434 Mon Sep 17 00:00:00 2001 From: Kate Silverstein Date: Fri, 19 Apr 2024 00:18:03 -0400 Subject: [PATCH 07/40] set mistral max_tokens to actual value configured in the model and change serve.sh to use model's full context size (this does not seem to 
cause OOM errors, surpisingly). --- .../core/resource/model_providers/openai.py | 9 +- .../autogpt/llamafile-integration/README.md | 115 +++++++++++++++++- .../autogpt/llamafile-integration/serve.sh | 8 +- 3 files changed, 116 insertions(+), 16 deletions(-) diff --git a/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py b/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py index d2e75f47215..2b36dc8eb79 100644 --- a/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py +++ b/autogpts/autogpt/autogpt/core/resource/model_providers/openai.py @@ -188,14 +188,7 @@ class OpenAIModelName(str, enum.Enum): provider_name=ModelProviderName.LLAMAFILE, prompt_token_cost=0.0, completion_token_cost=0.0, - # TODO: the actual mistral model token limit is 32768 but the - # llamafile server has its own max token limit that is configured - # when the server is started bc the full context window might not - # fit in memory depending on the hardware the llamafile is being - # run on. probably need to think about how to set/coordinate s.t. - # this value and the server's value match. I use 2048 here bc - # that's the value I've been using during testing. - max_tokens=2048, + max_tokens=32768, has_function_call_api=False, ), ] diff --git a/autogpts/autogpt/llamafile-integration/README.md b/autogpts/autogpt/llamafile-integration/README.md index bb28977ca6c..36db72f536a 100644 --- a/autogpts/autogpt/llamafile-integration/README.md +++ b/autogpts/autogpt/llamafile-integration/README.md @@ -1,5 +1,12 @@ # Llamafile/AutoGPT Integration Notes +Tested with: +* Python 3.11 +* Apple M2 Pro (32 GB), macOS 14.2.1 +* quantized mistral-7b-instruct-v0.2 + +I tested everything using the task: "Tell me about Roman dodecahedrons." + ## Setup ### AutoGPT setup @@ -7,6 +14,7 @@ ```bash git clone git@github.com:Mozilla-Ocho/AutoGPT.git cd AutoGPT/autogpts/autogpt +git checkout draft-llamafile-support pyenv local 3.11 ./setup cp llamafile-integration/env.llamafile.example .env @@ -35,10 +43,102 @@ Then, in a separate terminal, run AutoGPT: ./autogpt.sh run ``` -I tested everything using the task prompt: "Tell me about Roman dodecahedrons." +Sample interaction: ```bash +2024-04-18 23:55:26,895 WARNING You are running on `draft-llamafile-support` branch - this is not a supported branch. +2024-04-18 23:55:26,896 INFO Smart LLM: mistral-7b-instruct-v0 +2024-04-18 23:55:26,896 INFO Fast LLM: mistral-7b-instruct-v0 +2024-04-18 23:55:26,896 INFO Browser: firefox +2024-04-18 23:55:26,898 INFO Code Execution: DISABLED (Docker unavailable) Enter the task that you want AutoGPT to execute, with as much detail as possible: Tell me about Roman dodecahedrons. +2024-04-18 23:55:59,738 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" +2024-04-18 23:55:59,741 INFO Current AI Settings: +2024-04-18 23:55:59,741 INFO -------------------: +2024-04-18 23:55:59,741 INFO Name : HistorianDodecahedronGPT +2024-04-18 23:55:59,741 INFO Role : An autonomous agent specialized in providing in-depth knowledge and analysis about Roman dodecahedrons. +2024-04-18 23:55:59,741 INFO Constraints: +2024-04-18 23:55:59,741 INFO - Exclusively use the commands listed below. +2024-04-18 23:55:59,741 INFO - You can only act proactively, and are unable to start background jobs or set up webhooks for yourself. Take this into account when planning your actions. +2024-04-18 23:55:59,741 INFO - You are unable to interact with physical objects. 
If this is absolutely necessary to fulfill a task or objective or to complete a step, you must ask the user to do it for you. If the user refuses this, and there is no other way to achieve your goals, you must terminate to avoid wasting time and energy. +2024-04-18 23:55:59,741 INFO - Limit responses to facts and historical information. +2024-04-18 23:55:59,741 INFO - Provide sources and citations for all information provided. +2024-04-18 23:55:59,741 INFO Resources: +2024-04-18 23:55:59,742 INFO - Internet access for searches and information gathering. +2024-04-18 23:55:59,742 INFO - The ability to read and write files. +2024-04-18 23:55:59,742 INFO - You are a Large Language Model, trained on millions of pages of text, including a lot of factual knowledge. Make use of this factual knowledge to avoid unnecessary gathering of information. +2024-04-18 23:55:59,742 INFO Best practices: +2024-04-18 23:55:59,742 INFO - Continuously review and analyze your actions to ensure you are performing to the best of your abilities. +2024-04-18 23:55:59,742 INFO - Constructively self-criticize your big-picture behavior constantly. +2024-04-18 23:55:59,742 INFO - Reflect on past decisions and strategies to refine your approach. +2024-04-18 23:55:59,742 INFO - Every command has a cost, so be smart and efficient. Aim to complete tasks in the least number of steps. +2024-04-18 23:55:59,742 INFO - Only make use of your information gathering abilities to find information that you don't yet have knowledge of. +2024-04-18 23:55:59,742 INFO - Provide accurate and detailed historical context about the origin, usage, and cultural significance of Roman dodecahedrons. +2024-04-18 23:55:59,742 INFO - Analyze and interpret various historical artifacts and texts to gain a comprehensive understanding of the subject. +2024-04-18 23:55:59,742 INFO - Offer visualizations and diagrams to help illustrate complex concepts related to Roman dodecahedrons. +2024-04-18 23:55:59,742 INFO - Provide recommendations for further reading and resources for those interested in learning more about the topic. +Continue with these settings? [Y/n] Y +2024-04-18 23:56:41,707 INFO NOTE: All files/directories created by this agent can be found inside its workspace at: /Users/ksilverstein/dev/autogpt/v4-autogpt-llamafile-support/autogpts/autogpt/data/agents/HistorianDodecahedronGPT-d4df1da9/workspace +/ Thinking... +2024-04-18 23:57:08,180 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" +2024-04-18 23:57:08,188 INFO HISTORIANDODECAHEDRONGPT THOUGHTS: Roman dodecahedrons are polyhedra with twelve faces, each of which is a regular pentagon. They have been found in various archaeological sites across the Roman Empire, dating back to the 1st century BC. The exact purpose and significance of these objects are still a subject of debate among historians and archaeologists. Some theories suggest they were used as gaming pieces, while others propose they had religious or symbolic meanings. +2024-04-18 23:57:08,188 INFO REASONING: Based on the user's request, I will provide historical information about Roman dodecahedrons. +2024-04-18 23:57:08,188 INFO PLAN: +2024-04-18 23:57:08,188 INFO - Research the historical context and significance of Roman dodecahedrons. +2024-04-18 23:57:08,188 INFO - Identify theories regarding their usage and meaning. +2024-04-18 23:57:08,188 INFO - Provide visualizations and diagrams to help illustrate the concepts. 
+2024-04-18 23:57:08,188 INFO CRITICISM: +2024-04-18 23:57:08,188 INFO SPEAK: Roman dodecahedrons are intriguing objects with a rich history. They were used by the ancient Romans and have twelve faces, each one a regular pentagon. While their exact purpose remains a topic of debate, some theories suggest they were used as gaming pieces, while others propose religious or symbolic meanings. Let me delve deeper into the historical context and significance of these fascinating objects. + +2024-04-18 23:57:08,188 INFO NEXT ACTION: COMMAND = web_search ARGUMENTS = {'query': 'Roman dodecahedron history significance'} +2024-04-18 23:57:08,188 INFO Enter 'y' to authorise command, 'y -N' to run N continuous commands, 'n' to exit program, or enter feedback for HistorianDodecahedronGPT... +Input: y +2024-04-18 23:57:36,589 INFO -=-=-=-=-=-=-= COMMAND AUTHORISED BY USER -=-=-=-=-=-=-= +2024-04-18 23:57:48,021 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" +2024-04-18 23:57:48,022 INFO SYSTEM: +## Search results +### "Roman Dodecahedron - the Mystery of an Ancient Artifact" +**URL:** https://roman-empire.net/discoveries/roman-dodecahedron/ +**Excerpt:** "Scholars have long debated the purpose and significance of the Roman dodecahedron. Some experts argue that it was used as a measuring instrument for astronomical calculations, while others believe it was used for religious purposes or as a gaming piece. ... The rich history and culture of the Roman Empire has lasting impacts in modern society." + +### "Roman dodecahedron - Wikipedia" +**URL:** https://en.wikipedia.org/wiki/Roman_dodecahedron +**Excerpt:** "Roman bronze dodecahedron found in Tongeren, Gallo-Roman Museum, Tongeren A Roman dodecahedron or Gallo-Roman dodecahedron is a small hollow object made of copper alloy which has been cast into a regular dodecahedral shape: twelve flat pentagonal faces. Each face has a circular hole of varying diameter in the middle, the holes connecting to the hollow center, and each corner has a protruding knob." + +### "The Mysterious Dodecahedrons of the Roman Empire - Atlas Obscura" +**URL:** https://www.atlasobscura.com/articles/dodecahedrons-roman-empire +**Excerpt:** "This ancient dodecahedron found in Avenches, Switzerland, once the Roman city of Aventicum. Woudloper/Wikimedia/CC BY-SA 3.0. In the first episode of Buck Rogers, the 1980s television series about ..." + +### "What Was the Purpose of a Roman Dodecahedron? - History Defined" +**URL:** https://www.historydefined.net/what-was-the-purpose-of-a-roman-dodecahedron/ +**Excerpt:** "One of the significant advantages any historian of ancient Rome has is a wealth of written material that has survived from 2,000 years ago to help explain to us what the remains of the Roman Empire mean. For instance, we know how the towns of Pompeii and Herculaneum ended up buried under volcanic ash because" + +### "The Enigma of the Roman Dodecahedra | Ancient Origins" +**URL:** https://www.ancient-origins.net/artifacts-other-artifacts-news-unexplained-phenomena/enigma-roman-dodecahedra-002371 +**Excerpt:** "The Roman dodecahedron is a small, hollow object made of bronze or (more rarely) stone, with a geometrical shape that has 12 flat faces. Each face is a pentagon, a five-sided shape. The Roman dodecahedra are also embellished with a series of knobs on each corner point of the pentagons, and the pentagon faces in most cases contain circular holes ..." 
+ +### "The mysterious dodecahedrons of the Roman Empire | English Heritage" +**URL:** https://www.english-heritage.org.uk/visit/places/corbridge-roman-town-hadrians-wall/dodecahedron-exhibition/ +**Excerpt:** "The dodecahedron (12 sided object) has been puzzling archaeologists since the 18 th century when it was first discovered. Only 120 have been found across what was the Roman Empire, however, none have been found in Italy, Spain, Africa or the Mediterranean. These striking objects are some of the rarest and least understood to survive from the ..." + +### "What is a Roman Dodecahedron? The Mystery is Still Unsolved" +**URL:** https://www.historicmysteries.com/archaeology/roman-dodecahedron/14742/ +**Excerpt:** "This Roman dodecahedron derives from the 2nd or 3rd century in Stuttgart, Germany. CC3.0 Anagoria. The enigmatic geometric forms consist of copper alloy. They range in size from 4 cm to 11 cm, and each of the 12 pentagonal faces contains a circular hole. Oddly, the diameters of the holes are not identical in a single dodecahedron." + +### "The Mysterious Bronze Objects That Have Baffled Archeologists for ..." +**URL:** https://www.mentalfloss.com/article/514246/are-roman-dodecahedrons-worlds-most-mysterious-artifact +**Excerpt:** "The Roman dodecahedron Brian Campbell found in East London / Brian Campbell One August day in 1987, Brian Campbell was refilling the hole left by a tree stump in his yard in Romford, East London ..." +2024-04-18 23:58:16,114 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" +2024-04-18 23:58:16,180 INFO HISTORIANDODECAHEDRONGPT THOUGHTS: Roman dodecahedrons are small, hollow objects made of bronze or copper alloy with a dodecahedral shape, consisting of twelve flat pentagonal faces. Each face has a circular hole in the middle, and the corners have protruding knobs. The exact purpose and significance of these objects remain a mystery, with theories suggesting they were used for astronomical calculations, religious purposes, or as gaming pieces. +2024-04-18 23:58:16,180 INFO REASONING: Based on the user's request, I provided historical information about Roman dodecahedrons. +2024-04-18 23:58:16,180 INFO PLAN: +2024-04-18 23:58:16,180 INFO - Read the webpage contents in more detail to extract specific information about the origins and cultural significance of Roman dodecahedrons. +2024-04-18 23:58:16,180 INFO CRITICISM: I could have provided more specific information about the origins and cultural significance of Roman dodecahedrons from the existing search results. +2024-04-18 23:58:16,180 INFO SPEAK: Roman dodecahedrons are intriguing objects with a rich history. They were made of bronze or copper alloy and had a dodecahedral shape, consisting of twelve flat pentagonal faces. Each face had a circular hole in the middle, and the corners had protruding knobs. The exact purpose and significance of these objects remain a mystery, with theories suggesting they were used for astronomical calculations, religious purposes, or as gaming pieces. + +2024-04-18 23:58:16,180 INFO NEXT ACTION: COMMAND = read_webpage ARGUMENTS = {'url': 'https://en.wikipedia.org/wiki/Roman_dodecahedron', 'topics_of_interest': ['origins', 'cultural_significance']} +2024-04-18 23:58:16,180 INFO Enter 'y' to authorise command, 'y -N' to run N continuous commands, 'n' to exit program, or enter feedback for HistorianDodecahedronGPT... +... 
```
## Implementation Notes
@@ -47,10 +147,6 @@ Here's a brief summary of the issues I encountered & fixed while I was trying to
### Initial Setup
-Tested with:
-* Python 3.11
-* Apple M2 Pro (32 GB), macOS 14.2.1
-
AutoGPT setup steps:
starting commit: `7082e63b115d72440ee2dfe3f545fa3dcba490d5`
@@ -102,8 +198,15 @@ OPENAI_API_KEY="sk-000000000000000000000000000000000000000000000000"
* In `OpenAIProvider`, change methods `count_message_tokens`, `count_tokens`, and `get_tokenizer` from classmethods to regular methods so that a) I can override them in the subclass `LlamafileProvider`, and b) these methods can access instance attributes (this is required in my implementation of these methods in `LlamafileProvider`).
* Implement class `LlamafileTokenizer` that calls the llamafile server's `/tokenize` API endpoint. Implement methods `count_message_tokens`, `count_tokens`, and `get_tokenizer` in `LlamafileProvider` (changes in `autogpt/core/resource/model_providers/llamafile.py`).
+### Issue 4: Fix `Command web_search returned an error: DuckDuckGoSearchException: Ratelimit`
+
+* Ran: `poetry update duckduckgo-search` - this resolved the rate limit error
+* Why is the `send_token_limit` divided by 3 [here](https://github.com/Mozilla-Ocho/AutoGPT/blob/37904a0f80f3499ea43e7846f78d5274b32cad03/autogpts/autogpt/autogpt/agents/agent.py#L274)?
+
## Other TODOs
+* Test with other tasks
* `SMART_LLM`/`FAST_LLM` configuration: Currently, the llamafile server only serves one model at a time. However, there's no reason you can't start multiple llamafile servers on different ports. To support using different models for `smart_llm` and `fast_llm`, you could implement config vars like `LLAMAFILE_SMART_LLM_URL` and `LLAMAFILE_FAST_LLM_URL` that point to different llamafile servers (one serving a 'big model' and one serving a 'fast model').
* Authorization: the `serve.sh` script does not set up any authorization for the llamafile server; this can be turned on by adding the arg `--api-key ` to the server startup command. However, I haven't tested whether the integration with AutoGPT works when this feature is turned on.
-* Added a few TODOs inline in the code
\ No newline at end of file
+* Added a few TODOs inline in the code
+* Test with other models
\ No newline at end of file
diff --git a/autogpts/autogpt/llamafile-integration/serve.sh b/autogpts/autogpt/llamafile-integration/serve.sh
index 91f273a6bd0..bd8eca67475 100755
--- a/autogpts/autogpt/llamafile-integration/serve.sh
+++ b/autogpts/autogpt/llamafile-integration/serve.sh
@@ -12,5 +12,9 @@ LLAMAFILE="./llamafile-integration/mistral-7b-instruct-v0.2.Q5_K_M.llamafile"
"${LLAMAFILE}" \
--server \
--nobrowser \
---ctx-size 2048 \
---n-predict 512
+--ctx-size 0 \
+--n-predict 1024
+
+# note: ctx-size=0 means the prompt context size will be set directly from the
+# underlying model configuration. This may cause slow response times or consume
+# a lot of memory.
\ No newline at end of file From 7e7037d8b598c131f1a43c8ceb1e33a29828131d Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Sat, 25 May 2024 13:01:09 +0200 Subject: [PATCH 08/40] remove llamafile stuff from openai.py --- forge/forge/llm/providers/llamafile.py | 23 ++++++++--------------- forge/forge/llm/providers/openai.py | 12 ------------ 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/forge/forge/llm/providers/llamafile.py b/forge/forge/llm/providers/llamafile.py index 7ad1d19202f..c1819a15a22 100644 --- a/forge/forge/llm/providers/llamafile.py +++ b/forge/forge/llm/providers/llamafile.py @@ -13,18 +13,12 @@ from forge.json.parsing import json_loads -from .openai import ( - OpenAICredentials, - OpenAIProvider, - OpenAIModelName, - _functions_compat_fix_kwargs -) +from .openai import OpenAICredentials, OpenAIProvider from .schema import ( AssistantToolCall, AssistantToolCallDict, ChatMessage, ChatModelInfo, - CompletionModelFunction, ModelProviderName, ModelTokenizer, ) @@ -40,7 +34,7 @@ class LlamafileModelName(str, enum.Enum): info.name: info for info in [ ChatModelInfo( - name=OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT, + name=LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT, provider_name=ModelProviderName.LLAMAFILE, prompt_token_cost=0.0, completion_token_cost=0.0, @@ -102,23 +96,23 @@ async def get_available_models(self) -> list[ChatModelInfo]: ] @overrides - def get_tokenizer(self, model_name: OpenAIModelName) -> ModelTokenizer: + def get_tokenizer(self, model_name: LlamafileModelName) -> ModelTokenizer: return LlamafileTokenizer(self._credentials) @overrides - def count_tokens(self, text: str, model_name: OpenAIModelName) -> int: + def count_tokens(self, text: str, model_name: LlamafileModelName) -> int: return len(self.get_tokenizer(model_name).encode(text)) @overrides def count_message_tokens( self, messages: ChatMessage | list[ChatMessage], - model_name: OpenAIModelName, + model_name: LlamafileModelName, ) -> int: if isinstance(messages, ChatMessage): messages = [messages] - if model_name == OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: + if model_name == LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: # For mistral-instruct, num added tokens depends on if the message # is a prompt/instruction or an assistant-generated message. # - prompt gets [INST], [/INST] added and the first instruction @@ -162,7 +156,6 @@ def _adapt_chat_messages_for_mistral_instruct( - expects messages to alternate between user/assistant roles. 
See details here: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format - """ adapted_messages = [] for message in messages: @@ -189,11 +182,11 @@ def _adapt_chat_messages_for_mistral_instruct( async def _create_chat_completion( self, messages: list[ChatCompletionMessageParam], - model: OpenAIModelName, + model: LlamafileModelName, *_, **kwargs, ) -> tuple[ChatCompletion, float, int, int]: - if model == OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: + if model == LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: messages = self._adapt_chat_messages_for_mistral_instruct(messages) if "seed" not in kwargs: diff --git a/forge/forge/llm/providers/openai.py b/forge/forge/llm/providers/openai.py index eb7b29759f3..21c559a7384 100644 --- a/forge/forge/llm/providers/openai.py +++ b/forge/forge/llm/providers/openai.py @@ -84,9 +84,6 @@ class OpenAIModelName(str, enum.Enum): GPT4_32k = GPT4_ROLLING_32k GPT4_O = GPT4_O_ROLLING - # TODO: added here for convenience, maybe better to move this somewhere else though - LLAMAFILE_MISTRAL_7B_INSTRUCT = "mistral-7b-instruct-v0" - OPEN_AI_EMBEDDING_MODELS = { info.name: info @@ -175,15 +172,6 @@ class OpenAIModelName(str, enum.Enum): max_tokens=128000, has_function_call_api=True, ), - ChatModelInfo( - name=OpenAIModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT, - service=ModelProviderService.CHAT, - provider_name=ModelProviderName.LLAMAFILE, - prompt_token_cost=0.0, - completion_token_cost=0.0, - max_tokens=32768, - has_function_call_api=False, - ), ChatModelInfo( name=OpenAIModelName.GPT4_O, provider_name=ModelProviderName.OPENAI, From 5d0f8b02ece54a82efb52bcb5cd139997f7835e1 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 31 May 2024 04:19:00 +0200 Subject: [PATCH 09/40] fix linting errors --- autogpt/autogpt/app/configurator.py | 3 +-- forge/forge/config/config.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/autogpt/autogpt/app/configurator.py b/autogpt/autogpt/app/configurator.py index 29199e4a9ba..a012a13270a 100644 --- a/autogpt/autogpt/app/configurator.py +++ b/autogpt/autogpt/app/configurator.py @@ -7,8 +7,7 @@ import click from colorama import Back, Style from forge.config.config import GPT_3_MODEL, GPT_4_MODEL, Config -from forge.llm.providers import ModelName, MultiProvider, ModelProviderName -from forge.llm.providers.openai import OpenAICredentials +from forge.llm.providers import ModelName, ModelProviderName, MultiProvider logger = logging.getLogger(__name__) diff --git a/forge/forge/config/config.py b/forge/forge/config/config.py index 5858e01ddf0..5e157a339c4 100644 --- a/forge/forge/config/config.py +++ b/forge/forge/config/config.py @@ -62,7 +62,7 @@ class Config(SystemSettings, arbitrary_types_allowed=True): # ) llm_provider: ModelProviderName = UserConfigurable( default=ModelProviderName.OPENAI, - from_env=lambda: ModelProviderName(os.getenv("LLM_PROVIDER")) + from_env=lambda: ModelProviderName(os.getenv("LLM_PROVIDER")), ) fast_llm: ModelName = UserConfigurable( default=OpenAIModelName.GPT3, From 960155a844fc90ad34f0ec272412119daa167bed Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 31 May 2024 04:49:34 +0200 Subject: [PATCH 10/40] Create `BaseOpenAIProvider` with common functionality from `OpenAIProvider`, `GroqProvider` and `LlamafileProvider` and rebase the latter three on `BaseOpenAIProvider` --- forge/forge/llm/providers/_openai_base.py | 516 ++++++++++++++++++++++ forge/forge/llm/providers/anthropic.py | 19 +- forge/forge/llm/providers/groq.py | 323 +------------- 
forge/forge/llm/providers/llamafile.py | 179 +++++--- forge/forge/llm/providers/multi.py | 20 +- forge/forge/llm/providers/openai.py | 449 +++---------------- forge/forge/llm/providers/schema.py | 31 +- 7 files changed, 754 insertions(+), 783 deletions(-) create mode 100644 forge/forge/llm/providers/_openai_base.py diff --git a/forge/forge/llm/providers/_openai_base.py b/forge/forge/llm/providers/_openai_base.py new file mode 100644 index 00000000000..0420bfc8011 --- /dev/null +++ b/forge/forge/llm/providers/_openai_base.py @@ -0,0 +1,516 @@ +import logging +from typing import ( + Any, + Awaitable, + Callable, + ClassVar, + Mapping, + Optional, + ParamSpec, + Sequence, + TypeVar, + cast, +) + +import sentry_sdk +import tenacity +from openai._exceptions import APIConnectionError, APIStatusError +from openai.types import CreateEmbeddingResponse, EmbeddingCreateParams +from openai.types.chat import ( + ChatCompletion, + ChatCompletionAssistantMessageParam, + ChatCompletionMessage, + ChatCompletionMessageParam, + CompletionCreateParams, +) +from openai.types.shared_params import FunctionDefinition + +from forge.json.parsing import json_loads + +from .schema import ( + AssistantChatMessage, + AssistantFunctionCall, + AssistantToolCall, + BaseChatModelProvider, + BaseEmbeddingModelProvider, + BaseModelProvider, + ChatMessage, + ChatModelInfo, + ChatModelResponse, + CompletionModelFunction, + Embedding, + EmbeddingModelInfo, + EmbeddingModelResponse, + ModelProviderService, + _ModelName, + _ModelProviderSettings, +) +from .utils import validate_tool_calls + +_T = TypeVar("_T") +_P = ParamSpec("_P") + + +class _BaseOpenAIProvider(BaseModelProvider[_ModelName, _ModelProviderSettings]): + """Base class for LLM providers with OpenAI-like APIs""" + + MODELS: ClassVar[ + Mapping[_ModelName, ChatModelInfo[_ModelName] | EmbeddingModelInfo[_ModelName]] # type: ignore # noqa + ] + + def __init__( + self, + settings: Optional[_ModelProviderSettings] = None, + logger: Optional[logging.Logger] = None, + ): + if not getattr(self, "MODELS", None): + raise ValueError(f"{self.__class__.__name__}.MODELS is not set") + + if not settings: + settings = self.default_settings.copy(deep=True) + if not settings.credentials: + settings.credentials = self.default_settings.__fields__[ + "credentials" + ].type_.from_env() + + super(_BaseOpenAIProvider, self).__init__(settings=settings, logger=logger) + + if not getattr(self, "_client", None): + from openai import AsyncOpenAI + + self._client = AsyncOpenAI( + **self._credentials.get_api_access_kwargs() # type: ignore + ) + + async def get_available_models( + self, + ) -> Sequence[ChatModelInfo[_ModelName] | EmbeddingModelInfo[_ModelName]]: + _models = (await self._client.models.list()).data + return [ + self.MODELS[cast(_ModelName, m.id)] for m in _models if m.id in self.MODELS + ] + + def get_token_limit(self, model_name: _ModelName) -> int: + """Get the maximum number of input tokens for a given model""" + return self.MODELS[model_name].max_tokens + + def count_tokens(self, text: str, model_name: _ModelName) -> int: + return len(self.get_tokenizer(model_name).encode(text)) + + def _retry_api_request(self, func: Callable[_P, _T]) -> Callable[_P, _T]: + return tenacity.retry( + retry=( + tenacity.retry_if_exception_type(APIConnectionError) + | tenacity.retry_if_exception( + lambda e: isinstance(e, APIStatusError) and e.status_code >= 500 + ) + ), + wait=tenacity.wait_exponential(), + stop=tenacity.stop_after_attempt(self._configuration.retries_per_request), + 
after=tenacity.after_log(self._logger, logging.DEBUG), + )(func) + + def __repr__(self): + return f"{self.__class__.__name__}()" + + +class BaseOpenAIChatProvider( + _BaseOpenAIProvider[_ModelName, _ModelProviderSettings], + BaseChatModelProvider[_ModelName, _ModelProviderSettings], +): + CHAT_MODELS: ClassVar[dict[_ModelName, ChatModelInfo[_ModelName]]] # type: ignore + + def __init__( + self, + settings: Optional[_ModelProviderSettings] = None, + logger: Optional[logging.Logger] = None, + ): + if not getattr(self, "CHAT_MODELS", None): + raise ValueError(f"{self.__class__.__name__}.CHAT_MODELS is not set") + + super(BaseOpenAIChatProvider, self).__init__(settings=settings, logger=logger) + + async def get_available_chat_models(self) -> Sequence[ChatModelInfo[_ModelName]]: + all_available_models = await self.get_available_models() + return [ + model + for model in all_available_models + if model.service == ModelProviderService.CHAT + ] + + def count_message_tokens( + self, + messages: ChatMessage | list[ChatMessage], + model_name: _ModelName, + ) -> int: + if isinstance(messages, ChatMessage): + messages = [messages] + return self.count_tokens( + "\n\n".join(f"{m.role.upper()}: {m.content}" for m in messages), model_name + ) + + async def create_chat_completion( + self, + model_prompt: list[ChatMessage], + model_name: _ModelName, + completion_parser: Callable[[AssistantChatMessage], _T] = lambda _: None, + functions: Optional[list[CompletionModelFunction]] = None, + max_output_tokens: Optional[int] = None, + prefill_response: str = "", + **kwargs, + ) -> ChatModelResponse[_T]: + """Create a chat completion using the API.""" + + ( + openai_messages, + completion_kwargs, + parse_kwargs, + ) = self._get_chat_completion_args( + prompt_messages=model_prompt, + model=model_name, + functions=functions, + max_output_tokens=max_output_tokens, + **kwargs, + ) + + total_cost = 0.0 + attempts = 0 + while True: + completion_kwargs["messages"] = openai_messages + _response, _cost, t_input, t_output = await self._create_chat_completion( + model=model_name, + completion_kwargs=completion_kwargs, + ) + total_cost += _cost + + # If parsing the response fails, append the error to the prompt, and let the + # LLM fix its mistake(s). 
+ attempts += 1 + parse_errors: list[Exception] = [] + + _assistant_msg = _response.choices[0].message + + tool_calls, _errors = self._parse_assistant_tool_calls( + _assistant_msg, **parse_kwargs + ) + parse_errors += _errors + + # Validate tool calls + if not parse_errors and tool_calls and functions: + parse_errors += validate_tool_calls(tool_calls, functions) + + assistant_msg = AssistantChatMessage( + content=_assistant_msg.content or "", + tool_calls=tool_calls or None, + ) + + parsed_result: _T = None # type: ignore + if not parse_errors: + try: + parsed_result = completion_parser(assistant_msg) + except Exception as e: + parse_errors.append(e) + + if not parse_errors: + if attempts > 1: + self._logger.debug( + f"Total cost for {attempts} attempts: ${round(total_cost, 5)}" + ) + + return ChatModelResponse( + response=AssistantChatMessage( + content=_assistant_msg.content or "", + tool_calls=tool_calls or None, + ), + parsed_result=parsed_result, + model_info=self.CHAT_MODELS[model_name], + prompt_tokens_used=t_input, + completion_tokens_used=t_output, + ) + + else: + self._logger.debug( + f"Parsing failed on response: '''{_assistant_msg}'''" + ) + parse_errors_fmt = "\n\n".join( + f"{e.__class__.__name__}: {e}" for e in parse_errors + ) + self._logger.warning( + f"Parsing attempt #{attempts} failed: {parse_errors_fmt}" + ) + for e in parse_errors: + sentry_sdk.capture_exception( + error=e, + extras={"assistant_msg": _assistant_msg, "i_attempt": attempts}, + ) + + if attempts < self._configuration.fix_failed_parse_tries: + openai_messages.append( + cast( + ChatCompletionAssistantMessageParam, + _assistant_msg.dict(exclude_none=True), + ) + ) + openai_messages.append( + { + "role": "system", + "content": ( + f"ERROR PARSING YOUR RESPONSE:\n\n{parse_errors_fmt}" + ), + } + ) + continue + else: + raise parse_errors[0] + + def _get_chat_completion_args( + self, + prompt_messages: list[ChatMessage], + model: _ModelName, + functions: Optional[list[CompletionModelFunction]] = None, + max_output_tokens: Optional[int] = None, + **kwargs, + ) -> tuple[ + list[ChatCompletionMessageParam], + CompletionCreateParams, + dict[str, Any], + ]: + """Prepare keyword arguments for a chat completion API call + + Args: + prompt_messages: List of ChatMessages + model: The model to use + functions (optional): List of functions available to the LLM + max_output_tokens (optional): Maximum number of tokens to generate + + Returns: + list[ChatCompletionMessageParam]: Prompt messages for the API call + CompletionCreateParams: Mapping of other kwargs for the API call + Mapping[str, Any]: Any keyword arguments to pass on to the completion parser + """ + kwargs = cast(CompletionCreateParams, kwargs) + + if max_output_tokens: + kwargs["max_tokens"] = max_output_tokens + + if functions: + kwargs["tools"] = [ # pyright: ignore - it fails to infer the dict type + {"type": "function", "function": format_function_def_for_openai(f)} + for f in functions + ] + if len(functions) == 1: + # force the model to call the only specified function + kwargs["tool_choice"] = { # pyright: ignore - type inference failure + "type": "function", + "function": {"name": functions[0].name}, + } + + if extra_headers := self._configuration.extra_request_headers: + # 'extra_headers' is not on CompletionCreateParams, but is on chat.create() + kwargs["extra_headers"] = kwargs.get("extra_headers", {}) # type: ignore + kwargs["extra_headers"].update(extra_headers.copy()) # type: ignore + + prepped_messages: list[ChatCompletionMessageParam] = [ + 
message.dict( # type: ignore + include={"role", "content", "tool_calls", "tool_call_id", "name"}, + exclude_none=True, + ) + for message in prompt_messages + ] + + if "messages" in kwargs: + prepped_messages += kwargs["messages"] + del kwargs["messages"] # type: ignore - messages are added back later + + return prepped_messages, kwargs, {} + + async def _create_chat_completion( + self, + model: _ModelName, + completion_kwargs: CompletionCreateParams, + ) -> tuple[ChatCompletion, float, int, int]: + """ + Create a chat completion using an OpenAI-like API with retry handling + + Params: + model: The model to use for the completion + completion_kwargs: All other arguments for the completion call + + Returns: + ChatCompletion: The chat completion response object + float: The cost ($) of this completion + int: Number of prompt tokens used + int: Number of completion tokens used + """ + + @self._retry_api_request + async def _create_chat_completion_with_retry() -> ChatCompletion: + return await self._client.chat.completions.create( + model=model, + **completion_kwargs, # type: ignore + ) + + completion = await _create_chat_completion_with_retry() + + if completion.usage: + prompt_tokens_used = completion.usage.prompt_tokens + completion_tokens_used = completion.usage.completion_tokens + else: + prompt_tokens_used = completion_tokens_used = 0 + + if self._budget: + cost = self._budget.update_usage_and_cost( + model_info=self.CHAT_MODELS[model], + input_tokens_used=prompt_tokens_used, + output_tokens_used=completion_tokens_used, + ) + else: + cost = 0 + + self._logger.debug( + f"{model} completion usage: {prompt_tokens_used} input, " + f"{completion_tokens_used} output - ${round(cost, 5)}" + ) + return completion, cost, prompt_tokens_used, completion_tokens_used + + def _parse_assistant_tool_calls( + self, assistant_message: ChatCompletionMessage, **kwargs + ) -> tuple[list[AssistantToolCall], list[Exception]]: + tool_calls: list[AssistantToolCall] = [] + parse_errors: list[Exception] = [] + + if assistant_message.tool_calls: + for _tc in assistant_message.tool_calls: + try: + parsed_arguments = json_loads(_tc.function.arguments) + except Exception as e: + err_message = ( + f"Decoding arguments for {_tc.function.name} failed: " + + str(e.args[0]) + ) + parse_errors.append( + type(e)(err_message, *e.args[1:]).with_traceback( + e.__traceback__ + ) + ) + continue + + tool_calls.append( + AssistantToolCall( + id=_tc.id, + type=_tc.type, + function=AssistantFunctionCall( + name=_tc.function.name, + arguments=parsed_arguments, + ), + ) + ) + + # If parsing of all tool calls succeeds in the end, we ignore any issues + if len(tool_calls) == len(assistant_message.tool_calls): + parse_errors = [] + + return tool_calls, parse_errors + + +class BaseOpenAIEmbeddingProvider( + _BaseOpenAIProvider[_ModelName, _ModelProviderSettings], + BaseEmbeddingModelProvider[_ModelName, _ModelProviderSettings], +): + EMBEDDING_MODELS: ClassVar[ + dict[_ModelName, EmbeddingModelInfo[_ModelName]] # type: ignore + ] + + def __init__( + self, + settings: Optional[_ModelProviderSettings] = None, + logger: Optional[logging.Logger] = None, + ): + if not getattr(self, "EMBEDDING_MODELS", None): + raise ValueError(f"{self.__class__.__name__}.EMBEDDING_MODELS is not set") + + super(BaseOpenAIEmbeddingProvider, self).__init__( + settings=settings, logger=logger + ) + + async def get_available_embedding_models( + self, + ) -> Sequence[EmbeddingModelInfo[_ModelName]]: + all_available_models = await self.get_available_models() + return [ 
+ model + for model in all_available_models + if model.service == ModelProviderService.EMBEDDING + ] + + async def create_embedding( + self, + text: str, + model_name: _ModelName, + embedding_parser: Callable[[Embedding], Embedding], + **kwargs, + ) -> EmbeddingModelResponse: + """Create an embedding using an OpenAI-like API""" + embedding_kwargs = self._get_embedding_kwargs( + input=text, model=model_name, **kwargs + ) + response = await self._create_embedding(embedding_kwargs) + + return EmbeddingModelResponse( + embedding=embedding_parser(response.data[0].embedding), + model_info=self.EMBEDDING_MODELS[model_name], + prompt_tokens_used=response.usage.prompt_tokens, + ) + + def _get_embedding_kwargs( + self, input: str | list[str], model: _ModelName, **kwargs + ) -> EmbeddingCreateParams: + """Get kwargs for an embedding API call + + Params: + input: Text body or list of text bodies to create embedding(s) from + model: Embedding model to use + + Returns: + The kwargs for the embedding API call + """ + kwargs = cast(EmbeddingCreateParams, kwargs) + + kwargs["input"] = input + kwargs["model"] = model + + if extra_headers := self._configuration.extra_request_headers: + # 'extra_headers' is not on CompletionCreateParams, but is on embedding.create() # noqa + kwargs["extra_headers"] = kwargs.get("extra_headers", {}) # type: ignore + kwargs["extra_headers"].update(extra_headers.copy()) # type: ignore + + return kwargs + + def _create_embedding( + self, embedding_kwargs: EmbeddingCreateParams + ) -> Awaitable[CreateEmbeddingResponse]: + """Create an embedding using an OpenAI-like API with retry handling.""" + + @self._retry_api_request + async def _create_embedding_with_retry() -> CreateEmbeddingResponse: + return await self._client.embeddings.create(**embedding_kwargs) + + return _create_embedding_with_retry() + + +def format_function_def_for_openai(self: CompletionModelFunction) -> FunctionDefinition: + """Returns an OpenAI-consumable function definition""" + + return { + "name": self.name, + "description": self.description, + "parameters": { + "type": "object", + "properties": { + name: param.to_dict() for name, param in self.parameters.items() + }, + "required": [ + name for name, param in self.parameters.items() if param.required + ], + }, + } diff --git a/forge/forge/llm/providers/anthropic.py b/forge/forge/llm/providers/anthropic.py index 4da5ed070cb..ef082c04878 100644 --- a/forge/forge/llm/providers/anthropic.py +++ b/forge/forge/llm/providers/anthropic.py @@ -2,7 +2,7 @@ import enum import logging -from typing import TYPE_CHECKING, Any, Callable, Optional, ParamSpec, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Optional, ParamSpec, Sequence, TypeVar import sentry_sdk import tenacity @@ -10,7 +10,9 @@ from anthropic import APIConnectionError, APIStatusError from pydantic import SecretStr -from forge.llm.providers.schema import ( +from forge.models.config import UserConfigurable + +from .schema import ( AssistantChatMessage, AssistantFunctionCall, AssistantToolCall, @@ -27,8 +29,6 @@ ModelTokenizer, ToolResultMessage, ) -from forge.models.config import UserConfigurable - from .utils import validate_tool_calls if TYPE_CHECKING: @@ -77,10 +77,6 @@ class AnthropicModelName(str, enum.Enum): } -class AnthropicConfiguration(ModelProviderConfiguration): - fix_failed_parse_tries: int = UserConfigurable(3) - - class AnthropicCredentials(ModelProviderCredentials): """Credentials for Anthropic.""" @@ -101,7 +97,6 @@ def get_api_access_kwargs(self) -> dict[str, str]: class 
AnthropicSettings(ModelProviderSettings): - configuration: AnthropicConfiguration # type: ignore credentials: Optional[AnthropicCredentials] # type: ignore budget: ModelProviderBudget # type: ignore @@ -110,7 +105,7 @@ class AnthropicProvider(BaseChatModelProvider[AnthropicModelName, AnthropicSetti default_settings = AnthropicSettings( name="anthropic_provider", description="Provides access to Anthropic's API.", - configuration=AnthropicConfiguration( + configuration=ModelProviderConfiguration( retries_per_request=7, ), credentials=None, @@ -118,7 +113,7 @@ class AnthropicProvider(BaseChatModelProvider[AnthropicModelName, AnthropicSetti ) _settings: AnthropicSettings - _configuration: AnthropicConfiguration + _configuration: ModelProviderConfiguration _credentials: AnthropicCredentials _budget: ModelProviderBudget @@ -140,7 +135,7 @@ def __init__( **self._credentials.get_api_access_kwargs() # type: ignore ) - async def get_available_models(self) -> list[ChatModelInfo[AnthropicModelName]]: + async def get_available_models(self) -> Sequence[ChatModelInfo[AnthropicModelName]]: return list(ANTHROPIC_CHAT_MODELS.values()) def get_token_limit(self, model_name: AnthropicModelName) -> int: diff --git a/forge/forge/llm/providers/groq.py b/forge/forge/llm/providers/groq.py index 70996c132e0..8f577073f1e 100644 --- a/forge/forge/llm/providers/groq.py +++ b/forge/forge/llm/providers/groq.py @@ -2,24 +2,13 @@ import enum import logging -from typing import TYPE_CHECKING, Any, Callable, Optional, ParamSpec, TypeVar +from typing import Any, Optional -import sentry_sdk -import tenacity import tiktoken -from groq import APIConnectionError, APIStatusError from pydantic import SecretStr -from forge.json.parsing import json_loads from forge.llm.providers.schema import ( - AssistantChatMessage, - AssistantFunctionCall, - AssistantToolCall, - BaseChatModelProvider, - ChatMessage, ChatModelInfo, - ChatModelResponse, - CompletionModelFunction, ModelProviderBudget, ModelProviderConfiguration, ModelProviderCredentials, @@ -29,16 +18,7 @@ ) from forge.models.config import UserConfigurable -from .openai import format_function_def_for_openai -from .utils import validate_tool_calls - -if TYPE_CHECKING: - from groq.types.chat import ChatCompletion, CompletionCreateParams - from groq.types.chat.chat_completion_message import ChatCompletionMessage - from groq.types.chat.chat_completion_message_param import ChatCompletionMessageParam - -_T = TypeVar("_T") -_P = ParamSpec("_P") +from ._openai_base import BaseOpenAIChatProvider class GroqModelName(str, enum.Enum): @@ -87,10 +67,6 @@ class GroqModelName(str, enum.Enum): } -class GroqConfiguration(ModelProviderConfiguration): - fix_failed_parse_tries: int = UserConfigurable(3) - - class GroqCredentials(ModelProviderCredentials): """Credentials for Groq.""" @@ -111,24 +87,24 @@ def get_api_access_kwargs(self) -> dict[str, str]: class GroqSettings(ModelProviderSettings): - configuration: GroqConfiguration # type: ignore credentials: Optional[GroqCredentials] # type: ignore budget: ModelProviderBudget # type: ignore -class GroqProvider(BaseChatModelProvider[GroqModelName, GroqSettings]): +class GroqProvider(BaseOpenAIChatProvider[GroqModelName, GroqSettings]): + CHAT_MODELS = GROQ_CHAT_MODELS + MODELS = CHAT_MODELS + default_settings = GroqSettings( name="groq_provider", description="Provides access to Groq's API.", - configuration=GroqConfiguration( - retries_per_request=7, - ), + configuration=ModelProviderConfiguration(), credentials=None, budget=ModelProviderBudget(), ) 
_settings: GroqSettings - _configuration: GroqConfiguration + _configuration: ModelProviderConfiguration _credentials: GroqCredentials _budget: ModelProviderBudget @@ -137,11 +113,6 @@ def __init__( settings: Optional[GroqSettings] = None, logger: Optional[logging.Logger] = None, ): - if not settings: - settings = self.default_settings.copy(deep=True) - if not settings.credentials: - settings.credentials = GroqCredentials.from_env() - super(GroqProvider, self).__init__(settings=settings, logger=logger) from groq import AsyncGroq @@ -150,284 +121,6 @@ def __init__( **self._credentials.get_api_access_kwargs() # type: ignore ) - async def get_available_models(self) -> list[ChatModelInfo[GroqModelName]]: - _models = (await self._client.models.list()).data - return [GROQ_CHAT_MODELS[m.id] for m in _models if m.id in GROQ_CHAT_MODELS] - - def get_token_limit(self, model_name: GroqModelName) -> int: - """Get the token limit for a given model.""" - return GROQ_CHAT_MODELS[model_name].max_tokens - def get_tokenizer(self, model_name: GroqModelName) -> ModelTokenizer[Any]: # HACK: No official tokenizer is available for Groq return tiktoken.encoding_for_model("gpt-3.5-turbo") - - def count_tokens(self, text: str, model_name: GroqModelName) -> int: - return len(self.get_tokenizer(model_name).encode(text)) - - def count_message_tokens( - self, - messages: ChatMessage | list[ChatMessage], - model_name: GroqModelName, - ) -> int: - if isinstance(messages, ChatMessage): - messages = [messages] - # HACK: No official tokenizer (for text or messages) is available for Groq. - # Token overhead of messages is unknown and may be inaccurate. - return self.count_tokens( - "\n\n".join(f"{m.role.upper()}: {m.content}" for m in messages), model_name - ) - - async def create_chat_completion( - self, - model_prompt: list[ChatMessage], - model_name: GroqModelName, - completion_parser: Callable[[AssistantChatMessage], _T] = lambda _: None, - functions: Optional[list[CompletionModelFunction]] = None, - max_output_tokens: Optional[int] = None, - prefill_response: str = "", - **kwargs, - ) -> ChatModelResponse[_T]: - """Create a completion using the Groq API.""" - groq_messages, completion_kwargs = self._get_chat_completion_args( - prompt_messages=model_prompt, - functions=functions, - max_output_tokens=max_output_tokens, - **kwargs, - ) - - total_cost = 0.0 - attempts = 0 - while True: - completion_kwargs["messages"] = groq_messages.copy() - _response, _cost, t_input, t_output = await self._create_chat_completion( - model=model_name, - completion_kwargs=completion_kwargs, - ) - total_cost += _cost - - # If parsing the response fails, append the error to the prompt, and let the - # LLM fix its mistake(s). 
- attempts += 1 - parse_errors: list[Exception] = [] - - _assistant_msg = _response.choices[0].message - - tool_calls, _errors = self._parse_assistant_tool_calls(_assistant_msg) - parse_errors += _errors - - # Validate tool calls - if not parse_errors and tool_calls and functions: - parse_errors += validate_tool_calls(tool_calls, functions) - - assistant_msg = AssistantChatMessage( - content=_assistant_msg.content or "", - tool_calls=tool_calls or None, - ) - - parsed_result: _T = None # type: ignore - if not parse_errors: - try: - parsed_result = completion_parser(assistant_msg) - except Exception as e: - parse_errors.append(e) - - if not parse_errors: - if attempts > 1: - self._logger.debug( - f"Total cost for {attempts} attempts: ${round(total_cost, 5)}" - ) - - return ChatModelResponse( - response=AssistantChatMessage( - content=_assistant_msg.content or "", - tool_calls=tool_calls or None, - ), - parsed_result=parsed_result, - model_info=GROQ_CHAT_MODELS[model_name], - prompt_tokens_used=t_input, - completion_tokens_used=t_output, - ) - - else: - self._logger.debug( - f"Parsing failed on response: '''{_assistant_msg}'''" - ) - parse_errors_fmt = "\n\n".join( - f"{e.__class__.__name__}: {e}" for e in parse_errors - ) - self._logger.warning( - f"Parsing attempt #{attempts} failed: {parse_errors_fmt}" - ) - for e in parse_errors: - sentry_sdk.capture_exception( - error=e, - extras={"assistant_msg": _assistant_msg, "i_attempt": attempts}, - ) - - if attempts < self._configuration.fix_failed_parse_tries: - groq_messages.append( - _assistant_msg.dict(exclude_none=True) # type: ignore - ) - groq_messages.append( - { - "role": "system", - "content": ( - f"ERROR PARSING YOUR RESPONSE:\n\n{parse_errors_fmt}" - ), - } - ) - continue - else: - raise parse_errors[0] - - def _get_chat_completion_args( - self, - prompt_messages: list[ChatMessage], - functions: Optional[list[CompletionModelFunction]] = None, - max_output_tokens: Optional[int] = None, - **kwargs, # type: ignore - ) -> tuple[list[ChatCompletionMessageParam], CompletionCreateParams]: - """Prepare chat completion arguments and keyword arguments for API call. - - Args: - model_prompt: List of ChatMessages. - functions: Optional list of functions available to the LLM. - kwargs: Additional keyword arguments. 
- - Returns: - list[ChatCompletionMessageParam]: Prompt messages for the OpenAI call - dict[str, Any]: Any other kwargs for the OpenAI call - """ - kwargs: CompletionCreateParams = kwargs # type: ignore - if max_output_tokens: - kwargs["max_tokens"] = max_output_tokens - - if functions: - kwargs["tools"] = [ - {"type": "function", "function": format_function_def_for_openai(f)} - for f in functions - ] - if len(functions) == 1: - # force the model to call the only specified function - kwargs["tool_choice"] = { - "type": "function", - "function": {"name": functions[0].name}, - } - - if extra_headers := self._configuration.extra_request_headers: - # 'extra_headers' is not on CompletionCreateParams, but is on chat.create() - kwargs["extra_headers"] = kwargs.get("extra_headers", {}) # type: ignore - kwargs["extra_headers"].update(extra_headers.copy()) # type: ignore - - groq_messages: list[ChatCompletionMessageParam] = [ - message.dict( # type: ignore - include={"role", "content", "tool_calls", "tool_call_id", "name"}, - exclude_none=True, - ) - for message in prompt_messages - ] - - if "messages" in kwargs: - groq_messages += kwargs["messages"] - del kwargs["messages"] # type: ignore - messages are added back later - - return groq_messages, kwargs - - async def _create_chat_completion( - self, model: GroqModelName, completion_kwargs: CompletionCreateParams - ) -> tuple[ChatCompletion, float, int, int]: - """ - Create a chat completion using the Groq API with retry handling. - - Params: - completion_kwargs: Keyword arguments for an Groq Messages API call - - Returns: - Message: The message completion object - float: The cost ($) of this completion - int: Number of input tokens used - int: Number of output tokens used - """ - - @self._retry_api_request - async def _create_chat_completion_with_retry() -> ChatCompletion: - return await self._client.chat.completions.create( - model=model, **completion_kwargs # type: ignore - ) - - response = await _create_chat_completion_with_retry() - - if not response.usage: - self._logger.warning( - "Groq chat completion response does not contain a usage field", - response, - ) - return response, 0, 0, 0 - else: - cost = self._budget.update_usage_and_cost( - model_info=GROQ_CHAT_MODELS[model], - input_tokens_used=response.usage.prompt_tokens, - output_tokens_used=response.usage.completion_tokens, - ) - return ( - response, - cost, - response.usage.prompt_tokens, - response.usage.completion_tokens, - ) - - def _parse_assistant_tool_calls( - self, assistant_message: ChatCompletionMessage, compat_mode: bool = False - ): - tool_calls: list[AssistantToolCall] = [] - parse_errors: list[Exception] = [] - - if assistant_message.tool_calls: - for _tc in assistant_message.tool_calls: - try: - parsed_arguments = json_loads(_tc.function.arguments) - except Exception as e: - err_message = ( - f"Decoding arguments for {_tc.function.name} failed: " - + str(e.args[0]) - ) - parse_errors.append( - type(e)(err_message, *e.args[1:]).with_traceback( - e.__traceback__ - ) - ) - continue - - tool_calls.append( - AssistantToolCall( - id=_tc.id, - type=_tc.type, - function=AssistantFunctionCall( - name=_tc.function.name, - arguments=parsed_arguments, - ), - ) - ) - - # If parsing of all tool calls succeeds in the end, we ignore any issues - if len(tool_calls) == len(assistant_message.tool_calls): - parse_errors = [] - - return tool_calls, parse_errors - - def _retry_api_request(self, func: Callable[_P, _T]) -> Callable[_P, _T]: - return tenacity.retry( - retry=( - 
tenacity.retry_if_exception_type(APIConnectionError) - | tenacity.retry_if_exception( - lambda e: isinstance(e, APIStatusError) and e.status_code >= 500 - ) - ), - wait=tenacity.wait_exponential(), - stop=tenacity.stop_after_attempt(self._configuration.retries_per_request), - after=tenacity.after_log(self._logger, logging.DEBUG), - )(func) - - def __repr__(self): - return "GroqProvider()" diff --git a/forge/forge/llm/providers/llamafile.py b/forge/forge/llm/providers/llamafile.py index c1819a15a22..45c4e8c1dfd 100644 --- a/forge/forge/llm/providers/llamafile.py +++ b/forge/forge/llm/providers/llamafile.py @@ -1,30 +1,33 @@ import enum import logging from pathlib import Path -from typing import Any, Iterator, Optional, TypeVar +from typing import Any, Iterator, Optional, Sequence import requests from openai.types.chat import ( - ChatCompletion, ChatCompletionMessage, ChatCompletionMessageParam, + CompletionCreateParams, ) -from overrides import overrides +from pydantic import SecretStr from forge.json.parsing import json_loads +from forge.models.config import UserConfigurable -from .openai import OpenAICredentials, OpenAIProvider +from ._openai_base import BaseOpenAIChatProvider from .schema import ( AssistantToolCall, AssistantToolCallDict, ChatMessage, ChatModelInfo, + CompletionModelFunction, + ModelProviderConfiguration, + ModelProviderCredentials, ModelProviderName, + ModelProviderSettings, ModelTokenizer, ) -_T = TypeVar("_T") - class LlamafileModelName(str, enum.Enum): LLAMAFILE_MISTRAL_7B_INSTRUCT = "mistral-7b-instruct-v0" @@ -44,10 +47,18 @@ class LlamafileModelName(str, enum.Enum): ] } +LLAMAFILE_EMBEDDING_MODELS = {} + + +class LlamafileCredentials(ModelProviderCredentials): + api_key = SecretStr("sk-no-key-required") + api_base: SecretStr = UserConfigurable( # type: ignore + default=SecretStr("http://localhost:8080/v1"), from_env="LLAMAFILE_API_BASE" + ) -class LlamafileTokenizer(ModelTokenizer): - def __init__(self, credentials: OpenAICredentials): +class LlamafileTokenizer(ModelTokenizer[int]): + def __init__(self, credentials: LlamafileCredentials): self._credentials = credentials @property @@ -60,50 +71,64 @@ def _tokenizer_base_url(self): def encode(self, text: str) -> list[int]: response = requests.post( - url=f"{self._tokenizer_base_url}/tokenize", - json={"content": text} + url=f"{self._tokenizer_base_url}/tokenize", json={"content": text} ) response.raise_for_status() return response.json()["tokens"] def decode(self, tokens: list[int]) -> str: response = requests.post( - url=f"{self._tokenizer_base_url}/detokenize", - json={"tokens": tokens} + url=f"{self._tokenizer_base_url}/detokenize", json={"tokens": tokens} ) response.raise_for_status() return response.json()["content"] -class LlamafileProvider(OpenAIProvider): +class LlamafileSettings(ModelProviderSettings): + credentials: Optional[LlamafileCredentials] # type: ignore - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - async def get_available_models(self) -> list[ChatModelInfo]: +class LlamafileProvider( + BaseOpenAIChatProvider[LlamafileModelName, LlamafileSettings], + # TODO: add and test support for embedding models + # BaseOpenAIEmbeddingProvider[LlamafileModelName, LlamafileSettings], +): + EMBEDDING_MODELS = LLAMAFILE_EMBEDDING_MODELS + CHAT_MODELS = LLAMAFILE_CHAT_MODELS + MODELS = {**CHAT_MODELS, **EMBEDDING_MODELS} + + default_settings = LlamafileSettings( + name="llamafile_provider", + description=( + "Provides chat completion and embedding services " + "through a llamafile 
instance" + ), + configuration=ModelProviderConfiguration(), + ) + + _settings: LlamafileSettings # type: ignore + _credentials: LlamafileCredentials # type: ignore + + async def get_available_models(self) -> Sequence[ChatModelInfo[LlamafileModelName]]: _models = (await self._client.models.list()).data # note: at the moment, llamafile only serves one model at a time (so this # list will only ever have one value). however, in the future, llamafile # may support multiple models, so leaving this method as-is for now. # clean up model names - # e.g. 'mistral-7b-instruct-v0.2.Q5_K_M.gguf' -> 'mistral-7b-instruct-v0.2' - # e.g. '/Users/kate/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf' -> 'mistral-7b-instruct-v0.2 + # e.g. 'mistral-7b-instruct-v0.2.Q5_K_M.gguf' + # -> 'mistral-7b-instruct-v0.2' + # e.g. '/Users/kate/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf' + # -> 'mistral-7b-instruct-v0.2' return [ LLAMAFILE_CHAT_MODELS[_id] for m in _models if (_id := Path(m.id).name.split(".")[0]) in LLAMAFILE_CHAT_MODELS ] - @overrides - def get_tokenizer(self, model_name: LlamafileModelName) -> ModelTokenizer: + def get_tokenizer(self, model_name: LlamafileModelName) -> LlamafileTokenizer: return LlamafileTokenizer(self._credentials) - @overrides - def count_tokens(self, text: str, model_name: LlamafileModelName) -> int: - return len(self.get_tokenizer(model_name).encode(text)) - - @overrides def count_message_tokens( self, messages: ChatMessage | list[ChatMessage], @@ -125,8 +150,10 @@ def count_message_tokens( ntokens = 0 for message in messages: if ( - message.role == ChatMessage.Role.USER or - message.role == ChatMessage.Role.SYSTEM # note that 'system' messages will get converted to 'user' messages before being sent to the model + message.role == ChatMessage.Role.USER + # note that 'system' messages will get converted + # to 'user' messages before being sent to the model + or message.role == ChatMessage.Role.SYSTEM ): # 5 tokens for [INST], [/INST], which actually get # tokenized into "[, INST, ]" and "[, /, INST, ]" @@ -135,7 +162,9 @@ def count_message_tokens( elif message.role == ChatMessage.Role.ASSISTANT: assistant_num_added += 1 # for else: - raise ValueError(f"{model_name} does not support role: {message.role}") + raise ValueError( + f"{model_name} does not support role: {message.role}" + ) ntokens += self.count_tokens(message.content, model_name) @@ -143,11 +172,36 @@ def count_message_tokens( return total_token_count else: - raise NotImplementedError(f"count_message_tokens not implemented for model {model_name}") + raise NotImplementedError( + f"count_message_tokens not implemented for model {model_name}" + ) + + def _get_chat_completion_args( + self, + prompt_messages: list[ChatMessage], + model: LlamafileModelName, + functions: list[CompletionModelFunction] | None = None, + max_output_tokens: int | None = None, + **kwargs, + ) -> tuple[ + list[ChatCompletionMessageParam], CompletionCreateParams, dict[str, Any] + ]: + messages, completion_kwargs, parse_kwargs = super()._get_chat_completion_args( + prompt_messages, model, functions, max_output_tokens, **kwargs + ) + + if model == LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: + messages = self._adapt_chat_messages_for_mistral_instruct(messages) + + if "seed" not in kwargs: + # FIXME: temporarily hard-coded for reproducibility, instead the + # seed should be set from config + completion_kwargs["seed"] = 0 + + return messages, completion_kwargs, parse_kwargs def _adapt_chat_messages_for_mistral_instruct( - self, - messages: 
list[ChatCompletionMessageParam] + self, messages: list[ChatCompletionMessageParam] ) -> list[ChatCompletionMessageParam]: """ Munge the messages to be compatible with the mistral-7b-instruct chat @@ -155,50 +209,53 @@ def _adapt_chat_messages_for_mistral_instruct( - only supports 'user' and 'assistant' roles. - expects messages to alternate between user/assistant roles. - See details here: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format + See details here: + https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format """ - adapted_messages = [] + adapted_messages: list[ChatCompletionMessageParam] = [] for message in messages: - # convert 'system' role to 'user' role as mistral-7b-instruct does # not support 'system' if message["role"] == ChatMessage.Role.SYSTEM: message["role"] = ChatMessage.Role.USER - if len(adapted_messages) == 0: + if ( + len(adapted_messages) == 0 + or message["role"] != (last_message := adapted_messages[-1])["role"] + ): adapted_messages.append(message) - else: - if message["role"] == adapted_messages[-1]["role"]: - # if the curr message has the same role as the previous one, - # concat the current message content to the prev message - adapted_messages[-1]["content"] += " " + message["content"] - else: - adapted_messages.append(message) + if not message.get("content"): + continue + + # if the curr message has the same role as the previous one, + # concat the current message content to the prev message + if message["role"] == "user" and last_message["role"] == "user": + # user messages can contain other types of content blocks + if not isinstance(last_message["content"], list): + last_message["content"] = [ + {"type": "text", "text": last_message["content"]} + ] + + last_message["content"].extend( + message["content"] + if isinstance(message["content"], list) + else [{"type": "text", "text": message["content"]}] + ) + elif message["role"] != "user" and last_message["role"] != "user": + last_message["content"] = ( + (last_message.get("content") or "") + + "\n\n" + + (message.get("content") or "") + ).strip() return adapted_messages - @overrides - async def _create_chat_completion( + def _parse_assistant_tool_calls( self, - messages: list[ChatCompletionMessageParam], - model: LlamafileModelName, - *_, + assistant_message: ChatCompletionMessage, + compat_mode: bool = False, **kwargs, - ) -> tuple[ChatCompletion, float, int, int]: - if model == LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: - messages = self._adapt_chat_messages_for_mistral_instruct(messages) - - if "seed" not in kwargs: - # TODO: temporarily hard-coded for reproducibility, instead the - # seed should be set from config - kwargs["seed"] = 0 - - return await super()._create_chat_completion(messages, model, **kwargs) - - @overrides - def _parse_assistant_tool_calls( - self, assistant_message: ChatCompletionMessage, compat_mode: bool = False ): tool_calls: list[AssistantToolCall] = [] parse_errors: list[Exception] = [] diff --git a/forge/forge/llm/providers/multi.py b/forge/forge/llm/providers/multi.py index 588d26ac174..ee42c2a8446 100644 --- a/forge/forge/llm/providers/multi.py +++ b/forge/forge/llm/providers/multi.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Any, Callable, Iterator, Optional, TypeVar +from typing import Any, Callable, Iterator, Optional, Sequence, TypeVar from pydantic import ValidationError @@ -25,7 +25,7 @@ _T = TypeVar("_T") -ModelName = AnthropicModelName | GroqModelName | OpenAIModelName 
+ModelName = AnthropicModelName | GroqModelName | LlamafileModelName | OpenAIModelName EmbeddingModelProvider = OpenAIProvider CHAT_MODELS = { @@ -62,10 +62,14 @@ def __init__( self._provider_instances = {} - async def get_available_models(self) -> list[ChatModelInfo[ModelName]]: + async def get_available_models(self) -> Sequence[ChatModelInfo[ModelName]]: + # TODO: support embeddings + return await self.get_available_chat_models() + + async def get_available_chat_models(self) -> Sequence[ChatModelInfo[ModelName]]: models = [] for provider in self.get_available_providers(): - models.extend(await provider.get_available_models()) + models.extend(await provider.get_available_chat_models()) return models def get_token_limit(self, model_name: ModelName) -> int: @@ -165,4 +169,10 @@ def __repr__(self): return f"{self.__class__.__name__}()" -ChatModelProvider = AnthropicProvider | GroqProvider | OpenAIProvider | MultiProvider +ChatModelProvider = ( + AnthropicProvider + | GroqProvider + | LlamafileProvider + | OpenAIProvider + | MultiProvider +) diff --git a/forge/forge/llm/providers/openai.py b/forge/forge/llm/providers/openai.py index a4dc2cacf32..985108a177a 100644 --- a/forge/forge/llm/providers/openai.py +++ b/forge/forge/llm/providers/openai.py @@ -2,47 +2,29 @@ import logging import os from pathlib import Path -from typing import ( - Any, - Callable, - Coroutine, - Iterator, - Optional, - ParamSpec, - TypeVar, - cast, -) +from typing import Any, Callable, Iterator, Mapping, Optional, ParamSpec, TypeVar, cast -import sentry_sdk import tenacity import tiktoken import yaml from openai._exceptions import APIStatusError, RateLimitError -from openai.types import CreateEmbeddingResponse +from openai.types import EmbeddingCreateParams from openai.types.chat import ( - ChatCompletion, - ChatCompletionAssistantMessageParam, ChatCompletionMessage, ChatCompletionMessageParam, + CompletionCreateParams, ) -from openai.types.shared_params import FunctionDefinition from pydantic import SecretStr from forge.json.parsing import json_loads from forge.llm.providers.schema import ( - AssistantChatMessage, - AssistantFunctionCall, AssistantToolCall, AssistantToolCallDict, - BaseChatModelProvider, - BaseEmbeddingModelProvider, ChatMessage, ChatModelInfo, - ChatModelResponse, CompletionModelFunction, Embedding, EmbeddingModelInfo, - EmbeddingModelResponse, ModelProviderBudget, ModelProviderConfiguration, ModelProviderCredentials, @@ -53,7 +35,7 @@ from forge.models.config import UserConfigurable from forge.models.json_schema import JSONSchema -from .utils import validate_tool_calls +from ._openai_base import BaseOpenAIChatProvider, BaseOpenAIEmbeddingProvider _T = TypeVar("_T") _P = ParamSpec("_P") @@ -221,16 +203,15 @@ class OpenAIModelName(str, enum.Enum): copy_info.has_function_call_api = False -OPEN_AI_MODELS = { +OPEN_AI_MODELS: Mapping[ + OpenAIModelName, + ChatModelInfo[OpenAIModelName] | EmbeddingModelInfo[OpenAIModelName], +] = { **OPEN_AI_CHAT_MODELS, **OPEN_AI_EMBEDDING_MODELS, } -class OpenAIConfiguration(ModelProviderConfiguration): - fix_failed_parse_tries: int = UserConfigurable(3) - - class OpenAICredentials(ModelProviderCredentials): """Credentials for OpenAI.""" @@ -308,27 +289,28 @@ def _get_azure_access_kwargs(self, model: str) -> dict[str, str]: class OpenAISettings(ModelProviderSettings): - configuration: OpenAIConfiguration # type: ignore credentials: Optional[OpenAICredentials] # type: ignore budget: ModelProviderBudget # type: ignore class OpenAIProvider( - 
BaseChatModelProvider[OpenAIModelName, OpenAISettings], - BaseEmbeddingModelProvider[OpenAIModelName, OpenAISettings], + BaseOpenAIChatProvider[OpenAIModelName, OpenAISettings], + BaseOpenAIEmbeddingProvider[OpenAIModelName, OpenAISettings], ): + MODELS = OPEN_AI_MODELS + CHAT_MODELS = OPEN_AI_CHAT_MODELS + EMBEDDING_MODELS = OPEN_AI_EMBEDDING_MODELS + default_settings = OpenAISettings( name="openai_provider", description="Provides access to OpenAI's API.", - configuration=OpenAIConfiguration( - retries_per_request=7, - ), + configuration=ModelProviderConfiguration(), credentials=None, budget=ModelProviderBudget(), ) _settings: OpenAISettings - _configuration: OpenAIConfiguration + _configuration: ModelProviderConfiguration _credentials: OpenAICredentials _budget: ModelProviderBudget @@ -337,11 +319,6 @@ def __init__( settings: Optional[OpenAISettings] = None, logger: Optional[logging.Logger] = None, ): - if not settings: - settings = self.default_settings.copy(deep=True) - if not settings.credentials: - settings.credentials = OpenAICredentials.from_env() - super(OpenAIProvider, self).__init__(settings=settings, logger=logger) if self._credentials.api_type == SecretStr("azure"): @@ -359,21 +336,9 @@ def __init__( **self._credentials.get_api_access_kwargs() # type: ignore ) - async def get_available_models(self) -> list[ChatModelInfo[OpenAIModelName]]: - _models = (await self._client.models.list()).data - return [OPEN_AI_MODELS[m.id] for m in _models if m.id in OPEN_AI_MODELS] - - def get_token_limit(self, model_name: OpenAIModelName) -> int: - """Get the token limit for a given model.""" - return OPEN_AI_MODELS[model_name].max_tokens - def get_tokenizer(self, model_name: OpenAIModelName) -> ModelTokenizer[int]: return tiktoken.encoding_for_model(model_name) - def count_tokens(self, text: str, model_name: OpenAIModelName) -> int: - encoding = self.get_tokenizer(model_name) - return len(encoding.encode(text)) - def count_message_tokens( self, messages: ChatMessage | list[ChatMessage], @@ -387,338 +352,87 @@ def count_message_tokens( 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n ) tokens_per_name = -1 # if there's a name, the role is omitted - encoding_model = "gpt-3.5-turbo" + # TODO: check if this is still valid for gpt-4o elif model_name.startswith("gpt-4"): tokens_per_message = 3 tokens_per_name = 1 - encoding_model = "gpt-4" else: raise NotImplementedError( f"count_message_tokens() is not implemented for model {model_name}.\n" - " See https://github.com/openai/openai-python/blob/main/chatml.md for" - " information on how messages are converted to tokens." + "See https://github.com/openai/openai-python/blob/120d225b91a8453e15240a49fb1c6794d8119326/chatml.md " # noqa + "for information on how messages are converted to tokens." ) - try: - encoding = tiktoken.encoding_for_model(encoding_model) - except KeyError: - logging.getLogger(__class__.__name__).warning( - f"Model {model_name} not found. Defaulting to cl100k_base encoding." 
- ) - encoding = tiktoken.get_encoding("cl100k_base") + tokenizer = self.get_tokenizer(model_name) num_tokens = 0 for message in messages: num_tokens += tokens_per_message for key, value in message.dict().items(): - num_tokens += len(encoding.encode(value)) + num_tokens += len(tokenizer.encode(value)) if key == "name": num_tokens += tokens_per_name num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> return num_tokens - async def create_chat_completion( - self, - model_prompt: list[ChatMessage], - model_name: OpenAIModelName, - completion_parser: Callable[[AssistantChatMessage], _T] = lambda _: None, - functions: Optional[list[CompletionModelFunction]] = None, - max_output_tokens: Optional[int] = None, - prefill_response: str = "", # not supported by OpenAI - **kwargs, - ) -> ChatModelResponse[_T]: - """Create a completion using the OpenAI API and parse it.""" - - openai_messages, completion_kwargs = self._get_chat_completion_args( - model_prompt=model_prompt, - model_name=model_name, - functions=functions, - max_tokens=max_output_tokens, - **kwargs, - ) - tool_calls_compat_mode = bool(functions and "tools" not in completion_kwargs) - - total_cost = 0.0 - attempts = 0 - while True: - _response, _cost, t_input, t_output = await self._create_chat_completion( - messages=openai_messages, - **completion_kwargs, - ) - total_cost += _cost - - # If parsing the response fails, append the error to the prompt, and let the - # LLM fix its mistake(s). - attempts += 1 - parse_errors: list[Exception] = [] - - _assistant_msg = _response.choices[0].message - - tool_calls, _errors = self._parse_assistant_tool_calls( - _assistant_msg, tool_calls_compat_mode - ) - parse_errors += _errors - - # Validate tool calls - if not parse_errors and tool_calls and functions: - parse_errors += validate_tool_calls(tool_calls, functions) - - assistant_msg = AssistantChatMessage( - content=_assistant_msg.content or "", - tool_calls=tool_calls or None, - ) - - parsed_result: _T = None # type: ignore - if not parse_errors: - try: - parsed_result = completion_parser(assistant_msg) - except Exception as e: - parse_errors.append(e) - - if not parse_errors: - if attempts > 1: - self._logger.debug( - f"Total cost for {attempts} attempts: ${round(total_cost, 5)}" - ) - - return ChatModelResponse( - response=AssistantChatMessage( - content=_assistant_msg.content or "", - tool_calls=tool_calls or None, - ), - parsed_result=parsed_result, - model_info=OPEN_AI_CHAT_MODELS[model_name], - prompt_tokens_used=t_input, - completion_tokens_used=t_output, - ) - - else: - self._logger.debug( - f"Parsing failed on response: '''{_assistant_msg}'''" - ) - parse_errors_fmt = "\n\n".join( - f"{e.__class__.__name__}: {e}" for e in parse_errors - ) - self._logger.warning( - f"Parsing attempt #{attempts} failed: {parse_errors_fmt}" - ) - for e in parse_errors: - sentry_sdk.capture_exception( - error=e, - extras={"assistant_msg": _assistant_msg, "i_attempt": attempts}, - ) - - if attempts < self._configuration.fix_failed_parse_tries: - openai_messages.append( - cast( - ChatCompletionAssistantMessageParam, - _assistant_msg.dict(exclude_none=True), - ) - ) - openai_messages.append( - { - "role": "system", - "content": ( - f"ERROR PARSING YOUR RESPONSE:\n\n{parse_errors_fmt}" - ), - } - ) - continue - else: - raise parse_errors[0] - - async def create_embedding( - self, - text: str, - model_name: OpenAIModelName, - embedding_parser: Callable[[Embedding], Embedding], - **kwargs, - ) -> EmbeddingModelResponse: - """Create an embedding 
using the OpenAI API.""" - embedding_kwargs = self._get_embedding_kwargs(model_name, **kwargs) - response = await self._create_embedding(text=text, **embedding_kwargs) - - response = EmbeddingModelResponse( - embedding=embedding_parser(response.data[0].embedding), - model_info=OPEN_AI_EMBEDDING_MODELS[model_name], - prompt_tokens_used=response.usage.prompt_tokens, - completion_tokens_used=0, - ) - self._budget.update_usage_and_cost( - model_info=response.model_info, - input_tokens_used=response.prompt_tokens_used, - ) - return response - def _get_chat_completion_args( self, - model_prompt: list[ChatMessage], - model_name: OpenAIModelName, + prompt_messages: list[ChatMessage], + model: OpenAIModelName, functions: Optional[list[CompletionModelFunction]] = None, + max_output_tokens: Optional[int] = None, **kwargs, - ) -> tuple[list[ChatCompletionMessageParam], dict[str, Any]]: - """Prepare chat completion arguments and keyword arguments for API call. + ) -> tuple[ + list[ChatCompletionMessageParam], CompletionCreateParams, dict[str, Any] + ]: + """Prepare keyword arguments for an OpenAI chat completion call Args: - model_prompt: List of ChatMessages. - model_name: The model to use. - functions: Optional list of functions available to the LLM. - kwargs: Additional keyword arguments. + prompt_messages: List of ChatMessages + model: The model to use + functions (optional): List of functions available to the LLM + max_output_tokens (optional): Maximum number of tokens to generate Returns: list[ChatCompletionMessageParam]: Prompt messages for the OpenAI call - dict[str, Any]: Any other kwargs for the OpenAI call + CompletionCreateParams: Mapping of other kwargs for the OpenAI call + Mapping[str, Any]: Any keyword arguments to pass on to the completion parser """ - kwargs.update(self._credentials.get_model_access_kwargs(model_name)) - + tools_compat_mode = False if functions: - if OPEN_AI_CHAT_MODELS[model_name].has_function_call_api: - kwargs["tools"] = [ - {"type": "function", "function": format_function_def_for_openai(f)} - for f in functions - ] - if len(functions) == 1: - # force the model to call the only specified function - kwargs["tool_choice"] = { - "type": "function", - "function": {"name": functions[0].name}, - } - else: + if not OPEN_AI_CHAT_MODELS[model].has_function_call_api: # Provide compatibility with older models - _functions_compat_fix_kwargs(functions, kwargs) - - if extra_headers := self._configuration.extra_request_headers: - kwargs["extra_headers"] = kwargs.get("extra_headers", {}) - kwargs["extra_headers"].update(extra_headers.copy()) - - if "messages" in kwargs: - model_prompt += kwargs["messages"] - del kwargs["messages"] - - openai_messages = [ - cast( - ChatCompletionMessageParam, - message.dict( - include={"role", "content", "tool_calls", "name"}, - exclude_none=True, - ), - ) - for message in model_prompt - ] + _functions_compat_fix_kwargs(functions, prompt_messages) + tools_compat_mode = True + functions = None - return openai_messages, kwargs - - def _get_embedding_kwargs( - self, - model_name: OpenAIModelName, - **kwargs, - ) -> dict: - """Get kwargs for embedding API call. - - Args: - model: The model to use. - kwargs: Keyword arguments to override the default values. - - Returns: - The kwargs for the embedding API call. 
- - """ - kwargs.update(self._credentials.get_model_access_kwargs(model_name)) - - if extra_headers := self._configuration.extra_request_headers: - kwargs["extra_headers"] = kwargs.get("extra_headers", {}) - kwargs["extra_headers"].update(extra_headers.copy()) - - return kwargs - - async def _create_chat_completion( - self, - messages: list[ChatCompletionMessageParam], - model: OpenAIModelName, - *_, - **kwargs, - ) -> tuple[ChatCompletion, float, int, int]: - """ - Create a chat completion using the OpenAI API with retry handling. - - Params: - openai_messages: List of OpenAI-consumable message dict objects - model: The model to use for the completion - - Returns: - ChatCompletion: The chat completion response object - float: The cost ($) of this completion - int: Number of prompt tokens used - int: Number of completion tokens used - """ - - @self._retry_api_request - async def _create_chat_completion_with_retry( - messages: list[ChatCompletionMessageParam], **kwargs - ) -> ChatCompletion: - return await self._client.chat.completions.create( - messages=messages, # type: ignore - **kwargs, - ) - - completion = await _create_chat_completion_with_retry( - messages, model=model, **kwargs + openai_messages, kwargs, parse_kwargs = super()._get_chat_completion_args( + prompt_messages=prompt_messages, + model=model, + functions=functions, + max_output_tokens=max_output_tokens, + **kwargs, ) + kwargs.update(self._credentials.get_model_access_kwargs(model)) # type: ignore - if completion.usage: - prompt_tokens_used = completion.usage.prompt_tokens - completion_tokens_used = completion.usage.completion_tokens - else: - prompt_tokens_used = completion_tokens_used = 0 + if tools_compat_mode: + parse_kwargs["compat_mode"] = True - cost = self._budget.update_usage_and_cost( - model_info=OPEN_AI_CHAT_MODELS[model], - input_tokens_used=prompt_tokens_used, - output_tokens_used=completion_tokens_used, - ) - self._logger.debug( - f"Completion usage: {prompt_tokens_used} input, " - f"{completion_tokens_used} output - ${round(cost, 5)}" - ) - return completion, cost, prompt_tokens_used, completion_tokens_used + return openai_messages, kwargs, parse_kwargs def _parse_assistant_tool_calls( - self, assistant_message: ChatCompletionMessage, compat_mode: bool = False + self, + assistant_message: ChatCompletionMessage, + compat_mode: bool = False, + **kwargs, ) -> tuple[list[AssistantToolCall], list[Exception]]: tool_calls: list[AssistantToolCall] = [] parse_errors: list[Exception] = [] - if assistant_message.tool_calls: - for _tc in assistant_message.tool_calls: - try: - parsed_arguments = json_loads(_tc.function.arguments) - except Exception as e: - err_message = ( - f"Decoding arguments for {_tc.function.name} failed: " - + str(e.args[0]) - ) - parse_errors.append( - type(e)(err_message, *e.args[1:]).with_traceback( - e.__traceback__ - ) - ) - continue - - tool_calls.append( - AssistantToolCall( - id=_tc.id, - type=_tc.type, - function=AssistantFunctionCall( - name=_tc.function.name, - arguments=parsed_arguments, - ), - ) - ) - - # If parsing of all tool calls succeeds in the end, we ignore any issues - if len(tool_calls) == len(assistant_message.tool_calls): - parse_errors = [] - - elif compat_mode and assistant_message.content: + if not compat_mode: + return super()._parse_assistant_tool_calls( + assistant_message=assistant_message, compat_mode=compat_mode, **kwargs + ) + elif assistant_message.content: try: tool_calls = list( _tool_calls_compat_extract_calls(assistant_message.content) @@ -728,21 +442,16 @@ def 
_parse_assistant_tool_calls( return tool_calls, parse_errors - def _create_embedding( - self, text: str, *_, **kwargs - ) -> Coroutine[None, None, CreateEmbeddingResponse]: - """Create an embedding using the OpenAI API with retry handling.""" - - @self._retry_api_request - async def _create_embedding_with_retry( - text: str, *_, **kwargs - ) -> CreateEmbeddingResponse: - return await self._client.embeddings.create( - input=[text], - **kwargs, - ) + def _get_embedding_kwargs( + self, input: str | list[str], model: OpenAIModelName, **kwargs + ) -> EmbeddingCreateParams: + kwargs = super()._get_embedding_kwargs(input=input, model=model, **kwargs) + kwargs.update(self._credentials.get_model_access_kwargs(model)) # type: ignore + return kwargs - return _create_embedding_with_retry(text, *_, **kwargs) + _get_embedding_kwargs.__doc__ = ( + BaseOpenAIEmbeddingProvider._get_embedding_kwargs.__doc__ + ) def _retry_api_request(self, func: Callable[_P, _T]) -> Callable[_P, _T]: _log_retry_debug_message = tenacity.after_log(self._logger, logging.DEBUG) @@ -777,24 +486,6 @@ def __repr__(self): return "OpenAIProvider()" -def format_function_def_for_openai(self: CompletionModelFunction) -> FunctionDefinition: - """Returns an OpenAI-consumable function definition""" - - return { - "name": self.name, - "description": self.description, - "parameters": { - "type": "object", - "properties": { - name: param.to_dict() for name, param in self.parameters.items() - }, - "required": [ - name for name, param in self.parameters.items() if param.required - ], - }, - } - - def format_function_specs_as_typescript_ns( functions: list[CompletionModelFunction], ) -> str: @@ -871,7 +562,7 @@ def count_openai_functions_tokens( def _functions_compat_fix_kwargs( functions: list[CompletionModelFunction], - completion_kwargs: dict, + prompt_messages: list[ChatMessage], ): function_definitions = format_function_specs_as_typescript_ns(functions) function_call_schema = JSONSchema( @@ -902,7 +593,7 @@ def _functions_compat_fix_kwargs( }, ), ) - completion_kwargs["messages"] = [ + prompt_messages.append( ChatMessage.system( "# tool usage instructions\n\n" "Specify a '```tool_calls' block in your response," @@ -915,7 +606,7 @@ def _functions_compat_fix_kwargs( "For the function call itself, use one of the following" f" functions:\n\n{function_definitions}" ), - ] + ) def _tool_calls_compat_extract_calls(response: str) -> Iterator[AssistantToolCall]: diff --git a/forge/forge/llm/providers/schema.py b/forge/forge/llm/providers/schema.py index fd957db6f89..297db5a5ce9 100644 --- a/forge/forge/llm/providers/schema.py +++ b/forge/forge/llm/providers/schema.py @@ -12,11 +12,12 @@ Literal, Optional, Protocol, + Sequence, TypedDict, TypeVar, ) -from pydantic import BaseModel, Field, SecretStr, validator +from pydantic import BaseModel, Field, SecretStr from forge.logging.utils import fmt_kwargs from forge.models.config import ( @@ -190,7 +191,8 @@ class ModelResponse(BaseModel): class ModelProviderConfiguration(SystemConfiguration): - retries_per_request: int = UserConfigurable() + retries_per_request: int = UserConfigurable(7) + fix_failed_parse_tries: int = UserConfigurable(3) extra_request_headers: dict[str, str] = Field(default_factory=dict) @@ -297,6 +299,12 @@ def __init__( self._logger = logger or logging.getLogger(self.__module__) + @abc.abstractmethod + async def get_available_models( + self, + ) -> Sequence["ChatModelInfo[_ModelName] | EmbeddingModelInfo[_ModelName]"]: + ... 
+ @abc.abstractmethod def count_tokens(self, text: str, model_name: _ModelName) -> int: ... @@ -340,7 +348,7 @@ def decode(self, tokens: list[_T]) -> str: class EmbeddingModelInfo(ModelInfo[_ModelName]): """Struct for embedding model information.""" - service = ModelProviderService.EMBEDDING + service: Literal[ModelProviderService.EMBEDDING] = ModelProviderService.EMBEDDING # type: ignore # noqa max_tokens: int embedding_dimensions: int @@ -349,15 +357,16 @@ class EmbeddingModelResponse(ModelResponse): """Standard response struct for a response from an embedding model.""" embedding: Embedding = Field(default_factory=list) - - @validator("completion_tokens_used") - def _verify_no_completion_tokens_used(cls, v: int): - if v > 0: - raise ValueError("Embeddings should not have completion tokens used.") - return v + completion_tokens_used: int = Field(default=0, const=True) class BaseEmbeddingModelProvider(BaseModelProvider[_ModelName, _ModelProviderSettings]): + @abc.abstractmethod + async def get_available_embedding_models( + self, + ) -> Sequence[EmbeddingModelInfo[_ModelName]]: + ... + @abc.abstractmethod async def create_embedding( self, @@ -377,7 +386,7 @@ async def create_embedding( class ChatModelInfo(ModelInfo[_ModelName]): """Struct for language model information.""" - service = ModelProviderService.CHAT + service: Literal[ModelProviderService.CHAT] = ModelProviderService.CHAT # type: ignore # noqa max_tokens: int has_function_call_api: bool = False @@ -391,7 +400,7 @@ class ChatModelResponse(ModelResponse, Generic[_T]): class BaseChatModelProvider(BaseModelProvider[_ModelName, _ModelProviderSettings]): @abc.abstractmethod - async def get_available_models(self) -> list[ChatModelInfo[_ModelName]]: + async def get_available_chat_models(self) -> Sequence[ChatModelInfo[_ModelName]]: ... 
@abc.abstractmethod From f53c2de41ec5e9f5be5cabf270d5fac9827a1995 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 3 Jun 2024 16:40:11 +0200 Subject: [PATCH 11/40] move llamafile stuff into folders --- .../env.llamafile.example | 247 ------------------ autogpt/scripts/llamafile/.gitignore | 1 + .../llamafile}/serve.sh | 12 +- .../llamafile/setup.sh} | 10 +- docs/content/AutoGPT/setup/index.md | 19 ++ forge/forge/config/config.py | 10 +- .../forge/llm/providers/llamafile}/README.md | 0 .../forge/llm/providers/llamafile/__init__.py | 17 ++ .../providers/{ => llamafile}/llamafile.py | 4 +- 9 files changed, 55 insertions(+), 265 deletions(-) delete mode 100644 autogpt/llamafile-integration/env.llamafile.example create mode 100644 autogpt/scripts/llamafile/.gitignore rename autogpt/{llamafile-integration => scripts/llamafile}/serve.sh (68%) rename autogpt/{llamafile-integration/setup-llamafile.sh => scripts/llamafile/setup.sh} (54%) rename {autogpt/llamafile-integration => forge/forge/llm/providers/llamafile}/README.md (100%) create mode 100644 forge/forge/llm/providers/llamafile/__init__.py rename forge/forge/llm/providers/{ => llamafile}/llamafile.py (99%) diff --git a/autogpt/llamafile-integration/env.llamafile.example b/autogpt/llamafile-integration/env.llamafile.example deleted file mode 100644 index ed39b78942e..00000000000 --- a/autogpt/llamafile-integration/env.llamafile.example +++ /dev/null @@ -1,247 +0,0 @@ -################################################################################ -### AutoGPT - GENERAL SETTINGS -################################################################################ - -## OPENAI_API_KEY - OpenAI API Key (Example: my-openai-api-key) -#OPENAI_API_KEY=your-openai-api-key -OPENAI_API_KEY=sk-000000000000000000000000000000000000000000000000 - -## TELEMETRY_OPT_IN - Share telemetry on errors and other issues with the AutoGPT team, e.g. through Sentry. -## This helps us to spot and solve problems earlier & faster. (Default: DISABLED) -TELEMETRY_OPT_IN=false - -## EXECUTE_LOCAL_COMMANDS - Allow local command execution (Default: False) -# EXECUTE_LOCAL_COMMANDS=False - -### Workspace ### - -## RESTRICT_TO_WORKSPACE - Restrict file operations to workspace ./data/agents//workspace (Default: True) -# RESTRICT_TO_WORKSPACE=True - -## DISABLED_COMMAND_CATEGORIES - The list of categories of commands that are disabled (Default: None) -# DISABLED_COMMAND_CATEGORIES= - -## FILE_STORAGE_BACKEND - Choose a storage backend for contents -## Options: local, gcs, s3 -# FILE_STORAGE_BACKEND=local - -## STORAGE_BUCKET - GCS/S3 Bucket to store contents in -# STORAGE_BUCKET=autogpt - -## GCS Credentials -# see https://cloud.google.com/storage/docs/authentication#libauth - -## AWS/S3 Credentials -# see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html - -## S3_ENDPOINT_URL - If you're using non-AWS S3, set your endpoint here. -# S3_ENDPOINT_URL= - -### Miscellaneous ### - -## USER_AGENT - Define the user-agent used by the requests library to browse website (string) -# USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36" - -## AI_SETTINGS_FILE - Specifies which AI Settings file to use, relative to the AutoGPT root directory. (defaults to ai_settings.yaml) -# AI_SETTINGS_FILE=ai_settings.yaml - -## PLUGINS_CONFIG_FILE - The path to the plugins_config.yaml file, relative to the AutoGPT root directory. 
(Default plugins_config.yaml) -# PLUGINS_CONFIG_FILE=plugins_config.yaml - -## PROMPT_SETTINGS_FILE - Specifies which Prompt Settings file to use, relative to the AutoGPT root directory. (defaults to prompt_settings.yaml) -# PROMPT_SETTINGS_FILE=prompt_settings.yaml - -## AUTHORISE COMMAND KEY - Key to authorise commands -# AUTHORISE_COMMAND_KEY=y - -## EXIT_KEY - Key to exit AutoGPT -# EXIT_KEY=n - -################################################################################ -### LLM PROVIDER -################################################################################ - -LLM_PROVIDER=llamafile - -## TEMPERATURE - Sets temperature in OpenAI (Default: 0) -# TEMPERATURE=0 - -## OPENAI_API_BASE_URL - Custom url for the OpenAI API, useful for connecting to custom backends. No effect if USE_AZURE is true, leave blank to keep the default url -# the following is an example: -# OPENAI_API_BASE_URL=http://localhost:443/v1 -OPENAI_API_BASE_URL=http://localhost:8080/v1 - -# OPENAI_API_TYPE= -# OPENAI_API_VERSION= - -## OPENAI_FUNCTIONS - Enables OpenAI functions: https://platform.openai.com/docs/guides/gpt/function-calling -## Note: this feature is only supported by OpenAI's newer models. -# OPENAI_FUNCTIONS=False - -## OPENAI_ORGANIZATION - Your OpenAI Organization key (Default: None) -# OPENAI_ORGANIZATION= - -## USE_AZURE - Use Azure OpenAI or not (Default: False) -# USE_AZURE=False - -## AZURE_CONFIG_FILE - The path to the azure.yaml file, relative to the folder containing this file. (Default: azure.yaml) -# AZURE_CONFIG_FILE=azure.yaml - -# AZURE_OPENAI_AD_TOKEN= -# AZURE_OPENAI_ENDPOINT= - -################################################################################ -### LLM MODELS -################################################################################ - -## SMART_LLM - Smart language model (Default: gpt-4-turbo-preview) -# SMART_LLM=gpt-4-turbo-preview -SMART_LLM=mistral-7b-instruct-v0 - -## FAST_LLM - Fast language model (Default: gpt-3.5-turbo-0125) -# FAST_LLM=gpt-3.5-turbo-0125 -FAST_LLM=mistral-7b-instruct-v0 - -## EMBEDDING_MODEL - Model to use for creating embeddings -# EMBEDDING_MODEL=text-embedding-3-small - -################################################################################ -### SHELL EXECUTION -################################################################################ - -## SHELL_COMMAND_CONTROL - Whether to use "allowlist" or "denylist" to determine what shell commands can be executed (Default: denylist) -# SHELL_COMMAND_CONTROL=denylist - -## ONLY if SHELL_COMMAND_CONTROL is set to denylist: -## SHELL_DENYLIST - List of shell commands that ARE NOT allowed to be executed by AutoGPT (Default: sudo,su) -# SHELL_DENYLIST=sudo,su - -## ONLY if SHELL_COMMAND_CONTROL is set to allowlist: -## SHELL_ALLOWLIST - List of shell commands that ARE allowed to be executed by AutoGPT (Default: None) -# SHELL_ALLOWLIST= - -################################################################################ -### IMAGE GENERATION PROVIDER -################################################################################ - -### Common - -## IMAGE_PROVIDER - Image provider (Default: dalle) -# IMAGE_PROVIDER=dalle - -## IMAGE_SIZE - Image size (Default: 256) -# IMAGE_SIZE=256 - -### Huggingface (IMAGE_PROVIDER=huggingface) - -## HUGGINGFACE_IMAGE_MODEL - Text-to-image model from Huggingface (Default: CompVis/stable-diffusion-v1-4) -# HUGGINGFACE_IMAGE_MODEL=CompVis/stable-diffusion-v1-4 - -## HUGGINGFACE_API_TOKEN - HuggingFace API token (Default: None) -# 
HUGGINGFACE_API_TOKEN= - -### Stable Diffusion (IMAGE_PROVIDER=sdwebui) - -## SD_WEBUI_AUTH - Stable Diffusion Web UI username:password pair (Default: None) -# SD_WEBUI_AUTH= - -## SD_WEBUI_URL - Stable Diffusion Web UI API URL (Default: http://localhost:7860) -# SD_WEBUI_URL=http://localhost:7860 - -################################################################################ -### AUDIO TO TEXT PROVIDER -################################################################################ - -## AUDIO_TO_TEXT_PROVIDER - Audio-to-text provider (Default: huggingface) -# AUDIO_TO_TEXT_PROVIDER=huggingface - -## HUGGINGFACE_AUDIO_TO_TEXT_MODEL - The model for HuggingFace to use (Default: CompVis/stable-diffusion-v1-4) -# HUGGINGFACE_AUDIO_TO_TEXT_MODEL=CompVis/stable-diffusion-v1-4 - -################################################################################ -### GITHUB -################################################################################ - -## GITHUB_API_KEY - Github API key / PAT (Default: None) -# GITHUB_API_KEY= - -## GITHUB_USERNAME - Github username (Default: None) -# GITHUB_USERNAME= - -################################################################################ -### WEB BROWSING -################################################################################ - -## HEADLESS_BROWSER - Whether to run the browser in headless mode (default: True) -# HEADLESS_BROWSER=True - -## USE_WEB_BROWSER - Sets the web-browser driver to use with selenium (default: chrome) -# USE_WEB_BROWSER=chrome -USE_WEB_BROWSER=firefox - -## BROWSE_CHUNK_MAX_LENGTH - When browsing website, define the length of chunks to summarize (Default: 3000) -# BROWSE_CHUNK_MAX_LENGTH=3000 - -## BROWSE_SPACY_LANGUAGE_MODEL - spaCy language model](https://spacy.io/usage/models) to use when creating chunks. (Default: en_core_web_sm) -# BROWSE_SPACY_LANGUAGE_MODEL=en_core_web_sm - -## GOOGLE_API_KEY - Google API key (Default: None) -# GOOGLE_API_KEY= - -## GOOGLE_CUSTOM_SEARCH_ENGINE_ID - Google custom search engine ID (Default: None) -# GOOGLE_CUSTOM_SEARCH_ENGINE_ID= - -################################################################################ -### TEXT TO SPEECH PROVIDER -################################################################################ - -## TEXT_TO_SPEECH_PROVIDER - Which Text to Speech provider to use (Default: gtts) -## Options: gtts, streamelements, elevenlabs, macos -# TEXT_TO_SPEECH_PROVIDER=gtts - -## STREAMELEMENTS_VOICE - Voice to use for StreamElements (Default: Brian) -# STREAMELEMENTS_VOICE=Brian - -## ELEVENLABS_API_KEY - Eleven Labs API key (Default: None) -# ELEVENLABS_API_KEY= - -## ELEVENLABS_VOICE_ID - Eleven Labs voice ID (Example: None) -# ELEVENLABS_VOICE_ID= - -################################################################################ -### CHAT MESSAGES -################################################################################ - -## CHAT_MESSAGES_ENABLED - Enable chat messages (Default: False) -# CHAT_MESSAGES_ENABLED=False - -################################################################################ -### LOGGING -################################################################################ - -## LOG_LEVEL - Set the minimum level to filter log output by. Setting this to DEBUG implies LOG_FORMAT=debug, unless LOG_FORMAT is set explicitly. -## Options: DEBUG, INFO, WARNING, ERROR, CRITICAL -# LOG_LEVEL=INFO -LOG_LEVEL=DEBUG - -## LOG_FORMAT - The format in which to log messages to the console (and log files). 
-## Options: simple, debug, structured_google_cloud -# LOG_FORMAT=simple - -## LOG_FILE_FORMAT - Normally follows the LOG_FORMAT setting, but can be set separately. -## Note: Log file output is disabled if LOG_FORMAT=structured_google_cloud. -# LOG_FILE_FORMAT=simple - -## PLAIN_OUTPUT - Disables animated typing and the spinner in the console output. (Default: False) -# PLAIN_OUTPUT=False - - -################################################################################ -### Agent Protocol Server Settings -################################################################################ -## AP_SERVER_PORT - Specifies what port the agent protocol server will listen on. (Default: 8000) -## AP_SERVER_DB_URL - Specifies what connection url the agent protocol database will connect to (Default: Internal SQLite) -## AP_SERVER_CORS_ALLOWED_ORIGINS - Comma separated list of allowed origins for CORS. (Default: http://localhost:{AP_SERVER_PORT}) -# AP_SERVER_PORT=8000 -# AP_SERVER_DB_URL=sqlite:///data/ap_server.db -# AP_SERVER_CORS_ALLOWED_ORIGINS= diff --git a/autogpt/scripts/llamafile/.gitignore b/autogpt/scripts/llamafile/.gitignore new file mode 100644 index 00000000000..36431613d38 --- /dev/null +++ b/autogpt/scripts/llamafile/.gitignore @@ -0,0 +1 @@ +*.llamafile diff --git a/autogpt/llamafile-integration/serve.sh b/autogpt/scripts/llamafile/serve.sh similarity index 68% rename from autogpt/llamafile-integration/serve.sh rename to autogpt/scripts/llamafile/serve.sh index 1f7c44ffa25..a7a4fa4895d 100755 --- a/autogpt/llamafile-integration/serve.sh +++ b/autogpt/scripts/llamafile/serve.sh @@ -1,13 +1,15 @@ -#!/bin/bash -# +#!/usr/bin/env bash + # Use llamafile to server a (quantized) mistral-7b-instruct-v0.2 model # # Usage: # cd /autogpt -# ./llamafile-integration/serve.sh -# +# ./scripts/llamafile/serve.sh + +# Go to autogpt/scripts/llamafile/ +cd "$(dirname "$0")" -LLAMAFILE="./llamafile-integration/mistral-7b-instruct-v0.2.Q5_K_M.llamafile" +LLAMAFILE="./mistral-7b-instruct-v0.2.Q5_K_M.llamafile" "${LLAMAFILE}" \ --server \ diff --git a/autogpt/llamafile-integration/setup-llamafile.sh b/autogpt/scripts/llamafile/setup.sh similarity index 54% rename from autogpt/llamafile-integration/setup-llamafile.sh rename to autogpt/scripts/llamafile/setup.sh index ad2426a2324..f0368226446 100755 --- a/autogpt/llamafile-integration/setup-llamafile.sh +++ b/autogpt/scripts/llamafile/setup.sh @@ -1,8 +1,14 @@ -#!/bin/bash +#!/usr/bin/env bash -cd llamafile-integration +# Go to autogpt/scripts/llamafile/ +cd "$(dirname "$0")" # Download the mistral-7b-instruct llamafile from HuggingFace +echo "Downloading mistral-7b-instruct-v0.2..." wget -nc https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile chmod +x mistral-7b-instruct-v0.2.Q5_K_M.llamafile ./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --version + +echo +echo "NOTE: To use other models besides mistral-7b-instruct-v0.2," \ + "download them into autogpt/scripts/llamafile/" diff --git a/docs/content/AutoGPT/setup/index.md b/docs/content/AutoGPT/setup/index.md index 43aba647477..1bbc3294d4d 100644 --- a/docs/content/AutoGPT/setup/index.md +++ b/docs/content/AutoGPT/setup/index.md @@ -190,3 +190,22 @@ If you don't know which to choose, you can safely go with OpenAI*. 
[groq/api-keys]: https://console.groq.com/keys [groq/models]: https://console.groq.com/docs/models + + +### Llamafile + +With llamafile you can run models locally, which means no need to set up billing, +and guaranteed data privacy. + +1. Run the llamafile setup script: + ```shell + ./scripts/llamafile/setup.sh + ``` + +2. Start the llamafile server: + ```shell + ./scripts/llamafile/serve.sh + ``` + +3. If the server is not running on `http://localhost:8080/v1`, adjust `LLAMAFILE_API_BASE` + in `.env` with the right base URL diff --git a/forge/forge/config/config.py b/forge/forge/config/config.py index 5e157a339c4..27b57cb5a2a 100644 --- a/forge/forge/config/config.py +++ b/forge/forge/config/config.py @@ -13,7 +13,7 @@ import forge from forge.file_storage import FileStorageBackendName -from forge.llm.providers import CHAT_MODELS, ModelName, ModelProviderName +from forge.llm.providers import CHAT_MODELS, ModelName from forge.llm.providers.openai import OpenAICredentials, OpenAIModelName from forge.logging.config import LoggingConfig from forge.models.config import Configurable, SystemSettings, UserConfigurable @@ -56,14 +56,6 @@ class Config(SystemSettings, arbitrary_types_allowed=True): # Agent Control Settings # ########################## # Model configuration - # llm_provider: str = UserConfigurable( - # default="openai", - # from_env=lambda: os.getenv("LLM_PROVIDER") - # ) - llm_provider: ModelProviderName = UserConfigurable( - default=ModelProviderName.OPENAI, - from_env=lambda: ModelProviderName(os.getenv("LLM_PROVIDER")), - ) fast_llm: ModelName = UserConfigurable( default=OpenAIModelName.GPT3, from_env="FAST_LLM", diff --git a/autogpt/llamafile-integration/README.md b/forge/forge/llm/providers/llamafile/README.md similarity index 100% rename from autogpt/llamafile-integration/README.md rename to forge/forge/llm/providers/llamafile/README.md diff --git a/forge/forge/llm/providers/llamafile/__init__.py b/forge/forge/llm/providers/llamafile/__init__.py new file mode 100644 index 00000000000..23706b102be --- /dev/null +++ b/forge/forge/llm/providers/llamafile/__init__.py @@ -0,0 +1,17 @@ +from .llamafile import ( + LLAMAFILE_CHAT_MODELS, + LLAMAFILE_EMBEDDING_MODELS, + LlamafileCredentials, + LlamafileModelName, + LlamafileProvider, + LlamafileSettings, +) + +__all__ = [ + "LLAMAFILE_CHAT_MODELS", + "LLAMAFILE_EMBEDDING_MODELS", + "LlamafileCredentials", + "LlamafileModelName", + "LlamafileProvider", + "LlamafileSettings", +] diff --git a/forge/forge/llm/providers/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py similarity index 99% rename from forge/forge/llm/providers/llamafile.py rename to forge/forge/llm/providers/llamafile/llamafile.py index 45c4e8c1dfd..12476fd1534 100644 --- a/forge/forge/llm/providers/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -14,8 +14,8 @@ from forge.json.parsing import json_loads from forge.models.config import UserConfigurable -from ._openai_base import BaseOpenAIChatProvider -from .schema import ( +from .._openai_base import BaseOpenAIChatProvider +from ..schema import ( AssistantToolCall, AssistantToolCallDict, ChatMessage, From f78ad94bdf0ee9e38e16daf641ca0e16b5666f95 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 3 Jun 2024 16:40:45 +0200 Subject: [PATCH 12/40] clean up llamafile readme --- forge/forge/llm/providers/llamafile/README.md | 212 ++---------------- 1 file changed, 18 insertions(+), 194 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/README.md 
b/forge/forge/llm/providers/llamafile/README.md index 77483805d5d..e4276ec1a94 100644 --- a/forge/forge/llm/providers/llamafile/README.md +++ b/forge/forge/llm/providers/llamafile/README.md @@ -1,212 +1,36 @@ -# Llamafile/AutoGPT Integration Notes +# Llamafile Integration Notes Tested with: * Python 3.11 * Apple M2 Pro (32 GB), macOS 14.2.1 * quantized mistral-7b-instruct-v0.2 -I tested everything using the task: "Tell me about Roman dodecahedrons." - ## Setup -### AutoGPT setup - -```bash -git clone git@github.com:Mozilla-Ocho/AutoGPT.git -cd AutoGPT/autogpt -git checkout draft-llamafile-support -pyenv local 3.11 -./setup -cp llamafile-integration/env.llamafile.example .env -``` - - -### llamafile setup - -Run the llamafile setup script: - -```bash -./llamafile-integration/setup-llamafile.sh -``` - -### Run AutoGPT + llamafile - -First, start the llamafile server: - -```bash -./llamafile-integration/serve.sh -``` - -Then, in a separate terminal, run AutoGPT: - -```bash -./autogpt.sh run -``` - -Sample interaction: - -```bash -2024-04-18 23:55:26,895 WARNING You are running on `draft-llamafile-support` branch - this is not a supported branch. -2024-04-18 23:55:26,896 INFO Smart LLM: mistral-7b-instruct-v0 -2024-04-18 23:55:26,896 INFO Fast LLM: mistral-7b-instruct-v0 -2024-04-18 23:55:26,896 INFO Browser: firefox -2024-04-18 23:55:26,898 INFO Code Execution: DISABLED (Docker unavailable) -Enter the task that you want AutoGPT to execute, with as much detail as possible: Tell me about Roman dodecahedrons. -2024-04-18 23:55:59,738 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" -2024-04-18 23:55:59,741 INFO Current AI Settings: -2024-04-18 23:55:59,741 INFO -------------------: -2024-04-18 23:55:59,741 INFO Name : HistorianDodecahedronGPT -2024-04-18 23:55:59,741 INFO Role : An autonomous agent specialized in providing in-depth knowledge and analysis about Roman dodecahedrons. -2024-04-18 23:55:59,741 INFO Constraints: -2024-04-18 23:55:59,741 INFO - Exclusively use the commands listed below. -2024-04-18 23:55:59,741 INFO - You can only act proactively, and are unable to start background jobs or set up webhooks for yourself. Take this into account when planning your actions. -2024-04-18 23:55:59,741 INFO - You are unable to interact with physical objects. If this is absolutely necessary to fulfill a task or objective or to complete a step, you must ask the user to do it for you. If the user refuses this, and there is no other way to achieve your goals, you must terminate to avoid wasting time and energy. -2024-04-18 23:55:59,741 INFO - Limit responses to facts and historical information. -2024-04-18 23:55:59,741 INFO - Provide sources and citations for all information provided. -2024-04-18 23:55:59,741 INFO Resources: -2024-04-18 23:55:59,742 INFO - Internet access for searches and information gathering. -2024-04-18 23:55:59,742 INFO - The ability to read and write files. -2024-04-18 23:55:59,742 INFO - You are a Large Language Model, trained on millions of pages of text, including a lot of factual knowledge. Make use of this factual knowledge to avoid unnecessary gathering of information. -2024-04-18 23:55:59,742 INFO Best practices: -2024-04-18 23:55:59,742 INFO - Continuously review and analyze your actions to ensure you are performing to the best of your abilities. -2024-04-18 23:55:59,742 INFO - Constructively self-criticize your big-picture behavior constantly. 
-2024-04-18 23:55:59,742 INFO - Reflect on past decisions and strategies to refine your approach. -2024-04-18 23:55:59,742 INFO - Every command has a cost, so be smart and efficient. Aim to complete tasks in the least number of steps. -2024-04-18 23:55:59,742 INFO - Only make use of your information gathering abilities to find information that you don't yet have knowledge of. -2024-04-18 23:55:59,742 INFO - Provide accurate and detailed historical context about the origin, usage, and cultural significance of Roman dodecahedrons. -2024-04-18 23:55:59,742 INFO - Analyze and interpret various historical artifacts and texts to gain a comprehensive understanding of the subject. -2024-04-18 23:55:59,742 INFO - Offer visualizations and diagrams to help illustrate complex concepts related to Roman dodecahedrons. -2024-04-18 23:55:59,742 INFO - Provide recommendations for further reading and resources for those interested in learning more about the topic. -Continue with these settings? [Y/n] Y -2024-04-18 23:56:41,707 INFO NOTE: All files/directories created by this agent can be found inside its workspace at: /Users/ksilverstein/dev/autogpt/v4-autogpt-llamafile-support/autogpts/autogpt/data/agents/HistorianDodecahedronGPT-d4df1da9/workspace -/ Thinking... -2024-04-18 23:57:08,180 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" -2024-04-18 23:57:08,188 INFO HISTORIANDODECAHEDRONGPT THOUGHTS: Roman dodecahedrons are polyhedra with twelve faces, each of which is a regular pentagon. They have been found in various archaeological sites across the Roman Empire, dating back to the 1st century BC. The exact purpose and significance of these objects are still a subject of debate among historians and archaeologists. Some theories suggest they were used as gaming pieces, while others propose they had religious or symbolic meanings. -2024-04-18 23:57:08,188 INFO REASONING: Based on the user's request, I will provide historical information about Roman dodecahedrons. -2024-04-18 23:57:08,188 INFO PLAN: -2024-04-18 23:57:08,188 INFO - Research the historical context and significance of Roman dodecahedrons. -2024-04-18 23:57:08,188 INFO - Identify theories regarding their usage and meaning. -2024-04-18 23:57:08,188 INFO - Provide visualizations and diagrams to help illustrate the concepts. -2024-04-18 23:57:08,188 INFO CRITICISM: -2024-04-18 23:57:08,188 INFO SPEAK: Roman dodecahedrons are intriguing objects with a rich history. They were used by the ancient Romans and have twelve faces, each one a regular pentagon. While their exact purpose remains a topic of debate, some theories suggest they were used as gaming pieces, while others propose religious or symbolic meanings. Let me delve deeper into the historical context and significance of these fascinating objects. - -2024-04-18 23:57:08,188 INFO NEXT ACTION: COMMAND = web_search ARGUMENTS = {'query': 'Roman dodecahedron history significance'} -2024-04-18 23:57:08,188 INFO Enter 'y' to authorise command, 'y -N' to run N continuous commands, 'n' to exit program, or enter feedback for HistorianDodecahedronGPT... 
-Input: y -2024-04-18 23:57:36,589 INFO -=-=-=-=-=-=-= COMMAND AUTHORISED BY USER -=-=-=-=-=-=-= -2024-04-18 23:57:48,021 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" -2024-04-18 23:57:48,022 INFO SYSTEM: -## Search results -### "Roman Dodecahedron - the Mystery of an Ancient Artifact" -**URL:** https://roman-empire.net/discoveries/roman-dodecahedron/ -**Excerpt:** "Scholars have long debated the purpose and significance of the Roman dodecahedron. Some experts argue that it was used as a measuring instrument for astronomical calculations, while others believe it was used for religious purposes or as a gaming piece. ... The rich history and culture of the Roman Empire has lasting impacts in modern society." - -### "Roman dodecahedron - Wikipedia" -**URL:** https://en.wikipedia.org/wiki/Roman_dodecahedron -**Excerpt:** "Roman bronze dodecahedron found in Tongeren, Gallo-Roman Museum, Tongeren A Roman dodecahedron or Gallo-Roman dodecahedron is a small hollow object made of copper alloy which has been cast into a regular dodecahedral shape: twelve flat pentagonal faces. Each face has a circular hole of varying diameter in the middle, the holes connecting to the hollow center, and each corner has a protruding knob." - -### "The Mysterious Dodecahedrons of the Roman Empire - Atlas Obscura" -**URL:** https://www.atlasobscura.com/articles/dodecahedrons-roman-empire -**Excerpt:** "This ancient dodecahedron found in Avenches, Switzerland, once the Roman city of Aventicum. Woudloper/Wikimedia/CC BY-SA 3.0. In the first episode of Buck Rogers, the 1980s television series about ..." - -### "What Was the Purpose of a Roman Dodecahedron? - History Defined" -**URL:** https://www.historydefined.net/what-was-the-purpose-of-a-roman-dodecahedron/ -**Excerpt:** "One of the significant advantages any historian of ancient Rome has is a wealth of written material that has survived from 2,000 years ago to help explain to us what the remains of the Roman Empire mean. For instance, we know how the towns of Pompeii and Herculaneum ended up buried under volcanic ash because" - -### "The Enigma of the Roman Dodecahedra | Ancient Origins" -**URL:** https://www.ancient-origins.net/artifacts-other-artifacts-news-unexplained-phenomena/enigma-roman-dodecahedra-002371 -**Excerpt:** "The Roman dodecahedron is a small, hollow object made of bronze or (more rarely) stone, with a geometrical shape that has 12 flat faces. Each face is a pentagon, a five-sided shape. The Roman dodecahedra are also embellished with a series of knobs on each corner point of the pentagons, and the pentagon faces in most cases contain circular holes ..." - -### "The mysterious dodecahedrons of the Roman Empire | English Heritage" -**URL:** https://www.english-heritage.org.uk/visit/places/corbridge-roman-town-hadrians-wall/dodecahedron-exhibition/ -**Excerpt:** "The dodecahedron (12 sided object) has been puzzling archaeologists since the 18 th century when it was first discovered. Only 120 have been found across what was the Roman Empire, however, none have been found in Italy, Spain, Africa or the Mediterranean. These striking objects are some of the rarest and least understood to survive from the ..." - -### "What is a Roman Dodecahedron? The Mystery is Still Unsolved" -**URL:** https://www.historicmysteries.com/archaeology/roman-dodecahedron/14742/ -**Excerpt:** "This Roman dodecahedron derives from the 2nd or 3rd century in Stuttgart, Germany. CC3.0 Anagoria. 
The enigmatic geometric forms consist of copper alloy. They range in size from 4 cm to 11 cm, and each of the 12 pentagonal faces contains a circular hole. Oddly, the diameters of the holes are not identical in a single dodecahedron." - -### "The Mysterious Bronze Objects That Have Baffled Archeologists for ..." -**URL:** https://www.mentalfloss.com/article/514246/are-roman-dodecahedrons-worlds-most-mysterious-artifact -**Excerpt:** "The Roman dodecahedron Brian Campbell found in East London / Brian Campbell One August day in 1987, Brian Campbell was refilling the hole left by a tree stump in his yard in Romford, East London ..." -2024-04-18 23:58:16,114 INFO HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK" -2024-04-18 23:58:16,180 INFO HISTORIANDODECAHEDRONGPT THOUGHTS: Roman dodecahedrons are small, hollow objects made of bronze or copper alloy with a dodecahedral shape, consisting of twelve flat pentagonal faces. Each face has a circular hole in the middle, and the corners have protruding knobs. The exact purpose and significance of these objects remain a mystery, with theories suggesting they were used for astronomical calculations, religious purposes, or as gaming pieces. -2024-04-18 23:58:16,180 INFO REASONING: Based on the user's request, I provided historical information about Roman dodecahedrons. -2024-04-18 23:58:16,180 INFO PLAN: -2024-04-18 23:58:16,180 INFO - Read the webpage contents in more detail to extract specific information about the origins and cultural significance of Roman dodecahedrons. -2024-04-18 23:58:16,180 INFO CRITICISM: I could have provided more specific information about the origins and cultural significance of Roman dodecahedrons from the existing search results. -2024-04-18 23:58:16,180 INFO SPEAK: Roman dodecahedrons are intriguing objects with a rich history. They were made of bronze or copper alloy and had a dodecahedral shape, consisting of twelve flat pentagonal faces. Each face had a circular hole in the middle, and the corners had protruding knobs. The exact purpose and significance of these objects remain a mystery, with theories suggesting they were used for astronomical calculations, religious purposes, or as gaming pieces. - -2024-04-18 23:58:16,180 INFO NEXT ACTION: COMMAND = read_webpage ARGUMENTS = {'url': 'https://en.wikipedia.org/wiki/Roman_dodecahedron', 'topics_of_interest': ['origins', 'cultural_significance']} -2024-04-18 23:58:16,180 INFO Enter 'y' to authorise command, 'y -N' to run N continuous commands, 'n' to exit program, or enter feedback for HistorianDodecahedronGPT... -... -``` - -## Implementation Notes - -Here's a brief summary of the issues I encountered & fixed while I was trying to get this integration to work. 
- -### Initial Setup - -AutoGPT setup steps: - -starting commit: `7082e63b115d72440ee2dfe3f545fa3dcba490d5` - -```bash -git clone git@github.com:Mozilla-Ocho/AutoGPT.git -cd AutoGPT/autogpt -pyenv local 3.11 -./setup -cp .env.template .env +Download a `mistral-7b-instruct-v0.2` llamafile: +```shell +wget -nc https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile +chmod +x mistral-7b-instruct-v0.2.Q5_K_M.llamafile +./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --version ``` -then I edited `.env` to set: - -```dotenv -OPENAI_API_KEY=sk-noop -OPENAI_API_BASE_URL=http://localhost:8080/v1 -``` +Run the llamafile server: +```shell +LLAMAFILE="./mistral-7b-instruct-v0.2.Q5_K_M.llamafile" -In a separate terminal window, I started the llamafile server: +"${LLAMAFILE}" \ +--server \ +--nobrowser \ +--ctx-size 0 \ +--n-predict 1024 -```bash -./llamafile-integration/setup.sh -./llamafile-integration/serve.sh +# note: ctx-size=0 means the prompt context size will be set directly from the +# underlying model configuration. This may cause slow response times or consume +# a lot of memory. ``` -### Issue 1: Fix 'Error: Invalid OpenAI API key' - -Culprit: API key validation is baked in regardless of whether we actually need an API key or what format the API key is supposed to take. See: -- https://github.com/Mozilla-Ocho/AutoGPT/blob/262771a69c787814222e23d856f4438333256245/autogpts/autogpt/autogpt/app/main.py#L104 -- https://github.com/Mozilla-Ocho/AutoGPT/blob/028d2c319f3dcca6aa57fc4fdcd2e78a01926e3f/autogpts/autogpt/autogpt/config/config.py#L306 - -Temporary fix: In `.env`, changed `OPENAI_API_KEY` to something that passes the regex validator: - -```bash -## OPENAI_API_KEY - OpenAI API Key (Example: my-openai-api-key) -#OPENAI_API_KEY=sk-noop -OPENAI_API_KEY="sk-000000000000000000000000000000000000000000000000" -``` - -### Issue 2: Fix 'ValueError: LLM did not call `create_agent` function; agent profile creation failed' - -* Added new entry to `OPEN_AI_CHAT_MODELS` with `has_function_call_api=False` so that `tool_calls_compat_mode` will be triggered in the `create_chat_completion` (changes in `autogpt/core/resource/model_providers/openai.py`) -* Modified `_tool_calls_compat_extract_calls` to strip off whitespace and markdown syntax at the beginning/end of model responses (changes in `autogpt/core/resource/model_providers/llamafile.py`) -* Modified `_get_chat_completion_args` to adapt model prompt message roles to be compatible with the [Mistral-7b-Instruct chat template](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format), which supports the 'user' & 'assistant' roles but does not support the 'system' role (changes in `autogpt/core/resource/model_providers/llamafile.py`). - -### Issue 3: Fix: 'NotImplementedError: `count_message_tokens()` is not implemented for model' - -* In `OpenAIProvider`, change methods `count_message_tokens`, `count_tokens`, and `get_tokenizer` from classmethods to regular methods so a) I can override them in subclass `LlamafileProvider`, b) these methods can access instance attributes (this is required in my implementation of these methods in `LlamafileProvider`). -* Implement class `LlamafileTokenizer` that calls the llamafile server's `/tokenize` API endpoint. Implement methods `count_message_tokens`, `count_tokens`, and `get_tokenizer` in `LlamafileProvider` (changes in `autogpt/core/resource/model_providers/llamafile.py`). 
- -### Issue 4: Fix `Command web_search returned an error: DuckDuckGoSearchException: Ratelimit` - -* Ran: `poetry update duckduckgo-search` - this got rid of the rate limit error -* Why is the `send_token_limit` divided by 3 [here](https://github.com/Mozilla-Ocho/AutoGPT/blob/37904a0f80f3499ea43e7846f78d5274b32cad03/autogpts/autogpt/autogpt/agents/agent.py#L274)? - -## Other TODOs +## TODOs -* Test with other tasks * `SMART_LLM`/`FAST_LLM` configuration: Currently, the llamafile server only serves one model at a time. However, there's no reason you can't start multiple llamafile servers on different ports. To support using different models for `smart_llm` and `fast_llm`, you could implement config vars like `LLAMAFILE_SMART_LLM_URL` and `LLAMAFILE_FAST_LLM_URL` that point to different llamafile servers (one serving a 'big model' and one serving a 'fast model'). * Authorization: the `serve.sh` script does not set up any authorization for the llamafile server; this can be turned on by adding arg `--api-key ` to the server startup command. However I haven't attempted to test whether the integration with autogpt works when this feature is turned on. -* Added a few TODOs inline in the code * Test with other models From 1a00ecf3a8376bcbcbadf7e404f501e984a919d9 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 3 Jun 2024 19:54:30 +0200 Subject: [PATCH 13/40] Improve llamafile model name cleaning logic --- .../llm/providers/llamafile/llamafile.py | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index 12476fd1534..e5753bc45c8 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -1,5 +1,6 @@ import enum import logging +import re from pathlib import Path from typing import Any, Iterator, Optional, Sequence @@ -30,14 +31,14 @@ class LlamafileModelName(str, enum.Enum): - LLAMAFILE_MISTRAL_7B_INSTRUCT = "mistral-7b-instruct-v0" + MISTRAL_7B_INSTRUCT = "mistral-7b-instruct-v0.2" LLAMAFILE_CHAT_MODELS = { info.name: info for info in [ ChatModelInfo( - name=LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT, + name=LlamafileModelName.MISTRAL_7B_INSTRUCT, provider_name=ModelProviderName.LLAMAFILE, prompt_token_cost=0.0, completion_token_cost=0.0, @@ -114,16 +115,32 @@ async def get_available_models(self) -> Sequence[ChatModelInfo[LlamafileModelNam # note: at the moment, llamafile only serves one model at a time (so this # list will only ever have one value). however, in the future, llamafile # may support multiple models, so leaving this method as-is for now. + self._logger.debug(f"Retrieved models: {_models}") - # clean up model names + # Clean up model names: + # 1. Remove file extension + # 2. Remove quantization info # e.g. 'mistral-7b-instruct-v0.2.Q5_K_M.gguf' # -> 'mistral-7b-instruct-v0.2' # e.g. '/Users/kate/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf' # -> 'mistral-7b-instruct-v0.2' + # e.g. 
'llava-v1.5-7b-q4.gguf' + # -> 'llava-v1.5-7b' + def clean_model_name(model_file: str) -> str: + name_without_ext = Path(model_file).name.rsplit(".", 1)[0] + name_without_Q = re.match( + r"^[a-zA-Z0-9]+([.\-](?!([qQ]|B?F)\d{1,2})[a-zA-Z0-9]+)*", + name_without_ext, + ) + return name_without_Q.group() if name_without_Q else name_without_ext + + clean_model_ids = [clean_model_name(m.id) for m in _models] + self._logger.debug(f"Cleaned model IDs: {clean_model_ids}") + return [ - LLAMAFILE_CHAT_MODELS[_id] - for m in _models - if (_id := Path(m.id).name.split(".")[0]) in LLAMAFILE_CHAT_MODELS + LLAMAFILE_CHAT_MODELS[id] + for id in clean_model_ids + if id in LLAMAFILE_CHAT_MODELS ] def get_tokenizer(self, model_name: LlamafileModelName) -> LlamafileTokenizer: @@ -137,7 +154,7 @@ def count_message_tokens( if isinstance(messages, ChatMessage): messages = [messages] - if model_name == LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: + if model_name == LlamafileModelName.MISTRAL_7B_INSTRUCT: # For mistral-instruct, num added tokens depends on if the message # is a prompt/instruction or an assistant-generated message. # - prompt gets [INST], [/INST] added and the first instruction @@ -190,7 +207,7 @@ def _get_chat_completion_args( prompt_messages, model, functions, max_output_tokens, **kwargs ) - if model == LlamafileModelName.LLAMAFILE_MISTRAL_7B_INSTRUCT: + if model == LlamafileModelName.MISTRAL_7B_INSTRUCT: messages = self._adapt_chat_messages_for_mistral_instruct(messages) if "seed" not in kwargs: From 3c8bf3c5f280fd69e28c66de3d450d6486f46181 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 3 Jun 2024 19:59:32 +0200 Subject: [PATCH 14/40] expand setup instructions and info for llamafile --- docs/content/AutoGPT/setup/index.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/content/AutoGPT/setup/index.md b/docs/content/AutoGPT/setup/index.md index 1bbc3294d4d..ec749055c78 100644 --- a/docs/content/AutoGPT/setup/index.md +++ b/docs/content/AutoGPT/setup/index.md @@ -197,6 +197,18 @@ If you don't know which to choose, you can safely go with OpenAI*. With llamafile you can run models locally, which means no need to set up billing, and guaranteed data privacy. +!!! warning + At the moment, llamafile only serves one model at a time. This means you can not + set `SMART_LLM` and `FAST_LLM` to two different llamafile models. + +!!! note + These instructions will download and use `mistral-7b-instruct-v0.2.Q5_K_M.llamafile`. + `mistral-7b-instruct-v0.2` is currently the only tested and supported model. + If you want to try other models, you'll have to add them to `LlamafileModelName` in + [`llamafile.py`][forge/llamafile.py]. + For optimal results, you may also have to add some logic to adapt the message format, + like `LlamafileProvider._adapt_chat_messages_for_mistral_instruct(..)` does. + 1. Run the llamafile setup script: ```shell ./scripts/llamafile/setup.sh @@ -207,5 +219,9 @@ and guaranteed data privacy. ./scripts/llamafile/serve.sh ``` -3. If the server is not running on `http://localhost:8080/v1`, adjust `LLAMAFILE_API_BASE` - in `.env` with the right base URL +3. In `.env`, set `SMART_LLM`/`FAST_LLM` or both to `mistral-7b-instruct-v0.2` + +4. 
If the server is running on a different address than `http://localhost:8080/v1`,
+   set `LLAMAFILE_API_BASE` in `.env` to the right base URL
+
+[forge/llamafile.py]: https://github.com/Significant-Gravitas/AutoGPT/blob/master/forge/forge/llm/providers/llamafile/llamafile.py

From 65433ba0989d1f596eff8641834ec99a2c1c8181 Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Mon, 3 Jun 2024 20:37:50 +0200
Subject: [PATCH 15/40] combine llamafile setup.sh and serve.sh into single cross-platform serve.py

---
 autogpt/scripts/llamafile/.gitignore |  1 +
 autogpt/scripts/llamafile/serve.py   | 65 ++++++++++++++++++++++++++++
 autogpt/scripts/llamafile/serve.sh   | 22 ----------
 autogpt/scripts/llamafile/setup.sh   | 14 ------
 docs/content/AutoGPT/setup/index.md  | 11 ++---
 5 files changed, 70 insertions(+), 43 deletions(-)
 create mode 100755 autogpt/scripts/llamafile/serve.py
 delete mode 100755 autogpt/scripts/llamafile/serve.sh
 delete mode 100755 autogpt/scripts/llamafile/setup.sh

diff --git a/autogpt/scripts/llamafile/.gitignore b/autogpt/scripts/llamafile/.gitignore
index 36431613d38..55e65ce9706 100644
--- a/autogpt/scripts/llamafile/.gitignore
+++ b/autogpt/scripts/llamafile/.gitignore
@@ -1 +1,2 @@
 *.llamafile
+*.llamafile.exe
diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py
new file mode 100755
index 00000000000..59867dea885
--- /dev/null
+++ b/autogpt/scripts/llamafile/serve.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""
+Use llamafile to serve a (quantized) mistral-7b-instruct-v0.2 model
+
+Usage:
+  cd /autogpt
+  ./scripts/llamafile/serve.py
+"""
+
+import os
+import platform
+import subprocess
+from pathlib import Path
+
+LLAMAFILE = Path(
+    "mistral-7b-instruct-v0.2.Q5_K_M.llamafile"
+    + (".exe" if platform.system() == "Windows" else "")
+)
+
+
+def report_download_progress(chunk_number: int, chunk_size: int, total_size: int):
+    if total_size != -1:
+        downloaded_size = chunk_number * chunk_size
+        percent = min(1, downloaded_size / total_size)
+        bar = "#" * int(40 * percent)
+        print(
+            f"\rDownloading: [{bar:<40}] {percent:.0%}"
+            f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB",
+            end="",
+        )
+
+
+def download_llamafile():
+    print(f"Downloading {LLAMAFILE.name}...")
+    import urllib.request
+
+    url = "https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile"  # noqa
+
+    urllib.request.urlretrieve(url, LLAMAFILE.name, reporthook=report_download_progress)
+    print()
+
+    LLAMAFILE.chmod(0o755)
+    subprocess.run([LLAMAFILE, "--version"], check=True)
+
+    print(
+        "\n"
+        "NOTE: To use other models besides mistral-7b-instruct-v0.2, "
+        "download them into autogpt/scripts/llamafile/"
+    )
+
+
+# Go to autogpt/scripts/llamafile/
+os.chdir(Path(__file__).resolve().parent)
+
+if not LLAMAFILE.is_file():
+    download_llamafile()
+
+subprocess.run(
+    [LLAMAFILE, "--server", "--nobrowser", "--ctx-size", "0", "--n-predict", "1024"],
+    check=True,
+)
+
+# note: --ctx-size 0 means the prompt context size will be set directly from the
+# underlying model configuration. This may cause slow response times or consume
+# a lot of memory.
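For reviewers who want to sanity-check the server that `serve.py` starts: the llamafile process exposes an OpenAI-compatible API on `http://localhost:8080/v1`, the same base URL the docs point `LLAMAFILE_API_BASE` at. The snippet below is not part of this patch; it is a minimal sketch using the `openai` client the project already depends on, and the placeholder API key is an assumption (the server only checks it if `--api-key` was passed at startup).

```python
# Editorial sketch, not part of this patch: quick smoke test against the local
# llamafile server started by ./scripts/llamafile/serve.py.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8080/v1",  # llamafile's OpenAI-compatible endpoint
    api_key="sk-noop",  # placeholder; ignored unless the server was started with --api-key
)

response = client.chat.completions.create(
    model="mistral-7b-instruct-v0.2",  # should match SMART_LLM/FAST_LLM in .env
    messages=[{"role": "user", "content": "Reply with the single word: pong"}],
    max_tokens=16,
)
print(response.choices[0].message.content)
```

If this prints a sensible reply, AutoGPT can be pointed at the same endpoint via the `.env` settings described above.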
diff --git a/autogpt/scripts/llamafile/serve.sh b/autogpt/scripts/llamafile/serve.sh deleted file mode 100755 index a7a4fa4895d..00000000000 --- a/autogpt/scripts/llamafile/serve.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Use llamafile to server a (quantized) mistral-7b-instruct-v0.2 model -# -# Usage: -# cd /autogpt -# ./scripts/llamafile/serve.sh - -# Go to autogpt/scripts/llamafile/ -cd "$(dirname "$0")" - -LLAMAFILE="./mistral-7b-instruct-v0.2.Q5_K_M.llamafile" - -"${LLAMAFILE}" \ ---server \ ---nobrowser \ ---ctx-size 0 \ ---n-predict 1024 - -# note: ctx-size=0 means the prompt context size will be set directly from the -# underlying model configuration. This may cause slow response times or consume -# a lot of memory. diff --git a/autogpt/scripts/llamafile/setup.sh b/autogpt/scripts/llamafile/setup.sh deleted file mode 100755 index f0368226446..00000000000 --- a/autogpt/scripts/llamafile/setup.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -# Go to autogpt/scripts/llamafile/ -cd "$(dirname "$0")" - -# Download the mistral-7b-instruct llamafile from HuggingFace -echo "Downloading mistral-7b-instruct-v0.2..." -wget -nc https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile -chmod +x mistral-7b-instruct-v0.2.Q5_K_M.llamafile -./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --version - -echo -echo "NOTE: To use other models besides mistral-7b-instruct-v0.2," \ - "download them into autogpt/scripts/llamafile/" diff --git a/docs/content/AutoGPT/setup/index.md b/docs/content/AutoGPT/setup/index.md index ec749055c78..becc808ed2e 100644 --- a/docs/content/AutoGPT/setup/index.md +++ b/docs/content/AutoGPT/setup/index.md @@ -209,15 +209,12 @@ and guaranteed data privacy. For optimal results, you may also have to add some logic to adapt the message format, like `LlamafileProvider._adapt_chat_messages_for_mistral_instruct(..)` does. -1. Run the llamafile setup script: +1. Run the llamafile serve script: ```shell - ./scripts/llamafile/setup.sh - ``` - -2. Start the llamafile server: - ```shell - ./scripts/llamafile/serve.sh + python3 ./scripts/llamafile/serve.py ``` + The first time this is run, it will download a file containing the model + runtime, + which may take a while and a few gigabytes of disk space. 3. 
In `.env`, set `SMART_LLM`/`FAST_LLM` or both to `mistral-7b-instruct-v0.2` From e1bcb036f92ec6626604a52eecfb99909090606a Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 14 Jun 2024 12:33:24 -0700 Subject: [PATCH 16/40] fix llamafile/serve.py for Windows --- autogpt/scripts/llamafile/.gitignore | 1 + autogpt/scripts/llamafile/serve.py | 98 +++++++++++++++++++--------- 2 files changed, 68 insertions(+), 31 deletions(-) diff --git a/autogpt/scripts/llamafile/.gitignore b/autogpt/scripts/llamafile/.gitignore index 55e65ce9706..3aa496945a5 100644 --- a/autogpt/scripts/llamafile/.gitignore +++ b/autogpt/scripts/llamafile/.gitignore @@ -1,2 +1,3 @@ *.llamafile *.llamafile.exe +llamafile.exe diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index 59867dea885..9a146389342 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -12,10 +12,18 @@ import subprocess from pathlib import Path -LLAMAFILE = Path( - "mistral-7b-instruct-v0.2.Q5_K_M.llamafile" - + (".exe" if platform.system() == "Windows" else "") -) +LLAMAFILE = Path("mistral-7b-instruct-v0.2.Q5_K_M.llamafile") +LLAMAFILE_URL = f"https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/{LLAMAFILE.name}" # noqa +LLAMAFILE_EXE = Path("llamafile.exe") +LLAMAFILE_EXE_URL = "https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6" # noqa + + +def download_file(url: str, to_file: Path) -> None: + print(f"Downloading {to_file.name}...") + import urllib.request + + urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress) + print() def report_download_progress(chunk_number: int, chunk_size: int, total_size: int): @@ -30,36 +38,64 @@ def report_download_progress(chunk_number: int, chunk_size: int, total_size: int ) -def download_llamafile(): - print(f"Downloading {LLAMAFILE.name}...") - import urllib.request +if __name__ == "__main__": + # Go to autogpt/scripts/llamafile/ + os.chdir(Path(__file__).resolve().parent) - url = "https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile" # noqa + if not LLAMAFILE.is_file(): + download_file(LLAMAFILE_URL, LLAMAFILE) - urllib.request.urlretrieve(url, LLAMAFILE.name, reporthook=report_download_progress) - print() + if platform.system() != "Windows": + LLAMAFILE.chmod(0o755) + subprocess.run([LLAMAFILE, "--version"], check=True) - LLAMAFILE.chmod(0o755) - subprocess.run([LLAMAFILE, "--version"], check=True) + print( + "\n" + "NOTE: To use other models besides mistral-7b-instruct-v0.2, " + "download them into autogpt/scripts/llamafile/" + ) - print( - "\n" - "NOTE: To use other models besides mistral-7b-instruct-v0.2, " - "download them into autogpt/scripts/llamafile/" + if platform.system() != "Windows": + base_command = [LLAMAFILE] + else: + # Windows does not allow executables over 4GB, so we have to download a + # model-less llamafile.exe and extract the model weights (.gguf file) + # from the downloaded .llamafile. 
+ if not LLAMAFILE_EXE.is_file(): + download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE) + LLAMAFILE_EXE.chmod(0o755) + subprocess.run([LLAMAFILE_EXE, "--version"], check=True) + + model_file = LLAMAFILE.with_suffix(".gguf") + if not model_file.is_file(): + import zipfile + + with zipfile.ZipFile(LLAMAFILE, "r") as zip_ref: + gguf_file = next( + (file for file in zip_ref.namelist() if file.endswith(".gguf")), + None, + ) + if not gguf_file: + raise Exception("No .gguf file found in the zip file.") + + zip_ref.extract(gguf_file) + Path(gguf_file).rename(model_file) + + base_command = [LLAMAFILE_EXE, "-m", model_file] + + subprocess.run( + [ + *base_command, + "--server", + "--nobrowser", + "--ctx-size", + "0", + "--n-predict", + "1024", + ], + check=True, ) - -# Go to autogpt/scripts/llamafile/ -os.chdir(Path(__file__).resolve().parent) - -if not LLAMAFILE.is_file(): - download_llamafile() - -subprocess.run( - [LLAMAFILE, "--server", "--nobrowser", "--ctx-size", "0", "--n-predict", "1024"], - check=True, -) - -# note: --ctx-size 0 means the prompt context size will be set directly from the -# underlying model configuration. This may cause slow response times or consume -# a lot of memory. + # note: --ctx-size 0 means the prompt context size will be set directly from the + # underlying model configuration. This may cause slow response times or consume + # a lot of memory. From df3278ff1eda3d199bbd8ff2fd6b559bde2e5e25 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 14 Jun 2024 12:42:19 -0700 Subject: [PATCH 17/40] address review comment on clean_model_name in llamafile.py --- .../llm/providers/llamafile/llamafile.py | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index e5753bc45c8..d1a9643586c 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -117,23 +117,6 @@ async def get_available_models(self) -> Sequence[ChatModelInfo[LlamafileModelNam # may support multiple models, so leaving this method as-is for now. self._logger.debug(f"Retrieved models: {_models}") - # Clean up model names: - # 1. Remove file extension - # 2. Remove quantization info - # e.g. 'mistral-7b-instruct-v0.2.Q5_K_M.gguf' - # -> 'mistral-7b-instruct-v0.2' - # e.g. '/Users/kate/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf' - # -> 'mistral-7b-instruct-v0.2' - # e.g. 'llava-v1.5-7b-q4.gguf' - # -> 'llava-v1.5-7b' - def clean_model_name(model_file: str) -> str: - name_without_ext = Path(model_file).name.rsplit(".", 1)[0] - name_without_Q = re.match( - r"^[a-zA-Z0-9]+([.\-](?!([qQ]|B?F)\d{1,2})[a-zA-Z0-9]+)*", - name_without_ext, - ) - return name_without_Q.group() if name_without_Q else name_without_ext - clean_model_ids = [clean_model_name(m.id) for m in _models] self._logger.debug(f"Cleaned model IDs: {clean_model_ids}") @@ -288,6 +271,32 @@ def _parse_assistant_tool_calls( return tool_calls, parse_errors +def clean_model_name(model_file: str) -> str: + """ + Clean up model names: + 1. Remove file extension + 2. 
Remove quantization info + + Examples: + ``` + raw: 'mistral-7b-instruct-v0.2.Q5_K_M.gguf' + clean: 'mistral-7b-instruct-v0.2' + + raw: '/Users/kate/models/mistral-7b-instruct-v0.2.Q5_K_M.gguf' + clean: 'mistral-7b-instruct-v0.2' + + raw: 'llava-v1.5-7b-q4.gguf' + clean: 'llava-v1.5-7b' + ``` + """ + name_without_ext = Path(model_file).name.rsplit(".", 1)[0] + name_without_Q = re.match( + r"^[a-zA-Z0-9]+([.\-](?!([qQ]|B?F)\d{1,2})[a-zA-Z0-9]+)*", + name_without_ext, + ) + return name_without_Q.group() if name_without_Q else name_without_ext + + def _tool_calls_compat_extract_calls(response: str) -> Iterator[AssistantToolCall]: import re import uuid From 6858b22ec8f56b298afe3553a5ca4015081813bb Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 14 Jun 2024 13:23:31 -0700 Subject: [PATCH 18/40] add --llamafile and --llamafile_url options to llamafile/serve.py --- autogpt/scripts/llamafile/serve.py | 107 +++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 27 deletions(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index 9a146389342..7fb2b9c9aa7 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -11,6 +11,9 @@ import platform import subprocess from pathlib import Path +from typing import Optional + +import click LLAMAFILE = Path("mistral-7b-instruct-v0.2.Q5_K_M.llamafile") LLAMAFILE_URL = f"https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/{LLAMAFILE.name}" # noqa @@ -18,36 +21,62 @@ LLAMAFILE_EXE_URL = "https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6" # noqa -def download_file(url: str, to_file: Path) -> None: - print(f"Downloading {to_file.name}...") - import urllib.request - - urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress) - print() - - -def report_download_progress(chunk_number: int, chunk_size: int, total_size: int): - if total_size != -1: - downloaded_size = chunk_number * chunk_size - percent = min(1, downloaded_size / total_size) - bar = "#" * int(40 * percent) - print( - f"\rDownloading: [{bar:<40}] {percent:.0%}" - f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB", - end="", - ) - +@click.command() +@click.option( + "--llamafile", + type=click.Path(dir_okay=False), + help="Name of the llamafile to serve", +) +@click.option("--llamafile_url", help="Download URL for the llamafile you want to use") +def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): + if not llamafile: + if not llamafile_url: + llamafile = LLAMAFILE + else: + llamafile = Path(llamafile_url.rsplit("/", 1)[1]) + if llamafile.suffix != ".llamafile": + if not click.prompt( + "The given URL does not end with '.llamafile'. " + "Are you 100%% sure this URL will download a llamafile?", + type=bool, + ): + return + + if llamafile == LLAMAFILE and not llamafile_url: + llamafile_url = LLAMAFILE_URL + elif llamafile_url != LLAMAFILE_URL: + if not click.prompt( + click.style( + "You seem to have specified a different URL for the default model " + f"({llamafile.name}). Are you sure this is correct? 
" + "If you want to use a different model, also specify --llamafile.", + fg="yellow", + ), + type=bool, + ): + return -if __name__ == "__main__": # Go to autogpt/scripts/llamafile/ os.chdir(Path(__file__).resolve().parent) - if not LLAMAFILE.is_file(): - download_file(LLAMAFILE_URL, LLAMAFILE) + if not llamafile.is_file(): + if not llamafile_url: + click.echo( + click.style( + "Please use --lamafile_url to specify a download URL for " + f"'{llamafile.name}'. " + "This will only be necessary once, so we can download the model.", + fg="red", + ), + err=True, + ) + return + + download_file(llamafile_url, llamafile) if platform.system() != "Windows": - LLAMAFILE.chmod(0o755) - subprocess.run([LLAMAFILE, "--version"], check=True) + llamafile.chmod(0o755) + subprocess.run([llamafile, "--version"], check=True) print( "\n" @@ -56,7 +85,7 @@ def report_download_progress(chunk_number: int, chunk_size: int, total_size: int ) if platform.system() != "Windows": - base_command = [LLAMAFILE] + base_command = [llamafile] else: # Windows does not allow executables over 4GB, so we have to download a # model-less llamafile.exe and extract the model weights (.gguf file) @@ -66,11 +95,11 @@ def report_download_progress(chunk_number: int, chunk_size: int, total_size: int LLAMAFILE_EXE.chmod(0o755) subprocess.run([LLAMAFILE_EXE, "--version"], check=True) - model_file = LLAMAFILE.with_suffix(".gguf") + model_file = llamafile.with_suffix(".gguf") if not model_file.is_file(): import zipfile - with zipfile.ZipFile(LLAMAFILE, "r") as zip_ref: + with zipfile.ZipFile(llamafile, "r") as zip_ref: gguf_file = next( (file for file in zip_ref.namelist() if file.endswith(".gguf")), None, @@ -99,3 +128,27 @@ def report_download_progress(chunk_number: int, chunk_size: int, total_size: int # note: --ctx-size 0 means the prompt context size will be set directly from the # underlying model configuration. This may cause slow response times or consume # a lot of memory. + + +def download_file(url: str, to_file: Path) -> None: + print(f"Downloading {to_file.name}...") + import urllib.request + + urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress) + print() + + +def report_download_progress(chunk_number: int, chunk_size: int, total_size: int): + if total_size != -1: + downloaded_size = chunk_number * chunk_size + percent = min(1, downloaded_size / total_size) + bar = "#" * int(40 * percent) + print( + f"\rDownloading: [{bar:<40}] {percent:.0%}" + f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB", + end="", + ) + + +if __name__ == "__main__": + main() From d73a98c015777f341f9fa7aecad1b9659fb30055 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 14 Jun 2024 13:30:40 -0700 Subject: [PATCH 19/40] tweaks to llamafile/serve.py --- autogpt/scripts/llamafile/serve.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index 7fb2b9c9aa7..0faa77d8cf4 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -25,7 +25,7 @@ @click.option( "--llamafile", type=click.Path(dir_okay=False), - help="Name of the llamafile to serve", + help=f"Name of the llamafile to serve. 
Default: {LLAMAFILE.name}", ) @click.option("--llamafile_url", help="Download URL for the llamafile you want to use") def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): @@ -35,12 +35,16 @@ def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): else: llamafile = Path(llamafile_url.rsplit("/", 1)[1]) if llamafile.suffix != ".llamafile": - if not click.prompt( - "The given URL does not end with '.llamafile'. " - "Are you 100%% sure this URL will download a llamafile?", - type=bool, - ): - return + click.echo( + click.style( + "The given URL does not end with '.llamafile' -> " + "can't get filename from URL. " + "Specify the filename using --llamafile.", + fg="red", + ), + err=True, + ) + return if llamafile == LLAMAFILE and not llamafile_url: llamafile_url = LLAMAFILE_URL @@ -78,12 +82,6 @@ def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): llamafile.chmod(0o755) subprocess.run([llamafile, "--version"], check=True) - print( - "\n" - "NOTE: To use other models besides mistral-7b-instruct-v0.2, " - "download them into autogpt/scripts/llamafile/" - ) - if platform.system() != "Windows": base_command = [llamafile] else: From 4d64b45881e6e7dc808aa6deb7a66b41c05d0517 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 14 Jun 2024 13:32:47 -0700 Subject: [PATCH 20/40] address comment by Nick --- autogpt/scripts/llamafile/serve.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index 0faa77d8cf4..c24aae87b4e 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -86,29 +86,13 @@ def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): base_command = [llamafile] else: # Windows does not allow executables over 4GB, so we have to download a - # model-less llamafile.exe and extract the model weights (.gguf file) - # from the downloaded .llamafile. + # model-less llamafile.exe and run that instead. 
if not LLAMAFILE_EXE.is_file(): download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE) LLAMAFILE_EXE.chmod(0o755) subprocess.run([LLAMAFILE_EXE, "--version"], check=True) - model_file = llamafile.with_suffix(".gguf") - if not model_file.is_file(): - import zipfile - - with zipfile.ZipFile(llamafile, "r") as zip_ref: - gguf_file = next( - (file for file in zip_ref.namelist() if file.endswith(".gguf")), - None, - ) - if not gguf_file: - raise Exception("No .gguf file found in the zip file.") - - zip_ref.extract(gguf_file) - Path(gguf_file).rename(model_file) - - base_command = [LLAMAFILE_EXE, "-m", model_file] + base_command = [LLAMAFILE_EXE, "-m", llamafile] subprocess.run( [ From e5c5163bb67d743be6c7bc13f40dea84e80773b6 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 14 Jun 2024 14:03:42 -0700 Subject: [PATCH 21/40] fix llamafile/serve.py execution path error --- autogpt/scripts/llamafile/serve.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index c24aae87b4e..96b8c2b772e 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -83,16 +83,16 @@ def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): subprocess.run([llamafile, "--version"], check=True) if platform.system() != "Windows": - base_command = [llamafile] + base_command = [f"./{llamafile}" if len(llamafile.parts) == 1 else llamafile] else: # Windows does not allow executables over 4GB, so we have to download a # model-less llamafile.exe and run that instead. if not LLAMAFILE_EXE.is_file(): download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE) LLAMAFILE_EXE.chmod(0o755) - subprocess.run([LLAMAFILE_EXE, "--version"], check=True) + subprocess.run([f".\\{LLAMAFILE_EXE}", "--version"], check=True) - base_command = [LLAMAFILE_EXE, "-m", llamafile] + base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile] subprocess.run( [ From 0e081f4fb71497bf4388d6f06fb45307c8ceaa2d Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 14:22:38 -0700 Subject: [PATCH 22/40] improve debug logging messages in `LlamafileProvider.get_available_models` --- forge/forge/llm/providers/llamafile/llamafile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index d1a9643586c..012c0eea796 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -115,10 +115,10 @@ async def get_available_models(self) -> Sequence[ChatModelInfo[LlamafileModelNam # note: at the moment, llamafile only serves one model at a time (so this # list will only ever have one value). however, in the future, llamafile # may support multiple models, so leaving this method as-is for now. 
- self._logger.debug(f"Retrieved models: {_models}") + self._logger.debug(f"Retrieved llamafile models: {_models}") clean_model_ids = [clean_model_name(m.id) for m in _models] - self._logger.debug(f"Cleaned model IDs: {clean_model_ids}") + self._logger.debug(f"Cleaned llamafile model IDs: {clean_model_ids}") return [ LLAMAFILE_CHAT_MODELS[id] From 529314e42f70fa4b86511c86370c60988b40c306 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 17:24:57 -0700 Subject: [PATCH 23/40] small refactor for readability/simplicity in llamafile/serve.py --- autogpt/scripts/llamafile/serve.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index 96b8c2b772e..d19a4528a12 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -63,6 +63,8 @@ def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): # Go to autogpt/scripts/llamafile/ os.chdir(Path(__file__).resolve().parent) + on_windows = platform.system() == "Windows" + if not llamafile.is_file(): if not llamafile_url: click.echo( @@ -78,12 +80,12 @@ def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): download_file(llamafile_url, llamafile) - if platform.system() != "Windows": + if not on_windows: llamafile.chmod(0o755) subprocess.run([llamafile, "--version"], check=True) - if platform.system() != "Windows": - base_command = [f"./{llamafile}" if len(llamafile.parts) == 1 else llamafile] + if not on_windows: + base_command = [f"./{llamafile}"] else: # Windows does not allow executables over 4GB, so we have to download a # model-less llamafile.exe and run that instead. From 072e674dfdf0dbdbfdcd397fe1d3cd28f69ddd2a Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 17:42:35 -0700 Subject: [PATCH 24/40] amend docs regarding WSL --- docs/content/AutoGPT/setup/index.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/content/AutoGPT/setup/index.md b/docs/content/AutoGPT/setup/index.md index becc808ed2e..4ed63a9bd52 100644 --- a/docs/content/AutoGPT/setup/index.md +++ b/docs/content/AutoGPT/setup/index.md @@ -201,6 +201,14 @@ and guaranteed data privacy. At the moment, llamafile only serves one model at a time. This means you can not set `SMART_LLM` and `FAST_LLM` to two different llamafile models. +!!! warning + Due to the issues linked below, llamafiles don't work on WSL. To use a llamafile + with AutoGPT in WSL, you will have to run the llamafile in Windows (outside WSL). + The llamafile server should then also be available in WSL. + + * [Mozilla-Ocho/llamafile#356](https://github.com/Mozilla-Ocho/llamafile/issues/356) + * [Mozilla-Ocho/llamafile#100](https://github.com/Mozilla-Ocho/llamafile/issues/100) + !!! note These instructions will download and use `mistral-7b-instruct-v0.2.Q5_K_M.llamafile`. `mistral-7b-instruct-v0.2` is currently the only tested and supported model. 
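In practice, the configuration described in the setup docs above comes down to a few `.env` entries. A minimal sketch, assuming the default llamafile server address and the `mistral-7b-instruct-v0.2` model used throughout these patches:

```shell
# Minimal .env sketch for the llamafile setup described above (illustrative values)
SMART_LLM=mistral-7b-instruct-v0.2
FAST_LLM=mistral-7b-instruct-v0.2
# Only needed if the server is not running at the default address:
# LLAMAFILE_API_BASE=http://localhost:8080/v1
```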
From 01372d18892e70bf1cbfcc5275e09f6ef77d245f Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 19:41:28 -0700 Subject: [PATCH 25/40] add --use-gpu option to llamafile/serve.py --- autogpt/scripts/llamafile/serve.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index d19a4528a12..d17ed68a41a 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -28,7 +28,14 @@ help=f"Name of the llamafile to serve. Default: {LLAMAFILE.name}", ) @click.option("--llamafile_url", help="Download URL for the llamafile you want to use") -def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): +@click.option( + "--use-gpu", is_flag=True, help="Use an AMD or Nvidia GPU to speed up inference" +) +def main( + llamafile: Optional[Path] = None, + llamafile_url: Optional[str] = None, + use_gpu: bool = False, +): if not llamafile: if not llamafile_url: llamafile = LLAMAFILE @@ -96,6 +103,9 @@ def main(llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None): base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile] + if use_gpu: + base_command.extend(["-ngl", "9999"]) + subprocess.run( [ *base_command, From 63fe5b59a6865accae82532c11fa3e6748c772c6 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 20:39:45 -0700 Subject: [PATCH 26/40] set llamafile host to 0.0.0.0 --- autogpt/scripts/llamafile/serve.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index d17ed68a41a..7869e90ce49 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -110,6 +110,8 @@ def main( [ *base_command, "--server", + "--host", + "0.0.0.0", "--nobrowser", "--ctx-size", "0", From 271e59b0111820793dd57ef26d69a99efb34e259 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 20:40:37 -0700 Subject: [PATCH 27/40] debug llamafile init --- .../llm/providers/llamafile/llamafile.py | 10 +++++++ forge/forge/llm/providers/multi.py | 27 +++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index 012c0eea796..3c2eb15791e 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -57,6 +57,16 @@ class LlamafileCredentials(ModelProviderCredentials): default=SecretStr("http://localhost:8080/v1"), from_env="LLAMAFILE_API_BASE" ) + def get_api_access_kwargs(self) -> dict[str, str]: + return { + k: v.get_secret_value() + for k, v in { + "api_key": self.api_key, + "base_url": self.api_base, + }.items() + if v is not None + } + class LlamafileTokenizer(ModelTokenizer[int]): def __init__(self, credentials: LlamafileCredentials): diff --git a/forge/forge/llm/providers/multi.py b/forge/forge/llm/providers/multi.py index ee42c2a8446..088c7afa9af 100644 --- a/forge/forge/llm/providers/multi.py +++ b/forge/forge/llm/providers/multi.py @@ -122,33 +122,50 @@ def get_model_provider(self, model: ModelName) -> ChatModelProvider: def get_available_providers(self) -> Iterator[ChatModelProvider]: for provider_name in ModelProviderName: + self._logger.debug(f"Checking if {provider_name} is available...") try: yield self._get_provider(provider_name) - except Exception: + self._logger.debug(f"{provider_name} is available!") + except 
ValueError: pass def _get_provider(self, provider_name: ModelProviderName) -> ChatModelProvider: _provider = self._provider_instances.get(provider_name) if not _provider: Provider = self._get_provider_class(provider_name) + self._logger.debug( + f"{Provider.__name__} not yet in cache, trying to init..." + ) + settings = Provider.default_settings.copy(deep=True) settings.budget = self._budget settings.configuration.extra_request_headers.update( self._settings.configuration.extra_request_headers ) if settings.credentials is None: + credentials_field = settings.__fields__["credentials"] + Credentials = credentials_field.type_ + self._logger.debug(f"Loading {Credentials.__name__}...") try: - Credentials = settings.__fields__["credentials"].type_ settings.credentials = Credentials.from_env() except ValidationError as e: - raise ValueError( - f"{provider_name} is unavailable: can't load credentials" - ) from e + if credentials_field.required: + self._logger.debug( + f"Could not load (required) {Credentials.__name__}" + ) + raise ValueError( + f"{Provider.__name__} is unavailable: " + "can't load credentials" + ) from e + self._logger.debug( + f"Could not load {Credentials.__name__}, continuing without..." + ) self._provider_instances[provider_name] = _provider = Provider( settings=settings, logger=self._logger # type: ignore ) _provider._budget = self._budget # Object binding not preserved by Pydantic + self._logger.debug(f"Initialized {Provider.__name__}!") return _provider @classmethod From aecc3633e29a0480f4e5bb29406eb281e79a28aa Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 21:00:40 -0700 Subject: [PATCH 28/40] add --host and --port options to llamafile/serve.py --- autogpt/scripts/llamafile/serve.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index 7869e90ce49..8539fde1e13 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -28,12 +28,20 @@ help=f"Name of the llamafile to serve. 
Default: {LLAMAFILE.name}", ) @click.option("--llamafile_url", help="Download URL for the llamafile you want to use") +@click.option( + "--host", help="Specify the address for the llamafile server to listen on" +) +@click.option( + "--port", type=int, help="Specify the port for the llamafile server to listen on" +) @click.option( "--use-gpu", is_flag=True, help="Use an AMD or Nvidia GPU to speed up inference" ) def main( llamafile: Optional[Path] = None, llamafile_url: Optional[str] = None, + host: Optional[str] = None, + port: Optional[int] = None, use_gpu: bool = False, ): if not llamafile: @@ -103,6 +111,10 @@ def main( base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile] + if host: + base_command.extend(["--host", host]) + if port: + base_command.extend(["--port", str(port)]) if use_gpu: base_command.extend(["-ngl", "9999"]) @@ -110,8 +122,6 @@ def main( [ *base_command, "--server", - "--host", - "0.0.0.0", "--nobrowser", "--ctx-size", "0", From 9ee1e8f6e50749a7b7328ee2526f772e36a7f6a4 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 21:25:44 -0700 Subject: [PATCH 29/40] add instructions to run llamafiles with WSL --- docs/content/AutoGPT/setup/index.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/content/AutoGPT/setup/index.md b/docs/content/AutoGPT/setup/index.md index 4ed63a9bd52..3c7a17a6939 100644 --- a/docs/content/AutoGPT/setup/index.md +++ b/docs/content/AutoGPT/setup/index.md @@ -204,7 +204,25 @@ and guaranteed data privacy. !!! warning Due to the issues linked below, llamafiles don't work on WSL. To use a llamafile with AutoGPT in WSL, you will have to run the llamafile in Windows (outside WSL). - The llamafile server should then also be available in WSL. + +
+ Instructions + + 1. Get the `llamafile/serve.py` script through one of these two ways: + 1. Clone the AutoGPT repo somewhere in your Windows environment, + with the script located at `autogpt/scripts/llamafile/serve.py` + 2. Download just the [serve.py] script somewhere in your Windows environment + 2. Make sure you have `click` installed: `pip install click` + 3. Run `ip route | grep default | awk '{print $3}'` *inside WSL* to get the address + of the WSL host machine + 4. Run `python3 serve.py --host {WSL_HOST_ADDR}`, where `{WSL_HOST_ADDR}` + is the address you found at step 3. + If port 8080 is taken, also specify a different port using `--port {PORT}`. + 5. In WSL, set `LLAMAFILE_API_BASE=http://{WSL_HOST_ADDR}:8080/v1` in your `.env`. + 6. Follow the rest of the regular instructions below. + + [serve.py]: https://github.com/Significant-Gravitas/AutoGPT/blob/master/autogpt/scripts/llamafile/serve.py +
* [Mozilla-Ocho/llamafile#356](https://github.com/Mozilla-Ocho/llamafile/issues/356) * [Mozilla-Ocho/llamafile#100](https://github.com/Mozilla-Ocho/llamafile/issues/100) From 242753ed7bc4cb4a2abb887ae42d01774d0f5ea0 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jun 2024 21:29:23 -0700 Subject: [PATCH 30/40] add note about `--use-gpu` to the docs --- docs/content/AutoGPT/setup/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/content/AutoGPT/setup/index.md b/docs/content/AutoGPT/setup/index.md index 3c7a17a6939..1745156a1a3 100644 --- a/docs/content/AutoGPT/setup/index.md +++ b/docs/content/AutoGPT/setup/index.md @@ -242,6 +242,8 @@ and guaranteed data privacy. The first time this is run, it will download a file containing the model + runtime, which may take a while and a few gigabytes of disk space. + To force GPU acceleration, add `--use-gpu` to the command. + 3. In `.env`, set `SMART_LLM`/`FAST_LLM` or both to `mistral-7b-instruct-v0.2` 4. If the server is running on different address than `http://localhost:8080/v1`, From e8905d105f65ab73331f91c03d297ac1c64c3552 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 24 Jun 2024 11:45:57 -0700 Subject: [PATCH 31/40] Convert messages with content blocks to plain text messages Fixes #7242 --- forge/forge/llm/providers/llamafile/llamafile.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index 3c2eb15791e..61300f5d972 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -208,6 +208,16 @@ def _get_chat_completion_args( # seed should be set from config completion_kwargs["seed"] = 0 + # Convert all messages with content blocks to simple text messages + for message in messages: + if isinstance(content := message.get("content"), list): + message["content"] = "\n\n".join( + b["text"] + for b in content + if b["type"] == "text" + # FIXME: add support for images through image_data completion kwarg + ) + return messages, completion_kwargs, parse_kwargs def _adapt_chat_messages_for_mistral_instruct( From 3621be0ff1f8c98dcff8bfa0d04c389d64015538 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 24 Jun 2024 11:54:34 -0700 Subject: [PATCH 32/40] add LLAMAFILE_API_BASE to .env.template --- autogpt/.env.template | 3 +++ 1 file changed, 3 insertions(+) diff --git a/autogpt/.env.template b/autogpt/.env.template index 9d458b90a55..8d4894988c5 100644 --- a/autogpt/.env.template +++ b/autogpt/.env.template @@ -11,6 +11,9 @@ ## GROQ_API_KEY - Groq API Key (Example: gsk_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx) # GROQ_API_KEY= +## LLAMAFILE_API_BASE - Llamafile API base URL +# LLAMAFILE_API_BASE=http://localhost:8080/v1 + ## TELEMETRY_OPT_IN - Share telemetry on errors and other issues with the AutoGPT team, e.g. through Sentry. ## This helps us to spot and solve problems earlier & faster. 
(Default: DISABLED) # TELEMETRY_OPT_IN=true From f33c2d21a9eec904f60731e8c13d44365814c191 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 24 Jun 2024 12:26:26 -0700 Subject: [PATCH 33/40] resolve TODO regarding `seed` parameter --- .../forge/llm/providers/llamafile/llamafile.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index 61300f5d972..c6d291a4cba 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -51,6 +51,11 @@ class LlamafileModelName(str, enum.Enum): LLAMAFILE_EMBEDDING_MODELS = {} +class LlamafileConfiguration(ModelProviderConfiguration): + # TODO: implement 'seed' across forge.llm.providers + seed: Optional[int] = None + + class LlamafileCredentials(ModelProviderCredentials): api_key = SecretStr("sk-no-key-required") api_base: SecretStr = UserConfigurable( # type: ignore @@ -114,11 +119,12 @@ class LlamafileProvider( "Provides chat completion and embedding services " "through a llamafile instance" ), - configuration=ModelProviderConfiguration(), + configuration=LlamafileConfiguration(), ) - _settings: LlamafileSettings # type: ignore - _credentials: LlamafileCredentials # type: ignore + _settings: LlamafileSettings + _credentials: LlamafileCredentials + _configuration: LlamafileConfiguration async def get_available_models(self) -> Sequence[ChatModelInfo[LlamafileModelName]]: _models = (await self._client.models.list()).data @@ -203,10 +209,8 @@ def _get_chat_completion_args( if model == LlamafileModelName.MISTRAL_7B_INSTRUCT: messages = self._adapt_chat_messages_for_mistral_instruct(messages) - if "seed" not in kwargs: - # FIXME: temporarily hard-coded for reproducibility, instead the - # seed should be set from config - completion_kwargs["seed"] = 0 + if "seed" not in kwargs and self._configuration.seed is not None: + completion_kwargs["seed"] = self._configuration.seed # Convert all messages with content blocks to simple text messages for message in messages: From 75e0301524acbe1fbfa85fd69633b8d73b9424a1 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 24 Jun 2024 12:26:56 -0700 Subject: [PATCH 34/40] minor refactor --- forge/forge/llm/providers/llamafile/llamafile.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index c6d291a4cba..4afc58657c0 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -73,6 +73,11 @@ def get_api_access_kwargs(self) -> dict[str, str]: } +class LlamafileSettings(ModelProviderSettings): + configuration: LlamafileConfiguration + credentials: Optional[LlamafileCredentials] + + class LlamafileTokenizer(ModelTokenizer[int]): def __init__(self, credentials: LlamafileCredentials): self._credentials = credentials @@ -100,10 +105,6 @@ def decode(self, tokens: list[int]) -> str: return response.json()["content"] -class LlamafileSettings(ModelProviderSettings): - credentials: Optional[LlamafileCredentials] # type: ignore - - class LlamafileProvider( BaseOpenAIChatProvider[LlamafileModelName, LlamafileSettings], # TODO: add and test support for embedding models From deb7d1119d0f023b546114a578242a85d9dc13dd Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 24 Jun 2024 15:31:40 -0700 Subject: [PATCH 35/40] add reference to 
llamafile documentation --- docs/content/AutoGPT/setup/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/content/AutoGPT/setup/index.md b/docs/content/AutoGPT/setup/index.md index 1745156a1a3..880bd77f6bc 100644 --- a/docs/content/AutoGPT/setup/index.md +++ b/docs/content/AutoGPT/setup/index.md @@ -197,6 +197,8 @@ If you don't know which to choose, you can safely go with OpenAI*. With llamafile you can run models locally, which means no need to set up billing, and guaranteed data privacy. +For more information and in-depth documentation, check out the [llamafile documentation]. + !!! warning At the moment, llamafile only serves one model at a time. This means you can not set `SMART_LLM` and `FAST_LLM` to two different llamafile models. @@ -249,4 +251,5 @@ and guaranteed data privacy. 4. If the server is running on different address than `http://localhost:8080/v1`, set `LLAMAFILE_API_BASE` in `.env` to the right base URL +[llamafile documentation]: https://github.com/Mozilla-Ocho/llamafile#readme [forge/llamafile.py]: https://github.com/Significant-Gravitas/AutoGPT/blob/master/forge/forge/llm/providers/llamafile/llamafile.py From 74923f108d629d1fc28a378cd2450ad0b1e9cda9 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 24 Jun 2024 15:41:38 -0700 Subject: [PATCH 36/40] fix type errors --- forge/forge/llm/providers/llamafile/llamafile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index 4afc58657c0..a3b775c65ba 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -74,8 +74,8 @@ def get_api_access_kwargs(self) -> dict[str, str]: class LlamafileSettings(ModelProviderSettings): - configuration: LlamafileConfiguration - credentials: Optional[LlamafileCredentials] + configuration: LlamafileConfiguration # type: ignore + credentials: Optional[LlamafileCredentials] = None # type: ignore class LlamafileTokenizer(ModelTokenizer[int]): From a303f9d3bae6c720f152028a62c735978e7b492b Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 3 Jul 2024 15:12:06 -0600 Subject: [PATCH 37/40] fix path issue in scripts/llamafile/serve.py --- autogpt/scripts/llamafile/serve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autogpt/scripts/llamafile/serve.py b/autogpt/scripts/llamafile/serve.py index 8539fde1e13..9e2c4de6711 100755 --- a/autogpt/scripts/llamafile/serve.py +++ b/autogpt/scripts/llamafile/serve.py @@ -24,7 +24,7 @@ @click.command() @click.option( "--llamafile", - type=click.Path(dir_okay=False), + type=click.Path(dir_okay=False, path_type=Path), help=f"Name of the llamafile to serve. 
Default: {LLAMAFILE.name}", ) @click.option("--llamafile_url", help="Download URL for the llamafile you want to use") @@ -44,6 +44,7 @@ def main( port: Optional[int] = None, use_gpu: bool = False, ): + print(f"type(llamafile) = {type(llamafile)}") if not llamafile: if not llamafile_url: llamafile = LLAMAFILE From 30445be1188d5fc14966ced413788ee506640aa2 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 3 Jul 2024 15:12:25 -0600 Subject: [PATCH 38/40] docs: add `LLAMAFILE_API_BASE` to AutoGPT/configuration/options.md --- docs/content/AutoGPT/configuration/options.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/content/AutoGPT/configuration/options.md b/docs/content/AutoGPT/configuration/options.md index 49e111330f0..ed0f70885d7 100644 --- a/docs/content/AutoGPT/configuration/options.md +++ b/docs/content/AutoGPT/configuration/options.md @@ -22,6 +22,7 @@ You can set configuration variables via the `.env` file. If you don't have a `.e - `GROQ_API_KEY`: Set this if you want to use Groq models with AutoGPT - `HUGGINGFACE_API_TOKEN`: HuggingFace API, to be used for both image generation and audio to text. Optional. - `HUGGINGFACE_IMAGE_MODEL`: HuggingFace model to use for image generation. Default: CompVis/stable-diffusion-v1-4 +- `LLAMAFILE_API_BASE`: Llamafile API base URL. Default: `http://localhost:8080/v1` - `OPENAI_API_KEY`: Set this if you want to use OpenAI models; [OpenAI API Key](https://platform.openai.com/account/api-keys). - `OPENAI_ORGANIZATION`: Organization ID in OpenAI. Optional. - `PLAIN_OUTPUT`: Plain output, which disables the spinner. Default: False From c9501bddff0938b3ec99f1efabf979b8c33e0bc5 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 3 Jul 2024 15:29:21 -0600 Subject: [PATCH 39/40] fix Pydantic attribute override error --- forge/forge/llm/providers/llamafile/llamafile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index a3b775c65ba..864ed40dc08 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -57,7 +57,7 @@ class LlamafileConfiguration(ModelProviderConfiguration): class LlamafileCredentials(ModelProviderCredentials): - api_key = SecretStr("sk-no-key-required") + api_key: SecretStr = SecretStr("sk-no-key-required") api_base: SecretStr = UserConfigurable( # type: ignore default=SecretStr("http://localhost:8080/v1"), from_env="LLAMAFILE_API_BASE" ) From b7da031a141d3d5702e3f6c56b5903829273d721 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 3 Jul 2024 22:15:52 -0600 Subject: [PATCH 40/40] fix Pydantic v2 errors --- forge/forge/llm/providers/llamafile/llamafile.py | 2 +- forge/forge/llm/providers/multi.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/forge/forge/llm/providers/llamafile/llamafile.py b/forge/forge/llm/providers/llamafile/llamafile.py index 864ed40dc08..7eb7afaafb1 100644 --- a/forge/forge/llm/providers/llamafile/llamafile.py +++ b/forge/forge/llm/providers/llamafile/llamafile.py @@ -57,7 +57,7 @@ class LlamafileConfiguration(ModelProviderConfiguration): class LlamafileCredentials(ModelProviderCredentials): - api_key: SecretStr = SecretStr("sk-no-key-required") + api_key: Optional[SecretStr] = SecretStr("sk-no-key-required") api_base: SecretStr = UserConfigurable( # type: ignore default=SecretStr("http://localhost:8080/v1"), from_env="LLAMAFILE_API_BASE" ) diff --git 
a/forge/forge/llm/providers/multi.py b/forge/forge/llm/providers/multi.py
index 488c8c62279..e0b08352299 100644
--- a/forge/forge/llm/providers/multi.py
+++ b/forge/forge/llm/providers/multi.py
@@ -151,7 +151,7 @@ def _get_provider(self, provider_name: ModelProviderName) -> ChatModelProvider:
                 try:
                     settings.credentials = Credentials.from_env()
                 except ValidationError as e:
-                    if credentials_field.required:
+                    if credentials_field.is_required():
                         self._logger.debug(
                             f"Could not load (required) {Credentials.__name__}"
                         )
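Taken together, the command-line options added to `scripts/llamafile/serve.py` over the course of this series (`--llamafile`, `--llamafile_url`, `--host`, `--port`, `--use-gpu`) can be combined as in the sketch below; the model file name, host, and port shown are illustrative values, not requirements:

```shell
# Illustrative invocation of the serve script with the options added above
python3 ./scripts/llamafile/serve.py \
    --llamafile mistral-7b-instruct-v0.2.Q5_K_M.llamafile \
    --host 0.0.0.0 \
    --port 8080 \
    --use-gpu
```

Run with no arguments, the script simply downloads (if needed) and serves the default `mistral-7b-instruct-v0.2.Q5_K_M.llamafile`.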