Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Hugging Face Inference Endpoints #33

Merged
merged 15 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ecologits/data/models.csv
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ anthropic,claude-3-sonnet-20240229,800,100;400,model_architecture_not_released,
anthropic,claude-3-haiku-20240307,300,75;150,model_architecture_not_released,
anthropic,claude-2.1,130,130,model_architecture_not_released,https://docs.google.com/spreadsheets/d/1O5KVQW1Hx5ZAkcg8AIRjbQLQzx2wVaLl0SqUu-ir9Fs/edit?usp=sharing
anthropic,claude-2.0,130,130,model_architecture_not_released,https://docs.google.com/spreadsheets/d/1O5KVQW1Hx5ZAkcg8AIRjbQLQzx2wVaLl0SqUu-ir9Fs/edit?usp=sharing
anthropic,claude-instant-1.2,20;70,20;70,model_architecture_not_released,
huggingface_hub,HuggingFaceH4/zephyr-7b-beta,7.24,7.24,model_architecture_not_released,
8 changes: 8 additions & 0 deletions ecologits/ecologits.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def init_instruments() -> None:
init_openai_instrumentor()
init_anthropic_instrumentor()
init_mistralai_instrumentor()
init_huggingface_instrumentor()


def init_openai_instrumentor() -> None:
Expand All @@ -42,3 +43,10 @@ def init_mistralai_instrumentor() -> None:

instrumentor = MistralAIInstrumentor()
instrumentor.instrument()

def init_huggingface_instrumentor() -> None:
    """Install the Hugging Face Hub tracer if `huggingface_hub` is installed.

    Mirrors the other `init_*_instrumentor` helpers: the provider package is
    optional, so we probe for it with `find_spec` instead of importing it
    directly, and only then import (and apply) the instrumentor.
    """
    # Guard keeps ecologits importable when the optional dependency is absent.
    if importlib.util.find_spec("huggingface_hub") is not None:
        from ecologits.tracers.huggingface_tracer import HuggingfaceInstrumentor

        instrumentor = HuggingfaceInstrumentor()
        instrumentor.instrument()
1 change: 1 addition & 0 deletions ecologits/model_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class Providers(Enum):
anthropic = "anthropic"
mistralai = "mistralai"
openai = "openai"
huggingface_hub = "huggingface_hub"


class Warnings(Enum):
Expand Down
172 changes: 172 additions & 0 deletions ecologits/tracers/huggingface_tracer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import time
from dataclasses import asdict, dataclass
from typing import Any, AsyncIterable, Callable, Iterable, Union

from wrapt import wrap_function_wrapper

from ecologits.impacts.models import Impacts
from ecologits.tracers.utils import compute_llm_impacts

try:
    import tiktoken
    from huggingface_hub import AsyncInferenceClient, InferenceClient
    from huggingface_hub import ChatCompletionOutput as _ChatCompletionOutput
    from huggingface_hub import ChatCompletionStreamOutput as _ChatCompletionStreamOutput
except ImportError:
    # Optional-dependency fallback: keep this module importable when
    # huggingface_hub (or tiktoken) is not installed.
    #
    # BUG FIX: bind the *class* `object`, not an `object()` instance.  The
    # `ChatCompletionOutput`/`ChatCompletionStreamOutput` dataclasses below
    # subclass `_ChatCompletionOutput`/`_ChatCompletionStreamOutput`; with an
    # instance bound here, `class ChatCompletionOutput(_ChatCompletionOutput)`
    # raises `TypeError` at import time — defeating the fallback entirely.
    InferenceClient = object
    AsyncInferenceClient = object
    _ChatCompletionOutput = object
    _ChatCompletionStreamOutput = object


# Provider key used to look models up in the ecologits model repository.
PROVIDER = "huggingface_hub"


@dataclass
class ChatCompletionOutput(_ChatCompletionOutput):
    """huggingface_hub chat-completion response extended with its impacts.

    Built from `asdict()` of the original response plus the computed
    `impacts`, so every upstream field is preserved.
    """

    # Environmental impacts estimated for this request.
    impacts: Impacts

@dataclass
class ChatCompletionStreamOutput(_ChatCompletionStreamOutput):
    """huggingface_hub streaming chunk extended with its impacts.

    Each yielded chunk carries the impacts computed from the tokens and
    latency observed so far in the stream.
    """

    # Environmental impacts estimated up to (and including) this chunk.
    impacts: Impacts


def huggingface_chat_wrapper(
    wrapped: Callable,
    instance: InferenceClient,
    args: Any,
    kwargs: Any
) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]:
    """Dispatch a `chat_completion` call to the streaming or blocking tracer.

    Streaming is selected exactly as the client does: by the truthiness of
    the `stream` keyword argument (default: not streaming).
    """
    handler = (
        huggingface_chat_wrapper_stream
        if kwargs.get("stream", False)
        else huggingface_chat_wrapper_non_stream
    )
    return handler(wrapped, instance, args, kwargs)


def huggingface_chat_wrapper_non_stream(
    wrapped: Callable,
    instance: InferenceClient,
    args: Any,
    kwargs: Any
) -> ChatCompletionOutput:
    """Trace a blocking chat completion and attach its impacts.

    Returns a `ChatCompletionOutput` carrying the original response fields
    plus `impacts`, or the untouched response when no impacts could be
    computed (e.g. unknown model).
    """
    start = time.perf_counter()
    response = wrapped(*args, **kwargs)
    latency = time.perf_counter() - start
    # The endpoint does not report token usage, so approximate the output
    # token count with tiktoken's cl100k_base encoding.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    completion_text = response.choices[0].message.content
    output_tokens = len(tokenizer.encode(completion_text))
    impacts = compute_llm_impacts(
        provider=PROVIDER,
        model_name=instance.model,
        output_token_count=output_tokens,
        request_latency=latency,
    )
    if impacts is None:
        return response
    return ChatCompletionOutput(**asdict(response), impacts=impacts)


def huggingface_chat_wrapper_stream(
    wrapped: Callable,
    instance: InferenceClient,
    args: Any,
    kwargs: Any
) -> Iterable[ChatCompletionStreamOutput]:
    """Trace a streaming chat completion, yielding impact-annotated chunks.

    Each chunk counts as one output token; impacts are recomputed per chunk
    from the running token count and elapsed time, so the last chunk carries
    the totals for the whole request.
    """
    start = time.perf_counter()
    emitted = 0
    for chunk in wrapped(*args, **kwargs):
        emitted += 1
        elapsed = time.perf_counter() - start
        impacts = compute_llm_impacts(
            provider=PROVIDER,
            model_name=instance.model,
            output_token_count=emitted,
            request_latency=elapsed,
        )
        if impacts is None:
            # Unknown model: pass the chunk through unmodified.
            yield chunk
        else:
            yield ChatCompletionStreamOutput(**asdict(chunk), impacts=impacts)


async def huggingface_async_chat_wrapper(
    wrapped: Callable,
    instance: AsyncInferenceClient,
    args: Any,
    kwargs: Any
) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]:
    """Async counterpart of `huggingface_chat_wrapper`.

    The streaming branch returns the async generator directly (it is consumed
    by the caller); the blocking branch is awaited here.
    """
    if not kwargs.get("stream", False):
        return await huggingface_async_chat_wrapper_non_stream(
            wrapped, instance, args, kwargs
        )
    return huggingface_async_chat_wrapper_stream(wrapped, instance, args, kwargs)


async def huggingface_async_chat_wrapper_non_stream(
    wrapped: Callable,
    instance: AsyncInferenceClient,
    args: Any,
    kwargs: Any
) -> ChatCompletionOutput:
    """Trace an awaited chat completion and attach its impacts.

    Async twin of `huggingface_chat_wrapper_non_stream`: same token
    approximation, same pass-through when impacts cannot be computed.
    """
    start = time.perf_counter()
    response = await wrapped(*args, **kwargs)
    latency = time.perf_counter() - start
    # No usage info from the endpoint — estimate tokens via tiktoken.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    completion_text = response.choices[0].message.content
    output_tokens = len(tokenizer.encode(completion_text))
    impacts = compute_llm_impacts(
        provider=PROVIDER,
        model_name=instance.model,
        output_token_count=output_tokens,
        request_latency=latency,
    )
    if impacts is None:
        return response
    return ChatCompletionOutput(**asdict(response), impacts=impacts)


async def huggingface_async_chat_wrapper_stream(
    wrapped: Callable,
    instance: AsyncInferenceClient,
    args: Any,
    kwargs: Any
) -> AsyncIterable[ChatCompletionStreamOutput]:
    """Trace an async streaming chat completion, yielding annotated chunks.

    Async twin of `huggingface_chat_wrapper_stream`: one token per chunk,
    impacts recomputed from the running totals on every yield.
    """
    start = time.perf_counter()
    emitted = 0
    stream = await wrapped(*args, **kwargs)
    async for chunk in stream:
        emitted += 1
        elapsed = time.perf_counter() - start
        impacts = compute_llm_impacts(
            provider=PROVIDER,
            model_name=instance.model,
            output_token_count=emitted,
            request_latency=elapsed,
        )
        if impacts is None:
            # Unknown model: pass the chunk through unmodified.
            yield chunk
        else:
            yield ChatCompletionStreamOutput(**asdict(chunk), impacts=impacts)


class HuggingfaceInstrumentor:
    """Patches huggingface_hub chat-completion entry points with the tracers.

    Both the sync (`InferenceClient`) and async (`AsyncInferenceClient`)
    `chat_completion` methods are wrapped so every call reports impacts.
    """

    def __init__(self) -> None:
        # Each entry names the module, the dotted attribute to patch, and
        # the wrapper that wrapt will install around it.
        self.wrapped_methods = [
            {
                "module": "huggingface_hub.inference._client",
                "name": "InferenceClient.chat_completion",
                "wrapper": huggingface_chat_wrapper,
            },
            {
                "module": "huggingface_hub.inference._generated._async_client",
                "name": "AsyncInferenceClient.chat_completion",
                "wrapper": huggingface_async_chat_wrapper,
            },
        ]

    def instrument(self) -> None:
        """Apply every registered wrapper via wrapt."""
        for method in self.wrapped_methods:
            wrap_function_wrapper(method["module"], method["name"], method["wrapper"])
Loading
Loading