Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Hugging Face Inference Endpoints #33

Merged
merged 15 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ecologits/data/models.csv
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ anthropic,claude-3-sonnet-20240229,800,100;400,model_architecture_not_released,
anthropic,claude-3-haiku-20240307,300,75;150,model_architecture_not_released,
anthropic,claude-2.1,130,130,model_architecture_not_released,https://docs.google.com/spreadsheets/d/1O5KVQW1Hx5ZAkcg8AIRjbQLQzx2wVaLl0SqUu-ir9Fs/edit?usp=sharing
anthropic,claude-2.0,130,130,model_architecture_not_released,https://docs.google.com/spreadsheets/d/1O5KVQW1Hx5ZAkcg8AIRjbQLQzx2wVaLl0SqUu-ir9Fs/edit?usp=sharing
anthropic,claude-instant-1.2,20;70,20;70,model_architecture_not_released,
huggingface_hub,HuggingFaceH4/zephyr-7b-beta,7.24,7.24,model_architecture_not_released,
8 changes: 8 additions & 0 deletions ecologits/ecologits.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def init_instruments() -> None:
init_openai_instrumentor()
init_anthropic_instrumentor()
init_mistralai_instrumentor()
init_huggingface_instrumentor()


def init_openai_instrumentor() -> None:
Expand All @@ -42,3 +43,10 @@ def init_mistralai_instrumentor() -> None:

instrumentor = MistralAIInstrumentor()
instrumentor.instrument()

def init_huggingface_instrumentor() -> None:
    """Install the Hugging Face Hub tracer if `huggingface_hub` is installed.

    Mirrors the other `init_*_instrumentor` helpers: the provider package is
    optional, so we probe for it with `find_spec` instead of importing it
    directly, and only then import (and apply) the instrumentor.
    """
    # Guard keeps ecologits importable when the optional dependency is absent.
    if importlib.util.find_spec("huggingface_hub") is not None:
        from ecologits.tracers.huggingface_tracer import HuggingfaceInstrumentor

        instrumentor = HuggingfaceInstrumentor()
        instrumentor.instrument()
1 change: 1 addition & 0 deletions ecologits/model_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class Providers(Enum):
anthropic = "anthropic"
mistralai = "mistralai"
openai = "openai"
huggingface_hub = "huggingface_hub"


class Warnings(Enum):
Expand Down
172 changes: 172 additions & 0 deletions ecologits/tracers/huggingface_tracer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import time
from dataclasses import asdict, dataclass
from typing import Any, AsyncIterable, Callable, Iterable, Union

from wrapt import wrap_function_wrapper

from ecologits.impacts.models import Impacts
from ecologits.tracers.utils import compute_llm_impacts

try:
    import tiktoken
    from huggingface_hub import AsyncInferenceClient, InferenceClient
    from huggingface_hub import ChatCompletionOutput as _ChatCompletionOutput
    from huggingface_hub import ChatCompletionStreamOutput as _ChatCompletionStreamOutput
except ImportError:
    # Optional-dependency fallback: keep this module importable when
    # huggingface_hub (or tiktoken) is not installed.
    #
    # BUG FIX: bind the *class* `object`, not an `object()` instance.  The
    # `ChatCompletionOutput`/`ChatCompletionStreamOutput` dataclasses below
    # subclass `_ChatCompletionOutput`/`_ChatCompletionStreamOutput`; with an
    # instance bound here, `class ChatCompletionOutput(_ChatCompletionOutput)`
    # raises `TypeError` at import time — defeating the fallback entirely.
    InferenceClient = object
    AsyncInferenceClient = object
    _ChatCompletionOutput = object
    _ChatCompletionStreamOutput = object


# Provider key used to look models up in the ecologits model repository.
PROVIDER = "huggingface_hub"


@dataclass
class ChatCompletionOutput(_ChatCompletionOutput):
    """huggingface_hub chat-completion response extended with its impacts.

    Built from `asdict()` of the original response plus the computed
    `impacts`, so every upstream field is preserved.
    """

    # Environmental impacts estimated for this request.
    impacts: Impacts

@dataclass
class ChatCompletionStreamOutput(_ChatCompletionStreamOutput):
    """huggingface_hub streaming chunk extended with its impacts.

    Each yielded chunk carries the impacts computed from the tokens and
    latency observed so far in the stream.
    """

    # Environmental impacts estimated up to (and including) this chunk.
    impacts: Impacts


def huggingface_chat_wrapper(
    wrapped: Callable,
    instance: InferenceClient,
    args: Any,
    kwargs: Any
) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]:
    """Dispatch a `chat_completion` call to the streaming or blocking tracer.

    Streaming is selected exactly as the client does: by the truthiness of
    the `stream` keyword argument (default: not streaming).
    """
    handler = (
        huggingface_chat_wrapper_stream
        if kwargs.get("stream", False)
        else huggingface_chat_wrapper_non_stream
    )
    return handler(wrapped, instance, args, kwargs)


def huggingface_chat_wrapper_non_stream(
    wrapped: Callable,
    instance: InferenceClient,
    args: Any,
    kwargs: Any
) -> ChatCompletionOutput:
    """Trace a blocking chat completion and attach its impacts.

    Returns a `ChatCompletionOutput` carrying the original response fields
    plus `impacts`, or the untouched response when no impacts could be
    computed (e.g. unknown model).
    """
    start = time.perf_counter()
    response = wrapped(*args, **kwargs)
    latency = time.perf_counter() - start
    # The endpoint does not report token usage, so approximate the output
    # token count with tiktoken's cl100k_base encoding.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    completion_text = response.choices[0].message.content
    output_tokens = len(tokenizer.encode(completion_text))
    impacts = compute_llm_impacts(
        provider=PROVIDER,
        model_name=instance.model,
        output_token_count=output_tokens,
        request_latency=latency,
    )
    if impacts is None:
        return response
    return ChatCompletionOutput(**asdict(response), impacts=impacts)


def huggingface_chat_wrapper_stream(
    wrapped: Callable,
    instance: InferenceClient,
    args: Any,
    kwargs: Any
) -> Iterable[ChatCompletionStreamOutput]:
    """Trace a streaming chat completion, yielding impact-annotated chunks.

    Each chunk counts as one output token; impacts are recomputed per chunk
    from the running token count and elapsed time, so the last chunk carries
    the totals for the whole request.
    """
    start = time.perf_counter()
    emitted = 0
    for chunk in wrapped(*args, **kwargs):
        emitted += 1
        elapsed = time.perf_counter() - start
        impacts = compute_llm_impacts(
            provider=PROVIDER,
            model_name=instance.model,
            output_token_count=emitted,
            request_latency=elapsed,
        )
        if impacts is None:
            # Unknown model: pass the chunk through unmodified.
            yield chunk
        else:
            yield ChatCompletionStreamOutput(**asdict(chunk), impacts=impacts)


async def huggingface_async_chat_wrapper(
    wrapped: Callable,
    instance: AsyncInferenceClient,
    args: Any,
    kwargs: Any
) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]:
    """Async counterpart of `huggingface_chat_wrapper`.

    The streaming branch returns the async generator directly (it is consumed
    by the caller); the blocking branch is awaited here.
    """
    if not kwargs.get("stream", False):
        return await huggingface_async_chat_wrapper_non_stream(
            wrapped, instance, args, kwargs
        )
    return huggingface_async_chat_wrapper_stream(wrapped, instance, args, kwargs)


async def huggingface_async_chat_wrapper_non_stream(
    wrapped: Callable,
    instance: AsyncInferenceClient,
    args: Any,
    kwargs: Any
) -> ChatCompletionOutput:
    """Trace an awaited chat completion and attach its impacts.

    Async twin of `huggingface_chat_wrapper_non_stream`: same token
    approximation, same pass-through when impacts cannot be computed.
    """
    start = time.perf_counter()
    response = await wrapped(*args, **kwargs)
    latency = time.perf_counter() - start
    # No usage info from the endpoint — estimate tokens via tiktoken.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    completion_text = response.choices[0].message.content
    output_tokens = len(tokenizer.encode(completion_text))
    impacts = compute_llm_impacts(
        provider=PROVIDER,
        model_name=instance.model,
        output_token_count=output_tokens,
        request_latency=latency,
    )
    if impacts is None:
        return response
    return ChatCompletionOutput(**asdict(response), impacts=impacts)


async def huggingface_async_chat_wrapper_stream(
    wrapped: Callable,
    instance: AsyncInferenceClient,
    args: Any,
    kwargs: Any
) -> AsyncIterable[ChatCompletionStreamOutput]:
    """Trace an async streaming chat completion, yielding annotated chunks.

    Async twin of `huggingface_chat_wrapper_stream`: one token per chunk,
    impacts recomputed from the running totals on every yield.
    """
    start = time.perf_counter()
    emitted = 0
    stream = await wrapped(*args, **kwargs)
    async for chunk in stream:
        emitted += 1
        elapsed = time.perf_counter() - start
        impacts = compute_llm_impacts(
            provider=PROVIDER,
            model_name=instance.model,
            output_token_count=emitted,
            request_latency=elapsed,
        )
        if impacts is None:
            # Unknown model: pass the chunk through unmodified.
            yield chunk
        else:
            yield ChatCompletionStreamOutput(**asdict(chunk), impacts=impacts)


class HuggingfaceInstrumentor:
    """Patches huggingface_hub chat-completion entry points with the tracers.

    Both the sync (`InferenceClient`) and async (`AsyncInferenceClient`)
    `chat_completion` methods are wrapped so every call reports impacts.
    """

    def __init__(self) -> None:
        # Each entry names the module, the dotted attribute to patch, and
        # the wrapper that wrapt will install around it.
        self.wrapped_methods = [
            {
                "module": "huggingface_hub.inference._client",
                "name": "InferenceClient.chat_completion",
                "wrapper": huggingface_chat_wrapper,
            },
            {
                "module": "huggingface_hub.inference._generated._async_client",
                "name": "AsyncInferenceClient.chat_completion",
                "wrapper": huggingface_async_chat_wrapper,
            },
        ]

    def instrument(self) -> None:
        """Apply every registered wrapper via wrapt."""
        for method in self.wrapped_methods:
            wrap_function_wrapper(method["module"], method["name"], method["wrapper"])
Loading
Loading