SeldonIO · luohua13 · Dec 24, 2024
diff --git a/runtimes/huggingface/mlserver_huggingface/runtime.py b/runtimes/huggingface/mlserver_huggingface/runtime.py
@@ -14,6 +14,10 @@
 from .codecs import HuggingfaceRequestCodec
 from .metadata import METADATA
 
+from prometheus_client import (
+    Counter,
+)
+
 
 class HuggingFaceRuntime(MLModel):
     """Runtime class for specific Huggingface models"""
@@ -22,6 +26,18 @@ def __init__(self, settings: ModelSettings):
         self.hf_settings = get_huggingface_settings(settings)
         super().__init__(settings)
 
+        self._ModelInputTokens = Counter(
+            "model_input_tokens",
+            "Model input tokens count",
+            ["model", "version"],
+        )
+
+        self._ModelOutputTokens = Counter(
+            "model_output_tokens",
+            "Model output tokens count",
+            ["model", "version"],
+        )
+
     async def load(self) -> bool:
         # Loading & caching pipeline in asyncio loop to avoid blocking
         logger.info(f"Loading model for task '{self.hf_settings.task_name}'...")
@@ -45,8 +61,34 @@ async def predict(self, payload: InferenceRequest) -> InferenceResponse:
         array_inputs = kwargs.pop("array_inputs", [])
         if array_inputs:
             args = [list(array_inputs)] + args
+
+        # Count input tokens by tokenizing each input text.
+        if hasattr(self._model, "tokenizer") and args:
+            input_texts = args[0] if isinstance(args[0], list) else [args[0]]
+            input_tokens_count = sum(
+                len(self._model.tokenizer(text, return_tensors="pt")["input_ids"][0]) for text in input_texts
+            )
+        else:
+            input_tokens_count = 0
+
         prediction = self._model(*args, **kwargs)
 
+        try:
+            # Count output tokens by tokenizing each prediction's "generated_text".
+            if hasattr(self._model, "tokenizer") and prediction:
+                output_tokens_count = sum(
+                    len(self._model.tokenizer(text["generated_text"], return_tensors="pt")["input_ids"][0]) for text in prediction
+                )
+            else:
+                output_tokens_count = 0
+
+            # Add both counts to the Prometheus counters, labelled by model name and version.
+            labels = dict(model=self.name, version=self.version)
+            self._ModelInputTokens.labels(**labels).inc(input_tokens_count)
+            self._ModelOutputTokens.labels(**labels).inc(output_tokens_count)
+        except Exception as e:
+            logger.error(f"got error: '{e}'")
+
         return self.encode_response(
             payload=prediction, default_codec=HuggingfaceRequestCodec
         )