Commit d0f0f52

Change validator to respect local/remote inference. Also only push to PyPI on release tags.

1 parent f63ccee

File tree

3 files changed: +40 −33 lines changed

.github/workflows/publish_pypi.yml (+3 −2)
app_inference_spec.py (+2 −28)
validator/main.py (+35 −3)

.github/workflows/publish_pypi.yml

Lines changed: 3 additions & 2 deletions

@@ -3,8 +3,9 @@ name: Publish to Guardrails Hub
 on:
   workflow_dispatch:
   push:
-    branches:
-      - main
+    # Publish when new releases are tagged.
+    tags:
+      - '*'
 
 jobs:
   setup:
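
With the branch filter replaced by a tag filter ('*' matches any tag name), the publish job now runs only when a tag is pushed, for example with git tag 0.1.0 && git push origin 0.1.0; ordinary pushes to main no longer trigger a release, and the workflow_dispatch trigger still allows a manual run.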

app_inference_spec.py

Lines changed: 2 additions & 28 deletions

@@ -2,10 +2,8 @@
 # Forked from spec:
 # github.com/guardrails-ai/models-host/tree/main/ray#adding-new-inference-endpoints
 import os
-from typing import Optional
 from logging import getLogger
 
-from fastapi import HTTPException
 from pydantic import BaseModel
 from models_host.base_inference_spec import BaseInferenceSpec
 
@@ -23,13 +21,10 @@
 
 class InputRequest(BaseModel):
     message: str
-    threshold: Optional[float] = None
 
 
 class OutputResponse(BaseModel):
-    classification: str
     score: float
-    is_jailbreak: bool
 
 
 # Using same nomenclature as in Sagemaker classes
@@ -69,30 +64,9 @@ def process_request(self, input_request: InputRequest):
         # raise HTTPException(status_code=400, detail="Invalid input format")
         args = (message,)
         kwargs = {}
-        if input_request.threshold is not None:
-            kwargs["threshold"] = input_request.threshold
-            if not 0.0 <= input_request.threshold <= 1.0:
-                raise HTTPException(
-                    status_code=400,
-                    detail=f"Threshold must be between 0.0 and 1.0. "
-                           f"Got {input_request.threshold}"
-                )
         return args, kwargs
 
-    def infer(self, message: str, threshold: Optional[float] = None) -> OutputResponse:
-        if threshold is None:
-            threshold = 0.81
-
-        score = self.model.predict_jailbreak([message,])[0]
-        if score > threshold:
-            classification = "jailbreak"
-            is_jailbreak = True
-        else:
-            classification = "safe"
-            is_jailbreak = False
-
+    def infer(self, message: str) -> OutputResponse:
         return OutputResponse(
-            classification=classification,
-            score=score,
-            is_jailbreak=is_jailbreak,
+            score=self.model.predict_jailbreak([message,])[0],
         )
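
The net effect: the endpoint no longer accepts a threshold or returns a classification; it returns only the raw score, and the jailbreak/safe decision moves to the caller. A minimal sketch of that client-side decision, assuming the caller keeps its own cutoff (0.81 mirrors the default the deleted server-side infer() hard-coded); this helper is illustrative, not part of the commit:

from typing import List

def classify(scores: List[float], threshold: float = 0.81) -> List[bool]:
    # Re-derive the removed is_jailbreak flag from the raw scores the
    # endpoint now returns. Illustrative only; not code from this commit.
    return [score > threshold for score in scores]

# classify([0.93, 0.12]) -> [True, False]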

validator/main.py

Lines changed: 35 additions & 3 deletions

@@ -1,5 +1,6 @@
+import json
 import math
-from typing import Callable, List, Optional, Union
+from typing import Callable, List, Optional, Union, Any
 
 import torch
 from torch.nn import functional as F
@@ -65,8 +66,9 @@ def __init__(
         device: str = "cpu",
         on_fail: Optional[Callable] = None,
         model_path_override: str = "",
+        **kwargs,
     ):
-        super().__init__(on_fail=on_fail)
+        super().__init__(on_fail=on_fail, **kwargs)
         self.device = device
         self.threshold = threshold
 
@@ -271,7 +273,9 @@ def validate(
         if isinstance(value, str):
             value = [value, ]
 
-        scores = self.predict_jailbreak(value)
+        # _inference is to support local/remote. It is equivalent to this:
+        # scores = self.predict_jailbreak(value)
+        scores = self._inference(value)
 
         failed_prompts = list()
         failed_scores = list()  # To help people calibrate their thresholds.
@@ -289,3 +293,31 @@ def validate(
                 error_message=failure_message
             )
         return PassResult()
+
+    # The rest of these methods are made for validator compatibility and may have
+    # some strange properties.
+
+    def _inference_local(self, model_input: List[str]) -> Any:
+        return self.predict_jailbreak(model_input)
+
+    def _inference_remote(self, model_input: List[str]) -> Any:
+        # This needs to be kept in-sync with app_inference_spec.
+        request_body = {
+            "inputs": [
+                {
+                    "name": "message",
+                    "shape": [len(model_input)],
+                    "data": model_input,
+                    "datatype": "BYTES"
+                }
+            ]
+        }
+        response = self._hub_inference_request(
+            json.dumps(request_body),
+            self.validation_endpoint
+        )
+        if not response or "outputs" not in response:
+            raise ValueError("Invalid response from remote inference", response)
+
+        data = [output["score"] for output in response["outputs"]]
+        return data
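
Note that validate() calls _inference rather than either of these methods directly. _inference is inherited from the guardrails base Validator and routes to _inference_local or _inference_remote depending on how the validator was constructed, which is why __init__ now forwards **kwargs to super().__init__. A rough sketch of that dispatch, with the caveat that the real implementation lives in the guardrails package and the use_local attribute name is an assumption here:

def _inference(self, model_input: Any) -> Any:
    # Approximation of the base-class routing that _inference_local and
    # _inference_remote plug into; use_local is an assumed attribute,
    # not code from this commit.
    if self.use_local:
        return self._inference_local(model_input)
    return self._inference_remote(model_input)

On the remote path, the hub response is expected to look like {"outputs": [{"score": 0.93}, ...]}, one entry per input message; anything else raises the ValueError above.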
