From 096089d9161b148da54c8c7324ceddbfd700f696 Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Tue, 15 Jul 2025 14:04:54 +0200 Subject: [PATCH 1/9] open-api-evals-script --- scripts/generate_openapi.py | 621 ++++++++++++++++++++++++++++++++++++ 1 file changed, 621 insertions(+) create mode 100644 scripts/generate_openapi.py diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py new file mode 100644 index 0000000..0949ee1 --- /dev/null +++ b/scripts/generate_openapi.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python3 +""" +Python script to generate OpenAPI JSON from evaluators.generated.ts +""" + +import json +import re +import sys +from pathlib import Path +from typing import Dict, Any, List, Optional + + +def parse_typescript_evaluators(file_path: str) -> Dict[str, Any]: + """ + Parse the TypeScript evaluators file to extract evaluator definitions. + """ + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + + evaluators = {} + + # Find all evaluator definitions using regex + # Look for patterns like "evaluator/id": { ... } + evaluator_pattern = r'"([^"]+)":\s*{' + + # Find all matches + matches = list(re.finditer(evaluator_pattern, content)) + print(f"Found {len(matches)} evaluator matches") + + for i, match in enumerate(matches): + evaluator_id = match.group(1) + start_pos = match.end() - 1 # Position of the opening brace + + # Find the matching closing brace + brace_count = 0 + end_pos = start_pos + + for j, char in enumerate(content[start_pos:], start_pos): + if char == "{": + brace_count += 1 + elif char == "}": + brace_count -= 1 + if brace_count == 0: + end_pos = j + 1 + break + + evaluator_content = content[start_pos:end_pos] + + # Parse the evaluator content + evaluator = parse_evaluator_content(evaluator_id, evaluator_content) + evaluators[evaluator_id] = evaluator + + print(f"Processing evaluator: {evaluator_id}") + + return evaluators + + +def parse_evaluator_content(evaluator_id: str, content: str) -> Dict[str, Any]: + """Parse individual evaluator content.""" + + evaluator = { + "name": evaluator_id, + "description": "", + "category": "other", + "isGuardrail": False, + "requiredFields": [], + "optionalFields": [], + "settings": {}, + } + + # Extract name + name_match = re.search(r"name:\s*`([^`]+)`", content, re.DOTALL) + if name_match: + evaluator["name"] = name_match.group(1).strip() + + # Extract description + desc_match = re.search(r"description:\s*`([^`]+)`", content, re.DOTALL) + if desc_match: + evaluator["description"] = desc_match.group(1).strip() + + # Extract category + cat_match = re.search(r'category:\s*"([^"]+)"', content) + if cat_match: + evaluator["category"] = cat_match.group(1) + + # Extract isGuardrail + guard_match = re.search(r"isGuardrail:\s*(true|false)", content) + if guard_match: + evaluator["isGuardrail"] = guard_match.group(1) == "true" + + # Extract required fields + req_match = re.search(r"requiredFields:\s*\[([^\]]*)\]", content) + if req_match: + req_content = req_match.group(1) + evaluator["requiredFields"] = [ + field.strip().strip('"') for field in re.findall(r'"([^"]+)"', req_content) + ] + + # Extract optional fields + opt_match = re.search(r"optionalFields:\s*\[([^\]]*)\]", content) + if opt_match: + opt_content = opt_match.group(1) + evaluator["optionalFields"] = [ + field.strip().strip('"') for field in re.findall(r'"([^"]+)"', opt_content) + ] + + # Extract settings + settings_match = re.search( + r"settings:\s*{([^}]+(?:{[^}]*}[^}]*)*)}", content, re.DOTALL + ) + if settings_match: + settings_content = 
settings_match.group(1) + evaluator["settings"] = parse_settings(settings_content) + + return evaluator + + +def parse_settings(settings_content: str) -> Dict[str, Any]: + """Parse settings object.""" + settings = {} + + # Find all setting definitions + setting_pattern = r"(\w+):\s*{([^}]+(?:{[^}]*}[^}]*)*)}" + + for setting_match in re.finditer(setting_pattern, settings_content, re.DOTALL): + setting_name = setting_match.group(1) + setting_content = setting_match.group(2) + + # Extract description + desc_match = re.search(r'description:\s*"([^"]+)"', setting_content) + description = desc_match.group(1) if desc_match else f"{setting_name} setting" + + # Extract default value + default_value = extract_default_value(setting_content) + + settings[setting_name] = {"description": description, "default": default_value} + + return settings + + +def extract_default_value(content: str) -> Any: + """Extract default value from setting content.""" + + # Look for default: followed by various value types + default_match = re.search(r"default:\s*(.+?)(?:,|$)", content, re.DOTALL) + if not default_match: + return None + + value_str = default_match.group(1).strip() + + # Handle different value types + if value_str.startswith('"') and value_str.endswith('"'): + # String value + return value_str[1:-1] + elif value_str.startswith("`") and value_str.endswith("`"): + # Backtick string + return value_str[1:-1] + elif value_str == "true": + return True + elif value_str == "false": + return False + elif value_str.isdigit(): + return int(value_str) + elif value_str.replace(".", "").isdigit(): + return float(value_str) + elif value_str.startswith("[") and value_str.endswith("]"): + # Array value - simplified parsing + return [] + elif value_str.startswith("{") and value_str.endswith("}"): + # Object value - simplified parsing + return {} + else: + return value_str + + +def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: + """Generate OpenAPI 3.1.0 schema from evaluators.""" + + schema = { + "openapi": "3.1.0", + "info": { + "title": "LangEvals API", + "version": "1.0.0", + "description": "API for LangEvals evaluators", + }, + "servers": [ + { + "url": "https://app.langwatch.ai/api/evaluations", + "description": "Production server", + } + ], + "security": [{"api_key": []}], + "paths": {}, + "components": { + "schemas": { + "EvaluationRequest": { + "type": "object", + "properties": { + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate", + }, + "output": { + "type": "string", + "description": "The output text to evaluate", + }, + "contexts": { + "type": "array", + "items": {"type": "string"}, + "description": "Context information for evaluation", + }, + "expected_output": { + "type": "string", + "description": "Expected output for comparison", + }, + "expected_contexts": { + "type": "array", + "items": {"type": "string"}, + "description": "Expected context information", + }, + "conversation": { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["user", "assistant", "system"], + }, + "content": {"type": "string"}, + }, + "required": ["role", "content"], + }, + "description": "Conversation history", + }, + }, + }, + "settings": { + "type": "object", + "description": "Evaluator settings", + }, + }, + "required": ["data"], + }, + "EvaluationEntry": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to 
evaluate", + }, + "output": { + "type": "string", + "description": "The output text to evaluate", + }, + "contexts": { + "type": "array", + "items": {"type": "string"}, + "description": "Context information for evaluation", + }, + "expected_output": { + "type": "string", + "description": "Expected output for comparison", + }, + "expected_contexts": { + "type": "array", + "items": {"type": "string"}, + "description": "Expected context information", + }, + "conversation": { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["user", "assistant", "system"], + }, + "content": {"type": "string"}, + }, + "required": ["role", "content"], + }, + "description": "Conversation history", + }, + }, + }, + "EvaluationResult": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": ["processed", "skipped", "error"], + }, + "score": {"type": "number", "description": "Evaluation score"}, + "passed": { + "type": "boolean", + "description": "Whether the evaluation passed", + }, + "label": {"type": "string", "description": "Evaluation label"}, + "details": { + "type": "string", + "description": "Additional details about the evaluation", + }, + "cost": {"$ref": "#/components/schemas/Money"}, + "raw_response": { + "type": "object", + "description": "Raw response from the evaluator", + }, + "error_type": { + "type": "string", + "description": "Type of error if status is 'error'", + }, + "traceback": { + "type": "array", + "items": {"type": "string"}, + "description": "Error traceback if status is 'error'", + }, + }, + "required": ["status"], + }, + "Money": { + "type": "object", + "properties": { + "currency": {"type": "string"}, + "amount": {"type": "number"}, + }, + "required": ["currency", "amount"], + }, + }, + "securitySchemes": { + "api_key": {"type": "apiKey", "in": "header", "name": "X-Auth-Token"} + }, + }, + } + + # Generate paths for each evaluator + for evaluator_id, evaluator in evaluators.items(): + path_key = f"/{evaluator_id.replace('.', '/')}/evaluate" + + # Create evaluator-specific data schema + data_schema = {"type": "object", "properties": {}} + + # Add all possible fields to properties + all_fields = [ + "input", + "output", + "contexts", + "expected_output", + "expected_contexts", + "conversation", + ] + required_fields = [] + + for field in all_fields: + if field in evaluator.get("requiredFields", []): + required_fields.append(field) + + if field in evaluator.get("requiredFields", []) or field in evaluator.get( + "optionalFields", [] + ): + if field == "input": + data_schema["properties"]["input"] = { + "type": "string", + "description": "The input text to evaluate", + } + elif field == "output": + data_schema["properties"]["output"] = { + "type": "string", + "description": "The output text to evaluate", + } + elif field == "contexts": + data_schema["properties"]["contexts"] = { + "type": "array", + "items": {"type": "string"}, + "description": "Context information for evaluation", + } + elif field == "expected_output": + data_schema["properties"]["expected_output"] = { + "type": "string", + "description": "Expected output for comparison", + } + elif field == "expected_contexts": + data_schema["properties"]["expected_contexts"] = { + "type": "array", + "items": {"type": "string"}, + "description": "Expected context information", + } + elif field == "conversation": + data_schema["properties"]["conversation"] = { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": 
"string", + "enum": ["user", "assistant", "system"], + }, + "content": {"type": "string"}, + }, + "required": ["role", "content"], + }, + "description": "Conversation history", + } + + # Add required fields if any + if required_fields: + data_schema["required"] = required_fields + + # Create evaluator-specific request schema + request_schema_name = ( + f"{evaluator_id.replace('.', '_').replace('/', '_')}Request" + ) + schema["components"]["schemas"][request_schema_name] = { + "type": "object", + "properties": {"data": data_schema}, + "required": ["data"], + } + + # Create settings schema for this evaluator + settings_schema = {"type": "object", "properties": {}} + + # Only process settings if the evaluator has settings + if evaluator.get("settings"): + for setting_key, setting in evaluator["settings"].items(): + property_def = { + "description": setting.get("description", f"{setting_key} setting") + } + + default_value = setting.get("default") + if isinstance(default_value, str): + property_def["type"] = "string" + property_def["default"] = default_value + elif isinstance(default_value, (int, float)): + property_def["type"] = "number" + property_def["default"] = default_value + elif isinstance(default_value, bool): + property_def["type"] = "boolean" + property_def["default"] = default_value + elif isinstance(default_value, list): + property_def["type"] = "array" + if default_value and isinstance(default_value[0], str): + property_def["items"] = {"type": "string"} + else: + property_def["items"] = {"type": "object"} + property_def["default"] = default_value + elif isinstance(default_value, dict): + property_def["type"] = "object" + property_def["default"] = default_value + + settings_schema["properties"][setting_key] = property_def + + # Add settings schema to components only if there are settings + if settings_schema["properties"]: + settings_schema_name = ( + f"{evaluator_id.replace('.', '_').replace('/', '_')}Settings" + ) + schema["components"]["schemas"][settings_schema_name] = settings_schema + + # Create the path with settings + schema["paths"][path_key] = { + "post": { + "summary": evaluator["name"], + "description": evaluator["description"], + "operationId": f"{evaluator_id.replace('.', '_').replace('/', '_')}_evaluate", + "requestBody": { + "content": { + "application/json": { + "schema": { + "allOf": [ + { + "$ref": f"#/components/schemas/{request_schema_name}" + }, + { + "type": "object", + "properties": { + "settings": { + "$ref": f"#/components/schemas/{settings_schema_name}" + } + }, + }, + ] + } + } + }, + "required": True, + }, + "responses": { + "200": { + "description": "Successful evaluation", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluationResult" + }, + } + } + }, + }, + "400": { + "description": "Bad request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": {"detail": {"type": "string"}}, + } + } + }, + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": {"detail": {"type": "string"}}, + } + } + }, + }, + }, + } + } + else: + # Create the path without settings for evaluators with no settings + schema["paths"][path_key] = { + "post": { + "summary": evaluator["name"], + "description": evaluator["description"], + "operationId": f"{evaluator_id.replace('.', '_').replace('/', '_')}_evaluate", + "requestBody": { + "content": { + "application/json": { + "schema": { 
+ "$ref": f"#/components/schemas/{request_schema_name}" + } + } + }, + "required": True, + }, + "responses": { + "200": { + "description": "Successful evaluation", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluationResult" + }, + } + } + }, + }, + "400": { + "description": "Bad request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": {"detail": {"type": "string"}}, + } + } + }, + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": {"detail": {"type": "string"}}, + } + } + }, + }, + }, + } + } + + return schema + + +def main(): + """Main function to generate OpenAPI schema.""" + try: + # Find the evaluators.generated.ts file + script_dir = Path(__file__).parent + project_root = script_dir.parent + evaluators_file = project_root / "ts-integration" / "evaluators.generated.ts" + + if not evaluators_file.exists(): + print(f"Error: Could not find evaluators file at {evaluators_file}") + sys.exit(1) + + print(f"Reading evaluators from: {evaluators_file}") + evaluators = parse_typescript_evaluators(str(evaluators_file)) + + print(f"Found {len(evaluators)} evaluators") + schema = generate_openapi_schema(evaluators) + + # Write the schema to a file + output_path = script_dir / "openapi.json" + with open(output_path, "w", encoding="utf-8") as f: + json.dump(schema, f, indent=2, ensure_ascii=False) + + print(f"OpenAPI schema generated successfully at: {output_path}") + print(f"Generated {len(evaluators)} evaluator endpoints") + + except Exception as error: + print(f"Error generating OpenAPI schema: {error}") + sys.exit(1) + + +if __name__ == "__main__": + main() From 5fb265c7fbc41cf20ebbfa775a819be1951e419b Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Wed, 16 Jul 2025 09:01:08 +0200 Subject: [PATCH 2/9] Update script --- scripts/generate_openapi.py | 189 ++++++++++++++++++++++++++++++++---- 1 file changed, 168 insertions(+), 21 deletions(-) diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index 0949ee1..f79176a 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -19,12 +19,38 @@ def parse_typescript_evaluators(file_path: str) -> Dict[str, Any]: evaluators = {} + # Find the AVAILABLE_EVALUATORS object using a more robust approach + # First find the start of the object + start_pattern = r"export const AVAILABLE_EVALUATORS:\s*{[^}]*}\s*=\s*{" + start_match = re.search(start_pattern, content) + + if not start_match: + print("Could not find AVAILABLE_EVALUATORS object") + return evaluators + + start_pos = start_match.end() - 1 # Position of the opening brace + + # Find the matching closing brace + brace_count = 0 + end_pos = start_pos + + for j, char in enumerate(content[start_pos:], start_pos): + if char == "{": + brace_count += 1 + elif char == "}": + brace_count -= 1 + if brace_count == 0: + end_pos = j + 1 + break + + evaluators_content = content[start_pos:end_pos] + # Find all evaluator definitions using regex # Look for patterns like "evaluator/id": { ... 
} evaluator_pattern = r'"([^"]+)":\s*{' # Find all matches - matches = list(re.finditer(evaluator_pattern, content)) + matches = list(re.finditer(evaluator_pattern, evaluators_content)) print(f"Found {len(matches)} evaluator matches") for i, match in enumerate(matches): @@ -35,7 +61,7 @@ def parse_typescript_evaluators(file_path: str) -> Dict[str, Any]: brace_count = 0 end_pos = start_pos - for j, char in enumerate(content[start_pos:], start_pos): + for j, char in enumerate(evaluators_content[start_pos:], start_pos): if char == "{": brace_count += 1 elif char == "}": @@ -44,7 +70,7 @@ def parse_typescript_evaluators(file_path: str) -> Dict[str, Any]: end_pos = j + 1 break - evaluator_content = content[start_pos:end_pos] + evaluator_content = evaluators_content[start_pos:end_pos] # Parse the evaluator content evaluator = parse_evaluator_content(evaluator_id, evaluator_content) @@ -52,6 +78,10 @@ def parse_typescript_evaluators(file_path: str) -> Dict[str, Any]: print(f"Processing evaluator: {evaluator_id}") + # Debug: Print content for competitor_blocklist + if evaluator_id == "langevals/competitor_blocklist": + print(f"DEBUG: Content for {evaluator_id}: {evaluator_content[:500]}") + return evaluators @@ -104,13 +134,31 @@ def parse_evaluator_content(evaluator_id: str, content: str) -> Dict[str, Any]: field.strip().strip('"') for field in re.findall(r'"([^"]+)"', opt_content) ] - # Extract settings - settings_match = re.search( - r"settings:\s*{([^}]+(?:{[^}]*}[^}]*)*)}", content, re.DOTALL - ) - if settings_match: - settings_content = settings_match.group(1) - evaluator["settings"] = parse_settings(settings_content) + # Extract settings using a more robust approach + settings_start = content.find("settings:") + if settings_start != -1: + # Find the opening brace after settings: + brace_start = content.find("{", settings_start) + if brace_start != -1: + # Find the matching closing brace + brace_count = 0 + settings_end = brace_start + + for i, char in enumerate(content[brace_start:], brace_start): + if char == "{": + brace_count += 1 + elif char == "}": + brace_count -= 1 + if brace_count == 0: + settings_end = i + 1 + break + + settings_content = content[brace_start + 1 : settings_end - 1] + evaluator["settings"] = parse_settings(settings_content) + else: + print(f"DEBUG: No opening brace found for settings in {evaluator_id}") + else: + print(f"DEBUG: No settings found for {evaluator_id}") return evaluator @@ -119,12 +167,28 @@ def parse_settings(settings_content: str) -> Dict[str, Any]: """Parse settings object.""" settings = {} - # Find all setting definitions - setting_pattern = r"(\w+):\s*{([^}]+(?:{[^}]*}[^}]*)*)}" + # Find all setting definitions using a more robust approach + # Look for patterns like "setting_name: { ... 
}" with proper brace matching + setting_pattern = r"(\w+):\s*{" for setting_match in re.finditer(setting_pattern, settings_content, re.DOTALL): setting_name = setting_match.group(1) - setting_content = setting_match.group(2) + start_pos = setting_match.end() - 1 # Position of the opening brace + + # Find the matching closing brace + brace_count = 0 + end_pos = start_pos + + for i, char in enumerate(settings_content[start_pos:], start_pos): + if char == "{": + brace_count += 1 + elif char == "}": + brace_count -= 1 + if brace_count == 0: + end_pos = i + 1 + break + + setting_content = settings_content[start_pos + 1 : end_pos - 1] # Extract description desc_match = re.search(r'description:\s*"([^"]+)"', setting_content) @@ -133,6 +197,15 @@ def parse_settings(settings_content: str) -> Dict[str, Any]: # Extract default value default_value = extract_default_value(setting_content) + # Debug output for competitor_blocklist + if setting_name == "competitors": + print(f"DEBUG: Setting content for competitors: {setting_content}") + print(f"DEBUG: Extracted default value: {default_value}") + print(f"DEBUG: Type of default value: {type(default_value)}") + print( + f"DEBUG: Content contains 'competitors': {'competitors' in setting_content}" + ) + settings[setting_name] = {"description": description, "default": default_value} return settings @@ -141,13 +214,58 @@ def parse_settings(settings_content: str) -> Dict[str, Any]: def extract_default_value(content: str) -> Any: """Extract default value from setting content.""" + # Debug output (only for competitors field) + if "competitors" in content: + print(f"DEBUG: extract_default_value called with content: {content}") + # Look for default: followed by various value types + # Use a more robust pattern that handles arrays and objects default_match = re.search(r"default:\s*(.+?)(?:,|$)", content, re.DOTALL) if not default_match: return None value_str = default_match.group(1).strip() + # Debug output (only for competitors field) + if "competitors" in content: + print(f"DEBUG: Raw regex match: {default_match.group(0)}") + print(f"DEBUG: Captured value: {value_str}") + + # If it's an array or object, try to find the complete value + if value_str.startswith("[") or value_str.startswith("{"): + # Find the complete array/object by looking in the original content + # Find the position of the opening bracket in the original content + default_start = content.find("default:") + if default_start != -1: + # Find the opening bracket after "default:" + bracket_start = content.find("[", default_start) + if bracket_start != -1: + # Find the matching closing bracket + bracket_count = 0 + complete_value = "" + + for i, char in enumerate(content[bracket_start:], bracket_start): + complete_value += char + if char == "[": + bracket_count += 1 + elif char == "]": + bracket_count -= 1 + if bracket_count == 0: + break + + value_str = complete_value + + # Debug output (only for competitors field) + if "competitors" in content: + print(f"DEBUG: Original value_str: {default_match.group(1).strip()}") + print(f"DEBUG: Complete value_str: {value_str}") + else: + # Debug output (only for competitors field) + if "competitors" in content: + print(f"DEBUG: Not an array/object, value_str: {value_str}") + print(f"DEBUG: value_str starts with '[': {value_str.startswith('[')}") + print(f"DEBUG: value_str starts with '{{': {value_str.startswith('{')}") + # Handle different value types if value_str.startswith('"') and value_str.endswith('"'): # String value @@ -164,11 +282,17 @@ def 
extract_default_value(content: str) -> Any: elif value_str.replace(".", "").isdigit(): return float(value_str) elif value_str.startswith("[") and value_str.endswith("]"): - # Array value - simplified parsing - return [] + # Array value - try to parse as JSON + try: + return json.loads(value_str) + except: + return [] elif value_str.startswith("{") and value_str.endswith("}"): - # Object value - simplified parsing - return {} + # Object value - try to parse as JSON + try: + return json.loads(value_str) + except: + return {} else: return value_str @@ -413,12 +537,24 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: request_schema_name = ( f"{evaluator_id.replace('.', '_').replace('/', '_')}Request" ) - schema["components"]["schemas"][request_schema_name] = { + request_schema = { "type": "object", "properties": {"data": data_schema}, "required": ["data"], } + # Add settings property to request schema if evaluator has settings + if evaluator.get("settings"): + print(f"Adding settings to {evaluator_id}") + request_schema["properties"]["settings"] = { + "type": "object", + "description": "Evaluator settings", + } + else: + print(f"No settings found for {evaluator_id}") + + schema["components"]["schemas"][request_schema_name] = request_schema + # Create settings schema for this evaluator settings_schema = {"type": "object", "properties": {}} @@ -441,10 +577,21 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: property_def["default"] = default_value elif isinstance(default_value, list): property_def["type"] = "array" - if default_value and isinstance(default_value[0], str): - property_def["items"] = {"type": "string"} + # Infer item type for arrays + if default_value: + first_item = default_value[0] + if isinstance(first_item, str): + property_def["items"] = {"type": "string"} + elif isinstance(first_item, bool): + property_def["items"] = {"type": "boolean"} + elif isinstance(first_item, (int, float)): + property_def["items"] = {"type": "number"} + elif isinstance(first_item, dict): + property_def["items"] = {"type": "object"} + else: + property_def["items"] = {"type": "string"} else: - property_def["items"] = {"type": "object"} + property_def["items"] = {"type": "string"} property_def["default"] = default_value elif isinstance(default_value, dict): property_def["type"] = "object" From 015607e1f849261e9fe53e22b42c6c623cf7e8ae Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Wed, 16 Jul 2025 09:46:13 +0200 Subject: [PATCH 3/9] Update generate_openapi.py --- scripts/generate_openapi.py | 70 +++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index f79176a..0c3e045 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -211,6 +211,21 @@ def parse_settings(settings_content: str) -> Dict[str, Any]: return settings +def typescript_to_json(ts_str: str) -> str: + """Convert a TypeScript-style object/array string to valid JSON.""" + # Remove newlines and excessive spaces for easier regex + s = ts_str.strip() + # Remove trailing commas before closing brackets/braces + s = re.sub(r",\s*([}\]])", r"\1", s) + # Quote property names (unquoted keys) + s = re.sub(r"([,{]\s*)([A-Za-z0-9_]+)\s*:", r'\1"\2":', s) + # Quote string values (only those not already quoted) + s = re.sub(r":\s*([A-Za-z0-9_\-/\.]+)", r': "\1"', s) + # Replace single quotes with double quotes + s = s.replace("'", '"') + return s + + def 
extract_default_value(content: str) -> Any: """Extract default value from setting content.""" @@ -218,6 +233,12 @@ def extract_default_value(content: str) -> Any: if "competitors" in content: print(f"DEBUG: extract_default_value called with content: {content}") + # Debug output for categories field + if "categories" in content: + print( + f"DEBUG: extract_default_value called with content for categories: {content}" + ) + # Look for default: followed by various value types # Use a more robust pattern that handles arrays and objects default_match = re.search(r"default:\s*(.+?)(?:,|$)", content, re.DOTALL) @@ -237,18 +258,24 @@ def extract_default_value(content: str) -> Any: # Find the position of the opening bracket in the original content default_start = content.find("default:") if default_start != -1: - # Find the opening bracket after "default:" - bracket_start = content.find("[", default_start) + bracket_start = -1 + if value_str.startswith("["): + bracket_start = content.find("[", default_start) + elif value_str.startswith("{"): + bracket_start = content.find("{", default_start) + if bracket_start != -1: # Find the matching closing bracket bracket_count = 0 complete_value = "" + open_char = content[bracket_start] + close_char = "]" if open_char == "[" else "}" for i, char in enumerate(content[bracket_start:], bracket_start): complete_value += char - if char == "[": + if char == open_char: bracket_count += 1 - elif char == "]": + elif char == close_char: bracket_count -= 1 if bracket_count == 0: break @@ -259,6 +286,13 @@ def extract_default_value(content: str) -> Any: if "competitors" in content: print(f"DEBUG: Original value_str: {default_match.group(1).strip()}") print(f"DEBUG: Complete value_str: {value_str}") + + # Debug output for categories field + if "categories" in content: + print( + f"DEBUG: Original value_str for categories: {default_match.group(1).strip()}" + ) + print(f"DEBUG: Complete value_str for categories: {value_str}") else: # Debug output (only for competitors field) if "competitors" in content: @@ -266,6 +300,12 @@ def extract_default_value(content: str) -> Any: print(f"DEBUG: value_str starts with '[': {value_str.startswith('[')}") print(f"DEBUG: value_str starts with '{{': {value_str.startswith('{')}") + # Debug output for categories field + if "categories" in content: + print(f"DEBUG: Not an array/object, value_str for categories: {value_str}") + print(f"DEBUG: value_str starts with '[': {value_str.startswith('[')}") + print(f"DEBUG: value_str starts with '{{': {value_str.startswith('{')}") + # Handle different value types if value_str.startswith('"') and value_str.endswith('"'): # String value @@ -286,13 +326,23 @@ def extract_default_value(content: str) -> Any: try: return json.loads(value_str) except: - return [] + # Try to convert TypeScript array to JSON + try: + json_str = typescript_to_json(value_str) + return json.loads(json_str) + except: + return [] elif value_str.startswith("{") and value_str.endswith("}"): # Object value - try to parse as JSON try: return json.loads(value_str) except: - return {} + # Try to convert TypeScript object to JSON + try: + json_str = typescript_to_json(value_str) + return json.loads(json_str) + except: + return {} else: return value_str @@ -566,15 +616,15 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: } default_value = setting.get("default") - if isinstance(default_value, str): + if isinstance(default_value, bool): + property_def["type"] = "boolean" + property_def["default"] = default_value + 
elif isinstance(default_value, str): property_def["type"] = "string" property_def["default"] = default_value elif isinstance(default_value, (int, float)): property_def["type"] = "number" property_def["default"] = default_value - elif isinstance(default_value, bool): - property_def["type"] = "boolean" - property_def["default"] = default_value elif isinstance(default_value, list): property_def["type"] = "array" # Infer item type for arrays From fe305149b9e966017451e145a31f71771e41071f Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Wed, 16 Jul 2025 10:13:44 +0200 Subject: [PATCH 4/9] remove debugs --- scripts/generate_openapi.py | 54 +------------------------------------ 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index 0c3e045..3f9a944 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -78,10 +78,6 @@ def parse_typescript_evaluators(file_path: str) -> Dict[str, Any]: print(f"Processing evaluator: {evaluator_id}") - # Debug: Print content for competitor_blocklist - if evaluator_id == "langevals/competitor_blocklist": - print(f"DEBUG: Content for {evaluator_id}: {evaluator_content[:500]}") - return evaluators @@ -197,15 +193,6 @@ def parse_settings(settings_content: str) -> Dict[str, Any]: # Extract default value default_value = extract_default_value(setting_content) - # Debug output for competitor_blocklist - if setting_name == "competitors": - print(f"DEBUG: Setting content for competitors: {setting_content}") - print(f"DEBUG: Extracted default value: {default_value}") - print(f"DEBUG: Type of default value: {type(default_value)}") - print( - f"DEBUG: Content contains 'competitors': {'competitors' in setting_content}" - ) - settings[setting_name] = {"description": description, "default": default_value} return settings @@ -229,16 +216,6 @@ def typescript_to_json(ts_str: str) -> str: def extract_default_value(content: str) -> Any: """Extract default value from setting content.""" - # Debug output (only for competitors field) - if "competitors" in content: - print(f"DEBUG: extract_default_value called with content: {content}") - - # Debug output for categories field - if "categories" in content: - print( - f"DEBUG: extract_default_value called with content for categories: {content}" - ) - # Look for default: followed by various value types # Use a more robust pattern that handles arrays and objects default_match = re.search(r"default:\s*(.+?)(?:,|$)", content, re.DOTALL) @@ -247,12 +224,7 @@ def extract_default_value(content: str) -> Any: value_str = default_match.group(1).strip() - # Debug output (only for competitors field) - if "competitors" in content: - print(f"DEBUG: Raw regex match: {default_match.group(0)}") - print(f"DEBUG: Captured value: {value_str}") - - # If it's an array or object, try to find the complete value + # If it's an array or object, try to find the complete value if value_str.startswith("[") or value_str.startswith("{"): # Find the complete array/object by looking in the original content # Find the position of the opening bracket in the original content @@ -282,30 +254,6 @@ def extract_default_value(content: str) -> Any: value_str = complete_value - # Debug output (only for competitors field) - if "competitors" in content: - print(f"DEBUG: Original value_str: {default_match.group(1).strip()}") - print(f"DEBUG: Complete value_str: {value_str}") - - # Debug output for categories field - if "categories" in content: - print( - f"DEBUG: Original value_str for categories: 
{default_match.group(1).strip()}" - ) - print(f"DEBUG: Complete value_str for categories: {value_str}") - else: - # Debug output (only for competitors field) - if "competitors" in content: - print(f"DEBUG: Not an array/object, value_str: {value_str}") - print(f"DEBUG: value_str starts with '[': {value_str.startswith('[')}") - print(f"DEBUG: value_str starts with '{{': {value_str.startswith('{')}") - - # Debug output for categories field - if "categories" in content: - print(f"DEBUG: Not an array/object, value_str for categories: {value_str}") - print(f"DEBUG: value_str starts with '[': {value_str.startswith('[')}") - print(f"DEBUG: value_str starts with '{{': {value_str.startswith('{')}") - # Handle different value types if value_str.startswith('"') and value_str.endswith('"'): # String value From 45ad0637a03dcc1758f167007cfa1afce19e2a48 Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Thu, 17 Jul 2025 10:02:46 +0200 Subject: [PATCH 5/9] add code samples --- scripts/generate_openapi.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index 3f9a944..6b41978 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -226,7 +226,6 @@ def extract_default_value(content: str) -> Any: # If it's an array or object, try to find the complete value if value_str.startswith("[") or value_str.startswith("{"): - # Find the complete array/object by looking in the original content # Find the position of the opening bracket in the original content default_start = content.find("default:") if default_start != -1: @@ -669,6 +668,18 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: }, }, }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python", + "source": f'import langwatch\n@langwatch.span()\ndef llm_step():\n ... # your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n{chr(10).join(f" {field}=\"\", # required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}=\"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings={{}},\n )\n print(result)', + }, + { + "lang": "typescript", + "label": "TypeScript", + "source": f'import {{ type LangWatchTrace }} from "langwatch";\n\nasync function llmStep({{ message, trace }}: {{ message: string, trace: LangWatchTrace }}): Promise {{\n const span = trace.startLLMSpan({{ name: "llmStep" }});\n \n // ... your existing code\n\n // call the evaluator either on a span or on a trace\n const result = await span.evaluate({{\n evaluator: "{evaluator_id}",\n name: "",\n{chr(10).join(f" {field}: \"\", // required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}: \"\", # optional" for field in evaluator.get("optionalFields", []))} settings: {{}},\n }})\n\n console.log(result);', + }, + ], } } else: @@ -725,6 +736,18 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: }, }, }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Python", + "source": f'import langwatch\n@langwatch.span()\ndef llm_step():\n ... 
# your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n{chr(10).join(f" {field}=\"\", # required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}=\"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings={{}},\n )\n print(result)', + }, + { + "lang": "typescript", + "label": "TypeScript", + "source": f'import {{ type LangWatchTrace }} from "langwatch";\n\nasync function llmStep({{ message, trace }}: {{ message: string, trace: LangWatchTrace }}): Promise {{\n const span = trace.startLLMSpan({{ name: "llmStep" }});\n \n // ... your existing code\n\n // call the evaluator either on a span or on a trace\n const result = await span.evaluate({{\n evaluator: "{evaluator_id}",\n name: "",\n{chr(10).join(f" {field}: \"\", // required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}: \"\", # optional" for field in evaluator.get("optionalFields", []))} settings: {{}},\n }})\n\n console.log(result);', + }, + ], } } From 394552dcae78efe9157ceecdee85a84da0503251 Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Thu, 17 Jul 2025 11:21:45 +0200 Subject: [PATCH 6/9] add offline --- scripts/generate_openapi.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index 6b41978..969528d 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -671,13 +671,18 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: "x-codeSamples": [ { "lang": "python", - "label": "Python", + "label": "Python (offline)", + "source": f'import langwatch\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n def evaluate(index, row):\n # Your evaluation logic here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{\n \n }}\n )\n\n evaluation.submit(evaluate, index, row)', + }, + { + "lang": "python", + "label": "Python (online)", "source": f'import langwatch\n@langwatch.span()\ndef llm_step():\n ... # your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n{chr(10).join(f" {field}=\"\", # required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}=\"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings={{}},\n )\n print(result)', }, { "lang": "typescript", "label": "TypeScript", - "source": f'import {{ type LangWatchTrace }} from "langwatch";\n\nasync function llmStep({{ message, trace }}: {{ message: string, trace: LangWatchTrace }}): Promise {{\n const span = trace.startLLMSpan({{ name: "llmStep" }});\n \n // ... your existing code\n\n // call the evaluator either on a span or on a trace\n const result = await span.evaluate({{\n evaluator: "{evaluator_id}",\n name: "",\n{chr(10).join(f" {field}: \"\", // required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}: \"\", # optional" for field in evaluator.get("optionalFields", []))} settings: {{}},\n }})\n\n console.log(result);', + "source": f'import {{ type LangWatchTrace }} from "langwatch";\n\nasync function llmStep({{ message, trace }}: {{ message: string, trace: LangWatchTrace }}): Promise {{\n const span = trace.startLLMSpan({{ name: "llmStep" }});\n \n // ... 
your existing code\n\n // call the evaluator either on a span or on a trace\n const result = await span.evaluate({{\n evaluator: "{evaluator_id}",\n name: "",\n{chr(10).join(f" {field}: \"\", // required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}: \"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings: {{}},\n }})\n\n console.log(result);', }, ], } @@ -739,13 +744,18 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: "x-codeSamples": [ { "lang": "python", - "label": "Python", + "label": "Python (offline)", + "source": f'import langwatch\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n def evaluate(index, row):\n # Your evaluation logic here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{\n \n }}\n )\n\n evaluation.submit(evaluate, index, row)', + }, + { + "lang": "python", + "label": "Python (online)", "source": f'import langwatch\n@langwatch.span()\ndef llm_step():\n ... # your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n{chr(10).join(f" {field}=\"\", # required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}=\"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings={{}},\n )\n print(result)', }, { "lang": "typescript", "label": "TypeScript", - "source": f'import {{ type LangWatchTrace }} from "langwatch";\n\nasync function llmStep({{ message, trace }}: {{ message: string, trace: LangWatchTrace }}): Promise {{\n const span = trace.startLLMSpan({{ name: "llmStep" }});\n \n // ... your existing code\n\n // call the evaluator either on a span or on a trace\n const result = await span.evaluate({{\n evaluator: "{evaluator_id}",\n name: "",\n{chr(10).join(f" {field}: \"\", // required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}: \"\", # optional" for field in evaluator.get("optionalFields", []))} settings: {{}},\n }})\n\n console.log(result);', + "source": f'import {{ type LangWatchTrace }} from "langwatch";\n\nasync function llmStep({{ message, trace }}: {{ message: string, trace: LangWatchTrace }}): Promise {{\n const span = trace.startLLMSpan({{ name: "llmStep" }});\n \n // ... 
your existing code\n\n // call the evaluator either on a span or on a trace\n const result = await span.evaluate({{\n evaluator: "{evaluator_id}",\n name: "",\n{chr(10).join(f" {field}: \"\", // required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}: \"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings: {{}},\n }})\n\n console.log(result);', }, ], } From 9bc8502d9a6e1b77c37f1bd9a0e841200bee70ed Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Thu, 17 Jul 2025 13:18:44 +0200 Subject: [PATCH 7/9] Update generate_openapi.py --- scripts/generate_openapi.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index 969528d..a21d2a2 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -671,13 +671,13 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: "x-codeSamples": [ { "lang": "python", - "label": "Python (offline)", - "source": f'import langwatch\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n def evaluate(index, row):\n # Your evaluation logic here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{\n \n }}\n )\n\n evaluation.submit(evaluate, index, row)', + "label": "Offline Evaluation", + "source": f'import langwatch\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', }, { "lang": "python", - "label": "Python (online)", - "source": f'import langwatch\n@langwatch.span()\ndef llm_step():\n ... # your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n{chr(10).join(f" {field}=\"\", # required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}=\"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings={{}},\n )\n print(result)', + "label": "Realtime Evaluation", + "source": f'import langwatch\n\n@langwatch.span()\ndef llm_step():\n ... 
# your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n data={{\n{chr(10).join([f" \"{field}\": \"\"," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", [])])}\n }},\n settings={{}},\n )\n print(result)', }, { "lang": "typescript", @@ -744,13 +744,13 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: "x-codeSamples": [ { "lang": "python", - "label": "Python (offline)", - "source": f'import langwatch\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n def evaluate(index, row):\n # Your evaluation logic here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{\n \n }}\n )\n\n evaluation.submit(evaluate, index, row)', + "label": "Offline Evaluation", + "source": f'import langwatch\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', }, { "lang": "python", - "label": "Python (online)", - "source": f'import langwatch\n@langwatch.span()\ndef llm_step():\n ... # your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n{chr(10).join(f" {field}=\"\", # required" for field in evaluator.get("requiredFields", []))}\n{chr(10).join(f" {field}=\"\", # optional" for field in evaluator.get("optionalFields", []))}\n settings={{}},\n )\n print(result)', + "label": "Realtime Evaluation", + "source": f'import langwatch\n\n@langwatch.span()\ndef llm_step():\n ... 
# your existing code\n result = langwatch.get_current_span().evaluate(\n "{evaluator_id}",\n data={{\n{chr(10).join([f" \"{field}\": \"\"," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", [])])}\n }},\n settings={{}},\n )\n print(result)', }, { "lang": "typescript", From 94605f8bc62a01361d0f41dac638d69b1efe2f82 Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Fri, 18 Jul 2025 11:01:57 +0200 Subject: [PATCH 8/9] update spacing --- scripts/generate_openapi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index a21d2a2..a3ef917 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -672,7 +672,7 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: { "lang": "python", "label": "Offline Evaluation", - "source": f'import langwatch\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', + "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n\n # your execution code here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," if field != "expected_output" else f" \"{field}\": row[\"{field}\"]," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', }, { "lang": "python", @@ -745,7 +745,7 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: { "lang": "python", "label": "Offline Evaluation", - "source": f'import langwatch\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', + "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n\n # your execution code here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," if field != "expected_output" else f" \"{field}\": row[\"{field}\"]," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', }, { "lang": "python", From 45da26854b954007ad225c182ee3ffd3b6519135 Mon Sep 17 00:00:00 2001 From: Richard Huth Date: Mon, 21 Jul 2025 11:18:53 +0200 Subject: [PATCH 9/9] generate mdx list and open api --- scripts/generate_mdx_list.py | 202 +++++++++++++++++++++++++++++++++++ scripts/generate_openapi.py | 4 +- 2 files changed, 204 insertions(+), 2 deletions(-) create mode 100644 scripts/generate_mdx_list.py diff --git a/scripts/generate_mdx_list.py b/scripts/generate_mdx_list.py new file mode 100644 index 0000000..8b9867f --- /dev/null +++ b/scripts/generate_mdx_list.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +""" +Python script to generate MDX list from 
OpenAPI JSON +""" + +import json +import sys +from pathlib import Path +from typing import Dict, Any, List + + +def categorize_evaluators( + evaluators: Dict[str, Any], +) -> Dict[str, Dict[str, Any]]: + """Categorize evaluators based on their paths and descriptions.""" + + categories = { + "Expected Answer Evaluation": { + "description": "For when you have the golden answer and want to measure how correct the LLM gets it", + "evaluators": [], + }, + "LLM-as-Judge": { + "description": "For when you don't have a golden answer, but have a set of rules for another LLM to evaluate quality", + "evaluators": [], + }, + "RAG Quality": { + "description": "For measuring the quality of your RAG, check for hallucinations with faithfulness and precision/recall", + "evaluators": [], + }, + "Quality Aspects Evaluation": { + "description": "For when you want to check the language, structure, style and other general quality metrics", + "evaluators": [], + }, + "Safety": { + "description": "Check for PII, prompt injection attempts and toxic content", + "evaluators": [], + }, + "Other": {"description": "Miscellaneous evaluators", "evaluators": []}, + } + + for path, path_info in evaluators.items(): + if not path.endswith("/evaluate"): + continue + + evaluator_id = path.replace("/evaluate", "") + post_info = path_info.get("post", {}) + summary = post_info.get("summary", evaluator_id) + description = post_info.get("description", "") + + # Convert evaluator name to proper endpoint format + # Use the evaluator name and convert to kebab-case + endpoint_id = summary.lower() + # Replace spaces and special characters with hyphens + endpoint_id = endpoint_id.replace(" ", "-") + endpoint_id = endpoint_id.replace("_", "-") + endpoint_id = endpoint_id.replace("/", "-") + # Remove any non-alphanumeric characters except hyphens + import re + + endpoint_id = re.sub(r"[^a-z0-9\-]", "", endpoint_id) + # Remove multiple consecutive hyphens + endpoint_id = re.sub(r"-+", "-", endpoint_id) + # Remove leading/trailing hyphens + endpoint_id = endpoint_id.strip("-") + + evaluator_info = { + "id": evaluator_id, + "name": summary, + "description": description, + "endpoint": f"/api-reference/{endpoint_id}", + } + + # Categorize based on path and description + if any( + keyword in evaluator_id.lower() + for keyword in [ + "exact_match", + "llm_answer_match", + "factual", + "sql_query", + "rouge", + "bleu", + ] + ): + categories["Expected Answer Evaluation"]["evaluators"].append( + evaluator_info + ) + elif any( + keyword in evaluator_id.lower() + for keyword in ["llm_boolean", "llm_score", "llm_category", "rubrics"] + ): + categories["LLM-as-Judge"]["evaluators"].append(evaluator_info) + elif any( + keyword in evaluator_id.lower() + for keyword in [ + "faithfulness", + "context_precision", + "context_recall", + "context_f1", + "response_relevancy", + "response_context", + ] + ): + categories["RAG Quality"]["evaluators"].append(evaluator_info) + elif any( + keyword in evaluator_id.lower() + for keyword in ["language_detection", "valid_format", "summarization"] + ): + categories["Quality Aspects Evaluation"]["evaluators"].append( + evaluator_info + ) + elif any( + keyword in evaluator_id.lower() + for keyword in [ + "pii", + "jailbreak", + "prompt_injection", + "content_safety", + "moderation", + "llama_guard", + ] + ): + categories["Safety"]["evaluators"].append(evaluator_info) + else: + categories["Other"]["evaluators"].append(evaluator_info) + + return categories + + +def generate_mdx(categories: Dict[str, Any]) -> str: + """Generate 
MDX content.""" + + mdx_content = [] + + for category_name, category_info in categories.items(): + if not category_info["evaluators"]: + continue + + mdx_content.append(f"## {category_name}") + mdx_content.append(f"{category_info['description']}") + mdx_content.append("") + mdx_content.append("| Evaluator | Description |") + mdx_content.append("| --------- | ----------- |") + + for evaluator in category_info["evaluators"]: + # Clean description to remove newlines but keep full text + desc = evaluator["description"] + # Remove newlines and normalize whitespace + desc = " ".join(desc.split()) + + mdx_content.append( + f"| [{evaluator['name']}]({evaluator['endpoint']}) | {desc} |" + ) + + mdx_content.append("") + + return "\n".join(mdx_content) + + +def main(): + """Main function to generate MDX list.""" + try: + # Find the openapi.json file + script_dir = Path(__file__).parent + openapi_file = script_dir / "openapi.json" + + if not openapi_file.exists(): + print(f"Error: Could not find OpenAPI file at {openapi_file}") + sys.exit(1) + + print(f"Reading OpenAPI from: {openapi_file}") + + with open(openapi_file, "r", encoding="utf-8") as f: + openapi_data = json.load(f) + + paths = openapi_data.get("paths", {}) + categories = categorize_evaluators(paths) + mdx_content = generate_mdx(categories) + + # Write the MDX content to a file + output_path = script_dir / "evaluators-list.mdx" + with open(output_path, "w", encoding="utf-8") as f: + f.write(mdx_content) + + print(f"MDX list generated successfully at: {output_path}") + + # Print summary + total_evaluators = sum(len(cat["evaluators"]) for cat in categories.values()) + active_categories = len( + [cat for cat in categories.values() if cat["evaluators"]] + ) + print( + f"Generated {total_evaluators} evaluators across {active_categories} categories" + ) + + except Exception as error: + print(f"Error generating MDX list: {error}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/generate_openapi.py b/scripts/generate_openapi.py index a3ef917..e134d83 100644 --- a/scripts/generate_openapi.py +++ b/scripts/generate_openapi.py @@ -672,7 +672,7 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: { "lang": "python", "label": "Offline Evaluation", - "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n\n # your execution code here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," if field != "expected_output" else f" \"{field}\": row[\"{field}\"]," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', + "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n # your execution code here \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": row[\"{field}\"]," if field in ["input", "contexts", "expected_output"] else f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', }, { "lang": "python", @@ -745,7 +745,7 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]: { "lang": "python", "label": "Offline Evaluation", 
- "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n\n # your execution code here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," if field != "expected_output" else f" \"{field}\": row[\"{field}\"]," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', + "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n # your execution code here \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": row[\"{field}\"]," if field in ["input", "contexts", "expected_output"] else f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n', }, { "lang": "python",