diff --git a/Makefile b/Makefile index d6a20fa6d..1209323c6 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ CONFIG := Release PROJECT := App/osaurus.xcodeproj DERIVED := build/DerivedData -.PHONY: help cli app install-cli serve status test ci-test clean bench-setup bench-ingest bench-ingest-chunks bench-run bench evals evals-verbose evals-report evals-all evals-all-verbose evals-all-report +.PHONY: help cli app install-cli serve status test ci-test compat-openai clean bench-setup bench-ingest bench-ingest-chunks bench-run bench evals evals-verbose evals-report evals-all evals-all-verbose evals-all-report help: @echo "Targets:" @@ -30,6 +30,7 @@ help: @echo " evals-all-report Same as 'evals-all' but writes per-suite JSON to EVALS_OUT_DIR (build/evals/)" @echo " test Run OsaurusCore package tests via 'swift test'" @echo " ci-test Reproduce the CI test-core job locally (xcodebuild + xcbeautify)" + @echo " compat-openai Run OpenAI Chat Completions compatibility checks against HOST/MODEL" @echo " clean Remove DerivedData build output" cli: @@ -94,6 +95,10 @@ ci-test: @echo "" @echo "Done. Inspect failures with: open build/Tests.xcresult" +compat-openai: + @echo "Running OpenAI compatibility checks (HOST=$(HOST), MODEL=$(MODEL))..." + ./scripts/openai_compat_report.sh + ## ── LOCOMO Benchmark ────────────────────────────────────────────── BENCH_MODEL ?= openrouter/google/gemini-2.5-flash diff --git a/docs/DEVELOPER_TOOLS.md b/docs/DEVELOPER_TOOLS.md index 471855869..dc5a17d82 100644 --- a/docs/DEVELOPER_TOOLS.md +++ b/docs/DEVELOPER_TOOLS.md @@ -267,6 +267,16 @@ open build/Tests.xcresult # full Xcode Test Navigator UI If a test fails on CI but you can't reproduce it on your machine, download the `test-core-xcresult-*` artifact attached to the failed CI run and open it the same way. 
+### API compatibility reports + +Use the OpenAI compatibility report when changing request encoding, response writers, streaming, tool calls, or request validation: + +```bash +make compat-openai HOST=http://localhost:1337 MODEL=foundation +``` + +The script writes a Markdown summary to `results/openai_compat_report.md` and detailed request/response artifacts under `results/openai_compat/`. Set `OUT_DIR=build/compat/openai` to write the request/response artifacts outside the tracked `results/` tree; the summary path is controlled separately by `REPORT` (for example `REPORT=build/compat/openai_compat_report.md`). + ### Long-running and integration tests Tests that require external infrastructure (Apple Containerization, real GPU, network, etc.) must: diff --git a/scripts/openai_compat_report.sh b/scripts/openai_compat_report.sh old mode 100644 new mode 100755 index c3f0c2eba..d64d95c54 --- a/scripts/openai_compat_report.sh +++ b/scripts/openai_compat_report.sh @@ -2,24 +2,34 @@ set -euo pipefail -# OpenAI Chat Completions streaming compatibility checks using curl + jq +# OpenAI Chat Completions compatibility checks using curl + jq. +# +# This is a manual/local guardrail, not a CI job: it needs a running Osaurus +# server and a usable model. It records request/response artifacts so response +# writer and request validation changes can be compared across runs. HOST=${HOST:-"http://localhost:1337"} +MODEL=${MODEL:-""} +REQUIRE_TOOL_CALLS=${REQUIRE_TOOL_CALLS:-0} -# Resolve repo root based on the location of this script; default results dir under repo SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-${0}}")" && pwd)" REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -OUT_DIR=${OUT_DIR:-"$REPO_ROOT/results"} -mkdir -p "$OUT_DIR" +OUT_DIR=${OUT_DIR:-"$REPO_ROOT/results/openai_compat"} +REPORT=${REPORT:-"$REPO_ROOT/results/openai_compat_report.md"} + +mkdir -p "$OUT_DIR" "$(dirname "$REPORT")" TMP_DIR=$(mktemp -d) cleanup() { rm -rf "$TMP_DIR" >/dev/null 2>&1 || true; } trap cleanup EXIT +RESULTS_TSV="$TMP_DIR/results.tsv" +: > "$RESULTS_TSV" + need() { if ! 
command -v "$1" >/dev/null 2>&1; then echo "Missing dependency: $1" >&2 - echo "Please install it (e.g., brew install $1) and re-run." >&2 + echo "Please install it and re-run." >&2 exit 1 fi } @@ -27,254 +37,320 @@ need() { need curl need jq -emoji_ok='✅' -emoji_warn='⚠️' -emoji_bad='❌' - -log() { printf "%s %s\n" "$1" "$2"; } - -get_first_model() { - # Query models and pick either user-provided MODEL, or 'foundation', else the first one - local models_json - models_json=$(curl -sS "$HOST/v1/models" || curl -sS "$HOST/models" || true) - if [[ -z "$models_json" ]]; then - echo ""; return - fi - local ids - ids=$(jq -r '.data[]?.id // empty' <<<"$models_json") - if [[ -n "${MODEL:-}" ]]; then - echo "$MODEL"; return - fi - if grep -qx "foundation" <<<"$ids"; then - echo "foundation"; return - fi - if [[ -n "$ids" ]]; then - head -n1 <<<"$ids" - return - fi - echo "" +log() { + printf "%-7s %s\n" "$1" "$2" } -filter_sse_to_jsonl() { - # Args: - # Extract `data: ` lines, drop [DONE], keep only valid JSON chunks as jsonl - local sse_body="$1" - local jsonl_out="$2" - awk '/^data: /{print substr($0,7)}' "$sse_body" \ - | jq -Rrc 'select(. != "[DONE]") | try fromjson catch empty' > "$jsonl_out" || true +record() { + local key="$1" + local status="$2" + local area="$3" + local notes="$4" + printf '%s\t%s\t%s\t%s\n' "$key" "$status" "$area" "$notes" >> "$RESULTS_TSV" + log "$status" "$area" } -has_any_content_delta() { - # Args: - jq -e 'map(.choices[0].delta.content // empty) | join("") | length > 0' \ - <<<"$(jq -s '.' "$1")" >/dev/null 2>&1 +artifact_path() { + local name="$1" + printf '%s/%s' "$OUT_DIR" "$name" } -last_finish_reason() { - # Args: - jq -r '.[-1].choices[0].finish_reason // empty' <<<"$(jq -s '.' "$1")" +header_status() { + awk '/^HTTP/{status=$2} END{print status}' "$1" } -first_role() { - # Args: - jq -r '.[0].choices[0].delta.role // empty' <<<"$(jq -s '.' 
"$1")" +header_content_type_includes() { + local headers="$1" + local needle="$2" + grep -i '^content-type:' "$headers" | grep -qi "$needle" } -has_tool_calls_delta() { - # Args: - jq -e 'map( .choices[0].delta.tool_calls[0] // empty ) | any' \ - <<<"$(jq -s '.' "$1")" >/dev/null 2>&1 +json_array_from_jsonl() { + local jsonl="$1" + if [[ -s "$jsonl" ]]; then + jq -s '.' "$jsonl" + else + printf '[]\n' + fi } -has_tool_call_id_and_name() { - # Args: - jq -e 'map( .choices[0].delta.tool_calls[0] // empty ) - | map( ( .id != null ) and ( .function.name != null ) ) - | any' <<<"$(jq -s '.' "$1")" >/dev/null 2>&1 +post_json() { + local name="$1" + local path="$2" + local payload="$3" + local accept="${4:-application/json}" + local headers body status + + headers="$(artifact_path "${name}_headers.txt")" + body="$(artifact_path "${name}_body.txt")" + printf '%s\n' "$payload" > "$(artifact_path "${name}_request.json")" + + status=$( + curl -sS -N \ + -D "$headers" \ + -H "Accept: $accept" \ + -H 'Content-Type: application/json' \ + -X POST "$HOST$path" \ + -d "$payload" \ + -o "$body" \ + -w '%{http_code}' + ) + + printf '%s' "$status" > "$(artifact_path "${name}_status.txt")" } -has_tool_call_arguments_any() { - # Args: - jq -e 'map( .choices[0].delta.tool_calls[0].function.arguments // empty ) - | map( (.|type) == "string" and (.|length) > 0 ) - | any' <<<"$(jq -s '.' "$1")" >/dev/null 2>&1 +filter_sse_to_jsonl() { + local sse_body="$1" + local jsonl_out="$2" + awk '/^data: /{print substr($0,7)}' "$sse_body" \ + | jq -Rrc 'select(. 
!= "[DONE]") | try fromjson catch empty' > "$jsonl_out" || true } -http_content_type_includes() { - # Args: - grep -i '^content-type:' "$1" | grep -qi "$2" +get_model() { + if [[ -n "$MODEL" ]]; then + printf '%s\n' "$MODEL" + return + fi + + local models_json ids + models_json=$(curl -sS "$HOST/v1/models" || curl -sS "$HOST/models" || true) + if [[ -z "$models_json" ]]; then + printf '\n' + return + fi + + ids=$(jq -r '.data[]?.id // empty' <<<"$models_json") + if grep -qx 'foundation' <<<"$ids"; then + printf 'foundation\n' + elif [[ -n "$ids" ]]; then + head -n1 <<<"$ids" + else + printf '\n' + fi } -http_status_ok() { - # Args: - head -n1 "$1" | grep -qE 'HTTP/1\.[01] 200|HTTP/2 200' +expect_status() { + local headers="$1" + local expected="$2" + [[ "$(header_status "$headers")" == "$expected" ]] } -text_streaming_test() { +nonstreaming_chat_test() { local model="$1" - local hdrs="$TMP_DIR/text_headers.txt" - local body="$TMP_DIR/text_body.txt" - local jsonl="$TMP_DIR/text_chunks.jsonl" - - local payload - payload=$(jq -nc --arg m "$model" '{model:$m, messages:[{role:"user", content:"Say hello in one short sentence."}], stream:true, temperature:0.2, max_tokens:64}') - printf '%s\n' "$payload" > "$OUT_DIR/text_stream_request.json" - - curl -sS -N \ - -D "$hdrs" \ - -H 'Accept: text/event-stream' \ - -H 'Content-Type: application/json' \ - -X POST "$HOST/v1/chat/completions" \ - -d "$payload" \ - > "$body" - - # Basic checks - local ok=true - http_status_ok "$hdrs" || ok=false - http_content_type_includes "$hdrs" 'text/event-stream' || ok=false - grep -q '^data: \[DONE\]$' "$body" || ok=false + local name="chat_nonstream" + local payload headers body ok finish + + payload=$(jq -nc --arg m "$model" '{ + model: $m, + messages: [{role:"user", content:"Say hello in one short sentence."}], + stream: false, + temperature: 0.2, + max_tokens: 64 + }') + + post_json "$name" "/v1/chat/completions" "$payload" + headers="$(artifact_path "${name}_headers.txt")" + 
body="$(artifact_path "${name}_body.txt")" + + ok=true + expect_status "$headers" 200 || ok=false + jq -e '.object == "chat.completion"' "$body" >/dev/null 2>&1 || ok=false + jq -e '.choices[0].message.role == "assistant"' "$body" >/dev/null 2>&1 || ok=false + jq -e '(.choices[0].message.content // "") | length > 0' "$body" >/dev/null 2>&1 || ok=false + finish=$(jq -r '.choices[0].finish_reason // empty' "$body" 2>/dev/null || true) + [[ "$finish" == "stop" ]] || ok=false + + if $ok; then + record "chat_nonstream" "PASS" "Chat completions non-streaming" "object/message/finish_reason=stop" + else + record "chat_nonstream" "FAIL" "Chat completions non-streaming" "see ${name}_*.txt/json artifacts" + fi +} - # Chunk checks +streaming_chat_test() { + local model="$1" + local name="chat_stream" + local payload headers body jsonl chunks ok finish role + + payload=$(jq -nc --arg m "$model" '{ + model: $m, + messages: [{role:"user", content:"Say hello in one short sentence."}], + stream: true, + temperature: 0.2, + max_tokens: 64 + }') + + post_json "$name" "/v1/chat/completions" "$payload" "text/event-stream" + headers="$(artifact_path "${name}_headers.txt")" + body="$(artifact_path "${name}_body.txt")" + jsonl="$(artifact_path "${name}_chunks.jsonl")" filter_sse_to_jsonl "$body" "$jsonl" - [[ -s "$jsonl" ]] || ok=false - # object type for all chunks - jq -e 'all( .object == "chat.completion.chunk" )' <<<"$(jq -s '.' 
"$jsonl")" >/dev/null 2>&1 || ok=false - # first delta has role: assistant - [[ "$(first_role "$jsonl")" == "assistant" ]] || ok=false - # content present somewhere - has_any_content_delta "$jsonl" || ok=false - # final finish_reason == stop - [[ "$(last_finish_reason "$jsonl")" == "stop" ]] || ok=false + chunks=$(json_array_from_jsonl "$jsonl") + + ok=true + expect_status "$headers" 200 || ok=false + header_content_type_includes "$headers" 'text/event-stream' || ok=false + grep -q '^data: \[DONE\]$' "$body" || ok=false + jq -e 'length > 0 and all(.object == "chat.completion.chunk")' <<<"$chunks" >/dev/null 2>&1 || ok=false + role=$(jq -r '.[0].choices[0].delta.role // empty' <<<"$chunks" 2>/dev/null || true) + [[ "$role" == "assistant" ]] || ok=false + jq -e 'map(.choices[0].delta.content // empty) | join("") | length > 0' <<<"$chunks" >/dev/null 2>&1 || ok=false + finish=$(jq -r '.[-1].choices[0].finish_reason // empty' <<<"$chunks" 2>/dev/null || true) + [[ "$finish" == "stop" ]] || ok=false if $ok; then - log "$emoji_ok" "Text streaming conformance" - echo "text_streaming=pass" > "$TMP_DIR/text_result.ini" + record "chat_stream" "PASS" "Chat completions streaming" "SSE framing, assistant role, content deltas, finish_reason=stop" else - log "$emoji_bad" "Text streaming conformance" - echo "text_streaming=fail" > "$TMP_DIR/text_result.ini" + record "chat_stream" "FAIL" "Chat completions streaming" "see ${name}_*.txt/jsonl artifacts" fi - - # Save artifacts - cp "$hdrs" "$OUT_DIR/text_stream_headers.txt" || true - cp "$body" "$OUT_DIR/text_stream_raw.txt" || true - cp "$jsonl" "$OUT_DIR/text_stream_chunks.jsonl" || true } tool_call_streaming_test() { local model="$1" - local hdrs="$TMP_DIR/tool_headers.txt" - local body="$TMP_DIR/tool_body.txt" - local jsonl="$TMP_DIR/tool_chunks.jsonl" - - # Define a simple function tool - local tools - tools='[{"type":"function","function":{"name":"get_weather","description":"Get weather by 
city","parameters":{"type":"object","properties":{"city":{"type":"string"}},"required":["city"]}}}]' - - local payload - payload=$(jq -nc --arg m "$model" --argjson t "$tools" '{model:$m, messages:[{role:"user", content:"Using tools if available, get weather for city=San Francisco."}], tools:$t, tool_choice:"auto", stream:true, temperature:0.0, max_tokens:64}') - printf '%s\n' "$payload" > "$OUT_DIR/tool_stream_request.json" - - curl -sS -N \ - -D "$hdrs" \ - -H 'Accept: text/event-stream' \ - -H 'Content-Type: application/json' \ - -X POST "$HOST/v1/chat/completions" \ - -d "$payload" \ - > "$body" - - # Basic checks - local supported=false - local compliant=false - - http_status_ok "$hdrs" || true - http_content_type_includes "$hdrs" 'text/event-stream' || true - grep -q '^data: \[DONE\]$' "$body" || true - + local name="tool_stream" + local payload headers body jsonl chunks supported compliant + + payload=$(jq -nc --arg m "$model" '{ + model: $m, + messages: [{role:"user", content:"Using tools if available, get weather for city=San Francisco."}], + tools: [{ + type: "function", + function: { + name: "get_weather", + description: "Get weather by city", + parameters: { + type: "object", + properties: { city: { type: "string" } }, + required: ["city"] + } + } + }], + tool_choice: "auto", + stream: true, + temperature: 0.0, + max_tokens: 64 + }') + + post_json "$name" "/v1/chat/completions" "$payload" "text/event-stream" + headers="$(artifact_path "${name}_headers.txt")" + body="$(artifact_path "${name}_body.txt")" + jsonl="$(artifact_path "${name}_chunks.jsonl")" filter_sse_to_jsonl "$body" "$jsonl" - if [[ -s "$jsonl" ]] && has_tool_calls_delta "$jsonl"; then + chunks=$(json_array_from_jsonl "$jsonl") + + supported=false + compliant=false + + if expect_status "$headers" 200 \ + && header_content_type_includes "$headers" 'text/event-stream' \ + && grep -q '^data: \[DONE\]$' "$body" \ + && jq -e 'map(.choices[0].delta.tool_calls[0] // empty) | any' <<<"$chunks" 
>/dev/null 2>&1 + then supported=true - # minimal conformance checks for OpenAI-style tool call streaming - if has_tool_call_id_and_name "$jsonl" \ - && has_tool_call_arguments_any "$jsonl" \ - && [[ "$(last_finish_reason "$jsonl")" == "tool_calls" ]]; then + if jq -e 'map(.choices[0].delta.tool_calls[0] // empty) + | any((.id != null) and (.function.name != null))' <<<"$chunks" >/dev/null 2>&1 \ + && jq -e 'map(.choices[0].delta.tool_calls[0].function.arguments // empty) + | any((type == "string") and (length > 0))' <<<"$chunks" >/dev/null 2>&1 \ + && [[ "$(jq -r '.[-1].choices[0].finish_reason // empty' <<<"$chunks" 2>/dev/null || true)" == "tool_calls" ]] + then compliant=true fi fi if $supported && $compliant; then - log "$emoji_ok" "Tool calling streaming (OpenAI-style deltas)" - echo "tool_streaming=pass" > "$TMP_DIR/tool_result.ini" - elif $supported && ! $compliant; then - log "$emoji_warn" "Tool calling streaming present but not fully compliant" - echo "tool_streaming=partial" > "$TMP_DIR/tool_result.ini" + record "tool_stream" "PASS" "Tool calling streaming" "OpenAI-style id/name/arguments deltas with finish_reason=tool_calls" + elif $supported; then + record "tool_stream" "FAIL" "Tool calling streaming" "tool deltas observed but schema is incomplete" + elif [[ "$REQUIRE_TOOL_CALLS" == "1" ]]; then + record "tool_stream" "FAIL" "Tool calling streaming" "not observed and REQUIRE_TOOL_CALLS=1" else - log "$emoji_warn" "Tool calling streaming not observed (likely unsupported by current model)" - echo "tool_streaming=unsupported" > "$TMP_DIR/tool_result.ini" + record "tool_stream" "WARN" "Tool calling streaming" "not observed; current model may not support tool calls" fi +} + +request_validation_error_test() { + local model="$1" + local name="request_validation_error" + local payload headers body ok status + + payload=$(jq -nc --arg m "$model" '{ + model: $m, + messages: [{role:"user", content:"hello"}], + stream: false, + n: 2 + }') + + post_json "$name" 
"/v1/chat/completions" "$payload" + headers="$(artifact_path "${name}_headers.txt")" + body="$(artifact_path "${name}_body.txt")" + status=$(header_status "$headers") - # Save artifacts - cp "$hdrs" "$OUT_DIR/tool_stream_headers.txt" || true - cp "$body" "$OUT_DIR/tool_stream_raw.txt" || true - cp "$jsonl" "$OUT_DIR/tool_stream_chunks.jsonl" || true + ok=true + [[ "$status" == "400" || "$status" == "422" ]] || ok=false + jq -e '.error.message | type == "string" and length > 0' "$body" >/dev/null 2>&1 || ok=false + + if $ok; then + record "request_validation_error" "PASS" "Unsupported request validation" "n>1 rejected with JSON error message" + else + record "request_validation_error" "FAIL" "Unsupported request validation" "expected HTTP 400/422 JSON error for n>1" + fi } write_report() { local model="$1" - local text_status tool_status - text_status=$(cut -d= -f2 < "$TMP_DIR/text_result.ini") - tool_status=$(cut -d= -f2 < "$TMP_DIR/tool_result.ini" 2>/dev/null || echo "unsupported") - - local report="$OUT_DIR/openai_compat_report.md" { - echo "## OpenAI Chat Completions Streaming Compatibility" + echo "## OpenAI Chat Completions Compatibility" echo "- **Server**: $HOST" - echo "- **Model**: \ -$(printf " -\`%s\` -" "$model")" + echo "- **Model**: \`$model\`" + echo "- **Artifacts**: \`$OUT_DIR\`" echo "" echo "### Results" echo "| Area | Status | Notes |" - echo "|---|---:|---|" - case "$text_status" in - pass) echo "| Text generation (SSE) | ✅ | Headers, SSE framing, role, finish_reason=stop |";; - *) echo "| Text generation (SSE) | ❌ | See artifacts in results/ |";; - esac - case "$tool_status" in - pass) echo "| Tool calling (SSE deltas) | ✅ | id/name/arguments; finish_reason=tool_calls |";; - partial) echo "| Tool calling (SSE deltas) | ⚠️ | Present but schema incomplete |";; - unsupported) echo "| Tool calling (SSE deltas) | ⚠️ | Not observed; model likely lacks tool support |";; - *) echo "| Tool calling (SSE deltas) | ❌ | Error; see artifacts |";; - esac + 
echo "| --- | ---: | --- |" + awk -F '\t' '{ printf "| %s | %s | %s |\n", $3, $2, $4 }' "$RESULTS_TSV" echo "" - echo "### Artifacts" - echo "- \ -\`results/text_stream_headers.txt\`, \ -\`results/text_stream_raw.txt\`, \ -\`results/text_stream_chunks.jsonl\`" - echo "- \ -\`results/tool_stream_headers.txt\`, \ -\`results/tool_stream_raw.txt\`, \ -\`results/tool_stream_chunks.jsonl\`" - } > "$report" - - echo ""; echo "Report written to: $report" + echo "### Artifact Naming" + echo "" + echo "Each check writes:" + echo "" + echo "- \`*_request.json\`" + echo "- \`*_headers.txt\`" + echo "- \`*_status.txt\`" + echo "- \`*_body.txt\`" + echo "- \`*_chunks.jsonl\` for streaming checks" + echo "" + echo "Set \`OUT_DIR=build/compat/openai\` to keep generated artifacts out of tracked \`results/\` files." + } > "$REPORT" + + echo "" + echo "Report written to: $REPORT" } main() { - local model - model=$(get_first_model) + local model failures + + model=$(get_model) if [[ -z "$model" ]]; then echo "Could not determine a model from $HOST/v1/models. Set MODEL=... and retry." >&2 exit 2 fi - echo "Using model: $model" - text_streaming_test "$model" - tool_call_streaming_test "$model" || true + echo "Using server: $HOST" + echo "Using model: $model" + echo "Artifacts: $OUT_DIR" + echo "" + + nonstreaming_chat_test "$model" + streaming_chat_test "$model" + tool_call_streaming_test "$model" + request_validation_error_test "$model" write_report "$model" + + failures=$(awk -F '\t' '$2 == "FAIL" { count++ } END { print count + 0 }' "$RESULTS_TSV") + if [[ "$failures" != "0" ]]; then + echo "$failures compatibility check(s) failed." >&2 + exit 1 + fi } main "$@" - -