Skip to content
122 changes: 80 additions & 42 deletions adapter/vercelaisdk/uimessagestream/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
// packages/ai/src/generate-text/stream-text.ts and the SSE framing from
// packages/ai/src/ui-message-stream/json-to-sse-transform-stream.ts.
//
// Known limitations:
// - Inbound tool call history from multi-turn conversations is not yet
// reconstructed; only text parts are forwarded to the model.
// - The handler calls model.GenerateEvents directly; interceptor plugins
// (retry, OTel) must be wired at the model level.
//
// Reference: https://github.com/vercel/ai
package uimessagestream

Expand All @@ -36,7 +42,11 @@ import (
// protocol. It accepts POST requests with a JSON body containing messages
// and streams back SSE events compatible with useChat.
func Handler(model llm.Model, opts ...Option) http.Handler {
cfg := &config{logger: slog.Default()}
cfg := &config{
logger: slog.Default(),
maxBodyBytes: 1 << 20, // 1MB
maxTurns: 10,
}
for _, o := range opts {
o(cfg)
}
Expand All @@ -52,10 +62,12 @@ type Option func(*config)
type ToolExecutor func(ctx context.Context, name string, args json.RawMessage) (json.RawMessage, error)

type config struct {
system string
logger *slog.Logger
tools []llm.ToolDefinition
executor ToolExecutor
system string
logger *slog.Logger
tools []llm.ToolDefinition
executor ToolExecutor
maxBodyBytes int64
maxTurns int
}

// WithSystem sets the system prompt prepended to every request.
Expand All @@ -79,6 +91,16 @@ func WithTools(tools []llm.ToolDefinition, executor ToolExecutor) Option {
}
}

// WithMaxBodyBytes sets the maximum request body size in bytes. Default is 1MB.
// Non-positive values are ignored and the default is kept.
func WithMaxBodyBytes(n int64) Option {
	return func(c *config) {
		// A zero or negative limit would make http.MaxBytesReader reject
		// every request body, silently breaking the handler. Keep the
		// configured default instead, mirroring how StreamModelWithTools
		// treats a non-positive maxTurns.
		if n > 0 {
			c.maxBodyBytes = n
		}
	}
}

// WithMaxTurns sets the maximum number of agentic tool-calling turns. Default is 10.
func WithMaxTurns(n int) Option {
	return func(cfg *config) {
		cfg.maxTurns = n
	}
}

type handler struct {
model llm.Model
cfg *config
Expand All @@ -88,7 +110,6 @@ type handler struct {
type chatRequest struct {
ID string `json:"id"`
Messages []chatMessage `json:"messages"`
Trigger string `json:"trigger"`
}

type chatMessage struct {
Expand Down Expand Up @@ -127,8 +148,8 @@ func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}

// Limit request body to 1MB to prevent abuse.
r.Body = http.MaxBytesReader(w, r.Body, 1<<20)
// Limit request body size to prevent abuse.
r.Body = http.MaxBytesReader(w, r.Body, h.cfg.maxBodyBytes)

var body chatRequest
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
Expand Down Expand Up @@ -158,14 +179,14 @@ func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
setSSEHeaders(w)

ew := &EventWriter{w: w, f: flusher}
StreamModelWithTools(r.Context(), h.model, req, ew, h.cfg.logger, h.cfg.executor)
StreamModelWithTools(r.Context(), h.model, req, ew, h.cfg.logger, h.cfg.executor, h.cfg.maxTurns)
}

// generateMessageID creates a random 16-character hex ID for use as a messageId.
func generateMessageID() string {
b := make([]byte, 8)
if _, err := rand.Read(b); err != nil {
return "msg-0000000000000000"
return "0000000000000000"
}

return hex.EncodeToString(b)
Expand All @@ -182,16 +203,19 @@ const (
// complexity of StreamModel and StreamModelWithTools.
type streamWriter struct {
	ew               *EventWriter // sink for outbound SSE chunks
	logger           *slog.Logger // logs non-fatal issues (e.g. tool argument unmarshal failures)
	textID           string       // ID of the current text span, e.g. "text-0"
	reasoningID      string       // ID of the current reasoning span, e.g. "reasoning-0"
	textStarted      bool         // true while a text span is open (text-start emitted, no text-end yet)
	reasoningStarted bool         // true while a reasoning span is open
	textCounter      int          // count of closed text spans; used to derive the next textID
	reasoningCounter int          // count of closed reasoning spans; used to derive the next reasoningID
}

func newStreamWriter(ew *EventWriter) *streamWriter {
func newStreamWriter(ew *EventWriter, logger *slog.Logger) *streamWriter {
return &streamWriter{
ew: ew,
logger: logger,
textID: "text-0",
reasoningID: "reasoning-0",
}
Expand All @@ -207,20 +231,8 @@ func (sw *streamWriter) endReasoning() error {
}

sw.reasoningStarted = false

return nil
}

func (sw *streamWriter) endText() error {
if !sw.textStarted {
return nil
}

if err := sw.ew.WriteChunk(Chunk{"type": "text-end", "id": sw.textID}); err != nil {
return err
}

sw.textStarted = false
sw.reasoningCounter++
sw.reasoningID = fmt.Sprintf("reasoning-%d", sw.reasoningCounter)

return nil
}
Expand Down Expand Up @@ -258,6 +270,10 @@ func (sw *streamWriter) writeTextDelta(text string) error {
}

func (sw *streamWriter) writeReasoningDelta(trace *llm.ReasoningTrace) error {
if err := sw.endTextAndAdvance(); err != nil {
return err
}

if !sw.reasoningStarted {
if err := sw.ew.WriteChunk(Chunk{"type": "reasoning-start", "id": sw.reasoningID}); err != nil {
return err
Expand Down Expand Up @@ -286,7 +302,9 @@ func (sw *streamWriter) writeToolRequest(tr *llm.ToolRequest) error {

var input any
if len(tr.Arguments) > 0 {
_ = json.Unmarshal(tr.Arguments, &input)
if err := json.Unmarshal(tr.Arguments, &input); err != nil {
sw.logger.Warn("failed to unmarshal tool input", "toolCallId", tr.ID, "error", err)
}
}

return sw.ew.WriteChunk(Chunk{
Expand All @@ -307,7 +325,9 @@ func (sw *streamWriter) writeToolResponse(tr *llm.ToolResponse) error {

var output any
if len(tr.Result) > 0 {
_ = json.Unmarshal(tr.Result, &output)
if err := json.Unmarshal(tr.Result, &output); err != nil {
sw.logger.Warn("failed to unmarshal tool output", "toolCallId", tr.ID, "error", err)
}
}

return sw.ew.WriteChunk(Chunk{
Expand All @@ -321,7 +341,11 @@ func (sw *streamWriter) handleContentPart(part *llm.Part) error {
case llm.PartText:
return sw.writeTextDelta(part.Text)
case llm.PartToolRequest:
if err := sw.endText(); err != nil {
if err := sw.endReasoning(); err != nil {
return err
}

if err := sw.endTextAndAdvance(); err != nil {
Comment thread
claude[bot] marked this conversation as resolved.
return err
}

Comment thread
claude[bot] marked this conversation as resolved.
Expand All @@ -340,12 +364,15 @@ func (sw *streamWriter) closeSpans() {
if sw.reasoningStarted {
_ = sw.ew.WriteChunk(Chunk{"type": "reasoning-end", "id": sw.reasoningID})
sw.reasoningStarted = false
sw.reasoningCounter++
sw.reasoningID = fmt.Sprintf("reasoning-%d", sw.reasoningCounter)
}

if sw.textStarted {
_ = sw.ew.WriteChunk(Chunk{"type": "text-end", "id": sw.textID})
sw.textStarted = false
sw.textCounter++
sw.textID = fmt.Sprintf("text-%d", sw.textCounter)
}
}

Expand Down Expand Up @@ -388,7 +415,7 @@ func StreamModel(ctx context.Context, model llm.Model, req *llm.Request, ew *Eve
return
}

sw := newStreamWriter(ew)
sw := newStreamWriter(ew, logger)

for event, err := range model.GenerateEvents(ctx, req) {
if err != nil {
Expand All @@ -398,6 +425,8 @@ func StreamModel(ctx context.Context, model llm.Model, req *llm.Request, ew *Eve

logger.Error("stream error", "error", err)

sw.closeSpans()

_ = ew.WriteChunk(Chunk{"type": "error", "errorText": "An error occurred"})
_ = ew.WriteChunk(Chunk{"type": "finish-step"})
_ = ew.WriteChunk(Chunk{"type": "finish", "finishReason": finishReasonError})
Expand All @@ -419,7 +448,7 @@ func StreamModel(ctx context.Context, model llm.Model, req *llm.Request, ew *Eve
}

case llm.StreamResetEvent:
if err := sw.endText(); err != nil {
if err := sw.endTextAndAdvance(); err != nil {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟣 🟣 Pre-existing (from #131): StreamModel (the non-executor path) emits tool-input-start/tool-input-available via handleContentPart's PartToolRequest case (handler.go:343-352) but tracks no pending tool requests — so on iterator-error (handler.go:429-442), StreamResetEvent (handler.go:458-465), or StreamEndEvent.Error via writeStreamEnd (handler.go:388-407), any previously-emitted tool-input-available is left unpaired on the wire, violating the protocol invariant just enforced in streamToolTurn. Reachable through the public API as Handler(model, WithTools(tools, nil)) (WithTools accepts nil executor; ServeHTTP sets req.Tools regardless; StreamModelWithTools delegates to StreamModel when executor==nil at handler.go:481-484) or via direct calls to the exported StreamModel with req.Tools populated. Fix template is identical to what this PR applied in streamToolTurn: track pending tool-call IDs and emit tool-output-error for each before the terminal chunks; alternatively reject WithTools(tools, nil) at config time.

Extended reasoning...

What the bug is

StreamModel (the non-executor path) emits tool-input-start and tool-input-available chunks for any PartToolRequest content event via handleContentPart (handler.go:343-352) → writeToolRequest (handler.go:292-313). But unlike streamToolTurn — which maintains a local toolRequests []*llm.ToolRequest slice — StreamModel does not track these pending tool calls anywhere.

When any of three terminal events fires after such chunks have been written:

  1. Iterator-error at handler.go:429-442: sw.closeSpans() only closes text/reasoning spans; writes error/finish-step/finish. No tool-output-error for orphaned tool calls.
  2. StreamResetEvent at handler.go:458-465: endTextAndAdvance + endReasoning only. No tool-output-error.
  3. StreamEndEvent.Error via writeStreamEnd at handler.go:388-407: error + closeSpans + finish-step + finish. No tool-output-error.

All three leave the previously-emitted tool-input-available unpaired on the wire — the exact pairing invariant the PR just enforced in streamToolTurn at handler.go:555-568 (iterator-error) and handler.go:588-597 (StreamResetEvent).

Why this PR is the natural moment

This PR's diff explicitly:

  • Added sw.closeSpans() to StreamModel's iterator-error branch at handler.go:436.
  • Changed endText() to endTextAndAdvance() in StreamModel's StreamResetEvent case at handler.go:459.
  • Applied the symmetric tool-output-error pairing fix in streamToolTurn for both iter-error and StreamResetEvent.

So the same hunks were touched, the same template applied to the executor sibling — but StreamModel's lack of a pending-IDs slice meant the symmetric fix needs a small additional structural change (track IDs) rather than just looping over an existing slice.

Step-by-step proof (StreamResetEvent path)

Provider yields [ContentPartEvent{ToolRequest{ID: "call-1"}}, StreamResetEvent, ContentPartEvent{Text("hi")}, StreamEndEvent{Stop}] to StreamModel:

  1. start, start-step written.
  2. PartToolRequesthandleContentPartendReasoning (no-op), endTextAndAdvance (no-op), writeToolRequest → emits tool-input-start, tool-input-available for call-1.
  3. StreamResetEvent → handler.go:458-465 → endTextAndAdvance (no-op), endReasoning (no-op). No tool-output-error for call-1.
  4. PartText("hi")text-start, text-delta.
  5. StreamEndEvent{Stop}writeStreamEndtext-end, finish-step, finish.
  6. [DONE].

Wire output:

start, start-step,
tool-input-start, tool-input-available,   <-- call-1 (orphaned)
text-start, text-delta, text-end,
finish-step, finish, [DONE]

call-1 enters pending on tool-input-available and never receives a paired tool-output-*. Same orphan pattern for iterator-error and StreamEndEvent.Error.

Reachability

Two routes through the public API:

  1. Handler(model, WithTools(tools, nil)): WithTools (handler.go:87-92) accepts a ToolExecutor function with no nil check. ServeHTTP always sets req.Tools = h.cfg.tools (handler.go:170). StreamModelWithTools sees executor == nil and delegates to StreamModel at handler.go:481-484, passing req with Tools populated. The model receives tools and may emit PartToolRequest. This configuration corresponds to client-side tool execution.
  2. Direct calls to the exported StreamModel with req.Tools set by external callers (StreamModel is exported at handler.go:411).

Addressing the refutation

The refutation argues the no-executor path operates under a different contract: tool-input-available is the terminal tool chunk and the client handles resolution in a subsequent request. Two points in response:

  • The package doc explicitly states "Inbound tool call history from multi-turn conversations is not yet reconstructed; only text parts are forwarded to the model" (handler.go:18). So full client-side execution isn't really supported in either direction. The tool-input-available chunks are server-side state the client must somehow drop on its own when the stream errors.
  • Even granting the client-side interpretation, on the error terminal paths (iterator error, StreamEndEvent.Error, StreamResetEvent for a discarded attempt) the protocol invariant remains — those tool calls did not complete and won't complete, so tool-output-error is the correct signal. The PR's own streamToolTurn fix codifies this for the same three error paths.

Reachability via WithTools(tools, nil) is genuinely undocumented, granting that — but StreamModel is exported and the nil-executor delegation is intentional code (not just a defensive check), and the symmetric fix template is right there in the PR's diff.

Severity rationale

pre_existing. The orphan was there before this PR, the PR didn't introduce the bug, and the typical Handler(model, WithTools(tools, executor)) configuration routes through streamToolTurn (now fixed). But worth flagging here because the symmetric fix pattern is what the PR just applied next door.

How to fix

Option 1 (mirror streamToolTurn): track pending tool IDs in StreamModel and emit tool-output-error for each on the three terminal paths before the terminal chunks. Requires propagating a small slice down through handleContentPart or a thin wrapper.

Option 2 (smaller diff): reject WithTools(tools, nil) at config time, or document that tools-without-executor is unsupported. This closes route #1 but not direct StreamModel calls.

return
}

Expand All @@ -438,8 +467,9 @@ func StreamModel(ctx context.Context, model llm.Model, req *llm.Request, ew *Eve
// StreamModelWithTools is like StreamModel but supports agentic tool calling.
// When the model returns tool calls, the executor is invoked for each, results
// are streamed to the client, and the model is called again with the results
// appended to the conversation. This loops until the model stops calling tools.
func StreamModelWithTools(ctx context.Context, model llm.Model, req *llm.Request, ew *EventWriter, logger *slog.Logger, executor ToolExecutor) {
// appended to the conversation. This loops until the model stops calling tools
// or maxTurns is reached. If maxTurns is 0, it defaults to 10.
func StreamModelWithTools(ctx context.Context, model llm.Model, req *llm.Request, ew *EventWriter, logger *slog.Logger, executor ToolExecutor, maxTurns int) {
if executor == nil {
StreamModel(ctx, model, req, ew, logger)
return
Expand All @@ -449,19 +479,27 @@ func StreamModelWithTools(ctx context.Context, model llm.Model, req *llm.Request
logger = slog.Default()
}

if maxTurns <= 0 {
maxTurns = 10
}

messageID := generateMessageID()
if err := ew.WriteChunk(Chunk{"type": "start", "messageId": messageID}); err != nil {
return
}

messages := slices.Clone(req.Messages)
sw := newStreamWriter(ew)

const maxTurns = 10
sw := newStreamWriter(ew, logger)

for range maxTurns {
finishReason, toolRequests := streamToolTurn(ctx, model, req, messages, sw, ew, logger)

// finishReasonError: terminal chunks already written by streamToolTurn.
// "": stream aborted (ctx cancel or write failure), nothing to write.
if finishReason == finishReasonError || finishReason == "" {
return
}

if len(toolRequests) == 0 || finishReason != "tool-calls" {
_ = ew.WriteChunk(Chunk{"type": "finish", "finishReason": finishReason})
_ = ew.WriteDone()
Comment on lines 510 to 512
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Tool-input chunks emitted before a StreamEndEvent with FinishReason length/content-filter/other are orphaned: streamToolTurn returns ("length", [pending]) and StreamModelWithTools at handler.go:510-515 writes finish+[DONE] without iterating toolRequests, leaving every Vercel useChat tool-call card with no matching tool-output-* chunk. The PR added this cleanup for the iterator-error (lines 564-569) and StreamResetEvent (lines 599-604) paths but missed the third symmetric path. Fix: emit tool-output-error for each pending tool request before the finish chunk, mirroring the two existing loops.

Extended reasoning...

What the bug is

The early-return at handler.go:510-515 fires whenever streamToolTurn returns with finishReason != "tool-calls", but does not iterate the accumulated toolRequests slice to pair each previously-emitted tool-input-available with a tool-output-* chunk. Two of the three symmetric paths in streamToolTurn were just fixed for exactly this orphan-pairing problem (lines 564-569 for iterator-error and lines 599-604 for StreamResetEvent), but the StreamEndEvent path was missed for the e.Error == nil case.

Specific code path that triggers it

Inside streamToolTurn, StreamEndEvent delegates to writeToolTurnEnd (handler.go:653-672). When e.Error == nil and e.Response.FinishReason is anything other than FinishReasonToolCalls, mapFinishReason translates it to "stop", "length", "content-filter", or "other". None of those equal "tool-calls", so the caller's early-return triggers with toolRequests still populated.

Why existing code doesn't prevent it

The iterator-error and StreamResetEvent branches both have explicit cleanup loops added in this PR. The StreamEndEvent path returns the mapped reason and the caller fall-through assumed toolRequests is empty when finishReason isn't "tool-calls" — which is true when the model finishes cleanly without tool calls, but not when length/content-filter cuts off mid-tool-emission.

Reachability is well-documented across every in-repo provider

This is the documented, intentional mapping for the in-repo providers:

  • providers/anthropic/response_mapper_test.go:92 — "max_tokens with tool calls stays Length"
  • providers/anthropic/response_mapper_test.go:104 — "refusal with tool calls stays ContentFilter"
  • providers/anthropic/stream_partial_test.go:139 — "max_tokens must propagate as FinishReasonLength even when tool calls are present"
  • providers/bedrock/finish_reason_test.go:65,71 — Length/ContentFilter preserved with tool_use
  • providers/openai/response_mapper_test.go:100,107 — Length/ContentFilter with tool calls
  • providers/google/finish_reason_test.go:42-45 — same
  • providers/openaicompat/finish_reason_test.go:43-45 — same

So the moment any model truncates mid-tool-emission (long arguments, large schemas, content filter cutoff), this fires.

Step-by-step proof

Provider yields [ContentPartEvent{PartToolRequest{ID:"call-1"}}, StreamEndEvent{Response: {FinishReason: FinishReasonLength}}] to StreamModelWithTools:

  1. StreamModelWithTools writes start, enters loop, calls streamToolTurn.
  2. streamToolTurn writes start-step.
  3. ContentPartEvent{PartToolRequest{call-1}}handleToolTurnPart PartToolRequest case → appends call-1 to toolRequests, writeToolRequest emits tool-input-start + tool-input-available. State: toolRequests=[call-1].
  4. StreamEndEvent{Response:{FinishReason:Length}} (e.Error nil) → writeToolTurnEnd emits finish-step, returns "length" via mapFinishReason. No tool-output- for call-1.*
  5. streamToolTurn returns ("length", [call-1]).
  6. Caller: len(toolRequests)>0 is false branch, but finishReason != "tool-calls" is true → enters if block at handler.go:510. Writes finish with reason="length", [DONE], returns.

Wire output:

start, start-step,
tool-input-start, tool-input-available,   ← call-1 orphaned
finish-step,
finish (length), [DONE]

The tool-input-available for call-1 has no matching tool-output-available or tool-output-error. Vercel useChat tracks tool-call state by toolCallId — call-1 enters pending and never leaves.

Impact

UI clients render the tool-call card as perpetually pending: permanent loading spinner, unresolved error overlay, or a stale entry in messages[] that contradicts the assistant's truncated response. The same pairing invariant that the PR just enforced for the other two error paths.

Fix

Mirror the cleanup loops already present at lines 564-569 and 599-604. In StreamModelWithTools at handler.go:510, before writing the finish chunk:

if len(toolRequests) == 0 || finishReason != "tool-calls" {
    for _, tr := range toolRequests {
        _ = ew.WriteChunk(Chunk{
            "type": "tool-output-error", "toolCallId": tr.ID,
            "errorText": "stream ended without tool resolution; tool call discarded",
        })
    }
    _ = ew.WriteChunk(Chunk{"type": "finish", "finishReason": finishReason})
    _ = ew.WriteDone()
    return
}

A companion test TestStreamModelWithTools_FinishLengthPairsToolChunks that feeds PartToolRequest → StreamEndEvent{FinishReason: Length} and asserts a tool-output-error precedes the finish would lock this in.

Expand All @@ -476,7 +514,7 @@ func StreamModelWithTools(ctx context.Context, model llm.Model, req *llm.Request

messages = append(messages, llm.Message{Role: llm.RoleAssistant, Content: assistantParts})

if err := executeTools(ctx, toolRequests, &messages, ew, executor); err != nil {
if err := executeTools(ctx, toolRequests, &messages, ew, logger, executor); err != nil {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟣 🟡 Pre-existing bug (from #131, not introduced by this PR): In StreamModelWithTools, text and reasoning emitted alongside a tool call in the same model turn are streamed to the client but not preserved in the conversation history passed to the next model turn. The assistant message at handler.go:509-514 is built solely from toolRequests — any preamble like "Let me check the weather..." or reasoning the model just generated is invisible to it on the next call. Fix: accumulate text/reasoning parts in handleToolTurnPart and include them in assistantParts alongside the tool requests. Since this PR adds a Known limitations doc block covering inbound-history reconstruction, it's a natural moment to either fix this symmetric outbound gap or document it.

Extended reasoning...

What the bug is

In handleToolTurnPart (handler.go:580-617), the three content-part cases behave asymmetrically with respect to next-turn history:

case llm.PartText:
    if err := sw.writeTextDelta(e.Part.Text); err != nil { ... }   // streamed only
case llm.PartReasoning:
    if err := sw.writeReasoningDelta(e.Part.ReasoningTrace); err != nil { ... }  // streamed only
case llm.PartToolRequest:
    ...
    *toolRequests = append(*toolRequests, tr)   // ← accumulated
    if err := sw.writeToolRequest(tr); err != nil { ... }

Only PartToolRequest accumulates into the slice. Then in StreamModelWithTools (handler.go:509-514), the assistant message is constructed exclusively from tool requests:

assistantParts := make([]*llm.Part, 0, len(toolRequests))
for _, tr := range toolRequests {
    assistantParts = append(assistantParts, llm.NewToolRequestPart(tr))
}
messages = append(messages, llm.Message{Role: llm.RoleAssistant, Content: assistantParts})

So the assistant turn the model sees on iteration N+1 contains only its tool_use blocks — its own preceding text and reasoning are gone.

Specific code path that triggers it

A real model commonly emits something like [Text("Let me look that up..."), ToolRequest(getWeather, {...})] in a single turn. After streamToolTurn returns, executeTools appends a RoleUser message containing only tool responses. The next call to model.GenerateEvents sees [user("weather?"), assistant(tool_use only), user(tool_responses)] — no preamble text, no reasoning trace.

Why existing code doesn't prevent it

The new TestStreamModelWithTools_HappyPath added in this PR uses a model that emits Text("calling tool") before the tool call, but only inspects the SSE chunk sequence — it never asserts on the inter-turn messages slice that gets passed back to GenerateEvents. So the regression silently passes. The conformance suite likewise focuses on wire chunks, not on history accumulation across turns.

Step-by-step proof

Consider a 2-turn agentic interaction with a model that emits text alongside its tool call. Initial req.Messages = [user("weather?")].

  1. Turn 1 begins. streamToolTurn called with messages = [user("weather?")].
  2. Model yields ContentPartEvent{Text("Let me check.")}writeTextDelta emits text-start, text-delta, text-end to the client. toolRequests remains empty.
  3. Model yields ContentPartEvent{ToolRequest{ID:"call-1", Name:"getWeather", Args:{...}}} → tool span chunks emitted, toolRequests = [tr1].
  4. Model yields StreamEndEvent{FinishReason: tool-calls}writeToolTurnEnd emits finish-step.
  5. Control returns to StreamModelWithTools. assistantParts = [ToolRequestPart(tr1)]text is dropped.
  6. messages = [user("weather?"), assistant(tool_use:tr1)].
  7. executeTools appends user(tool_response:call-1="72F sunny").
  8. Turn 2 begins. Model is called with messages = [user("weather?"), assistant(tool_use:tr1), user(tool_response)]. The assistant's prior "Let me check." preamble is invisible on this call. The model may now repeat its preamble ("Let me check the weather..."), lose continuity with the reasoning it just produced, or behave inconsistently with its prior statements.

Impact

  • Anthropic prompt-cache continuity breaks. Anthropic retains assistant text alongside tool_use in their content array; reconstructing the assistant turn without the preceding text invalidates cache prefix hits and increases token cost on subsequent turns of long agentic chains.
  • Strict content validators can reject. Some provider SDKs reject assistant messages that contain tool_use blocks with no associated text content when text was originally produced, depending on validation strictness.
  • Multi-turn agentic accuracy degrades. The new WithMaxTurns option (added in this PR, default 10) increases the blast radius — longer agentic chains amplify the divergence between what the model said and what it sees it said.
  • Reasoning trace continuity is also lost. For reasoning-capable models (e.g. Claude with thinking, OpenAI o-series), the inter-tool reasoning the model emits is silently dropped before the next turn, breaking the model's ability to build on its own chain of thought.

How to fix

Accumulate text and reasoning parts in handleToolTurnPart alongside the tool requests, and include them in the assistant Content. Sketch:

// In streamToolTurn:
var assistantContent []*llm.Part
// pass &assistantContent into handleToolTurnPart

// In handleToolTurnPart:
case llm.PartText:
    if err := sw.writeTextDelta(e.Part.Text); err != nil { return true }
    *assistantContent = append(*assistantContent, e.Part)
case llm.PartReasoning:
    if err := sw.writeReasoningDelta(e.Part.ReasoningTrace); err != nil { return true }
    *assistantContent = append(*assistantContent, e.Part)
case llm.PartToolRequest:
    ...
    *toolRequests = append(*toolRequests, tr)
    *assistantContent = append(*assistantContent, e.Part)
    ...

// In StreamModelWithTools, replace the assistantParts loop with:
messages = append(messages, llm.Message{Role: llm.RoleAssistant, Content: assistantContent})

Note: text deltas from the same span should ideally be concatenated into a single PartText per logical span (or the llm.Message format should permit multiple text parts, which it does).

Why this PR is the natural moment

This PR substantially modifies StreamModelWithTools (new maxTurns parameter, signature change, early-return guards on error, post-loop finish-step drop) and adds a package-level Known limitations block that explicitly enumerates the inbound tool-history gap ("only text parts are forwarded to the model"). The symmetric outbound gap — what we send back to the model after a tool turn — is not mentioned. Either fix it here or extend the doc block to cover it.

Comment thread
claude[bot] marked this conversation as resolved.
return
}
}
Expand All @@ -499,10 +537,6 @@ func streamToolTurn(
return "", nil
}

sw.textID = fmt.Sprintf("text-%d", sw.textCounter)
sw.textStarted = false
sw.reasoningStarted = false

var toolRequests []*llm.ToolRequest

iterReq := &llm.Request{
Expand All @@ -521,6 +555,8 @@ func streamToolTurn(

logger.Error("stream error", "error", err)

sw.closeSpans()
Comment thread
claude[bot] marked this conversation as resolved.

_ = ew.WriteChunk(Chunk{"type": "error", "errorText": "An error occurred"})
_ = ew.WriteChunk(Chunk{"type": "finish-step"})
_ = ew.WriteChunk(Chunk{"type": "finish", "finishReason": finishReasonError})
Expand Down Expand Up @@ -601,7 +637,7 @@ func writeToolTurnEnd(e llm.StreamEndEvent, sw *streamWriter, ew *EventWriter, l
return reason
}

func executeTools(ctx context.Context, toolRequests []*llm.ToolRequest, messages *[]llm.Message, ew *EventWriter, executor ToolExecutor) error {
func executeTools(ctx context.Context, toolRequests []*llm.ToolRequest, messages *[]llm.Message, ew *EventWriter, logger *slog.Logger, executor ToolExecutor) error {
toolResponseParts := make([]*llm.Part, 0, len(toolRequests))

for _, tr := range toolRequests {
Expand All @@ -620,7 +656,9 @@ func executeTools(ctx context.Context, toolRequests []*llm.ToolRequest, messages

var output any
if len(result) > 0 {
_ = json.Unmarshal(result, &output)
if err := json.Unmarshal(result, &output); err != nil {
logger.Warn("failed to unmarshal tool result", "toolCallId", tr.ID, "error", err)
}
}

if err := ew.WriteChunk(Chunk{
Expand Down
Loading
Loading