133 changes: 133 additions & 0 deletions agent/llmagent/empty_response_test.go
@@ -0,0 +1,133 @@
// Copyright 2026 Redpanda Data, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package llmagent_test

import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/redpanda-data/ai-sdk-go/agent"
"github.com/redpanda-data/ai-sdk-go/agent/llmagent"
"github.com/redpanda-data/ai-sdk-go/llm"
"github.com/redpanda-data/ai-sdk-go/llm/fakellm"
"github.com/redpanda-data/ai-sdk-go/store/session"
)

// TestRun_EmptyContentNotPersisted locks in the agent-loop guard against
// session poisoning by empty assistant messages.
//
// Sibling case to PR #116. Where #116 surfaced *truncated* tool args
// instead of corrupting state, this guards against *empty* content.
// When a provider returns a Response whose Message has zero content
// blocks (max_tokens before any block was emitted, the only block was
// a partial tool_use that stream finalisation correctly dropped, or
// refusal with no text emitted), Anthropic and most other providers
// reject any subsequent replay with `messages.N.content: Field
// required` — once the empty turn is in session state, every
// following call 400s and the conversation is wedged forever.
//
// The guard sits at the same place adk-go's AppendEvent honours its
// `Event.Partial` flag: the session-store boundary, the single
// chokepoint for "what gets persisted." The MessageEvent still fires
// (observers see what happened) and the FinishReason still propagates
// (terminal-reason handling fires below); only the persistence step
// is skipped.
func TestRun_EmptyContentNotPersisted(t *testing.T) {
t.Parallel()

model := fakellm.NewFakeModel()
model.When(fakellm.Any()).
ThenRespondWith(func(_ *llm.Request, _ *fakellm.CallContext) (*llm.Response, error) {
// max_tokens hit before any content block emitted: provider
// returns a Response with empty content + FinishReasonLength.
return &llm.Response{
Message: llm.Message{
Role: llm.RoleAssistant,
Content: nil,
},
FinishReason: llm.FinishReasonLength,
}, nil
})

ag, err := llmagent.New("test-agent", "You are a helpful assistant", model)
require.NoError(t, err)

sess := &session.State{
ID: "test-session",
Messages: []llm.Message{llm.NewMessage(llm.RoleUser, llm.NewTextPart("hello"))},
}
inv := agent.NewInvocationMetadata(sess, agent.Info{})

events := collectEvents(t, ag.Run(t.Context(), inv))

// FinishReason still propagates so callers can react.
endEvent := findInvocationEndEvent(events)
require.NotNil(t, endEvent)
assert.Equal(t, agent.FinishReasonLength, endEvent.FinishReason,
"finish reason must propagate even when content is empty")

// MessageEvent still fires so observers can see what happened.
messageEvents := filterEvents[agent.MessageEvent](events)
assert.NotEmpty(t, messageEvents, "MessageEvent must still fire so observers see the empty response")

// Session must NOT contain the empty assistant message — that's the
// whole point. Only the original user message stays.
require.Len(t, sess.Messages, 1, "empty assistant message must not be persisted to session")
assert.Equal(t, llm.RoleUser, sess.Messages[0].Role)

// Sanity: every persisted message has non-empty content. This is
// the invariant Anthropic et al. require — every message in a
// replayed array must have at least one content block.
for i, m := range sess.Messages {
assert.NotEmpty(t, m.Content, "session.Messages[%d] must have non-empty content", i)
}
}

// TestRun_NonEmptyContentStillPersisted is the negative control:
// the guard must not over-fire and skip messages that DO have content.
// If this regresses, normal turns stop being persisted and the agent
// loses all state.
func TestRun_NonEmptyContentStillPersisted(t *testing.T) {
t.Parallel()

model := fakellm.NewFakeModel()
model.When(fakellm.Any()).
ThenRespondWith(func(_ *llm.Request, _ *fakellm.CallContext) (*llm.Response, error) {
return &llm.Response{
Message: llm.Message{
Role: llm.RoleAssistant,
Content: []*llm.Part{llm.NewTextPart("hi back")},
},
FinishReason: llm.FinishReasonStop,
}, nil
})

ag, err := llmagent.New("test-agent", "You are a helpful assistant", model)
require.NoError(t, err)

sess := &session.State{
ID: "test-session",
Messages: []llm.Message{llm.NewMessage(llm.RoleUser, llm.NewTextPart("hello"))},
}
inv := agent.NewInvocationMetadata(sess, agent.Info{})

collectEvents(t, ag.Run(t.Context(), inv))

require.Len(t, sess.Messages, 2, "non-empty assistant message must be persisted")
assert.Equal(t, llm.RoleAssistant, sess.Messages[1].Role)
assert.NotEmpty(t, sess.Messages[1].Content)
}
24 changes: 22 additions & 2 deletions agent/llmagent/llmagent.go
@@ -269,8 +269,28 @@ func (a *LLMAgent) executeSingleTurn(
// Update usage tracking
agent.AddUsage(inv, resp.Usage)

// Add assistant message to session (single source of truth)
sess.Messages = append(sess.Messages, resp.Message)
// Add assistant message to session (single source of truth).
//
// Skip persistence when Content is empty. Sources we have observed:
// - max_tokens hit before any content block was emitted
// - the only content block was a partial tool_use that stream
// finalisation correctly dropped (PR #116)
// - refusal / safety filter with no text emitted
//
// In all of these the FinishReason carries the truth (Length,
// ContentFilter, etc.) and the terminal-reason handling below
// surfaces it. The MessageEvent below still fires so observers
// see what happened. Persisting the empty Message would poison
// every subsequent replay — Anthropic and most other providers
// reject `messages.N.content` arrays with `Field required` and
// the conversation is permanently wedged.
//
// Same shape as adk-go's `AppendEvent` check on `Event.Partial`:
// the session-store boundary is the right place to drop responses
// that carry no usable content.
if len(resp.Message.Content) > 0 {
sess.Messages = append(sess.Messages, resp.Message)
}

// Emit message event
if !yield(agent.MessageEvent{
139 changes: 139 additions & 0 deletions docs/empty-content-persistence.md
@@ -0,0 +1,139 @@
# Empty-content responses: open design question

Status: parked. PR #123 ships the minimal fix — agent-loop guard at the
session-store boundary. The deeper question of where the invariant
should live is not resolved.

## The bug

After PR #116 landed (drop partial `tool_use` blocks at stream
finalisation), an unrelated wedge surfaced in production: agent
sessions started failing every call with

```
messages.15.content: Field required
```

from Anthropic. The cause: a previous turn finalised with `Content =
[]` (max_tokens hit before any non-partial block was emitted, or the
only block was a partial tool_use that #116 correctly dropped, or a
refusal with no text). The agent loop's `sess.Messages = append(...)`
ran *before* the FinishReason terminal-handling check, so the empty
`Message` landed in session state. Every subsequent replay then sent
that empty array to Anthropic, which 400'd. Pod restarts didn't help;
the session store is precisely the thing that persists.
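
A minimal sketch of the resulting session state, using the SDK's message
types (the user text is illustrative):

```go
// Roughly what the poisoned state looks like. The second entry has no
// content blocks, so every replay of this session now fails provider-side
// validation before the model ever runs.
sess.Messages = []llm.Message{
	llm.NewMessage(llm.RoleUser, llm.NewTextPart("run the report")),
	{Role: llm.RoleAssistant, Content: nil}, // the empty turn Anthropic rejects
}
```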

## What we shipped (PR #123)

One-line guard at `agent/llmagent/llmagent.go:273`:

```go
if len(resp.Message.Content) > 0 {
sess.Messages = append(sess.Messages, resp.Message)
}
```

The MessageEvent still fires; the FinishReason still propagates; the
terminal-reason handling still terminates the loop cleanly. Only
persistence is skipped.

## What we considered and rejected

### A. Provider returns error on empty content

Move the check into each `response_mapper`: if `len(content) == 0`,
return `ErrResponseMapping`. The agent loop's existing terminal-error
path then fires (`generate()` returns err, loop exits cleanly).
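
Roughly, each mapper would grow a check like this (the surrounding
function and error wrapping are illustrative, not the actual mapper code):

```go
// Sketch only: turn "no content blocks" into a mapping error.
if len(content) == 0 {
	return nil, fmt.Errorf("%w: response contained no content blocks", ErrResponseMapping)
}
```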

Rejected because it swallows the FinishReason signal. `Length + empty`
is a real, valid state (the model ran out of budget before producing
anything) — surfacing it as a generic mapping error loses the
information the caller needs to decide whether to retry with higher
`max_tokens`. Same for `ContentFilter + empty` (refusal). The agent
loop's existing handling for these reasons is correct and should keep
firing.

### B. Provider error only on `Stop + empty`

Conditional version of (A): only error when the stop reason is `Stop`
(end_turn / clean stop) with empty content, since *that* is the
genuinely anomalous case. Length and ContentFilter with empty content
flow through normally.

Rejected as speculative paranoia: `Stop + empty` is theoretical and
hasn't been observed in the wild. Adding code for a phantom case adds
complexity without value. If it ever happens we can revisit.

### C. Add a `Partial bool` field to `llm.Response`

Mirror adk-go's `Event.Partial` flag. Providers set it when the
response is a non-result; the agent loop respects the flag at the
persistence boundary.

Rejected as redundant. adk-go needs Partial because they expose
streaming chunks at the API surface — they have to distinguish
"chunk-with-content" from "complete." This SDK's agent loop sees only
the final aggregated `Response`, so `Partial == (len(Content) == 0)`
in our architecture. The flag adds ceremony without distinguishing
power.

### D. Key persistence on FinishReason instead of content length

"Skip persistence when FinishReason == Length."

Rejected because Length with non-empty content is valid and worth
persisting — e.g., the model completed 3 of 4 parallel tool calls
before hitting `max_tokens` (PR #116's central scenario). The caller
wants those 3 completed tool_use blocks to inspect. Length alone
isn't synonymous with "unusable result"; `len(Content) == 0` is.

## The open question

The session-store boundary is the *right place* to honour "non-result
responses don't get persisted." That part is settled. What's not
settled:

1. **Should providers also be involved?** Right now, the invariant is
enforced only in `llmagent.go`. A non-llmagent caller that drives
`model.Generate()` directly and persists the result has to know to
apply the same guard. That's not great encapsulation. Options:

- Document the invariant: "callers must skip persistence when
`len(resp.Message.Content) == 0`." Cheapest. Relies on docs.
- Add a helper: `resp.IsPersistable() bool` returning
`len(Content) > 0`. Self-documenting at call sites (see the
sketch after this list).
- Move the guard into the session store itself (any
`sess.Append(msg)` helper rejects empty Content). Requires
introducing a session abstraction the SDK doesn't currently
own at this layer.

2. **Should we have telemetry?** Right now empty-content responses
silently disappear into the void. A counter/log would tell us how
often this fires and whether some provider/scenario is leaking
them at an unexpected rate. Cheap to add, useful for spotting
regressions (also sketched after this list).

3. **Is there a class of "incomplete" responses that don't have
`len(Content) == 0` but still shouldn't be persisted?** PR #116's
reproducer (truncated tool args yielding `unexpected end of JSON
input`) was such a case before the fix. Are there other shapes we
haven't seen yet? Hard to know without more production exposure.
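
For the helper option under point 1, a hedged sketch of what it could
look like (this method does not exist in the SDK today; name and
placement are open):

```go
// IsPersistable reports whether the response carries content worth
// writing to session state, centralising the len(Content) > 0
// invariant so non-llmagent callers don't have to rediscover it.
func (r *Response) IsPersistable() bool {
	return len(r.Message.Content) > 0
}
```

The agent-loop guard would then read `if resp.IsPersistable() { ... }`
instead of checking the length inline.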
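
For point 2, the cheapest version is a structured log (or counter)
emitted at the same guard. A sketch, using the standard library's
`log/slog` purely for illustration; the SDK may prefer its own metrics
hook:

```go
// Make dropped empty-content responses visible instead of silent,
// so we can see how often this fires and with which finish reason.
if len(resp.Message.Content) == 0 {
	slog.Warn("dropping empty assistant response from session",
		"session_id", sess.ID,
		"finish_reason", resp.FinishReason)
} else {
	sess.Messages = append(sess.Messages, resp.Message)
}
```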

## Move on

For now: PR #123 fixes the production bug. The architectural
improvements above are not blocking. Revisit when:

- Telemetry shows empty-content responses are happening at a
non-trivial rate, or
- A non-llmagent caller hits the same wedge, or
- A new "incomplete" response shape surfaces that the simple
`len(Content) == 0` check doesn't catch.

## References

- PR #116 — surface truncation instead of corrupting state when
tool_use is cut off (the sibling fix)
- PR #123 — the agent-loop guard shipped here
- adk-go `session/inmemory.go::AppendEvent` — prior art for
guarding the persistence boundary on a "not-final" signal
23 changes: 23 additions & 0 deletions llm/part.go
@@ -167,6 +167,29 @@ type ToolRequest struct {
Arguments json.RawMessage `json:"arguments"`
}

// ArgumentsAsObject decodes Arguments into a JSON object for provider APIs
// that require structured input (Anthropic tool_use.input, Bedrock tool_use
// input, Gemini function_call.args). Empty bytes decode to an empty object,
// which matches what providers actually send for no-arg tools. Invalid JSON
// also decodes to an empty object rather than erroring: corrupt arguments
// can end up in session state when a streaming turn is cut short
// mid-accumulation (e.g. stop_reason=max_tokens during input_json_delta),
// and without this fallback every replay of that session fails message
// mapping with "unexpected end of JSON input". The paired tool_result
// already carries the original parse error so the model has context.
func (t *ToolRequest) ArgumentsAsObject() map[string]any {
if len(t.Arguments) == 0 {
return map[string]any{}
}

var input map[string]any
if err := json.Unmarshal(t.Arguments, &input); err != nil {
return map[string]any{}
}

return input
}

// ToolResponse represents the result of executing a tool.
// This is sent back to the model to continue the conversation.
type ToolResponse struct {
50 changes: 50 additions & 0 deletions llm/part_test.go
@@ -0,0 +1,50 @@
// Copyright 2026 Redpanda Data, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package llm_test

import (
"encoding/json"
"testing"

"github.com/stretchr/testify/assert"

"github.com/redpanda-data/ai-sdk-go/llm"
)

func TestToolRequest_ArgumentsAsObject(t *testing.T) {
t.Parallel()

cases := []struct {
name string
in json.RawMessage
want map[string]any
}{
{"nil", nil, map[string]any{}},
{"empty", json.RawMessage(""), map[string]any{}},
{"truncated", json.RawMessage(`{"query":`), map[string]any{}},
{"not an object", json.RawMessage(`"oops"`), map[string]any{}},
{"valid empty", json.RawMessage(`{}`), map[string]any{}},
{"valid", json.RawMessage(`{"query":"SELECT 1"}`), map[string]any{"query": "SELECT 1"}},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()

tr := &llm.ToolRequest{Arguments: tc.in}
assert.Equal(t, tc.want, tr.ArgumentsAsObject())
})
}
}