Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 65 additions & 16 deletions pkg/agent/loop.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ func registerSharedTools(
agent.Tools.Register(sendFileTool)
}

// Read image tool (converts local images to base64 for LLM recognition)
if cfg.Tools.IsToolEnabled("read_image") {
readImageTool := tools.NewReadImageTool(int64(cfg.Agents.Defaults.GetMaxMediaSize()))
agent.Tools.Register(readImageTool)
}

// Skill discovery and installation tools
skills_enabled := cfg.Tools.IsToolEnabled("skills")
find_skills_enable := cfg.Tools.IsToolEnabled("find_skills")
Expand Down Expand Up @@ -1357,25 +1363,55 @@ func (al *AgentLoop) runLLMIteration(
})
}

// If tool returned media refs, publish them as outbound media
// Handle media data based on MediaDispatch type
if len(r.result.Media) > 0 {
parts := make([]bus.MediaPart, 0, len(r.result.Media))
for _, ref := range r.result.Media {
part := bus.MediaPart{Ref: ref}
if al.mediaStore != nil {
if _, meta, err := al.mediaStore.ResolveWithMeta(ref); err == nil {
part.Filename = meta.Filename
part.ContentType = meta.ContentType
part.Type = inferMediaType(meta.Filename, meta.ContentType)
switch r.result.MediaDispatch {
case tools.MediaDispatchToLLM:
// Media is base64-encoded data for LLM analysis
// The data will be included in the tool result message for LLM context
logger.DebugCF("agent", "Tool returned media for LLM analysis",
map[string]any{
"tool": r.tc.Name,
"media_count": len(r.result.Media),
})

case tools.MediaDispatchOutbound, "":
// Default: media refs for external channels
parts := make([]bus.MediaPart, 0, len(r.result.Media))
for _, ref := range r.result.Media {
part := bus.MediaPart{Ref: ref}
if al.mediaStore != nil {
if _, meta, err := al.mediaStore.ResolveWithMeta(ref); err == nil {
part.Filename = meta.Filename
part.ContentType = meta.ContentType
part.Type = inferMediaType(meta.Filename, meta.ContentType)
}
}
parts = append(parts, part)
}
al.bus.PublishOutboundMedia(ctx, bus.OutboundMediaMessage{
Channel: opts.Channel,
ChatID: opts.ChatID,
Parts: parts,
})

default:
// Unknown dispatch type, log warning and default to outbound
logger.WarnCF("agent", "Unknown media dispatch type, defaulting to outbound",
map[string]any{
"tool": r.tc.Name,
"dispatch_type": r.result.MediaDispatch,
})
parts := make([]bus.MediaPart, 0, len(r.result.Media))
for _, ref := range r.result.Media {
parts = append(parts, bus.MediaPart{Ref: ref})
}
parts = append(parts, part)
al.bus.PublishOutboundMedia(ctx, bus.OutboundMediaMessage{
Channel: opts.Channel,
ChatID: opts.ChatID,
Parts: parts,
})
}
al.bus.PublishOutboundMedia(ctx, bus.OutboundMediaMessage{
Channel: opts.Channel,
ChatID: opts.ChatID,
Parts: parts,
})
}

// Determine content for LLM based on tool result
Expand All @@ -1384,11 +1420,24 @@ func (al *AgentLoop) runLLMIteration(
contentForLLM = r.result.Err.Error()
}

// Build tool result message based on MediaDispatch
toolResultMsg := providers.Message{
Role: "tool",
Content: contentForLLM,
ToolCallID: r.tc.ID,
}

// Set Content and Media based on dispatch type
if r.result.MediaDispatch == tools.MediaDispatchToLLM && len(r.result.Media) > 0 {
// For LLM-bound media, mark content as [image] and include base64 data
toolResultMsg.Content = "[image]"
toolResultMsg.Media = r.result.Media
} else {
toolResultMsg.Content = contentForLLM
if len(r.result.Media) > 0 && r.result.MediaDispatch != tools.MediaDispatchToLLM {
toolResultMsg.Media = r.result.Media
}
}

messages = append(messages, toolResultMsg)

// Save tool result message to session
Expand Down
154 changes: 154 additions & 0 deletions pkg/tools/read_image.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
package tools

import (
"bytes"
"context"
"encoding/base64"
"fmt"
"io"
"os"
"strings"

"github.com/h2non/filetype"
)

// ReadImageTool is a tool that loads an image from the local filesystem and
// exposes it as a base64-encoded data URL so an LLM can analyze it.
type ReadImageTool struct {
	// maxSize is the largest file size, in bytes, that the tool accepts.
	maxSize int64
}

// NewReadImageTool builds a ReadImageTool that accepts files of at most
// maxSize bytes. A maxSize of zero or less selects the 10MB default.
func NewReadImageTool(maxSize int64) *ReadImageTool {
	const defaultMaxSize int64 = 10 << 20 // 10MB

	limit := maxSize
	if limit < 1 {
		limit = defaultMaxSize
	}

	tool := new(ReadImageTool)
	tool.maxSize = limit
	return tool
}

// Name reports the identifier under which this tool is registered.
func (t *ReadImageTool) Name() string {
	const toolName = "read_image"
	return toolName
}

// Description returns the human-readable summary of the tool that is offered
// to the LLM for function calling.
func (t *ReadImageTool) Description() string {
	const (
		purpose = "Read a local image file and convert it to base64-encoded format for LLM image recognition and analysis. "
		formats = "Supports common formats: jpg, jpeg, png, gif, webp, bmp. "
		outcome = "The image will be sent to the LLM for content analysis."
	)
	return purpose + formats + outcome
}

// Parameters describes the tool's arguments as a JSON Schema object: a single
// required string property named "path".
func (t *ReadImageTool) Parameters() map[string]any {
	pathSchema := map[string]any{
		"type":        "string",
		"description": "Full path to the local image file. Supports jpg, jpeg, png, gif, webp, bmp formats.",
	}

	schema := map[string]any{
		"type":       "object",
		"properties": map[string]any{"path": pathSchema},
		"required":   []string{"path"},
	}
	return schema
}

// Execute loads the image named by args["path"] and returns it as a base64
// data URL destined for the LLM.
//
// The path must be a non-empty string that refers to an existing regular file
// no larger than the tool's configured maximum size, and whose detected MIME
// type starts with "image/". Any violation yields an ErrorResult describing
// the problem. On success the returned ToolResult carries the data URL in
// Media with MediaDispatch set to MediaDispatchToLLM.
func (t *ReadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
	// Parse path parameter.
	path, ok := args["path"].(string)
	if !ok || path == "" {
		return ErrorResult("path parameter is required and must be a string")
	}

	// Check file existence.
	info, err := os.Stat(path)
	if err != nil {
		if os.IsNotExist(err) {
			return ErrorResult(fmt.Sprintf("file not found: %s", path))
		}
		return ErrorResult(fmt.Sprintf("failed to access file: %v", err))
	}

	// Only regular files are acceptable. Directories cannot be read as
	// images, and FIFOs or device nodes report a size of 0, so they would
	// slip past the size check below and could block or stream without bound
	// once read.
	if !info.Mode().IsRegular() {
		return ErrorResult(fmt.Sprintf("not a regular file: %s", path))
	}

	// Check file size.
	if info.Size() > t.maxSize {
		return ErrorResult(fmt.Sprintf(
			"file too large: %d bytes (max: %d bytes, ~%d MB)",
			info.Size(), t.maxSize, t.maxSize/(1024*1024),
		))
	}

	// Detect the MIME type from the file contents.
	mime, err := detectImageMIME(path)
	if err != nil {
		return ErrorResult(fmt.Sprintf("failed to detect file type: %v", err))
	}

	// Validate it's an image.
	if !strings.HasPrefix(mime, "image/") {
		return ErrorResult(fmt.Sprintf("not an image file: %s (detected: %s)", path, mime))
	}

	// Encode to a base64 data URL.
	dataURL, err := encodeImageToDataURL(path, mime, info, int(t.maxSize))
	if err != nil {
		return ErrorResult(fmt.Sprintf("failed to encode image: %v", err))
	}

	// Defensive: never hand an empty media entry to the LLM.
	if dataURL == "" {
		return ErrorResult("failed to encode image: empty result")
	}

	// Build the result with MediaDispatch set to send the media to the LLM.
	return &ToolResult{
		ForLLM:        fmt.Sprintf("Image loaded successfully: %s (%s, %d bytes)", path, mime, info.Size()),
		ForUser:       fmt.Sprintf("Image loaded: %s", path),
		Media:         []string{dataURL},
		MediaDispatch: MediaDispatchToLLM,
		Silent:        false,
		IsError:       false,
	}
}

// detectImageMIME asks the filetype package to identify the file at path and
// returns the MIME value of the match. It returns an error when matching
// fails or when the file type is not recognized.
func detectImageMIME(path string) (string, error) {
	matched, matchErr := filetype.MatchFile(path)
	switch {
	case matchErr != nil:
		return "", matchErr
	case matched == filetype.Unknown:
		return "", fmt.Errorf("unknown file type")
	}
	return matched.MIME.Value, nil
}

// encodeImageToDataURL reads the file at localPath and returns it as a
// "data:<mime>;base64,<payload>" URL.
//
// info is the caller's earlier stat of the file; it is used for a cheap
// up-front size check and to pre-size the output buffer. maxSize is the
// largest number of file bytes that may be encoded. The limit is also
// enforced while reading, so a file that has grown since info was taken is
// rejected instead of being encoded in full.
//
// The file is fed through the base64 encoder in chunks, but the complete
// encoded result is held in memory.
func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) (string, error) {
	if info.Size() > int64(maxSize) {
		return "", fmt.Errorf("file too large: %d bytes", info.Size())
	}

	f, err := os.Open(localPath)
	if err != nil {
		return "", err
	}
	defer f.Close()

	prefix := "data:" + mime + ";base64,"
	var buf bytes.Buffer
	buf.Grow(len(prefix) + base64.StdEncoding.EncodedLen(int(info.Size())))
	buf.WriteString(prefix)

	// info may be stale, so cap the read at maxSize+1 bytes: one byte past
	// the limit is enough to detect an oversized file without reading all of
	// it. The cap is skipped only when limit+1 would overflow int64.
	var src io.Reader = f
	limit := int64(maxSize)
	if limit < 1<<63-1 {
		src = io.LimitReader(f, limit+1)
	}

	encoder := base64.NewEncoder(base64.StdEncoding, &buf)
	n, err := io.Copy(encoder, src)
	if err != nil {
		return "", err
	}
	if n > limit {
		return "", fmt.Errorf("file too large: more than %d bytes", maxSize)
	}
	// Close flushes any partially written final block; without it the
	// payload could be truncated, so its error must not be dropped.
	if err := encoder.Close(); err != nil {
		return "", err
	}

	return buf.String(), nil
}
25 changes: 23 additions & 2 deletions pkg/tools/result.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@ package tools

import "encoding/json"

// MediaDispatchType names the destination of the media produced by a tool:
// either external channels or the LLM itself.
type MediaDispatchType string

// Recognized MediaDispatchType values.
const (
	// MediaDispatchOutbound routes media to external channels (e.g., Feishu,
	// Discord). It is the default behavior and is used for media store refs.
	MediaDispatchOutbound = MediaDispatchType("OutboundMediaMessage")

	// MediaDispatchToLLM routes media to the LLM for recognition and
	// analysis. It is used when the media is base64-encoded data intended
	// for a multimodal LLM.
	MediaDispatchToLLM = MediaDispatchType("SendToLLM")
)

// ToolResult represents the structured return value from tool execution.
// It provides clear semantics for different types of results and supports
// async operations, user-facing messages, and error handling.
Expand Down Expand Up @@ -31,9 +45,16 @@ type ToolResult struct {
// Used for internal error handling and logging.
Err error `json:"-"`

// Media contains media store refs produced by this tool.
// When non-empty, the agent will publish these as OutboundMediaMessage.
// Media contains media data produced by this tool.
// The content type depends on MediaDispatch:
// - MediaDispatchOutbound: media store refs (original behavior)
// - MediaDispatchToLLM: base64-encoded media data (e.g., "data:image/png;base64,xxx")
Media []string `json:"media,omitempty"`

// MediaDispatch specifies how media should be dispatched.
// - "OutboundMediaMessage": send to external channels (default)
// - "SendToLLM": inject into LLM context for analysis
MediaDispatch MediaDispatchType `json:"media_dispatch,omitempty"`
}

// NewToolResult creates a basic ToolResult with content for the LLM.
Expand Down