diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 8328c691e0..49a7678eed 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -202,6 +202,12 @@ func registerSharedTools( agent.Tools.Register(sendFileTool) } + // Read image tool (converts local images to base64 for LLM recognition) + if cfg.Tools.IsToolEnabled("read_image") { + readImageTool := tools.NewReadImageTool(int64(cfg.Agents.Defaults.GetMaxMediaSize())) + agent.Tools.Register(readImageTool) + } + // Skill discovery and installation tools skills_enabled := cfg.Tools.IsToolEnabled("skills") find_skills_enable := cfg.Tools.IsToolEnabled("find_skills") @@ -1357,25 +1363,55 @@ func (al *AgentLoop) runLLMIteration( }) } - // If tool returned media refs, publish them as outbound media + // Handle media data based on MediaDispatch type if len(r.result.Media) > 0 { - parts := make([]bus.MediaPart, 0, len(r.result.Media)) - for _, ref := range r.result.Media { - part := bus.MediaPart{Ref: ref} - if al.mediaStore != nil { - if _, meta, err := al.mediaStore.ResolveWithMeta(ref); err == nil { - part.Filename = meta.Filename - part.ContentType = meta.ContentType - part.Type = inferMediaType(meta.Filename, meta.ContentType) + switch r.result.MediaDispatch { + case tools.MediaDispatchToLLM: + // Media is base64-encoded data for LLM analysis + // The data will be included in the tool result message for LLM context + logger.DebugCF("agent", "Tool returned media for LLM analysis", + map[string]any{ + "tool": r.tc.Name, + "media_count": len(r.result.Media), + }) + + case tools.MediaDispatchOutbound, "": + // Default: media refs for external channels + parts := make([]bus.MediaPart, 0, len(r.result.Media)) + for _, ref := range r.result.Media { + part := bus.MediaPart{Ref: ref} + if al.mediaStore != nil { + if _, meta, err := al.mediaStore.ResolveWithMeta(ref); err == nil { + part.Filename = meta.Filename + part.ContentType = meta.ContentType + part.Type = inferMediaType(meta.Filename, meta.ContentType) + } } + 
parts = append(parts, part) + } + al.bus.PublishOutboundMedia(ctx, bus.OutboundMediaMessage{ + Channel: opts.Channel, + ChatID: opts.ChatID, + Parts: parts, + }) + + default: + // Unknown dispatch type, log warning and default to outbound + logger.WarnCF("agent", "Unknown media dispatch type, defaulting to outbound", + map[string]any{ + "tool": r.tc.Name, + "dispatch_type": r.result.MediaDispatch, + }) + parts := make([]bus.MediaPart, 0, len(r.result.Media)) + for _, ref := range r.result.Media { + parts = append(parts, bus.MediaPart{Ref: ref}) } - parts = append(parts, part) + al.bus.PublishOutboundMedia(ctx, bus.OutboundMediaMessage{ + Channel: opts.Channel, + ChatID: opts.ChatID, + Parts: parts, + }) } - al.bus.PublishOutboundMedia(ctx, bus.OutboundMediaMessage{ - Channel: opts.Channel, - ChatID: opts.ChatID, - Parts: parts, - }) } // Determine content for LLM based on tool result @@ -1384,11 +1420,24 @@ func (al *AgentLoop) runLLMIteration( contentForLLM = r.result.Err.Error() } + // Build tool result message based on MediaDispatch toolResultMsg := providers.Message{ Role: "tool", - Content: contentForLLM, ToolCallID: r.tc.ID, } + + // Set Content and Media based on dispatch type + if r.result.MediaDispatch == tools.MediaDispatchToLLM && len(r.result.Media) > 0 { + // For LLM-bound media, mark content as [image] and include base64 data + toolResultMsg.Content = "[image]" + toolResultMsg.Media = r.result.Media + } else { + toolResultMsg.Content = contentForLLM + if len(r.result.Media) > 0 && r.result.MediaDispatch != tools.MediaDispatchToLLM { + toolResultMsg.Media = r.result.Media + } + } + messages = append(messages, toolResultMsg) // Save tool result message to session diff --git a/pkg/tools/read_image.go b/pkg/tools/read_image.go new file mode 100644 index 0000000000..875400b124 --- /dev/null +++ b/pkg/tools/read_image.go @@ -0,0 +1,154 @@ +package tools + +import ( + "bytes" + "context" + "encoding/base64" + "fmt" + "io" + "os" + "strings" + + 
"github.com/h2non/filetype" +) + +// ReadImageTool reads local image files and converts them to base64-encoded data URLs. +// This enables local images to be recognized by LLMs for image analysis tasks. +type ReadImageTool struct { + maxSize int64 +} + +// NewReadImageTool creates a new ReadImageTool instance with the specified max file size. +// If maxSize is 0 or negative, defaults to 10MB. +func NewReadImageTool(maxSize int64) *ReadImageTool { + if maxSize <= 0 { + maxSize = 10 * 1024 * 1024 // Default 10MB + } + return &ReadImageTool{ + maxSize: maxSize, + } +} + +// Name returns the tool name. +func (t *ReadImageTool) Name() string { + return "read_image" +} + +// Description returns the tool description for LLM function calling. +func (t *ReadImageTool) Description() string { + return "Read a local image file and convert it to base64-encoded format for LLM image recognition and analysis. " + + "Supports common formats: jpg, jpeg, png, gif, webp, bmp. " + + "The image will be sent to the LLM for content analysis." +} + +// Parameters returns the JSON Schema parameter definition for the tool. +func (t *ReadImageTool) Parameters() map[string]any { + return map[string]any{ + "type": "object", + "properties": map[string]any{ + "path": map[string]any{ + "type": "string", + "description": "Full path to the local image file. Supports jpg, jpeg, png, gif, webp, bmp formats.", + }, + }, + "required": []string{"path"}, + } +} + +// Execute reads the image file, validates it, and converts to base64 data URL. +// Returns a ToolResult with Media containing the base64 data and MediaDispatch set to MediaDispatchToLLM. 
+// Paths that do not refer to a regular file (directories, FIFOs, device nodes)
+// are rejected before any data is read. Every failure is reported through
+// ErrorResult; ctx is accepted to satisfy the tool signature and is not consulted.
+func (t *ReadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
+	// The path must be present and a non-empty string.
+	path, ok := args["path"].(string)
+	if !ok || path == "" {
+		return ErrorResult("path parameter is required and must be a string")
+	}
+
+	// Stat first so that a missing file gets its own, clearer message.
+	info, err := os.Stat(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return ErrorResult(fmt.Sprintf("file not found: %s", path))
+		}
+		return ErrorResult(fmt.Sprintf("failed to access file: %v", err))
+	}
+
+	// Size() is only meaningful for regular files; anything else would slip
+	// past the size check below and could be read without a sensible bound.
+	if !info.Mode().IsRegular() {
+		return ErrorResult(fmt.Sprintf("not a regular file: %s", path))
+	}
+
+	// Enforce the configured size limit before touching the file content.
+	if info.Size() > t.maxSize {
+		return ErrorResult(fmt.Sprintf(
+			"file too large: %d bytes (max: %d bytes, ~%d MB)",
+			info.Size(), t.maxSize, t.maxSize/(1024*1024),
+		))
+	}
+
+	// Detect the MIME type and make sure it is an image type.
+	mime, err := detectImageMIME(path)
+	if err != nil {
+		return ErrorResult(fmt.Sprintf("failed to detect file type: %v", err))
+	}
+	if !strings.HasPrefix(mime, "image/") {
+		return ErrorResult(fmt.Sprintf("not an image file: %s (detected: %s)", path, mime))
+	}
+
+	// encodeImageToDataURL takes its limit as an int. Clamp instead of
+	// converting blindly so that a large configured limit cannot wrap to a
+	// negative value on platforms where int is 32 bits wide.
+	const maxInt = int(^uint(0) >> 1)
+	limit := maxInt
+	if t.maxSize < int64(maxInt) {
+		limit = int(t.maxSize)
+	}
+
+	// Encode to a base64 data URL.
+	dataURL, err := encodeImageToDataURL(path, mime, info, limit)
+	if err != nil {
+		return ErrorResult(fmt.Sprintf("failed to encode image: %v", err))
+	}
+	if dataURL == "" {
+		return ErrorResult("failed to encode image: empty result")
+	}
+
+	// MediaDispatchToLLM marks Media as data for the LLM rather than as
+	// media store refs.
+	return &ToolResult{
+		ForLLM:        fmt.Sprintf("Image loaded successfully: %s (%s, %d bytes)", path, mime, info.Size()),
+		ForUser:       fmt.Sprintf("Image loaded: %s", path),
+		Media:         []string{dataURL},
+		MediaDispatch: MediaDispatchToLLM,
+		Silent:        false,
+		IsError:       false,
+	}
+}
+
+// detectImageMIME detects the MIME type of an image file using magic bytes.
+func detectImageMIME(path string) (string, error) {
+	kind, err := filetype.MatchFile(path)
+	if err != nil {
+		return "", err
+	}
+	// An unrecognized type is reported as an error rather than handed back
+	// to the caller as a MIME string.
+	if kind == filetype.Unknown {
+		return "", fmt.Errorf("unknown file type")
+	}
+	return kind.MIME.Value, nil
+}
+
+// encodeImageToDataURL reads the file at localPath and returns it as a
+// "data:<mime>;base64,<payload>" URL.
+//
+// The file is base64-encoded while it is being read, so the raw bytes are not
+// held in memory as a separate copy; the encoded result is buffered in full.
+// At most maxSize bytes are consumed. An error is returned when info reports a
+// larger size, when the file holds more data than maxSize by the time it is
+// read, or when opening, reading or encoding fails.
+func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) (string, error) {
+	if info.Size() > int64(maxSize) {
+		return "", fmt.Errorf("file too large: %d bytes", info.Size())
+	}
+
+	f, err := os.Open(localPath)
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	prefix := "data:" + mime + ";base64,"
+	var buf bytes.Buffer
+	// Reserve the expected final size up front so the buffer does not have to
+	// regrow while encoding. The conversion is safe: Size() <= maxSize here.
+	buf.Grow(len(prefix) + base64.StdEncoding.EncodedLen(int(info.Size())))
+	buf.WriteString(prefix)
+
+	// info may be stale by now (the file can grow after it was stat'ed), so
+	// cap the read at maxSize instead of trusting the reported size.
+	src := &io.LimitedReader{R: f, N: int64(maxSize)}
+	encoder := base64.NewEncoder(base64.StdEncoding, &buf)
+	if _, err := io.Copy(encoder, src); err != nil {
+		return "", err
+	}
+	// Close flushes the final, partially filled base64 group; a failure here
+	// would otherwise go unnoticed and leave a truncated payload.
+	if err := encoder.Close(); err != nil {
+		return "", err
+	}
+
+	// The cap was reached: one more readable byte means the file is larger
+	// than maxSize. A read error (typically io.EOF) just means nothing is left.
+	if src.N <= 0 {
+		var probe [1]byte
+		if n, _ := f.Read(probe[:]); n > 0 {
+			return "", fmt.Errorf("file too large: more than %d bytes", maxSize)
+		}
+	}
+
+	return buf.String(), nil
+}
diff --git a/pkg/tools/result.go b/pkg/tools/result.go
index cab8332846..4b58067a2b 100644
--- a/pkg/tools/result.go
+++ b/pkg/tools/result.go
@@ -2,6 +2,20 @@ package tools
 
 import "encoding/json"
 
+// MediaDispatchType defines how media data should be dispatched.
+// It determines whether media is sent to external channels or to the LLM for analysis.
+type MediaDispatchType string
+
+const (
+	// MediaDispatchOutbound sends media to external channels (e.g., Feishu, Discord).
+	// This is the default behavior for media store refs.
+	MediaDispatchOutbound MediaDispatchType = "OutboundMediaMessage"
+
+	// MediaDispatchToLLM sends media to the LLM for recognition and analysis.
+	// Used when media contains base64-encoded data for multimodal LLMs.
+	MediaDispatchToLLM MediaDispatchType = "SendToLLM"
+)
+
 // ToolResult represents the structured return value from tool execution.
// It provides clear semantics for different types of results and supports // async operations, user-facing messages, and error handling. @@ -31,9 +45,16 @@ type ToolResult struct { // Used for internal error handling and logging. Err error `json:"-"` - // Media contains media store refs produced by this tool. - // When non-empty, the agent will publish these as OutboundMediaMessage. + // Media contains media data produced by this tool. + // The content type depends on MediaDispatch: + // - MediaDispatchOutbound: media store refs (original behavior) + // - MediaDispatchToLLM: base64-encoded media data (e.g., "data:image/png;base64,xxx") Media []string `json:"media,omitempty"` + + // MediaDispatch specifies how media should be dispatched. + // - "OutboundMediaMessage": send to external channels (default) + // - "SendToLLM": inject into LLM context for analysis + MediaDispatch MediaDispatchType `json:"media_dispatch,omitempty"` } // NewToolResult creates a basic ToolResult with content for the LLM.