diff --git a/src/bot/features/image_handler.py b/src/bot/features/image_handler.py index f9cf3798..5b6397b9 100644 --- a/src/bot/features/image_handler.py +++ b/src/bot/features/image_handler.py @@ -9,13 +9,17 @@ """ import base64 -from dataclasses import dataclass +import uuid +from dataclasses import dataclass, field +from pathlib import Path from typing import Dict, Optional from telegram import PhotoSize from src.config import Settings +_TEMP_DIR = Path("/tmp/claude_bot_files") + @dataclass class ProcessedImage: @@ -23,9 +27,9 @@ class ProcessedImage: prompt: str image_type: str - base64_data: str size: int - metadata: Dict[str, any] = None + metadata: Dict[str, object] = field(default_factory=dict) + base64_data: str = "" class ImageHandler: @@ -38,37 +42,51 @@ def __init__(self, config: Settings): async def process_image( self, photo: PhotoSize, caption: Optional[str] = None ) -> ProcessedImage: - """Process uploaded image""" - - # Download image + """Process uploaded image — save to temp file and build a path-based prompt.""" + # Download image bytes file = await photo.get_file() - image_bytes = await file.download_as_bytearray() - - # Detect image type + image_bytes = bytes(await file.download_as_bytearray()) + + # Detect format and save to temp file so Claude CLI can read it + fmt = self._detect_format(image_bytes) + if fmt == "unknown": + raise ValueError( + "Unsupported image format. Please upload a PNG, JPEG, GIF, or WebP image." + ) + _TEMP_DIR.mkdir(mode=0o700, exist_ok=True) + image_path = _TEMP_DIR / f"image_{uuid.uuid4()}.{fmt}" + try: + image_path.write_bytes(image_bytes) + image_path.chmod(0o600) + except Exception: + image_path.unlink(missing_ok=True) + raise + + # Detect image type for prompt tailoring image_type = self._detect_image_type(image_bytes) - # Create appropriate prompt + # Build prompt with actual file path so Claude CLI can see the image if image_type == "screenshot": - prompt = self._create_screenshot_prompt(caption) + prompt = self._create_screenshot_prompt(caption, image_path) elif image_type == "diagram": - prompt = self._create_diagram_prompt(caption) + prompt = self._create_diagram_prompt(caption, image_path) elif image_type == "ui_mockup": - prompt = self._create_ui_prompt(caption) + prompt = self._create_ui_prompt(caption, image_path) else: - prompt = self._create_generic_prompt(caption) + prompt = self._create_generic_prompt(caption, image_path) - # Convert to base64 for Claude (if supported in future) + # Retained for future multimodal SDK support — not currently used base64_image = base64.b64encode(image_bytes).decode("utf-8") - return ProcessedImage( prompt=prompt, image_type=image_type, - base64_data=base64_image, size=len(image_bytes), metadata={ - "format": self._detect_format(image_bytes), + "format": fmt, "has_caption": caption is not None, + "temp_path": str(image_path), }, + base64_data=base64_image, ) def _detect_image_type(self, image_bytes: bytes) -> str: @@ -93,61 +111,35 @@ def _detect_format(self, image_bytes: bytes) -> str: else: return "unknown" - def _create_screenshot_prompt(self, caption: Optional[str]) -> str: + def _create_screenshot_prompt( + self, caption: Optional[str], image_path: Path + ) -> str: """Create prompt for screenshot analysis""" - base_prompt = """I'm sharing a screenshot with you. Please analyze it and help me with: - -1. Identifying what application or website this is from -2. Understanding the UI elements and their purpose -3. Any issues or improvements you notice -4. Answering any specific questions I have - -""" + base = f"I'm sharing a screenshot with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease analyze it and help me with:\n1. Identifying what application or website this is from\n2. Understanding the UI elements and their purpose\n3. Any issues or improvements you notice\n4. Answering any specific questions I have\n" if caption: - base_prompt += f"Specific request: {caption}" + base += f"\nSpecific request: {caption}" + return base - return base_prompt - - def _create_diagram_prompt(self, caption: Optional[str]) -> str: + def _create_diagram_prompt(self, caption: Optional[str], image_path: Path) -> str: """Create prompt for diagram analysis""" - base_prompt = """I'm sharing a diagram with you. Please help me: - -1. Understand the components and their relationships -2. Identify the type of diagram (flowchart, architecture, etc.) -3. Explain any technical concepts shown -4. Suggest improvements or clarifications - -""" + base = f"I'm sharing a diagram with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease help me:\n1. Understand the components and their relationships\n2. Identify the type of diagram\n3. Explain any technical concepts shown\n4. Suggest improvements or clarifications\n" if caption: - base_prompt += f"Specific request: {caption}" - - return base_prompt + base += f"\nSpecific request: {caption}" + return base - def _create_ui_prompt(self, caption: Optional[str]) -> str: + def _create_ui_prompt(self, caption: Optional[str], image_path: Path) -> str: """Create prompt for UI mockup analysis""" - base_prompt = """I'm sharing a UI mockup with you. Please analyze: - -1. The layout and visual hierarchy -2. User experience considerations -3. Accessibility aspects -4. Implementation suggestions -5. Any potential improvements - -""" + base = f"I'm sharing a UI mockup with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease analyze:\n1. The layout and visual hierarchy\n2. UX improvements\n3. Accessibility concerns\n" if caption: - base_prompt += f"Specific request: {caption}" - - return base_prompt + base += f"\nSpecific request: {caption}" + return base - def _create_generic_prompt(self, caption: Optional[str]) -> str: + def _create_generic_prompt(self, caption: Optional[str], image_path: Path) -> str: """Create generic image analysis prompt""" - base_prompt = """I'm sharing an image with you. Please analyze it and provide relevant insights. - -""" + base = f"I'm sharing an image with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease analyze and describe what you see.\n" if caption: - base_prompt += f"Context: {caption}" - - return base_prompt + base += f"\nSpecific request: {caption}" + return base def supports_format(self, filename: str) -> bool: """Check if image format is supported""" diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py index ac1d5304..9ff0b81c 100644 --- a/src/bot/orchestrator.py +++ b/src/bot/orchestrator.py @@ -1276,6 +1276,7 @@ async def agentic_photo( await chat.send_action("typing") progress_msg = await update.message.reply_text("Working...") + processed_image = None try: photo = update.message.photo[-1] processed_image = await image_handler.process_image( @@ -1297,6 +1298,11 @@ async def agentic_photo( logger.error( "Claude photo processing failed", error=str(e), user_id=user_id ) + finally: + if processed_image is not None: + temp_path = processed_image.metadata.get("temp_path") + if temp_path: + Path(temp_path).unlink(missing_ok=True) async def agentic_voice( self, update: Update, context: ContextTypes.DEFAULT_TYPE