Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 54 additions & 62 deletions src/bot/features/image_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,27 @@
"""

import base64
from dataclasses import dataclass
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Optional

from telegram import PhotoSize

from src.config import Settings

_TEMP_DIR = Path("/tmp/claude_bot_files")


@dataclass
class ProcessedImage:
"""Processed image result"""

prompt: str
image_type: str
base64_data: str
size: int
metadata: Dict[str, any] = None
metadata: Dict[str, object] = field(default_factory=dict)
base64_data: str = ""


class ImageHandler:
Expand All @@ -38,37 +42,51 @@ def __init__(self, config: Settings):
async def process_image(
self, photo: PhotoSize, caption: Optional[str] = None
) -> ProcessedImage:
"""Process uploaded image"""

# Download image
"""Process uploaded image — save to temp file and build a path-based prompt."""
# Download image bytes
file = await photo.get_file()
image_bytes = await file.download_as_bytearray()

# Detect image type
image_bytes = bytes(await file.download_as_bytearray())

# Detect format and save to temp file so Claude CLI can read it
fmt = self._detect_format(image_bytes)
if fmt == "unknown":
raise ValueError(
"Unsupported image format. Please upload a PNG, JPEG, GIF, or WebP image."
)
_TEMP_DIR.mkdir(mode=0o700, exist_ok=True)
image_path = _TEMP_DIR / f"image_{uuid.uuid4()}.{fmt}"
try:
image_path.write_bytes(image_bytes)
image_path.chmod(0o600)
except Exception:
image_path.unlink(missing_ok=True)
raise

# Detect image type for prompt tailoring
image_type = self._detect_image_type(image_bytes)

# Create appropriate prompt
# Build prompt with actual file path so Claude CLI can see the image
if image_type == "screenshot":
prompt = self._create_screenshot_prompt(caption)
prompt = self._create_screenshot_prompt(caption, image_path)
elif image_type == "diagram":
prompt = self._create_diagram_prompt(caption)
prompt = self._create_diagram_prompt(caption, image_path)
elif image_type == "ui_mockup":
prompt = self._create_ui_prompt(caption)
prompt = self._create_ui_prompt(caption, image_path)
else:
prompt = self._create_generic_prompt(caption)
prompt = self._create_generic_prompt(caption, image_path)

# Convert to base64 for Claude (if supported in future)
# Retained for future multimodal SDK support — not currently used
base64_image = base64.b64encode(image_bytes).decode("utf-8")

return ProcessedImage(
prompt=prompt,
image_type=image_type,
base64_data=base64_image,
size=len(image_bytes),
metadata={
"format": self._detect_format(image_bytes),
"format": fmt,
"has_caption": caption is not None,
"temp_path": str(image_path),
},
base64_data=base64_image,
)

def _detect_image_type(self, image_bytes: bytes) -> str:
Expand All @@ -93,61 +111,35 @@ def _detect_format(self, image_bytes: bytes) -> str:
else:
return "unknown"

def _create_screenshot_prompt(self, caption: Optional[str]) -> str:
def _create_screenshot_prompt(
self, caption: Optional[str], image_path: Path
) -> str:
"""Create prompt for screenshot analysis"""
base_prompt = """I'm sharing a screenshot with you. Please analyze it and help me with:

1. Identifying what application or website this is from
2. Understanding the UI elements and their purpose
3. Any issues or improvements you notice
4. Answering any specific questions I have

"""
base = f"I'm sharing a screenshot with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease analyze it and help me with:\n1. Identifying what application or website this is from\n2. Understanding the UI elements and their purpose\n3. Any issues or improvements you notice\n4. Answering any specific questions I have\n"
if caption:
base_prompt += f"Specific request: {caption}"
base += f"\nSpecific request: {caption}"
return base

return base_prompt

def _create_diagram_prompt(self, caption: Optional[str]) -> str:
def _create_diagram_prompt(self, caption: Optional[str], image_path: Path) -> str:
"""Create prompt for diagram analysis"""
base_prompt = """I'm sharing a diagram with you. Please help me:

1. Understand the components and their relationships
2. Identify the type of diagram (flowchart, architecture, etc.)
3. Explain any technical concepts shown
4. Suggest improvements or clarifications

"""
base = f"I'm sharing a diagram with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease help me:\n1. Understand the components and their relationships\n2. Identify the type of diagram\n3. Explain any technical concepts shown\n4. Suggest improvements or clarifications\n"
if caption:
base_prompt += f"Specific request: {caption}"

return base_prompt
base += f"\nSpecific request: {caption}"
return base

def _create_ui_prompt(self, caption: Optional[str]) -> str:
def _create_ui_prompt(self, caption: Optional[str], image_path: Path) -> str:
"""Create prompt for UI mockup analysis"""
base_prompt = """I'm sharing a UI mockup with you. Please analyze:

1. The layout and visual hierarchy
2. User experience considerations
3. Accessibility aspects
4. Implementation suggestions
5. Any potential improvements

"""
base = f"I'm sharing a UI mockup with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease analyze:\n1. The layout and visual hierarchy\n2. UX improvements\n3. Accessibility concerns\n"
if caption:
base_prompt += f"Specific request: {caption}"

return base_prompt
base += f"\nSpecific request: {caption}"
return base

def _create_generic_prompt(self, caption: Optional[str]) -> str:
def _create_generic_prompt(self, caption: Optional[str], image_path: Path) -> str:
"""Create generic image analysis prompt"""
base_prompt = """I'm sharing an image with you. Please analyze it and provide relevant insights.

"""
base = f"I'm sharing an image with you. Please read the image file at this path using your Read tool: {image_path}\n\nPlease analyze and describe what you see.\n"
if caption:
base_prompt += f"Context: {caption}"

return base_prompt
base += f"\nSpecific request: {caption}"
return base

def supports_format(self, filename: str) -> bool:
"""Check if image format is supported"""
Expand Down
6 changes: 6 additions & 0 deletions src/bot/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1276,6 +1276,7 @@ async def agentic_photo(
await chat.send_action("typing")
progress_msg = await update.message.reply_text("Working...")

processed_image = None
try:
photo = update.message.photo[-1]
processed_image = await image_handler.process_image(
Expand All @@ -1297,6 +1298,11 @@ async def agentic_photo(
logger.error(
"Claude photo processing failed", error=str(e), user_id=user_id
)
finally:
if processed_image is not None:
temp_path = processed_image.metadata.get("temp_path")
if temp_path:
Path(temp_path).unlink(missing_ok=True)

async def agentic_voice(
self, update: Update, context: ContextTypes.DEFAULT_TYPE
Expand Down