small-thinking · yxjiang · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+*.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
@@ -170,4 +170,7 @@ scripts
 knowledge/facts/**
 knowledge/tools/**
 use_cases
-polymind/example_tools/**
+polymind/example_tools/**
+
+# Large files are tracked with Git LFS (see .gitattributes)
+# PNG files are automatically tracked by LFS
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,38 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.0.61] - 2025-08-06
+
+### Added
+- **Image Processing Utilities**: Added new utility functions to `polymind.core.utils`:
+  - `encode_image_to_base64()`: Converts local image files to base64 strings for API calls
+  - `is_valid_image_url()`: Validates image URLs for common image hosting services
+- **Image Understanding Tool**: Added `ImageUnderstandingTool` to the media-gen example module:
+  - Uses OpenAI's GPT-4o-mini API for image analysis
+  - Supports both local image files and image URLs
+  - Configurable prompts with default values
+  - Optional JSON response format for structured output
+  - Comprehensive error handling and metadata tracking
+
+### Changed
+- Updated version to 0.0.61
+- Enhanced media-gen example with integration tests and documentation
+
+### Technical Details
+- Added type annotations and comprehensive error handling for image utilities
+- Implemented automatic base64 encoding for local images
+- Added support for multiple image formats and hosting services
+- Created separate integration test structure for real API testing
+
+## [0.0.60] - Previous version
+
+### Added
+- Initial release with core Polymind framework
+- Multi-agent collaboration capabilities
+- Task management system
+- Tool integration framework
+- Knowledge retrieval and indexing tools 
diff --git a/examples/media-gen/README.md b/examples/media-gen/README.md
@@ -49,9 +49,14 @@ media-gen/
 │   ├── __init__.py                # Package exports
 │   ├── media_gen_tool_base.py     # Abstract base classes
 │   ├── dummy_image_gen.py         # Dummy image generation tool
-│   └── dummy_video_gen.py         # Dummy video generation tool
+│   ├── dummy_video_gen.py         # Dummy video generation tool
+│   └── image_understanding_tool.py # Image understanding tool
 ├── tests/                         # Test suite
-│   └── test_dummy_media_gen.py    # Comprehensive tests
+│   ├── test_dummy_media_gen.py    # Comprehensive tests
+│   └── test_image_understanding.py # Image understanding tests
+├── integration_tests/             # Integration tests (manual)
+│   ├── test_image_understanding.py # Real API integration test
+│   └── README.md                  # Integration test documentation
 
 ├── env.example                    # Environment variables template
 ├── setup.py                       # Unified setup script (all platforms)
@@ -115,6 +120,77 @@ class MyVideoGen(VideoGenerationTool):
 - `resolution` (str, optional, default: "480p"): Video resolution
 - `image` (str, optional): URI of starting image
 
+### Image Understanding Tool
+
+```python
+from tools import ImageUnderstandingTool
+
+# Initialize the tool
+image_tool = ImageUnderstandingTool()
+
+# Analyze an image from URL
+result = image_tool.run({
+    "prompt": "What objects do you see in this image?",
+    "images": ["https://example.com/image.jpg"],
+    "return_json": False
+})
+
+# Analyze with JSON response
+result = image_tool.run({
+    "prompt": "Analyze this image and return JSON with 'objects' and 'mood' fields",
+    "images": ["path/to/local/image.jpg"],
+    "return_json": True,
+    "max_tokens": 500
+})
+
+# Generate image generation prompt
+result = image_tool.run({
+    "prompt": "Analyze this image and create a detailed image generation prompt that could be used to recreate this image. Include specific details about objects, characters, setting, lighting, mood, style, composition, colors, and textures.",
+    "images": ["path/to/local/image.jpg"],
+    "max_tokens": 600
+})
+```
+
+**Parameters:**
+- `prompt` (str, optional, default: "What's in this image?"): Analysis prompt
+- `images` (List[str], required): List of image paths or URLs
+- `return_json` (bool, optional, default: False): Return JSON response
+- `max_tokens` (int, optional, default: 1000): Maximum response tokens
+
+
+**Features:**
+- Supports both local image files and image URLs
+- Automatic base64 encoding for local images
+- Optional JSON response format for structured output
+- Configurable token limits
+- Comprehensive error handling
+
+
+## Testing
+
+### Unit Tests
+Run the standard unit tests:
+```bash
+cd tests && python test_dummy_media_gen.py
+python test_image_understanding.py
+```
+
+### Integration Tests
+For real API testing with actual images (run from the `media-gen` directory):
+```bash
+python integration_tests/test_image_understanding.py
+```
+
+**Features:**
+- Generates image generation prompt for test image
+- Uses local test image (`test_image.png`)
+- Comprehensive error handling
+
+**Note:** Integration tests require:
+- Valid OpenAI API key in `.env` file
+- Internet connection
+- Test image file in `integration_tests/` folder
+
 ## Usage
 
 ```python
@@ -132,10 +208,17 @@ print(f"Replicate API Token: {'✓ Available' if os.getenv('REPLICATE_API_TOKEN'
 # Initialize tools
 image_gen = DummyImageGen()
 video_gen = DummyVideoGen()
+image_understanding = ImageUnderstandingTool()
 
 # Generate media
 image_result = image_gen.run({"prompt": "A beautiful sunset"})
 video_result = video_gen.run({"prompt": "A butterfly emerging"})
+
+# Analyze images
+analysis_result = image_understanding.run({
+    "prompt": "What's in this image?",
+    "images": ["https://example.com/image.jpg"]
+})
 ```
 
 ## Running Examples
@@ -144,9 +227,13 @@ video_result = video_gen.run({"prompt": "A butterfly emerging"})
 # Activate virtual environment
 source venv/bin/activate  # On Windows: venv\Scripts\activate.bat
 
-# Run example
+# Run examples
 python example_usage.py
 
 # Run tests
 cd tests && python test_dummy_media_gen.py
+python test_image_understanding.py
+
+# Run integration tests from the media-gen directory (requires API key)
+cd .. && python integration_tests/test_image_understanding.py
 ``` 
diff --git a/examples/media-gen/integration_tests/README.md b/examples/media-gen/integration_tests/README.md
@@ -0,0 +1,36 @@
+# Integration Tests
+
+This folder contains integration tests that require real API calls and external resources.
+
+## Image Understanding Integration Test
+
+### Prerequisites
+- OpenAI API key in `.env` file
+- Internet connection
+- Test image file (`test_image.png`)
+
+### Running the Test
+
+```bash
+# From the media-gen directory
+python integration_tests/test_image_understanding.py
+```
+
+### What it does
+- Loads the test image (`test_image.png`)
+- Generates an image generation prompt that could be used to recreate the image
+- Calls OpenAI's GPT-4o-mini API
+- Displays the generated prompt and metadata
+
+### Expected Output
+The test will show:
+- ✅ Confirmation that test image and API key are found
+- 📝 The prompt being used
+- 📋 The generated image generation prompt
+- 📊 Token usage metadata
+
+### Notes
+- This test is not run automatically with unit tests
+- It requires a valid OpenAI API key
+- It makes real API calls and may incur costs
+- The test image should be placed in this folder 
diff --git a/examples/media-gen/integration_tests/test_image.png b/examples/media-gen/integration_tests/test_image.png
diff --git a/examples/media-gen/integration_tests/test_image_understanding.py b/examples/media-gen/integration_tests/test_image_understanding.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+Simple integration test for ImageUnderstandingTool.
+
+This test uses the test_image.png file to generate an image generation prompt.
+It requires OPENAI_API_KEY in the .env file.
+
+Run with: python integration_tests/test_image_understanding.py
+
+Exits with status 1 when a prerequisite is missing or the analysis fails,
+and with status 0 on success.
+"""
+
+import os
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Add the parent directory to the path to import the tool
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from tools.image_understanding_tool import ImageUnderstandingTool
+
+
+def main():
+    """Run the integration test and report the outcome as an exit status.
+
+    Loads environment variables, verifies that OPENAI_API_KEY and
+    test_image.png are available, then asks ImageUnderstandingTool to
+    produce an image generation prompt for the image and prints the
+    result together with the token-usage metadata.
+
+    Returns:
+        int: 0 on success; 1 if the API key or the test image is missing,
+        or if the tool call (or reading its result) raises.
+    """
+    print("=== Image Understanding Integration Test ===")
+    print("This test will analyze test_image.png and generate an image generation prompt.\n")
+
+    # Load environment variables
+    load_dotenv()
+
+    # Check for OpenAI API key
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("❌ Error: OPENAI_API_KEY not found in environment.")
+        print("Please set it in your .env file.")
+        return 1
+
+    # Get test image path
+    test_image_path = Path(__file__).parent / "test_image.png"
+
+    if not test_image_path.exists():
+        print(f"❌ Error: Test image not found at {test_image_path}")
+        return 1
+
+    print(f"✅ Found test image: {test_image_path}")
+    # Only confirm the key is set; never echo any part of the secret.
+    print("✅ API key available")
+    print()
+
+    # Initialize tool
+    tool = ImageUnderstandingTool()
+
+    # Define the prompt for image generation
+    prompt = (
+        "Analyze this image and create an image generation prompt that could be used "
+        "to recreate this image as similar as possible. "
+        "Include specific details about objects, characters, setting, "
+        "lighting, mood, image style, composition, colors, and textures. The prompt should be "
+        "less than 100 words."
+    )
+
+    print("📝 Prompt: Generate image generation prompt")
+    print("🔄 Calling OpenAI API...")
+
+    try:
+        result = tool.run({
+            "prompt": prompt,
+            "images": [str(test_image_path)],
+            "return_json": False,
+            "max_tokens": 600
+        })
+
+        print("✅ Analysis completed successfully!")
+        print("\n📋 Generated Image Generation Prompt:")
+        print("-" * 50)
+        print(result["analysis"])
+        print("-" * 50)
+
+        print("\n📊 Metadata:")
+        print(f"  Model: {result['metadata']['model']}")
+        print(f"  Total tokens: {result['metadata']['tokens_used']}")
+        print(f"  Prompt tokens: {result['metadata']['prompt_tokens']}")
+        print(f"  Completion tokens: {result['metadata']['completion_tokens']}")
+
+    except Exception as e:
+        # Top-level boundary of a manual test script: report any failure and signal it.
+        print(f"❌ Error during analysis: {e}")
+        return 1
+
+    print("\n✅ Integration test completed successfully!")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/media-gen/tools/__init__.py b/examples/media-gen/tools/__init__.py
@@ -9,10 +9,12 @@
 from .media_gen_tool_base import ImageGenerationTool, VideoGenerationTool
 from .dummy_image_gen import DummyImageGen
 from .dummy_video_gen import DummyVideoGen
+from .image_understanding_tool import ImageUnderstandingTool
 
 __all__ = [
     "ImageGenerationTool",
     "VideoGenerationTool", 
     "DummyImageGen",
-    "DummyVideoGen"
+    "DummyVideoGen",
+    "ImageUnderstandingTool"
 ]