diff --git a/examples/media-gen/README.md b/examples/media-gen/README.md index fdf549c..f2cde22 100644 --- a/examples/media-gen/README.md +++ b/examples/media-gen/README.md @@ -1,339 +1,205 @@ -# Media Generation Tools +# Media Generation Pipeline -A collection of abstract base classes and concrete implementations for media generation tools that can be integrated into Polymind agents. This example demonstrates how to create image and video generation tools with a consistent API. +A command-line tool for regenerating images using AI-powered image understanding and generation. -## Summary +## Overview -This package provides: -- Abstract base classes for image and video generation tools -- Dummy implementations for testing and development -- Consistent parameter specifications across all tools -- Easy integration with Polymind agents +The media regeneration pipeline analyzes an original image and generates a new image based on user interests and preferences. It uses a two-step process: -## Setup +1. **Image Understanding**: Analyzes the original image using AI vision capabilities +2. **Image Generation**: Creates a new image based on the analysis and user preferences + +## Quick Start ### Prerequisites -- Python 3.10+ -- Polymind framework (installed as a third-party library) -### Quick Setup +1. **Environment Setup**: Create a `.env` file with your API keys: + ```bash + # Copy the example file + cp env.example .env + + # Edit .env with your actual API keys +OPENAI_API_KEY=your_openai_api_key_here +REPLICATE_API_TOKEN=your_replicate_api_token_here + +# Note: You only need the API key for the generator you plan to use + ``` + +2. 
**Dependencies**: Install required packages: + ```bash + pip install -r requirements.txt + ``` + +### Basic Usage -**Linux (recommended):** ```bash -python setup.py +python media_gen_pipeline.py --image-path --user-interests ``` -**Manual Setup:** +**Example:** ```bash -python3 -m venv venv -source venv/bin/activate -pip install -r requirements.txt -cp env.example .env +python media_gen_pipeline.py --image-path my_image.jpg --user-interests "make it more vibrant and modern" ``` -### Environment Configuration +### Command Line Options -The setup script automatically creates a `.env` file from the template. You'll need to edit it with your actual API keys: +- `--image-path`: Path to the original image file (supports ~ for home directory) (required) +- `--user-interests`: User preferences for regeneration (required) +- `--output-folder`: Output folder for generated image (default: `~/Downloads`) +- `--aspect-ratio`: Aspect ratio for generated image (default: `1:1`) +- `--output-format`: Output format for generated image (default: `png`) +- `--generator`: Image generation service to use (choices: `openai`, `replicate`, default: `openai`) +- `--debug`: Enable debug output to see prompts used in each step -**Required API keys:** -- `OPENAI_API_KEY`: For DALL-E image generation -- `REPLICATE_API_TOKEN`: For various AI models +**Full Example:** +```bash +python media_gen_pipeline.py \ + --image-path ~/Pictures/my_image.jpg \ + --user-interests "make it more artistic with vibrant colors" \ + --output-folder ~/Desktop/generated \ + --aspect-ratio 16:9 \ + --output-format jpg \ + --generator replicate \ + --debug +``` -**Note:** The `.env` file is automatically ignored by git to keep your keys secure. 
+## Examples -## File Structure +### Basic Regeneration +```bash +# Regenerate with simple preferences (saves to ~/Downloads by default) +python media_gen_pipeline.py --image-path photo.jpg --user-interests "make it more vibrant" +``` +### Using Replicate Generator +```bash +# Use Replicate for image generation +python media_gen_pipeline.py --image-path landscape.jpg --user-interests "convert to watercolor style" --generator replicate ``` -media-gen/ -├── tools/ # All tool implementations -│ ├── __init__.py # Package exports -│ ├── media_gen_tool_base.py # Abstract base classes -│ ├── dummy_image_gen.py # Dummy image generation tool -│ ├── openai_image_gen.py # OpenAI image generation tool -│ ├── replicate_image_gen.py # Replicate image generation tool -│ ├── dummy_video_gen.py # Dummy video generation tool -│ └── image_understanding_tool.py # Image understanding tool -├── tests/ # Test suite -│ ├── test_dummy_media_gen.py # Comprehensive tests -│ ├── test_openai_image_gen.py # OpenAI image generation tests -│ ├── test_replicate_image_gen.py # Replicate image generation tests -│ └── test_image_understanding.py # Image understanding tests -├── integration_tests/ # Integration tests (manual) -│ ├── test_image_understanding.py # Real API integration test -│ └── README.md # Integration test documentation - -├── env.example # Environment variables template -├── setup.py # Unified setup script (all platforms) -├── example_usage.py # Usage examples -├── requirements.txt # Dependencies -├── __init__.py # Main package exports -└── README.md # This file + +### Using Home Directory Paths +```bash +# Use ~ for home directory paths +python media_gen_pipeline.py --image-path ~/Pictures/landscape.jpg --user-interests "convert to watercolor style" ``` -## Creating New Tools - -### Image Generation Tool - -```python -from tools import ImageGenerationTool - -class MyImageGen(ImageGenerationTool): - def run(self, input: dict) -> dict: - prompt = input.get("prompt", "") - aspect_ratio 
= input.get("aspect_ratio", "4:3") - output_format = input.get("output_format", "jpg") - output_folder = input.get("output_folder", "/tmp") - - # Your image generation logic here - # ... - - return { - "image_path": "/path/to/generated/image.jpg", - "generation_info": {"model": "my-model"} - } +### Custom Output Location +```bash +# Save to custom location +python media_gen_pipeline.py --image-path portrait.jpg --user-interests "make it modern" --output-folder ~/Desktop ``` -**Parameters:** -- `prompt` (str, required): Text description -- `aspect_ratio` (str, optional, default: "4:3"): Image aspect ratio -- `output_format` (str, optional, default: "jpg"): Output format -- `output_folder` (str, optional, default: "~/Downloads"): Folder path where to save the generated image - -### OpenAI Image Generation Tool - -```python -from tools import OpenAIImageGen - -# Initialize the tool -image_gen = OpenAIImageGen() - -# Basic usage -result = image_gen.run({ - "prompt": "A gray tabby cat hugging an otter with an orange scarf", - "output_folder": "./generated_images" -}) - -# Advanced usage with custom parameters -result = image_gen.run({ - "prompt": "A futuristic cityscape at sunset with flying cars", - "size": "1024x1536", - "quality": "high", - "output_format": "png", - "compression": 90, - "background": "opaque", - "output_folder": "./generated_images" -}) +### Artistic Transformation +```bash +# Transform to artistic style +python media_gen_pipeline.py --image-path landscape.jpg --user-interests "convert to watercolor painting style" ``` -**Parameters:** -- `prompt` (str, required): Text description of the desired image -- `output_folder` (str, optional, default: "~/Downloads"): Folder path where to save the generated image -- `size` (str, optional, default: "1024x1024"): Image dimensions -- `quality` (str, optional, default: "low"): Rendering quality (low, medium, high) -- `output_format` (str, optional, default: "png"): Output format -- `compression` (int, optional, 
default: 80): Compression level 0-100% -- `background` (str, optional, default: "opaque"): Transparent or opaque - -**Features:** -- Uses OpenAI's gpt-4o-mini model with image generation capabilities -- Supports various image parameters (size, quality, format, compression, background) -- Automatic directory creation for output paths -- Comprehensive error handling -- Integrates seamlessly with Polymind framework - -### Replicate Image Generation Tool - -```python -from tools import ReplicateImageGen - -# Initialize the tool with default model (WAN 2.2) -image_gen = ReplicateImageGen() - -# Basic usage -result = image_gen.run({ - "prompt": "A cinematic cat portrait with golden hour lighting", - "output_folder": "./generated_images" -}) - -# Advanced usage with custom parameters -result = image_gen.run({ - "prompt": "A cinematic, photorealistic medium shot of a cat", - "seed": 246764, - "aspect_ratio": "4:3", - "model": "stability-ai/sdxl" -}) +### Style Enhancement +```bash +# Enhance with specific style +python media_gen_pipeline.py --image-path portrait.jpg --user-interests "make it more modern and professional" ``` -**Parameters:** -- `prompt` (str, required): Text description of the desired image -- `output_folder` (str, optional, default: "~/Downloads"): Folder path where to save the generated image -- `seed` (int, optional): Random seed for reproducible results -- `aspect_ratio` (str, optional, default: "4:3"): Image aspect ratio -- `output_format` (str, optional, default: "jpeg"): Output format -- `model` (str, optional): Replicate model to use (overrides default) - -**Features:** -- Uses Replicate's API with various image generation models -- Supports models like WAN 2.2, Stable Diffusion XL, and others -- Reproducible results with seed parameter -- Automatic directory creation for output paths -- Comprehensive error handling -- Integrates seamlessly with Polymind framework - -### Video Generation Tool - -```python -from tools import VideoGenerationTool - 
-class MyVideoGen(VideoGenerationTool): - def run(self, input: dict) -> dict: - prompt = input.get("prompt", "") - num_frames = input.get("num_frames", 81) - resolution = input.get("resolution", "480p") - image = input.get("image", None) - - # Your video generation logic here - # ... - - return { - "video_path": "/path/to/generated/video.mp4", - "generation_info": {"model": "my-model"} - } +## Output + +The tool outputs comprehensive information about the regeneration process with clear formatting: + ``` +============================================================ +🎨 MEDIA REGENERATION COMPLETE +============================================================ +📁 Image stored at: /Users/username/Downloads/openai_generated_image_20241201_143022.png + 📂 Relative to Downloads: openai_generated_image_20241201_143022.png + +🔍 Image Analysis: + This image shows a beautiful landscape with mountains in the background, featuring vibrant colors and dramatic lighting. The scene includes rolling hills, a clear blue sky, and natural elements that create a serene atmosphere. + +🎯 Final Generation Prompt: + This image shows a beautiful landscape with mountains in the background, featuring vibrant colors and dramatic lighting. The scene includes rolling hills, a clear blue sky, and natural elements that create a serene atmosphere. 
-**Parameters:** -- `prompt` (str, required): Text description -- `num_frames` (int, optional, default: 81): Number of frames -- `resolution` (str, optional, default: "480p"): Video resolution -- `image` (str, optional): URI of starting image - -### Image Understanding Tool - -```python -from tools import ImageUnderstandingTool - -# Initialize the tool -image_tool = ImageUnderstandingTool() - -# Analyze an image from URL -result = image_tool.run({ - "prompt": "What objects do you see in this image?", - "images": ["https://example.com/image.jpg"], - "return_json": False -}) - -# Analyze with JSON response -result = image_tool.run({ - "prompt": "Analyze this image and return JSON with 'objects' and 'mood' fields", - "images": ["path/to/local/image.jpg"], - "return_json": True, - "max_tokens": 500 -}) - -# Generate image generation prompt -result = image_tool.run({ - "prompt": "Analyze this image and create a detailed image generation prompt that could be used to recreate this image. Include specific details about objects, characters, setting, lighting, mood, style, composition, colors, and textures.", - "images": ["path/to/local/image.jpg"], - "max_tokens": 600 -}) + User preferences: make it more vibrant and modern + +📊 Generation Info: + model: gpt-4o-mini + tokens_used: 150 + generation_time: 2.3s +============================================================ ``` -**Parameters:** -- `prompt` (str, optional, default: "What's in this image?"): Analysis prompt -- `images` (List[str], required): List of image paths or URLs -- `return_json` (bool, optional, default: False): Return JSON response -- `max_tokens` (int, optional, default: 1000): Maximum response tokens +**Output includes:** +- **📁 Image location**: Full path where the image is stored +- **📂 Relative path**: Simplified path relative to Downloads folder (if applicable) +- **🔍 Image analysis**: The analysis of the original image +- **🎯 Final generation prompt**: Combined analysis and user interests used for 
generation (from metadata) +- **📊 Generation metadata**: Additional information about the generation process (model, tokens, timing, etc.) +## Path Support -**Features:** -- Supports both local image files and image URLs -- Automatic base64 encoding for local images -- Optional JSON response format for structured output -- Configurable token limits -- Comprehensive error handling +The tool supports path expansion for convenience: +- **Home directory**: Use `~` to reference your home directory +- **Examples**: + - `~/Pictures/photo.jpg` → `/Users/username/Pictures/photo.jpg` + - `~/Downloads` → `/Users/username/Downloads` + - `~/.config` → `/Users/username/.config` +## Error Handling -## Testing +The tool includes comprehensive error handling: +- Validates that the input image exists (with path expansion) +- Provides clear error messages for missing files or API issues +- Creates output directories automatically +- Supports path expansion for both input and output paths -### Unit Tests -Run the standard unit tests: -```bash -cd tests && python test_dummy_media_gen.py -python test_image_understanding.py -``` +## Architecture -### Integration Tests -For real API testing with actual images: -```bash -python integration_tests/test_image_understanding.py -``` +The pipeline uses a modular design with two main components: -**Features:** -- Generates image generation prompt for test image -- Uses local test image (`test_image.png`) -- Comprehensive error handling - -**Note:** Integration tests require: -- Valid OpenAI API key in `.env` file -- Internet connection -- Test image file in `integration_tests/` folder - -## Usage - -```python -from tools import DummyImageGen, OpenAIImageGen, ReplicateImageGen, DummyVideoGen -from dotenv import load_dotenv -import os - -# Load environment variables -load_dotenv() - -# Check configuration status -print(f"OpenAI API Key: {'✓ Available' if os.getenv('OPENAI_API_KEY') else '✗ Missing'}") -print(f"Replicate API Token: {'✓ Available' if 
os.getenv('REPLICATE_API_TOKEN') else '✗ Missing'}") - -# Initialize tools -image_gen = DummyImageGen() -openai_image_gen = OpenAIImageGen() -replicate_image_gen = ReplicateImageGen() -video_gen = DummyVideoGen() -image_understanding = ImageUnderstandingTool() - -# Generate media -image_result = image_gen.run({"prompt": "A beautiful sunset"}) -openai_result = openai_image_gen.run({ - "prompt": "A beautiful sunset over mountains", - "output_folder": "./generated_images" -}) -replicate_result = replicate_image_gen.run({ - "prompt": "A cinematic cat portrait", - "seed": 12345, - "aspect_ratio": "4:3" -}) -video_result = video_gen.run({"prompt": "A butterfly emerging"}) - -# Analyze images -analysis_result = image_understanding.run({ - "prompt": "What's in this image?", - "images": ["https://example.com/image.jpg"] -}) -``` +### Core Pipeline (`pipeline.py`) +- Generic pipeline infrastructure +- Configurable input/output mappings +- Extensible for future media types -## Running Examples +### Media Regeneration (`media_gen_pipeline.py`) +- Specialized for image regeneration +- Command-line interface +- Simple two-parameter API +- Path expansion support -```bash -# Activate virtual environment -source venv/bin/activate # On Windows: venv\Scripts\activate.bat +## Future Extensions + +The modular design allows easy extension to other media types: +- **Image to Video**: Add video generation step +- **Video Understanding**: Add video analysis capabilities +- **Multi-modal**: Support for text, audio, and other media + +## File Structure + +``` +media-gen/ +├── media_gen_pipeline.py # Main command-line tool +├── pipeline.py # Core pipeline infrastructure +├── .env # API keys (create from env.example) +├── env.example # Environment variables template +├── tools/ # Media generation tools +│ ├── image_understanding_tool.py +│ ├── openai_image_gen.py +│ ├── dummy_image_gen.py +│ └── media_gen_tool_base.py +├── integration_tests/ # Test files and examples +└── ~/Downloads/ # 
Default output location +``` -# Run examples -python example_usage.py +## Testing -# Run tests -cd tests && python test_dummy_media_gen.py -python test_openai_image_gen.py -python test_replicate_image_gen.py -python test_image_understanding.py +Test with the provided test image: -# Run integration tests (requires API key) -python integration_tests/test_image_understanding.py -``` \ No newline at end of file +**Using OpenAI (default):** +```bash +python media_gen_pipeline.py --image-path integration_tests/test_image.png --user-interests "enhance the visual appeal" +``` + +**Using Replicate:** +```bash +python media_gen_pipeline.py --image-path integration_tests/test_image.png --user-interests "enhance the visual appeal" --generator replicate +``` \ No newline at end of file diff --git a/examples/media-gen/example_usage.py b/examples/media-gen/example_usage.py deleted file mode 100644 index c5a0e14..0000000 --- a/examples/media-gen/example_usage.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -Example usage of the media generation tools. - -This script demonstrates how to use the DummyImageGen and DummyVideoGen tools -with the new parameter specifications and environment variable configuration. 
-""" - -import os - -from dotenv import load_dotenv -from tools import DummyImageGen, DummyVideoGen - - -def check_config_status(): - """Check and display configuration status.""" - print("Configuration Status:") - print("=" * 40) - - # Load environment variables - load_dotenv() - - # Check API keys - openai_key = os.getenv("OPENAI_API_KEY") - replicate_token = os.getenv("REPLICATE_API_TOKEN") - - print(f"OpenAI API Key: {'✓ Available' if openai_key else '✗ Missing'}") - print(f"Replicate API Token: " - f"{'✓ Available' if replicate_token else '✗ Missing'}") - - # Show configuration - default_image = os.getenv('DEFAULT_IMAGE_MODEL', 'dall-e-3') - default_video = os.getenv('DEFAULT_VIDEO_MODEL', 'stable-video-diffusion') - log_level = os.getenv('LOG_LEVEL', 'INFO') - - print(f"\nDefault Image Model: {default_image}") - print(f"Default Video Model: {default_video}") - print(f"Log Level: {log_level}") - - if not openai_key or not replicate_token: - print("\nMissing API keys. Please check your .env file.") - - -def main(): - """Demonstrate the media generation tools.""" - print("Media Generation Tools Example") - print("=" * 40) - - # Check configuration status - print("\nConfiguration Status:") - check_config_status() - - # Initialize tools - image_gen = DummyImageGen() - video_gen = DummyVideoGen() - - # Example 1: Generate an image with default parameters - print("\n1. Generating image with defaults...") - image_result = image_gen.run({ - "prompt": "A serene mountain landscape at dawn" - }) - print(f" Image path: {image_result['image_path']}") - print(f" Info: {image_result['generation_info']}") - - # Example 2: Generate an image with custom parameters - print("\n2. 
Generating image with custom parameters...") - image_result = image_gen.run({ - "prompt": "A futuristic city skyline", - "aspect_ratio": "16:9", - "output_format": "png" - }) - print(f" Image path: {image_result['image_path']}") - print(f" Info: {image_result['generation_info']}") - - # Example 3: Generate a video with default parameters - print("\n3. Generating video with defaults...") - video_result = video_gen.run({ - "prompt": "A butterfly emerging from a cocoon" - }) - print(f" Video path: {video_result['video_path']}") - print(f" Info: {video_result['generation_info']}") - - # Example 4: Generate a video with custom parameters - print("\n4. Generating video with custom parameters...") - video_result = video_gen.run({ - "prompt": "A cinematic flythrough of a space station", - "num_frames": 120, - "resolution": "720p", - "image": "https://example.com/space_station.jpg" - }) - print(f" Video path: {video_result['video_path']}") - print(f" Info: {video_result['generation_info']}") - - print("\n" + "=" * 40) - print("Example completed successfully!") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/media-gen/media_gen_pipeline.py b/examples/media-gen/media_gen_pipeline.py new file mode 100644 index 0000000..24a0c18 --- /dev/null +++ b/examples/media-gen/media_gen_pipeline.py @@ -0,0 +1,496 @@ +""" +Media regeneration pipeline. + +Command-line tool for regenerating images: +1. Analyze original image using image understanding +2. 
Generate new image based on analysis and user preferences + +Usage: + python media_gen_pipeline.py --image-path + --user-interests + +Example: + python media_gen_pipeline.py --image-path + ./examples/media-gen/integration_tests/test_image.png + --user-interests "Users like cute cats and capybara" +""" + +import argparse +import json +import logging +import os +import sys +from typing import Any, Dict + +from pathlib import Path + +# Load environment variables from .env file if it exists +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + # dotenv not installed, continue without it + pass + +from pipeline import MediaGenerationPipeline, PipelineStep, PipelineStepExecutor +from tools.image_understanding_tool import ImageUnderstandingTool +from tools.openai_image_gen import OpenAIImageGen +from tools.replicate_image_gen import ReplicateImageGen + + +def expand_path(path: str) -> str: + """Expand path with ~ to user's home directory.""" + return os.path.expanduser(path) + + +class MediaRegenerationPipeline(MediaGenerationPipeline): + """ + Pipeline for regenerating media (currently images). + + Workflow: + 1. Analyze original image using image understanding + 2. Generate new image based on analysis and user preferences + """ + + system_prompt: str = """ + Please analyze the content of the image, and we want to create a new + image generation prompt, combining the original image and the user + preferences. + + For the generated image generation prompt, follow the below requirement: + 1. The image generation prompt should be as close as possible to the + original image, but organically combine the user preferences. + 2. Not just replicate the object and scene in the original image, but + also understanding the image style, lighting, and composition. + 3. The generated prompt should not be long (<100 words) and can be + easily understood by the image generation model. + 4. 
The output should be a JSON object with the following fields: + { + "image_generation_prompt": "The image generation prompt" + } + The user preferences are as below: + """ + + def __init__( + self, + image_understanding_tool: ImageUnderstandingTool, + image_generation_tool: Any, # BaseTool type + name: str = "media_regeneration", + debug: bool = False + ): + """ + Initialize the media regeneration pipeline. + + Args: + image_understanding_tool: Tool for analyzing images + image_generation_tool: Tool for generating images + name: Pipeline name + debug: Enable debug output + """ + super().__init__(name) + self.debug = debug + + # Add image understanding step + self.add_step( + PipelineStep( + name="image_understanding", + tool=image_understanding_tool, + input_mapping={ + "original_image": "images", + "system_prompt": "prompt", + "user_preferences": "user_preferences" + }, + output_mapping={ + "analysis": "image_analysis" + }, + transform_input=self._combine_prompt_and_preferences, + transform_output=self._extract_analysis + ) + ) + + # Add image generation step + self.add_step( + PipelineStep( + name="image_generation", + tool=image_generation_tool, + input_mapping={ + "image_analysis": "prompt", + "output_folder": "output_folder", + "aspect_ratio": "aspect_ratio", + "output_format": "output_format" + }, + output_mapping={ + "image_path": "image_path", + "generation_info": "generation_info" + }, + transform_input=self._use_analysis_as_prompt + ) + ) + + def regenerate( + self, + image_path: str, + user_interests: str, + output_folder: str = "~/Downloads", + aspect_ratio: str = "1:1", + output_format: str = "png" + ) -> Dict[str, Any]: + """ + Regenerate an image based on the original. 
+ + Args: + image_path: Path to the original image + user_interests: User preferences for regeneration + output_folder: Folder to save generated image + (default: ~/Downloads) + aspect_ratio: Aspect ratio for generated image + output_format: Output format for generated image + + Returns: + Dictionary containing: + - generated_image_path: Path to the generated image + - image_analysis: Analysis from image understanding + - generation_metadata: Additional generation info + """ + # Prepare input data + input_data = { + "original_image": [image_path], + "system_prompt": self.system_prompt, + "user_preferences": user_interests, + "output_folder": output_folder, + "aspect_ratio": aspect_ratio, + "output_format": output_format, + "return_json": True # Request JSON output from image understanding + } + + # Run pipeline + result = self.run(input_data) + + return result + + def _combine_prompt_and_preferences( + self, tool_input: Dict[str, Any] + ) -> Dict[str, Any]: + """Combine system prompt with user preferences.""" + system_prompt = tool_input.get("prompt", "") + user_preferences = tool_input.get("user_preferences", "") + + if user_preferences: + combined_prompt = ( + f"{system_prompt}\n\n\t {user_preferences}" + ) + else: + combined_prompt = system_prompt + + # Debug: Print the prompt being sent to image understanding + if self.debug: + print("\n🔍 DEBUG - Image Understanding Prompt:") + print(f" {combined_prompt}") + + return {**tool_input, "prompt": combined_prompt} + + def _extract_analysis( + self, tool_output: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Extract analysis text and image generation prompt from JSON output. + + The image understanding tool returns JSON with the structure: + { + "image_generation_prompt": "The image generation prompt" + } + + Handles responses that may be wrapped in markdown code blocks. 
+ """ + analysis = tool_output.get("analysis", "") + + # Debug: Print the raw analysis received from image understanding + if self.debug: + print("\n🔍 DEBUG - Raw Image Understanding Analysis:") + print(f" {analysis}") + + # Clean the analysis text - remove markdown code block markers + cleaned_analysis = analysis.strip() + if cleaned_analysis.startswith("```json"): + cleaned_analysis = cleaned_analysis[7:] # Remove ```json + if cleaned_analysis.startswith("```"): + cleaned_analysis = cleaned_analysis[3:] # Remove ``` + if cleaned_analysis.endswith("```"): + cleaned_analysis = cleaned_analysis[:-3] # Remove trailing ``` + cleaned_analysis = cleaned_analysis.strip() + + # Try to parse as JSON and extract the image_generation_prompt + try: + analysis_json = json.loads(cleaned_analysis) + if (isinstance(analysis_json, dict) and + "image_generation_prompt" in analysis_json): + extracted_prompt = analysis_json["image_generation_prompt"] + + # Debug: Print the extracted prompt + if self.debug: + print("\n🔍 DEBUG - Extracted Image Generation Prompt:") + print(f" {extracted_prompt}") + + return {"analysis": extracted_prompt} + else: + # JSON parsed but doesn't have expected structure + if self.debug: + print("\n⚠️ DEBUG - JSON parsed but missing " + "image_generation_prompt field") + return {"analysis": cleaned_analysis} + except json.JSONDecodeError: + # Not valid JSON, use the cleaned analysis + if self.debug: + print("\n⚠️ DEBUG - Analysis is not valid JSON, " + "using cleaned text") + return {"analysis": cleaned_analysis} + + def _use_analysis_as_prompt( + self, tool_input: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Use the image analysis (which already incorporates user preferences) + as the final generation prompt. + + The image understanding tool generates a prompt that combines + the original image content with user preferences, so we use + that directly without further modification. 
+ """ + # Debug: Print all tool input keys + if self.debug: + print("\n🔍 DEBUG - All Tool Input Keys:") + for key, value in tool_input.items(): + print(f" {key}: {value}") + + # The image_analysis has been mapped to "prompt" by the input mapping + final_prompt = tool_input.get("prompt", "") + + # Debug: Print what we received + if self.debug: + print("\n🔍 DEBUG - Image Generation Input:") + print(f" prompt (from image_analysis): {final_prompt}") + + # Use the prompt directly as the final prompt + # The image understanding tool already incorporates user preferences + + # Debug: Print the prompt being sent to image generation + if self.debug: + print("\n🎨 DEBUG - Image Generation Prompt:") + print(f" {final_prompt}") + + return {**tool_input, "prompt": final_prompt} + + def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute the pipeline with the given input. + + Args: + input_data: Initial input data for the pipeline + + Returns: + Final output from the last step + """ + # Expand paths + expanded_image_path = expand_path(input_data["original_image"][0]) + expanded_output_folder = expand_path(input_data["output_folder"]) + + # Create output directory if it doesn't exist + Path(expanded_output_folder).mkdir(parents=True, exist_ok=True) + + # Update input data with expanded paths + input_data["original_image"] = [expanded_image_path] + input_data["output_folder"] = expanded_output_folder + + # Debug: Show initial input + if self.debug: + print("\n🔍 DEBUG - Initial Pipeline Input:") + for key, value in input_data.items(): + print(f" {key}: {value}") + + # Run the parent pipeline with custom execution + self.logger.info( + f"Starting pipeline execution with {len(self.steps)} steps" + ) + + current_input = input_data.copy() + + for i, step in enumerate(self.steps): + self.logger.info( + f"Executing step {i+1}/{len(self.steps)}: {step.name}" + ) + + # Debug: Show input to this step + if self.debug: + print(f"\n🔍 DEBUG - Input to step {i+1} 
({step.name}):") + for key, value in current_input.items(): + print(f" {key}: {value}") + + # Execute the step + executor = PipelineStepExecutor(step) + step_output = executor.execute(current_input) + + # Debug: Show output from this step + if self.debug: + print(f"\n🔍 DEBUG - Output from step {i+1} ({step.name}):") + for key, value in step_output.items(): + print(f" {key}: {value}") + + # Merge step output with current input for next step + current_input.update(step_output) + + self.logger.debug(f"Step {step.name} output: {step_output}") + + self.logger.info("Pipeline execution completed") + + # Debug: Show final result + if self.debug: + print("\n🔍 DEBUG - Final Pipeline Result Keys:") + for key, value in current_input.items(): + if key == "image_analysis": + print(f" {key}: {str(value)[:100]}...") + else: + print(f" {key}: {value}") + + return current_input + + +def setup_logging(): + """Setup logging.""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + +def main(): + """Main function for command-line execution.""" + parser = argparse.ArgumentParser( + description="Regenerate an image based on user interests" + ) + parser.add_argument( + "--image-path", + required=True, + help="Path to the original image file (supports ~ for home directory)" + ) + parser.add_argument( + "--user-interests", + required=True, + help="User interests/preferences for regeneration" + ) + parser.add_argument( + "--output-folder", + default="~/Downloads", + help="Output folder for generated image (default: ~/Downloads)" + ) + parser.add_argument( + "--aspect-ratio", + default="1:1", + help="Aspect ratio for generated image (default: 1:1)" + ) + parser.add_argument( + "--output-format", + default="png", + help="Output format for generated image (default: png)" + ) + parser.add_argument( + "--generator", + choices=["replicate", "openai"], + default="replicate", + help="Image generation service to use (default: replicate)" 
+ ) + parser.add_argument( + "--debug", + action="store_true", + help="Enable debug output to see prompts used in each step" + ) + + args = parser.parse_args() + + # Expand paths + expanded_image_path = expand_path(args.image_path) + + # Validate input image exists + if not Path(expanded_image_path).exists(): + print(f"Error: Image file not found: {expanded_image_path}") + sys.exit(1) + + # Setup logging + setup_logging() + + try: + # Initialize tools + image_understanding = ImageUnderstandingTool() + + # Choose image generation tool + if args.generator == "replicate": + image_gen = ReplicateImageGen() + else: + image_gen = OpenAIImageGen() + + # Create pipeline + pipeline = MediaRegenerationPipeline( + image_understanding_tool=image_understanding, + image_generation_tool=image_gen, + debug=args.debug + ) + + # Regenerate image + result = pipeline.regenerate( + image_path=expanded_image_path, + user_interests=args.user_interests, + output_folder=args.output_folder, + aspect_ratio=args.aspect_ratio, + output_format=args.output_format + ) + + # Output results + print("\n" + "="*60) + print("🎨 MEDIA REGENERATION COMPLETE") + print("="*60) + + # Get image path from the result + generated_path = result.get('image_path', '') + if generated_path: + print(f"📁 Image stored at: {generated_path}") + # Show relative path if it's in Downloads + downloads_path = os.path.expanduser("~/Downloads") + if generated_path.startswith(downloads_path): + relative_path = os.path.relpath(generated_path, downloads_path) + print(f" 📂 Relative to Downloads: {relative_path}") + else: + print("❌ No image path returned") + + print("\n🔍 Image Analysis:") + analysis = result.get('image_analysis', '') + if analysis: + print(f" {analysis}") + else: + print(" No analysis available") + + print("\n🎯 Final Generation Prompt:") + # Get the final prompt from generation info + generation_info = result.get('generation_info', {}) + final_prompt = generation_info.get('prompt', '') + if final_prompt: + print(f" 
{final_prompt}") + else: + print(" No generation prompt available") + + if generation_info: + print("\n📊 Generation Info:") + # Remove the prompt from metadata to avoid duplication + filtered_info = { + k: v for k, v in generation_info.items() if k != 'prompt' + } + for key, value in filtered_info.items(): + print(f" {key}: {value}") + + print("="*60) + + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/media-gen/pipeline.py b/examples/media-gen/pipeline.py new file mode 100644 index 0000000..105d84f --- /dev/null +++ b/examples/media-gen/pipeline.py @@ -0,0 +1,169 @@ +""" +Modular pipeline system for media generation. + +This module provides a flexible pipeline architecture for chaining media +generation tools together. It supports static pipelines with configurable +input/output mappings and allows easy addition of new pipeline steps. +""" + +import logging +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +from polymind.core.tool import BaseTool + + +@dataclass +class PipelineStep: + """ + Represents a single step in a media generation pipeline. + + Attributes: + name: Unique identifier for the step + tool: The tool to execute in this step + input_mapping: Mapping from pipeline input to tool input + output_mapping: Mapping from tool output to pipeline output + transform_input: Optional function to transform input before passing to tool + transform_output: Optional function to transform output after tool execution + """ + name: str + tool: BaseTool + input_mapping: Dict[str, str] + output_mapping: Dict[str, str] + transform_input: Optional[ + Callable[[Dict[str, Any]], Dict[str, Any]] + ] = None + transform_output: Optional[ + Callable[[Dict[str, Any]], Dict[str, Any]] + ] = None + + +class PipelineStepExecutor: + """ + Executes a single pipeline step with input/output transformations. 
+ """ + + def __init__(self, step: PipelineStep): + """Initialize the step executor.""" + self.step = step + self.logger = logging.getLogger(f"PipelineStepExecutor.{step.name}") + + def execute(self, pipeline_input: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute the pipeline step. + + Args: + pipeline_input: Current pipeline input state + + Returns: + Tool output mapped to pipeline output format + """ + self.logger.info(f"Executing step: {self.step.name}") + + # Map pipeline input to tool input + tool_input = self._map_input(pipeline_input) + + # Apply input transformation if provided + if self.step.transform_input: + tool_input = self.step.transform_input(tool_input) + + self.logger.debug(f"Tool input: {tool_input}") + + # Execute the tool + tool_output = self.step.tool.run(tool_input) + + self.logger.debug(f"Tool output: {tool_output}") + + # Apply output transformation if provided + if self.step.transform_output: + tool_output = self.step.transform_output(tool_output) + + # Map tool output to pipeline output + pipeline_output = self._map_output(tool_output) + + self.logger.info(f"Step {self.step.name} completed successfully") + return pipeline_output + + def _map_input(self, pipeline_input: Dict[str, Any]) -> Dict[str, Any]: + """Map pipeline input to tool input using input_mapping.""" + tool_input = {} + for pipeline_key, tool_key in self.step.input_mapping.items(): + if pipeline_key in pipeline_input: + tool_input[tool_key] = pipeline_input[pipeline_key] + return tool_input + + def _map_output(self, tool_output: Dict[str, Any]) -> Dict[str, Any]: + """Map tool output to pipeline output using output_mapping.""" + pipeline_output = {} + for tool_key, pipeline_key in self.step.output_mapping.items(): + if tool_key in tool_output: + pipeline_output[pipeline_key] = tool_output[tool_key] + return pipeline_output + + +class MediaGenerationPipeline: + """ + A modular pipeline for media generation tasks. 
+ + This pipeline allows chaining multiple tools together with configurable + input/output mappings and transformations. It supports static execution + where each step's output becomes the next step's input. + """ + + def __init__(self, name: str): + """ + Initialize the pipeline. + + Args: + name: Name of the pipeline for logging and identification + """ + self.name = name + self.steps: List[PipelineStep] = [] + self.logger = logging.getLogger(f"MediaGenerationPipeline.{name}") + + def add_step(self, step: PipelineStep) -> 'MediaGenerationPipeline': + """ + Add a step to the pipeline. + + Args: + step: PipelineStep to add + + Returns: + Self for method chaining + """ + self.steps.append(step) + self.logger.info(f"Added step: {step.name}") + return self + + def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute the pipeline with the given input. + + Args: + input_data: Initial input data for the pipeline + + Returns: + Final output from the last step + """ + self.logger.info(f"Starting pipeline execution with {len(self.steps)} steps") + + current_input = input_data.copy() + + for i, step in enumerate(self.steps): + self.logger.info(f"Executing step {i+1}/{len(self.steps)}: {step.name}") + + # Execute the step + executor = PipelineStepExecutor(step) + step_output = executor.execute(current_input) + + # Merge step output with current input for next step + current_input.update(step_output) + + self.logger.debug(f"Step {step.name} output: {step_output}") + + self.logger.info("Pipeline execution completed") + return current_input + + def get_step_names(self) -> List[str]: + """Get the names of all steps in the pipeline.""" + return [step.name for step in self.steps] \ No newline at end of file diff --git a/examples/media-gen/tools/replicate_image_gen.py b/examples/media-gen/tools/replicate_image_gen.py index e92b27e..6b5d22b 100644 --- a/examples/media-gen/tools/replicate_image_gen.py +++ b/examples/media-gen/tools/replicate_image_gen.py @@ -55,6 
+55,7 @@ def run(self, input: dict) -> dict: - seed: Random seed for reproducible results (optional) - aspect_ratio: Image aspect ratio (optional, default: "4:3") - output_format: Output format (optional, default: "jpeg") + - quality: Image quality (optional, default: 80) - model: Replicate model to use (optional, overrides default) Returns: @@ -68,6 +69,7 @@ def run(self, input: dict) -> dict: seed = input.get("seed") aspect_ratio = input.get("aspect_ratio", "4:3") output_format = input.get("output_format", "jpeg") + quality = input.get("quality", 80) model = input.get("model", self._model) # Generate dynamic image name with timestamp to avoid duplication @@ -91,7 +93,8 @@ def run(self, input: dict) -> dict: # Prepare input for Replicate replicate_input = { "prompt": prompt, - "aspect_ratio": aspect_ratio + "aspect_ratio": aspect_ratio, + "quality": quality } # Add seed if provided