common/requirements.txt (3 changes: 2 additions & 1 deletion)
@@ -110,7 +110,8 @@ packaging==24.2
 pandas==2.2.3
 #pathtools==0.1.2
 pillow==11.2.1
-PyMuPDF==1.26.4
+PyMuPDF==1.26.6
+pymupdf4llm==0.2.0
 platformdirs==4.3.8
 pluggy==1.6.0
 prometheus_client==0.22.1
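Note: the diff pins a new pymupdf4llm dependency but does not show where it is used. As a rough sketch only, assuming the library is pulled in for PDF-to-markdown extraction (its primary API), a call would look like the following; "example.pdf" is a hypothetical input, not a file from this repository.

import pymupdf4llm

# Sketch only: convert a PDF to markdown text; actual usage is not part of this diff
md_text = pymupdf4llm.to_markdown("example.pdf")
print(md_text[:200])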
common/utils/image_data_extractor.py (165 changes: 31 additions & 134 deletions)
@@ -1,165 +1,62 @@
 import base64
 import io
 import logging
-import os
-import uuid
-import hashlib
-from pathlib import Path
 from langchain_core.messages import HumanMessage, SystemMessage

 from common.config import get_multimodal_service

 logger = logging.getLogger(__name__)


-def describe_image_with_llm(image_input):
+def describe_image_with_llm(file_path):
     """
-    Send image (pixmap or PIL image) to LLM vision model and return description.
-    Uses multimodal_service from config if available, otherwise falls back to completion_service.
-    Currently supports: OpenAI, Azure OpenAI, Google GenAI, and Google VertexAI
+    Read image file and convert to base64 to send to LLM.
     """
     try:
+        from PIL import Image as PILImage

         client = get_multimodal_service()
         if not client:
             return "[Image: Failed to create multimodal LLM client]"


+        # Read image and convert to base64
+        pil_image = PILImage.open(file_path)
         buffer = io.BytesIO()
-        # Convert to RGB if needed for better compatibility
-        if image_input.mode != 'RGB':
-            image_input = image_input.convert('RGB')
-        image_input.save(buffer, format="JPEG", quality=95)
-        b64_img = base64.b64encode(buffer.getvalue()).decode("utf-8")
+        if pil_image.mode != 'RGB':
+            pil_image = pil_image.convert('RGB')
+        pil_image.save(buffer, format="JPEG", quality=95)
+        image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

         # Build messages (system + human)
         messages = [
-            SystemMessage(
-                content="You are a helpful assistant that describes images concisely for document analysis."
-            ),
-            HumanMessage(
-                content=[
-                    {
-                        "type": "text",
-                        "text": (
-                            "Please describe what you see in this image and "
-                            "if the image has scanned text then extract all the text. "
-                            "if the image has any logo, icon, or branding element, try to describe it with text. "
-                            "Focus on any text, diagrams, charts, or other visual elements."
-                            "If the image is purely a logo, icon, or branding element, start your response with 'LOGO:' or 'ICON:'."
-                        ),
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"},
-                    },
-                ]
-            ),
+            SystemMessage(
+                content="You are a helpful assistant that describes images concisely for document analysis."
+            ),
+            HumanMessage(
+                content=[
+                    {
+                        "type": "text",
+                        "text": (
+                            "Please describe what you see in this image and "
+                            "if the image has scanned text then extract all the text. "
+                            "If the image has any graph, chart, table, or other diagram, describe it. "
+                        ),
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+                    },
+                ],
+            ),
         ]

-        # Get response from LangChain LLM client
+        # Access the underlying LangChain client
         langchain_client = client.llm
         response = langchain_client.invoke(messages)

-        return response.content if hasattr(response, 'content') else str(response)
+        return response.content if hasattr(response, "content") else str(response)

     except Exception as e:
         logger.error(f"Failed to describe image with LLM: {str(e)}")
         return "[Image: Error processing image description]"


-def save_image_and_get_markdown(image_input, context_info="", graphname=None):
-    """
-    Save image locally to static/images/ folder and return markdown reference with description.
-    LEGACY/OLD APPROACH: Used for backward compatibility with JSONL-based loading.
-    Images are saved as files and served via /ui/images/ endpoint with img:// protocol.
-    For NEW direct loading approach, images are stored in Image vertex as base64
-    and served via /ui/image_vertex/ endpoint with image:// protocol.
-    Args:
-        image_input: PIL Image object
-        context_info: Optional context (e.g., "page 3 of invoice.pdf")
-        graphname: Graph name to organize images by graph (optional)
-    Returns:
-        dict with:
-        - 'markdown': Markdown string with img:// reference
-        - 'image_id': Unique identifier for the saved image
-        - 'image_path': Path where image was saved to static/images/
-    """
-    try:
-        # FIRST: Get description from LLM to check if it's a logo
-        description = describe_image_with_llm(image_input)
-
-        # Check if the image is a logo, icon, or decorative element BEFORE saving
-        # These should be filtered out as they're not content-relevant
-        description_lower = description.lower()
-        logo_indicators = ['logo', 'icon', 'branding', 'watermark', 'trademark', 'company logo', 'brand logo']
-
-        if any(indicator in description_lower for indicator in logo_indicators):
-            logger.info(f"Detected logo/icon in image, skipping: {description[:100]}")
-            return None
-
-        # If not a logo, proceed with saving the image
-        # Generate unique image ID using hash of image content
-        buffer = io.BytesIO()
-        if image_input.mode != 'RGB':
-            image_input = image_input.convert('RGB')
-        image_input.save(buffer, format="JPEG", quality=95)
-        image_bytes = buffer.getvalue()
-
-        # Create hash-based ID (deterministic for same image)
-        image_hash = hashlib.sha256(image_bytes).hexdigest()[:16]
-        image_id = f"{image_hash}.jpg"
-
-        # Save image to local storage directory organized by graphname
-        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-        # If graphname is provided, organize images by graph
-        if graphname:
-            images_dir = os.path.join(project_root, "static", "images", graphname)
-            # Include graphname in the image reference for URL construction
-            image_reference = f"{graphname}/{image_id}"
-        else:
-            images_dir = os.path.join(project_root, "static", "images")
-            image_reference = image_id
-
-        os.makedirs(images_dir, exist_ok=True)
-
-        image_path = os.path.join(images_dir, image_id)
-
-        # Save image file (skip if already exists with same hash)
-        if not os.path.exists(image_path):
-            with open(image_path, 'wb') as f:
-                f.write(image_bytes)
-            logger.info(f"Saved content image to: {image_path}")
-        else:
-            logger.debug(f"Image already exists: {image_path}")
-
-        # Generate markdown with custom img:// protocol (will be replaced later)
-        # Format: ![description](img://graphname/image_id) or ![description](img://image_id)
-        markdown = f"![{description}](img://{image_reference})"
-
-        logger.info(f"Created image reference: {image_reference} with description")
-
-        return {
-            'markdown': markdown,
-            'image_id': image_reference,
-            'image_path': image_path,
-            'description': description
-        }
-
-    except Exception as e:
-        logger.error(f"Failed to save image and generate markdown: {str(e)}")
-        # Fallback to text description only
-        fallback_desc = f"[Image: {context_info} - processing failed]"
-        return {
-            'markdown': fallback_desc,
-            'image_id': None,
-            'image_path': None,
-            'description': fallback_desc
-        }


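For orientation, a minimal usage sketch of the rewritten helper: describe_image_with_llm now takes a path to an image file on disk instead of an in-memory PIL image or pixmap. The file path below is hypothetical.

from common.utils.image_data_extractor import describe_image_with_llm

# Sketch: describe an image file extracted from a document (hypothetical path)
description = describe_image_with_llm("static/images/sample_page.jpg")
print(description)  # model description, or an "[Image: ...]" error string on failure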