2 changes: 1 addition & 1 deletion agents-core/vision_agents/core/agents/agents.py
@@ -5,7 +5,7 @@
 import time
 import uuid
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard, Coroutine
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard
 from uuid import uuid4
 
 import getstream.models
48 changes: 48 additions & 0 deletions plugins/moondream/example/moondream_vlm_example.py
@@ -0,0 +1,48 @@
import asyncio
import os
from uuid import uuid4

from dotenv import load_dotenv

from vision_agents.core import User, Agent
from vision_agents.core.events import CallSessionParticipantJoinedEvent
from vision_agents.plugins import deepgram, getstream, vogent, elevenlabs, moondream

load_dotenv()


async def start_agent() -> None:
    llm = moondream.CloudVLM(
        api_key=os.getenv("MOONDREAM_API_KEY"),
        conf_threshold=0.3,
    )
    # create an agent that runs on Stream's edge and uses the Moondream VLM
    agent = Agent(
        edge=getstream.Edge(),  # low-latency edge; clients for React, iOS, Android, RN, Flutter etc.
        agent_user=User(name="My happy AI friend", id="agent"),
        llm=llm,
        tts=elevenlabs.TTS(),
        stt=deepgram.STT(),
        turn_detection=vogent.TurnDetection(),
    )

    # Create a call
    call = agent.edge.client.video.call("default", str(uuid4()))

    @agent.events.subscribe
    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
        if event.participant.user.id != "agent":
            await asyncio.sleep(2)
            await agent.simple_response("Describe what you currently see")

    # Have the agent join the call/room
    with await agent.join(call):
        # Open the demo UI
        await agent.edge.open_demo(call)
        # run till the call ends
        await agent.finish()


if __name__ == "__main__":
    # setup_telemetry()
    asyncio.run(start_agent())
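To try the example locally (a sketch, assuming a workspace checkout and a .env file providing MOONDREAM_API_KEY plus whatever credentials the getstream, deepgram, and elevenlabs plugins expect):

    uv run python plugins/moondream/example/moondream_vlm_example.py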
22 changes: 22 additions & 0 deletions plugins/moondream/example/pyproject.toml
@@ -0,0 +1,22 @@
[project]
name = "moondream-example"
version = "0.1.0"
description = "Example using Moondream Detect and VLM with Vision Agents"
requires-python = ">=3.10"
dependencies = [
"vision-agents",
"vision-agents-plugins-moondream",
"vision-agents-plugins-getstream",
"vision-agents-plugins-deepgram",
"vision-agents-plugins-elevenlabs",
"vision-agents-plugins-vogent",
"python-dotenv",
]

[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-moondream = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
vision-agents-plugins-deepgram = { workspace = true }
vision-agents-plugins-elevenlabs = { workspace = true }
vision-agents-plugins-vogent = { workspace = true }
105 changes: 105 additions & 0 deletions plugins/moondream/tests/test_moondream_vlm.py
@@ -0,0 +1,105 @@
"""
Tests for the Moondream CloudVLM plugin.

Integration tests require MOONDREAM_API_KEY environment variable:

export MOONDREAM_API_KEY="your-key-here"
uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m integration -v

To run only unit tests (no API key needed):

uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m "not integration" -v
"""
import os
from pathlib import Path
from typing import Iterator

import pytest
import av
from PIL import Image

from vision_agents.plugins.moondream import CloudVLM


@pytest.fixture(scope="session")
def golf_image(assets_dir) -> Iterator[Image.Image]:
"""Load the local golf swing test image from tests/test_assets."""
asset_path = Path(assets_dir) / "golf_swing.png"
with Image.open(asset_path) as img:
yield img.convert("RGB")


@pytest.fixture
def golf_frame(golf_image: Image.Image) -> av.VideoFrame:
"""Create an av.VideoFrame from the golf image."""
return av.VideoFrame.from_image(golf_image)


@pytest.fixture
async def vlm_vqa() -> CloudVLM:
"""Create CloudVLM in VQA mode."""
api_key = os.getenv("MOONDREAM_API_KEY")
if not api_key:
pytest.skip("MOONDREAM_API_KEY not set")

vlm = CloudVLM(api_key=api_key, mode="vqa")
try:
yield vlm
finally:
vlm.close()


@pytest.fixture
async def vlm_caption() -> CloudVLM:
"""Create CloudVLM in caption mode."""
api_key = os.getenv("MOONDREAM_API_KEY")
if not api_key:
pytest.skip("MOONDREAM_API_KEY not set")

vlm = CloudVLM(api_key=api_key, mode="caption")
try:
yield vlm
finally:
vlm.close()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
async def test_vqa_mode(golf_frame: av.VideoFrame, vlm_vqa: CloudVLM):
"""Test VQA mode with a question about the image."""
# Set the latest frame so _process_frame can access it
vlm_vqa._latest_frame = golf_frame

# Ask a question about the image
question = "What sport is being played in this image?"
response = await vlm_vqa.simple_response(question)

# Verify we got a response
assert response is not None
assert response.text is not None
assert len(response.text) > 0
assert response.exception is None

# Verify the response mentions golf (should be in the image)
assert "golf" in response.text.lower()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
async def test_caption_mode(golf_frame: av.VideoFrame, vlm_caption: CloudVLM):
"""Test caption mode to generate a description of the image."""
# Set the latest frame so _process_frame can access it
vlm_caption._latest_frame = golf_frame

# Generate caption (text is not needed for caption mode)
response = await vlm_caption.simple_response("")

# Verify we got a response
assert response is not None
assert response.text is not None
assert len(response.text) > 0
assert response.exception is None

# Verify the caption is descriptive (not empty)
assert len(response.text.strip()) > 0

18 changes: 7 additions & 11 deletions plugins/moondream/vision_agents/plugins/moondream/__init__.py
@@ -2,24 +2,20 @@
 Moondream plugin for vision-agents.
 
 This plugin provides Moondream 3 vision capabilities including object detection,
-visual question answering, counting, and captioning.
+visual question answering, and captioning.
 """
 
-from .moondream_cloud_processor import (
-    CloudDetectionProcessor,
-)
-from .moondream_local_processor import (
-    LocalDetectionProcessor,
-)
-from .moondream_video_track import (
-    MoondreamVideoTrack,
-)
+from vision_agents.plugins.moondream.detection.moondream_cloud_processor import CloudDetectionProcessor
+from vision_agents.plugins.moondream.detection.moondream_local_processor import LocalDetectionProcessor
+from vision_agents.plugins.moondream.detection.moondream_video_track import MoondreamVideoTrack
+from vision_agents.plugins.moondream.vlm.moondream_cloud_vlm import CloudVLM
 
 
 __path__ = __import__("pkgutil").extend_path(__path__, __name__)
 
 __all__ = [
     "CloudDetectionProcessor",
+    "CloudVLM",
     "LocalDetectionProcessor",
     "MoondreamVideoTrack",
 ]
