Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
Empty file.
48 changes: 48 additions & 0 deletions plugins/moondream/example/moondream_vlm_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import asyncio
from uuid import uuid4
from dotenv import load_dotenv

from vision_agents.core import User, Agent
from vision_agents.plugins import deepgram, getstream, vogent, elevenlabs, moondream
from vision_agents.core.events import CallSessionParticipantJoinedEvent
import os

# Load variables from a local .env file (if present) into the process
# environment, so os.getenv lookups such as MOONDREAM_API_KEY can be
# satisfied from it.
load_dotenv()

async def start_agent() -> None:
    """Run the Moondream VLM demo agent on a freshly created call.

    Builds an agent whose LLM is ``moondream.CloudVLM`` (with ElevenLabs TTS,
    Deepgram STT and Vogent turn detection over Stream's edge), joins a new
    call, opens the demo UI, and returns once ``agent.finish()`` completes.
    """
    # The same id identifies the agent's user and filters its own join event.
    agent_id = "agent"

    # Moondream vision-language model; api_key is None if the variable is unset.
    vlm = moondream.CloudVLM(
        api_key=os.getenv("MOONDREAM_API_KEY"),
        conf_threshold=0.3,
    )

    # Build the remaining components in the same order the agent receives them.
    # Low latency edge; clients for React, iOS, Android, RN, Flutter etc.
    edge = getstream.Edge()
    agent_user = User(name="My happy AI friend", id=agent_id)
    tts = elevenlabs.TTS()
    stt = deepgram.STT()
    turn_detection = vogent.TurnDetection()

    agent = Agent(
        edge=edge,
        agent_user=agent_user,
        llm=vlm,
        tts=tts,
        stt=stt,
        turn_detection=turn_detection,
    )

    # Create a call with a random id
    call_id = str(uuid4())
    call = agent.edge.client.video.call("default", call_id)

    @agent.events.subscribe
    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
        # Ignore the agent's own join; only react to other participants.
        if event.participant.user.id == agent_id:
            return
        # NOTE(review): the 2s pause presumably gives the participant's video
        # time to start flowing before the prompt -- confirm.
        await asyncio.sleep(2)
        await agent.simple_response("Describe what you currently see")

    # Have the agent join the call/room
    with await agent.join(call):
        # Open the demo UI
        await agent.edge.open_demo(call)
        # Run till the call ends
        await agent.finish()

if __name__ == "__main__":
    # Script entry point: run the async demo to completion.
    # setup_telemetry()  (left commented out in this example)
    demo = start_agent()
    asyncio.run(demo)
22 changes: 22 additions & 0 deletions plugins/moondream/example/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Package metadata for the standalone Moondream example project.
[project]
name = "moondream-example"
version = "0.1.0"
description = "Example using Moondream Detect and VLM with Vision Agents"
requires-python = ">=3.10"
# Core framework, the plugins the example script imports, and python-dotenv
# for loading API keys from a .env file.
dependencies = [
"vision-agents",
"vision-agents-plugins-moondream",
"vision-agents-plugins-getstream",
"vision-agents-plugins-deepgram",
"vision-agents-plugins-elevenlabs",
"vision-agents-plugins-vogent",
"python-dotenv",
]

# Tell uv to resolve the vision-agents packages from the enclosing workspace
# instead of a package index. python-dotenv has no entry here, so it is
# resolved normally.
[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-moondream = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
vision-agents-plugins-deepgram = { workspace = true }
vision-agents-plugins-elevenlabs = { workspace = true }
vision-agents-plugins-vogent = { workspace = true }
105 changes: 105 additions & 0 deletions plugins/moondream/tests/test_moondream_vlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""
Tests for the Moondream CloudVLM plugin.

Integration tests require the MOONDREAM_API_KEY environment variable:

export MOONDREAM_API_KEY="your-key-here"
uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m integration -v

To run only unit tests (no API key needed):

uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m "not integration" -v
"""
import os
from pathlib import Path
from typing import AsyncIterator, Iterator

import av
import pytest
from PIL import Image

from vision_agents.plugins.moondream import CloudVLM


@pytest.fixture(scope="session")
def golf_image(assets_dir) -> Iterator[Image.Image]:
    """Yield the golf swing test image converted to RGB.

    Reads ``golf_swing.png`` from ``assets_dir`` (a fixture defined outside
    this module). The source file stays open while the fixture is alive, and
    session scope means the image is loaded once per test session.
    """
    image_path = Path(assets_dir).joinpath("golf_swing.png")
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")
        yield rgb_image


@pytest.fixture
def golf_frame(golf_image: Image.Image) -> av.VideoFrame:
    """Build an ``av.VideoFrame`` from the shared golf image for each test."""
    frame = av.VideoFrame.from_image(golf_image)
    return frame


@pytest.fixture
async def vlm_vqa() -> AsyncIterator[CloudVLM]:
    """Yield a CloudVLM configured for VQA mode and close it afterwards.

    Skips the requesting test when MOONDREAM_API_KEY is not set.

    Yields:
        A ``CloudVLM`` created with ``mode="vqa"``.
    """
    api_key = os.getenv("MOONDREAM_API_KEY")
    if not api_key:
        pytest.skip("MOONDREAM_API_KEY not set")

    vlm = CloudVLM(api_key=api_key, mode="vqa")
    try:
        yield vlm
    finally:
        # Runs on teardown whether the test passed or failed.
        vlm.close()


@pytest.fixture
async def vlm_caption() -> AsyncIterator[CloudVLM]:
    """Yield a CloudVLM configured for caption mode and close it afterwards.

    Skips the requesting test when MOONDREAM_API_KEY is not set.

    Yields:
        A ``CloudVLM`` created with ``mode="caption"``.
    """
    api_key = os.getenv("MOONDREAM_API_KEY")
    if not api_key:
        pytest.skip("MOONDREAM_API_KEY not set")

    vlm = CloudVLM(api_key=api_key, mode="caption")
    try:
        yield vlm
    finally:
        # Runs on teardown whether the test passed or failed.
        vlm.close()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
async def test_vqa_mode(golf_frame: av.VideoFrame, vlm_vqa: CloudVLM):
    """Ask the VQA-mode model about the golf image and expect "golf" in the answer."""
    # Inject the frame directly rather than feeding a video track; per the
    # original note, _process_frame reads it from _latest_frame.
    vlm_vqa._latest_frame = golf_frame

    answer = await vlm_vqa.simple_response("What sport is being played in this image?")

    # A non-empty, error-free response must come back.
    assert answer is not None
    text = answer.text
    assert text is not None
    assert len(text) > 0
    assert answer.exception is None

    # The picture shows a golf swing, so the answer should mention golf.
    assert "golf" in text.lower()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
async def test_caption_mode(golf_frame: av.VideoFrame, vlm_caption: CloudVLM):
    """Request a caption for the golf image and expect a non-blank description."""
    # Inject the frame directly rather than feeding a video track; per the
    # original note, _process_frame reads it from _latest_frame.
    vlm_caption._latest_frame = golf_frame

    # Caption mode needs no prompt text, so an empty string is passed.
    caption = await vlm_caption.simple_response("")

    # A non-empty, error-free response must come back.
    assert caption is not None
    text = caption.text
    assert text is not None
    assert len(text) > 0
    assert caption.exception is None

    # The caption must contain more than whitespace.
    assert len(text.strip()) > 0

5 changes: 5 additions & 0 deletions plugins/moondream/vision_agents/plugins/moondream/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@
MoondreamVideoTrack,
)

from .moondream_vlm import (
CloudVLM,
)

# Extend this package's search path via pkgutil so that same-named package
# directories elsewhere on sys.path can contribute modules to it.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)

# Public names exported by `from <package> import *`, kept in alphabetical
# order. CloudVLM is re-exported from .moondream_vlm.
__all__ = [
"CloudDetectionProcessor",
"CloudVLM",
"LocalDetectionProcessor",
"MoondreamVideoTrack",
]
Expand Down
Loading
Loading