Update Assistant2.py
jjmlovesgit authored Aug 20, 2024
1 parent 7efe242 commit cbf5053
Showing 1 changed file with 42 additions and 17 deletions.
59 changes: 42 additions & 17 deletions Assistant2.py
@@ -1,21 +1,35 @@
#tested 8/19/24 see requirements.txt
import os
import asyncio
from typing import Annotated
from dotenv import load_dotenv
import selectors

from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli, llm

from livekit import agents, rtc
from livekit.agents import JobContext, JobRequest, WorkerOptions, cli, tokenize, tts
from livekit.agents.llm import (
    ChatContext,
    ChatImage,
    ChatMessage,
    ChatRole,
)
from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant
from livekit.plugins import deepgram, openai, silero, azure


class MyPolicy(asyncio.DefaultEventLoopPolicy):
    def new_event_loop(self):
        selector = selectors.SelectSelector()
        return asyncio.SelectorEventLoop(selector)

asyncio.set_event_loop_policy(MyPolicy())
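# Note: pinning a SelectorEventLoop here is presumably a workaround for the default
# ProactorEventLoop on Windows; the commit itself does not state the reason.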

# Load environment variables from .env file
load_dotenv()
# you don't need all of these but I was trying the various combos -- include what you need...
def reload_env_variables():
    livekit_url = os.environ.get('LIVEKIT_URL')
    livekit_api_key = os.environ.get('LIVEKIT_API_KEY')
@@ -25,6 +39,7 @@ def reload_env_variables():
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    speech_region = os.environ.get('AZURE_SPEECH_REGION')
    speech_key = os.environ.get('AZURE_SPEECH_KEY')

    return {
        'livekit_url': livekit_url,
        'livekit_api_key': livekit_api_key,
@@ -41,11 +56,15 @@ def print_env_variables(env_vars):
print(f"{key}: {value[:2]}...{value[-2:]}")
else:
print(f"{key}: None")
env_vars = reload_env_variables()
print_env_variables(env_vars)
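# For reference, a minimal .env sketch with only the keys visible in this diff
# (the collapsed hunks read additional keys that are not shown here):
#   LIVEKIT_URL=...
#   LIVEKIT_API_KEY=...
#   OPENAI_API_KEY=...
#   AZURE_SPEECH_REGION=...
#   AZURE_SPEECH_KEY=...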


class AssistantFunction(agents.llm.FunctionContext):
"""This agent is called by the assistant for image collection / processing."""
"""This class is used to define functions that will be called by the assistant."""
    @agents.llm.ai_callable(
        desc=(
            "Use this function whenever asked to evaluate an image, video, or the webcam feed being shared with you"
        )
    )
    async def image(
@@ -55,11 +74,12 @@ async def image(
            agents.llm.TypeInfo(desc="The user message that triggered this function"),
        ],
    ):
print(f"The message input triggering vision agent capabilities: {user_msg}")
print(f"Message triggering vision capabilities: {user_msg}")
context = AssistantContext.get_current()
context.store_metadata("user_msg", user_msg)
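        # The stored message is read back later in on_function_calls_finished,
        # which answers the user with the latest video frame attached.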

async def get_video_track(room: rtc.Room):
"""Logic to track and to process images."""
"""Get the first video track from the room. We'll use this track to process images."""
video_track = asyncio.Future[rtc.RemoteVideoTrack]()
for _, participant in room.participants.items():
for _, track_publication in participant.tracks.items():
@@ -70,14 +90,15 @@ async def get_video_track(room: rtc.Room):
print(f"Using video track {track_publication.track.sid}")
break
return await video_track

async def entrypoint(ctx: JobContext):
print(f"Room name: {ctx.room.name}")
chat_context = ChatContext(
messages=[
ChatMessage(
role=ChatRole.SYSTEM,
text=(
"Your name is Ava and you are an assitant with voice and vision capabilities."
"Your name is Andrew. You are an assitant who is slightly sarcastic and witty. You have both voice and vision capabilities."
"Respond with clear and concise answers with minimal jargon. Do not use emojis."
),
)
@@ -87,19 +108,19 @@ async def entrypoint(ctx: JobContext):
    latest_image: rtc.VideoFrame | None = None
    assistant = VoiceAssistant(
        vad=silero.VAD(),  # We'll use Silero's Voice Activity Detector (VAD)
        stt=deepgram.STT(),  # We'll use Deepgram's Speech To Text (STT)
        #stt=azure.STT(),  # Alternative: Azure Speech Studio Speech To Text (STT)
        llm=gpt,  # We'll use GPT-4
        #tts=azure.TTS(voice="en-US-AndrewMultilingualNeural"),
        tts=azure.TTS(voice="en-US-AvaMultilingualNeural"),
        #tts=elevenlabs.TTS(),  # Text-to-Speech  #tts=openai_tts,
        #tts=openai_tts,  # We'll use OpenAI's Text To Speech (TTS)
        fnc_ctx=AssistantFunction(),
        chat_ctx=chat_context,
    )
    chat = rtc.ChatManager(ctx.room)

    async def _answer(text: str, use_image: bool = False):
        """
        Answer the user's message with the given text and optionally the latest
        image captured from the video track.
        """
        args = {}
@@ -122,13 +143,17 @@ def on_function_calls_finished(ctx: AssistantContext):
            asyncio.create_task(_answer(user_msg, use_image=True))

    assistant.start(ctx.room)
    await asyncio.sleep(3)
await assistant.say("Hey there, how can I help you today?", allow_interruptions=True)
await assistant.say("Hey, how can I help you today?", allow_interruptions=True)
    while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
        video_track = await get_video_track(ctx.room)
        async for event in rtc.VideoStream(video_track):
            # We'll continually grab the latest image from the video track
            # and store it in a variable.
            latest_image = event.frame

async def request_fnc(req: JobRequest) -> None:
    await req.accept(entrypoint)

if __name__ == "__main__":
    cli.run_app(WorkerOptions(request_fnc))
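# To try this locally (assuming the standard livekit-agents CLI entrypoints):
#   python Assistant2.py dev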
