Update Assistant2.py
jjmlovesgit authored Aug 20, 2024
1 parent 7efe242 commit cbf5053
Showing 1 changed file with 42 additions and 17 deletions.
59 changes: 42 additions & 17 deletions Assistant2.py
@@ -1,21 +1,35 @@
#tested 8/19/24 see requirements.txt
import os
import asyncio
from typing import Annotated
from dotenv import load_dotenv
import selectors

from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli, llm

from livekit import agents, rtc
from livekit.agents import JobContext, JobRequest, WorkerOptions, cli, tokenize, tts
from livekit.agents.llm import (
    ChatContext,
    ChatImage,
    ChatMessage,
    ChatRole,
)
from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant
from livekit.plugins import deepgram, openai, silero, azure


class MyPolicy(asyncio.DefaultEventLoopPolicy):
    def new_event_loop(self):
        selector = selectors.SelectSelector()
        return asyncio.SelectorEventLoop(selector)

asyncio.set_event_loop_policy(MyPolicy())
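# Note: pinning a SelectorEventLoop here is presumably a workaround for the default
# ProactorEventLoop on Windows; the commit itself does not state the reason.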

# Load environment variables from .env file
load_dotenv()
# you don't need all of these but I was trying the various combos -- include what you need...
def reload_env_variables():
    livekit_url = os.environ.get('LIVEKIT_URL')
    livekit_api_key = os.environ.get('LIVEKIT_API_KEY')
@@ -25,6 +39,7 @@ def reload_env_variables():
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    speech_region = os.environ.get('AZURE_SPEECH_REGION')
    speech_key = os.environ.get('AZURE_SPEECH_KEY')

    return {
        'livekit_url': livekit_url,
        'livekit_api_key': livekit_api_key,
@@ -41,11 +56,15 @@ def print_env_variables(env_vars):
print(f"{key}: {value[:2]}...{value[-2:]}")
else:
print(f"{key}: None")
env_vars = reload_env_variables()
print_env_variables(env_vars)
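# For reference, a minimal .env sketch with only the keys visible in this diff
# (the collapsed hunks read additional keys that are not shown here):
#   LIVEKIT_URL=...
#   LIVEKIT_API_KEY=...
#   OPENAI_API_KEY=...
#   AZURE_SPEECH_REGION=...
#   AZURE_SPEECH_KEY=...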


class AssistantFunction(agents.llm.FunctionContext):
"""This agent is called by the assistant for image collection / processing."""
"""This class is used to define functions that will be called by the assistant."""
    @agents.llm.ai_callable(
        desc=(
            "Use this function whenever asked to evaluate an image, video, or the webcam feed being shared with you"
        )
    )
    async def image(
@@ -55,11 +74,12 @@ async def image(
            agents.llm.TypeInfo(desc="The user message that triggered this function"),
        ],
    ):
print(f"The message input triggering vision agent capabilities: {user_msg}")
print(f"Message triggering vision capabilities: {user_msg}")
context = AssistantContext.get_current()
context.store_metadata("user_msg", user_msg)
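        # The stored message is read back later in on_function_calls_finished,
        # which answers the user with the latest video frame attached.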

async def get_video_track(room: rtc.Room):
"""Logic to track and to process images."""
"""Get the first video track from the room. We'll use this track to process images."""
video_track = asyncio.Future[rtc.RemoteVideoTrack]()
for _, participant in room.participants.items():
for _, track_publication in participant.tracks.items():
@@ -70,14 +90,15 @@ async def get_video_track(room: rtc.Room):
print(f"Using video track {track_publication.track.sid}")
break
return await video_track

async def entrypoint(ctx: JobContext):
print(f"Room name: {ctx.room.name}")
chat_context = ChatContext(
messages=[
ChatMessage(
role=ChatRole.SYSTEM,
text=(
"Your name is Ava and you are an assitant with voice and vision capabilities."
"Your name is Andrew. You are an assitant who is slightly sarcastic and witty. You have both voice and vision capabilities."
"Respond with clear and concise answers with minimal jargon. Do not use emojis."
),
)
@@ -87,19 +108,19 @@ async def entrypoint(ctx: JobContext):
    latest_image: rtc.VideoFrame | None = None
    assistant = VoiceAssistant(
        vad=silero.VAD(),  # We'll use Silero's Voice Activity Detector (VAD)
        stt=deepgram.STT(),  # We'll use Deepgram's Speech To Text (STT)
        #stt=azure.STT(),  # Alternative: Azure Speech Studio Speech To Text (STT)
        llm=gpt,  # We'll use GPT-4
        #tts=azure.TTS(voice="en-US-AndrewMultilingualNeural"),
        tts=azure.TTS(voice="en-US-AvaMultilingualNeural"),
        #tts=elevenlabs.TTS(),  # Text-to-Speech  #tts=openai_tts,
        #tts=openai_tts,  # We'll use OpenAI's Text To Speech (TTS)
        fnc_ctx=AssistantFunction(),
        chat_ctx=chat_context,
    )
    chat = rtc.ChatManager(ctx.room)

    async def _answer(text: str, use_image: bool = False):
        """
        Answer the user's message with the given text and optionally the latest
        image captured from the video track.
        """
        args = {}
@@ -122,13 +143,17 @@ def on_function_calls_finished(ctx: AssistantContext):
            asyncio.create_task(_answer(user_msg, use_image=True))

    assistant.start(ctx.room)
    await asyncio.sleep(3)
await assistant.say("Hey there, how can I help you today?", allow_interruptions=True)
await assistant.say("Hey, how can I help you today?", allow_interruptions=True)
    while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
        video_track = await get_video_track(ctx.room)
        async for event in rtc.VideoStream(video_track):
            # We'll continually grab the latest image from the video track
            # and store it in a variable.
            latest_image = event.frame

async def request_fnc(req: JobRequest) -> None:
    await req.accept(entrypoint)

if __name__ == "__main__":
    cli.run_app(WorkerOptions(request_fnc))
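# To try this locally (assuming the standard livekit-agents CLI entrypoints):
#   python Assistant2.py dev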
