Merge pull request #16 from jhakulin/jhakulin/azure-support

Jhakulin/azure support
jhakulin · Dec 6, 2024 · 099e140 · 099e140
2 parents 65fc36e + 17f833e
commit 099e140
Show file tree

Hide file tree

Showing 9 changed files with 145 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -94,8 +94,16 @@ audio_capture.start()
    - After that go to generated `dist` folder and install the generated wheel using following command: `pip install --force-reinstall realtime_ai-0.1.0-py3-none-any.whl`
 
 2. **Setup**:
-   - Replace placeholders like `"OPENAI_API_KEY"` in the sample script with real information.
-   - Check system microphone access and settings to align with the project's audio requirements (e.g., 16bit PCM 24kHz mono).
+   - You need to set up the following environment variables in order to use the service.
+
+   - **OPEN_AI**
+     - export OPENAI_API_KEY="Your OpenAI Key"
+     - Check system microphone access and settings to align with the project's audio requirements (e.g., 16bit PCM 24kHz mono).
+
+   - **AZURE_OPEN_AI**
+     - export AZURE_OPENAI_API_KEY="Your Azure OpenAI Key"
+     - export AZURE_OPENAI_ENDPOINT="Your Azure OpenAI endpoint, which must be in the format: `wss://<service-name>.openai.azure.com/openai/realtime`"
+     - export AZURE_OPENAI_API_VERSION="Your Azure OpenAI API version, e.g. 2024-10-01-preview"
 
 3. **Execution**:
    - Run the script via command-line or an IDE:

diff --git a/samples/async/sample_realtime_ai_with_keyword_and_vad.py b/samples/async/sample_realtime_ai_with_keyword_and_vad.py
@@ -31,7 +31,7 @@
 logging.getLogger("utils.audio_playback").setLevel(logging.ERROR)
 logging.getLogger("utils.audio_capture").setLevel(logging.ERROR)
 logging.getLogger("utils.vad").setLevel(logging.ERROR)
-logging.getLogger("realtime_ai").setLevel(logging.INFO)
+logging.getLogger("realtime_ai").setLevel(logging.ERROR)
 
 # Root logger for general logging
 logger = logging.getLogger()
@@ -336,6 +336,28 @@ def get_vad_configuration(use_server_vad=False):
         return None  # Local VAD typically requires no special configuration
 
 
+def get_openai_configuration():
+    # The Azure endpoint shall be in the format: "wss://<service-name>.openai.azure.com/openai/realtime"
+    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+    api_key = None
+    azure_api_version = None
+
+    if not azure_endpoint:
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            logger.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
+            return None, None, None
+    else:
+        api_key = os.getenv("AZURE_OPENAI_API_KEY")
+        azure_api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-01-preview")
+
+        if not api_key or not azure_endpoint or not azure_api_version:
+            logger.error("Please set the AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_VERSION environment variables.")
+            return None, None, None
+
+    return azure_endpoint, api_key, azure_api_version
+
+
 async def main():
     """
     Main function to initialize and run the audio processing and realtime client asynchronously.
@@ -345,27 +367,28 @@ async def main():
     audio_capture = None
 
     try:
-        # Retrieve OpenAI API key from environment variables
-        api_key = os.getenv("OPENAI_API_KEY")
+
+        azure_openai_endpoint, api_key, azure_api_version = get_openai_configuration()
         if not api_key:
-            logger.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
             return
 
         functions = FunctionTool(functions=user_functions)
 
         # Define RealtimeOptions
         options = RealtimeAIOptions(
             api_key=api_key,
-            model="gpt-4o-realtime-preview-2024-10-01",
+            model="gpt-4o-realtime-preview",
             modalities=["audio", "text"],
             instructions="You are a helpful assistant. Respond concisely. You have access to a variety of tools to analyze, translate and review text and code.",
             turn_detection=get_vad_configuration(use_server_vad=False),
             tools=functions.definitions,
             tool_choice="auto",
             temperature=0.8,
             max_output_tokens=None,
-            voice="sage",
-            enable_auto_reconnect=True
+            voice="echo",
+            enable_auto_reconnect=True,
+            azure_openai_endpoint=azure_openai_endpoint,
+            azure_openai_api_version=azure_api_version
         )
 
         # Define AudioStreamOptions

diff --git a/samples/sample_realtime_ai_with_keyword_and_vad.py b/samples/sample_realtime_ai_with_keyword_and_vad.py
@@ -313,6 +313,28 @@ def get_vad_configuration(use_server_vad=False):
         return None  # Local VAD typically requires no special configuration
 
 
+def get_openai_configuration():
+    # The Azure endpoint shall be in the format: "wss://<service-name>.openai.azure.com/openai/realtime"
+    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+    api_key = None
+    azure_api_version = None
+
+    if not azure_endpoint:
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            logger.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
+            return None, None, None
+    else:
+        api_key = os.getenv("AZURE_OPENAI_API_KEY")
+        azure_api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-01-preview")
+
+        if not api_key or not azure_endpoint or not azure_api_version:
+            logger.error("Please set the AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_VERSION environment variables.")
+            return None, None, None
+
+    return azure_endpoint, api_key, azure_api_version
+
+
 def main():
     """
     Main function to initialize and run the audio processing and realtime client asynchronously.
@@ -322,27 +344,28 @@ def main():
     audio_capture = None
 
     try:
-        # Retrieve OpenAI API key from environment variables
-        api_key = os.getenv("OPENAI_API_KEY")
+
+        azure_openai_endpoint, api_key, azure_api_version = get_openai_configuration()
         if not api_key:
-            logger.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
             return
 
         functions = FunctionTool(functions=user_functions)
 
         # Define RealtimeOptions
         options = RealtimeAIOptions(
             api_key=api_key,
-            model="gpt-4o-realtime-preview-2024-10-01",
+            model="gpt-4o-realtime-preview",
             modalities=["audio", "text"],
             instructions="You are a helpful assistant. Respond concisely. You have access to a variety of tools to analyze, translate and review text and code.",
             turn_detection=get_vad_configuration(use_server_vad=False),
             tools=functions.definitions,
             tool_choice="auto",
             temperature=0.8,
             max_output_tokens=None,
-            voice="sage",
+            voice="echo",
             enable_auto_reconnect=True,
+            azure_openai_endpoint=azure_openai_endpoint,
+            azure_openai_api_version=azure_api_version
         )
 
         # Define AudioStreamOptions

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="realtime-ai",
-    version="0.1.4",
+    version="0.1.5",
     description="Python SDK for real-time audio processing with OpenAI's Realtime REST API.",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",

diff --git a/src/realtime_ai/aio/realtime_ai_service_manager.py b/src/realtime_ai/aio/realtime_ai_service_manager.py
@@ -137,6 +137,28 @@ def parse_realtime_event(self, json_object: dict) -> Optional[EventBase]:
                     rate_limits_data = json_object['rate_limits']
                     rate_limits = [RateLimit(**rate) for rate in rate_limits_data]
                     return RateLimitsUpdated(event_id=json_object['event_id'], type=event_type, rate_limits=rate_limits)
+                elif event_type == "response.content_part.done":
+                    # Ensure only relevant fields are passed
+                    return ResponseContentPartDone(
+                        event_id=json_object['event_id'], 
+                        type=event_type,
+                        response_id=json_object.get('response_id'),
+                        item_id=json_object.get('item_id'),
+                        output_index=json_object.get('output_index'),
+                        content_index=json_object.get('content_index'),
+                        part=json_object.get('part')
+                    )
+                elif event_type == "response.content_part.added":
+                    # Ensure only relevant fields are passed
+                    return ResponseContentPartAdded(
+                        event_id=json_object['event_id'], 
+                        type=event_type,
+                        response_id=json_object.get('response_id'),
+                        item_id=json_object.get('item_id'),
+                        output_index=json_object.get('output_index'),
+                        content_index=json_object.get('content_index'),
+                        part=json_object.get('part')
+                    )
                 elif event_type == "response.function_call_arguments.done":
                     # Ensure only relevant fields are passed
                     return ResponseFunctionCallArgumentsDone(

diff --git a/src/realtime_ai/aio/web_socket_manager.py b/src/realtime_ai/aio/web_socket_manager.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import websockets
+import uuid
 from realtime_ai.models.realtime_ai_options import RealtimeAIOptions
 
 logger = logging.getLogger(__name__)
@@ -16,11 +17,20 @@ def __init__(self, options: RealtimeAIOptions, service_manager):
         self.options = options
         self.service_manager = service_manager
         self.websocket = None
-        self.url = f"wss://api.openai.com/v1/realtime?model={self.options.model}"
-        self.headers = {
-            "Authorization": f"Bearer {self.options.api_key}",
-            "OpenAI-Beta": "realtime=v1",
-        }
+
+        if self.options.azure_openai_endpoint:
+            self.request_id = uuid.uuid4()
+            self.url = self.options.azure_openai_endpoint + f"?api-version={self.options.azure_openai_api_version}" + f"&deployment={self.options.model}"
+            self.headers = {
+                "x-ms-client-request-id": str(self.request_id),
+                "api-key": self.options.api_key,
+            }
+        else:
+            self.url = f"wss://api.openai.com/v1/realtime?model={self.options.model}"
+            self.headers = {
+                "Authorization": f"Bearer {self.options.api_key}",
+                "openai-beta": "realtime=v1",
+            }
 
         self.reconnect_delay = 5 # Time to wait before attempting to reconnect, in seconds
 

diff --git a/src/realtime_ai/models/realtime_ai_options.py b/src/realtime_ai/models/realtime_ai_options.py
@@ -9,6 +9,8 @@ class RealtimeAIOptions:
     model: str
     modalities: List[str]
     instructions: str
+    azure_openai_endpoint: Optional[str] = None
+    azure_openai_api_version: Optional[str] = None
     voice: str = "alloy"
     input_audio_format: str = "pcm16"
     output_audio_format: str = "pcm16"

diff --git a/src/realtime_ai/realtime_ai_service_manager.py b/src/realtime_ai/realtime_ai_service_manager.py
@@ -139,6 +139,28 @@ def parse_realtime_event(self, json_object: dict) -> Optional[EventBase]:
                     rate_limits_data = json_object['rate_limits']
                     rate_limits = [RateLimit(**rate) for rate in rate_limits_data]
                     return RateLimitsUpdated(event_id=json_object['event_id'], type=event_type, rate_limits=rate_limits)
+                elif event_type == "response.content_part.done":
+                    # Ensure only relevant fields are passed
+                    return ResponseContentPartDone(
+                        event_id=json_object['event_id'], 
+                        type=event_type,
+                        response_id=json_object.get('response_id'),
+                        item_id=json_object.get('item_id'),
+                        output_index=json_object.get('output_index'),
+                        content_index=json_object.get('content_index'),
+                        part=json_object.get('part')
+                    )
+                elif event_type == "response.content_part.added":
+                    # Ensure only relevant fields are passed
+                    return ResponseContentPartAdded(
+                        event_id=json_object['event_id'], 
+                        type=event_type,
+                        response_id=json_object.get('response_id'),
+                        item_id=json_object.get('item_id'),
+                        output_index=json_object.get('output_index'),
+                        content_index=json_object.get('content_index'),
+                        part=json_object.get('part')
+                    )
                 elif event_type == "response.function_call_arguments.done":
                     # Ensure only relevant fields are passed
                     return ResponseFunctionCallArgumentsDone(

diff --git a/src/realtime_ai/web_socket_manager.py b/src/realtime_ai/web_socket_manager.py
@@ -2,6 +2,7 @@
 import logging
 import threading
 import time
+import uuid
 import websocket  # pip install websocket-client
 from realtime_ai.models.realtime_ai_options import RealtimeAIOptions
 
@@ -16,11 +17,20 @@ class WebSocketManager:
     def __init__(self, options : RealtimeAIOptions, service_manager):
         self.options = options
         self.service_manager = service_manager
-        self.url = f"wss://api.openai.com/v1/realtime?model={self.options.model}"
-        self.headers = [
-            f"Authorization: Bearer {self.options.api_key}",
-            "OpenAI-Beta: realtime=v1"
-        ]
+
+        if self.options.azure_openai_endpoint:
+            self.request_id = uuid.uuid4()
+            self.url = self.options.azure_openai_endpoint + f"?api-version={self.options.azure_openai_api_version}" + f"&deployment={self.options.model}"
+            self.headers = {
+                "x-ms-client-request-id": str(self.request_id),
+                "api-key": self.options.api_key,
+            }
+        else:
+            self.url = f"wss://api.openai.com/v1/realtime?model={self.options.model}"
+            self.headers = {
+                "Authorization": f"Bearer {self.options.api_key}",
+                "openai-beta": "realtime=v1",
+            }
 
         self.ws = None
         self._receive_thread = None