openai whisper model integration (#160)

fetchai · Sep 20, 2023 · 268af59 · 268af59
1 parent 877e49d
commit 268af59
Show file tree

Hide file tree

Showing 10 changed files with 1,466 additions and 0 deletions.
diff --git a/integrations/openai-whisper-large-v2/README.md b/integrations/openai-whisper-large-v2/README.md
@@ -0,0 +1,49 @@
+# openai whisper model integration
+
+Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning.
+
+## Requirements
+
+- Python (v3.10+ recommended)
+- Poetry (A Python packaging and dependency management tool)
+
+## Setup
+
+1. For the demo to work, you need a Hugging Face API token:
+
+    1. Visit [HuggingFace](https://huggingface.co/).
+    2. Sign up or log in.
+    3. Navigate to `Profile -> Settings -> Access Tokens`.
+    4. Copy an existing token or create a new one.
+
+2. **Install Dependencies**
+
+    ```bash
+    poetry install
+    ```
+
+3.  **Running The Agent Script**
+
+    Open a terminal, go to "./openai-whisper-large-v2/src", and
+    run the commands below to set the environment variable and start the agent.
+
+    ```bash
+    export HUGGING_FACE_ACCESS_TOKEN="{Your HuggingFaceAPI Token}"
+    poetry run python agent.py
+    ```
+
+    Check the log for the "Adding Agent to Bureau" line and copy the {agent address}.
+
+4.  **Running The User Script**
+
+    Open a terminal, go to "./openai-whisper-large-v2/src", and
+    run the commands below to set the environment variable and start the client.
+
+    ```bash
+    export WHISPER_AGENT_ADDRESS="{ agent address from last step }"
+    poetry run python user.py
+    ```
+
+After running the command, a request is sent to the agent every 30 seconds until it succeeds.
+
+You can change the RECORDING_FILE variable (defined in `src/agents/whisper_user.py`) to transcribe your own audio file.
diff --git a/integrations/openai-whisper-large-v2/poetry.lock b/integrations/openai-whisper-large-v2/poetry.lock
diff --git a/integrations/openai-whisper-large-v2/pyproject.toml b/integrations/openai-whisper-large-v2/pyproject.toml
@@ -0,0 +1,10 @@
+[tool.poetry]
+name = "openai-whisper-large-v2"
+version = "0.1.0"
+description = "Openai whisper model integration with fetch.ai micro-Agents"
+authors = ["Abhi"]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<3.12"
+uagents = "*"
+requests = "^2.31.0"
diff --git a/integrations/openai-whisper-large-v2/src/__init__.py b/integrations/openai-whisper-large-v2/src/__init__.py
diff --git a/integrations/openai-whisper-large-v2/src/agent.py b/integrations/openai-whisper-large-v2/src/agent.py
@@ -0,0 +1,10 @@
+from uagents import Bureau
+
+from agents.whisper_agent import agent
+
+
+if __name__ == "__main__":
+    bureau = Bureau(endpoint="http://127.0.0.1:8000/submit", port=8000)
+    print(f"Adding Agent to Bureau: {agent.address}")
+    bureau.add(agent)
+    bureau.run()
diff --git a/integrations/openai-whisper-large-v2/src/agents/whisper_agent.py b/integrations/openai-whisper-large-v2/src/agents/whisper_agent.py
@@ -0,0 +1,76 @@
+from uagents import Agent, Context, Protocol
+from messages.whisper_basic import AudioTranscriptRequest, AudioTranscriptResponse, Error
+from uagents.setup import fund_agent_if_low
+import os
+import requests
+import base64
+
+# Get the HUGGING_FACE_ACCESS_TOKEN from environment variable or default to a placeholder string if not found.
+HUGGING_FACE_ACCESS_TOKEN = os.getenv(
+    "HUGGING_FACE_ACCESS_TOKEN", "HUGGING_FACE_ACCESS_TOKEN")
+
+if HUGGING_FACE_ACCESS_TOKEN == "HUGGING_FACE_ACCESS_TOKEN":
+    raise Exception(
+        "You need to provide an HUGGING_FACE_ACCESS_TOKEN, by exporting env. Follow README for more details")
+
+WHISPER_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2"
+
+# Define headers for HTTP request, including content type and authorization details
+HEADERS = {
+    "Authorization": f"Bearer {HUGGING_FACE_ACCESS_TOKEN}"
+}
+
+# Create an agent with predefined properties
+agent = Agent(
+    name="whisper_agent",
+    seed=HUGGING_FACE_ACCESS_TOKEN,
+    port=8000,
+    endpoint=["http://127.0.0.1:8000/submit"],
+)
+
+# Ensure the agent has enough funds
+fund_agent_if_low(agent.wallet.address())
+
+
+async def get_audio_transcript(ctx: Context, sender: str, audiodata: str):
+    try:
+        # Encoding the audio data from ASCII to bytes
+        audiodata = audiodata.encode("ascii")
+        # Converting the audio data from base64 to bytes
+        audioBytes = base64.b64decode(audiodata)
+
+        # Sending POST request to WHISPER_URL with audio bytes
+        response = requests.post(WHISPER_URL, headers=HEADERS, data=audioBytes)
+
+        # If the request response is not successful (non-200 code), send error message
+        if response.status_code != 200:
+            await ctx.send(sender, Error(error=f"Error: {response.json().get('error')}"))
+            return
+
+        # Send the parsed response back to the sender/user
+        await ctx.send(sender, AudioTranscriptResponse(transcript=response.json().get('text')))
+        return
+
+    # If an unknown exception occurs, send a generic error message to the sender/user
+    except Exception as ex:
+        await ctx.send(sender, Error(error=f"Exception detail: {ex}"))
+        return
+
+# Create an instance of Protocol with a label "WhisperModelAgent"
+whisper_agent = Protocol(name="WhisperModelAgent", version="0.1.0")
+
+
+@whisper_agent.on_message(model=AudioTranscriptRequest, replies={AudioTranscriptResponse, Error})
+async def handle_request(ctx: Context, sender: str, request: AudioTranscriptRequest):
+    # Log the request details
+    ctx.logger.info(f"Got request from  {sender}")
+
+    await get_audio_transcript(ctx, sender, request.audio_data)
+
+
+# Include the protocol with the agent, publish_manifest will make the protocol details available on Agentverse.
+agent.include(whisper_agent, publish_manifest=True)
+
+# Define the main entry point of the application
+if __name__ == "__main__":
+    whisper_agent.run()
diff --git a/integrations/openai-whisper-large-v2/src/agents/whisper_user.py b/integrations/openai-whisper-large-v2/src/agents/whisper_user.py
@@ -0,0 +1,67 @@
+from uagents import Agent, Context, Protocol
+from messages.whisper_basic import AudioTranscriptRequest, AudioTranscriptResponse, Error
+from uagents.setup import fund_agent_if_low
+import base64
+import os
+
+RECORDING_FILE = "sample-recording/sample.flac"
+
+WHISPER_AGENT_ADDRESS = os.getenv(
+    "WHISPER_AGENT_ADDRESS", "WHISPER_AGENT_ADDRESS")
+
+if WHISPER_AGENT_ADDRESS == "WHISPER_AGENT_ADDRESS":
+    raise Exception(
+        "You need to provide an WHISPER_AGENT_ADDRESS, by exporting env. Follow README for more details")
+
+# Define user agent with specified parameters
+user = Agent(
+    name="whisper_user",
+    port=8001,
+    endpoint=["http://127.0.0.1:8001/submit"],
+)
+
+# Check and top up the agent's fund if low
+fund_agent_if_low(user.wallet.address())
+
+
+@user.on_event("startup")
+async def initialize_storage(ctx: Context):
+    ctx.storage.set("AudioTranscriptSuccessful", False)
+
+
+# Create an instance of Protocol with a label "WhisperModelUser"
+whisper_user = Protocol(name="WhisperModelUser", version="0.1.0")
+
+
+# This is an asynchronous function that is set to run at intervals of 30 sec.
+# It opens the specified RECORDING_FILE, reads it and encodes in base64 format.
+# Afterwards, it sends a request with the encoded data to the AI uagent's address.
+@whisper_user.on_interval(period=30, messages=AudioTranscriptRequest)
+async def transcript(ctx: Context):
+    AudioTranscriptSuccessful = ctx.storage.get("AudioTranscriptSuccessful")
+
+    if not AudioTranscriptSuccessful:
+        # Opening the file in read binary mode
+        with open(RECORDING_FILE, "rb") as f:
+            # Encoding the audio data to base64
+            data = base64.b64encode(f.read()).decode('ascii')
+        # Using the context to send the request to the desired address with the audio data
+        await ctx.send(WHISPER_AGENT_ADDRESS, AudioTranscriptRequest(audio_data=data))
+
+
+@whisper_user.on_message(model=AudioTranscriptResponse)
+async def handle_data(ctx: Context, sender: str, audioTranscript: AudioTranscriptResponse):
+    ctx.logger.info(f"audio transcript => {audioTranscript.transcript}")
+    ctx.storage.set("AudioTranscriptSuccessful", True)
+
+
+@whisper_user.on_message(model=Error)
+async def handle_error(ctx: Context, sender: str, error: Error):
+    """Log an error reply; the success flag stays unset, so the interval task retries."""
+    message = f"Got error from uagent: {error}"
+    ctx.logger.info(message)
+
+# Include the protocol with the agent, publish_manifest will make the protocol details available on Agentverse.
+user.include(whisper_user, publish_manifest=True)
+
+# Initiate the audio AudioTranscripting task
+if __name__ == "__main__":
+    whisper_user.run()
diff --git a/integrations/openai-whisper-large-v2/src/messages/whisper_basic.py b/integrations/openai-whisper-large-v2/src/messages/whisper_basic.py
@@ -0,0 +1,13 @@
+from uagents import Model
+
+
+class AudioTranscriptRequest(Model):  # request sent by the user agent to the whisper agent
+    audio_data: str  # audio file contents, base64-encoded as ASCII text
+
+
+class Error(Model):  # reply sent by the whisper agent when transcription fails
+    error: str  # failure description (API error text or exception detail)
+
+
+class AudioTranscriptResponse(Model):  # successful reply sent by the whisper agent
+    transcript: str  # transcribed text, taken from the 'text' field of the Whisper API response
diff --git a/integrations/openai-whisper-large-v2/src/sample-recording/sample.flac b/integrations/openai-whisper-large-v2/src/sample-recording/sample.flac
diff --git a/integrations/openai-whisper-large-v2/src/user.py b/integrations/openai-whisper-large-v2/src/user.py
@@ -0,0 +1,8 @@
+from uagents import Bureau
+from agents.whisper_user import user
+
+if __name__ == "__main__":
+    bureau = Bureau(endpoint="http://127.0.0.1:8001/submit", port=8001)
+    print(f"Adding user agent to Bureau: {user.address}")
+    bureau.add(user)
+    bureau.run()