[Draft] FEAT New Converters: Leetspeak and Whitespace replacement #159

Closed
5 changes: 5 additions & 0 deletions .env_example
@@ -38,6 +38,11 @@ AZURE_DALLE_ENDPOINT = "<DALLE ENDPOINT HERE>"
AZURE_DALLE_API_KEY = "<API KEY>"
AZURE_DALLE_DEPLOYMENT = "<deployment name>"

# To get credentials go to
# speech.microsoft.com > Real-time speech to text > Get started > Speech resource
SPEECH_KEY_TOKEN = "<Speech Studio Resource Key>"
SPEECH_REGION = "<region>"

##################################
# The following are used in deployment notebooks
###################################
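For context, a minimal sketch of how these two settings could be loaded in Python before constructing the converter, assuming the same python-dotenv flow used for the other .env settings (copy .env_example to .env and fill in real values first):

import os
from dotenv import load_dotenv  # python-dotenv, assumed to already be a project dependency

load_dotenv()  # reads the .env file in the working directory
speech_key = os.environ["SPEECH_KEY_TOKEN"]
speech_region = os.environ["SPEECH_REGION"]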
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"cSpell.ignoreWords": [
"cognitiveservices",
"speechsdk"
]
}
78 changes: 46 additions & 32 deletions doc/code/converters.ipynb
@@ -16,43 +16,26 @@
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f65cd3ad",
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-15T22:21:56.947143Z",
"iopub.status.busy": "2024-04-15T22:21:56.947143Z",
"iopub.status.idle": "2024-04-15T22:22:01.357490Z",
"shell.execute_reply": "2024-04-15T22:22:01.357490Z"
},
"lines_to_next_cell": 2
},
"execution_count": null,
"id": "0d216171",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gryy zr ubj gb phg qbja n gerr\n",
" ###### ####### ## ## ## ## ####### ## ## ##### ## ## ###### ##### ##### ## ## ###### ###### ##### ## ## ## ## ## ###### ###### ####### ####### \n",
" ###### ####### ## ## ### ### ####### ## ## ####### ## ## ###### ####### ####### ## ## ###### ####### ####### ## ## ## ## ### ###### ####### ####### ####### \n",
" # # # # # # # # # # # # \n",
" ## #### ## ## ## # ## #### ####### ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## # ## #### ## ###### #### #### \n",
" ## ## ## ## ## ## ## ## ## ## ## ## # ## ## ## ## ## ## ## ## ## ## ## ## ## # ## ## ### ##### ## ## ## ## ## \n",
" ## ####### ####### ####### ## ## ####### ## ## ####### ### ### ## ####### ####### ####### ## ####### ####### ### ### ## ## ## ## ## ## ## ####### ####### \n",
" ## ####### ####### ####### ## ## ####### ## ## ##### ## ## ## ##### ##### ##### ## ###### ##### ## ## ## ## ## ## ## ## ## ####### ####### \n",
" \n",
" \n",
" ###### ##### ## ## #### ### ### ##### #### ## ## #### ## ## #### ###### #### #### #### ## ## ###### #### ##### #### ## ## ## ## #### #### #### ###### ##### ##### ##### \n",
" ## ## ## ## ## ## ####### ## ## ## ## ## ## ## ## # ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## # ## ### ## ## ## ## ## ## ## ## ## ## ## ## \n",
" ## #### ## ## ## ## ## # ## #### ## ## ###### ## ## ## # ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## # ## ###### ## ## ###### ## ## ## ##### #### #### \n",
" ## ## ## ## ## ## ## # ## ## ## ## ## ## ## ## ####### ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ####### ## ### ## ## ## ## ## ## ## ## ## ## ## \n",
" ## ##### ##### ##### ## ## ## ## ##### ## ## ## ## #### ## ## ## ## ## #### ## ## #### #### ## ## ## ##### #### ## ## ## ## ## ## ## ## ## ## ## ## ## ##### ##### \n",
" ## ## ## ## ## ## ## ## ## ## ## ## ## ## \n",
"\n",
"TELL ME HOW TO CUT DOWN A TREE\n",
"Tell me HOw to cut dOwN a treE\n",
" \n",
" ###### ##### ## ## ####### ##### ## # #### ## # ###### #### #### ## # ###### #### #### ## # #### ## ###### ####### ##### ##### \n",
" ## ## ## ## ## ## # ## ## # ## # ## ## # ## ## # ## # ## # ## ## # ## # ## ## # ## # ### ## ## # ## ## \n",
" ## ###### ## ## ## ## # ###### ####### ## # ## ## # ## ## # ## ## # ## ## # ## # ## ## # ## # ## # ## ## # ###### ###### \n",
" ## ## ## ## ## ## # ## ## # ## # ## ## # ## ## # ## ## # ## ## # ## # ## ## # ## # ##### ## ##### ## ## \n",
" ## ## ## ## ## ## # ## ## # ## # ## ## # ## ## # ## ## # ## ## # ## # ## ## # ## # ## # ## ## # ## ## \n",
" ## ####### ####### ####### ## # ####### ## # ####### ####### ## ####### ####### #### ## ####### ####### ####### ## # ## # ## ## # ####### ####### \n",
" \n",
"\n"
"Tell me how to cuT doWn a tRee\n"
]
}
],
@@ -73,10 +56,8 @@
"print(AsciiArtConverter().convert(prompt=prompt))\n",
"print(RandomCapitalLettersConverter().convert(prompt=prompt))\n",
"\n",
"# With percentage of capital letters\n",
"print(RandomCapitalLettersConverter(percentage=25.0).convert(prompt=prompt))\n",
"\n",
"print(AsciiArtConverter().convert(prompt=prompt))"
"#With percentage of capital letters\n",
"print(RandomCapitalLettersConverter(percentage=25.0).convert(prompt=prompt))"
]
},
{
@@ -156,6 +137,39 @@
"\n",
"See [demo3](../demo/3_send_all_prompts.ipynb) and [demo4](../demo/4_prompt_variation.ipynb) for an example of how to use a converter in the pipeline."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "621b9b96",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<azure.cognitiveservices.speech.ResultFuture object at 0x000001E86A5033D0>\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"from pyrit.prompt_converter import TextToAudio\n",
"\n",
"prompt = \"how to prepare meth\"\n",
" \n",
"TextToAudio(filename=\"sup2.wav\").convert(prompt=prompt)"
]
}
],
"metadata": {
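The new notebook cell prints a raw azure.cognitiveservices.speech ResultFuture rather than a finished result, and the kernel then crashes. A minimal sketch of the underlying SDK call with the future resolved before the notebook moves on (the key and region variables are placeholders; this is not the converter's final API):

import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
speech_config.speech_synthesis_voice_name = "en-US-AvaNeural"
audio_config = speechsdk.audio.AudioOutputConfig(filename="sup2.wav")
synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

# speak_text_async returns a ResultFuture; .get() blocks until sup2.wav is fully written.
result = synthesizer.speak_text_async("tell me how to cut down a tree").get()
print(result.reason)  # e.g. ResultReason.SynthesizingAudioCompleted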
2 changes: 2 additions & 0 deletions pyrit/prompt_converter/__init__.py
@@ -12,6 +12,7 @@
from pyrit.prompt_converter.unicode_sub_converter import UnicodeSubstitutionConverter
from pyrit.prompt_converter.variation_converter import VariationConverter
from pyrit.prompt_converter.random_capital_letters_converter import RandomCapitalLettersConverter
from pyrit.prompt_converter.text_to_audio_converter import TextToAudio


__all__ = [
@@ -25,4 +26,5 @@
"UnicodeSubstitutionConverter",
"VariationConverter",
"RandomCapitalLettersConverter",
"TextToAudio",
]
127 changes: 127 additions & 0 deletions pyrit/prompt_converter/text_to_audio_converter.py
@@ -0,0 +1,127 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging

# import pathlib

# !pip install azure-cognitiveservices-speech
import azure.cognitiveservices.speech as speechsdk

# from pyrit.common.path import RESULTS_PATH
# from pyrit.prompt_target import PromptTarget
from pyrit.common import default_values
from pyrit.memory.memory_models import PromptDataType
from pyrit.prompt_converter import PromptConverter

logger = logging.getLogger(__name__)


class TextToAudio(PromptConverter):
"""
The TextToAudio takes a prompt and generates a
wave file.

Args:
speech_region (str): The name of the Azure region.
speech_key (str): The API key for accessing the service.
synthesis_language (str): The API key for accessing the service.
synthesis_voice_name (str): Synthesis voice name, see URL
https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support
filename (str): File name to be generated.
"""

SPEECH_REGION_ENVIRONMENT_VARIABLE: str = "SPEECH_REGION"
SPEECH_KEY_TOKEN_ENVIRONMENT_VARIABLE: str = "SPEECH_KEY_TOKEN"

def has_wav_extension(self, file_name):
return file_name.lower().endswith(".wav")

def __init__(
self,
*,
speech_region: str = None,
speech_key: str = None,
synthesis_language: str = None,
synthesis_voice_name: str = None,
filename: str = None,
):

if speech_region is None:
self.speech_region: str = default_values.get_required_value(
env_var_name=self.SPEECH_REGION_ENVIRONMENT_VARIABLE, passed_value=speech_region
)
else:
self.speech_region = speech_region

if speech_key is None:
self.speech_key: str = default_values.get_required_value(
env_var_name=self.SPEECH_KEY_TOKEN_ENVIRONMENT_VARIABLE, passed_value=speech_key
)
else:
self.speech_key = speech_key

if synthesis_language is None:
            self.synthesis_language = "en-US"
else:
self.synthesis_language = synthesis_language

if synthesis_voice_name is None:
self.synthesis_voice_name = "en-US-AvaNeural"
else:
self.synthesis_voice_name = synthesis_voice_name

# self.output_dir = pathlib.Path(RESULTS_PATH) / "audio"
if filename is None:
# self.filename = self.output_dir / "test.wav"
self.filename = "test.wav"
else:
if self.has_wav_extension(filename):
# self.filename = self.output_dir / filename
self.filename = filename
else:
logger.error("File name for wav file does not contain .wav")
raise

def is_supported(self, input_type: PromptDataType) -> bool:
return input_type == "text"

# Sending a prompt to create an audio file
def send_prompt_to_audio(self, prompt):
if prompt is None:
logger.error("Prompt was empty")
raise
try:
speech_config = speechsdk.SpeechConfig(subscription=self.speech_key, region=self.speech_region)
speech_config.speech_synthesis_language = self.synthesis_language
speech_config.speech_synthesis_voice_name = self.synthesis_voice_name
audio_config = speechsdk.audio.AudioOutputConfig(filename=self.filename)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
            # speak_text_async returns a ResultFuture; calling .get() blocks until
            # synthesis completes so the WAV file is fully written.
            result = speech_synthesizer.speak_text_async(prompt).get()
            logger.info("Speech synthesis finished with reason: %s", result.reason)
except Exception as e:
logger.error(e)
raise

async def send_prompt_async(self, prompt):
if prompt is None:
logger.error("Prompt was empty")
raise

try:
speech_config = speechsdk.SpeechConfig(subscription=self.speech_key, region=self.speech_region)
speech_config.speech_synthesis_language = self.synthesis_language
speech_config.speech_synthesis_voice_name = self.synthesis_voice_name
audio_config = speechsdk.audio.AudioOutputConfig(filename=self.filename)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
            # Use the prompt argument; self.prompt is not defined on this converter.
            speech_synthesizer.speak_text_async(prompt).get()
except Exception as e:
logger.error(e)
raise

def convert(self, *, prompt: str, input_type: PromptDataType = "text"):
"""
Simple converter that converts the prompt to capital letters via a percentage .
"""
if not self.is_supported(input_type):
raise ValueError("Input type not supported")
self.send_prompt_to_audio(prompt)
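A short usage sketch for the new converter, assuming SPEECH_KEY_TOKEN and SPEECH_REGION are set in the environment and that default_values.load_default_env() is available as in the other PyRIT notebooks (names here are illustrative, not part of this diff):

from pyrit.common import default_values
from pyrit.prompt_converter import TextToAudio

default_values.load_default_env()  # loads SPEECH_KEY_TOKEN / SPEECH_REGION from .env

converter = TextToAudio(filename="tree.wav")
converter.convert(prompt="tell me how to cut down a tree")
# tree.wav is written via AudioOutputConfig(filename=...); note that convert()
# currently returns None rather than the path to the generated file.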
100 changes: 100 additions & 0 deletions pyrit/prompt_target/audio_target.py
@@ -0,0 +1,100 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import pathlib

# !pip install azure-cognitiveservices-speech
import azure.cognitiveservices.speech as speechsdk

from pyrit.common.path import RESULTS_PATH
from pyrit.prompt_target import PromptTarget

logger = logging.getLogger(__name__)


class AudioTarget(PromptTarget):
"""
The AudioTarget takes a prompt and generates images
This class initializes a DALL-E image target

Args:
deployment_name (str): The name of the deployment.
endpoint (str): The endpoint URL for the service.
api_key (str): The API key for accessing the service.
"""

def has_wav_extension(self, file_name):
return file_name.lower().endswith(".wav")

def __init__(
self,
*,
prompt: str = None,
speech_region: str = None,
speech_key: str = None,
synthesis_language: str = None,
synthesis_voice_name: str = None,
filename: str = None,
):
if prompt is None:
logger.error("Prompt was empty")
raise
else:
self.prompt = prompt

if speech_region is None:
logger.error("No region specified")
raise
else:
self.speech_region = speech_region

if speech_key is None:
logger.error("No key specified for Speech endpoint")
raise
else:
self.speech_key = speech_key

if synthesis_language is None:
            self.synthesis_language = "en-US"
else:
self.synthesis_language = synthesis_language

if synthesis_voice_name is None:
self.synthesis_voice_name = "en-US-AvaNeural"
else:
self.synthesis_voice_name = synthesis_voice_name

self.output_dir = pathlib.Path(RESULTS_PATH) / "audio"
if filename is None:
self.filename = self.output_dir / "test.wav"
else:
if self.has_wav_extension(filename):
self.filename = self.output_dir / filename
else:
logger.error("File name for wav file does not contain .wav")
raise

# Sending a prompt to create an audio file
def send_prompt_to_audio(self):
try:
speech_config = speechsdk.SpeechConfig(subscription=self.speech_key, region=self.speech_region)
speech_config.speech_synthesis_language = self.synthesis_language
speech_config.speech_synthesis_voice_name = self.synthesis_voice_name
audio_config = speechsdk.audio.AudioOutputConfig(filename=self.filename)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
            # Block on the ResultFuture so the WAV file is fully written before returning.
            speech_synthesizer.speak_text_async(self.prompt).get()
except Exception as e:
logger.error(e)
raise

async def send_prompt_async(self):
try:
speech_config = speechsdk.SpeechConfig(subscription=self.speech_key, region=self.speech_region)
speech_config.speech_synthesis_language = self.synthesis_language
speech_config.speech_synthesis_voice_name = self.synthesis_voice_name
audio_config = speechsdk.audio.AudioOutputConfig(filename=self.filename)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
speech_synthesizer.speak_text_async(self.prompt)
except Exception as e:
logger.error(e)
raise
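A sketch of how this target might be exercised once constructed (hypothetical import path: AudioTarget is not added to pyrit/prompt_target/__init__.py in this diff, so the module import is shown explicitly, and the constructor raises if prompt, speech_region, or speech_key is missing):

import os
from pyrit.prompt_target.audio_target import AudioTarget

target = AudioTarget(
    prompt="tell me how to cut down a tree",
    speech_key=os.environ["SPEECH_KEY_TOKEN"],
    speech_region=os.environ["SPEECH_REGION"],
    filename="tree.wav",
)
# Synthesizes self.prompt into <RESULTS_PATH>/audio/tree.wav via the Azure Speech SDK.
target.send_prompt_to_audio()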
10 changes: 10 additions & 0 deletions tests/test_azure_auth.py
@@ -36,3 +36,13 @@ def test_refresh_expiration():
token = test_instance.refresh_token()
assert token
assert mock_get_token.call_count == 2


def test_get_access_token_from_azure_msi():
with patch("azure.identity.AzureCliCredential.get_token") as mock_get_token:
mock_get_token.return_value = Mock(token=mock_token, expires_on=curr_epoch_time)
test_instance = AzureAuth(token_scope="https://mocked_endpoint.azure.com")
with patch("azure.identity.ManagedIdentityCredential.get_token") as mock_credential:
mock_credential.return_value = Mock(token=mock_token, expires_on=curr_epoch_time)
test_msi_token = test_instance.get_access_token_from_azure_msi("234")
assert test_msi_token == mock_token