Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

M.E.AI.Abstractions - Speech to Text Abstraction #5838

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified eng/spellchecking_exclusions.dic
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,55 @@ static async Task<ChatResponse> ToChatResponseAsync(
}
}

/// <summary>Merges each run of adjacent <see cref="TextContent"/> items in the list into a single item.</summary>
/// <param name="contents">The list to coalesce. It is modified in place.</param>
internal static void CoalesceTextContent(List<AIContent> contents)
{
    // Lazily created and reused across runs so that at most one builder is allocated per call.
    StringBuilder? buffer = null;

    int index = 0;
    while (index < contents.Count - 1)
    {
        if (contents[index] is TextContent head)
        {
            if (contents[index + 1] is TextContent)
            {
                // A run of at least two text items starts here. Seed the buffer with the head's text,
                // then consume every following text item, nulling out each slot as it is absorbed so
                // that all of the emptied slots can be removed together at the end.
                buffer ??= new();
                _ = buffer.Clear().Append(head.Text);

                int end = index + 1;
                while (end < contents.Count && contents[end] is TextContent following)
                {
                    _ = buffer.Append(following.Text);
                    contents[end] = null!;
                    end++;
                }

                // The merged item takes the place of the head and carries a clone of the head's
                // additional properties; properties of the absorbed items are not propagated.
                contents[index] = new TextContent(buffer.ToString())
                {
                    AdditionalProperties = head.AdditionalProperties?.Clone(),
                };

                // 'end' is either past the list or on a non-text item, so resume scanning there.
                index = end;
            }
            else
            {
                // The neighbor isn't text, so neither this item nor the neighbor can begin a run.
                index += 2;
            }
        }
        else
        {
            index++;
        }
    }

    // Drop every null slot in a single sweep.
    _ = contents.RemoveAll(item => item is null);
}

/// <summary>Finalizes the <paramref name="response"/> object.</summary>
private static void FinalizeResponse(ChatResponse response)
{
Expand Down Expand Up @@ -280,53 +329,4 @@ private static void ProcessUpdate(ChatResponseUpdate update, ChatResponse respon
}
}
}

/// <summary>Merges each run of adjacent <see cref="TextContent"/> items in the list into a single item.</summary>
/// <param name="contents">The list to coalesce. It is modified in place.</param>
private static void CoalesceTextContent(List<AIContent> contents)
{
    // Lazily created and reused across runs so that at most one builder is allocated per call.
    StringBuilder? buffer = null;

    int index = 0;
    while (index < contents.Count - 1)
    {
        if (contents[index] is TextContent head)
        {
            if (contents[index + 1] is TextContent)
            {
                // A run of at least two text items starts here. Seed the buffer with the head's text,
                // then consume every following text item, nulling out each slot as it is absorbed so
                // that all of the emptied slots can be removed together at the end.
                buffer ??= new();
                _ = buffer.Clear().Append(head.Text);

                int end = index + 1;
                while (end < contents.Count && contents[end] is TextContent following)
                {
                    _ = buffer.Append(following.Text);
                    contents[end] = null!;
                    end++;
                }

                // The merged item takes the place of the head and carries a clone of the head's
                // additional properties; properties of the absorbed items are not propagated.
                contents[index] = new TextContent(buffer.ToString())
                {
                    AdditionalProperties = head.AdditionalProperties?.Clone(),
                };

                // 'end' is either past the list or on a non-text item, so resume scanning there.
                index = end;
            }
            else
            {
                // The neighbor isn't text, so neither this item nor the neighbor can begin a run.
                index += 2;
            }
        }
        else
        {
            index++;
        }
    }

    // Drop every null slot in a single sweep.
    _ = contents.RemoveAll(item => item is null);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ namespace Microsoft.Extensions.AI;
/// <summary>Provides a base class for all content used with AI services.</summary>
[JsonPolymorphic(TypeDiscriminatorPropertyName = "$type")]
[JsonDerivedType(typeof(DataContent), typeDiscriminator: "data")]
[JsonDerivedType(typeof(ErrorContent), typeDiscriminator: "error")]
[JsonDerivedType(typeof(FunctionCallContent), typeDiscriminator: "functionCall")]
[JsonDerivedType(typeof(FunctionResultContent), typeDiscriminator: "functionResult")]
[JsonDerivedType(typeof(TextContent), typeDiscriminator: "text")]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Text.Json.Serialization;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI;

/// <summary>Represents an error.</summary>
/// <remarks>
/// An <see cref="ErrorContent"/> is typically used to report a non-fatal error: something went wrong
/// during the operation, but the operation was nevertheless able to continue.
/// </remarks>
[DebuggerDisplay("{DebuggerDisplay,nq}")]
public class ErrorContent : AIContent
{
    /// <summary>Backing field for <see cref="Message"/>.</summary>
    private string _message;

    /// <summary>Initializes a new instance of the <see cref="ErrorContent"/> class with the specified message.</summary>
    /// <param name="message">The error message to store in this content.</param>
    [JsonConstructor]
    public ErrorContent(string message) => _message = Throw.IfNull(message);

    /// <summary>Gets or sets the error message.</summary>
    public string Message
    {
        get
        {
            return _message;
        }

        set
        {
            // Assigned values go through the same null check as the constructor argument.
            _message = Throw.IfNull(value);
        }
    }

    /// <summary>Gets or sets the error code.</summary>
    public string? ErrorCode { get; set; }

    /// <summary>Gets or sets the error details.</summary>
    public string? Details { get; set; }

    /// <summary>Gets a string representing this instance to display in the debugger.</summary>
    [DebuggerBrowsable(DebuggerBrowsableState.Never)]
    private string DebuggerDisplay
    {
        get
        {
            // Always show the message; append the code and details only when they are set.
            string display = $"Error = {Message}";

            if (ErrorCode is not null)
            {
                display += $" ({ErrorCode})";
            }

            if (Details is not null)
            {
                display += $" - {Details}";
            }

            return display;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,18 @@
<PropertyGroup>
<TargetFrameworks>$(TargetFrameworks);netstandard2.0</TargetFrameworks>
<NoWarn>$(NoWarn);CA2227;CA1034;SA1316;S3253</NoWarn>
<NoWarn>$(NoWarn);MEAI001</NoWarn>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<DisableNETStandardCompatErrors>true</DisableNETStandardCompatErrors>
</PropertyGroup>

<PropertyGroup>
<InjectExperimentalAttributeOnLegacy>true</InjectExperimentalAttributeOnLegacy>
<InjectJsonSchemaExporterOnLegacy>true</InjectJsonSchemaExporterOnLegacy>
<InjectRequiredMemberOnLegacy>true</InjectRequiredMemberOnLegacy>
<InjectSharedEmptyCollections>true</InjectSharedEmptyCollections>
<InjectStringHashOnLegacy>true</InjectStringHashOnLegacy>
<InjectStringSyntaxAttributeOnLegacy>true</InjectStringSyntaxAttributeOnLegacy>
<InjectRequiredMemberOnLegacy>true</InjectRequiredMemberOnLegacy>
</PropertyGroup>

<ItemGroup Condition="'$(TargetFrameworkIdentifier)' != '.NETCoreApp'">
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI;

/// <summary>
/// Provides an optional base class for an <see cref="ISpeechToTextClient"/> that passes through calls to another instance.
/// </summary>
/// <remarks>
/// This is recommended as a base type when building clients that can be chained in any order around an underlying <see cref="ISpeechToTextClient"/>.
/// The default implementation simply passes each call to the inner client instance.
/// </remarks>
[Experimental("MEAI001")]
public class DelegatingSpeechToTextClient : ISpeechToTextClient
{
/// <summary>
/// Initializes a new instance of the <see cref="DelegatingSpeechToTextClient"/> class that wraps the specified client.
/// </summary>
/// <param name="innerClient">The client to wrap. It is checked with <c>Throw.IfNull</c> and exposed through <see cref="InnerClient"/>.</param>
protected DelegatingSpeechToTextClient(ISpeechToTextClient innerClient) =>
    InnerClient = Throw.IfNull(innerClient);

/// <inheritdoc />
public void Dispose()
{
// Run the overridable cleanup in Dispose(bool), flagging that this is an explicit Dispose() call.
Dispose(disposing: true);
// Cleanup has been performed, so request that no finalizer be run for this instance.
GC.SuppressFinalize(this);
}

/// <summary>Gets the inner <see cref="ISpeechToTextClient" /> that was supplied to the constructor.</summary>
protected ISpeechToTextClient InnerClient { get; }

/// <inheritdoc />
public virtual Task<SpeechToTextResponse> TranscribeAudioAsync(
    IList<IAsyncEnumerable<DataContent>> speechContents,
    SpeechToTextOptions? options = null,
    CancellationToken cancellationToken = default) =>
    InnerClient.TranscribeAudioAsync(speechContents, options, cancellationToken); // Pure pass-through to the wrapped client.

/// <inheritdoc />
public virtual IAsyncEnumerable<SpeechToTextResponseUpdate> TranscribeStreamingAudioAsync(
IList<IAsyncEnumerable<DataContent>> speechContents, SpeechToTextOptions? options = null, CancellationToken cancellationToken = default)
{
return InnerClient.TranscribeStreamingAudioAsync(speechContents, options, cancellationToken);
Copy link
Author

@RogerBarreto RogerBarreto Mar 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick observation: as you bring the Transcribe signature back, will the ISpeechToTextClient interface also have Translate___Async signatures? Is that the rationale?

My original thinking on having the Response naming was to accommodate those two functionalities in the same method, given the change from IAudioTranscriptionClient to ISpeechToTextClient.

Copy link
Member

@stephentoub stephentoub Mar 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given audio in and text out, what's the difference between transcribe and translate? Isn't the latter still transcription, doing speech recognition to go from audio to text, "just" with a possibly different target language than the audio content?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By definition those are different, and I opted for avoiding any confusion if possible.

"just" with a possibly different target language than the audio content

Yes, currently this is the main difference, while translation may also have multiple outputs (for multiple different language translations from the same input).

Having a dedicated interface for each has its benefits but the underlying functionality is quite the same.

From AI that was the answer I had.

Is there any difference between speech transcription and translation ? can one be a subset of another?

Yes, there is a difference between speech transcription and translation, and while they are related concepts, one is not necessarily a subset of the other. Let me break it down:

Speech Transcription
Speech transcription involves converting spoken language (audio) into written text in the same language. For example, if someone speaks in English, transcription would produce a written English version of what was said. The focus is on accurately capturing the words, and sometimes additional details like tone, pauses, or speaker identification (e.g., in a multi-speaker setting like a podcast or interview). It’s about representing the spoken content in a textual form without changing the language.

Translation
Translation, on the other hand, involves converting text or speech from one language to another. For example, translating spoken English into written Spanish or spoken French into written English. The goal is to preserve the meaning and intent of the original content while adapting it to a different language, which often requires cultural and linguistic adjustments beyond just word-for-word conversion.

Key Differences
Language Change: Transcription stays within the same language; translation shifts between languages.
Process: Transcription is about capturing what’s said as text, while translation involves interpreting and rephrasing meaning in another language.
Purpose: Transcription is often used for documentation (e.g., court records, subtitles), while translation is used to make content accessible to speakers of other languages.
Can One Be a Subset of the Other?
Not exactly, but they can overlap or be part of a broader process:

Transcription as a Step in Translation: In some workflows, speech is first transcribed into text in the original language, and then that text is translated into another language. For example, a Spanish speech might be transcribed into Spanish text and then translated into English. Here, transcription is a precursor to translation, but it’s not a subset—it’s a distinct step.
Real-Time Speech Translation: Modern technology (like AI-powered interpreters) can combine transcription and translation into a seamless process, where spoken words in one language are directly converted to text or speech in another. In this case, transcription might happen internally as part of the translation pipeline, but they remain separate functions conceptually.
Conclusion
Transcription and translation serve different purposes and operate at different levels of language processing. While they can work together (e.g., transcribe then translate), neither is inherently a subset of the other—they’re distinct tools in the language toolkit.

}

/// <inheritdoc />
public virtual object? GetService(Type serviceType, object? serviceKey = null)
{
    _ = Throw.IfNull(serviceType);

    // Only an un-keyed request can be satisfied by this instance itself. When a key is supplied
    // we don't know what it means, so the request is handed to the inner client instead.
    if (serviceKey is null && serviceType.IsInstanceOfType(this))
    {
        return this;
    }

    return InnerClient.GetService(serviceType, serviceKey);
}

/// <summary>Provides a mechanism for releasing unmanaged resources.</summary>
/// <param name="disposing">
/// <see langword="true"/> if being called from <see cref="Dispose()"/>; otherwise, <see langword="false"/>.
/// </param>
protected virtual void Dispose(bool disposing)
{
    // Nothing is released here unless this is an explicit dispose.
    if (!disposing)
    {
        return;
    }

    // Disposing this wrapper also disposes the client it wraps.
    InnerClient.Dispose();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Threading;
using System.Threading.Tasks;

namespace Microsoft.Extensions.AI;

/// <summary>Represents a speech to text client.</summary>
/// <remarks>
/// <para>
/// Unless otherwise specified, all members of <see cref="ISpeechToTextClient"/> are thread-safe for concurrent use.
/// It is expected that all implementations of <see cref="ISpeechToTextClient"/> support being used by multiple requests concurrently.
/// </para>
/// <para>
/// However, implementations of <see cref="ISpeechToTextClient"/> might mutate the arguments supplied to <see cref="TranscribeAudioAsync"/> and
/// <see cref="TranscribeStreamingAudioAsync"/>, such as by configuring the options instance. Thus, consumers of the interface either should avoid
/// using shared instances of these arguments for concurrent invocations or should otherwise ensure by construction that no
/// <see cref="ISpeechToTextClient"/> instances are used which might employ such mutation. For example, the ConfigureOptions method might be
/// provided with a callback that could mutate the supplied options argument, and that should be avoided if using a singleton options instance.
/// </para>
/// </remarks>
[Experimental("MEAI001")]
public interface ISpeechToTextClient : IDisposable
{
/// <summary>Sends speech audio contents to the model and returns the generated text.</summary>
/// <param name="speechContents">The list of speech audio contents to send.</param>
/// <param name="options">The speech to text options to configure the request.</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
/// <returns>The text generated by the client.</returns>
Task<SpeechToTextResponse> TranscribeAudioAsync(
IList<IAsyncEnumerable<DataContent>> speechContents,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any scenarios where an implementation is expected to mutate this? With chat, this is expected to be a history, but with speech-to-text, presumably it's generally more of a one-and-done kind of thing? Maybe this should be an IEnumerable instead of an IList?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, I just noticed, this is an IList<IAsyncEnumerable<DataContent>> rather than an IAsyncEnumerable<DataContent>? The intent here is this handles multiple inputs, each of which is an asynchronously produced sequence of content?

SpeechToTextOptions? options = null,
CancellationToken cancellationToken = default);

/// <summary>Sends speech audio contents to the model and streams back the generated text.</summary>
/// <param name="speechContents">The list of speech audio contents to send.</param>
/// <param name="options">The speech to text options to configure the request.</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
/// <returns>The response updates generated by the client.</returns>
IAsyncEnumerable<SpeechToTextResponseUpdate> TranscribeStreamingAudioAsync(
IList<IAsyncEnumerable<DataContent>> speechContents,
SpeechToTextOptions? options = null,
CancellationToken cancellationToken = default);

/// <summary>Asks the <see cref="ISpeechToTextClient"/> for an object of the specified type <paramref name="serviceType"/>.</summary>
/// <param name="serviceType">The type of object being requested.</param>
/// <param name="serviceKey">An optional key that can be used to help identify the target service.</param>
/// <returns>The found object; otherwise, <see langword="null"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="serviceType"/> is <see langword="null"/>.</exception>
/// <remarks>
/// The purpose of this method is to allow for the retrieval of strongly typed services that might be provided by the <see cref="ISpeechToTextClient"/>,
/// including itself or any services it might be wrapping.
/// </remarks>
object? GetService(Type serviceType, object? serviceKey = null);
}
Loading
Loading