diff --git a/eng/spellchecking_exclusions.dic b/eng/spellchecking_exclusions.dic index 72596816516..2abdfbd64a2 100644 Binary files a/eng/spellchecking_exclusions.dic and b/eng/spellchecking_exclusions.dic differ diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/ChatCompletion/ChatResponseExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/ChatCompletion/ChatResponseExtensions.cs index 61e4a494d33..01cdb8dc322 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/ChatCompletion/ChatResponseExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/ChatCompletion/ChatResponseExtensions.cs @@ -180,6 +180,55 @@ static async Task ToChatResponseAsync( } } + /// Coalesces sequential content elements. + internal static void CoalesceTextContent(List contents) + { + StringBuilder? coalescedText = null; + + // Iterate through all of the items in the list looking for contiguous items that can be coalesced. + int start = 0; + while (start < contents.Count - 1) + { + // We need at least two TextContents in a row to be able to coalesce. + if (contents[start] is not TextContent firstText) + { + start++; + continue; + } + + if (contents[start + 1] is not TextContent secondText) + { + start += 2; + continue; + } + + // Append the text from those nodes and continue appending subsequent TextContents until we run out. + // We null out nodes as their text is appended so that we can later remove them all in one O(N) operation. + coalescedText ??= new(); + _ = coalescedText.Clear().Append(firstText.Text).Append(secondText.Text); + contents[start + 1] = null!; + int i = start + 2; + for (; i < contents.Count && contents[i] is TextContent next; i++) + { + _ = coalescedText.Append(next.Text); + contents[i] = null!; + } + + // Store the replacement node. + contents[start] = new TextContent(coalescedText.ToString()) + { + // We inherit the properties of the first text node. 
We don't currently propagate additional + // properties from the subsequent nodes. If we ever need to, we can add that here. + AdditionalProperties = firstText.AdditionalProperties?.Clone(), + }; + + start = i; + } + + // Remove all of the null slots left over from the coalescing process. + _ = contents.RemoveAll(u => u is null); + } + /// Finalizes the object. private static void FinalizeResponse(ChatResponse response) { @@ -296,53 +345,4 @@ private static void ProcessUpdate(ChatResponseUpdate update, ChatResponse respon } } } - - /// Coalesces sequential content elements. - private static void CoalesceTextContent(List contents) - { - StringBuilder? coalescedText = null; - - // Iterate through all of the items in the list looking for contiguous items that can be coalesced. - int start = 0; - while (start < contents.Count - 1) - { - // We need at least two TextContents in a row to be able to coalesce. - if (contents[start] is not TextContent firstText) - { - start++; - continue; - } - - if (contents[start + 1] is not TextContent secondText) - { - start += 2; - continue; - } - - // Append the text from those nodes and continue appending subsequent TextContents until we run out. - // We null out nodes as their text is appended so that we can later remove them all in one O(N) operation. - coalescedText ??= new(); - _ = coalescedText.Clear().Append(firstText.Text).Append(secondText.Text); - contents[start + 1] = null!; - int i = start + 2; - for (; i < contents.Count && contents[i] is TextContent next; i++) - { - _ = coalescedText.Append(next.Text); - contents[i] = null!; - } - - // Store the replacement node. - contents[start] = new TextContent(coalescedText.ToString()) - { - // We inherit the properties of the first text node. We don't currently propagate additional - // properties from the subsequent nodes. If we ever need to, we can add that here. 
- AdditionalProperties = firstText.AdditionalProperties?.Clone(), - }; - - start = i; - } - - // Remove all of the null slots left over from the coalescing process. - _ = contents.RemoveAll(u => u is null); - } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/AIContent.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/AIContent.cs index 6562b7bcc42..068bd1ce447 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/AIContent.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/AIContent.cs @@ -8,6 +8,7 @@ namespace Microsoft.Extensions.AI; /// Provides a base class for all content used with AI services. [JsonPolymorphic(TypeDiscriminatorPropertyName = "$type")] [JsonDerivedType(typeof(DataContent), typeDiscriminator: "data")] +[JsonDerivedType(typeof(ErrorContent), typeDiscriminator: "error")] [JsonDerivedType(typeof(FunctionCallContent), typeDiscriminator: "functionCall")] [JsonDerivedType(typeof(FunctionResultContent), typeDiscriminator: "functionResult")] [JsonDerivedType(typeof(TextContent), typeDiscriminator: "text")] diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/ErrorContent.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/ErrorContent.cs new file mode 100644 index 00000000000..ceca3002f88 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/ErrorContent.cs @@ -0,0 +1,48 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Text.Json.Serialization; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// Represents an error. +/// +/// Typically, is used for non-fatal errors, where something went wrong +/// as part of the operation but the operation was still able to continue. 
+/// +[DebuggerDisplay("{DebuggerDisplay,nq}")] +public class ErrorContent : AIContent +{ + /// The error message. + private string _message; + + /// Initializes a new instance of the class with the specified message. + /// The message to store in this content. + [JsonConstructor] + public ErrorContent(string message) + { + _message = Throw.IfNull(message); + } + + /// Gets or sets the error message. + public string Message + { + get => _message; + set => _message = Throw.IfNull(value); + } + + /// Gets or sets the error code. + public string? ErrorCode { get; set; } + + /// Gets or sets the error details. + public string? Details { get; set; } + + /// Gets a string representing this instance to display in the debugger. + [DebuggerBrowsable(DebuggerBrowsableState.Never)] + private string DebuggerDisplay => + $"Error = {Message}" + + (ErrorCode is not null ? $" ({ErrorCode})" : string.Empty) + + (Details is not null ? $" - {Details}" : string.Empty); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Microsoft.Extensions.AI.Abstractions.csproj b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Microsoft.Extensions.AI.Abstractions.csproj index da24217861e..27a2c5d0513 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Microsoft.Extensions.AI.Abstractions.csproj +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Microsoft.Extensions.AI.Abstractions.csproj @@ -16,16 +16,18 @@ $(TargetFrameworks);netstandard2.0 $(NoWarn);CA2227;CA1034;SA1316;S3253 + $(NoWarn);MEAI001 true true + true true + true true true true - true diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/DelegatingSpeechToTextClient.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/DelegatingSpeechToTextClient.cs new file mode 100644 index 00000000000..6cbe2392e4c --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/DelegatingSpeechToTextClient.cs @@ -0,0 +1,77 @@ +// Licensed to the .NET 
Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// +/// Provides an optional base class for an that passes through calls to another instance. +/// +/// +/// This is recommended as a base type when building clients that can be chained in any order around an underlying . +/// The default implementation simply passes each call to the inner client instance. +/// +[Experimental("MEAI001")] +public class DelegatingSpeechToTextClient : ISpeechToTextClient +{ + /// + /// Initializes a new instance of the class. + /// + /// The wrapped client instance. + protected DelegatingSpeechToTextClient(ISpeechToTextClient innerClient) + { + InnerClient = Throw.IfNull(innerClient); + } + + /// + public void Dispose() + { + Dispose(disposing: true); + GC.SuppressFinalize(this); + } + + /// Gets the inner . + protected ISpeechToTextClient InnerClient { get; } + + /// + public virtual Task GetTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? options = null, CancellationToken cancellationToken = default) + { + return InnerClient.GetTextAsync(audioSpeechStream, options, cancellationToken); + } + + /// + public virtual IAsyncEnumerable GetStreamingTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? options = null, CancellationToken cancellationToken = default) + { + return InnerClient.GetStreamingTextAsync(audioSpeechStream, options, cancellationToken); + } + + /// + public virtual object? GetService(Type serviceType, object? serviceKey = null) + { + _ = Throw.IfNull(serviceType); + + // If the key is non-null, we don't know what it means so pass through to the inner service. + return + serviceKey is null && serviceType.IsInstanceOfType(this) ? 
this : + InnerClient.GetService(serviceType, serviceKey); + } + + /// Provides a mechanism for releasing unmanaged resources. + /// if being called from ; otherwise, . + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + InnerClient.Dispose(); + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/ISpeechToTextClient.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/ISpeechToTextClient.cs new file mode 100644 index 00000000000..65458d6602c --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/ISpeechToTextClient.cs @@ -0,0 +1,61 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Extensions.AI; + +/// Represents a speech to text client. +/// +/// +/// Unless otherwise specified, all members of are thread-safe for concurrent use. +/// It is expected that all implementations of support being used by multiple requests concurrently. +/// +/// +/// However, implementations of might mutate the arguments supplied to and +/// , such as by configuring the options instance. Thus, consumers of the interface either should avoid +/// using shared instances of these arguments for concurrent invocations or should otherwise ensure by construction that no +/// instances are used which might employ such mutation. For example, the ConfigureOptions method be +/// provided with a callback that could mutate the supplied options argument, and that should be avoided if using a singleton options instance. +/// The audio speech stream passed to these methods will not be closed or disposed by the implementation. 
+/// +/// +[Experimental("MEAI001")] +public interface ISpeechToTextClient : IDisposable +{ + /// Sends audio speech content to the model and returns the generated text. + /// The audio speech stream to send. + /// The speech to text options to configure the request. + /// The to monitor for cancellation requests. The default is . + /// The text generated. + Task GetTextAsync( + Stream audioSpeechStream, + SpeechToTextOptions? options = null, + CancellationToken cancellationToken = default); + + /// Sends audio speech content to the model and streams back the generated text. + /// The audio speech stream to send. + /// The speech to text options to configure the request. + /// The to monitor for cancellation requests. The default is . + /// The text updates representing the streamed output. + IAsyncEnumerable GetStreamingTextAsync( + Stream audioSpeechStream, + SpeechToTextOptions? options = null, + CancellationToken cancellationToken = default); + + /// Asks the for an object of the specified type . + /// The type of object being requested. + /// An optional key that can be used to help identify the target service. + /// The found object, otherwise . + /// is . + /// + /// The purpose of this method is to allow for the retrieval of strongly typed services that might be provided by the , + /// including itself or any services it might be wrapping. + /// + object? GetService(Type serviceType, object? serviceKey = null); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextClientExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextClientExtensions.cs new file mode 100644 index 00000000000..d8ca62f34ea --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextClientExtensions.cs @@ -0,0 +1,77 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// Extensions for . +[Experimental("MEAI001")] +public static class SpeechToTextClientExtensions +{ + /// Asks the for an object of type . + /// The type of the object to be retrieved. + /// The client. + /// An optional key that can be used to help identify the target service. + /// The found object, otherwise . + /// + /// The purpose of this method is to allow for the retrieval of strongly typed services that may be provided by the , + /// including itself or any services it might be wrapping. + /// + public static TService? GetService(this ISpeechToTextClient client, object? serviceKey = null) + { + _ = Throw.IfNull(client); + + return (TService?)client.GetService(typeof(TService), serviceKey); + } + + /// Generates text from speech providing a single audio speech . + /// The client. + /// The single audio speech content. + /// The speech to text options to configure the request. + /// The to monitor for cancellation requests. The default is . + /// The text generated by the client. + public static Task GetTextAsync( + this ISpeechToTextClient client, + DataContent audioSpeechContent, + SpeechToTextOptions? options = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(client); + _ = Throw.IfNull(audioSpeechContent); + + var audioSpeechStream = MemoryMarshal.TryGetArray(audioSpeechContent.Data, out var array) ? + new MemoryStream(array.Array!, array.Offset, array.Count) : + new MemoryStream(audioSpeechContent.Data.ToArray()); + + return client.GetTextAsync(audioSpeechStream, options, cancellationToken); + } + + /// Generates text from speech providing a single audio speech . + /// The client. + /// The single audio speech content. 
+ /// The speech to text options to configure the request. + /// The to monitor for cancellation requests. The default is . + /// The text generated by the client. + public static IAsyncEnumerable GetStreamingTextAsync( + this ISpeechToTextClient client, + DataContent audioSpeechContent, + SpeechToTextOptions? options = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(client); + _ = Throw.IfNull(audioSpeechContent); + + var audioSpeechStream = MemoryMarshal.TryGetArray(audioSpeechContent.Data, out var array) ? + new MemoryStream(array.Array!, array.Offset, array.Count) : + new MemoryStream(audioSpeechContent.Data.ToArray()); + + return client.GetStreamingTextAsync(audioSpeechStream, options, cancellationToken); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextClientMetadata.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextClientMetadata.cs new file mode 100644 index 00000000000..df39fb7facc --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextClientMetadata.cs @@ -0,0 +1,43 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI; + +/// Provides metadata about an . +[Experimental("MEAI001")] +public class SpeechToTextClientMetadata +{ + /// Initializes a new instance of the class. + /// + /// The name of the speech to text provider, if applicable. Where possible, this should map to the + /// appropriate name defined in the OpenTelemetry Semantic Conventions for Generative AI systems. + /// + /// The URL for accessing the speech to text provider, if applicable. + /// The ID of the speech to text used by default, if applicable. + public SpeechToTextClientMetadata(string? providerName = null, Uri? providerUri = null, string? 
defaultModelId = null) + { + DefaultModelId = defaultModelId; + ProviderName = providerName; + ProviderUri = providerUri; + } + + /// Gets the name of the speech to text provider. + /// + /// Where possible, this maps to the appropriate name defined in the + /// OpenTelemetry Semantic Conventions for Generative AI systems. + /// + public string? ProviderName { get; } + + /// Gets the URL for accessing the speech to text provider. + public Uri? ProviderUri { get; } + + /// Gets the ID of the default model used by this speech to text client. + /// + /// This value can be null if either the name is unknown or there are multiple possible models associated with this instance. + /// An individual request may override this value via . + /// + public string? DefaultModelId { get; } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextOptions.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextOptions.cs new file mode 100644 index 00000000000..cb196a4c91c --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextOptions.cs @@ -0,0 +1,42 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI; + +/// Represents the options for an speech to text request. +[Experimental("MEAI001")] +public class SpeechToTextOptions +{ + /// Gets or sets the model ID for the speech to text. + public string? ModelId { get; set; } + + /// Gets or sets the language of source speech. + public string? SpeechLanguage { get; set; } + + /// Gets or sets the language for the target generated text. + public string? TextLanguage { get; set; } + + /// Gets or sets the sample rate of the speech input audio. + public int? SpeechSampleRate { get; set; } + + /// Gets or sets any additional properties associated with the options. 
+ public AdditionalPropertiesDictionary? AdditionalProperties { get; set; } + + /// Produces a clone of the current instance. + /// A clone of the current instance. + public virtual SpeechToTextOptions Clone() + { + SpeechToTextOptions options = new() + { + ModelId = ModelId, + SpeechLanguage = SpeechLanguage, + TextLanguage = TextLanguage, + SpeechSampleRate = SpeechSampleRate, + AdditionalProperties = AdditionalProperties?.Clone(), + }; + + return options; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs new file mode 100644 index 00000000000..24fa20a11ed --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs @@ -0,0 +1,101 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Text.Json.Serialization; +using Microsoft.Shared.Diagnostics; + +#pragma warning disable EA0011 // Consider removing unnecessary conditional access operators + +namespace Microsoft.Extensions.AI; + +/// Represents the result of an speech to text request. +[Experimental("MEAI001")] +public class SpeechToTextResponse +{ + /// The content items in the generated text response. + private IList? _contents; + + /// Initializes a new instance of the class. + [JsonConstructor] + public SpeechToTextResponse() + { + } + + /// Initializes a new instance of the class. + /// The contents for this response. + public SpeechToTextResponse(IList contents) + { + _contents = Throw.IfNull(contents); + } + + /// Initializes a new instance of the class. + /// Content of the response. + public SpeechToTextResponse(string? content) + : this(content is null ? 
[] : [new TextContent(content)]) + { + } + + /// Gets or sets the start time of the text segment in relation to the full audio speech length. + public TimeSpan? StartTime { get; set; } + + /// Gets or sets the end time of the text segment in relation to the full audio speech length. + public TimeSpan? EndTime { get; set; } + + /// Gets or sets the ID of the speech to text response. + public string? ResponseId { get; set; } + + /// Gets or sets the model ID used in the creation of the speech to text completion. + public string? ModelId { get; set; } + + /// Gets or sets the raw representation of the speech to text completion from an underlying implementation. + /// + /// If a is created to represent some underlying object from another object + /// model, this property can be used to store that original object. This can be useful for debugging or + /// for enabling a consumer to access the underlying object model if needed. + /// + [JsonIgnore] + public object? RawRepresentation { get; set; } + + /// Gets or sets any additional properties associated with the speech to text completion. + public AdditionalPropertiesDictionary? AdditionalProperties { get; set; } + + /// Gets the text of this speech to text response. + /// + /// This property concatenates the text of all objects in . + /// + [JsonIgnore] + public string Text => _contents?.ConcatText() ?? string.Empty; + + /// + public override string ToString() => Text; + + /// Creates an array of instances that represent this . + /// An array of instances that may be used to represent this . 
+ public SpeechToTextResponseUpdate[] ToSpeechToTextResponseUpdates() + { + SpeechToTextResponseUpdate update = new SpeechToTextResponseUpdate + { + Contents = Contents, + AdditionalProperties = AdditionalProperties, + RawRepresentation = RawRepresentation, + StartTime = StartTime, + EndTime = EndTime, + Kind = SpeechToTextResponseUpdateKind.TextUpdated, + ResponseId = ResponseId, + ModelId = ModelId, + }; + + return [update]; + } + + /// Gets or sets the generated content items. + [AllowNull] + public IList Contents + { + get => _contents ??= []; + set => _contents = value; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdate.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdate.cs new file mode 100644 index 00000000000..24b7f079302 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdate.cs @@ -0,0 +1,102 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Text.Json.Serialization; +using Microsoft.Shared.Diagnostics; + +#pragma warning disable EA0011 // Consider removing unnecessary conditional access operators + +namespace Microsoft.Extensions.AI; + +/// +/// Represents a single streaming response chunk from an . +/// +/// +/// is so named because it represents streaming updates +/// to an speech to text generation. As such, it is considered erroneous for multiple updates that are part +/// of the same audio speech to contain competing values. For example, some updates that are part of +/// the same audio speech may have a value, and others may have a non- value, +/// but all of those with a non- value must have the same value (e.g. ). 
+/// +/// +/// The relationship between and is +/// codified in the and +/// , which enable bidirectional conversions +/// between the two. Note, however, that the conversion may be slightly lossy, for example if multiple updates +/// all have different objects whereas there's +/// only one slot for such an object available in . +/// +/// +[Experimental("MEAI001")] +public class SpeechToTextResponseUpdate +{ + private IList? _contents; + + /// Initializes a new instance of the class. + [JsonConstructor] + public SpeechToTextResponseUpdate() + { + } + + /// Initializes a new instance of the class. + /// The contents for this message. + public SpeechToTextResponseUpdate(IList contents) + { + _contents = Throw.IfNull(contents); + } + + /// Initializes a new instance of the class. + /// Content of the message. + public SpeechToTextResponseUpdate(string? content) + : this(content is null ? [] : [new TextContent(content)]) + { + } + + /// Gets or sets the kind of the generated text update. + public SpeechToTextResponseUpdateKind Kind { get; set; } = SpeechToTextResponseUpdateKind.TextUpdating; + + /// Gets or sets the ID of the generated text response of which this update is a part. + public string? ResponseId { get; set; } + + /// Gets or sets the start time of the text segment associated with this update in relation to the full audio speech length. + public TimeSpan? StartTime { get; set; } + + /// Gets or sets the end time of the text segment associated with this update in relation to the full audio speech length. + public TimeSpan? EndTime { get; set; } + + /// Gets or sets the model ID using in the creation of the speech to text of which this update is a part. + public string? ModelId { get; set; } + + /// Gets or sets the raw representation of the generated text update from an underlying implementation. + /// + /// If a is created to represent some underlying object from another object + /// model, this property can be used to store that original object. 
This can be useful for debugging or + /// for enabling a consumer to access the underlying object model if needed. + /// + [JsonIgnore] + public object? RawRepresentation { get; set; } + + /// Gets or sets additional properties for the update. + public AdditionalPropertiesDictionary? AdditionalProperties { get; set; } + + /// Gets the text of this speech to text response. + /// + /// This property concatenates the text of all objects in . + /// + [JsonIgnore] + public string Text => _contents?.ConcatText() ?? string.Empty; + + /// Gets or sets the generated content items. + [AllowNull] + public IList Contents + { + get => _contents ??= []; + set => _contents = value; + } + + /// + public override string ToString() => Text; +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs new file mode 100644 index 00000000000..230ec838ba3 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs @@ -0,0 +1,145 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// +/// Provides extension methods for working with instances. +/// +[Experimental("MEAI001")] +public static class SpeechToTextResponseUpdateExtensions +{ + /// Combines instances into a single . + /// The updates to be combined. + /// The combined . + public static SpeechToTextResponse ToSpeechToTextResponse( + this IEnumerable updates) + { + _ = Throw.IfNull(updates); + + SpeechToTextResponse response = new(); + List contents = []; + string? responseId = null; + string? 
modelId = null; + AdditionalPropertiesDictionary? additionalProperties = null; + + TimeSpan? endTime = null; + foreach (var update in updates) + { + // Track the first start time provided by the updates + response.StartTime ??= update.StartTime; + + // Track the last end time provided by the updates + if (update.EndTime is not null) + { + endTime = update.EndTime; + } + + ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties); + } + + ChatResponseExtensions.CoalesceTextContent(contents); + response.EndTime = endTime; + response.Contents = contents; + response.ResponseId = responseId; + response.ModelId = modelId; + response.AdditionalProperties = additionalProperties; + + return response; + } + + /// Combines instances into a single . + /// The updates to be combined. + /// The to monitor for cancellation requests. The default is . + /// The combined . + public static Task ToSpeechToTextResponseAsync( + this IAsyncEnumerable updates, CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(updates); + + return ToResponseAsync(updates, cancellationToken); + + static async Task ToResponseAsync( + IAsyncEnumerable updates, CancellationToken cancellationToken) + { + SpeechToTextResponse response = new(); + List contents = []; + string? responseId = null; + string? modelId = null; + AdditionalPropertiesDictionary? additionalProperties = null; + + TimeSpan? 
endTime = null; + await foreach (var update in updates.WithCancellation(cancellationToken).ConfigureAwait(false)) + { + // Track the first start time provided by the updates + response.StartTime ??= update.StartTime; + + // Track the last end time provided by the updates + if (update.EndTime is not null) + { + endTime = update.EndTime; + } + + ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties); + } + + ChatResponseExtensions.CoalesceTextContent(contents); + + response.EndTime = endTime; + response.Contents = contents; + response.ResponseId = responseId; + response.ModelId = modelId; + response.AdditionalProperties = additionalProperties; + + return response; + } + } + + /// Processes the , incorporating its contents and properties. + /// The update to process. + /// The list of content items being accumulated. + /// The response ID to update if the update has one. + /// The model ID to update if the update has one. + /// The additional properties to update if the update has any. + private static void ProcessUpdate( + SpeechToTextResponseUpdate update, + List contents, + ref string? responseId, + ref string? modelId, + ref AdditionalPropertiesDictionary? 
additionalProperties) + { + if (update.ResponseId is not null) + { + responseId = update.ResponseId; + } + + if (update.ModelId is not null) + { + modelId = update.ModelId; + } + + contents.AddRange(update.Contents); + + if (update.AdditionalProperties is not null) + { + if (additionalProperties is null) + { + additionalProperties = new(update.AdditionalProperties); + } + else + { + foreach (var entry in update.AdditionalProperties) + { + additionalProperties[entry.Key] = entry.Value; + } + } + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateKind.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateKind.cs new file mode 100644 index 00000000000..1a3d7b0a474 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateKind.cs @@ -0,0 +1,104 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.ComponentModel; +using System.Diagnostics.CodeAnalysis; +using System.Text.Json; +using System.Text.Json.Serialization; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// +/// Describes the intended purpose of a specific update during streaming of speech to text updates. +/// +[Experimental("MEAI001")] +[JsonConverter(typeof(Converter))] +public readonly struct SpeechToTextResponseUpdateKind : IEquatable +{ + /// Gets when the generated text session is opened. + public static SpeechToTextResponseUpdateKind SessionOpen { get; } = new("sessionopen"); + + /// Gets when a non-blocking error occurs during speech to text updates. + public static SpeechToTextResponseUpdateKind Error { get; } = new("error"); + + /// Gets when the text update is in progress, without waiting for silence. 
+ public static SpeechToTextResponseUpdateKind TextUpdating { get; } = new("textupdating"); + + /// Gets when the text was generated after small period of silence. + public static SpeechToTextResponseUpdateKind TextUpdated { get; } = new("textupdated"); + + /// Gets when the generated text session is closed. + public static SpeechToTextResponseUpdateKind SessionClose { get; } = new("sessionclose"); + + /// + /// Gets the value associated with this . + /// + /// + /// The value will be serialized into the "kind" message field of the speech to text update format. + /// + public string Value { get; } + + /// + /// Initializes a new instance of the struct with the provided value. + /// + /// The value to associate with this . + [JsonConstructor] + public SpeechToTextResponseUpdateKind(string value) + { + Value = Throw.IfNullOrWhitespace(value); + } + + /// + /// Returns a value indicating whether two instances are equivalent, as determined by a + /// case-insensitive comparison of their values. + /// + /// The first instance to compare. + /// The second instance to compare. + /// if left and right are both null or have equivalent values; otherwise, . + public static bool operator ==(SpeechToTextResponseUpdateKind left, SpeechToTextResponseUpdateKind right) + { + return left.Equals(right); + } + + /// + /// Returns a value indicating whether two instances are not equivalent, as determined by a + /// case-insensitive comparison of their values. + /// + /// The first instance to compare. + /// The second instance to compare. + /// if left and right have different values; if they have equivalent values or are both null. + public static bool operator !=(SpeechToTextResponseUpdateKind left, SpeechToTextResponseUpdateKind right) + { + return !(left == right); + } + + /// + public override bool Equals([NotNullWhen(true)] object? 
obj) + => obj is SpeechToTextResponseUpdateKind otherRole && Equals(otherRole); + + /// + public bool Equals(SpeechToTextResponseUpdateKind other) + => string.Equals(Value, other.Value, StringComparison.OrdinalIgnoreCase); + + /// + public override int GetHashCode() + => StringComparer.OrdinalIgnoreCase.GetHashCode(Value); + + /// + public override string ToString() => Value; + + /// Provides a for serializing instances. + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class Converter : JsonConverter + { + /// + public override SpeechToTextResponseUpdateKind Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) => + new(reader.GetString()!); + + /// + public override void Write(Utf8JsonWriter writer, SpeechToTextResponseUpdateKind value, JsonSerializerOptions options) + => Throw.IfNull(writer).WriteStringValue(value.Value); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Utilities/AIJsonUtilities.Defaults.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Utilities/AIJsonUtilities.Defaults.cs index c85d7791cb6..67ddfcbc8d7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Utilities/AIJsonUtilities.Defaults.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Utilities/AIJsonUtilities.Defaults.cs @@ -77,6 +77,11 @@ private static JsonSerializerOptions CreateDefaultOptions() UseStringEnumConverter = true, DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, WriteIndented = true)] + [JsonSerializable(typeof(SpeechToTextOptions))] + [JsonSerializable(typeof(SpeechToTextClientMetadata))] + [JsonSerializable(typeof(SpeechToTextResponse))] + [JsonSerializable(typeof(SpeechToTextResponseUpdate))] + [JsonSerializable(typeof(IReadOnlyList))] [JsonSerializable(typeof(IList))] [JsonSerializable(typeof(ChatOptions))] [JsonSerializable(typeof(EmbeddingGenerationOptions))] diff --git a/src/Libraries/Microsoft.Extensions.AI.OpenAI/Microsoft.Extensions.AI.OpenAI.csproj 
b/src/Libraries/Microsoft.Extensions.AI.OpenAI/Microsoft.Extensions.AI.OpenAI.csproj index 18bfe009184..552d45f0fc6 100644 --- a/src/Libraries/Microsoft.Extensions.AI.OpenAI/Microsoft.Extensions.AI.OpenAI.csproj +++ b/src/Libraries/Microsoft.Extensions.AI.OpenAI/Microsoft.Extensions.AI.OpenAI.csproj @@ -16,17 +16,19 @@ $(TargetFrameworks);netstandard2.0 $(NoWarn);CA1063;CA1508;CA2227;SA1316;S1121;S3358;EA0002;OPENAI002 + $(NoWarn);MEAI001 true true true + true + true + true true - true true - true - true + true diff --git a/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAIClientExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAIClientExtensions.cs index 6b330e4da00..c2753379974 100644 --- a/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAIClientExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAIClientExtensions.cs @@ -3,8 +3,10 @@ using System; using System.ComponentModel; +using System.Diagnostics.CodeAnalysis; using Microsoft.Shared.Diagnostics; using OpenAI; +using OpenAI.Audio; using OpenAI.Chat; using OpenAI.Embeddings; using OpenAI.Responses; @@ -35,6 +37,13 @@ public static IChatClient AsIChatClient(this ChatClient chatClient) => public static IChatClient AsIChatClient(this OpenAIResponseClient responseClient) => new OpenAIResponseChatClient(responseClient); + /// Gets an for use with this . + /// The client. + /// An that can be used to transcribe audio via the . + [Experimental("MEAI001")] + public static ISpeechToTextClient AsISpeechToTextClient(this AudioClient audioClient) => + new OpenAISpeechToTextClient(audioClient); + /// Gets an for use with this . /// The client. /// The model to use. 
diff --git a/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs new file mode 100644 index 00000000000..78fe00a8377 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs @@ -0,0 +1,278 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; +using OpenAI; +using OpenAI.Audio; + +#pragma warning disable S1067 // Expressions should not be too complex +#pragma warning disable S3011 // Reflection should not be used to increase accessibility of classes, methods, or fields + +namespace Microsoft.Extensions.AI; + +/// Represents an for an OpenAI or . +[Experimental("MEAI001")] +internal sealed class OpenAISpeechToTextClient : ISpeechToTextClient +{ + /// Default OpenAI endpoint. + private static readonly Uri _defaultOpenAIEndpoint = new("https://api.openai.com/v1"); + + /// Metadata about the client. + private readonly SpeechToTextClientMetadata _metadata; + + /// The underlying . + private readonly AudioClient _audioClient; + + /// Initializes a new instance of the class for the specified . + /// The underlying client. + public OpenAISpeechToTextClient(AudioClient audioClient) + { + _ = Throw.IfNull(audioClient); + + _audioClient = audioClient; + + // https://github.com/openai/openai-dotnet/issues/215 + // The endpoint and model aren't currently exposed, so use reflection to get at them, temporarily. 
Once packages + // implement the abstractions directly rather than providing adapters on top of the public APIs, + // the package can provide such implementations separate from what's exposed in the public API. + Uri providerUrl = typeof(AudioClient).GetField("_endpoint", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance) + ?.GetValue(audioClient) as Uri ?? _defaultOpenAIEndpoint; + string? model = typeof(AudioClient).GetField("_model", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance) + ?.GetValue(audioClient) as string; + + _metadata = new("openai", providerUrl, model); + } + + /// + public object? GetService(Type serviceType, object? serviceKey = null) + { + _ = Throw.IfNull(serviceType); + + return + serviceKey is not null ? null : + serviceType == typeof(SpeechToTextClientMetadata) ? _metadata : + serviceType == typeof(AudioClient) ? _audioClient : + serviceType.IsInstanceOfType(this) ? this : + null; + } + + /// + public async IAsyncEnumerable GetStreamingTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(audioSpeechStream); + + var speechResponse = await GetTextAsync(audioSpeechStream, options, cancellationToken).ConfigureAwait(false); + + foreach (var update in speechResponse.ToSpeechToTextResponseUpdates()) + { + yield return update; + } + } + + /// + public async Task GetTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? options = null, CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(audioSpeechStream); + + SpeechToTextResponse response = new(); + + // A translation is triggered when the target text language is specified and the source language is not provided or different. + static bool IsTranslationRequest(SpeechToTextOptions? 
options) + => options is not null && options.TextLanguage is not null + && (options.SpeechLanguage is null || options.SpeechLanguage != options.TextLanguage); + + if (IsTranslationRequest(options)) + { + _ = Throw.IfNull(options); + + var openAIOptions = ToOpenAITranslationOptions(options); + AudioTranslation translationResult; + +#if NET + await using (audioSpeechStream.ConfigureAwait(false)) +#else + using (audioSpeechStream) +#endif + { + translationResult = (await _audioClient.TranslateAudioAsync( + audioSpeechStream, + "file.wav", // this information internally is required but is only being used to create a header name in the multipart request. + openAIOptions, cancellationToken).ConfigureAwait(false)).Value; + } + + UpdateResponseFromOpenAIAudioTranslation(response, translationResult); + } + else + { + var openAIOptions = ToOpenAITranscriptionOptions(options); + + // Transcription request + AudioTranscription transcriptionResult; + +#if NET + await using (audioSpeechStream.ConfigureAwait(false)) +#else + using (audioSpeechStream) +#endif + { + transcriptionResult = (await _audioClient.TranscribeAudioAsync( + audioSpeechStream, + "file.wav", // this information internally is required but is only being used to create a header name in the multipart request. + openAIOptions, cancellationToken).ConfigureAwait(false)).Value; + } + + UpdateResponseFromOpenAIAudioTranscription(response, transcriptionResult); + } + + return response; + } + + /// + void IDisposable.Dispose() + { + // Nothing to dispose. Implementation required for the IAudioTranscriptionClient interface. + } + + /// Updates a from an OpenAI . + /// The response to update. + /// The OpenAI audio transcription. 
+ private static void UpdateResponseFromOpenAIAudioTranscription(SpeechToTextResponse response, AudioTranscription audioTranscription) + { + _ = Throw.IfNull(audioTranscription); + + var segmentCount = audioTranscription.Segments.Count; + var wordCount = audioTranscription.Words.Count; + + TimeSpan? endTime = null; + TimeSpan? startTime = null; + if (segmentCount > 0) + { + endTime = audioTranscription.Segments[segmentCount - 1].EndTime; + startTime = audioTranscription.Segments[0].StartTime; + } + else if (wordCount > 0) + { + endTime = audioTranscription.Words[wordCount - 1].EndTime; + startTime = audioTranscription.Words[0].StartTime; + } + + // Update the response + response.RawRepresentation = audioTranscription; + response.Contents = [new TextContent(audioTranscription.Text)]; + response.StartTime = startTime; + response.EndTime = endTime; + response.AdditionalProperties = new AdditionalPropertiesDictionary + { + [nameof(audioTranscription.Language)] = audioTranscription.Language, + [nameof(audioTranscription.Duration)] = audioTranscription.Duration + }; + } + + /// Converts an extensions options instance to an OpenAI options instance. + private static AudioTranscriptionOptions ToOpenAITranscriptionOptions(SpeechToTextOptions? options) + { + AudioTranscriptionOptions result = new(); + + if (options is not null) + { + if (options.SpeechLanguage is not null) + { + result.Language = options.SpeechLanguage; + } + + if (options.AdditionalProperties is { Count: > 0 } additionalProperties) + { + if (additionalProperties.TryGetValue(nameof(result.Temperature), out float? temperature)) + { + result.Temperature = temperature; + } + + if (additionalProperties.TryGetValue(nameof(result.TimestampGranularities), out object? timestampGranularities)) + { + result.TimestampGranularities = timestampGranularities is AudioTimestampGranularities granularities ? 
granularities : default; + } + + if (additionalProperties.TryGetValue(nameof(result.ResponseFormat), out AudioTranscriptionFormat? responseFormat)) + { + result.ResponseFormat = responseFormat; + } + + if (additionalProperties.TryGetValue(nameof(result.Prompt), out string? prompt)) + { + result.Prompt = prompt; + } + } + } + + return result; + } + + /// Updates a from an OpenAI . + /// The response to update. + /// The OpenAI audio translation. + private static void UpdateResponseFromOpenAIAudioTranslation(SpeechToTextResponse response, AudioTranslation audioTranslation) + { + _ = Throw.IfNull(audioTranslation); + + var segmentCount = audioTranslation.Segments.Count; + + TimeSpan? endTime = null; + TimeSpan? startTime = null; + if (segmentCount > 0) + { + endTime = audioTranslation.Segments[segmentCount - 1].EndTime; + startTime = audioTranslation.Segments[0].StartTime; + } + + // Update the response + response.RawRepresentation = audioTranslation; + response.Contents = [new TextContent(audioTranslation.Text)]; + response.StartTime = startTime; + response.EndTime = endTime; + response.AdditionalProperties = new AdditionalPropertiesDictionary + { + [nameof(audioTranslation.Language)] = audioTranslation.Language, + [nameof(audioTranslation.Duration)] = audioTranslation.Duration + }; + } + + /// Converts an extensions options instance to an OpenAI options instance. + private static AudioTranslationOptions ToOpenAITranslationOptions(SpeechToTextOptions? options) + { + AudioTranslationOptions result = new(); + + if (options is not null) + { + if (options.AdditionalProperties is { Count: > 0 } additionalProperties) + { + if (additionalProperties.TryGetValue(nameof(result.Temperature), out float? temperature)) + { + result.Temperature = temperature; + } + + if (additionalProperties.TryGetValue(nameof(result.ResponseFormat), out AudioTranslationFormat? 
responseFormat)) + { + result.ResponseFormat = responseFormat; + } + + if (additionalProperties.TryGetValue(nameof(result.Prompt), out string? prompt)) + { + result.Prompt = prompt; + } + } + } + + return result; + } +} + diff --git a/src/Libraries/Microsoft.Extensions.AI/Microsoft.Extensions.AI.csproj b/src/Libraries/Microsoft.Extensions.AI/Microsoft.Extensions.AI.csproj index 10f590639ec..378c3e49dfd 100644 --- a/src/Libraries/Microsoft.Extensions.AI/Microsoft.Extensions.AI.csproj +++ b/src/Libraries/Microsoft.Extensions.AI/Microsoft.Extensions.AI.csproj @@ -23,6 +23,7 @@ + true true true false diff --git a/src/Libraries/Microsoft.Extensions.AI/SpeechToText/ConfigureOptionsSpeechToTextClient.cs b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/ConfigureOptionsSpeechToTextClient.cs new file mode 100644 index 00000000000..85833a3c171 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/ConfigureOptionsSpeechToTextClient.cs @@ -0,0 +1,65 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// Represents a delegating chat client that configures a instance used by the remainder of the pipeline. +[Experimental("MEAI001")] +public sealed class ConfigureOptionsSpeechToTextClient : DelegatingSpeechToTextClient +{ + /// The callback delegate used to configure options. + private readonly Action _configureOptions; + + /// Initializes a new instance of the class with the specified callback. + /// The inner client. + /// + /// The delegate to invoke to configure the instance. 
It is passed a clone of the caller-supplied instance + /// (or a newly constructed instance if the caller-supplied instance is ). + /// + /// + /// The delegate is passed either a new instance of if + /// the caller didn't supply a instance, or a clone (via of the caller-supplied + /// instance if one was supplied. + /// + public ConfigureOptionsSpeechToTextClient(ISpeechToTextClient innerClient, Action configure) + : base(innerClient) + { + _configureOptions = Throw.IfNull(configure); + } + + /// + public override async Task GetTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? options = null, CancellationToken cancellationToken = default) + { + return await base.GetTextAsync(audioSpeechStream, Configure(options), cancellationToken).ConfigureAwait(false); + } + + /// + public override async IAsyncEnumerable GetStreamingTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + await foreach (var update in base.GetStreamingTextAsync(audioSpeechStream, Configure(options), cancellationToken).ConfigureAwait(false)) + { + yield return update; + } + } + + /// Creates and configures the to pass along to the inner client. + private SpeechToTextOptions Configure(SpeechToTextOptions? options) + { + options = options?.Clone() ?? new(); + + _configureOptions(options); + + return options; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI/SpeechToText/ConfigureOptionsSpeechToTextClientBuilderExtensions.cs b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/ConfigureOptionsSpeechToTextClientBuilderExtensions.cs new file mode 100644 index 00000000000..037d25a14d5 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/ConfigureOptionsSpeechToTextClientBuilderExtensions.cs @@ -0,0 +1,38 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Diagnostics.CodeAnalysis; +using Microsoft.Shared.Diagnostics; + +#pragma warning disable SA1629 // Documentation text should end with a period + +namespace Microsoft.Extensions.AI; + +/// Provides extensions for configuring instances. +[Experimental("MEAI001")] +public static class ConfigureOptionsSpeechToTextClientBuilderExtensions +{ + /// + /// Adds a callback that configures a to be passed to the next client in the pipeline. + /// + /// The . + /// + /// The delegate to invoke to configure the instance. + /// It is passed a clone of the caller-supplied instance (or a newly constructed instance if the caller-supplied instance is ). + /// + /// + /// This method can be used to set default options. The delegate is passed either a new instance of + /// if the caller didn't supply a instance, or a clone (via ) + /// of the caller-supplied instance if one was supplied. + /// + /// The . + public static SpeechToTextClientBuilder ConfigureOptions( + this SpeechToTextClientBuilder builder, Action configure) + { + _ = Throw.IfNull(builder); + _ = Throw.IfNull(configure); + + return builder.Use(innerClient => new ConfigureOptionsSpeechToTextClient(innerClient, configure)); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI/SpeechToText/LoggingSpeechToTextClient.cs b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/LoggingSpeechToTextClient.cs new file mode 100644 index 00000000000..4494d319dc0 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/LoggingSpeechToTextClient.cs @@ -0,0 +1,199 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Runtime.CompilerServices; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// A delegating speech to text client that logs speech to text operations to an . +/// +/// The provided implementation of is thread-safe for concurrent use so long as the +/// employed is also thread-safe for concurrent use. +/// +[Experimental("MEAI001")] +public partial class LoggingSpeechToTextClient : DelegatingSpeechToTextClient +{ + /// An instance used for all logging. + private readonly ILogger _logger; + + /// The to use for serialization of state written to the logger. + private JsonSerializerOptions _jsonSerializerOptions; + + /// Initializes a new instance of the class. + /// The underlying . + /// An instance that will be used for all logging. + public LoggingSpeechToTextClient(ISpeechToTextClient innerClient, ILogger logger) + : base(innerClient) + { + _logger = Throw.IfNull(logger); + _jsonSerializerOptions = AIJsonUtilities.DefaultOptions; + } + + /// Gets or sets JSON serialization options to use when serializing logging data. + public JsonSerializerOptions JsonSerializerOptions + { + get => _jsonSerializerOptions; + set => _jsonSerializerOptions = Throw.IfNull(value); + } + + /// + public override async Task GetTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? 
options = null, CancellationToken cancellationToken = default) + { + if (_logger.IsEnabled(LogLevel.Debug)) + { + if (_logger.IsEnabled(LogLevel.Trace)) + { + LogInvokedSensitive(nameof(GetTextAsync), AsJson(options), AsJson(this.GetService())); + } + else + { + LogInvoked(nameof(GetTextAsync)); + } + } + + try + { + var response = await base.GetTextAsync(audioSpeechStream, options, cancellationToken).ConfigureAwait(false); + + if (_logger.IsEnabled(LogLevel.Debug)) + { + if (_logger.IsEnabled(LogLevel.Trace)) + { + LogCompletedSensitive(nameof(GetTextAsync), AsJson(response)); + } + else + { + LogCompleted(nameof(GetTextAsync)); + } + } + + return response; + } + catch (OperationCanceledException) + { + LogInvocationCanceled(nameof(GetTextAsync)); + throw; + } + catch (Exception ex) + { + LogInvocationFailed(nameof(GetTextAsync), ex); + throw; + } + } + + /// + public override async IAsyncEnumerable GetStreamingTextAsync( + Stream audioSpeechStream, SpeechToTextOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + if (_logger.IsEnabled(LogLevel.Debug)) + { + if (_logger.IsEnabled(LogLevel.Trace)) + { + LogInvokedSensitive(nameof(GetStreamingTextAsync), AsJson(options), AsJson(this.GetService())); + } + else + { + LogInvoked(nameof(GetStreamingTextAsync)); + } + } + + IAsyncEnumerator e; + try + { + e = base.GetStreamingTextAsync(audioSpeechStream, options, cancellationToken).GetAsyncEnumerator(cancellationToken); + } + catch (OperationCanceledException) + { + LogInvocationCanceled(nameof(GetStreamingTextAsync)); + throw; + } + catch (Exception ex) + { + LogInvocationFailed(nameof(GetStreamingTextAsync), ex); + throw; + } + + try + { + SpeechToTextResponseUpdate? 
update = null; + while (true) + { + try + { + if (!await e.MoveNextAsync().ConfigureAwait(false)) + { + break; + } + + update = e.Current; + } + catch (OperationCanceledException) + { + LogInvocationCanceled(nameof(GetStreamingTextAsync)); + throw; + } + catch (Exception ex) + { + LogInvocationFailed(nameof(GetStreamingTextAsync), ex); + throw; + } + + if (_logger.IsEnabled(LogLevel.Debug)) + { + if (_logger.IsEnabled(LogLevel.Trace)) + { + LogStreamingUpdateSensitive(AsJson(update)); + } + else + { + LogStreamingUpdate(); + } + } + + yield return update; + } + + LogCompleted(nameof(GetStreamingTextAsync)); + } + finally + { + await e.DisposeAsync().ConfigureAwait(false); + } + } + + private string AsJson(T value) => LoggingHelpers.AsJson(value, _jsonSerializerOptions); + + [LoggerMessage(LogLevel.Debug, "{MethodName} invoked.")] + private partial void LogInvoked(string methodName); + + [LoggerMessage(LogLevel.Trace, "{MethodName} invoked: Options: {SpeechToTextOptions}. Metadata: {SpeechToTextClientMetadata}.")] + private partial void LogInvokedSensitive(string methodName, string speechToTextOptions, string speechToTextClientMetadata); + + [LoggerMessage(LogLevel.Debug, "{MethodName} completed.")] + private partial void LogCompleted(string methodName); + + [LoggerMessage(LogLevel.Trace, "{MethodName} completed: {SpeechToTextResponse}.")] + private partial void LogCompletedSensitive(string methodName, string speechToTextResponse); + + [LoggerMessage(LogLevel.Debug, "GetStreamingTextAsync received update.")] + private partial void LogStreamingUpdate(); + + [LoggerMessage(LogLevel.Trace, "GetStreamingTextAsync received update: {SpeechToTextResponseUpdate}")] + private partial void LogStreamingUpdateSensitive(string speechToTextResponseUpdate); + + [LoggerMessage(LogLevel.Debug, "{MethodName} canceled.")] + private partial void LogInvocationCanceled(string methodName); + + [LoggerMessage(LogLevel.Error, "{MethodName} failed.")] + private partial void 
LogInvocationFailed(string methodName, Exception error); +} diff --git a/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilder.cs b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilder.cs new file mode 100644 index 00000000000..dae4224a94d --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilder.cs @@ -0,0 +1,81 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// A builder for creating pipelines of . +[Experimental("MEAI001")] +public sealed class SpeechToTextClientBuilder +{ + private readonly Func _innerClientFactory; + + /// The registered client factory instances. + private List>? _clientFactories; + + /// Initializes a new instance of the class. + /// The inner that represents the underlying backend. + public SpeechToTextClientBuilder(ISpeechToTextClient innerClient) + { + _ = Throw.IfNull(innerClient); + _innerClientFactory = _ => innerClient; + } + + /// Initializes a new instance of the class. + /// A callback that produces the inner that represents the underlying backend. + public SpeechToTextClientBuilder(Func innerClientFactory) + { + _innerClientFactory = Throw.IfNull(innerClientFactory); + } + + /// Builds an that represents the entire pipeline. Calls to this instance will pass through each of the pipeline stages in turn. + /// + /// The that should provide services to the instances. + /// If null, an empty will be used. + /// + /// An instance of that represents the entire pipeline. + public ISpeechToTextClient Build(IServiceProvider? 
services = null) + { + services ??= EmptyServiceProvider.Instance; + var audioClient = _innerClientFactory(services); + + // To match intuitive expectations, apply the factories in reverse order, so that the first factory added is the outermost. + if (_clientFactories is not null) + { + for (var i = _clientFactories.Count - 1; i >= 0; i--) + { + audioClient = _clientFactories[i](audioClient, services) ?? + throw new InvalidOperationException( + $"The {nameof(SpeechToTextClientBuilder)} entry at index {i} returned null. " + + $"Ensure that the callbacks passed to {nameof(Use)} return non-null {nameof(ISpeechToTextClient)} instances."); + } + } + + return audioClient; + } + + /// Adds a factory for an intermediate audio transcription client to the audio transcription client pipeline. + /// The client factory function. + /// The updated instance. + public SpeechToTextClientBuilder Use(Func clientFactory) + { + _ = Throw.IfNull(clientFactory); + + return Use((innerClient, _) => clientFactory(innerClient)); + } + + /// Adds a factory for an intermediate audio transcription client to the audio transcription client pipeline. + /// The client factory function. + /// The updated instance. + public SpeechToTextClientBuilder Use(Func clientFactory) + { + _ = Throw.IfNull(clientFactory); + + (_clientFactories ??= []).Add(clientFactory); + return this; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderExtensions.cs b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderExtensions.cs new file mode 100644 index 00000000000..7ce2b19ac37 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderExtensions.cs @@ -0,0 +1,48 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Diagnostics.CodeAnalysis; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// Provides extensions for configuring instances. +[Experimental("MEAI001")] +public static class SpeechToTextClientBuilderExtensions +{ + /// Adds logging to the audio transcription client pipeline. + /// The . + /// + /// An optional used to create a logger with which logging should be performed. + /// If not supplied, a required instance will be resolved from the service provider. + /// + /// An optional callback that can be used to configure the instance. + /// The . + public static SpeechToTextClientBuilder UseLogging( + this SpeechToTextClientBuilder builder, + ILoggerFactory? loggerFactory = null, + Action? configure = null) + { + _ = Throw.IfNull(builder); + + return builder.Use((innerClient, services) => + { + loggerFactory ??= services.GetRequiredService(); + + // If the factory we resolve is for the null logger, the LoggingAudioTranscriptionClient will end up + // being an expensive nop, so skip adding it and just return the inner client. 
+ if (loggerFactory == NullLoggerFactory.Instance) + { + return innerClient; + } + + var audioClient = new LoggingSpeechToTextClient(innerClient, loggerFactory.CreateLogger(typeof(LoggingSpeechToTextClient))); + configure?.Invoke(audioClient); + return audioClient; + }); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderServiceCollectionExtensions.cs b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderServiceCollectionExtensions.cs new file mode 100644 index 00000000000..5ef54e8db26 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderServiceCollectionExtensions.cs @@ -0,0 +1,81 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Diagnostics.CodeAnalysis; +using Microsoft.Extensions.AI; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.DependencyInjection; + +/// Provides extension methods for registering with a . +[Experimental("MEAI001")] +public static class SpeechToTextClientBuilderServiceCollectionExtensions +{ + /// Registers a singleton in the . + /// The to which the client should be added. + /// The inner that represents the underlying backend. + /// The service lifetime for the client. Defaults to . + /// A that can be used to build a pipeline around the inner client. + /// The client is registered as a singleton service. + public static SpeechToTextClientBuilder AddSpeechToTextClient( + this IServiceCollection serviceCollection, + ISpeechToTextClient innerClient, + ServiceLifetime lifetime = ServiceLifetime.Singleton) + => AddSpeechToTextClient(serviceCollection, _ => innerClient, lifetime); + + /// Registers a singleton in the . + /// The to which the client should be added. + /// A callback that produces the inner that represents the underlying backend. + /// The service lifetime for the client. 
Defaults to . + /// A that can be used to build a pipeline around the inner client. + /// The client is registered as a singleton service. + public static SpeechToTextClientBuilder AddSpeechToTextClient( + this IServiceCollection serviceCollection, + Func innerClientFactory, + ServiceLifetime lifetime = ServiceLifetime.Singleton) + { + _ = Throw.IfNull(serviceCollection); + _ = Throw.IfNull(innerClientFactory); + + var builder = new SpeechToTextClientBuilder(innerClientFactory); + serviceCollection.Add(new ServiceDescriptor(typeof(ISpeechToTextClient), builder.Build, lifetime)); + return builder; + } + + /// Registers a keyed singleton in the . + /// The to which the client should be added. + /// The key with which to associate the client. + /// The inner that represents the underlying backend. + /// The service lifetime for the client. Defaults to . + /// A that can be used to build a pipeline around the inner client. + /// The client is registered as a scoped service. + public static SpeechToTextClientBuilder AddKeyedSpeechToTextClient( + this IServiceCollection serviceCollection, + object serviceKey, + ISpeechToTextClient innerClient, + ServiceLifetime lifetime = ServiceLifetime.Singleton) + => AddKeyedSpeechToTextClient(serviceCollection, serviceKey, _ => innerClient, lifetime); + + /// Registers a keyed singleton in the . + /// The to which the client should be added. + /// The key with which to associate the client. + /// A callback that produces the inner that represents the underlying backend. + /// The service lifetime for the client. Defaults to . + /// A that can be used to build a pipeline around the inner client. + /// The client is registered as a scoped service. 
+ public static SpeechToTextClientBuilder AddKeyedSpeechToTextClient( + this IServiceCollection serviceCollection, + object serviceKey, + Func innerClientFactory, + ServiceLifetime lifetime = ServiceLifetime.Singleton) + { + _ = Throw.IfNull(serviceCollection); + _ = Throw.IfNull(serviceKey); + _ = Throw.IfNull(innerClientFactory); + + var builder = new SpeechToTextClientBuilder(innerClientFactory); + serviceCollection.Add(new ServiceDescriptor(typeof(ISpeechToTextClient), serviceKey, factory: (services, serviceKey) => builder.Build(services), lifetime)); + return builder; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderSpeechToTextClientExtensions.cs b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderSpeechToTextClientExtensions.cs new file mode 100644 index 00000000000..29569c55207 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI/SpeechToText/SpeechToTextClientBuilderSpeechToTextClientExtensions.cs @@ -0,0 +1,27 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics.CodeAnalysis; +using Microsoft.Extensions.AI; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI; + +/// Provides extension methods for working with in the context of . +[Experimental("MEAI001")] +public static class SpeechToTextClientBuilderSpeechToTextClientExtensions +{ + /// Creates a new using as its inner client. + /// The client to use as the inner client. + /// The new instance. + /// + /// This method is equivalent to using the constructor directly, + /// specifying as the inner client. 
+ /// + public static SpeechToTextClientBuilder AsBuilder(this ISpeechToTextClient innerClient) + { + _ = Throw.IfNull(innerClient); + + return new SpeechToTextClientBuilder(innerClient); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/ErrorContentTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/ErrorContentTests.cs new file mode 100644 index 00000000000..2564f6bc2c9 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/ErrorContentTests.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Text.Json; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class ErrorContentTests +{ + [Fact] + public void Constructor_ShouldInitializeProperties() + { + // Arrange + string errorMessage = "Error occurred"; + string errorCode = "ERR001"; + string errorDetails = "Something went wrong"; + + // Act + var errorContent = new ErrorContent(errorMessage) + { + ErrorCode = errorCode, + Details = errorDetails + }; + + // Assert + Assert.Equal(errorMessage, errorContent.Message); + Assert.Equal(errorCode, errorContent.ErrorCode); + Assert.Equal(errorDetails, errorContent.Details); + } + + [Fact] + public void JsonSerialization_ShouldSerializeAndDeserializeCorrectly() + { + // Arrange + var errorContent = new ErrorContent("Error occurred") + { + ErrorCode = "ERR001", + Details = "Something went wrong" + }; + var options = new JsonSerializerOptions { PropertyNamingPolicy = JsonNamingPolicy.CamelCase }; + + // Act + var json = JsonSerializer.Serialize(errorContent, options); + var deserializedErrorContent = JsonSerializer.Deserialize(json, options); + + // Assert + Assert.NotNull(deserializedErrorContent); + Assert.Equal(errorContent.Message, deserializedErrorContent!.Message); + Assert.Equal(errorContent.ErrorCode, deserializedErrorContent.ErrorCode); + 
Assert.Equal(errorContent.Details, deserializedErrorContent.Details); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Microsoft.Extensions.AI.Abstractions.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Microsoft.Extensions.AI.Abstractions.Tests.csproj index b22bdc9fdde..f7b3a0154e5 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Microsoft.Extensions.AI.Abstractions.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Microsoft.Extensions.AI.Abstractions.Tests.csproj @@ -6,6 +6,7 @@ $(NoWarn);CA1063;CA1861;CA2201;VSTHRD003;S104 + $(NoWarn);MEAI001 true diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/DelegatingSpeechToTextClientTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/DelegatingSpeechToTextClientTests.cs new file mode 100644 index 00000000000..ef4da7f94bd --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/DelegatingSpeechToTextClientTests.cs @@ -0,0 +1,166 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class DelegatingSpeechToTextClientTests +{ + [Fact] + public void RequiresInnerSpeechToTextClient() + { + Assert.Throws("innerClient", () => new NoOpDelegatingSpeechToTextClient(null!)); + } + + [Fact] + public async Task GetTextAsyncDefaultsToInnerClientAsync() + { + // Arrange + using var expectedAudioSpeechStream = new MemoryStream(); + var expectedOptions = new SpeechToTextOptions(); + var expectedCancellationToken = CancellationToken.None; + var expectedResult = new TaskCompletionSource(); + var expectedResponse = new SpeechToTextResponse([]); + using var inner = new TestSpeechToTextClient + { + GetTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + Assert.Same(expectedAudioSpeechStream, audioSpeechStream); + Assert.Same(expectedOptions, options); + Assert.Equal(expectedCancellationToken, cancellationToken); + return expectedResult.Task; + } + }; + + using var delegating = new NoOpDelegatingSpeechToTextClient(inner); + + // Act + var resultTask = delegating.GetTextAsync(expectedAudioSpeechStream, expectedOptions, expectedCancellationToken); + + // Assert + Assert.False(resultTask.IsCompleted); + expectedResult.SetResult(expectedResponse); + Assert.True(resultTask.IsCompleted); + Assert.Same(expectedResponse, await resultTask); + } + + [Fact] + public async Task GetStreamingTextAsyncDefaultsToInnerClientAsync() + { + // Arrange + using var expectedAudioSpeechStream = new MemoryStream(); + var expectedOptions = new SpeechToTextOptions(); + var expectedCancellationToken = CancellationToken.None; + SpeechToTextResponseUpdate[] expectedResults = + [ + new("Text update 1"), + new("Text update 2") + ]; + + using var inner = new TestSpeechToTextClient + { + GetStreamingTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + 
Assert.Same(expectedAudioSpeechStream, audioSpeechStream); + Assert.Same(expectedOptions, options); + Assert.Equal(expectedCancellationToken, cancellationToken); + return YieldAsync(expectedResults); + } + }; + + using var delegating = new NoOpDelegatingSpeechToTextClient(inner); + + // Act + var resultAsyncEnumerable = delegating.GetStreamingTextAsync(expectedAudioSpeechStream, expectedOptions, expectedCancellationToken); + + // Assert + var enumerator = resultAsyncEnumerable.GetAsyncEnumerator(); + Assert.True(await enumerator.MoveNextAsync()); + Assert.Same(expectedResults[0], enumerator.Current); + Assert.True(await enumerator.MoveNextAsync()); + Assert.Same(expectedResults[1], enumerator.Current); + Assert.False(await enumerator.MoveNextAsync()); + } + + [Fact] + public void GetServiceThrowsForNullType() + { + using var inner = new TestSpeechToTextClient(); + using var delegating = new NoOpDelegatingSpeechToTextClient(inner); + Assert.Throws("serviceType", () => delegating.GetService(null!)); + } + + [Fact] + public void GetServiceReturnsSelfIfCompatibleWithRequestAndKeyIsNull() + { + // Arrange + using var inner = new TestSpeechToTextClient(); + using var delegating = new NoOpDelegatingSpeechToTextClient(inner); + + // Act + var client = delegating.GetService(); + + // Assert + Assert.Same(delegating, client); + } + + [Fact] + public void GetServiceDelegatesToInnerIfKeyIsNotNull() + { + // Arrange + var expectedParam = new object(); + var expectedKey = new object(); + using var expectedResult = new TestSpeechToTextClient(); + using var inner = new TestSpeechToTextClient + { + GetServiceCallback = (_, _) => expectedResult + }; + using var delegating = new NoOpDelegatingSpeechToTextClient(inner); + + // Act + var client = delegating.GetService(expectedKey); + + // Assert + Assert.Same(expectedResult, client); + } + + [Fact] + public void GetServiceDelegatesToInnerIfNotCompatibleWithRequest() + { + // Arrange + var expectedParam = new object(); + var 
expectedResult = TimeZoneInfo.Local; + var expectedKey = new object(); + using var inner = new TestSpeechToTextClient + { + GetServiceCallback = (type, key) => type == expectedResult.GetType() && key == expectedKey + ? expectedResult + : throw new InvalidOperationException("Unexpected call") + }; + using var delegating = new NoOpDelegatingSpeechToTextClient(inner); + + // Act + var tzi = delegating.GetService(expectedKey); + + // Assert + Assert.Same(expectedResult, tzi); + } + + private static async IAsyncEnumerable YieldAsync(IEnumerable input) + { + await Task.Yield(); + foreach (var item in input) + { + yield return item; + } + } + + private sealed class NoOpDelegatingSpeechToTextClient(ISpeechToTextClient innerClient) + : DelegatingSpeechToTextClient(innerClient); +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientExtensionsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientExtensionsTests.cs new file mode 100644 index 00000000000..d39c73fc0c6 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientExtensionsTests.cs @@ -0,0 +1,93 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextClientExtensionsTests +{ + [Fact] + public void GetService_InvalidArgs_Throws() + { + Assert.Throws("client", () => + { + _ = SpeechToTextClientExtensions.GetService(null!); + }); + } + + [Fact] + public async Task GetTextAsync_InvalidArgs_Throws() + { + // Note: the extension method now requires a DataContent (not a string). + ISpeechToTextClient? 
client = null; + var content = new DataContent("data:audio/wav;base64,AQIDBA=="); + var ex1 = await Assert.ThrowsAsync(() => SpeechToTextClientExtensions.GetTextAsync(client!, content)); + Assert.Equal("client", ex1.ParamName); + + using var testClient = new TestSpeechToTextClient(); + DataContent? nullContent = null; + var ex2 = await Assert.ThrowsAsync(() => SpeechToTextClientExtensions.GetTextAsync(testClient, nullContent!)); + Assert.Equal("audioSpeechContent", ex2.ParamName); + } + + [Fact] + public async Task GetStreamingTextAsync_InvalidArgs_Throws() + { + ISpeechToTextClient? client = null; + var content = new DataContent("data:audio/wav;base64,AQIDBA=="); + var ex1 = await Assert.ThrowsAsync(() => SpeechToTextClientExtensions.GetStreamingTextAsync(client!, content).GetAsyncEnumerator().MoveNextAsync().AsTask()); + Assert.Equal("client", ex1.ParamName); + + using var testClient = new TestSpeechToTextClient(); + DataContent? nullContent = null; + var ex2 = await Assert.ThrowsAsync(() => SpeechToTextClientExtensions.GetStreamingTextAsync(testClient, nullContent!).GetAsyncEnumerator().MoveNextAsync().AsTask()); + Assert.Equal("audioSpeechContent", ex2.ParamName); + } + + [Fact] + public async Task GetStreamingTextAsync_CreatesTextMessageAsync() + { + // Arrange + var expectedOptions = new SpeechToTextOptions(); + using var cts = new CancellationTokenSource(); + + using TestSpeechToTextClient client = new() + { + GetStreamingTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + // For testing, return an async enumerable yielding one streaming update with text "world". 
+ var update = new SpeechToTextResponseUpdate(); + update.Contents.Add(new TextContent("world")); + return YieldAsync(update); + }, + }; + + int count = 0; + await foreach (var update in SpeechToTextClientExtensions.GetStreamingTextAsync( + client, + new DataContent("data:audio/wav;base64,AQIDBA=="), + expectedOptions, + cts.Token)) + { + Assert.Equal(0, count); + Assert.Equal("world", update.Text); + count++; + } + + Assert.Equal(1, count); + } + + private static async IAsyncEnumerable YieldAsync(params SpeechToTextResponseUpdate[] updates) + { + await Task.Yield(); + foreach (var update in updates) + { + yield return update; + } + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientMetadataTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientMetadataTests.cs new file mode 100644 index 00000000000..c9081d0adb6 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientMetadataTests.cs @@ -0,0 +1,29 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextClientMetadataTests +{ + [Fact] + public void Constructor_NullValues_AllowedAndRoundtrip() + { + SpeechToTextClientMetadata metadata = new(null, null, null); + Assert.Null(metadata.ProviderName); + Assert.Null(metadata.ProviderUri); + Assert.Null(metadata.DefaultModelId); + } + + [Fact] + public void Constructor_Value_Roundtrips() + { + var uri = new Uri("https://example.com"); + SpeechToTextClientMetadata metadata = new("providerName", uri, "theModel"); + Assert.Equal("providerName", metadata.ProviderName); + Assert.Same(uri, metadata.ProviderUri); + Assert.Equal("theModel", metadata.DefaultModelId); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientTests.cs new file mode 100644 index 00000000000..092ad57b2c2 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextClientTests.cs @@ -0,0 +1,85 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextClientTests +{ + [Fact] + public async Task GetTextAsync_CreatesTextMessageAsync() + { + // Arrange + var expectedResponse = new SpeechToTextResponse("hello"); + var expectedOptions = new SpeechToTextOptions(); + using var cts = new CancellationTokenSource(); + + using TestSpeechToTextClient client = new() + { + GetTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + // For the purpose of the test, we assume that the underlying implementation converts the audio speech stream into a transcription choice. 
+ // (In a real implementation, the audio speech data would be processed.) + return Task.FromResult(new SpeechToTextResponse("hello")); + }, + }; + + // Act – call the extension method with a valid DataContent. + SpeechToTextResponse response = await SpeechToTextClientExtensions.GetTextAsync( + client, + new DataContent("data:audio/wav;base64,AQIDBA=="), + expectedOptions, + cts.Token); + + // Assert + Assert.Equal(expectedResponse.Text, response.Text); + } + + [Fact] + public async Task GetStreamingTextAsync_CreatesStreamingUpdatesAsync() + { + // Arrange + var expectedOptions = new SpeechToTextOptions(); + using var cts = new CancellationTokenSource(); + + using TestSpeechToTextClient client = new() + { + GetStreamingTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + // For the purpose of the test, we simulate a streaming response with multiple updates + return GetStreamingUpdatesAsync(); + }, + }; + + // Act – call the extension method with a valid DataContent + List updates = []; + await foreach (var update in SpeechToTextClientExtensions.GetStreamingTextAsync( + client, + new DataContent("data:audio/wav;base64,AQIDBA=="), + expectedOptions, + cts.Token)) + { + updates.Add(update); + } + + // Assert + Assert.Equal(3, updates.Count); + Assert.Equal("hello ", updates[0].Text); + Assert.Equal("world ", updates[1].Text); + Assert.Equal("!", updates[2].Text); + } + + // Helper method to simulate streaming updates +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + private static async IAsyncEnumerable GetStreamingUpdatesAsync() + { + yield return new("hello "); + yield return new("world "); + yield return new("!"); + } +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextOptionsTests.cs 
b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextOptionsTests.cs new file mode 100644 index 00000000000..20936fd4517 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextOptionsTests.cs @@ -0,0 +1,84 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Text.Json; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextOptionsTests +{ + [Fact] + public void Constructor_Parameterless_PropsDefaulted() + { + SpeechToTextOptions options = new(); + Assert.Null(options.ModelId); + Assert.Null(options.SpeechLanguage); + Assert.Null(options.SpeechSampleRate); + Assert.Null(options.AdditionalProperties); + + SpeechToTextOptions clone = options.Clone(); + Assert.Null(clone.ModelId); + Assert.Null(clone.SpeechLanguage); + Assert.Null(clone.SpeechSampleRate); + Assert.Null(clone.AdditionalProperties); + } + + [Fact] + public void Properties_Roundtrip() + { + SpeechToTextOptions options = new(); + + AdditionalPropertiesDictionary additionalProps = new() + { + ["key"] = "value", + }; + + options.ModelId = "modelId"; + options.SpeechLanguage = "en-US"; + options.SpeechSampleRate = 44100; + options.AdditionalProperties = additionalProps; + + Assert.Equal("modelId", options.ModelId); + Assert.Equal("en-US", options.SpeechLanguage); + Assert.Equal(44100, options.SpeechSampleRate); + Assert.Same(additionalProps, options.AdditionalProperties); + + SpeechToTextOptions clone = options.Clone(); + Assert.Equal("modelId", clone.ModelId); + Assert.Equal("en-US", clone.SpeechLanguage); + Assert.Equal(44100, clone.SpeechSampleRate); + Assert.Equal(additionalProps, clone.AdditionalProperties); + } + + [Fact] + public void JsonSerialization_Roundtrips() + { + SpeechToTextOptions options = new(); + + AdditionalPropertiesDictionary additionalProps = new() + { + ["key"] = "value", + }; + 
+ options.ModelId = "modelId"; + options.SpeechLanguage = "en-US"; + options.SpeechSampleRate = 44100; + options.AdditionalProperties = additionalProps; + + string json = JsonSerializer.Serialize(options, TestJsonSerializerContext.Default.SpeechToTextOptions); + + SpeechToTextOptions? deserialized = JsonSerializer.Deserialize(json, TestJsonSerializerContext.Default.SpeechToTextOptions); + Assert.NotNull(deserialized); + + Assert.Equal("modelId", deserialized.ModelId); + Assert.Equal("en-US", deserialized.SpeechLanguage); + Assert.Equal(44100, deserialized.SpeechSampleRate); + + Assert.NotNull(deserialized.AdditionalProperties); + Assert.Single(deserialized.AdditionalProperties); + Assert.True(deserialized.AdditionalProperties.TryGetValue("key", out object? value)); + Assert.IsType(value); + Assert.Equal("value", ((JsonElement)value!).GetString()); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs new file mode 100644 index 00000000000..33b27b01291 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs @@ -0,0 +1,229 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.Json; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextResponseTests +{ + [Fact] + public void Constructor_InvalidArgs_Throws() + { + Assert.Throws("contents", () => new SpeechToTextResponse((IList)null!)); + } + + [Fact] + public void Constructor_Parameterless_PropsDefaulted() + { + SpeechToTextResponse response = new(); + Assert.Empty(response.Contents); + Assert.Empty(response.Text); + Assert.NotNull(response.Contents); + Assert.Same(response.Contents, response.Contents); + Assert.Empty(response.Contents); + Assert.Null(response.RawRepresentation); + Assert.Null(response.AdditionalProperties); + Assert.Null(response.StartTime); + Assert.Null(response.EndTime); + Assert.Equal(string.Empty, response.ToString()); + } + + [Theory] + [InlineData(null)] + [InlineData("text")] + public void Constructor_String_PropsRoundtrip(string? text) + { + SpeechToTextResponse response = new(text); + + Assert.Same(response.Contents, response.Contents); + if (text is null) + { + Assert.Empty(response.Contents); + } + else + { + Assert.Single(response.Contents); + TextContent tc = Assert.IsType(response.Contents[0]); + Assert.Equal(text, tc.Text); + } + + Assert.Null(response.RawRepresentation); + Assert.Null(response.AdditionalProperties); + Assert.Equal(text ?? 
string.Empty, response.ToString()); + } + + [Fact] + public void Constructor_List_InvalidArgs_Throws() + { + Assert.Throws("contents", () => new SpeechToTextResponse((IList)null!)); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + [InlineData(2)] + public void Constructor_List_PropsRoundtrip(int contentCount) + { + List content = []; + for (int i = 0; i < contentCount; i++) + { + content.Add(new TextContent($"text-{i}")); + } + + SpeechToTextResponse response = new(content); + + Assert.Same(response.Contents, response.Contents); + if (contentCount == 0) + { + Assert.Empty(response.Contents); + Assert.Empty(response.Text); + } + else + { + Assert.Equal(contentCount, response.Contents.Count); + for (int i = 0; i < contentCount; i++) + { + TextContent tc = Assert.IsType(response.Contents[i]); + Assert.Equal($"text-{i}", tc.Text); + } + + Assert.Equal(string.Concat(Enumerable.Range(0, contentCount).Select(i => $"text-{i}")), response.Text); + Assert.Equal(string.Concat(Enumerable.Range(0, contentCount).Select(i => $"text-{i}")), response.ToString()); + } + } + + [Fact] + public void Properties_Roundtrip() + { + SpeechToTextResponse response = new(); + Assert.Null(response.ResponseId); + response.ResponseId = "id"; + Assert.Equal("id", response.ResponseId); + + Assert.Null(response.ModelId); + response.ModelId = "modelId"; + Assert.Equal("modelId", response.ModelId); + + Assert.Null(response.RawRepresentation); + object raw = new(); + response.RawRepresentation = raw; + Assert.Same(raw, response.RawRepresentation); + + Assert.Null(response.AdditionalProperties); + AdditionalPropertiesDictionary additionalProps = []; + response.AdditionalProperties = additionalProps; + Assert.Same(additionalProps, response.AdditionalProperties); + + Assert.Null(response.StartTime); + TimeSpan startTime = TimeSpan.FromSeconds(1); + response.StartTime = startTime; + Assert.Equal(startTime, response.StartTime); + + Assert.Null(response.EndTime); + TimeSpan endTime = 
TimeSpan.FromSeconds(2); + response.EndTime = endTime; + Assert.Equal(endTime, response.EndTime); + + List newContents = [new TextContent("text1"), new TextContent("text2")]; + response.Contents = newContents; + Assert.Same(newContents, response.Contents); + } + + [Fact] + public void JsonSerialization_Roundtrips() + { + SpeechToTextResponse original = new() + { + Contents = + [ + new TextContent("Text1"), + new TextContent("Text2"), + new TextContent("Text3"), + new TextContent("Text4"), + ], + ResponseId = "id", + ModelId = "modelId", + StartTime = TimeSpan.FromSeconds(1), + EndTime = TimeSpan.FromSeconds(2), + RawRepresentation = new(), + AdditionalProperties = new() { ["key"] = "value" }, + }; + + string json = JsonSerializer.Serialize(original, TestJsonSerializerContext.Default.SpeechToTextResponse); + + SpeechToTextResponse? result = JsonSerializer.Deserialize(json, TestJsonSerializerContext.Default.SpeechToTextResponse); + + Assert.NotNull(result); + Assert.Equal(4, result.Contents.Count); + + for (int i = 0; i < original.Contents.Count; i++) + { + Assert.Equal($"Text{i + 1}", ((TextContent)result.Contents[i]).Text); + } + + Assert.Equal("id", result.ResponseId); + Assert.Equal("modelId", result.ModelId); + Assert.Equal(TimeSpan.FromSeconds(1), result.StartTime); + Assert.Equal(TimeSpan.FromSeconds(2), result.EndTime); + + Assert.NotNull(result.AdditionalProperties); + Assert.Single(result.AdditionalProperties); + Assert.True(result.AdditionalProperties.TryGetValue("key", out object? value)); + Assert.IsType(value); + Assert.Equal("value", ((JsonElement)value!).GetString()); + } + + [Fact] + public void ToString_OutputsText() + { + SpeechToTextResponse response = new("This is a test." + Environment.NewLine + "It's multiple lines."); + Assert.Equal("This is a test." 
+ Environment.NewLine + "It's multiple lines.", response.ToString()); + } + + [Fact] + public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate() + { + // Arrange: create a response with contents + SpeechToTextResponse response = new() + { + Contents = + [ + new TextContent("Hello, "), + new DataContent("data:image/png;base64,AQIDBA==", mediaType: "image/png"), + new TextContent("world!") + ], + StartTime = TimeSpan.FromSeconds(1), + EndTime = TimeSpan.FromSeconds(2), + ResponseId = "12345", + ModelId = "someModel", + AdditionalProperties = new() { ["key1"] = "value1", ["key2"] = 42 }, + }; + + // Act: convert to streaming updates + SpeechToTextResponseUpdate[] updates = response.ToSpeechToTextResponseUpdates(); + + // Assert: should be a single update with all properties + Assert.Single(updates); + + SpeechToTextResponseUpdate update = updates[0]; + Assert.Equal("12345", update.ResponseId); + Assert.Equal("someModel", update.ModelId); + Assert.Equal(SpeechToTextResponseUpdateKind.TextUpdated, update.Kind); + Assert.Equal(TimeSpan.FromSeconds(1), update.StartTime); + Assert.Equal(TimeSpan.FromSeconds(2), update.EndTime); + + Assert.Equal(3, update.Contents.Count); + Assert.Equal("Hello, ", Assert.IsType(update.Contents[0]).Text); + Assert.Equal("image/png", Assert.IsType(update.Contents[1]).MediaType); + Assert.Equal("world!", Assert.IsType(update.Contents[2]).Text); + + Assert.NotNull(update.AdditionalProperties); + Assert.Equal("value1", update.AdditionalProperties["key1"]); + Assert.Equal(42, update.AdditionalProperties["key2"]); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs new file mode 100644 index 00000000000..f0a2f08ab13 --- /dev/null +++ 
b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs @@ -0,0 +1,140 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextResponseUpdateExtensionsTests +{ + public static IEnumerable ToSpeechToTextResponse_Coalescing_VariousSequenceAndGapLengths_MemberData() + { + foreach (bool useAsync in new[] { false, true }) + { + for (int numSequences = 1; numSequences <= 3; numSequences++) + { + for (int sequenceLength = 1; sequenceLength <= 3; sequenceLength++) + { + for (int gapLength = 1; gapLength <= 3; gapLength++) + { + foreach (bool gapBeginningEnd in new[] { false, true }) + { + yield return new object[] { useAsync, numSequences, sequenceLength, gapLength, false }; + } + } + } + } + } + } + + [Fact] + public void InvalidArgs_Throws() + { + Assert.Throws("updates", () => ((List)null!).ToSpeechToTextResponse()); + } + + [Theory] + [InlineData(false)] + [InlineData(true)] + public async Task ToSpeechToTextResponse_SuccessfullyCreatesResponse(bool useAsync) + { + SpeechToTextResponseUpdate[] updates = + [ + new("Hello ") { ModelId = "model123", StartTime = null, AdditionalProperties = new() { ["a"] = "b" } }, + new("human, ") { ModelId = "model123", StartTime = TimeSpan.FromSeconds(10), EndTime = TimeSpan.FromSeconds(20) }, + new("How ") { ModelId = "model123", StartTime = TimeSpan.FromSeconds(22), EndTime = TimeSpan.FromSeconds(23) }, + new("are ") { ModelId = "model123", StartTime = TimeSpan.FromSeconds(23), EndTime = TimeSpan.FromSeconds(24) }, + new([new TextContent("You?")]) { ModelId = "model123", StartTime = TimeSpan.FromSeconds(24), EndTime = TimeSpan.FromSeconds(25), AdditionalProperties = new() { ["c"] = "d" } }, 
+ new() { ResponseId = "someResponse", ModelId = "model123", StartTime = TimeSpan.FromSeconds(25), EndTime = TimeSpan.FromSeconds(35) }, + ]; + + SpeechToTextResponse response = useAsync ? + updates.ToSpeechToTextResponse() : + await YieldAsync(updates).ToSpeechToTextResponseAsync(); + + Assert.NotNull(response); + + Assert.Equal("someResponse", response.ResponseId); + Assert.Equal(TimeSpan.FromSeconds(10), response.StartTime); + Assert.Equal(TimeSpan.FromSeconds(35), response.EndTime); + Assert.Equal("model123", response.ModelId); + + Assert.NotNull(response.AdditionalProperties); + Assert.Equal(2, response.AdditionalProperties.Count); + Assert.Equal("b", response.AdditionalProperties["a"]); + Assert.Equal("d", response.AdditionalProperties["c"]); + + Assert.Equal("Hello human, How are You?", response.Text); + } + + [Theory] + [MemberData(nameof(ToSpeechToTextResponse_Coalescing_VariousSequenceAndGapLengths_MemberData))] + public async Task ToSpeechToTextResponse_Coalescing_VariousSequenceAndGapLengths(bool useAsync, int numSequences, int sequenceLength, int gapLength, bool gapBeginningEnd) + { + List updates = []; + + List expected = []; + + if (gapBeginningEnd) + { + AddGap(); + } + + for (int sequenceNum = 0; sequenceNum < numSequences; sequenceNum++) + { + StringBuilder sb = new(); + for (int i = 0; i < sequenceLength; i++) + { + string text = $"{(char)('A' + sequenceNum)}{i}"; + updates.Add(new(text)); + sb.Append(text); + } + + expected.Add(sb.ToString()); + + if (sequenceNum < numSequences - 1) + { + AddGap(); + } + } + + if (gapBeginningEnd) + { + AddGap(); + } + + void AddGap() + { + for (int i = 0; i < gapLength; i++) + { + updates.Add(new() { Contents = [new DataContent("data:image/png;base64,aGVsbG8=")] }); + } + } + + SpeechToTextResponse response = useAsync ? 
await YieldAsync(updates).ToSpeechToTextResponseAsync() : updates.ToSpeechToTextResponse(); + Assert.NotNull(response); + + Assert.Equal(expected.Count + (gapLength * ((numSequences - 1) + (gapBeginningEnd ? 2 : 0))), response.Contents.Count); + + TextContent[] contents = response.Contents.OfType().ToArray(); + Assert.Equal(expected.Count, contents.Length); + for (int i = 0; i < expected.Count; i++) + { + Assert.Equal(expected[i], contents[i].Text); + } + } + + private static async IAsyncEnumerable YieldAsync(IEnumerable updates) + { + foreach (SpeechToTextResponseUpdate update in updates) + { + await Task.Yield(); + yield return update; + } + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateKindTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateKindTests.cs new file mode 100644 index 00000000000..ddc72d076db --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateKindTests.cs @@ -0,0 +1,65 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Text.Json; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextResponseUpdateKindTests +{ + [Fact] + public void Constructor_Value_Roundtrips() + { + Assert.Equal("abc", new SpeechToTextResponseUpdateKind("abc").Value); + } + + [Fact] + public void Constructor_NullOrWhiteSpace_Throws() + { + Assert.Throws("value", () => new SpeechToTextResponseUpdateKind(null!)); + Assert.Throws("value", () => new SpeechToTextResponseUpdateKind(" ")); + } + + [Fact] + public void Equality_UsesOrdinalIgnoreCaseComparison() + { + var kind1 = new SpeechToTextResponseUpdateKind("abc"); + var kind2 = new SpeechToTextResponseUpdateKind("ABC"); + Assert.True(kind1.Equals(kind2)); + Assert.True(kind1.Equals((object)kind2)); + Assert.True(kind1 == kind2); + Assert.False(kind1 != kind2); + + var kind3 = new SpeechToTextResponseUpdateKind("def"); + Assert.False(kind1.Equals(kind3)); + Assert.False(kind1.Equals((object)kind3)); + Assert.False(kind1 == kind3); + Assert.True(kind1 != kind3); + + Assert.Equal(kind1.GetHashCode(), new SpeechToTextResponseUpdateKind("abc").GetHashCode()); + Assert.Equal(kind1.GetHashCode(), new SpeechToTextResponseUpdateKind("ABC").GetHashCode()); + } + + [Fact] + public void Singletons_UseKnownValues() + { + Assert.Equal(SpeechToTextResponseUpdateKind.SessionOpen.ToString(), SpeechToTextResponseUpdateKind.SessionOpen.Value); + Assert.Equal(SpeechToTextResponseUpdateKind.Error.ToString(), SpeechToTextResponseUpdateKind.Error.Value); + Assert.Equal(SpeechToTextResponseUpdateKind.TextUpdating.ToString(), SpeechToTextResponseUpdateKind.TextUpdating.Value); + Assert.Equal(SpeechToTextResponseUpdateKind.TextUpdated.ToString(), SpeechToTextResponseUpdateKind.TextUpdated.Value); + Assert.Equal(SpeechToTextResponseUpdateKind.SessionClose.ToString(), SpeechToTextResponseUpdateKind.SessionClose.Value); + } + + [Fact] + public void JsonSerialization_Roundtrips() + { + var kind = new 
SpeechToTextResponseUpdateKind("abc"); + string json = JsonSerializer.Serialize(kind, TestJsonSerializerContext.Default.SpeechToTextResponseUpdateKind); + Assert.Equal("\"abc\"", json); + + var result = JsonSerializer.Deserialize(json, TestJsonSerializerContext.Default.SpeechToTextResponseUpdateKind); + Assert.Equal(kind, result); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateTests.cs new file mode 100644 index 00000000000..0eae376070e --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateTests.cs @@ -0,0 +1,114 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Text.Json; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextResponseUpdateTests +{ + [Fact] + public void Constructor_PropsDefaulted() + { + SpeechToTextResponseUpdate update = new(); + + Assert.Equal(SpeechToTextResponseUpdateKind.TextUpdating, update.Kind); + Assert.Empty(update.Text); + Assert.Empty(update.Contents); + Assert.Null(update.ResponseId); + Assert.Null(update.StartTime); + Assert.Null(update.EndTime); + Assert.Equal(string.Empty, update.ToString()); + } + + [Fact] + public void Properties_Roundtrip() + { + SpeechToTextResponseUpdate update = new() + { + Kind = new SpeechToTextResponseUpdateKind("custom"), + }; + + Assert.Equal("custom", update.Kind.Value); + + // Test the computed Text property + Assert.Empty(update.Text); + + // Contents: assigning a new list then resetting to null should yield an empty list. 
+ List newList = new(); + newList.Add(new TextContent("content1")); + update.Contents = newList; + Assert.Same(newList, update.Contents); + update.Contents = null; + Assert.NotNull(update.Contents); + Assert.Empty(update.Contents); + + update.ResponseId = "comp123"; + Assert.Equal("comp123", update.ResponseId); + + update.StartTime = TimeSpan.FromSeconds(10); + update.EndTime = TimeSpan.FromSeconds(20); + Assert.Equal(TimeSpan.FromSeconds(10), update.StartTime); + Assert.Equal(TimeSpan.FromSeconds(20), update.EndTime); + } + + [Fact] + public void Text_Get_UsesFirstTextContent() + { + SpeechToTextResponseUpdate update = new( + [ + new DataContent("data:audio/wav;base64,AQIDBA==", "application/octet-stream"), + new DataContent("data:image/wav;base64,AQIDBA==", "application/octet-stream"), + new FunctionCallContent("callId1", "fc1"), + new TextContent("text-1"), + new TextContent("text-2"), + new FunctionResultContent("callId1", "result"), + ]); + + // The Text getter concatenates the text of all TextContent items; the first one is at index 3. + TextContent textContent = Assert.IsType(update.Contents[3]); + Assert.Equal("text-1", textContent.Text); + Assert.Equal("text-1text-2", update.Text); + + // ToString likewise concatenates the text of all TextContent items. + Assert.Equal("text-1text-2", update.ToString()); + + // Reading Text does not replace or mutate the underlying content items. + Assert.Same(textContent, update.Contents[3]); + } + + [Fact] + public void JsonSerialization_Roundtrips() + { + SpeechToTextResponseUpdate original = new() + { + Kind = new SpeechToTextResponseUpdateKind("transcribed"), + ResponseId = "id123", + StartTime = TimeSpan.FromSeconds(5), + EndTime = TimeSpan.FromSeconds(10), + Contents = new List + { + new TextContent("text-1"), + new DataContent("data:audio/wav;base64,AQIDBA==", "application/octet-stream") + } + }; + + string json = JsonSerializer.Serialize(original, TestJsonSerializerContext.Default.SpeechToTextResponseUpdate); + SpeechToTextResponseUpdate?
result = JsonSerializer.Deserialize(json, TestJsonSerializerContext.Default.SpeechToTextResponseUpdate); + Assert.NotNull(result); + + Assert.Equal(original.Kind, result.Kind); + Assert.Equal(original.ResponseId, result.ResponseId); + Assert.Equal(original.StartTime, result.StartTime); + Assert.Equal(original.EndTime, result.EndTime); + Assert.Equal(original.Contents.Count, result.Contents.Count); + for (int i = 0; i < original.Contents.Count; i++) + { + // Compare via string conversion. + Assert.Equal(original.Contents[i].ToString(), result.Contents[i].ToString()); + } + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/TestJsonSerializerContext.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/TestJsonSerializerContext.cs index 0362be74821..d15f0a19fa9 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/TestJsonSerializerContext.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/TestJsonSerializerContext.cs @@ -16,6 +16,10 @@ namespace Microsoft.Extensions.AI; UseStringEnumConverter = true)] [JsonSerializable(typeof(ChatResponse))] [JsonSerializable(typeof(ChatResponseUpdate))] +[JsonSerializable(typeof(SpeechToTextResponse))] +[JsonSerializable(typeof(SpeechToTextResponseUpdate))] +[JsonSerializable(typeof(SpeechToTextResponseUpdateKind))] +[JsonSerializable(typeof(SpeechToTextOptions))] [JsonSerializable(typeof(ChatOptions))] [JsonSerializable(typeof(EmbeddingGenerationOptions))] [JsonSerializable(typeof(Dictionary))] diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/TestSpeechToTextClient.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/TestSpeechToTextClient.cs new file mode 100644 index 00000000000..44e1d739533 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/TestSpeechToTextClient.cs @@ -0,0 +1,60 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Extensions.AI; + +public sealed class TestSpeechToTextClient : ISpeechToTextClient +{ + public TestSpeechToTextClient() + { + GetServiceCallback = DefaultGetServiceCallback; + } + + public IServiceProvider? Services { get; set; } + + // Callbacks for asynchronous operations. + public Func>? + GetTextAsyncCallback + { get; set; } + + public Func>? + GetStreamingTextAsyncCallback + { get; set; } + + public Func GetServiceCallback { get; set; } + + private object? DefaultGetServiceCallback(Type serviceType, object? serviceKey) + => serviceType is not null && serviceKey is null && serviceType.IsInstanceOfType(this) ? this : null; + + public Task GetTextAsync( + Stream audioSpeechStream, + SpeechToTextOptions? options = null, + CancellationToken cancellationToken = default) + => GetTextAsyncCallback!.Invoke(audioSpeechStream, options, cancellationToken); + + public IAsyncEnumerable GetStreamingTextAsync( + Stream audioSpeechStream, + SpeechToTextOptions? options = null, + CancellationToken cancellationToken = default) + => GetStreamingTextAsyncCallback!.Invoke(audioSpeechStream, options, cancellationToken); + + public object? GetService(Type serviceType, object? serviceKey = null) + => GetServiceCallback!.Invoke(serviceType, serviceKey); + + public void Dispose() + { + // Dispose of resources if any. 
+ } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/ChatClientIntegrationTests.cs b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/ChatClientIntegrationTests.cs index 6b06b3c0ed2..5f00c9b9c44 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/ChatClientIntegrationTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/ChatClientIntegrationTests.cs @@ -967,7 +967,9 @@ private static Uri GetImageDataUri() [MemberNotNull(nameof(_chatClient))] protected void SkipIfNotEnabled() { - if (_chatClient is null) + string? skipIntegration = TestRunnerConfiguration.Instance["SkipIntegrationTests"]; + + if (skipIntegration is not null || _chatClient is null) { throw new SkipTestException("Client is not enabled."); } diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj index cf9f4d9703d..ec925a15309 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj @@ -7,6 +7,7 @@ $(NoWarn);CA1063;CA1861;SA1130;VSTHRD003 + $(NoWarn);MEAI001 true @@ -17,11 +18,19 @@ - + + + + + + Never + + + diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.mp3 b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.mp3 new file mode 100644 index 00000000000..9fbfb2bca17 Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.mp3 differ diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/dotnet.png b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/dotnet.png similarity index 100% rename from 
test/Libraries/Microsoft.Extensions.AI.Integration.Tests/dotnet.png rename to test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/dotnet.png diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs new file mode 100644 index 00000000000..f0ea6c1790e --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs @@ -0,0 +1,83 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Text; +using System.Threading.Tasks; +using Microsoft.TestUtilities; +using Xunit; + +#pragma warning disable CA2214 // Do not call overridable methods in constructors + +namespace Microsoft.Extensions.AI; + +public abstract class SpeechToTextClientIntegrationTests : IDisposable +{ + private readonly ISpeechToTextClient? _client; + + protected SpeechToTextClientIntegrationTests() + { + _client = CreateClient(); + } + + public void Dispose() + { + _client?.Dispose(); + GC.SuppressFinalize(this); + } + + protected abstract ISpeechToTextClient? 
CreateClient(); + + [ConditionalFact] + public virtual async Task GetTextAsync_SingleAudioRequestMessage() + { + SkipIfNotEnabled(); + + using var audioSpeechStream = GetAudioStream("audio001.mp3"); + var response = await _client.GetTextAsync(audioSpeechStream); + + Assert.Contains("gym", response.Text, StringComparison.OrdinalIgnoreCase); + } + + [ConditionalFact] + public virtual async Task GetStreamingTextAsync_SingleStreamingResponseChoice() + { + SkipIfNotEnabled(); + + using var audioSpeechStream = GetAudioStream("audio001.mp3"); + + StringBuilder sb = new(); + await foreach (var chunk in _client.GetStreamingTextAsync(audioSpeechStream)) + { + sb.Append(chunk.Text); + } + + string responseText = sb.ToString(); + Assert.Contains("finally", responseText, StringComparison.OrdinalIgnoreCase); + Assert.Contains("gym", responseText, StringComparison.OrdinalIgnoreCase); + } + + private static Stream GetAudioStream(string fileName) + { + using Stream? s = typeof(SpeechToTextClientIntegrationTests).Assembly.GetManifestResourceStream($"Microsoft.Extensions.AI.Resources.{fileName}"); + Assert.NotNull(s); + MemoryStream ms = new(); + s.CopyTo(ms); + + ms.Position = 0; + return ms; + } + + [MemberNotNull(nameof(_client))] + protected void SkipIfNotEnabled() + { + string? skipIntegration = TestRunnerConfiguration.Instance["SkipIntegrationTests"]; + + if (skipIntegration is not null || _client is null) + { + throw new SkipTestException("Client is not enabled."); + } + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs new file mode 100644 index 00000000000..6b0374d70cd --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs @@ -0,0 +1,215 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Net.Http; +using System.Text; +using System.Text.Json; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +#pragma warning disable S3996 // URI properties should not be strings + +/// +/// An that checks the multi-part request body as a root +/// JSON structure of properties and sends back an expected JSON response. +/// +/// +/// The order of the properties does not affect the comparison. +/// +/// An expected input of { "name": "something" } will Assert for a multipart body that has +/// a name field with a value of something. +/// +/// +/// An expected input of { "multiple[]": ["one","two"] } will Assert for a multipart body that has +/// two multiple[] fields each having "one" and "two" value respectively. +/// +/// +/// +/// A JSON string representing the expected structure and values of the multipart request body to be verified. +/// For example, { "name": "something" } or { "multiple[]": ["one","two"] }. +/// +/// +/// A JSON string that will be returned as the response body when the request matches the expected input. +/// +public class VerbatimMultiPartHttpHandler(string expectedInput, string sentJsonOutput) : HttpClientHandler +{ + public string? ExpectedRequestUriContains { get; init; } + + protected override async Task SendAsync( + HttpRequestMessage request, + CancellationToken cancellationToken) + { + Assert.NotNull(request.Content); + Assert.NotNull(request.Content.Headers.ContentType); + Assert.Equal("multipart/form-data", request.Content.Headers.ContentType.MediaType); + + Assert.NotNull(request.RequestUri); + if (!string.IsNullOrEmpty(ExpectedRequestUriContains)) + { + Assert.Contains(ExpectedRequestUriContains!, request.RequestUri!.ToString()); + } + + Dictionary parameters = []; + + // Extract the boundary + string? 
boundary = request.Content.Headers.ContentType.Parameters + .FirstOrDefault(p => p.Name == "boundary")?.Value; + + if (string.IsNullOrEmpty(boundary)) + { + throw new InvalidOperationException("Boundary not found."); + } + + string fullBoundary = $"--{boundary!.Trim('"')}"; + + // Read the entire body into memory (for simplicity; stream in production for large data) +#if NET + byte[] bodyBytes = await request.Content.ReadAsByteArrayAsync(cancellationToken); +#else + byte[] bodyBytes = await request.Content.ReadAsByteArrayAsync(); +#endif + using var stream = new MemoryStream(bodyBytes); + using var reader = new StreamReader(stream, Encoding.UTF8); +#if NET + + string bodyText = await reader.ReadToEndAsync(cancellationToken); +#else + string bodyText = await reader.ReadToEndAsync(); +#endif + + // Make it legible for debugging and splitting + bodyText = RemoveSpecialCharacters(bodyText); + + string[] parts = bodyText.Split(new string[] { fullBoundary }, StringSplitOptions.None); + + foreach (string part in parts) + { + if (part.Trim() == "--") + { + continue; // End boundary + } + + // Parse headers and body + int headerEnd = part.IndexOf("\r\n\r\n"); + if (headerEnd < 0) + { + continue; + } + + string headers = part.Substring(0, headerEnd).Trim(); + string rawValue = part.Substring(headerEnd + 4).TrimEnd('\r', '\n'); + + // Get the parameter name and value + if (headers.Contains("name=")) + { + // Text field + string name = ExtractNameFromHeaders(headers); + + // Skip file fields + if (!name.StartsWith("file")) + { + if (parameters.ContainsKey(name)) + { + ((List)parameters[name]).Add(ParseContentToJsonElement(rawValue)); + } + else + { + parameters.Add(name, new List { ParseContentToJsonElement(rawValue) }); + } + } + } + } + + // Transform one value lists into single values + foreach (var key in parameters.Keys.ToList()) + { + if (parameters[key] is List list && list.Count == 1) + { + parameters[key] = list[0]; + } + } + + var jsonParameters = 
JsonSerializer.Serialize(parameters); + Assert.NotNull(jsonParameters); + + AssertJsonEquals(expectedInput, jsonParameters); + + return new() { Content = new StringContent(sentJsonOutput, Encoding.UTF8, "application/json") }; + } + + private static string RemoveSpecialCharacters(string input) + { + return Regex.Replace(input, @"[^a-zA-Z0-9_ .,!?\r\n""=;\//\[\]-]", ""); + } + + private static JsonElement ParseContentToJsonElement(string content) + { + // Try parsing as a number + if (int.TryParse(content, out int intValue)) + { + return JsonSerializer.SerializeToElement(intValue); + } + + if (double.TryParse(content, out double doubleValue)) + { + return JsonSerializer.SerializeToElement(doubleValue); + } + + // Try parsing as a boolean + if (bool.TryParse(content, out bool boolValue)) + { + return JsonSerializer.SerializeToElement(boolValue); + } + + // Default to string + return JsonSerializer.SerializeToElement(content); + } + + private static string ExtractNameFromHeaders(string headers) + { + const string NamePrefix = "name="; + int start = headers.IndexOf(NamePrefix) + NamePrefix.Length; + int end = headers.IndexOf(";", start); + + if (end == -1) + { + end = headers.Length; + } + + return headers.Substring(start, end - start).Trim('"'); + } + + public static string? RemoveWhiteSpace(string? text) => + text is null ? null : + Regex.Replace(text, @"\s*", string.Empty); + + private static Dictionary? 
GetCharacterFrequencies(string text) + => RemoveWhiteSpace(text)?.GroupBy(c => c) + .ToDictionary(g => g.Key, g => g.Count()); + + private static void AssertJsonEquals(string expected, string actual) + { + var expectedFrequencies = GetCharacterFrequencies(expected); + var actualFrequencies = GetCharacterFrequencies(actual); + + Assert.NotNull(expectedFrequencies); + Assert.NotNull(actualFrequencies); + + foreach (var kvp in expectedFrequencies) + { + if (!actualFrequencies.ContainsKey(kvp.Key) || kvp.Value != actualFrequencies[kvp.Key]) + { + Assert.Fail($"Expected: {expected}, Actual: {actual}"); + } + + // Ensure the frequencies are equal during the test + Assert.Equal(kvp.Value, actualFrequencies[kvp.Key]); + } + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/IntegrationTestHelpers.cs b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/IntegrationTestHelpers.cs index 4b7252965f0..2a20b121ab0 100644 --- a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/IntegrationTestHelpers.cs +++ b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/IntegrationTestHelpers.cs @@ -16,6 +16,7 @@ internal static class IntegrationTestHelpers public static OpenAIClient? GetOpenAIClient() { var configuration = TestRunnerConfiguration.Instance; + string? 
apiKey = configuration["OpenAI:Key"]; if (apiKey is not null) diff --git a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/Microsoft.Extensions.AI.OpenAI.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/Microsoft.Extensions.AI.OpenAI.Tests.csproj index 66412bfeace..5626f4f207e 100644 --- a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/Microsoft.Extensions.AI.OpenAI.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/Microsoft.Extensions.AI.OpenAI.Tests.csproj @@ -2,7 +2,7 @@ Microsoft.Extensions.AI Unit tests for Microsoft.Extensions.AI.OpenAI - $(NoWarn);OPENAI002 + $(NoWarn);OPENAI002;MEAI001 diff --git a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientIntegrationTests.cs b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientIntegrationTests.cs new file mode 100644 index 00000000000..c80b37c865e --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientIntegrationTests.cs @@ -0,0 +1,12 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace Microsoft.Extensions.AI; + +public class OpenAISpeechToTextClientIntegrationTests : SpeechToTextClientIntegrationTests +{ + protected override ISpeechToTextClient? CreateClient() + => IntegrationTestHelpers.GetOpenAIClient()? + .GetAudioClient(TestRunnerConfiguration.Instance["OpenAI:AudioTranscriptionModel"] ?? "whisper-1") + .AsISpeechToTextClient(); +} diff --git a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs new file mode 100644 index 00000000000..4587c3a5524 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs @@ -0,0 +1,284 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.ClientModel; +using System.ClientModel.Primitives; +using System.IO; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; +using Azure.AI.OpenAI; +using Microsoft.Extensions.Logging; +using OpenAI; +using OpenAI.Audio; +using Xunit; + +#pragma warning disable S103 // Lines should not be too long + +namespace Microsoft.Extensions.AI; + +public class OpenAISpeechToTextClientTests +{ + [Fact] + public void AsISpeechToTextClient_InvalidArgs_Throws() + { + Assert.Throws("audioClient", () => ((AudioClient)null!).AsISpeechToTextClient()); + } + + [Theory] + [InlineData(false)] + [InlineData(true)] + public void AsISpeechToTextClient_AudioClient_ProducesExpectedMetadata(bool useAzureOpenAI) + { + Uri endpoint = new("http://localhost/some/endpoint"); + string model = "amazingModel"; + + var client = useAzureOpenAI ? + new AzureOpenAIClient(endpoint, new ApiKeyCredential("key")) : + new OpenAIClient(new ApiKeyCredential("key"), new OpenAIClientOptions { Endpoint = endpoint }); + + ISpeechToTextClient speechToTextClient = client.GetAudioClient(model).AsISpeechToTextClient(); + var metadata = speechToTextClient.GetService(); + Assert.Equal("openai", metadata?.ProviderName); + Assert.Equal(endpoint, metadata?.ProviderUri); + Assert.Equal(model, metadata?.DefaultModelId); + } + + [Fact] + public void GetService_AudioClient_SuccessfullyReturnsUnderlyingClient() + { + AudioClient audioClient = new OpenAIClient(new ApiKeyCredential("key")).GetAudioClient("model"); + ISpeechToTextClient speechToTextClient = audioClient.AsISpeechToTextClient(); + Assert.Same(speechToTextClient, speechToTextClient.GetService()); + Assert.Same(audioClient, speechToTextClient.GetService()); + using var factory = LoggerFactory.Create(b => b.AddFakeLogging()); + using ISpeechToTextClient pipeline = speechToTextClient + .AsBuilder() + .UseLogging(factory) + .Build(); + + 
Assert.NotNull(pipeline.GetService()); + + Assert.Same(audioClient, pipeline.GetService()); + Assert.IsType(pipeline.GetService()); + } + + [Theory] + [InlineData("pt", null)] + [InlineData("en", null)] + [InlineData("en", "en")] + [InlineData("pt", "pt")] + public async Task GetTextAsync_BasicRequestResponse(string? speechLanguage, string? textLanguage) + { + string input = $$""" + { + "model": "whisper-1", + "language": "{{speechLanguage}}" + } + """; + + const string Output = """ + { + "text":"I finally got back to the gym the other day." + } + """; + + using VerbatimMultiPartHttpHandler handler = new(input, Output) { ExpectedRequestUriContains = "audio/transcriptions" }; + using HttpClient httpClient = new(handler); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "whisper-1"); + + using var audioSpeechStream = GetAudioStream(); + var response = await client.GetTextAsync(audioSpeechStream, new SpeechToTextOptions + { + SpeechLanguage = speechLanguage, + TextLanguage = textLanguage + }); + + Assert.NotNull(response); + + Assert.Contains("I finally got back to the gym the other day", response.Text); + + Assert.NotNull(response.RawRepresentation); + Assert.IsType(response.RawRepresentation); + } + + [Fact] + public async Task GetTextAsync_Cancelled_Throws() + { + using HttpClient httpClient = new(); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "whisper-1"); + + using var fileStream = GetAudioStream(); + using var cancellationTokenSource = new CancellationTokenSource(); + cancellationTokenSource.Cancel(); + + await Assert.ThrowsAsync(() + => client.GetTextAsync(fileStream, cancellationToken: cancellationTokenSource.Token)); + } + + [Fact] + public async Task GetStreamingTextAsync_Cancelled_Throws() + { + using HttpClient httpClient = new(); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "whisper-1"); + + using var fileStream = GetAudioStream(); + using var cancellationTokenSource = new 
CancellationTokenSource(); + cancellationTokenSource.Cancel(); + + await Assert.ThrowsAsync(() + => client + .GetStreamingTextAsync(fileStream, cancellationToken: cancellationTokenSource.Token) + .GetAsyncEnumerator() + .MoveNextAsync() + .AsTask()); + } + + [Theory] + [InlineData("pt", null)] + [InlineData("en", null)] + [InlineData("en", "en")] + [InlineData("pt", "pt")] + public async Task GetStreamingTextAsync_BasicRequestResponse(string? speechLanguage, string? textLanguage) + { + // There's no support for streaming audio in the OpenAI API, + // so we're just testing the client's ability to handle streaming responses. + + string input = $$""" + { + "model": "whisper-1", + "language": "{{speechLanguage}}" + } + """; + + const string Output = """ + { + "text":"I finally got back to the gym the other day." + } + """; + + using VerbatimMultiPartHttpHandler handler = new(input, Output) { ExpectedRequestUriContains = "audio/transcriptions" }; + using HttpClient httpClient = new(handler); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "whisper-1"); + + using var audioSpeechStream = GetAudioStream(); + await foreach (var update in client.GetStreamingTextAsync(audioSpeechStream, new SpeechToTextOptions + { + SpeechLanguage = speechLanguage, + TextLanguage = textLanguage + })) + { + Assert.Contains("I finally got back to the gym the other day", update.Text); + Assert.NotNull(update.RawRepresentation); + Assert.IsType(update.RawRepresentation); + } + } + + [Fact] + public async Task GetStreamingTextAsync_BasicTranslateRequestResponse() + { + string textLanguage = "en"; + + // There's no support for non english translations, so no language is passed to the API. + const string Input = $$""" + { + "model": "whisper-1" + } + """; + + const string Output = """ + { + "text":"I finally got back to the gym the other day." 
+ } + """; + + using VerbatimMultiPartHttpHandler handler = new(Input, Output) { ExpectedRequestUriContains = "audio/translations" }; + using HttpClient httpClient = new(handler); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "whisper-1"); + + using var audioSpeechStream = GetAudioStream(); + await foreach (var update in client.GetStreamingTextAsync(audioSpeechStream, new SpeechToTextOptions + { + SpeechLanguage = "pt", + TextLanguage = textLanguage + })) + { + Assert.Contains("I finally got back to the gym the other day", update.Text); + Assert.NotNull(update.RawRepresentation); + Assert.IsType(update.RawRepresentation); + } + } + + [Fact] + public async Task GetTextAsync_NonStronglyTypedOptions_AllSent() + { + const string Input = """ + { + "model": "whisper-1", + "prompt":"Hide any bad words with ", + "temperature": 0.5, + "response_format": "vtt", + "timestamp_granularities[]": ["word","segment"] + } + """; + + const string Output = """ + { + "text":"I finally got back to the gym the other day." + } + """; + + using VerbatimMultiPartHttpHandler handler = new(Input, Output); + using HttpClient httpClient = new(handler); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "whisper-1"); + + using var audioSpeechStream = GetAudioStream(); + Assert.NotNull(await client.GetTextAsync(audioSpeechStream, new() + { + AdditionalProperties = new() + { + ["Prompt"] = "Hide any bad words with ", + ["SpeechLanguage"] = "pt", + ["Temperature"] = 0.5f, + ["TimestampGranularities"] = AudioTimestampGranularities.Segment | AudioTimestampGranularities.Word, + ["ResponseFormat"] = AudioTranscriptionFormat.Vtt, + }, + })); + } + + [Fact] + public async Task GetTextAsync_StronglyTypedOptions_AllSent() + { + const string Input = """ + { + "model": "whisper-1", + "language": "pt" + } + """; + + const string Output = """ + { + "text":"I finally got back to the gym the other day." 
+ } + """; + + using VerbatimMultiPartHttpHandler handler = new(Input, Output); + using HttpClient httpClient = new(handler); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "whisper-1"); + + using var audioSpeechStream = GetAudioStream(); + Assert.NotNull(await client.GetTextAsync(audioSpeechStream, new() + { + SpeechLanguage = "pt", + })); + } + + private static Stream GetAudioStream() + => new MemoryStream([0x01, 0x02]); + + private static ISpeechToTextClient CreateSpeechToTextClient(HttpClient httpClient, string modelId) => + new OpenAIClient(new ApiKeyCredential("apikey"), new OpenAIClientOptions { Transport = new HttpClientPipelineTransport(httpClient) }) + .GetAudioClient(modelId) + .AsISpeechToTextClient(); +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Tests/Microsoft.Extensions.AI.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Tests/Microsoft.Extensions.AI.Tests.csproj index 32589c430e0..9b8967a37ce 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Tests/Microsoft.Extensions.AI.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Tests/Microsoft.Extensions.AI.Tests.csproj @@ -6,6 +6,7 @@ $(NoWarn);CA1063;CA1861;SA1130;VSTHRD003 + $(NoWarn);MEAI001 true @@ -16,6 +17,7 @@ + diff --git a/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/ConfigureOptionsSpeechToTextClientTests.cs b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/ConfigureOptionsSpeechToTextClientTests.cs new file mode 100644 index 00000000000..6140b7ed354 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/ConfigureOptionsSpeechToTextClientTests.cs @@ -0,0 +1,101 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class ConfigureOptionsSpeechToTextClientTests +{ + [Fact] + public void ConfigureOptionsSpeechToTextClient_InvalidArgs_Throws() + { + Assert.Throws("innerClient", () => new ConfigureOptionsSpeechToTextClient(null!, _ => { })); + Assert.Throws("configure", () => new ConfigureOptionsSpeechToTextClient(new TestSpeechToTextClient(), null!)); + } + + [Fact] + public void ConfigureOptions_InvalidArgs_Throws() + { + using var innerClient = new TestSpeechToTextClient(); + var builder = innerClient.AsBuilder(); + Assert.Throws("configure", () => builder.ConfigureOptions(null!)); + } + + [Theory] + [InlineData(false)] + [InlineData(true)] + public async Task ConfigureOptions_ReturnedInstancePassedToNextClient(bool nullProvidedOptions) + { + SpeechToTextOptions? providedOptions = nullProvidedOptions ? null : new() { ModelId = "test" }; + SpeechToTextOptions? 
returnedOptions = null; + SpeechToTextResponse expectedResponse = new([]); + var expectedUpdates = Enumerable.Range(0, 3).Select(i => new SpeechToTextResponseUpdate()).ToArray(); + using CancellationTokenSource cts = new(); + + using ISpeechToTextClient innerClient = new TestSpeechToTextClient + { + GetTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + Assert.Same(returnedOptions, options); + Assert.Equal(cts.Token, cancellationToken); + return Task.FromResult(expectedResponse); + }, + + GetStreamingTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + Assert.Same(returnedOptions, options); + Assert.Equal(cts.Token, cancellationToken); + return YieldUpdates(expectedUpdates); + }, + }; + + using var client = innerClient + .AsBuilder() + .ConfigureOptions(options => + { + Assert.NotSame(providedOptions, options); + if (nullProvidedOptions) + { + Assert.Null(options.ModelId); + } + else + { + Assert.Equal(providedOptions!.ModelId, options.ModelId); + } + + returnedOptions = options; + }) + .Build(); + + using var audioSpeechStream = new MemoryStream(new byte[] { 1, 2, 3, 4 }); + var response = await client.GetTextAsync(audioSpeechStream, providedOptions, cts.Token); + Assert.Same(expectedResponse, response); + + int i = 0; + using var audioSpeechStream2 = new MemoryStream(new byte[] { 1, 2, 3, 4 }); + await using var e = client.GetStreamingTextAsync(audioSpeechStream2, providedOptions, cts.Token).GetAsyncEnumerator(); + while (i < expectedUpdates.Length) + { + Assert.True(await e.MoveNextAsync()); + Assert.Same(expectedUpdates[i++], e.Current); + } + + Assert.False(await e.MoveNextAsync()); + + static async IAsyncEnumerable YieldUpdates(SpeechToTextResponseUpdate[] updates) + { + foreach (var update in updates) + { + await Task.Yield(); + yield return update; + } + } + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/LoggingSpeechToTextClientTests.cs 
b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/LoggingSpeechToTextClientTests.cs new file mode 100644 index 00000000000..79c09dd5c6f --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/LoggingSpeechToTextClientTests.cs @@ -0,0 +1,150 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Logging.Testing; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class LoggingSpeechToTextClientTests +{ + [Fact] + public void LoggingSpeechToTextClient_InvalidArgs_Throws() + { + Assert.Throws("innerClient", () => new LoggingSpeechToTextClient(null!, NullLogger.Instance)); + Assert.Throws("logger", () => new LoggingSpeechToTextClient(new TestSpeechToTextClient(), null!)); + } + + [Fact] + public void UseLogging_AvoidsInjectingNopClient() + { + using var innerClient = new TestSpeechToTextClient(); + + Assert.Null(innerClient.AsBuilder().UseLogging(NullLoggerFactory.Instance).Build().GetService(typeof(LoggingSpeechToTextClient))); + Assert.Same(innerClient, innerClient.AsBuilder().UseLogging(NullLoggerFactory.Instance).Build().GetService(typeof(ISpeechToTextClient))); + + using var factory = LoggerFactory.Create(b => b.AddFakeLogging()); + Assert.NotNull(innerClient.AsBuilder().UseLogging(factory).Build().GetService(typeof(LoggingSpeechToTextClient))); + + ServiceCollection c = new(); + c.AddFakeLogging(); + var services = c.BuildServiceProvider(); + Assert.NotNull(innerClient.AsBuilder().UseLogging().Build(services).GetService(typeof(LoggingSpeechToTextClient))); + 
Assert.NotNull(innerClient.AsBuilder().UseLogging(null).Build(services).GetService(typeof(LoggingSpeechToTextClient))); + Assert.Null(innerClient.AsBuilder().UseLogging(NullLoggerFactory.Instance).Build(services).GetService(typeof(LoggingSpeechToTextClient))); + } + + [Theory] + [InlineData(LogLevel.Trace)] + [InlineData(LogLevel.Debug)] + [InlineData(LogLevel.Information)] + public async Task GetTextAsync_LogsResponseInvocationAndCompletion(LogLevel level) + { + var collector = new FakeLogCollector(); + + ServiceCollection c = new(); + c.AddLogging(b => b.AddProvider(new FakeLoggerProvider(collector)).SetMinimumLevel(level)); + var services = c.BuildServiceProvider(); + + using ISpeechToTextClient innerClient = new TestSpeechToTextClient + { + GetTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => + { + return Task.FromResult(new SpeechToTextResponse("blue whale")); + }, + }; + + using ISpeechToTextClient client = innerClient + .AsBuilder() + .UseLogging() + .Build(services); + + using var audioSpeechStream = new MemoryStream(new byte[] { 1, 2, 3, 4 }); + await client.GetTextAsync( + audioSpeechStream, + new SpeechToTextOptions { SpeechLanguage = "pt" }); + + var logs = collector.GetSnapshot(); + if (level is LogLevel.Trace) + { + Assert.Collection(logs, + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetTextAsync)} invoked:") && entry.Message.Contains("\"speechLanguage\": \"pt\"")), + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetTextAsync)} completed:") && entry.Message.Contains("blue whale"))); + } + else if (level is LogLevel.Debug) + { + Assert.Collection(logs, + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetTextAsync)} invoked.") && !entry.Message.Contains("\"speechLanguage\": \"pt\"")), + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetTextAsync)} completed.") && !entry.Message.Contains("blue whale"))); + } + else + { + 
Assert.Empty(logs); + } + } + + [Theory] + [InlineData(LogLevel.Trace)] + [InlineData(LogLevel.Debug)] + [InlineData(LogLevel.Information)] + public async Task GetStreamingTextAsync_LogsUpdateReceived(LogLevel level) + { + var collector = new FakeLogCollector(); + using ILoggerFactory loggerFactory = LoggerFactory.Create(b => b.AddProvider(new FakeLoggerProvider(collector)).SetMinimumLevel(level)); + + using ISpeechToTextClient innerClient = new TestSpeechToTextClient + { + GetStreamingTextAsyncCallback = (audioSpeechStream, options, cancellationToken) => GetUpdatesAsync() + }; + + static async IAsyncEnumerable GetUpdatesAsync() + { + await Task.Yield(); + yield return new SpeechToTextResponseUpdate("blue "); + yield return new SpeechToTextResponseUpdate("whale"); + } + + using ISpeechToTextClient client = innerClient + .AsBuilder() + .UseLogging(loggerFactory) + .Build(); + + using var audioSpeechStream = new MemoryStream(new byte[] { 1, 2, 3, 4 }); + await foreach (var update in client.GetStreamingTextAsync( + audioSpeechStream, + new SpeechToTextOptions { SpeechLanguage = "pt" })) + { + // nop + } + + var logs = collector.GetSnapshot(); + if (level is LogLevel.Trace) + { + Assert.Collection(logs, + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} invoked:") && entry.Message.Contains("\"speechLanguage\": \"pt\"")), + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} received update:") && entry.Message.Contains("blue")), + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} received update:") && entry.Message.Contains("whale")), + entry => Assert.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} completed.", entry.Message)); + } + else if (level is LogLevel.Debug) + { + Assert.Collection(logs, + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} invoked.") && 
!entry.Message.Contains("speechLanguage")), + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} received update.") && !entry.Message.Contains("blue")), + entry => Assert.True(entry.Message.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} received update.") && !entry.Message.Contains("whale")), + entry => Assert.Contains($"{nameof(ISpeechToTextClient.GetStreamingTextAsync)} completed.", entry.Message)); + } + else + { + Assert.Empty(logs); + } + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/SingletonSpeechToTextClientExtensions.cs b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/SingletonSpeechToTextClientExtensions.cs new file mode 100644 index 00000000000..5fc038f8147 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/SingletonSpeechToTextClientExtensions.cs @@ -0,0 +1,11 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace Microsoft.Extensions.AI; + +public static class SingletonSpeechToTextClientExtensions +{ + public static SpeechToTextClientBuilder UseSingletonMiddleware(this SpeechToTextClientBuilder builder) + => builder.Use((inner, services) + => new SpeechToTextClientDependencyInjectionPatterns.SingletonMiddleware(inner, services)); +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/SpeechToTextClientDependencyInjectionPatterns.cs b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/SpeechToTextClientDependencyInjectionPatterns.cs new file mode 100644 index 00000000000..07596a1bb6f --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Tests/SpeechToText/SpeechToTextClientDependencyInjectionPatterns.cs @@ -0,0 +1,162 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using Microsoft.Extensions.DependencyInjection; +using Xunit; + +namespace Microsoft.Extensions.AI; + +public class SpeechToTextClientDependencyInjectionPatterns +{ + private IServiceCollection ServiceCollection { get; } = new ServiceCollection(); + + [Fact] + public void CanRegisterSingletonUsingFactory() + { + // Arrange/Act + ServiceCollection.AddSpeechToTextClient(services => new TestSpeechToTextClient { Services = services }) + .UseSingletonMiddleware(); + + // Assert + var services = ServiceCollection.BuildServiceProvider(); + using var scope1 = services.CreateScope(); + using var scope2 = services.CreateScope(); + + var instance1 = scope1.ServiceProvider.GetRequiredService(); + var instance1Copy = scope1.ServiceProvider.GetRequiredService(); + var instance2 = scope2.ServiceProvider.GetRequiredService(); + + // Each scope gets the same instance, because it's singleton + var instance = Assert.IsType(instance1); + Assert.Same(instance, instance1Copy); + Assert.Same(instance, instance2); + Assert.IsType(instance.InnerClient); + } + + [Fact] + public void CanRegisterSingletonUsingSharedInstance() + { + // Arrange/Act + using var singleton = new TestSpeechToTextClient(); + ServiceCollection.AddSpeechToTextClient(singleton) + .UseSingletonMiddleware(); + + // Assert + var services = ServiceCollection.BuildServiceProvider(); + using var scope1 = services.CreateScope(); + using var scope2 = services.CreateScope(); + + var instance1 = scope1.ServiceProvider.GetRequiredService(); + var instance1Copy = scope1.ServiceProvider.GetRequiredService(); + var instance2 = scope2.ServiceProvider.GetRequiredService(); + + // Each scope gets the same instance, because it's singleton + var instance = Assert.IsType(instance1); + Assert.Same(instance, instance1Copy); + Assert.Same(instance, instance2); + Assert.IsType(instance.InnerClient); + } + + [Fact] + public void CanRegisterKeyedSingletonUsingFactory() + { + // Arrange/Act + 
ServiceCollection.AddKeyedSpeechToTextClient("mykey", services => new TestSpeechToTextClient { Services = services }) + .UseSingletonMiddleware(); + + // Assert + var services = ServiceCollection.BuildServiceProvider(); + using var scope1 = services.CreateScope(); + using var scope2 = services.CreateScope(); + + Assert.Null(services.GetService()); + + var instance1 = scope1.ServiceProvider.GetRequiredKeyedService("mykey"); + var instance1Copy = scope1.ServiceProvider.GetRequiredKeyedService("mykey"); + var instance2 = scope2.ServiceProvider.GetRequiredKeyedService("mykey"); + + // Each scope gets the same instance, because it's singleton + var instance = Assert.IsType(instance1); + Assert.Same(instance, instance1Copy); + Assert.Same(instance, instance2); + Assert.IsType(instance.InnerClient); + } + + [Fact] + public void CanRegisterKeyedSingletonUsingSharedInstance() + { + // Arrange/Act + using var singleton = new TestSpeechToTextClient(); + ServiceCollection.AddKeyedSpeechToTextClient("mykey", singleton) + .UseSingletonMiddleware(); + + // Assert + var services = ServiceCollection.BuildServiceProvider(); + using var scope1 = services.CreateScope(); + using var scope2 = services.CreateScope(); + + Assert.Null(services.GetService()); + + var instance1 = scope1.ServiceProvider.GetRequiredKeyedService("mykey"); + var instance1Copy = scope1.ServiceProvider.GetRequiredKeyedService("mykey"); + var instance2 = scope2.ServiceProvider.GetRequiredKeyedService("mykey"); + + // Each scope gets the same instance, because it's singleton + var instance = Assert.IsType(instance1); + Assert.Same(instance, instance1Copy); + Assert.Same(instance, instance2); + Assert.IsType(instance.InnerClient); + } + + [Theory] + [InlineData(null)] + [InlineData(ServiceLifetime.Singleton)] + [InlineData(ServiceLifetime.Scoped)] + [InlineData(ServiceLifetime.Transient)] + public void AddSpeechToTextClient_RegistersExpectedLifetime(ServiceLifetime? 
lifetime) + { + ServiceCollection sc = new(); + ServiceLifetime expectedLifetime = lifetime ?? ServiceLifetime.Singleton; + SpeechToTextClientBuilder builder = lifetime.HasValue + ? sc.AddSpeechToTextClient(services => new TestSpeechToTextClient(), lifetime.Value) + : sc.AddSpeechToTextClient(services => new TestSpeechToTextClient()); + + ServiceDescriptor sd = Assert.Single(sc); + Assert.Equal(typeof(ISpeechToTextClient), sd.ServiceType); + Assert.False(sd.IsKeyedService); + Assert.Null(sd.ImplementationInstance); + Assert.NotNull(sd.ImplementationFactory); + Assert.IsType(sd.ImplementationFactory(null!)); + Assert.Equal(expectedLifetime, sd.Lifetime); + } + + [Theory] + [InlineData(null)] + [InlineData(ServiceLifetime.Singleton)] + [InlineData(ServiceLifetime.Scoped)] + [InlineData(ServiceLifetime.Transient)] + public void AddKeyedSpeechToTextClient_RegistersExpectedLifetime(ServiceLifetime? lifetime) + { + ServiceCollection sc = new(); + ServiceLifetime expectedLifetime = lifetime ?? ServiceLifetime.Singleton; + SpeechToTextClientBuilder builder = lifetime.HasValue + ? sc.AddKeyedSpeechToTextClient("key", services => new TestSpeechToTextClient(), lifetime.Value) + : sc.AddKeyedSpeechToTextClient("key", services => new TestSpeechToTextClient()); + + ServiceDescriptor sd = Assert.Single(sc); + Assert.Equal(typeof(ISpeechToTextClient), sd.ServiceType); + Assert.True(sd.IsKeyedService); + Assert.Equal("key", sd.ServiceKey); + Assert.Null(sd.KeyedImplementationInstance); + Assert.NotNull(sd.KeyedImplementationFactory); + Assert.IsType(sd.KeyedImplementationFactory(null!, null!)); + Assert.Equal(expectedLifetime, sd.Lifetime); + } + + public class SingletonMiddleware(ISpeechToTextClient inner, IServiceProvider services) : DelegatingSpeechToTextClient(inner) + { + public new ISpeechToTextClient InnerClient => base.InnerClient; + public IServiceProvider Services => services; + } +}