diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs new file mode 100644 index 00000000000..3dbc8211416 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs @@ -0,0 +1,44 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Linq; +using System.Text.Json; +using System.Text.Json.Nodes; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal static class AIToolExtensions +{ + internal static string RenderAsJson( + this IEnumerable toolDefinitions, + JsonSerializerOptions? options = null) + { + _ = Throw.IfNull(toolDefinitions); + + var toolDefinitionsJsonArray = new JsonArray(); + + foreach (AIFunction function in toolDefinitions.OfType()) + { + JsonNode functionJsonNode = + new JsonObject + { + ["name"] = function.Name, + ["description"] = function.Description, + ["functionSchema"] = JsonNode.Parse(function.JsonSchema.GetRawText()), + }; + + if (function.ReturnJsonSchema is not null) + { + functionJsonNode["functionReturnValueSchema"] = + JsonNode.Parse(function.ReturnJsonSchema.Value.GetRawText()); + } + + toolDefinitionsJsonArray.Add(functionJsonNode); + } + + string renderedToolDefinitions = toolDefinitionsJsonArray.ToJsonString(options); + return renderedToolDefinitions; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs new file mode 100644 index 00000000000..cfad90f85f4 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs @@ -0,0 +1,35 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Text.Json; +using System.Text.Json.Nodes; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal static class ChatMessageExtensions +{ + internal static string RenderAsJson(this IEnumerable messages, JsonSerializerOptions? options = null) + { + _ = Throw.IfNull(messages); + + var messagesJsonArray = new JsonArray(); + + foreach (ChatMessage message in messages) + { + JsonNode? messageJsonNode = + JsonSerializer.SerializeToNode( + message, + AIJsonUtilities.DefaultOptions.GetTypeInfo(typeof(ChatMessage))); + + if (messageJsonNode is not null) + { + messagesJsonArray.Add(messageJsonNode); + } + } + + string renderedMessages = messagesJsonArray.ToJsonString(options); + return renderedMessages; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs new file mode 100644 index 00000000000..c579caa7cb1 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs @@ -0,0 +1,51 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Linq; +using System.Text.Json; +using System.Text.Json.Nodes; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal static class ChatResponseExtensions +{ + internal static string RenderAsJson(this ChatResponse modelResponse, JsonSerializerOptions? options = null) + { + _ = Throw.IfNull(modelResponse); + + return modelResponse.Messages.RenderAsJson(options); + } + + internal static string RenderToolCallsAndResultsAsJson( + this ChatResponse modelResponse, + JsonSerializerOptions? 
options = null) + { + _ = Throw.IfNull(modelResponse); + + var toolCallsAndResultsJsonArray = new JsonArray(); + + foreach (AIContent content in modelResponse.Messages.SelectMany(m => m.Contents)) + { + if (content is FunctionCallContent or FunctionResultContent) + { + Type contentType = + content is FunctionCallContent ? typeof(FunctionCallContent) : typeof(FunctionResultContent); + + JsonNode? toolCallOrResultJsonNode = + JsonSerializer.SerializeToNode( + content, + AIJsonUtilities.DefaultOptions.GetTypeInfo(contentType)); + + if (toolCallOrResultJsonNode is not null) + { + toolCallsAndResultsJsonArray.Add(toolCallOrResultJsonNode); + } + } + } + + string renderedToolCallsAndResults = toolCallsAndResultsJsonArray.ToJsonString(options); + return renderedToolCallsAndResults; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs index a5f04300f70..3bd57cf322b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs @@ -87,7 +87,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(CompletenessEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type {nameof(CompletenessEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs index 714a027b4a1..ced79652bc7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs @@ -86,7 +86,7 @@ public async ValueTask 
EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type {nameof(EquivalenceEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs index 51acfa37d10..792db414d1c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs @@ -33,6 +33,25 @@ internal static EvaluationMetricInterpretation InterpretScore(this NumericMetric : new EvaluationMetricInterpretation(rating); } + internal static EvaluationMetricInterpretation InterpretScore( + this BooleanMetric metric, + bool passValue = true) + { + EvaluationRating rating = metric.Value switch + { + null => EvaluationRating.Inconclusive, + true => passValue ? EvaluationRating.Exceptional : EvaluationRating.Unacceptable, + false => passValue ? EvaluationRating.Unacceptable : EvaluationRating.Exceptional, + }; + + return metric.Value is bool value && value == passValue + ? new EvaluationMetricInterpretation(rating) + : new EvaluationMetricInterpretation( + rating, + failed: true, + reason: $"{metric.Name} is not {passValue}."); + } + internal static bool TryParseEvaluationResponseWithValue( this EvaluationMetric metric, ChatResponse evaluationResponse, @@ -81,7 +100,7 @@ internal static bool TryParseEvaluationResponseWithTags( static bool TryParseTag(string text, string tagName, [NotNullWhen(true)] out string? 
tagValue) { - const RegexOptions Options = RegexOptions.Multiline; + const RegexOptions Options = RegexOptions.Singleline; Match match = Regex.Match(text, $@"<{tagName}>(?<value>.*?)</{tagName}>", Options); if (!match.Success || match.Groups["value"] is not Group valueGroup || !valueGroup.Success) @@ -131,6 +150,11 @@ private static bool TryParseValue(this EvaluationMetric metric, string val booleanMetric.Value = booleanValue; return true; } + else if (int.TryParse(valueText, out int intValue) && (intValue is 0 or 1)) + { + booleanMetric.Value = intValue is 1; + return true; + } else { metric.AddDiagnostics( diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs index bf9b499ebc7..a52fbcf2ad9 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs @@ -85,7 +85,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(GroundednessEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type {nameof(GroundednessEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs new file mode 100644 index 00000000000..4f19d308f10 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs @@ -0,0 +1,407 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// An that evaluates an AI system's effectiveness at identifying and resolving user intent. +/// +/// +/// +/// evaluates an AI system's effectiveness at identifying and resolving user +/// intent based on the supplied conversation history and the tool definitions supplied via +/// . +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +/// returns a that contains a score for 'Intent +/// Resolution'. The score is a number between 1 and 5, with 1 indicating a poor score, and 5 indicating an excellent +/// score. +/// +/// +/// Note: is an AI-based evaluator that uses an AI model to perform its +/// evaluation. While the prompt that this evaluator uses to perform its evaluation is designed to be model-agnostic, +/// the performance of this prompt (and the resulting evaluation) can vary depending on the model used, and can be +/// especially poor when a smaller / local model is used. +/// +/// +/// The prompt that uses has been tested against (and tuned to work well with) +/// the following models. So, using this evaluator with a model from the following list is likely to produce the best +/// results. (The model to be used can be configured via .) +/// +/// +/// GPT-4o +/// +/// +[Experimental("AIEVAL001")] +public sealed class IntentResolutionEvaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . 
+ /// + public static string IntentResolutionMetricName => "Intent Resolution"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [IntentResolutionMetricName]; + + private static readonly ChatOptions _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + MaxOutputTokens = 800, + TopP = 1.0f, + PresencePenalty = 0.0f, + FrequencyPenalty = 0.0f, + ResponseFormat = ChatResponseFormat.Json + }; + + /// + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + _ = Throw.IfNull(chatConfiguration); + + var metric = new NumericMetric(IntentResolutionMetricName); + var result = new EvaluationResult(metric); + + if (!messages.Any()) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + "The conversation history supplied for evaluation did not include any messages.")); + + return result; + } + + if (!modelResponse.Messages.Any()) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation did not include any messages.")); + + return result; + } + + IntentResolutionEvaluatorContext? context = + additionalContext?.OfType().FirstOrDefault(); + + if (context is not null && context.ToolDefinitions.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(IntentResolutionEvaluatorContext)} did not contain any {nameof(IntentResolutionEvaluatorContext.ToolDefinitions)}.")); + + return result; + } + + var toolDefinitionNames = new HashSet(context?.ToolDefinitions.Select(td => td.Name) ?? 
[]); + IEnumerable toolCalls = + modelResponse.Messages.SelectMany(m => m.Contents).OfType(); + + if (toolCalls.Any(t => !toolDefinitionNames.Contains(t.Name))) + { + if (context is null) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not supplied via {nameof(IntentResolutionEvaluatorContext)}.")); + } + else + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not included in the supplied {nameof(IntentResolutionEvaluatorContext)}.")); + } + + return result; + } + + List evaluationInstructions = GetEvaluationInstructions(messages, modelResponse, context); + + (ChatResponse evaluationResponse, TimeSpan evaluationDuration) = + await TimingHelper.ExecuteWithTimingAsync(() => + chatConfiguration.ChatClient.GetResponseAsync( + evaluationInstructions, + _chatOptions, + cancellationToken)).ConfigureAwait(false); + + if (context is not null) + { + metric.AddOrUpdateContext(context); + } + + await ParseEvaluationResponseAsync( + metric, + evaluationResponse, + evaluationDuration, + chatConfiguration, + cancellationToken).ConfigureAwait(false); + + return result; + } + + private static List GetEvaluationInstructions( + IEnumerable messages, + ChatResponse modelResponse, + IntentResolutionEvaluatorContext? context) + { + const string SystemPrompt = + "You are an expert in evaluating the quality of a RESPONSE from an intelligent assistant based on provided definition and Data."; + + List evaluationInstructions = [new ChatMessage(ChatRole.System, SystemPrompt)]; + + string renderedConversation = messages.RenderAsJson(); + string renderedModelResponse = modelResponse.RenderAsJson(); + string? 
renderedToolDefinitions = context?.ToolDefinitions.RenderAsJson(); + +#pragma warning disable S103 // Lines should not be too long + string evaluationPrompt = + $$""" + # Goal + Your goal is to assess the quality of the RESPONSE of an assistant in relation to a QUERY from a user, specifically focusing on + the assistant's ability to understand and resolve the user intent expressed in the QUERY. There is also a field for tool definitions + describing the functions, if any, that are accessible to the agent and that the agent may invoke in the RESPONSE if necessary. + + There are two components to intent resolution: + - Intent Understanding: The extent to which the agent accurately discerns the user's underlying need or inquiry. + - Response Resolution: The degree to which the agent's response is comprehensive, relevant, and adequately addresses the user's request. + + Note that the QUERY can either be a string with a user request or an entire conversation history including previous requests and responses from the assistant. + In this case, the assistant's response should be evaluated in the context of the entire conversation but the focus should be on the last intent. + + # Data + QUERY: {{renderedConversation}} + RESPONSE: {{renderedModelResponse}} + TOOL_DEFINITIONS: {{renderedToolDefinitions}} + + + # Ratings + ## [Score: 1] (Response completely unrelated to user intent) + **Definition:** The agent's response does not address the query at all. + + **Example:** + **Query:** How do I bake a chocolate cake? + **Response:** The latest smartphone models have incredible features and performance. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The agent's response is entirely off-topic, discussing smartphones instead of providing any information about baking a chocolate cake." 
+ "conversation_has_intent": true, + "agent_perceived_intent": "discussion about smartphone features", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": false, + "intent_resolved": false, + "resolution_score": 1, + } + + + ## [Score: 2] (Response minimally relates to user intent) + **Definition:** The response shows a token attempt to address the query by mentioning a relevant keyword or concept, but it provides almost no useful or actionable information. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Chocolate cake involves some ingredients. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "While the response mentions 'ingredients' related to a chocolate cake, it barely addresses the process or any detailed steps, leaving the query unresolved." + "conversation_has_intent": true, + "agent_perceived_intent": "mention of ingredients", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": false, + "intent_resolved": false, + "resolution_score": 2, + } + + + ## [Score: 3] (Response partially addresses the user intent but lacks complete details) + **Definition:** The response provides a basic idea related to the query by mentioning a few relevant elements, but it omits several key details and specifics needed for fully resolving the user's query. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Preheat your oven and mix the ingredients before baking the cake. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The response outlines a minimal process (preheating and mixing) but omits critical details like ingredient measurements, baking time, and temperature specifics, resulting in only a partial resolution of the query." 
+ "conversation_has_intent": true, + "agent_perceived_intent": "basic baking process", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": false, + "resolution_score": 3, + } + + + ## [Score: 4] (Response addresses the user intent with moderate accuracy but has minor inaccuracies or omissions) + **Definition:** The response offers a moderately detailed answer that includes several specific elements relevant to the query, yet it still lacks some finer details or complete information. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Preheat your oven to 350°F. In a bowl, combine flour, sugar, cocoa, eggs, and milk, mix well, and bake for about 30 minutes. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The response includes specific steps and ingredients, indicating a clear intent to provide a recipe. However, it lacks detailed measurements and finer instructions, which makes the resolution only moderately complete." + "conversation_has_intent": true, + "agent_perceived_intent": "provide a basic chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": false, + "resolution_score": 4, + } + + + ## [Score: 5] (Response directly addresses the user intent and fully resolves it) + **Definition:** The response provides a complete, detailed, and accurate answer that fully resolves the user's query with all necessary information and precision. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Preheat your oven to 350°F (175°C) and grease a 9-inch round cake pan. In a large bowl, sift together 1 ¾ cups all-purpose flour, 1 cup sugar, ¾ cup unsweetened cocoa powder, 1 ½ tsp baking powder, and 1 tsp salt. In another bowl, beat 2 large eggs with 1 cup milk, ½ cup vegetable oil, and 2 tsp vanilla extract. 
Combine the wet ingredients with the dry ingredients, then gradually mix in 1 cup boiling water until smooth. Pour the batter into the prepared pan and bake for 30-35 minutes or until a toothpick inserted into the center comes out clean. Allow the cake to cool before serving. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The response delivers a complete and precise recipe with detailed instructions and measurements, fully addressing the user's query about baking a chocolate cake." + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + + + # Task + + Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above. + Your output should consist only of a JSON object, as provided in the examples, that has the following keys: + - explanation: a string that explains why you think the input Data should get this resolution_score. 
+ - conversation_has_intent: true or false + - agent_perceived_intent: a string that describes the intent the agent perceived from the user query + - actual_user_intent: a string that describes the actual user intent + - correct_intent_detected: true or false + - intent_resolved: true or false + - resolution_score: an integer between 1 and 5 that represents the resolution score + + + # Output + """; +#pragma warning restore S103 + + evaluationInstructions.Add(new ChatMessage(ChatRole.User, evaluationPrompt)); + + return evaluationInstructions; + } + + private static async ValueTask ParseEvaluationResponseAsync( + NumericMetric metric, + ChatResponse evaluationResponse, + TimeSpan evaluationDuration, + ChatConfiguration chatConfiguration, + CancellationToken cancellationToken) + { + IntentResolutionRating rating; + + string evaluationResponseText = evaluationResponse.Text.Trim(); + if (string.IsNullOrEmpty(evaluationResponseText)) + { + rating = IntentResolutionRating.Inconclusive; + metric.AddDiagnostics( + EvaluationDiagnostic.Error("The model failed to produce a valid evaluation response.")); + } + else + { + try + { + rating = IntentResolutionRating.FromJson(evaluationResponseText); + } + catch (JsonException) + { + try + { + string repairedJson = + await JsonOutputFixer.RepairJsonAsync( + evaluationResponseText, + chatConfiguration, + cancellationToken).ConfigureAwait(false); + + if (string.IsNullOrWhiteSpace(repairedJson)) + { + rating = IntentResolutionRating.Inconclusive; + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $""" + Failed to repair the following response from the model and parse the score for '{IntentResolutionMetricName}': + {evaluationResponseText} + """)); + } + else + { + rating = IntentResolutionRating.FromJson(repairedJson); + } + } + catch (JsonException ex) + { + rating = IntentResolutionRating.Inconclusive; + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $""" + Failed to repair the following response from the model and 
parse the score for '{IntentResolutionMetricName}': + {evaluationResponseText} + {ex} + """)); + } + } + } + + UpdateMetric(); + + void UpdateMetric() + { + metric.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); + metric.Value = rating.ResolutionScore; + metric.Interpretation = metric.InterpretScore(); + metric.Reason = rating.Explanation; + + if (!string.IsNullOrWhiteSpace(rating.AgentPerceivedIntent)) + { + metric.AddOrUpdateMetadata("agent_perceived_intent", rating.AgentPerceivedIntent!); + } + + if (!string.IsNullOrWhiteSpace(rating.ActualUserIntent)) + { + metric.AddOrUpdateMetadata("actual_user_intent", rating.ActualUserIntent!); + } + + metric.AddOrUpdateMetadata("conversation_has_intent", rating.ConversationHasIntent.ToString()); + metric.AddOrUpdateMetadata("correct_intent_detected", rating.CorrectIntentDetected.ToString()); + metric.AddOrUpdateMetadata("intent_resolved", rating.IntentResolved.ToString()); + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs new file mode 100644 index 00000000000..c8dcbc996b7 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs @@ -0,0 +1,88 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// Contextual information that the uses to evaluate an AI system's +/// effectiveness at identifying and resolving user intent. +/// +/// +/// +/// evaluates an AI system's effectiveness at identifying and resolving user +/// intent based on the supplied conversation history and the tool definitions supplied via +/// . 
+/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +[Experimental("AIEVAL001")] +public sealed class IntentResolutionEvaluatorContext : EvaluationContext +{ + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public IntentResolutionEvaluatorContext(IEnumerable toolDefinitions) + : base(name: IntentResolutionContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) + { + ToolDefinitions = [.. toolDefinitions]; + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public IntentResolutionEvaluatorContext(params AITool[] toolDefinitions) + : this(toolDefinitions as IEnumerable) + { + } + + /// + /// Gets the unique that is used for + /// . + /// + public static string IntentResolutionContextName => "Tool Definitions (Intent Resolution)"; + + /// + /// Gets the set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// + /// evaluates an AI system's effectiveness at identifying and resolving user + /// intent based on the supplied conversation history and the tool definitions supplied via + /// . + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. 
Any other definitions that are supplied via + /// will be ignored. + /// + /// + public IReadOnlyList ToolDefinitions { get; } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs new file mode 100644 index 00000000000..a1d9b0ef90e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs @@ -0,0 +1,83 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Text.Json; +using System.Text.Json.Serialization; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal sealed class IntentResolutionRating +{ + public static IntentResolutionRating Inconclusive { get; } = + new IntentResolutionRating( + resolutionScore: 0, + explanation: string.Empty, + agentPerceivedIntent: string.Empty, + actualUserIntent: string.Empty, + conversationHasIntent: false, + correctIntentDetected: false, + intentResolved: false); + + [JsonRequired] + [JsonPropertyName("resolution_score")] + public int ResolutionScore { get; set; } + + [JsonRequired] + [JsonPropertyName("explanation")] + public string Explanation { get; set; } + + [JsonRequired] + [JsonPropertyName("agent_perceived_intent")] + public string AgentPerceivedIntent { get; set; } + + [JsonRequired] + [JsonPropertyName("actual_user_intent")] + public string ActualUserIntent { get; set; } + + [JsonRequired] + [JsonPropertyName("conversation_has_intent")] + public bool ConversationHasIntent { get; set; } + + [JsonRequired] + [JsonPropertyName("correct_intent_detected")] + public bool CorrectIntentDetected { get; set; } + + [JsonRequired] + [JsonPropertyName("intent_resolved")] + public bool IntentResolved { 
get; set; } + + private const int MinValue = 1; + private const int MaxValue = 5; + + public bool IsInconclusive => ResolutionScore < MinValue || ResolutionScore > MaxValue; + + [JsonConstructor] +#pragma warning disable S107 // Methods should not have too many parameters + public IntentResolutionRating( + int resolutionScore, + string explanation, + string agentPerceivedIntent, + string actualUserIntent, + bool conversationHasIntent, + bool correctIntentDetected, + bool intentResolved) +#pragma warning restore S107 + { + ResolutionScore = resolutionScore; + Explanation = explanation; + AgentPerceivedIntent = agentPerceivedIntent; + ActualUserIntent = actualUserIntent; + ConversationHasIntent = conversationHasIntent; + CorrectIntentDetected = correctIntentDetected; + IntentResolved = intentResolved; + } + + public static IntentResolutionRating FromJson(string jsonResponse) + { + ReadOnlySpan trimmed = JsonOutputFixer.TrimMarkdownDelimiters(jsonResponse); + return JsonSerializer.Deserialize(trimmed, SerializerContext.Default.IntentResolutionRating)!; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/JsonSerialization/SerializerContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/JsonSerialization/SerializerContext.cs new file mode 100644 index 00000000000..588b3d23a7e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/JsonSerialization/SerializerContext.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Text.Json.Serialization; + +namespace Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; + +[JsonSourceGenerationOptions( + WriteIndented = true, + AllowTrailingCommas = true, + PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)] +[JsonSerializable(typeof(RelevanceTruthAndCompletenessRating))] +[JsonSerializable(typeof(IntentResolutionRating))] +internal sealed partial class SerializerContext : JsonSerializerContext; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs index 86fb950e720..3946853a2a4 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs @@ -75,12 +75,11 @@ public async ValueTask EvaluateAsync( var metric = new NumericMetric(RelevanceMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || - string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"The ${messages} supplied for evaluation did not contain a user request as the last message.")); + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs deleted file mode 100644 index 8ff913fefe7..00000000000 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. 
-// The .NET Foundation licenses this file to you under the MIT license. - -using System; -using System.Text.Json; -using System.Text.Json.Serialization; -using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; - -namespace Microsoft.Extensions.AI.Evaluation.Quality; - -public partial class RelevanceTruthAndCompletenessEvaluator -{ - internal sealed class Rating - { - public static Rating Inconclusive { get; } = new Rating(relevance: -1, truth: -1, completeness: -1); - - public int Relevance { get; } - public string? RelevanceReasoning { get; } - public string[] RelevanceReasons { get; } = []; - - public int Truth { get; } - public string? TruthReasoning { get; } - public string[] TruthReasons { get; } = []; - - public int Completeness { get; } - public string? CompletenessReasoning { get; } - public string[] CompletenessReasons { get; } = []; - - public string? Error { get; } - - private const int MinValue = 1; - private const int MaxValue = 5; - -#pragma warning disable S1067 // Expressions should not be too complex. - public bool IsInconclusive => - Error is not null || - Relevance < MinValue || Relevance > MaxValue || - Truth < MinValue || Truth > MaxValue || - Completeness < MinValue || Completeness > MaxValue; -#pragma warning restore S1067 - - public Rating(int relevance, int truth, int completeness, string? error = null) - { - (Relevance, Truth, Completeness, Error) = (relevance, truth, completeness, error); - } - - [JsonConstructor] -#pragma warning disable S107 // Methods should not have too many parameters. - public Rating( - int relevance, string? relevanceReasoning, string[] relevanceReasons, - int truth, string? truthReasoning, string[] truthReasons, - int completeness, string? completenessReasoning, string[] completenessReasons, - string? 
error = null) -#pragma warning restore S107 - { - (Relevance, RelevanceReasoning, RelevanceReasons, - Truth, TruthReasoning, TruthReasons, - Completeness, CompletenessReasoning, CompletenessReasons, - Error) = - (relevance, relevanceReasoning, relevanceReasons ?? [], - truth, truthReasoning, truthReasons ?? [], - completeness, completenessReasoning, completenessReasons ?? [], - error); - } - - public static Rating FromJson(string jsonResponse) - { - ReadOnlySpan trimmed = JsonOutputFixer.TrimMarkdownDelimiters(jsonResponse); - return JsonSerializer.Deserialize(trimmed, SerializerContext.Default.Rating)!; - } - } -} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs deleted file mode 100644 index 211213d4873..00000000000 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using System.Text.Json.Serialization; - -namespace Microsoft.Extensions.AI.Evaluation.Quality; - -public partial class RelevanceTruthAndCompletenessEvaluator -{ - [JsonSourceGenerationOptions( - WriteIndented = true, - AllowTrailingCommas = true, - PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)] - [JsonSerializable(typeof(Rating))] - internal sealed partial class SerializerContext : JsonSerializerContext; -} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index d10c259a4de..4eb41b15361 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -43,7 +43,7 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; /// Tutorial: Evaluate a model's response with response caching and reporting. 
/// [Experimental("AIEVAL001")] -public sealed partial class RelevanceTruthAndCompletenessEvaluator : IEvaluator +public sealed class RelevanceTruthAndCompletenessEvaluator : IEvaluator { /// /// Gets the of the returned by @@ -97,7 +97,7 @@ public async ValueTask EvaluateAsync( { result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( - $"The ${messages} supplied for evaluation did not contain a user request as the last message.")); + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); return result; } @@ -271,12 +271,12 @@ private static async ValueTask ParseEvaluationResponseAsync( ChatConfiguration chatConfiguration, CancellationToken cancellationToken) { - Rating rating; + RelevanceTruthAndCompletenessRating rating; string evaluationResponseText = evaluationResponse.Text.Trim(); if (string.IsNullOrEmpty(evaluationResponseText)) { - rating = Rating.Inconclusive; + rating = RelevanceTruthAndCompletenessRating.Inconclusive; result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error("The model failed to produce a valid evaluation response.")); } @@ -284,7 +284,7 @@ private static async ValueTask ParseEvaluationResponseAsync( { try { - rating = Rating.FromJson(evaluationResponseText); + rating = RelevanceTruthAndCompletenessRating.FromJson(evaluationResponseText); } catch (JsonException) { @@ -298,26 +298,26 @@ await JsonOutputFixer.RepairJsonAsync( if (string.IsNullOrWhiteSpace(repairedJson)) { - rating = Rating.Inconclusive; + rating = RelevanceTruthAndCompletenessRating.Inconclusive; result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( $""" - Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.: + Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}': {evaluationResponseText} """)); } else { - 
rating = Rating.FromJson(repairedJson); + rating = RelevanceTruthAndCompletenessRating.FromJson(repairedJson); } } catch (JsonException ex) { - rating = Rating.Inconclusive; + rating = RelevanceTruthAndCompletenessRating.Inconclusive; result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( $""" - Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.: + Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}': {evaluationResponseText} {ex} """)); @@ -336,10 +336,7 @@ void UpdateResult() relevance.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); relevance.Value = rating.Relevance; relevance.Interpretation = relevance.InterpretScore(); - if (!string.IsNullOrWhiteSpace(rating.RelevanceReasoning)) - { - relevance.Reason = rating.RelevanceReasoning!; - } + relevance.Reason = rating.RelevanceReasoning; if (rating.RelevanceReasons.Any()) { @@ -351,10 +348,7 @@ void UpdateResult() truth.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); truth.Value = rating.Truth; truth.Interpretation = truth.InterpretScore(); - if (!string.IsNullOrWhiteSpace(rating.TruthReasoning)) - { - truth.Reason = rating.TruthReasoning!; - } + truth.Reason = rating.TruthReasoning; if (rating.TruthReasons.Any()) { @@ -366,21 +360,13 @@ void UpdateResult() completeness.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); completeness.Value = rating.Completeness; completeness.Interpretation = completeness.InterpretScore(); - if (!string.IsNullOrWhiteSpace(rating.CompletenessReasoning)) - { - completeness.Reason = rating.CompletenessReasoning!; - } + completeness.Reason = rating.CompletenessReasoning; if (rating.CompletenessReasons.Any()) { string value = string.Join(Separator, rating.CompletenessReasons); completeness.AddOrUpdateMetadata(name: Rationales, value); } 
- - if (!string.IsNullOrWhiteSpace(rating.Error)) - { - result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!)); - } } } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs new file mode 100644 index 00000000000..83c76a1825e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs @@ -0,0 +1,84 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Text.Json; +using System.Text.Json.Serialization; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal sealed class RelevanceTruthAndCompletenessRating +{ + public static RelevanceTruthAndCompletenessRating Inconclusive { get; } = + new RelevanceTruthAndCompletenessRating( + relevance: 0, + relevanceReasoning: string.Empty, + relevanceReasons: [], + truth: 0, + truthReasoning: string.Empty, + truthReasons: [], + completeness: 0, + completenessReasoning: string.Empty, + completenessReasons: []); + + [JsonRequired] + public int Relevance { get; set; } + + [JsonRequired] + public string RelevanceReasoning { get; set; } + + [JsonRequired] + public string[] RelevanceReasons { get; set; } + + [JsonRequired] + public int Truth { get; set; } + + [JsonRequired] + public string TruthReasoning { get; set; } + + [JsonRequired] + public string[] TruthReasons { get; set; } + + [JsonRequired] + public int Completeness { get; set; } + + [JsonRequired] + public string CompletenessReasoning { get; set; } + + [JsonRequired] + public string[] CompletenessReasons { get; set; } + + private const int MinValue = 1; + private const int MaxValue = 5; + 
+#pragma warning disable S1067 // Expressions should not be too complex. + public bool IsInconclusive => + Relevance < MinValue || Relevance > MaxValue || + Truth < MinValue || Truth > MaxValue || + Completeness < MinValue || Completeness > MaxValue; +#pragma warning restore S1067 + + [JsonConstructor] +#pragma warning disable S107 // Methods should not have too many parameters. + public RelevanceTruthAndCompletenessRating( + int relevance, string relevanceReasoning, string[] relevanceReasons, + int truth, string truthReasoning, string[] truthReasons, + int completeness, string completenessReasoning, string[] completenessReasons) +#pragma warning restore S107 + { + (Relevance, RelevanceReasoning, RelevanceReasons, + Truth, TruthReasoning, TruthReasons, + Completeness, CompletenessReasoning, CompletenessReasons) = + (relevance, relevanceReasoning, relevanceReasons ?? [], + truth, truthReasoning, truthReasons ?? [], + completeness, completenessReasoning, completenessReasons ?? []); + } + + public static RelevanceTruthAndCompletenessRating FromJson(string jsonResponse) + { + ReadOnlySpan trimmed = JsonOutputFixer.TrimMarkdownDelimiters(jsonResponse); + return JsonSerializer.Deserialize(trimmed, SerializerContext.Default.RelevanceTruthAndCompletenessRating)!; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs index 9ecfbb182f5..cd2f94456e6 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs @@ -80,12 +80,11 @@ public async ValueTask EvaluateAsync( var metric = new NumericMetric(RetrievalMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || - string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.TryGetUserRequest(out ChatMessage? 
userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"The ${messages} supplied for evaluation did not contain a user request as the last message.")); + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); return result; } @@ -95,7 +94,16 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(RetrievalEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type {nameof(RetrievalEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); + + return result; + } + + if (context.RetrievedContextChunks.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(RetrievalEvaluatorContext)} did not contain any {nameof(RetrievalEvaluatorContext.RetrievedContextChunks)}.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs new file mode 100644 index 00000000000..cf4ba4073ee --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs @@ -0,0 +1,268 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// An that evaluates an AI system's effectiveness at adhering to the task assigned to it. 
+/// +/// +/// +/// measures how accurately an AI system adheres to the task assigned to it by +/// examining the alignment of the supplied response with instructions and definitions present in the conversation +/// history, the accuracy and clarity of the response, and the proper use of tool definitions supplied via +/// . +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +/// returns a that contains a score for 'Task +/// Adherence'. The score is a number between 1 and 5, with 1 indicating a poor score, and 5 indicating an excellent +/// score. +/// +/// +/// Note: is an AI-based evaluator that uses an AI model to perform its +/// evaluation. While the prompt that this evaluator uses to perform its evaluation is designed to be model-agnostic, +/// the performance of this prompt (and the resulting evaluation) can vary depending on the model used, and can be +/// especially poor when a smaller / local model is used. +/// +/// +/// The prompt that uses has been tested against (and tuned to work well with) +/// the following models. So, using this evaluator with a model from the following list is likely to produce the best +/// results. (The model to be used can be configured via .) +/// +/// +/// GPT-4o +/// +/// +[Experimental("AIEVAL001")] +public sealed class TaskAdherenceEvaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . 
+ /// + public static string TaskAdherenceMetricName => "Task Adherence"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [TaskAdherenceMetricName]; + + private static readonly ChatOptions _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + MaxOutputTokens = 800, + TopP = 1.0f, + PresencePenalty = 0.0f, + FrequencyPenalty = 0.0f, + ResponseFormat = ChatResponseFormat.Text + }; + + /// + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + _ = Throw.IfNull(chatConfiguration); + + var metric = new NumericMetric(TaskAdherenceMetricName); + var result = new EvaluationResult(metric); + + if (!messages.Any()) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + "The conversation history supplied for evaluation did not include any messages.")); + + return result; + } + + if (!modelResponse.Messages.Any()) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation did not include any messages.")); + + return result; + } + + TaskAdherenceEvaluatorContext? context = + additionalContext?.OfType().FirstOrDefault(); + + if (context is not null && context.ToolDefinitions.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(TaskAdherenceEvaluatorContext)} did not contain any {nameof(TaskAdherenceEvaluatorContext.ToolDefinitions)}.")); + + return result; + } + + var toolDefinitionNames = new HashSet(context?.ToolDefinitions.Select(td => td.Name) ?? 
[]); + IEnumerable toolCalls = + modelResponse.Messages.SelectMany(m => m.Contents).OfType(); + + if (toolCalls.Any(t => !toolDefinitionNames.Contains(t.Name))) + { + if (context is null) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not supplied via {nameof(TaskAdherenceEvaluatorContext)}.")); + } + else + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not included in the supplied {nameof(TaskAdherenceEvaluatorContext)}.")); + } + + return result; + } + + List evaluationInstructions = GetEvaluationInstructions(messages, modelResponse, context); + + (ChatResponse evaluationResponse, TimeSpan evaluationDuration) = + await TimingHelper.ExecuteWithTimingAsync(() => + chatConfiguration.ChatClient.GetResponseAsync( + evaluationInstructions, + _chatOptions, + cancellationToken)).ConfigureAwait(false); + + _ = metric.TryParseEvaluationResponseWithTags(evaluationResponse, evaluationDuration); + + if (context is not null) + { + metric.AddOrUpdateContext(context); + } + + metric.Interpretation = metric.InterpretScore(); + return result; + } + + private static List GetEvaluationInstructions( + IEnumerable messages, + ChatResponse modelResponse, + TaskAdherenceEvaluatorContext? context) + { + string renderedConversation = messages.RenderAsJson(); + string renderedModelResponse = modelResponse.RenderAsJson(); + string? renderedToolDefinitions = context?.ToolDefinitions.RenderAsJson(); + +#pragma warning disable S103 // Lines should not be too long + string systemPrompt = + $$""" + # Instruction + ## Context + ### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided. 
+ - **Definition**: Based on the provided query, response, and tool definitions, evaluate the agent's adherence to the assigned task. + - **Data**: Your input data includes query, response, and tool definitions. + - **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways. + + # Definition + + **Level 1: Fully Inadherent** + + **Definition:** + Response completely ignores instructions or deviates significantly + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Paris is a lovely city with a rich history. + + Explanation: This response completely misses the task by not providing any itinerary details. It offers a generic statement about Paris rather than a structured travel plan. + + + **Level 2: Barely Adherent** + + **Definition:** + Response partially aligns with instructions but has critical gaps. + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Spend your weekend visiting famous places in Paris. + + Explanation: While the response hints at visiting well-known sites, it is extremely vague and lacks specific details, such as which sites to visit or any order of activities, leaving major gaps in the instructions. + + + **Level 3: Moderately Adherent** + + **Definition:** + Response meets the core requirements but lacks precision or clarity. + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday. + + Explanation: This answer meets the basic requirement by naming a few key attractions and assigning them to specific days. However, it lacks additional context, such as timings, additional activities, or details to make the itinerary practical and clear. + + + **Level 4: Mostly Adherent** + + **Definition:** + Response is clear, accurate, and aligns with instructions with minor issues. 
+ + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** For a weekend in Paris, start Saturday with a morning visit to the Eiffel Tower, then head to the Louvre in the early afternoon. In the evening, enjoy a leisurely walk along the Seine. On Sunday, begin with a visit to Notre-Dame Cathedral, followed by exploring the art and cafés in Montmartre. This plan offers a mix of cultural visits and relaxing experiences. + + Explanation: This response is clear, structured, and provides a concrete itinerary with specific attractions and a suggested order of activities. It is accurate and useful, though it might benefit from a few more details like exact timings or restaurant suggestions to be perfect. + + + **Level 5: Fully Adherent** + + **Definition:** + Response is flawless, accurate, and follows instructions to the letter. + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Here is a detailed weekend itinerary in Paris: + Saturday: + Morning: Begin your day with a visit to the Eiffel Tower to admire the views from the top. + Early Afternoon: Head to the Louvre for a guided tour of its most famous exhibits. + Late Afternoon: Take a relaxing walk along the Seine, stopping at local boutiques. + Evening: Enjoy dinner at a classic Parisian bistro near the river. + Sunday: + Morning: Visit the Notre-Dame Cathedral to explore its architecture and history. + Midday: Wander the charming streets of Montmartre, stopping by art galleries and cafés. + Afternoon: Finish your trip with a scenic boat tour on the Seine. + This itinerary balances cultural immersion, leisure, and local dining experiences, ensuring a well-rounded visit. + + Explanation: This response is comprehensive and meticulously follows the instructions. It provides detailed steps, timings, and a variety of activities that fully address the query, leaving no critical gaps. 
+ + # Data + Query: {{renderedConversation}} + Response: {{renderedModelResponse}} + Tool Definitions: {{renderedToolDefinitions}} + + # Tasks + ## Please provide your assessment Score for the previous answer. Your output should include the following information: + - **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:". + - **Explanation**: a very short explanation of why you think the input data should get that Score. + - **Score**: based on your previous analysis, provide your Score. The answer you give MUST be an integer score ("1", "2", ...) based on the categories of the definitions. + + ## Please provide your answers between the tags: your chain of thoughts, your explanation, your score. + # Output + """; +#pragma warning restore S103 + + List evaluationInstructions = [new ChatMessage(ChatRole.System, systemPrompt)]; + return evaluationInstructions; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs new file mode 100644 index 00000000000..3d54ed74dab --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs @@ -0,0 +1,90 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// Contextual information that the uses to evaluate an AI system's +/// effectiveness at adhering to the task assigned to it. 
+/// +/// +/// measures how accurately an AI system adheres to the task assigned to it by +/// examining the alignment of the supplied response with instructions and definitions present in the conversation +/// history, the accuracy and clarity of the response, and the proper use of tool definitions supplied via +/// . +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +[Experimental("AIEVAL001")] +public sealed class TaskAdherenceEvaluatorContext : EvaluationContext +{ + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public TaskAdherenceEvaluatorContext(IEnumerable toolDefinitions) + : base(name: TaskAdherenceContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) + { + ToolDefinitions = [.. toolDefinitions]; + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public TaskAdherenceEvaluatorContext(params AITool[] toolDefinitions) + : this(toolDefinitions as IEnumerable) + { + } + + /// + /// Gets the unique that is used for + /// . + /// + public static string TaskAdherenceContextName => "Tool Definitions (Task Adherence)"; + + /// + /// Gets the set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. 
+ /// + /// + /// + /// measures how accurately an AI system adheres to the task assigned to it by + /// examining the alignment of the supplied response with instructions and definitions present in the conversation + /// history, the accuracy and clarity of the response, and the proper use of tool definitions supplied via + /// . + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that are + /// defined as s. Any other definitions that are supplied via + /// will be ignored. + /// + /// + public IReadOnlyList ToolDefinitions { get; } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs new file mode 100644 index 00000000000..5b3631bf598 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs @@ -0,0 +1,226 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// An that evaluates an AI system's effectiveness at using the tools supplied to it. +/// +/// +/// +/// measures how accurately an AI system uses tools by examining tool calls +/// (i.e., s) present in the supplied response to assess the relevance of these tool +/// calls to the conversation, the parameter correctness for these tool calls with regard to the tool definitions +/// supplied via , and the accuracy of the parameter +/// value extraction from the supplied conversation. +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. 
Any other definitions that are supplied via +/// will be ignored. +/// +/// +/// returns a that contains a score for 'Tool Call +/// Accuracy'. The score is if the tool call is irrelevant or contains information not present +/// in the conversation and if the tool call is relevant with properly extracted parameters +/// from the conversation. +/// +/// +/// Note: is an AI-based evaluator that uses an AI model to perform its +/// evaluation. While the prompt that this evaluator uses to perform its evaluation is designed to be model-agnostic, +/// the performance of this prompt (and the resulting evaluation) can vary depending on the model used, and can be +/// especially poor when a smaller / local model is used. +/// +/// +/// The prompt that uses has been tested against (and tuned to work well with) +/// the following models. So, using this evaluator with a model from the following list is likely to produce the best +/// results. (The model to be used can be configured via .) +/// +/// +/// GPT-4o +/// +/// +[Experimental("AIEVAL001")] +public sealed class ToolCallAccuracyEvaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . + /// + public static string ToolCallAccuracyMetricName => "Tool Call Accuracy"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [ToolCallAccuracyMetricName]; + + private static readonly ChatOptions _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + MaxOutputTokens = 800, + TopP = 1.0f, + PresencePenalty = 0.0f, + FrequencyPenalty = 0.0f, + ResponseFormat = ChatResponseFormat.Text + }; + + /// + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? 
additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + _ = Throw.IfNull(chatConfiguration); + + var metric = new BooleanMetric(ToolCallAccuracyMetricName); + var result = new EvaluationResult(metric); + + if (!messages.Any()) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + "The conversation history supplied for evaluation did not include any messages.")); + + return result; + } + + IEnumerable toolCalls = + modelResponse.Messages.SelectMany(m => m.Contents).OfType(); + + if (!toolCalls.Any()) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation did not contain any tool calls (i.e., {nameof(FunctionCallContent)}s).")); + + return result; + } + + if (additionalContext?.OfType().FirstOrDefault() + is not ToolCallAccuracyEvaluatorContext context) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"A value of type {nameof(ToolCallAccuracyEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); + + return result; + } + + if (context.ToolDefinitions.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(ToolCallAccuracyEvaluatorContext)} did not contain any {nameof(ToolCallAccuracyEvaluatorContext.ToolDefinitions)}.")); + + return result; + } + + var toolDefinitionNames = new HashSet(context.ToolDefinitions.Select(td => td.Name)); + + if (toolCalls.Any(t => !toolDefinitionNames.Contains(t.Name))) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not included in the supplied {nameof(ToolCallAccuracyEvaluatorContext)}.")); + + return result; + } + + List evaluationInstructions = GetEvaluationInstructions(messages, modelResponse, context); + + (ChatResponse evaluationResponse, TimeSpan evaluationDuration) = + await TimingHelper.ExecuteWithTimingAsync(() => + 
chatConfiguration.ChatClient.GetResponseAsync( + evaluationInstructions, + _chatOptions, + cancellationToken)).ConfigureAwait(false); + + _ = metric.TryParseEvaluationResponseWithTags(evaluationResponse, evaluationDuration); + metric.AddOrUpdateContext(context); + metric.Interpretation = metric.InterpretScore(); + return result; + } + + private static List GetEvaluationInstructions( + IEnumerable messages, + ChatResponse modelResponse, + ToolCallAccuracyEvaluatorContext context) + { +#pragma warning disable S103 // Lines should not be too long + const string SystemPrompt = + """ + # Instruction + ## Goal + ### You are an expert in evaluating the accuracy of a tool call considering relevance and potential usefulness including syntactic and semantic correctness of a proposed tool call from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. + - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. + - **Data**: Your input data include CONVERSATION , TOOL CALL and TOOL DEFINITION. + - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways. 
+ """; +#pragma warning restore S103 + + List evaluationInstructions = [new ChatMessage(ChatRole.System, SystemPrompt)]; + + string renderedConversation = messages.RenderText(); + string renderedToolCallsAndResults = modelResponse.RenderToolCallsAndResultsAsJson(); + string renderedToolDefinitions = context.ToolDefinitions.RenderAsJson(); + +#pragma warning disable S103 // Lines should not be too long + string evaluationPrompt = + $$""" + # Definition + **Tool Call Accuracy** refers to the relevance and potential usefulness of a TOOL CALL in the context of an ongoing CONVERSATION and EXTRACTION of RIGHT PARAMETER VALUES from the CONVERSATION.It assesses how likely the TOOL CALL is to contribute meaningfully to the CONVERSATION and help address the user's needs. Focus on evaluating the potential value of the TOOL CALL within the specific context of the given CONVERSATION, without making assumptions beyond the provided information. + Consider the following factors in your evaluation: + + 1. Relevance: How well does the proposed tool call align with the current topic and flow of the conversation? + 2. Parameter Appropriateness: Do the parameters used in the TOOL CALL match the TOOL DEFINITION and are the parameters relevant to the latest user's query? + 3. Parameter Value Correctness: Are the parameters values used in the TOOL CALL present or inferred by CONVERSATION and relevant to the latest user's query? + 4. Potential Value: Is the information this tool call might provide likely to be useful in advancing the conversation or addressing the user expressed or implied needs? + 5. Context Appropriateness: Does the tool call make sense at this point in the conversation, given what has been discussed so far? + + + # Ratings + ## [Tool Call Accuracy: 0] (Irrelevant) + **Definition:** + 1. The TOOL CALL is not relevant and will not help resolve the user's need. + 2. TOOL CALL include parameters values that are not present or inferred from CONVERSATION. + 3. 
TOOL CALL has parameters that is not present in TOOL DEFINITION. + + ## [Tool Call Accuracy: 1] (Relevant) + **Definition:** + 1. The TOOL CALL is directly relevant and very likely to help resolve the user's need. + 2. TOOL CALL include parameters values that are present or inferred from CONVERSATION. + 3. TOOL CALL has parameters that is present in TOOL DEFINITION. + + # Data + CONVERSATION : {{renderedConversation}} + TOOL CALL: {{renderedToolCallsAndResults}} + TOOL DEFINITION: {{renderedToolDefinitions}} + + + # Tasks + ## Please provide your assessment Score for the previous CONVERSATION , TOOL CALL and TOOL DEFINITION based on the Definitions above. Your output should include the following information: + - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". + - **Explanation**: a very short explanation of why you think the input Data should get that Score. + - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "0", "1") based on the levels of the definitions. + + + ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. + # Output + """; +#pragma warning restore S103 + + evaluationInstructions.Add(new ChatMessage(ChatRole.User, evaluationPrompt)); + + return evaluationInstructions; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs new file mode 100644 index 00000000000..d25e586163a --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs @@ -0,0 +1,92 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// Contextual information that the uses to evaluate an AI system's +/// effectiveness at using the tools supplied to it. +/// +/// +/// +/// measures how accurately an AI system uses tools by examining tool calls +/// (i.e., s) present in the supplied response to assess the relevance of these tool +/// calls to the conversation, the parameter correctness for these tool calls with regard to the tool definitions +/// supplied via , and the accuracy of the parameter value extraction from the supplied +/// conversation history. +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +[Experimental("AIEVAL001")] +public sealed class ToolCallAccuracyEvaluatorContext : EvaluationContext +{ + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public ToolCallAccuracyEvaluatorContext(IEnumerable toolDefinitions) + : base(name: ToolCallAccuracyContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) + { + ToolDefinitions = [.. toolDefinitions]; + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. 
+ /// + /// + public ToolCallAccuracyEvaluatorContext(params AITool[] toolDefinitions) + : this(toolDefinitions as IEnumerable) + { + } + + /// + /// Gets the unique that is used for + /// . + /// + public static string ToolCallAccuracyContextName => "Tool Definitions (Tool Call Accuracy)"; + + /// + /// Gets set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// + /// measures how accurately an AI system uses tools by examining tool calls + /// (i.e., s) present in the supplied response to assess the relevance of these + /// tool calls to the conversation, the parameter correctness for these tool calls with regard to the tool + /// definitions supplied via , and the accuracy of the parameter value extraction from + /// the supplied conversation history. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions that are supplied via + /// will be ignored. 
+ /// + /// + public IReadOnlyList ToolDefinitions { get; } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx index b38c691bbb3..6d73c8220e1 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx @@ -52,7 +52,7 @@ export const App = () => { const classes = useStyles(); const { dataset, scoreSummary, selectedTags, clearFilters, searchValue, setSearchValue } = useReportContext(); const [isSettingsOpen, setIsSettingsOpen] = useState(false); - const { renderMarkdown, setRenderMarkdown } = useReportContext(); + const { renderMarkdown, setRenderMarkdown, prettifyJson, setPrettifyJson } = useReportContext(); const { globalTags, filterableTags } = categorizeAndSortTags(dataset, scoreSummary.primaryResult.executionName); const toggleSettings = () => setIsSettingsOpen(!isSettingsOpen); @@ -127,6 +127,11 @@ export const App = () => { onChange={(_ev, data) => setRenderMarkdown(data.checked)} label={Render markdown for conversations} /> + setPrettifyJson(data.checked)} + label={Pretty print JSON content} + /> diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx index 6acf38673de..9cf40a7a574 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx @@ -18,7 +18,7 @@ export const ConversationDetails = ({ messages, model, usage, selectedMetric }: }) => { const classes = useStyles(); const [isExpanded, setIsExpanded] = useState(true); - const { renderMarkdown } 
= useReportContext(); + const { renderMarkdown, prettifyJson } = useReportContext(); const isUserSide = (role: string) => role.toLowerCase() === 'user' || role.toLowerCase() === 'system'; @@ -29,14 +29,33 @@ export const ConversationDetails = ({ messages, model, usage, selectedMetric }: usage?.totalTokenCount && `Total Tokens: ${usage.totalTokenCount}`, ].filter(Boolean).join(' • '); + const isValidJson = (text: string): { isValid: boolean; parsedJson?: any } => { + try { + const parsedJson = JSON.parse(text.trim()); + return { isValid: true, parsedJson }; + } catch { + return { isValid: false }; + } + }; + const renderContent = (content: AIContent) => { if (isTextContent(content)) { - return renderMarkdown ? - {content.text} : -
{content.text}
; + const { isValid, parsedJson } = isValidJson(content.text); + if (isValid) { + const jsonContent = JSON.stringify(parsedJson, null, prettifyJson ? 2 : 0); + return
{jsonContent}
; + } else { + return renderMarkdown ? + {content.text} : +
{content.text}
; + } } else if (isImageContent(content)) { const imageUrl = (content as UriContent).uri || (content as DataContent).uri; return Content; + } else { + // For any other content type, display the serialized JSON + const jsonContent = JSON.stringify(content, null, prettifyJson ? 2 : 0); + return
{jsonContent}
; } }; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx index 64a1e4a3c20..74a645c70b7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx @@ -11,6 +11,8 @@ export type ReportContextType = { selectScenarioLevel: (key: string) => void, renderMarkdown: boolean, setRenderMarkdown: (renderMarkdown: boolean) => void, + prettifyJson: boolean, + setPrettifyJson: (prettifyJson: boolean) => void, searchValue: string, setSearchValue: (searchValue: string) => void, selectedTags: string[], @@ -38,6 +40,10 @@ const defaultReportContext = createContext({ setRenderMarkdown: (_renderMarkdown: boolean) => { throw new Error("setRenderMarkdown function not implemented"); }, + prettifyJson: true, + setPrettifyJson: (_prettifyJson: boolean) => { + throw new Error("setPrettifyJson function not implemented"); + }, searchValue: '', setSearchValue: (_searchValue: string | undefined) => { throw new Error("setSearchValue function not implemented"); }, selectedTags: [], @@ -65,6 +71,7 @@ export const useReportContext = () => { const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): ReportContextType => { const [selectedScenarioLevel, setSelectedScenarioLevel] = useState(undefined); const [renderMarkdown, setRenderMarkdown] = useState(true); + const [prettifyJson, setPrettifyJson] = useState(true); const [selectedTags, setSelectedTags] = useState([]); const [searchValue, setSearchValue] = useState(""); @@ -114,7 +121,7 @@ const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): } return null; - }; + }; return srch(node); } @@ -126,6 +133,8 @@ const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): 
selectScenarioLevel, renderMarkdown, setRenderMarkdown, + prettifyJson, + setPrettifyJson, searchValue, setSearchValue, selectedTags, @@ -133,4 +142,4 @@ const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): clearFilters, filterTree, }; -}; \ No newline at end of file +}; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs index 8a0ffcbd31b..f6ab393750b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs @@ -73,7 +73,7 @@ internal static EvaluationMetricInterpretation InterpretContentSafetyScore( : new EvaluationMetricInterpretation( rating, failed: true, - reason: $"{metric.Name} is {passValue}."); + reason: $"{metric.Name} is not {passValue}."); } internal static void LogJsonData(this EvaluationMetric metric, string data) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs index f65ddae4662..24408d5a1ad 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs @@ -87,6 +87,6 @@ private static GroundednessProEvaluatorContext GetRelevantContext( } throw new InvalidOperationException( - $"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); + $"A value of type {nameof(GroundednessProEvaluatorContext)} was not found in the {nameof(additionalContext)} collection."); } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs index 06019969345..4b3fe84cb4e 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs @@ -91,6 +91,6 @@ private static UngroundedAttributesEvaluatorContext GetRelevantContext( } throw new InvalidOperationException( - $"A value of type '{nameof(UngroundedAttributesEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); + $"A value of type {nameof(UngroundedAttributesEvaluatorContext)} was not found in the {nameof(additionalContext)} collection."); } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs new file mode 100644 index 00000000000..134d6a50f32 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs @@ -0,0 +1,280 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.ComponentModel; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Reporting; +using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; +using Microsoft.Extensions.AI.Evaluation.Tests; +using Microsoft.TestUtilities; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; + +[Experimental("AIEVAL001")] +public class AgentQualityEvaluatorTests +{ + private static readonly ChatOptions? _chatOptions; + private static readonly ChatOptions? _chatOptionsWithTools; + private static readonly ReportingConfiguration? 
_agentQualityReportingConfiguration; + private static readonly ReportingConfiguration? _needsContextReportingConfiguration; + + static AgentQualityEvaluatorTests() + { + if (Settings.Current.Configured) + { + _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + ResponseFormat = ChatResponseFormat.Text + }; + + _chatOptionsWithTools = + new ChatOptions + { + Temperature = 0.0f, + ResponseFormat = ChatResponseFormat.Text, + Tools = [AIFunctionFactory.Create(GetOrders), AIFunctionFactory.Create(GetOrderStatus)] + }; + + ChatConfiguration chatConfiguration = Setup.CreateChatConfiguration(); + ChatClientMetadata? clientMetadata = chatConfiguration.ChatClient.GetService(); + + IChatClient chatClient = chatConfiguration.ChatClient; + IChatClient chatClientWithToolCalling = chatClient.AsBuilder().UseFunctionInvocation().Build(); + ChatConfiguration chatConfigurationWithToolCalling = new ChatConfiguration(chatClientWithToolCalling); + + string version = $"Product Version: {Constants.Version}"; + string date = $"Date: {DateTime.UtcNow:dddd, dd MMMM yyyy}"; + string projectName = $"Project: Integration Tests"; + string testClass = $"Test Class: {nameof(AgentQualityEvaluatorTests)}"; + string provider = $"Model Provider: {clientMetadata?.ProviderName ?? "Unknown"}"; + string model = $"Model: {clientMetadata?.DefaultModelId ?? 
"Unknown"}"; + string temperature = $"Temperature: {_chatOptionsWithTools.Temperature}"; + string usesContext = $"Feature: Context"; + + IEvaluator toolCallAccuracyEvaluator = new ToolCallAccuracyEvaluator(); + IEvaluator taskAdherenceEvaluator = new TaskAdherenceEvaluator(); + IEvaluator intentResolutionEvaluator = new IntentResolutionEvaluator(); + + _agentQualityReportingConfiguration = + DiskBasedReportingConfiguration.Create( + storageRootPath: Settings.Current.StorageRootPath, + evaluators: [taskAdherenceEvaluator, intentResolutionEvaluator], + chatConfiguration: chatConfigurationWithToolCalling, + executionName: Constants.Version, + tags: [version, date, projectName, testClass, provider, model, temperature]); + + _needsContextReportingConfiguration = + DiskBasedReportingConfiguration.Create( + storageRootPath: Settings.Current.StorageRootPath, + evaluators: [toolCallAccuracyEvaluator, taskAdherenceEvaluator, intentResolutionEvaluator], + chatConfiguration: chatConfigurationWithToolCalling, + executionName: Constants.Version, + tags: [version, date, projectName, testClass, provider, model, temperature, usesContext]); + } + } + + [ConditionalFact] + public async Task ToolDefinitionsAreNotNeededAndNotPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _agentQualityReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNotNeededAndNotPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationWithoutToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); + + EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? 
[]).Select(d => d.ToString()))); + + Assert.Equal(2, result.Metrics.Count); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? _)); + } + + [ConditionalFact] + public async Task ToolDefinitionsAreNotNeededButPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _agentQualityReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNotNeededButPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationWithoutToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); + + var toolDefinitionsForTaskAdherenceEvaluator = + new TaskAdherenceEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + + var toolDefinitionsForIntentResolution = + new IntentResolutionEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + + EvaluationResult result = + await scenarioRun.EvaluateAsync( + messages, + response, + additionalContext: [toolDefinitionsForTaskAdherenceEvaluator, toolDefinitionsForIntentResolution]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(2, result.Metrics.Count); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? 
_)); + } + + [ConditionalFact] + public async Task ToolDefinitionsAreNeededButNotPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _needsContextReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNeededButNotPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationWithToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); + + EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response); + + Assert.True( + result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(3, result.Metrics.Count); + Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? 
_)); + } + + [ConditionalFact] + public async Task ToolDefinitionsAreNeededAndPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _needsContextReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNeededAndPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationWithToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); + + var toolDefinitionsForToolCallAccuracyEvaluator = + new ToolCallAccuracyEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + + var toolDefinitionsForTaskAdherenceEvaluator = + new TaskAdherenceEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + + var toolDefinitionsForIntentResolutionEvaluator = + new IntentResolutionEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + + EvaluationResult result = + await scenarioRun.EvaluateAsync( + messages, + response, + additionalContext: [ + toolDefinitionsForToolCallAccuracyEvaluator, + toolDefinitionsForTaskAdherenceEvaluator, + toolDefinitionsForIntentResolutionEvaluator]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(3, result.Metrics.Count); + Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? 
_)); + } + + private static async Task<(IEnumerable messages, ChatResponse response)> + GetConversationWithoutToolsAsync(IChatClient chatClient) + { + List messages = + [ + "You are a friendly and helpful assistant that can answer questions.".ToSystemMessage(), + "Hi, could you help me figure out the correct pronunciation for the word rendezvous?".ToUserMessage() + ]; + + ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptions); + return (messages, response); + } + + private static async Task<(IEnumerable messages, ChatResponse response)> + GetConversationWithToolsAsync(IChatClient chatClient) + { + List messages = + [ + "You are a friendly and helpful customer service agent.".ToSystemMessage(), + "Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?".ToUserMessage() + ]; + + ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptionsWithTools); + return (messages, response); + } + + [Description("Gets the orders for a customer")] + private static IReadOnlyList GetOrders(int accountNumber) + { + if (accountNumber != 888) + { + throw new InvalidOperationException($"Account number {accountNumber} is not valid."); + } + + return [new Order(123), new Order(124)]; + } + + [Description("Gets the delivery status of an order")] + private static OrderStatus GetOrderStatus(int orderId) + { + if (orderId == 123) + { + return new OrderStatus(orderId, "shipped", DateTime.Now.AddDays(1)); + } + else if (orderId == 124) + { + return new OrderStatus(orderId, "delayed", DateTime.Now.AddDays(10)); + } + else + { + throw new InvalidOperationException($"Order with ID {orderId} not found."); + } + } + + private record Order(int OrderId) + { + } + + private record OrderStatus(int OrderId, string Status, DateTime ExpectedDelivery) + { + } + + [MemberNotNull(nameof(_chatOptionsWithTools))] + [MemberNotNull(nameof(_agentQualityReportingConfiguration))] + 
[MemberNotNull(nameof(_needsContextReportingConfiguration))] + private static void SkipIfNotConfigured() + { + if (!Settings.Current.Configured) + { + throw new SkipTestException("Test is not configured"); + } + + Assert.NotNull(_chatOptionsWithTools); + Assert.NotNull(_agentQualityReportingConfiguration); + Assert.NotNull(_needsContextReportingConfiguration); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs index b56a2673b60..ecec3ad51e5 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs @@ -278,6 +278,7 @@ await scenarioRun.EvaluateAsync( ReferenceEquals(context4, retrievedContextChunksForRetrievalEvaluator)); } + [MemberNotNull(nameof(_chatOptions))] [MemberNotNull(nameof(_qualityReportingConfiguration))] [MemberNotNull(nameof(_needsContextReportingConfiguration))] private static void SkipIfNotConfigured() @@ -287,6 +288,7 @@ private static void SkipIfNotConfigured() throw new SkipTestException("Test is not configured"); } + Assert.NotNull(_chatOptions); Assert.NotNull(_qualityReportingConfiguration); Assert.NotNull(_needsContextReportingConfiguration); } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs index 609646c8061..630adbffd8e 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs @@ -548,6 +548,7 @@ await _mixedQualityAndSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? 
_)); } + [MemberNotNull(nameof(_chatOptions))] [MemberNotNull(nameof(_contentSafetyReportingConfiguration))] [MemberNotNull(nameof(_imageContentSafetyReportingConfiguration))] [MemberNotNull(nameof(_codeVulnerabilityReportingConfiguration))] @@ -559,6 +560,7 @@ private static void SkipIfNotConfigured() throw new SkipTestException("Test is not configured"); } + Assert.NotNull(_chatOptions); Assert.NotNull(_contentSafetyReportingConfiguration); Assert.NotNull(_codeVulnerabilityReportingConfiguration); Assert.NotNull(_imageContentSafetyReportingConfiguration); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs index 30cb541e700..388ba1f1415 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs @@ -14,16 +14,23 @@ internal static class Setup Environment.GetEnvironmentVariable("AITESTING_OFFLINE") == "1"; internal static ChatConfiguration CreateChatConfiguration() + { + AzureOpenAIClient azureOpenAIClient = GetAzureOpenAIClient(); + IChatClient chatClient = azureOpenAIClient.GetChatClient(Settings.Current.DeploymentName).AsIChatClient(); + return new ChatConfiguration(chatClient); + } + + private static AzureOpenAIClient GetAzureOpenAIClient() { var endpoint = new Uri(Settings.Current.Endpoint); AzureOpenAIClientOptions options = new(); var credential = new ChainedTokenCredential(new AzureCliCredential(), new DefaultAzureCredential()); - AzureOpenAIClient azureClient = + + AzureOpenAIClient azureOpenAIClient = OfflineOnly ? 
new AzureOpenAIClient(endpoint, new ApiKeyCredential("Bogus"), options) : new AzureOpenAIClient(endpoint, credential, options); - IChatClient chatClient = azureClient.GetChatClient(Settings.Current.DeploymentName).AsIChatClient(); - return new ChatConfiguration(chatClient); + return azureOpenAIClient; } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/IntentResolutionRatingTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/IntentResolutionRatingTests.cs new file mode 100644 index 00000000000..da839387e20 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/IntentResolutionRatingTests.cs @@ -0,0 +1,324 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Text.Json; +using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.Tests; + +public class IntentResolutionRatingTests +{ + [Fact] + public void JsonIsValid() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + 
Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonIsSurroundedWithMarkdownSyntax() + { + string json = + """ + + ``` + { + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + ``` + + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonIsSurroundedWithMarkdownSyntaxWithJsonPrefix() + { + string json = + """ + + ```json + { + "resolution_score": 5, + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true + } + ``` + + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully 
addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonCanBeRoundTripped() + { + IntentResolutionRating rating = + new IntentResolutionRating( + resolutionScore: 1, + explanation: "explanation", + agentPerceivedIntent: "perceived intent", + actualUserIntent: "actual intent", + conversationHasIntent: false, + correctIntentDetected: true, + intentResolved: true); + + string json = JsonSerializer.Serialize(rating, SerializerContext.Default.IntentResolutionRating); + IntentResolutionRating deserialized = IntentResolutionRating.FromJson(json); + + Assert.Equal(rating.ResolutionScore, deserialized.ResolutionScore); + Assert.Equal(rating.Explanation, deserialized.Explanation); + Assert.Equal(rating.AgentPerceivedIntent, deserialized.AgentPerceivedIntent); + Assert.Equal(rating.ActualUserIntent, deserialized.ActualUserIntent); + Assert.Equal(rating.ConversationHasIntent, deserialized.ConversationHasIntent); + Assert.Equal(rating.CorrectIntentDetected, deserialized.CorrectIntentDetected); + Assert.Equal(rating.IntentResolved, deserialized.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void InconclusiveJsonCanBeRoundTripped() + { + IntentResolutionRating rating = IntentResolutionRating.Inconclusive; + + string json = JsonSerializer.Serialize(rating, SerializerContext.Default.IntentResolutionRating); + IntentResolutionRating deserialized = IntentResolutionRating.FromJson(json); + + Assert.Equal(rating.ResolutionScore, deserialized.ResolutionScore); + Assert.Equal(rating.Explanation, deserialized.Explanation); + Assert.Equal(rating.AgentPerceivedIntent, 
deserialized.AgentPerceivedIntent); + Assert.Equal(rating.ActualUserIntent, deserialized.ActualUserIntent); + Assert.Equal(rating.ConversationHasIntent, deserialized.ConversationHasIntent); + Assert.Equal(rating.CorrectIntentDetected, deserialized.CorrectIntentDetected); + Assert.Equal(rating.IntentResolved, deserialized.IntentResolved); + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithNegativeScoreIsInconclusive() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": -1 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithZeroScoreIsInconclusive() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 0 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithExcessivelyHighScoreIsInconclusive() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + 
"correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 200 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithAdditionalHallucinatedPropertyIsProcessedCorrectly() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "hallucinated_property": "Some hallucinated text.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithDuplicatePropertyUsesLastValue() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "explanation": "Duplicate explanation.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + """; + + IntentResolutionRating rating = 
IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("Duplicate explanation.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithSemicolonsInsteadOfCommasThrowsException() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake."; + "conversation_has_intent": true; + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe"; + "actual_user_intent": "bake a chocolate cake"; + "correct_intent_detected": true; + "intent_resolved": true; + "resolution_score": 5 + } + """; + + Assert.Throws<JsonException>(() => IntentResolutionRating.FromJson(json)); + } + + [Fact] + public void JsonWithMissingPropertiesThrowsException() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "intent_resolved": true, + "resolution_score": 5 + } + """; + + Assert.Throws<JsonException>(() => IntentResolutionRating.FromJson(json)); + } + + [Fact] + public void JsonWithIncorrectPropertyValueTypeThrowsException() + { + // Incorrect property value (string instead of boolean for conversation_has_intent). 
+ string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": "A string value", + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + """; + + Assert.Throws<JsonException>(() => IntentResolutionRating.FromJson(json)); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessEvaluatorRatingTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessEvaluatorRatingTests.cs deleted file mode 100644 index db7cc6e3a26..00000000000 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessEvaluatorRatingTests.cs +++ /dev/null @@ -1,148 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Text.Json; -using Microsoft.Extensions.AI.Evaluation.Quality; -using Xunit; - -namespace Microsoft.Extensions.AI.Evaluation.Tests; - -[Experimental("AIEVAL001")] -public class RelevanceTruthAndCompletenessEvaluatorRatingTests -{ - [Fact] - public void JsonIsValid() - { - string json = """ - {"relevance": 1, "truth": 5, "completeness": 4} - """; - - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - - Assert.Equal(1, rating.Relevance); - Assert.Equal(5, rating.Truth); - Assert.Equal(4, rating.Completeness); - Assert.Null(rating.RelevanceReasoning); - Assert.Null(rating.TruthReasoning); - Assert.Null(rating.CompletenessReasoning); - Assert.Empty(rating.RelevanceReasons); - Assert.Empty(rating.TruthReasons); - Assert.Empty(rating.CompletenessReasons); - Assert.False(rating.IsInconclusive); - } - - [Fact] - public void JsonIsSurroundedWithMarkdownSyntax() - { - string json = """ - - ``` - {"relevance": 1, "truth": 5, "completeness": 4} - ``` - - """; - - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - - Assert.Equal(1, rating.Relevance); - Assert.Equal(5, rating.Truth); - Assert.Equal(4, rating.Completeness); - Assert.Null(rating.RelevanceReasoning); - Assert.Null(rating.TruthReasoning); - Assert.Null(rating.CompletenessReasoning); - Assert.Empty(rating.RelevanceReasons); - Assert.Empty(rating.TruthReasons); - Assert.Empty(rating.CompletenessReasons); - Assert.False(rating.IsInconclusive); - } - - [Fact] - public void JsonIsSurroundedWithMarkdownSyntaxWithJsonPrefix() - { - string json = """ - - ```json - {"relevance": 1, "truth": 5, "completeness": 4} - ``` - - """; - - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - - Assert.Equal(1, rating.Relevance); - Assert.Equal(5, rating.Truth); - Assert.Equal(4, rating.Completeness); - Assert.Null(rating.RelevanceReasoning); - Assert.Null(rating.TruthReasoning); - 
Assert.Null(rating.CompletenessReasoning); - Assert.Empty(rating.RelevanceReasons); - Assert.Empty(rating.TruthReasons); - Assert.Empty(rating.CompletenessReasons); - Assert.False(rating.IsInconclusive); - } - - [Fact] - public void JsonCanBeRoundTripped() - { - var rating = new RelevanceTruthAndCompletenessEvaluator.Rating( - relevance: 1, - relevanceReasoning: "The response is not relevant to the request.", - relevanceReasons: ["Reason 1", "Reason 2"], - truth: 5, - truthReasoning: "The response is mostly true.", - truthReasons: ["Reason 1", "Reason 2"], - completeness: 4, - completenessReasoning: "The response is mostly complete.", - completenessReasons: ["Reason 1", "Reason 2"]); - - string json = JsonSerializer.Serialize(rating, RelevanceTruthAndCompletenessEvaluator.SerializerContext.Default.Rating); - var deserialized = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - Assert.Equal(rating.Relevance, deserialized.Relevance); - Assert.Equal(rating.RelevanceReasoning, deserialized.RelevanceReasoning); - Assert.True(rating.RelevanceReasons.SequenceEqual(deserialized.RelevanceReasons)); - Assert.Equal(rating.Truth, deserialized.Truth); - Assert.Equal(rating.TruthReasoning, deserialized.TruthReasoning); - Assert.True(rating.TruthReasons.SequenceEqual(deserialized.TruthReasons)); - Assert.Equal(rating.Completeness, deserialized.Completeness); - Assert.Equal(rating.CompletenessReasoning, deserialized.CompletenessReasoning); - Assert.True(rating.CompletenessReasons.SequenceEqual(deserialized.CompletenessReasons)); - Assert.False(rating.IsInconclusive); - } - - [Fact] - public void JsonContainsInconclusiveMetrics() - { - string json = """{"relevance": -1, "truth": 4, "completeness": 7}"""; - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - Assert.True(rating.IsInconclusive); - - json = """{"relevance": 0, "truth": -1, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - 
Assert.True(rating.IsInconclusive); - - json = """{"relevance": 0, "truth": 4, "completeness": -5}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - Assert.True(rating.IsInconclusive); - - json = """{"relevance": 10, "truth": 4, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - Assert.True(rating.IsInconclusive); - - json = """{"relevance": 0, "truth": 5, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - Assert.True(rating.IsInconclusive); - - json = """{"relevance": 1, "truth": 4, "completeness": 6}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); - Assert.True(rating.IsInconclusive); - } - - [Fact] - public void JsonContainsErrors() - { - string json = """{"relevance": 0, "truth": 2 ;"completeness": 3}"""; - Assert.Throws<JsonException>(() => RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json)); - } -} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs new file mode 100644 index 00000000000..6be1a8ba142 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs @@ -0,0 +1,382 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Linq; +using System.Text.Json; +using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.Tests; + +public class RelevanceTruthAndCompletenessRatingTests +{ + [Fact] + public void JsonIsValid() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(1, rating.Relevance); + Assert.Equal(1, rating.Truth); + Assert.Equal(1, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Equal(3, rating.TruthReasons.Length); + Assert.Contains("truth_reason_incorrect_information", rating.TruthReasons); + Assert.Contains("truth_reason_outdated_information", rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Equal(2, rating.CompletenessReasons.Length); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); + Assert.Contains("completeness_reason_genericsolution_missingcode", 
rating.CompletenessReasons); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonIsSurroundedWithMarkdownSyntax() + { + string json = + """ + + ``` + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": [], + "truth": 4, + "truthReasoning": "The reason for the truth score", + "truthReasons": [], + "completeness": 5, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": [] + } + ``` + + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(1, rating.Relevance); + Assert.Equal(4, rating.Truth); + Assert.Equal(5, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Empty(rating.RelevanceReasons); + Assert.Empty(rating.TruthReasons); + Assert.Empty(rating.CompletenessReasons); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonIsSurroundedWithMarkdownSyntaxWithJsonPrefix() + { + string json = + """ + + ```json + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 3, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_misleading_incorrectforintent"], + "completeness": 2, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution"], + } + ``` + + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(1, rating.Relevance); + Assert.Equal(3, rating.Truth); + Assert.Equal(2, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The 
reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Single(rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Single(rating.CompletenessReasons); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonCanBeRoundTripped() + { + RelevanceTruthAndCompletenessRating rating = + new RelevanceTruthAndCompletenessRating( + relevance: 1, + relevanceReasoning: "The response is not relevant to the request.", + relevanceReasons: ["Reason 1", "Reason 2"], + truth: 5, + truthReasoning: "The response is mostly true.", + truthReasons: ["Reason 1", "Reason 2"], + completeness: 4, + completenessReasoning: "The response is mostly complete.", + completenessReasons: ["Reason 1", "Reason 2"]); + + string json = JsonSerializer.Serialize(rating, SerializerContext.Default.RelevanceTruthAndCompletenessRating); + RelevanceTruthAndCompletenessRating deserialized = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(rating.Relevance, deserialized.Relevance); + Assert.Equal(rating.RelevanceReasoning, deserialized.RelevanceReasoning); + Assert.True(rating.RelevanceReasons.SequenceEqual(deserialized.RelevanceReasons)); + Assert.Equal(rating.Truth, deserialized.Truth); + Assert.Equal(rating.TruthReasoning, deserialized.TruthReasoning); + Assert.True(rating.TruthReasons.SequenceEqual(deserialized.TruthReasons)); + Assert.Equal(rating.Completeness, deserialized.Completeness); + Assert.Equal(rating.CompletenessReasoning, deserialized.CompletenessReasoning); + Assert.True(rating.CompletenessReasons.SequenceEqual(deserialized.CompletenessReasons)); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public 
void InconclusiveJsonCanBeRoundTripped() + { + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.Inconclusive; + + string json = JsonSerializer.Serialize(rating, SerializerContext.Default.RelevanceTruthAndCompletenessRating); + RelevanceTruthAndCompletenessRating deserialized = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(rating.Relevance, deserialized.Relevance); + Assert.Equal(rating.RelevanceReasoning, deserialized.RelevanceReasoning); + Assert.True(rating.RelevanceReasons.SequenceEqual(deserialized.RelevanceReasons)); + Assert.Equal(rating.Truth, deserialized.Truth); + Assert.Equal(rating.TruthReasoning, deserialized.TruthReasoning); + Assert.True(rating.TruthReasons.SequenceEqual(deserialized.TruthReasons)); + Assert.Equal(rating.Completeness, deserialized.Completeness); + Assert.Equal(rating.CompletenessReasoning, deserialized.CompletenessReasoning); + Assert.True(rating.CompletenessReasons.SequenceEqual(deserialized.CompletenessReasons)); + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithNegativeScoreIsInconclusive() + { + string json = + """ + { + "relevance": -1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithZeroScoreIsInconclusive() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason 
for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 0, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithExcessivelyHighScoreIsInconclusive() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 100, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithAdditionalHallucinatedPropertyIsProcessedCorrectly() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "hallucinatedProperty": "Some hallucinated text", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + 
"completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(1, rating.Relevance); + Assert.Equal(1, rating.Truth); + Assert.Equal(1, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Equal(3, rating.TruthReasons.Length); + Assert.Contains("truth_reason_incorrect_information", rating.TruthReasons); + Assert.Contains("truth_reason_outdated_information", rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Equal(2, rating.CompletenessReasons.Length); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); + Assert.Contains("completeness_reason_genericsolution_missingcode", rating.CompletenessReasons); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithDuplicatePropertyUsesLastValue() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasoning": "Duplicate reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", 
"completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(1, rating.Relevance); + Assert.Equal(1, rating.Truth); + Assert.Equal(1, rating.Completeness); + Assert.Equal("Duplicate reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Equal(3, rating.TruthReasons.Length); + Assert.Contains("truth_reason_incorrect_information", rating.TruthReasons); + Assert.Contains("truth_reason_outdated_information", rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Equal(2, rating.CompletenessReasons.Length); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); + Assert.Contains("completeness_reason_genericsolution_missingcode", rating.CompletenessReasons); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithSemicolonsInsteadOfCommasThrowsException() + { + string json = + """ + { + "relevance": 1; + "relevanceReasoning": "The reason for the relevance score"; + "relevanceReasons": ["relevance_reason_distant_topic"]; + "truth": 1; + "truthReasoning": "The reason for the truth score"; + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"]; + "completeness": 1; + "completenessReasoning": "The reason for the completeness score"; + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"]; + } + """; + + Assert.Throws<JsonException>(() => RelevanceTruthAndCompletenessRating.FromJson(json)); + } + + [Fact] + public void 
JsonWithMissingPropertiesThrowsException() + { + string json = + """ + { + "relevance": 1, + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + } + """; + + Assert.Throws<JsonException>(() => RelevanceTruthAndCompletenessRating.FromJson(json)); + } + + [Fact] + public void JsonWithIncorrectPropertyValueTypeThrowsException() + { + // Incorrect property value (integer instead of string for relevanceReasoning). + string json = + """ + { + "relevance": 1, + "relevanceReasoning": 6, + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + Assert.Throws<JsonException>(() => RelevanceTruthAndCompletenessRating.FromJson(json)); + } +}