From d5d337c93a4422b2a42490e1476049e2eac587c7 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Mon, 10 Mar 2025 17:34:36 -0700 Subject: [PATCH 01/18] Introduce Reason property on EvaluationMetric --- ...ceTruthAndCompletenessEvaluator.Prompts.cs | 88 +----------- .../RelevanceTruthAndCompletenessEvaluator.cs | 25 +--- ...nceTruthAndCompletenessEvaluatorOptions.cs | 41 ------ .../TypeScript/components/EvalTypes.d.ts | 4 + .../BooleanMetric.cs | 7 +- .../EvaluationMetric{T}.cs | 13 +- .../NumericMetric.cs | 7 +- .../StringMetric.cs | 7 +- .../EndToEndTests.cs | 3 +- ...vanceTruthAndCompletenessEvaluatorTests.cs | 133 ------------------ 10 files changed, 41 insertions(+), 287 deletions(-) delete mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs delete mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/RelevanceTruthAndCompletenessEvaluatorTests.cs diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs index 91d89d65531..d1dbc93a35b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs @@ -7,93 +7,7 @@ public partial class RelevanceTruthAndCompletenessEvaluator { private static class Prompts { - internal static string BuildEvaluationPrompt(string userQuery, string modelResponse, string history) - { -#pragma warning disable S103 // Lines should not be too long - return - $$""" - Read the History, User Query, and Model Response below and produce your response as a single JSON object. - Do not include any other text in your response besides the JSON object. - - The JSON object should have the following format. 
However, do not include any markdown tags in your - response. Your response should start with an open curly brace and end with a closing curly brace for the - JSON. - ``` - { - "relevance": 1, - "truth": 1, - "completeness": 1 - } - ``` - - ----- - - History: {{history}} - - ----- - - User Query: {{userQuery}} - - ----- - - Model Response: {{modelResponse}} - - ----- - - That's the History, User Query, and Model Response you will rate. Now, in 3 Steps, you will evaluate the Model Response on 3 criteria. - - ----- - - Step 1: Rate the relevance of the response. - - Regardless of truth of statements, how much of the response is directly related to the request? - - 1 = None of the response is at all related - 2 = A little of the response is directly related, or the response is a little bit related - 3 = Some of the response is related, or the response is somewhat related - 4 = Most of the response is related, or the response is mostly related - 5 = 100% of the response is 100% related to the request - - Record your response as the value of the "relevance" property in the JSON output. - - ----- - - Step 2: Rate the truth of the response. - - Read the History, Query, and Model Response again. - - Regardless of relevance, how true are the verifiable statements in the response? - - 1 = The entire response is totally false - 2 = A little of the response is true, or the response is a little bit true - 3 = Some of the response is true, or the response is somewhat true - 4 = Most of the response is true, or the response is mostly true - 5 = 100% of the response is 100% true - - Record your response as the value of the "truth" property in the JSON output. - - ----- - - Step 3: Rate the completeness of the response. - - Read the History, Query, and Model Response again. - - Regardless of whether the statements made in the response are true, how many of the points necessary to address the request, does the response contain? 
- - 1 = The response omits all points that are necessary to address the request. - 2 = The response includes a little of the points that are necessary to address the request. - 3 = The response includes some of the points that are necessary to address the request. - 4 = The response includes most of the points that are necessary to address the request. - 5 = The response includes all points that are necessary to address the request. For explain tasks, nothing is left unexplained. For improve tasks, I looked for all potential improvements, and none were left out. For fix tasks, the response purports to get the user all the way to a fixed state (regardless of whether it actually works). For "do task" responses, it does everything requested. - - Record your response as the value of the "completeness" property in the JSON output. - - ----- - """; -#pragma warning restore S103 - } - - internal static string BuildEvaluationPromptWithReasoning( + internal static string BuildEvaluationPrompt( string userQuery, string modelResponse, string history) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index 419feb45743..b1bf9e797a5 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -25,9 +25,7 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; /// for 'Relevance', 'Truth' and 'Completeness' respectively. Each score is a number between 1 and 5, with 1 indicating /// a poor score, and 5 indicating an excellent score. /// -/// Options for . -public sealed partial class RelevanceTruthAndCompletenessEvaluator( - RelevanceTruthAndCompletenessEvaluatorOptions? 
options = null) : ChatConversationEvaluator +public sealed partial class RelevanceTruthAndCompletenessEvaluator : ChatConversationEvaluator { /// /// Gets the of the returned by @@ -61,9 +59,6 @@ public sealed partial class RelevanceTruthAndCompletenessEvaluator( ResponseFormat = ChatResponseFormat.Json }; - private readonly RelevanceTruthAndCompletenessEvaluatorOptions _options = - options ?? RelevanceTruthAndCompletenessEvaluatorOptions.Default; - /// protected override EvaluationResult InitializeResult() { @@ -101,17 +96,7 @@ userRequest is not null string renderedHistory = builder.ToString(); - string prompt = - _options.IncludeReasoning - ? Prompts.BuildEvaluationPromptWithReasoning( - renderedUserRequest, - renderedModelResponse, - renderedHistory) - : Prompts.BuildEvaluationPrompt( - renderedUserRequest, - renderedModelResponse, - renderedHistory); - + string prompt = Prompts.BuildEvaluationPrompt(renderedUserRequest, renderedModelResponse, renderedHistory); return prompt; } @@ -192,7 +177,7 @@ void UpdateResult(Rating rating) relevance.Interpretation = relevance.InterpretScore(); if (!string.IsNullOrWhiteSpace(rating.RelevanceReasoning)) { - relevance.AddDiagnostic(EvaluationDiagnostic.Informational(rating.RelevanceReasoning!)); + relevance.Reason = rating.RelevanceReasoning!; } NumericMetric truth = result.Get(TruthMetricName); @@ -200,7 +185,7 @@ void UpdateResult(Rating rating) truth.Interpretation = truth.InterpretScore(); if (!string.IsNullOrWhiteSpace(rating.TruthReasoning)) { - truth.AddDiagnostic(EvaluationDiagnostic.Informational(rating.TruthReasoning!)); + truth.Reason = rating.TruthReasoning!; } NumericMetric completeness = result.Get(CompletenessMetricName); @@ -208,7 +193,7 @@ void UpdateResult(Rating rating) completeness.Interpretation = completeness.InterpretScore(); if (!string.IsNullOrWhiteSpace(rating.CompletenessReasoning)) { - completeness.AddDiagnostic(EvaluationDiagnostic.Informational(rating.CompletenessReasoning!)); + 
completeness.Reason = rating.CompletenessReasoning!; } if (!string.IsNullOrWhiteSpace(rating.Error)) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs deleted file mode 100644 index 9271b2cc4af..00000000000 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs +++ /dev/null @@ -1,41 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#pragma warning disable S3604 -// S3604: Member initializer values should not be redundant. -// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary -// constructor syntax. - -namespace Microsoft.Extensions.AI.Evaluation.Quality; - -/// -/// Options for . -/// -/// -/// If is set to , this instructs the -/// to include s (with -/// set to ) as -/// part of the returned s for 'Relevance' 'Truth' and 'Completeness' that explain the -/// reasoning behind the corresponding scores. By default, is set to -/// . -/// -public sealed class RelevanceTruthAndCompletenessEvaluatorOptions(bool includeReasoning = false) -{ - /// - /// Gets the default options for . - /// - /// - /// is set to by default. - /// - public static RelevanceTruthAndCompletenessEvaluatorOptions Default { get; } = - new RelevanceTruthAndCompletenessEvaluatorOptions(); - - /// - /// Gets a value indicating whether the should include - /// s (with set to - /// ) as part of the returned - /// s for 'Relevance' 'Truth' and 'Completeness' to explain the reasoning behind the - /// corresponding scores. By default, is set to . 
- /// - public bool IncludeReasoning { get; } = includeReasoning; -} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts index 1055df330df..3877deccb8d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts @@ -65,20 +65,24 @@ type BaseEvaluationMetric = { type MetricWithNoValue = BaseEvaluationMetric & { $type: "none"; + reason?: string; value: undefined; }; type NumericMetric = BaseEvaluationMetric & { $type: "numeric"; + reason?: string; value?: number; }; type BooleanMetric = BaseEvaluationMetric & { $type: "boolean"; + reason?: string; value?: boolean; }; type StringMetric = BaseEvaluationMetric & { $type: "string"; + reason?: string; value?: string; }; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs index bc71408ffa2..fe987382a26 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs @@ -9,4 +9,9 @@ namespace Microsoft.Extensions.AI.Evaluation; /// /// The name of the . /// The value of the . -public sealed class BooleanMetric(string name, bool? value = null) : EvaluationMetric(name, value); +/// +/// An optional string that can be used to provide some commentary around the result represented by +/// . +/// +public sealed class BooleanMetric(string name, bool? value = null, string? 
reason = null) +: EvaluationMetric(name, value, reason); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs index f0d6eea9d10..fb57d2571fb 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs @@ -20,14 +20,25 @@ public class EvaluationMetric : EvaluationMetric /// public T? Value { get; set; } + /// + /// Gets or sets a string that can optionally be used to provide some commentary around the result represented by + /// . + /// + public string? Reason { get; set; } + /// /// Initializes a new instance of the class. /// /// The name of the . /// The value of the . - protected EvaluationMetric(string name, T? value) + /// + /// An optional string that can be used to provide some commentary around the result represented by + /// . + /// + protected EvaluationMetric(string name, T? value, string? reason = null) : base(name) { Value = value; + Reason = reason; } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs index 35dec86ca63..93234a967b8 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs @@ -20,4 +20,9 @@ namespace Microsoft.Extensions.AI.Evaluation; /// /// The name of the . /// The value of the . -public sealed class NumericMetric(string name, double? value = null) : EvaluationMetric(name, value); +/// +/// An optional string that can be used to provide some commentary around the result represented by +/// . +/// +public sealed class NumericMetric(string name, double? value = null, string? 
reason = null) + : EvaluationMetric(name, value, reason); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs index b80c16fbbd8..5fed520b3a2 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs @@ -12,4 +12,9 @@ namespace Microsoft.Extensions.AI.Evaluation; /// /// The name of the . /// The value of the . -public sealed class StringMetric(string name, string? value = null) : EvaluationMetric(name, value); +/// +/// An optional string that can be used to provide some commentary around the result represented by +/// . +/// +public sealed class StringMetric(string name, string? value = null, string? reason = null) + : EvaluationMetric(name, value, reason); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs index dbfdebc529c..65801f0342f 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs @@ -33,8 +33,7 @@ static EndToEndTests() if (Settings.Current.Configured) { - var options = new RelevanceTruthAndCompletenessEvaluatorOptions(includeReasoning: true); - IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator(options); + IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator(); IEvaluator coherenceEvaluator = new CoherenceEvaluator(); IEvaluator fluencyEvaluator = new FluencyEvaluator(); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/RelevanceTruthAndCompletenessEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/RelevanceTruthAndCompletenessEvaluatorTests.cs deleted file mode 100644 index 8b479ea57cf..00000000000 --- 
a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/RelevanceTruthAndCompletenessEvaluatorTests.cs +++ /dev/null @@ -1,133 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Threading.Tasks; -using Microsoft.Extensions.AI.Evaluation.Quality; -using Microsoft.Extensions.AI.Evaluation.Reporting; -using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; -using Microsoft.TestUtilities; -using Xunit; - -namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; - -public class RelevanceTruthAndCompletenessEvaluatorTests -{ - private static readonly ChatOptions _chatOptions; - private static readonly ReportingConfiguration? _reportingConfigurationWithoutReasoning; - private static readonly ReportingConfiguration? _reportingConfigurationWithReasoning; - - static RelevanceTruthAndCompletenessEvaluatorTests() - { - _chatOptions = - new ChatOptions - { - Temperature = 0.0f, - ResponseFormat = ChatResponseFormat.Text - }; - - if (Settings.Current.Configured) - { - IEvaluator rtcEvaluatorWithoutReasoning = new RelevanceTruthAndCompletenessEvaluator(); - - _reportingConfigurationWithoutReasoning = - DiskBasedReportingConfiguration.Create( - storageRootPath: Settings.Current.StorageRootPath, - evaluators: [rtcEvaluatorWithoutReasoning], - chatConfiguration: Setup.CreateChatConfiguration(), - executionName: Constants.Version); - - var options = new RelevanceTruthAndCompletenessEvaluatorOptions(includeReasoning: true); - IEvaluator rtcEvaluatorWithReasoning = new RelevanceTruthAndCompletenessEvaluator(options); - - _reportingConfigurationWithReasoning = - DiskBasedReportingConfiguration.Create( - storageRootPath: Settings.Current.StorageRootPath, - evaluators: [rtcEvaluatorWithReasoning], - chatConfiguration: Setup.CreateChatConfiguration(), - 
executionName: Constants.Version); - } - } - - [ConditionalFact] - public async Task WithoutReasoning() - { - SkipIfNotConfigured(); - - await using ScenarioRun scenarioRun = - await _reportingConfigurationWithoutReasoning.CreateScenarioRunAsync( - scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(RelevanceTruthAndCompletenessEvaluatorTests)}.{nameof(WithoutReasoning)}"); - - IChatClient chatClient = scenarioRun.ChatConfiguration!.ChatClient; - - var messages = new List(); - string prompt = @"What is the molecular formula of ammonia?"; - ChatMessage promptMessage = prompt.ToUserMessage(); - messages.Add(promptMessage); - - ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptions); - ChatMessage responseMessage = response.Messages.Single(); - Assert.NotNull(responseMessage.Text); - - EvaluationResult result = await scenarioRun.EvaluateAsync(promptMessage, responseMessage); - - Assert.False(result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Informational)); - - NumericMetric relevance = result.Get(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName); - NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); - NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - - Assert.True(relevance.Value >= 4); - Assert.True(truth.Value >= 4); - Assert.True(completeness.Value >= 4); - } - - [ConditionalFact] - public async Task WithReasoning() - { - SkipIfNotConfigured(); - - await using ScenarioRun scenarioRun = - await _reportingConfigurationWithReasoning.CreateScenarioRunAsync( - scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(RelevanceTruthAndCompletenessEvaluatorTests)}.{nameof(WithReasoning)}"); - - IChatClient chatClient = scenarioRun.ChatConfiguration!.ChatClient; - - var messages = new List(); - string prompt = @"What is the molecular formula of glucose?"; - ChatMessage 
promptMessage = prompt.ToUserMessage(); - messages.Add(promptMessage); - - ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptions); - ChatMessage responseMessage = response.Messages.Single(); - Assert.NotNull(responseMessage.Text); - - EvaluationResult result = await scenarioRun.EvaluateAsync(promptMessage, responseMessage); - - Assert.True(result.ContainsDiagnostics(d => d.Severity == EvaluationDiagnosticSeverity.Informational)); - Assert.False(result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning)); - - NumericMetric relevance = result.Get(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName); - NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); - NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - - Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Diagnostics.Single().Message)); - Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Diagnostics.Single().Message)); - Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Diagnostics.Single().Message)); - } - - [MemberNotNull(nameof(_reportingConfigurationWithReasoning))] - [MemberNotNull(nameof(_reportingConfigurationWithoutReasoning))] - private static void SkipIfNotConfigured() - { - if (!Settings.Current.Configured) - { - throw new SkipTestException("Test is not configured"); - } - - Assert.NotNull(_reportingConfigurationWithReasoning); - Assert.NotNull(_reportingConfigurationWithoutReasoning); - } -} From 2a9426e5b6b9647ab20dfa4d9b4164c7ca33d1c7 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Mon, 10 Mar 2025 18:21:37 -0700 Subject: [PATCH 02/18] Fix mouse cursor for text containers --- .../TypeScript/components/ScenarioTree.tsx | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 7ed42c44b16..6c97626de05 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -103,6 +103,7 @@ const useStyles = makeStyles({ padding: '1rem', border: '1px solid #e0e0e0', backgroundColor: tokens.colorNeutralBackground2, + cursor: 'text', }, promptBox: { border: '1px solid #e0e0e0', @@ -110,6 +111,7 @@ const useStyles = makeStyles({ padding: '1rem', maxHeight: '20rem', overflow: 'auto', + cursor: 'text', }, promptTitleLine: { display: 'flex', @@ -187,6 +189,7 @@ export const PromptDetails = ({ history, response }: { history: string, response

Response

+
{renderResponse ? {response} :
{response}
}
From 4235f20458fc1e88e89a4d284aeef70389cef12b Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Mon, 10 Mar 2025 19:45:33 -0700 Subject: [PATCH 03/18] Move the toggle for rendering markdown under a global settings gear --- .../TypeScript/components/App.css | 3 +- .../TypeScript/components/App.tsx | 39 +++++++++++++++---- .../TypeScript/components/ScenarioTree.tsx | 32 ++++++--------- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css index 0ab7c21274b..24695e5565d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css @@ -5,7 +5,6 @@ The .NET Foundation licenses this file to you under the MIT license. #root { margin: 0 auto; - padding: 2rem; + padding: 0rem 2rem 2rem 2rem; background-color: white; } - diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx index cc2215f8e8e..237a6ba4794 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx @@ -1,6 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+import { useState } from 'react'; +import { Settings28Regular } from '@fluentui/react-icons'; +import { Drawer, DrawerBody, DrawerHeader, DrawerHeaderTitle, Switch } from '@fluentui/react-components'; import { makeStyles } from '@fluentui/react-components'; import './App.css'; import { ScoreNode } from './Summary'; @@ -12,20 +15,42 @@ type AppProperties = { }; const useStyles = makeStyles({ - footerText: { fontSize: '0.8rem', marginTop: '2rem' } -}) + header: { display: 'flex', justifyContent: 'space-between', alignItems: 'center', position: 'sticky', top: 0, backgroundColor: 'white', zIndex: 1 }, + footerText: { fontSize: '0.8rem', marginTop: '2rem' }, + closeButton: { position: 'absolute', top: '1rem', right: '1rem', cursor: 'pointer', fontSize: '1.5rem' } +}); -function App({dataset, tree}:AppProperties) { +function App({ dataset, tree }: AppProperties) { const classes = useStyles(); + const [isSettingsOpen, setIsSettingsOpen] = useState(false); + const [renderMarkdown, setRenderMarkdown] = useState(true); + + const toggleSettings = () => setIsSettingsOpen(!isSettingsOpen); + const toggleRenderMarkdown = () => setRenderMarkdown(!renderMarkdown); + const closeSettings = () => setIsSettingsOpen(false); + return ( <> -

AI Evaluation Report

+
+

AI Evaluation Report

+ +
- +

Generated at {dataset.createdAt} by Microsoft.Extensions.AI.Evaluation.Reporting version {dataset.generatorVersion}

+ + + + Settings + × + + + + + - ) + ); } -export default App +export default App; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 6c97626de05..97f87641b89 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -1,7 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -import { makeStyles, Switch, tokens, Tree, TreeItem, TreeItemLayout, TreeItemValue, TreeOpenChangeData, TreeOpenChangeEvent } from "@fluentui/react-components"; +import { makeStyles, tokens, Tree, TreeItem, TreeItemLayout, TreeItemValue, TreeOpenChangeData, TreeOpenChangeEvent } from "@fluentui/react-components"; import { useState, useCallback } from "react"; import { DefaultRootNodeName, ScoreNode, ScoreNodeType, getPromptDetails } from "./Summary"; import { PassFailBar } from "./PassFailBar"; @@ -9,7 +9,7 @@ import { MetricCardList } from "./MetricCard"; import ReactMarkdown from "react-markdown"; import { ErrorCircleRegular } from "@fluentui/react-icons"; -const ScenarioLevel = ({ node, parentPath, isOpen }: { node: ScoreNode, parentPath: string, isOpen: (path: string) => boolean }) => { +const ScenarioLevel = ({ node, parentPath, isOpen, renderMarkdown }: { node: ScoreNode, parentPath: string, isOpen: (path: string) => boolean, renderMarkdown: boolean }) => { const path = `${parentPath}.${node.name}`; if (node.isLeafNode) { return @@ -19,7 +19,7 @@ const ScenarioLevel = ({ node, parentPath, isOpen }: { node: ScoreNode, parentPa - + @@ -31,14 +31,14 @@ const ScenarioLevel = ({ node, parentPath, isOpen }: { node: ScoreNode, parentPa {node.childNodes.map((n) => ( - + ))} ; } }; 
-export const ScenarioGroup = ({ node }: { node: ScoreNode }) => { +export const ScenarioGroup = ({ node, renderMarkdown }: { node: ScoreNode, renderMarkdown: boolean }) => { const [openItems, setOpenItems] = useState>(() => new Set()); const handleOpenChange = useCallback((_: TreeOpenChangeEvent, data: TreeOpenChangeData) => { setOpenItems(data.openItems); @@ -47,11 +47,11 @@ export const ScenarioGroup = ({ node }: { node: ScoreNode }) => { return ( - + ); }; -export const ScoreDetail = ({ scenario }: { scenario: ScenarioRunResult }) => { +export const ScoreDetail = ({ scenario, renderMarkdown }: { scenario: ScenarioRunResult, renderMarkdown: boolean }) => { const classes = useStyles(); const failureMessages = []; @@ -70,7 +70,7 @@ export const ScoreDetail = ({ scenario }: { scenario: ScenarioRunResult }) => { return (
{failureMessages && failureMessages.length > 0 && } - +
); }; @@ -164,34 +164,24 @@ const ScoreNodeHeader = ({ item, showPrompt }: { item: ScoreNode, showPrompt?: b ); }; -export const PromptDetails = ({ history, response }: { history: string, response: string }) => { +export const PromptDetails = ({ history, response, renderMarkdown }: { history: string, response: string, renderMarkdown: boolean }) => { const classes = useStyles(); - const [renderPrompt, setRenderPrompt] = useState(true); - const onChangeRenderPrompt = useCallback((ev: React.ChangeEvent) => { - setRenderPrompt(ev.currentTarget.checked); - }, [setRenderPrompt]); - const [renderResponse, setRenderResponse] = useState(true); - const onChangeRenderResponse = useCallback((ev: React.ChangeEvent) => { - setRenderResponse(ev.currentTarget.checked); - }, [setRenderResponse]); return (

Prompt

-
- {renderPrompt ? {history} :
{history}
} + {renderMarkdown ? {history} :
{history}
}

Response

-
- {renderResponse ? {response} :
{response}
} + {renderMarkdown ? {response} :
{response}
}
); }; From 0ac49bebb64397603c5ceac32e0ccce64681f539 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Mon, 10 Mar 2025 20:24:58 -0700 Subject: [PATCH 04/18] Make sections for conversation and failure reasons collapsible --- .../TypeScript/components/ScenarioTree.tsx | 97 +++++++++++++------ 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 97f87641b89..8aac1c5780b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -8,6 +8,7 @@ import { PassFailBar } from "./PassFailBar"; import { MetricCardList } from "./MetricCard"; import ReactMarkdown from "react-markdown"; import { ErrorCircleRegular } from "@fluentui/react-icons"; +import { ChevronDown12Regular, ChevronRight12Regular } from '@fluentui/react-icons'; const ScenarioLevel = ({ node, parentPath, isOpen, renderMarkdown }: { node: ScoreNode, parentPath: string, isOpen: (path: string) => boolean, renderMarkdown: boolean }) => { const path = `${parentPath}.${node.name}`; @@ -91,11 +92,35 @@ const useStyles = makeStyles({ scenarioLabel: { whiteSpace: 'nowrap', fontWeight: '500', - }, + }, iterationArea: { marginTop: '1rem', marginBottom: '1rem', }, + section: { + marginTop: '2rem', + }, + sectionHeader: { + display: 'flex', + alignItems: 'center', + cursor: 'pointer', + userSelect: 'none', + marginBottom: '1rem', + }, + sectionHeaderText: { + margin: 0, + marginLeft: '0.5rem', + fontSize: '1.25rem', + fontWeight: 'bold', + }, + sectionSubHeader: { + fontSize: '0.875rem', + fontWeight: 'bold', + marginBottom: '0.5rem', + }, + sectionContent: { + marginBottom: '1.5rem', + }, failMessage: { color: tokens.colorStatusDangerForeground2, }, @@ 
-105,30 +130,38 @@ const useStyles = makeStyles({ backgroundColor: tokens.colorNeutralBackground2, cursor: 'text', }, - promptBox: { + conversationBox: { border: '1px solid #e0e0e0', borderRadius: '4px', padding: '1rem', maxHeight: '20rem', overflow: 'auto', cursor: 'text', + '& pre': { + whiteSpace: 'pre-wrap', + wordWrap: 'break-word', + }, }, - promptTitleLine: { - display: 'flex', - flexDirection: 'row', - alignItems: 'center', - }, - promptTitle: { flexGrow: 1 }, }); export const FailMessage = ({ messages }: { messages: string[] }) => { const classes = useStyles(); - return
-

Failure Reasons

-
- {messages.map((msg) => <> {msg}
)} + const [isExpanded, setIsExpanded] = useState(true); + + return ( +
+
setIsExpanded(!isExpanded)}> + {isExpanded ? : } +

Failure Reasons

+
+ + {isExpanded && ( +
+ {messages.map((msg) => <> {msg}
)} +
+ )}
-
; + ); }; const PassFailBadge = ({ pass, total }: { pass: number, total: number }) => { @@ -166,22 +199,28 @@ const ScoreNodeHeader = ({ item, showPrompt }: { item: ScoreNode, showPrompt?: b export const PromptDetails = ({ history, response, renderMarkdown }: { history: string, response: string, renderMarkdown: boolean }) => { const classes = useStyles(); + const [isExpanded, setIsExpanded] = useState(true); - return (
-
-

Prompt

-
- -
- {renderMarkdown ? {history} :
{history}
} -
- -
-

Response

-
- -
- {renderMarkdown ? {response} :
{response}
} + return ( +
+
setIsExpanded(!isExpanded)}> + {isExpanded ? : } +

Conversation

+
+ + {isExpanded && ( +
+
+
Prompt
+ {renderMarkdown ? {history} :
{history}
} +
+ +
+
Response
+ {renderMarkdown ? {response} :
{response}
} +
+
+ )}
-
); + ); }; From c654def65883cf533a431b35f4965e2c4df9513d Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Mon, 10 Mar 2025 20:37:35 -0700 Subject: [PATCH 05/18] Collapse single child nodes into parent to avoid too much spellunking down the tree --- .../TypeScript/components/ScenarioTree.tsx | 1 + .../TypeScript/components/Summary.ts | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 8aac1c5780b..1aa6aac9de5 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -11,6 +11,7 @@ import { ErrorCircleRegular } from "@fluentui/react-icons"; import { ChevronDown12Regular, ChevronRight12Regular } from '@fluentui/react-icons'; const ScenarioLevel = ({ node, parentPath, isOpen, renderMarkdown }: { node: ScoreNode, parentPath: string, isOpen: (path: string) => boolean, renderMarkdown: boolean }) => { + node.collapseSingleChildNodes(); const path = `${parentPath}.${node.name}`; if (node.isLeafNode) { return diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts index 8cef12ce4f1..7d72b1a7e63 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts @@ -104,7 +104,22 @@ export class ScoreNode { } } - + collapseSingleChildNodes() { + if (this.isLeafNode) { + return; + } + + while (this.childNodes.length === 1) { + const onlyChild = this.childNodes[0]; + this.name += ` > ${onlyChild.name}`; + 
this.children = onlyChild.children; + this.scenario = onlyChild.scenario; + } + + for (const child of this.childNodes) { + child.collapseSingleChildNodes(); + } + } }; export const DefaultRootNodeName = "All Evaluations"; From 3df9090c9f0a4c24822103711177c93b9b315180 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Mon, 10 Mar 2025 20:57:57 -0700 Subject: [PATCH 06/18] Improve spacing --- .../TypeScript/components/App.tsx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx index 237a6ba4794..b745dc2fe2d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx @@ -17,7 +17,9 @@ type AppProperties = { const useStyles = makeStyles({ header: { display: 'flex', justifyContent: 'space-between', alignItems: 'center', position: 'sticky', top: 0, backgroundColor: 'white', zIndex: 1 }, footerText: { fontSize: '0.8rem', marginTop: '2rem' }, - closeButton: { position: 'absolute', top: '1rem', right: '1rem', cursor: 'pointer', fontSize: '1.5rem' } + closeButton: { position: 'absolute', top: '1.5rem', right: '1rem', cursor: 'pointer', fontSize: '2rem' }, + switchLabel: { fontSize: '1rem', paddingTop: '1rem' }, + drawerBody: { paddingTop: '1rem' }, }); function App({ dataset, tree }: AppProperties) { @@ -45,8 +47,8 @@ function App({ dataset, tree }: AppProperties) { Settings × - - + + Render markdown for conversations} /> From 67076a82932b27ec9e084f9bbceb2d7f8b1ca6db Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Mon, 10 Mar 2025 23:33:58 -0700 Subject: [PATCH 07/18] Show metric details in a collapsible section on selection instead displaying only on hover --- .../TypeScript/components/MetricCard.tsx | 112 +++++++++-------- 
.../TypeScript/components/ScenarioTree.tsx | 116 +++++++++++++++++- 2 files changed, 166 insertions(+), 62 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx index 504674bcab4..462bb61da8b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx @@ -1,19 +1,28 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -import { makeStyles, mergeClasses, tokens, Tooltip } from "@fluentui/react-components"; -import { DismissCircle16Regular, ErrorCircleRegular, Info16Regular, InfoRegular, Warning16Regular, WarningRegular } from "@fluentui/react-icons"; +import { makeStyles, mergeClasses, tokens } from "@fluentui/react-components"; +import { DismissCircle16Regular, Info16Regular, Warning16Regular } from "@fluentui/react-icons"; const useCardListStyles = makeStyles({ metricCardList: { display: 'flex', gap: '1rem', flexWrap: 'wrap' }, }); -export const MetricCardList = ({ scenario }: { scenario: ScenarioRunResult }) => { +export const MetricCardList = ({ scenario, onMetricSelect, selectedMetric }: { + scenario: ScenarioRunResult, + onMetricSelect: (metric: MetricType | null) => void, + selectedMetric: MetricType | null +}) => { const classes = useCardListStyles(); return (
{Object.values(scenario.evaluationResult.metrics).map((metric, index) => ( - + onMetricSelect(selectedMetric === metric ? null : metric)} + isSelected={selectedMetric === metric} + /> ))}
); @@ -23,7 +32,21 @@ const useCardStyles = makeStyles({ card: { display: 'flex', flexDirection: 'column', alignItems: 'center', gap: '0.5rem', padding: '.75rem', border: '1px solid #e0e0e0', borderRadius: '4px', - minWidth: '8rem' + minWidth: '8rem', + cursor: 'pointer', + transition: 'box-shadow 0.2s ease-in-out, outline 0.2s ease-in-out', + position: 'relative', + '&:hover': { + opacity: 0.9, + boxShadow: '0 2px 4px rgba(0, 0, 0, 0.1)' + } + }, + selectedCard: { + zIndex: 1, + boxShadow: '0 4px 8px rgba(0, 0, 0, 0.15)', + outline: `2px solid ${tokens.colorNeutralForeground3}`, + outlineOffset: '0px', + border: 'none' }, metricText: { fontSize: '1rem', fontWeight: 'normal' }, valueText: { fontSize: '1.5rem', fontWeight: 'bold' }, @@ -81,8 +104,15 @@ const useCardColors = (interpretation?: EvaluationMetricInterpretation) => { type MetricType = StringMetric | NumericMetric | BooleanMetric | MetricWithNoValue; -export const MetricCard = ({ metric }: { metric: MetricType }) => { - +export const MetricCard = ({ + metric, + onClick, + isSelected +}: { + metric: MetricType, + onClick: () => void, + isSelected: boolean +}) => { let renderValue: (metric: MetricType) => React.ReactNode; switch (metric.$type) { case "string": @@ -106,27 +136,28 @@ export const MetricCard = ({ metric }: { metric: MetricType }) => { const classes = useCardStyles(); const { fg, bg } = useCardColors(metric.interpretation); - const hasReason = metric.interpretation?.reason != null; + + const hasReasons = metric.reason != null || metric.interpretation?.reason != null; const hasInformationalMessages = metric.diagnostics.some((d: EvaluationDiagnostic) => d.severity == "informational"); const hasWarningMessages = metric.diagnostics.some((d: EvaluationDiagnostic) => d.severity == "warning"); const hasErrorMessages = metric.diagnostics.some((d: EvaluationDiagnostic) => d.severity == "error"); - const supportsHover = hasReason || hasInformationalMessages || hasWarningMessages || hasErrorMessages; - 
const card = - (
-
{metric.name} { (hasErrorMessages && ) || - (hasWarningMessages && ) || - ((hasInformationalMessages || hasReason) && )}
+ + const cardClass = mergeClasses( + bg, + classes.card, + isSelected ? classes.selectedCard : undefined + ); + + return ( +
+
{metric.name} { + (hasErrorMessages && ) || + (hasWarningMessages && ) || + ((hasInformationalMessages || hasReasons) && )} +
{renderValue(metric)}
-
); - if (supportsHover) { - return ( }} - relationship="description"> - {card} - ); - } else { - return card; - } +
+ ); }; const useDetailStyles = makeStyles({ @@ -134,36 +165,3 @@ const useDetailStyles = makeStyles({ diagWarn: { fontStyle: tokens.fontFamilyMonospace, color: tokens.colorStatusWarningForeground2 }, diagInfo: { fontStyle: tokens.fontFamilyMonospace }, }); - -export const MetricDetails = ({ metric }: { metric: MetricWithNoValue | NumericMetric | BooleanMetric | StringMetric }) => { - const classes = useDetailStyles(); - const reason = metric.interpretation?.reason; - const failed = metric.interpretation?.failed ?? false; - const informationalMessages = metric.diagnostics.filter((d: EvaluationDiagnostic) => d.severity == "informational").map((d: EvaluationDiagnostic) => d.message); - const hasInformationalMessages = informationalMessages.length > 0; - const warningMessages = metric.diagnostics.filter((d: EvaluationDiagnostic) => d.severity == "warning").map((d: EvaluationDiagnostic) => d.message); - const hasWarningMessages = warningMessages.length > 0; - const errorMessages = metric.diagnostics.filter((d: EvaluationDiagnostic) => d.severity == "error").map((d: EvaluationDiagnostic) => d.message); - const hasErrorMessages = errorMessages.length > 0; - return ( -
- {reason &&
- {failed ? -

{reason}

: -

{reason}

- } -
} - {hasErrorMessages &&
- {errorMessages.map((message: string, index: number) => -

{message}

)} -
} - {hasWarningMessages &&
- {warningMessages.map((message: string, index: number) => -

{message}

)} -
} - {hasInformationalMessages &&
- {informationalMessages.map((message: string, index: number) => -

{message}

)} -
} -
); -}; \ No newline at end of file diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 1aa6aac9de5..a55cc34c386 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -7,10 +7,15 @@ import { DefaultRootNodeName, ScoreNode, ScoreNodeType, getPromptDetails } from import { PassFailBar } from "./PassFailBar"; import { MetricCardList } from "./MetricCard"; import ReactMarkdown from "react-markdown"; -import { ErrorCircleRegular } from "@fluentui/react-icons"; +import { DismissCircle16Regular, Info16Regular, Warning16Regular } from "@fluentui/react-icons"; import { ChevronDown12Regular, ChevronRight12Regular } from '@fluentui/react-icons'; -const ScenarioLevel = ({ node, parentPath, isOpen, renderMarkdown }: { node: ScoreNode, parentPath: string, isOpen: (path: string) => boolean, renderMarkdown: boolean }) => { +const ScenarioLevel = ({ node, parentPath, isOpen, renderMarkdown }: { + node: ScoreNode, + parentPath: string, + isOpen: (path: string) => boolean, + renderMarkdown: boolean, +}) => { node.collapseSingleChildNodes(); const path = `${parentPath}.${node.name}`; if (node.isLeafNode) { @@ -50,11 +55,12 @@ export const ScenarioGroup = ({ node, renderMarkdown }: { node: ScoreNode, rende return ( - ); + ); }; export const ScoreDetail = ({ scenario, renderMarkdown }: { scenario: ScenarioRunResult, renderMarkdown: boolean }) => { const classes = useStyles(); + const [selectedMetric, setSelectedMetric] = useState(null); const failureMessages = []; for (const e of Object.values(scenario.evaluationResult.metrics)) { @@ -70,12 +76,103 @@ export const ScoreDetail = ({ scenario, renderMarkdown }: { scenario: ScenarioRu const {history, response} = 
getPromptDetails(scenario.messages, scenario.modelResponse); return (
- + + {selectedMetric && } {failureMessages && failureMessages.length > 0 && }
); }; +export const MetricDetailsSection = ({ metric }: { metric: MetricType }) => { + const classes = useStyles(); + const [isExpanded, setIsExpanded] = useState(true); + + const reason = metric.reason; + const hasReason = reason != null; + const interpretationReason = metric.interpretation?.reason; + const hasInterpretationReason = interpretationReason != null; + const diagnostics = metric.diagnostics || []; + const hasDiagnostics = diagnostics.length > 0; + + if (!hasReason && !hasInterpretationReason && !hasDiagnostics) return null; + + return ( +
+
setIsExpanded(!isExpanded)}> + {isExpanded ? : } +

Metric Details: {metric.name}

+
+ + {isExpanded && ( +
+ {hasReason && ( +
+
Evaluation Reason
+
+ {reason} +
+
+ )} + + {hasInterpretationReason && ( +
+ {metric.interpretation?.failed ? +
Failure Reason
: +
Interpretation Reason
+ } +
+ {metric.interpretation?.failed ? + {interpretationReason} : + {interpretationReason} + } +
+
+ )} + + {hasDiagnostics && ( +
+
Diagnostics
+ +
+ )} +
+ )} +
+ ); +}; + +const DiagnosticsContent = ({ diagnostics }: { diagnostics: EvaluationDiagnostic[] }) => { + const classes = useStyles(); + + const errorDiagnostics = diagnostics.filter(d => d.severity === "error"); + const warningDiagnostics = diagnostics.filter(d => d.severity === "warning"); + const infoDiagnostics = diagnostics.filter(d => d.severity === "informational"); + + return ( + <> + {errorDiagnostics.map((diag, index) => ( +
+ {diag.message} +
+ ))} + {warningDiagnostics.map((diag, index) => ( +
+ {diag.message} +
+ ))} + {infoDiagnostics.map((diag, index) => ( +
+ {diag.message} +
+ ))} + + ); +}; + const useStyles = makeStyles({ headerContainer: { display: 'flex', alignItems: 'center', flexDirection: 'row', gap: '0.5rem' }, promptHint: { fontFamily: tokens.fontFamilyMonospace, opacity: 0.6, fontSize: '0.7rem', paddingLeft: '1rem', whiteSpace: 'nowrap' }, @@ -124,6 +221,15 @@ const useStyles = makeStyles({ }, failMessage: { color: tokens.colorStatusDangerForeground2, + marginBottom: '0.25rem', + }, + warningMessage: { + color: tokens.colorStatusWarningForeground2, + marginBottom: '0.25rem', + }, + infoMessage: { + color: tokens.colorNeutralForeground1, + marginBottom: '0.25rem', }, failContainer: { padding: '1rem', @@ -158,7 +264,7 @@ export const FailMessage = ({ messages }: { messages: string[] }) => { {isExpanded && (
- {messages.map((msg) => <> {msg}
)} + {messages.map((msg) => <> {msg}
)}
)}
From 12253f9e56e656de2987780a8562004079a4529f Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 11 Mar 2025 00:31:34 -0700 Subject: [PATCH 08/18] Adjust sizing --- .../TypeScript/components/MetricCard.tsx | 10 ++-------- .../TypeScript/components/ScenarioTree.tsx | 17 ++++++++--------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx index 462bb61da8b..fdb826f784e 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx @@ -49,7 +49,7 @@ const useCardStyles = makeStyles({ border: 'none' }, metricText: { fontSize: '1rem', fontWeight: 'normal' }, - valueText: { fontSize: '1.5rem', fontWeight: 'bold' }, + valueText: { fontSize: '1rem', fontWeight: 'bold' }, scoreFgDefault: { color: tokens.colorNeutralStrokeAccessible }, scoreFg0: { color: tokens.colorStatusDangerForeground1 }, scoreFg1: { color: tokens.colorStatusDangerForeground2 }, @@ -102,7 +102,7 @@ const useCardColors = (interpretation?: EvaluationMetricInterpretation) => { return { fg, bg }; }; -type MetricType = StringMetric | NumericMetric | BooleanMetric | MetricWithNoValue; +export type MetricType = StringMetric | NumericMetric | BooleanMetric | MetricWithNoValue; export const MetricCard = ({ metric, @@ -159,9 +159,3 @@ export const MetricCard = ({
); }; - -const useDetailStyles = makeStyles({ - diagError: { fontStyle: tokens.fontFamilyMonospace, color: tokens.colorStatusDangerForeground2 }, - diagWarn: { fontStyle: tokens.fontFamilyMonospace, color: tokens.colorStatusWarningForeground2 }, - diagInfo: { fontStyle: tokens.fontFamilyMonospace }, -}); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index a55cc34c386..8a34412cea9 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -5,7 +5,7 @@ import { makeStyles, tokens, Tree, TreeItem, TreeItemLayout, TreeItemValue, Tree import { useState, useCallback } from "react"; import { DefaultRootNodeName, ScoreNode, ScoreNodeType, getPromptDetails } from "./Summary"; import { PassFailBar } from "./PassFailBar"; -import { MetricCardList } from "./MetricCard"; +import { MetricCardList, type MetricType } from "./MetricCard"; import ReactMarkdown from "react-markdown"; import { DismissCircle16Regular, Info16Regular, Warning16Regular } from "@fluentui/react-icons"; import { ChevronDown12Regular, ChevronRight12Regular } from '@fluentui/react-icons'; @@ -176,9 +176,7 @@ const DiagnosticsContent = ({ diagnostics }: { diagnostics: EvaluationDiagnostic const useStyles = makeStyles({ headerContainer: { display: 'flex', alignItems: 'center', flexDirection: 'row', gap: '0.5rem' }, promptHint: { fontFamily: tokens.fontFamilyMonospace, opacity: 0.6, fontSize: '0.7rem', paddingLeft: '1rem', whiteSpace: 'nowrap' }, - score: { - fontSize: tokens.fontSizeBase100, - }, + score: { fontSize: tokens.fontSizeBase200 }, passFailBadge: { display: 'flex', flexDirection: 'row', @@ -190,13 +188,14 @@ const useStyles = makeStyles({ scenarioLabel: { whiteSpace: 'nowrap', 
fontWeight: '500', + fontSize: tokens.fontSizeBase300, }, iterationArea: { marginTop: '1rem', marginBottom: '1rem', }, section: { - marginTop: '2rem', + marginTop: '1rem', }, sectionHeader: { display: 'flex', @@ -208,12 +207,12 @@ const useStyles = makeStyles({ sectionHeaderText: { margin: 0, marginLeft: '0.5rem', - fontSize: '1.25rem', - fontWeight: 'bold', + fontSize: tokens.fontSizeBase300, + fontWeight: '500', }, sectionSubHeader: { - fontSize: '0.875rem', - fontWeight: 'bold', + fontSize: tokens.fontSizeBase300, + fontWeight: '500', marginBottom: '0.5rem', }, sectionContent: { From ada0cc47301432e81ddfe8d2e3ec602bed5723bf Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 11 Mar 2025 13:13:14 -0700 Subject: [PATCH 09/18] Update comment --- .../RelevanceTruthAndCompletenessEvaluator.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index b1bf9e797a5..85ff8bae12d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -23,7 +23,8 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; /// /// returns three s that contain scores /// for 'Relevance', 'Truth' and 'Completeness' respectively. Each score is a number between 1 and 5, with 1 indicating -/// a poor score, and 5 indicating an excellent score. +/// a poor score, and 5 indicating an excellent score. Each returned score is also accompanied by a +/// that provides an explanation for the score. 
/// public sealed partial class RelevanceTruthAndCompletenessEvaluator : ChatConversationEvaluator { From 8d305c0375b21041ec75d38418b34f10e86cdf9a Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 11 Mar 2025 17:47:59 -0700 Subject: [PATCH 10/18] Remove Failure Reasons section --- .../TypeScript/components/ScenarioTree.tsx | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 8a34412cea9..ae50faa03a8 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -61,18 +61,6 @@ export const ScenarioGroup = ({ node, renderMarkdown }: { node: ScoreNode, rende export const ScoreDetail = ({ scenario, renderMarkdown }: { scenario: ScenarioRunResult, renderMarkdown: boolean }) => { const classes = useStyles(); const [selectedMetric, setSelectedMetric] = useState(null); - - const failureMessages = []; - for (const e of Object.values(scenario.evaluationResult.metrics)) { - if (e.interpretation && e.interpretation.failed) { - failureMessages.push(e.interpretation.reason || "Metric failed."); - } - for (const d of e.diagnostics) { - if (d.severity === "error") { - failureMessages.push(d.message); - } - } - } const {history, response} = getPromptDetails(scenario.messages, scenario.modelResponse); return (
@@ -82,7 +70,6 @@ export const ScoreDetail = ({ scenario, renderMarkdown }: { scenario: ScenarioRu selectedMetric={selectedMetric} /> {selectedMetric && } - {failureMessages && failureMessages.length > 0 && }
); }; @@ -250,26 +237,6 @@ const useStyles = makeStyles({ }, }); -export const FailMessage = ({ messages }: { messages: string[] }) => { - const classes = useStyles(); - const [isExpanded, setIsExpanded] = useState(true); - - return ( -
-
setIsExpanded(!isExpanded)}> - {isExpanded ? : } -

Failure Reasons

-
- - {isExpanded && ( -
- {messages.map((msg) => <> {msg}
)} -
- )} -
- ); -}; - const PassFailBadge = ({ pass, total }: { pass: number, total: number }) => { const classes = useStyles(); return (
From 052cd4680f9c130b91d95cb91e3f41354da98164 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 11 Mar 2025 18:02:58 -0700 Subject: [PATCH 11/18] Use / in place of > for level separators --- .../TypeScript/components/ScenarioTree.tsx | 20 ++++++++++++++++++- .../TypeScript/components/Summary.ts | 2 +- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index ae50faa03a8..f4208b76687 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -176,6 +176,15 @@ const useStyles = makeStyles({ whiteSpace: 'nowrap', fontWeight: '500', fontSize: tokens.fontSizeBase300, + display: 'flex', + gap: '0.5rem', + alignItems: 'center', + }, + separator: { + color: tokens.colorNeutralForeground4, + fontSize: tokens.fontSizeBase200, + fontWeight: '300', + padding: '0 0.125rem', }, iterationArea: { marginTop: '1rem', @@ -262,9 +271,18 @@ const ScoreNodeHeader = ({ item, showPrompt }: { item: ScoreNode, showPrompt?: b break; } + const parts = item.name.split(' / '); + return (
-
{item.name}
+
+ {parts.map((part, index) => ( + <> + {part} + {index < parts.length - 1 && /} + + ))} +
{showPrompt && item.shortenedPrompt &&
{item.shortenedPrompt}
}
); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts index 7d72b1a7e63..2ea86a4e8f7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts @@ -111,7 +111,7 @@ export class ScoreNode { while (this.childNodes.length === 1) { const onlyChild = this.childNodes[0]; - this.name += ` > ${onlyChild.name}`; + this.name += ` / ${onlyChild.name}`; this.children = onlyChild.children; this.scenario = onlyChild.scenario; } From 0085309a52727260fbd8b97a11a4a7ab38658d90 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 12 Mar 2025 02:33:18 -0700 Subject: [PATCH 12/18] Introduce chat bubbles for conversations Also fix numerous sizing and layout issues. --- .../TypeScript/components/MetricCard.tsx | 131 +++++++++++++----- .../TypeScript/components/ScenarioTree.tsx | 117 ++++++++++++---- .../TypeScript/components/Summary.ts | 58 +++++--- 3 files changed, 231 insertions(+), 75 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx index fdb826f784e..3ec8defb84a 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx @@ -1,7 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
-import { makeStyles, mergeClasses, tokens } from "@fluentui/react-components"; +import { makeStyles, mergeClasses, tokens, Tooltip } from "@fluentui/react-components"; import { DismissCircle16Regular, Info16Regular, Warning16Regular } from "@fluentui/react-icons"; const useCardListStyles = makeStyles({ @@ -30,9 +30,14 @@ export const MetricCardList = ({ scenario, onMetricSelect, selectedMetric }: { const useCardStyles = makeStyles({ card: { - display: 'flex', flexDirection: 'column', alignItems: 'center', gap: '0.5rem', - padding: '.75rem', border: '1px solid #e0e0e0', borderRadius: '4px', - minWidth: '8rem', + display: 'flex', + flexDirection: 'column', + alignItems: 'center', + gap: '0.5rem', + padding: '.75rem', + border: '1px solid #e0e0e0', + borderRadius: '4px', + width: '10rem', cursor: 'pointer', transition: 'box-shadow 0.2s ease-in-out, outline 0.2s ease-in-out', position: 'relative', @@ -48,8 +53,41 @@ const useCardStyles = makeStyles({ outlineOffset: '0px', border: 'none' }, - metricText: { fontSize: '1rem', fontWeight: 'normal' }, - valueText: { fontSize: '1rem', fontWeight: 'bold' }, + metricNameText: { + fontSize: '1rem', + fontWeight: 'normal', + width: '80%', + textAlign: 'center', + overflow: 'hidden', + textOverflow: 'ellipsis', + lineHeight: '1.2', + maxHeight: '2.4em', + display: '-webkit-box', + WebkitLineClamp: 2, + WebkitBoxOrient: 'vertical', + marginTop: '-0.5rem', + }, + iconPlaceholder: { + height: '4px', + width: '100%', + position: 'relative', + marginBottom: '0', + }, + metricIcon: { + position: 'absolute', + top: '-0.25rem', + right: '-0.25rem', + }, + metricValueText: { + fontSize: '1rem', + fontWeight: 'bold', + width: '80%', + textAlign: 'center', + overflow: 'hidden', + textOverflow: 'ellipsis', + whiteSpace: 'nowrap', + maxHeight: '1.2em', + }, scoreFgDefault: { color: tokens.colorNeutralStrokeAccessible }, scoreFg0: { color: tokens.colorStatusDangerForeground1 }, scoreFg1: { color: tokens.colorStatusDangerForeground2 }, @@ 
-113,27 +151,24 @@ export const MetricCard = ({ onClick: () => void, isSelected: boolean }) => { - let renderValue: (metric: MetricType) => React.ReactNode; - switch (metric.$type) { - case "string": - renderValue = (metric: MetricType) => <>{metric?.value ?? "??"}; - break; - case "boolean": - renderValue = (metric: MetricType) => <>{ - !metric || metric.value === undefined || metric.value === null ? - '??' : - metric.value ? 'Pass' : 'Fail'}; - break; - case "numeric": - renderValue = (metric: MetricType) => <>{metric?.value ?? "??"}; - break; - case "none": - renderValue = () => <>None; - break; - default: - throw new Error(`Unknown metric type: ${metric["$type"]}`); - } + const getValue = (metric: MetricType): string => { + switch (metric.$type) { + case "string": + return metric?.value ?? "??"; + case "boolean": + return !metric || metric.value === undefined || metric.value === null ? + '??' : + metric.value ? 'Pass' : 'Fail'; + case "numeric": + return metric?.value?.toString() ?? "??"; + case "none": + return "None"; + default: + throw new Error(`Unknown metric type: ${metric["$type"]}`); + } + }; + const metricValue = getValue(metric); const classes = useCardStyles(); const { fg, bg } = useCardColors(metric.interpretation); @@ -148,14 +183,44 @@ export const MetricCard = ({ isSelected ? classes.selectedCard : undefined ); + let statusIcon = null; + let statusTooltip = ''; + + if (hasErrorMessages) { + statusIcon = ; + statusTooltip = 'This metric has errors. Click the card to view more details.'; + } else if (hasWarningMessages) { + statusIcon = ; + statusTooltip = 'This metric has warnings. Click the card to view more details.'; + } else if (hasInformationalMessages || hasReasons) { + statusIcon = ; + statusTooltip = 'This metric has additional information. Click the card to view more details.'; + } + + const tooltipContent = ( +
+
Name: {metric.name}
+
Value: {metricValue}
+
+ ); + return ( -
-
{metric.name} { - (hasErrorMessages && ) || - (hasWarningMessages && ) || - ((hasInformationalMessages || hasReasons) && )} + +
+
+ {statusIcon && ( + + {statusIcon} + + )} +
+
+ {metric.name} +
+
+ {metricValue} +
-
{renderValue(metric)}
-
+ ); }; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index f4208b76687..db1a3c518ba 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -1,9 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -import { makeStyles, tokens, Tree, TreeItem, TreeItemLayout, TreeItemValue, TreeOpenChangeData, TreeOpenChangeEvent } from "@fluentui/react-components"; -import { useState, useCallback } from "react"; -import { DefaultRootNodeName, ScoreNode, ScoreNodeType, getPromptDetails } from "./Summary"; +import React, { useState, useCallback } from "react"; +import { makeStyles, tokens, Tree, TreeItem, TreeItemLayout, TreeItemValue, TreeOpenChangeData, TreeOpenChangeEvent, mergeClasses } from "@fluentui/react-components"; +import { DefaultRootNodeName, ScoreNode, ScoreNodeType, getPromptDetails, ChatMessageDisplay } from "./Summary"; import { PassFailBar } from "./PassFailBar"; import { MetricCardList, type MetricType } from "./MetricCard"; import ReactMarkdown from "react-markdown"; @@ -61,7 +61,7 @@ export const ScenarioGroup = ({ node, renderMarkdown }: { node: ScoreNode, rende export const ScoreDetail = ({ scenario, renderMarkdown }: { scenario: ScenarioRunResult, renderMarkdown: boolean }) => { const classes = useStyles(); const [selectedMetric, setSelectedMetric] = useState(null); - const {history, response} = getPromptDetails(scenario.messages, scenario.modelResponse); + const { messages } = getPromptDetails(scenario.messages, scenario.modelResponse); return (
{selectedMetric && } - +
); }; @@ -191,14 +191,14 @@ const useStyles = makeStyles({ marginBottom: '1rem', }, section: { - marginTop: '1rem', + marginTop: '0.75rem', }, sectionHeader: { display: 'flex', alignItems: 'center', cursor: 'pointer', userSelect: 'none', - marginBottom: '1rem', + marginBottom: '0.5rem', }, sectionHeaderText: { margin: 0, @@ -209,10 +209,10 @@ const useStyles = makeStyles({ sectionSubHeader: { fontSize: tokens.fontSizeBase300, fontWeight: '500', - marginBottom: '0.5rem', + marginBottom: '0.25rem', }, sectionContent: { - marginBottom: '1.5rem', + marginBottom: '0.75rem', }, failMessage: { color: tokens.colorStatusDangerForeground2, @@ -233,9 +233,7 @@ const useStyles = makeStyles({ cursor: 'text', }, conversationBox: { - border: '1px solid #e0e0e0', - borderRadius: '4px', - padding: '1rem', + padding: '0.75rem', maxHeight: '20rem', overflow: 'auto', cursor: 'text', @@ -244,6 +242,51 @@ const useStyles = makeStyles({ wordWrap: 'break-word', }, }, + chatContainer: { + display: 'flex', + flexDirection: 'column', + gap: '0.75rem', + padding: '0.75rem 0', + position: 'relative', + }, + messageRow: { + display: 'flex', + flexDirection: 'column', + width: '900px', + position: 'relative', + }, + userMessageRow: { + marginLeft: '0', + }, + assistantMessageRow: { + marginLeft: '100px', + }, + messageParticipantName: { + fontSize: tokens.fontSizeBase200, + marginBottom: '0.25rem', + color: tokens.colorNeutralForeground3, + paddingLeft: '0.5rem', + }, + messageBubble: { + padding: '0.75rem 1rem', + borderRadius: '12px', + overflow: 'hidden', + wordBreak: 'break-word', + width: '100%', + }, + userBubble: { + backgroundColor: tokens.colorNeutralBackground3, + borderTopLeftRadius: '4px', + }, + systemBubble: { + backgroundColor: tokens.colorBrandBackground, + color: tokens.colorNeutralForegroundInverted, + borderTopLeftRadius: '4px', + }, + assistantBubble: { + backgroundColor: tokens.colorNeutralBackground4, + borderTopRightRadius: '4px', + }, }); const PassFailBadge = ({ pass, 
total }: { pass: number, total: number }) => { @@ -277,10 +320,10 @@ const ScoreNodeHeader = ({ item, showPrompt }: { item: ScoreNode, showPrompt?: b
{parts.map((part, index) => ( - <> + {part} {index < parts.length - 1 && /} - + ))}
@@ -288,10 +331,15 @@ const ScoreNodeHeader = ({ item, showPrompt }: { item: ScoreNode, showPrompt?: b
); }; -export const PromptDetails = ({ history, response, renderMarkdown }: { history: string, response: string, renderMarkdown: boolean }) => { +export const PromptDetails = ({ messages, renderMarkdown }: { + messages: ChatMessageDisplay[], + renderMarkdown: boolean +}) => { const classes = useStyles(); const [isExpanded, setIsExpanded] = useState(true); + const isUserSide = (role: string) => role.toLowerCase() === 'user' || role.toLowerCase() === 'system'; + return (
setIsExpanded(!isExpanded)}> @@ -300,16 +348,35 @@ export const PromptDetails = ({ history, response, renderMarkdown }: { history:
{isExpanded && ( -
-
-
Prompt
- {renderMarkdown ? {history} :
{history}
} -
- -
-
Response
- {renderMarkdown ? {response} :
{response}
} -
+
+ {messages.map((message, index) => { + const isFromUserSide = isUserSide(message.role); + const messageRowClass = mergeClasses( + classes.messageRow, + isFromUserSide ? classes.userMessageRow : classes.assistantMessageRow + ); + + let messageBubble; + if (message.role.toLowerCase() === 'system') { + messageBubble = mergeClasses(classes.messageBubble, classes.systemBubble); + } else if (isFromUserSide) { + messageBubble = mergeClasses(classes.messageBubble, classes.userBubble); + } else { + messageBubble = mergeClasses(classes.messageBubble, classes.assistantBubble); + } + + return ( +
+
{message.participantName}
+
+ {renderMarkdown ? + {message.content} : +
{message.content}
+ } +
+
+ ); + })}
)}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts index 2ea86a4e8f7..f49db7a2a60 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts @@ -65,7 +65,6 @@ export class ScoreNode { } aggregate() { - // Reset node to defaults before recalculating this.failed = false; this.numPassingIterations = 0; this.numFailingIterations = 0; @@ -84,7 +83,15 @@ export class ScoreNode { this.numPassingIterations = this.failed ? 0 : 1; this.numFailingIterations = this.failed ? 1 : 0; const lastMessage = this.scenario?.messages[this.scenario?.messages.length - 1]; - const {history} = getPromptDetails(lastMessage ? [lastMessage] : [], this.scenario?.modelResponse); + + const { messages } = getPromptDetails(lastMessage ? 
[lastMessage] : [], this.scenario?.modelResponse); + let history = ""; + if (messages.length === 1) { + history = messages[0].content; + } else if (messages.length > 1) { + history = messages.map(m => `[${m.participantName}] ${m.content}`).join("\n\n"); + } + this.shortenedPrompt = shortenPrompt(history); } else { for (const child of this.childNodes) { @@ -160,25 +167,42 @@ const isTextContent = (content: AIContent): content is TextContent => { return (content as TextContent).text !== undefined; }; -export const getPromptDetails = (messages: ChatMessage[], modelResponse?: ChatResponse): {history:string, response: string}=> { - let history: string = ""; - if (messages.length === 1) { - history = messages[0].contents.map(c => (c as TextContent).text).join("\n"); - } else if (messages.length > 1) { - const historyItems: string[] = []; - for (const m of messages) { +export type ChatMessageDisplay = { + role: string; + participantName: string; + content: string; +}; + +export const getPromptDetails = (messages: ChatMessage[], modelResponse?: ChatResponse): { messages: ChatMessageDisplay[] } => { + const chatMessages: ChatMessageDisplay[] = []; + + for (const m of messages) { + for (const c of m.contents) { + if (isTextContent(c)) { + const participantName = m.authorName ? `${m.authorName} (${m.role})` : m.role; + chatMessages.push({ + role: m.role, + participantName: participantName, + content: c.text + }); + } + } + } + + if (modelResponse?.messages) { + for (const m of modelResponse.messages) { for (const c of m.contents) { if (isTextContent(c)) { - const historyItem = m.authorName - ? `[${m.authorName} (${m.role})] ${c.text}` : `[${m.role}] ${c.text}`; - historyItems.push(historyItem); + const participantName = m.authorName ? 
`${m.authorName} (${m.role})` : m.role || 'Assistant'; + chatMessages.push({ + role: m.role, + participantName: participantName, + content: c.text + }); } } } - history = historyItems.join("\n\n"); } - const response: string = modelResponse?.messages.map(m => m.contents.map(c => (c as TextContent).text).join("\n") ?? "").join("\n") ?? ""; - - return { history, response }; -}; \ No newline at end of file + return { messages: chatMessages }; +}; From 4554caf43da3644bdf83fa862906087274bda5d0 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 12 Mar 2025 03:04:30 -0700 Subject: [PATCH 13/18] Add some more tests for reason --- .../EndToEndTests.cs | 12 ++-- .../ResultsTests.cs | 64 ++++++++++++------- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs index 65801f0342f..8307dc38591 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs @@ -81,9 +81,9 @@ await _reportingConfiguration.CreateScenarioRunAsync( NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Diagnostics.Single().Message)); - Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Diagnostics.Single().Message)); - Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Diagnostics.Single().Message)); + Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Reason)); + Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Reason)); + 
Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Reason)); NumericMetric coherence = result.Get(CoherenceEvaluator.CoherenceMetricName); Assert.True(coherence.Value >= 4); @@ -132,9 +132,9 @@ await _reportingConfiguration.CreateScenarioRunAsync( NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Diagnostics.Single().Message)); - Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Diagnostics.Single().Message)); - Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Diagnostics.Single().Message)); + Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Reason)); + Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Reason)); + Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Reason)); NumericMetric coherence = result.Get(CoherenceEvaluator.CoherenceMetricName); Assert.True(coherence.Value >= 4); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs index 338532e5a3d..e7ed381ad93 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs @@ -151,7 +151,8 @@ public async Task ResultWithBooleanMetric() var metricA = new BooleanMetric("Metric with value false", false); var metricB = new BooleanMetric("Metric with value true", true); var metricC = new BooleanMetric("Metric without value"); - evaluator.TestMetrics = [metricA, metricB, metricC]; + var metricD = new 
BooleanMetric("Metric with reason", false, reason: "The reason"); + evaluator.TestMetrics = [metricA, metricB, metricC, metricD]; await using ScenarioRun scenarioRun = await reportingConfiguration.CreateScenarioRunAsync( @@ -163,6 +164,7 @@ await reportingConfiguration.CreateScenarioRunAsync( Assert.Null(metricA.Interpretation); Assert.Null(metricB.Interpretation); Assert.Null(metricC.Interpretation); + Assert.Null(metricD.Interpretation); Assert.False(result.ContainsDiagnostics()); } @@ -176,7 +178,8 @@ public async Task ResultWithBooleanMetricAndInterpretation() var metricA = new BooleanMetric("Metric with value false", false); var metricB = new BooleanMetric("Metric with value true", true); var metricC = new BooleanMetric("Metric without value"); - evaluator.TestMetrics = [metricA, metricB, metricC]; + var metricD = new BooleanMetric("Metric with reason", false, reason: "The reason"); + evaluator.TestMetrics = [metricA, metricB, metricC, metricD]; await using ScenarioRun scenarioRun = await reportingConfiguration.CreateScenarioRunAsync( @@ -221,9 +224,9 @@ public async Task ResultWithStringMetric() var metricF = new StringMetric("Measurement System: Nautical", "Nautical"); var metricG = new StringMetric("Measurement System: Astronomical", "Astronomical"); var metricH = new StringMetric("Measurement System: Multiple", "Multiple"); - var metricI = new StringMetric("Measurement System: Blah", "Blah"); - var metricJ = new StringMetric("Measurement System: Empty", ""); - var metricK = new StringMetric("Measurement System: Null"); + var metricI = new StringMetric("Measurement System: Blah", "Blah", reason: "Value was unexpected"); + var metricJ = new StringMetric("Measurement System: Empty", "", reason: "Value was empty"); + var metricK = new StringMetric("Measurement System: Null", reason: "Value was null"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH, metricI, metricJ, metricK]; @@ -276,9 +279,9 @@ public async 
Task ResultWithStringMetricAndInterpretation() var metricF = new StringMetric("Measurement System: Nautical", "Nautical"); var metricG = new StringMetric("Measurement System: Astronomical", "Astronomical"); var metricH = new StringMetric("Measurement System: Multiple", "Multiple"); - var metricI = new StringMetric("Measurement System: Blah", "Blah"); - var metricJ = new StringMetric("Measurement System: Empty", ""); - var metricK = new StringMetric("Measurement System: Null"); + var metricI = new StringMetric("Measurement System: Blah", "Blah", reason: "Value was unexpected"); + var metricJ = new StringMetric("Measurement System: Empty", "", reason: "Value was empty"); + var metricK = new StringMetric("Measurement System: Null", reason: "Value was null"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH, metricI, metricJ, metricK]; @@ -322,14 +325,14 @@ public async Task ResultWithNumericMetrics() var evaluator = new TestEvaluator(); ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator); - var metricA = new NumericMetric("Metric with value 0", 0); - var metricB = new NumericMetric("Metric with value 1", 1); - var metricC = new NumericMetric("Metric with value 2", 2); - var metricD = new NumericMetric("Metric with value 3", 3); - var metricE = new NumericMetric("Metric with value 4", 4); - var metricF = new NumericMetric("Metric with value 5", 5); - var metricG = new NumericMetric("Metric with value 6", 6); - var metricH = new NumericMetric("Metric with no value"); + var metricA = new NumericMetric("Metric with value 0", 0, reason: "Because of reason A"); + var metricB = new NumericMetric("Metric with value 1", 1, reason: "Because of reason B"); + var metricC = new NumericMetric("Metric with value 2", 2, reason: "Because of reason C"); + var metricD = new NumericMetric("Metric with value 3", 3, reason: "Because of reason D"); + var metricE = new NumericMetric("Metric with value 4", 4, 
reason: "Because of reason E"); + var metricF = new NumericMetric("Metric with value 5", 5, reason: "Because of reason F"); + var metricG = new NumericMetric("Metric with value 6", 6, reason: "Because of reason G"); + var metricH = new NumericMetric("Metric with no value", reason: "Because of reason H"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH]; await using ScenarioRun scenarioRun = @@ -357,14 +360,14 @@ public async Task ResultWithNumericMetricsAndInterpretation() var evaluator = new TestEvaluator(); ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator); - var metricA = new NumericMetric("Metric with value 0", 0); - var metricB = new NumericMetric("Metric with value 1", 1); - var metricC = new NumericMetric("Metric with value 2", 2); - var metricD = new NumericMetric("Metric with value 3", 3); - var metricE = new NumericMetric("Metric with value 4", 4); - var metricF = new NumericMetric("Metric with value 5", 5); - var metricG = new NumericMetric("Metric with value 6", 6); - var metricH = new NumericMetric("Metric with no value"); + var metricA = new NumericMetric("Metric with value 0", 0, reason: "Because of reason A"); + var metricB = new NumericMetric("Metric with value 1", 1, reason: "Because of reason B"); + var metricC = new NumericMetric("Metric with value 2", 2, reason: "Because of reason C"); + var metricD = new NumericMetric("Metric with value 3", 3, reason: "Because of reason D"); + var metricE = new NumericMetric("Metric with value 4", 4, reason: "Because of reason E"); + var metricF = new NumericMetric("Metric with value 5", 5, reason: "Because of reason F"); + var metricG = new NumericMetric("Metric with value 6", 6, reason: "Because of reason G"); + var metricH = new NumericMetric("Metric with no value", reason: "Because of reason H"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH]; await using ScenarioRun 
scenarioRun = @@ -405,11 +408,13 @@ public async Task ResultWithDiagnosticsOnUninterpretedMetrics() metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics"); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.Reason = "Reason for metric 2"; var metric3 = new EvaluationMetric("Metric with error diagnostics only"); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); @@ -419,9 +424,11 @@ public async Task ResultWithDiagnosticsOnUninterpretedMetrics() var metric4 = new StringMetric("Metric with warning diagnostics only"); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only"); metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; @@ -452,11 +459,13 @@ public async Task ResultWithDiagnosticsOnFailingMetrics() metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics"); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 
2")); metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.Reason = "Reason for metric 2"; var metric3 = new EvaluationMetric("Metric with error diagnostics only"); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); @@ -466,9 +475,11 @@ public async Task ResultWithDiagnosticsOnFailingMetrics() var metric4 = new StringMetric("Metric with warning diagnostics only"); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only"); metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; @@ -505,23 +516,28 @@ public async Task ResultWithDiagnosticsOnPassingMetrics() metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics", value: true); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.Reason = "Reason for metric 2"; var metric3 = new NumericMetric("Metric with error diagnostics only", value: 5); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only", value: "A"); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); 
metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only", value: 4); metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; From 12f1979fabd4131782e8bdee3fb3fe82b5f9e9c1 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 12 Mar 2025 03:33:43 -0700 Subject: [PATCH 14/18] Fix indentation --- .../Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs | 2 +- .../Microsoft.Extensions.AI.Evaluation/NumericMetric.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs index fe987382a26..746ddcf02fe 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs @@ -14,4 +14,4 @@ namespace Microsoft.Extensions.AI.Evaluation; /// . /// public sealed class BooleanMetric(string name, bool? value = null, string? reason = null) -: EvaluationMetric(name, value, reason); + : EvaluationMetric(name, value, reason); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs index 93234a967b8..6b34beafde8 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs @@ -25,4 +25,4 @@ namespace Microsoft.Extensions.AI.Evaluation; /// . /// public sealed class NumericMetric(string name, double? value = null, string? 
reason = null) - : EvaluationMetric(name, value, reason); + : EvaluationMetric(name, value, reason); From 51b7fb3756b915a72403a92182bcc64a2379c12f Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 12 Mar 2025 03:58:08 -0700 Subject: [PATCH 15/18] Fix up some styles --- .../TypeScript/components/ScenarioTree.tsx | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index db1a3c518ba..37c376bcf88 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -95,7 +95,7 @@ export const MetricDetailsSection = ({ metric }: { metric: MetricType }) => {
{isExpanded && ( -
+
{hasReason && (
Evaluation Reason
@@ -232,22 +232,17 @@ const useStyles = makeStyles({ backgroundColor: tokens.colorNeutralBackground2, cursor: 'text', }, - conversationBox: { - padding: '0.75rem', - maxHeight: '20rem', - overflow: 'auto', - cursor: 'text', - '& pre': { - whiteSpace: 'pre-wrap', - wordWrap: 'break-word', - }, - }, - chatContainer: { + sectionContainer: { display: 'flex', flexDirection: 'column', gap: '0.75rem', padding: '0.75rem 0', + cursor: 'text', position: 'relative', + '& pre': { + whiteSpace: 'pre-wrap', + wordWrap: 'break-word', + }, }, messageRow: { display: 'flex', @@ -348,7 +343,7 @@ export const PromptDetails = ({ messages, renderMarkdown }: {
{isExpanded && ( -
+
{messages.map((message, index) => { const isFromUserSide = isUserSide(message.role); const messageRowClass = mergeClasses( From cf857935a910a91fdff0fa4de125a670bc7c0a86 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 12 Mar 2025 17:23:57 -0700 Subject: [PATCH 16/18] Move Reason to base class alongside Interpretation --- .../RelevanceTruthAndCompletenessEvaluator.cs | 2 +- .../BooleanMetric.cs | 4 ++-- .../EvaluationMetric.cs | 12 +++++++++++- .../EvaluationMetric{T}.cs | 13 +++---------- .../NumericMetric.cs | 4 ++-- .../StringMetric.cs | 4 ++-- .../ResultsTests.cs | 2 ++ 7 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index 85ff8bae12d..73a62970914 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -24,7 +24,7 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; /// returns three s that contain scores /// for 'Relevance', 'Truth' and 'Completeness' respectively. Each score is a number between 1 and 5, with 1 indicating /// a poor score, and 5 indicating an excellent score. Each returned score is also accompanied by a -/// that provides an explanation for the score. +/// that provides an explanation for the score. 
/// public sealed partial class RelevanceTruthAndCompletenessEvaluator : ChatConversationEvaluator { diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs index 746ddcf02fe..0edb9f8b0b4 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs @@ -10,8 +10,8 @@ namespace Microsoft.Extensions.AI.Evaluation; /// The name of the . /// The value of the . /// -/// An optional string that can be used to provide some commentary around the result represented by -/// . +/// An optional string that can be used to provide some commentary around the result represented by this +/// . /// public sealed class BooleanMetric(string name, bool? value = null, string? reason = null) : EvaluationMetric(name, value, reason); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs index 78bb6831486..038599963af 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs @@ -15,17 +15,27 @@ namespace Microsoft.Extensions.AI.Evaluation; /// A base class that represents the result of an evaluation. /// /// The name of the . +/// +/// An optional string that can be used to provide some commentary around the result represented by this +/// . +/// [JsonDerivedType(typeof(NumericMetric), "numeric")] [JsonDerivedType(typeof(BooleanMetric), "boolean")] [JsonDerivedType(typeof(StringMetric), "string")] [JsonDerivedType(typeof(EvaluationMetric), "none")] -public class EvaluationMetric(string name) +public class EvaluationMetric(string name, string? reason = null) { /// /// Gets or sets the name of the . 
/// public string Name { get; set; } = name; + /// + /// Gets or sets a string that can optionally be used to provide some commentary around the result represented by + /// this . + /// + public string? Reason { get; set; } = reason; + /// /// Gets or sets an that identifies whether the result of the /// evaluation represented by the current is considered good or bad, passed or diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs index fb57d2571fb..d2745069bc5 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs @@ -20,25 +20,18 @@ public class EvaluationMetric : EvaluationMetric /// public T? Value { get; set; } - /// - /// Gets or sets a string that can optionally be used to provide some commentary around the result represented by - /// . - /// - public string? Reason { get; set; } - /// /// Initializes a new instance of the class. /// /// The name of the . /// The value of the . /// - /// An optional string that can be used to provide some commentary around the result represented by - /// . + /// An optional string that can be used to provide some commentary around the result represented by this + /// . /// protected EvaluationMetric(string name, T? value, string? reason = null) - : base(name) + : base(name, reason) { Value = value; - Reason = reason; } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs index 6b34beafde8..2a0a07c2193 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs @@ -21,8 +21,8 @@ namespace Microsoft.Extensions.AI.Evaluation; /// The name of the . /// The value of the . 
/// -/// An optional string that can be used to provide some commentary around the result represented by -/// . +/// An optional string that can be used to provide some commentary around the result represented by this +/// . /// public sealed class NumericMetric(string name, double? value = null, string? reason = null) : EvaluationMetric(name, value, reason); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs index 5fed520b3a2..97fd10921bc 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs @@ -13,8 +13,8 @@ namespace Microsoft.Extensions.AI.Evaluation; /// The name of the . /// The value of the . /// -/// An optional string that can be used to provide some commentary around the result represented by -/// . +/// An optional string that can be used to provide some commentary around the result represented by this +/// . /// public sealed class StringMetric(string name, string? value = null, string? 
reason = null) : EvaluationMetric(name, value, reason); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs index e7ed381ad93..b4336ec802f 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs @@ -419,6 +419,7 @@ public async Task ResultWithDiagnosticsOnUninterpretedMetrics() var metric3 = new EvaluationMetric("Metric with error diagnostics only"); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only"); @@ -470,6 +471,7 @@ public async Task ResultWithDiagnosticsOnFailingMetrics() var metric3 = new EvaluationMetric("Metric with error diagnostics only"); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only"); From 2116fd43e7e4c87794ed66b4ca68f0f5db4f5569 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 12 Mar 2025 18:38:00 -0700 Subject: [PATCH 17/18] Update colors and spacing --- .../TypeScript/components/MetricCard.tsx | 9 +++---- .../TypeScript/components/ScenarioTree.tsx | 27 +++---------------- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx index 3ec8defb84a..738bd51bcba 100644 --- 
a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx @@ -37,7 +37,7 @@ const useCardStyles = makeStyles({ padding: '.75rem', border: '1px solid #e0e0e0', borderRadius: '4px', - width: '10rem', + width: '12rem', cursor: 'pointer', transition: 'box-shadow 0.2s ease-in-out, outline 0.2s ease-in-out', position: 'relative', @@ -61,10 +61,9 @@ const useCardStyles = makeStyles({ overflow: 'hidden', textOverflow: 'ellipsis', lineHeight: '1.2', - maxHeight: '2.4em', - display: '-webkit-box', - WebkitLineClamp: 2, - WebkitBoxOrient: 'vertical', + height: '1.2em', + display: "block", + whiteSpace: 'nowrap', marginTop: '-0.5rem', }, iconPlaceholder: { diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 37c376bcf88..154bbcd42b2 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -247,14 +247,14 @@ const useStyles = makeStyles({ messageRow: { display: 'flex', flexDirection: 'column', - width: '900px', + width: '60rem', position: 'relative', }, userMessageRow: { marginLeft: '0', }, assistantMessageRow: { - marginLeft: '100px', + marginLeft: '10rem', }, messageParticipantName: { fontSize: tokens.fontSizeBase200, @@ -268,19 +268,7 @@ const useStyles = makeStyles({ overflow: 'hidden', wordBreak: 'break-word', width: '100%', - }, - userBubble: { backgroundColor: tokens.colorNeutralBackground3, - borderTopLeftRadius: '4px', - }, - systemBubble: { - backgroundColor: tokens.colorBrandBackground, - color: tokens.colorNeutralForegroundInverted, - borderTopLeftRadius: '4px', - }, - assistantBubble: { - backgroundColor: 
tokens.colorNeutralBackground4, - borderTopRightRadius: '4px', }, }); @@ -350,20 +338,11 @@ export const PromptDetails = ({ messages, renderMarkdown }: { classes.messageRow, isFromUserSide ? classes.userMessageRow : classes.assistantMessageRow ); - - let messageBubble; - if (message.role.toLowerCase() === 'system') { - messageBubble = mergeClasses(classes.messageBubble, classes.systemBubble); - } else if (isFromUserSide) { - messageBubble = mergeClasses(classes.messageBubble, classes.userBubble); - } else { - messageBubble = mergeClasses(classes.messageBubble, classes.assistantBubble); - } return (
{message.participantName}
-
+
{renderMarkdown ? {message.content} :
{message.content}
From 882ead1df956f06ce4afa338ffe5b099187866c3 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 12 Mar 2025 20:28:24 -0700 Subject: [PATCH 18/18] Update test --- .../ResultsTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs index b4336ec802f..01241b5760b 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs @@ -193,6 +193,8 @@ await reportingConfiguration.CreateScenarioRunAsync( Assert.NotNull(metricB.Interpretation); Assert.True(metricB.Interpretation!.Failed); Assert.Null(metricC.Interpretation); + Assert.NotNull(metricD.Interpretation); + Assert.False(metricD.Interpretation!.Failed); Assert.False(result.ContainsDiagnostics()); }