diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs index 91d89d65531..d1dbc93a35b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs @@ -7,93 +7,7 @@ public partial class RelevanceTruthAndCompletenessEvaluator { private static class Prompts { - internal static string BuildEvaluationPrompt(string userQuery, string modelResponse, string history) - { -#pragma warning disable S103 // Lines should not be too long - return - $$""" - Read the History, User Query, and Model Response below and produce your response as a single JSON object. - Do not include any other text in your response besides the JSON object. - - The JSON object should have the following format. However, do not include any markdown tags in your - response. Your response should start with an open curly brace and end with a closing curly brace for the - JSON. - ``` - { - "relevance": 1, - "truth": 1, - "completeness": 1 - } - ``` - - ----- - - History: {{history}} - - ----- - - User Query: {{userQuery}} - - ----- - - Model Response: {{modelResponse}} - - ----- - - That's the History, User Query, and Model Response you will rate. Now, in 3 Steps, you will evaluate the Model Response on 3 criteria. - - ----- - - Step 1: Rate the relevance of the response. - - Regardless of truth of statements, how much of the response is directly related to the request? - - 1 = None of the response is at all related - 2 = A little of the response is directly related, or the response is a little bit related - 3 = Some of the response is related, or the response is somewhat related - 4 = Most of the response is related, or the response is mostly related - 5 = 100% of the response is 100% related to the request - - Record your response as the value of the "relevance" property in the JSON output. - - ----- - - Step 2: Rate the truth of the response. - - Read the History, Query, and Model Response again. - - Regardless of relevance, how true are the verifiable statements in the response? - - 1 = The entire response is totally false - 2 = A little of the response is true, or the response is a little bit true - 3 = Some of the response is true, or the response is somewhat true - 4 = Most of the response is true, or the response is mostly true - 5 = 100% of the response is 100% true - - Record your response as the value of the "truth" property in the JSON output. - - ----- - - Step 3: Rate the completeness of the response. - - Read the History, Query, and Model Response again. - - Regardless of whether the statements made in the response are true, how many of the points necessary to address the request, does the response contain? - - 1 = The response omits all points that are necessary to address the request. - 2 = The response includes a little of the points that are necessary to address the request. - 3 = The response includes some of the points that are necessary to address the request. - 4 = The response includes most of the points that are necessary to address the request. - 5 = The response includes all points that are necessary to address the request. For explain tasks, nothing is left unexplained. For improve tasks, I looked for all potential improvements, and none were left out. 
For fix tasks, the response purports to get the user all the way to a fixed state (regardless of whether it actually works). For "do task" responses, it does everything requested. - - Record your response as the value of the "completeness" property in the JSON output. - - ----- - """; -#pragma warning restore S103 - } - - internal static string BuildEvaluationPromptWithReasoning( + internal static string BuildEvaluationPrompt( string userQuery, string modelResponse, string history) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index 419feb45743..73a62970914 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -23,11 +23,10 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; /// /// returns three s that contain scores /// for 'Relevance', 'Truth' and 'Completeness' respectively. Each score is a number between 1 and 5, with 1 indicating -/// a poor score, and 5 indicating an excellent score. +/// a poor score, and 5 indicating an excellent score. Each returned score is also accompanied by a +/// that provides an explanation for the score. /// -/// Options for . -public sealed partial class RelevanceTruthAndCompletenessEvaluator( - RelevanceTruthAndCompletenessEvaluatorOptions? options = null) : ChatConversationEvaluator +public sealed partial class RelevanceTruthAndCompletenessEvaluator : ChatConversationEvaluator { /// /// Gets the of the returned by @@ -61,9 +60,6 @@ public sealed partial class RelevanceTruthAndCompletenessEvaluator( ResponseFormat = ChatResponseFormat.Json }; - private readonly RelevanceTruthAndCompletenessEvaluatorOptions _options = - options ?? RelevanceTruthAndCompletenessEvaluatorOptions.Default; - /// protected override EvaluationResult InitializeResult() { @@ -101,17 +97,7 @@ userRequest is not null string renderedHistory = builder.ToString(); - string prompt = - _options.IncludeReasoning - ? 
Prompts.BuildEvaluationPromptWithReasoning( - renderedUserRequest, - renderedModelResponse, - renderedHistory) - : Prompts.BuildEvaluationPrompt( - renderedUserRequest, - renderedModelResponse, - renderedHistory); - + string prompt = Prompts.BuildEvaluationPrompt(renderedUserRequest, renderedModelResponse, renderedHistory); return prompt; } @@ -192,7 +178,7 @@ void UpdateResult(Rating rating) relevance.Interpretation = relevance.InterpretScore(); if (!string.IsNullOrWhiteSpace(rating.RelevanceReasoning)) { - relevance.AddDiagnostic(EvaluationDiagnostic.Informational(rating.RelevanceReasoning!)); + relevance.Reason = rating.RelevanceReasoning!; } NumericMetric truth = result.Get(TruthMetricName); @@ -200,7 +186,7 @@ void UpdateResult(Rating rating) truth.Interpretation = truth.InterpretScore(); if (!string.IsNullOrWhiteSpace(rating.TruthReasoning)) { - truth.AddDiagnostic(EvaluationDiagnostic.Informational(rating.TruthReasoning!)); + truth.Reason = rating.TruthReasoning!; } NumericMetric completeness = result.Get(CompletenessMetricName); @@ -208,7 +194,7 @@ void UpdateResult(Rating rating) completeness.Interpretation = completeness.InterpretScore(); if (!string.IsNullOrWhiteSpace(rating.CompletenessReasoning)) { - completeness.AddDiagnostic(EvaluationDiagnostic.Informational(rating.CompletenessReasoning!)); + completeness.Reason = rating.CompletenessReasoning!; } if (!string.IsNullOrWhiteSpace(rating.Error)) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs deleted file mode 100644 index 9271b2cc4af..00000000000 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs +++ /dev/null @@ -1,41 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#pragma warning disable S3604 -// S3604: Member initializer values should not be redundant. -// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary -// constructor syntax. - -namespace Microsoft.Extensions.AI.Evaluation.Quality; - -/// -/// Options for . -/// -/// -/// If is set to , this instructs the -/// to include s (with -/// set to ) as -/// part of the returned s for 'Relevance' 'Truth' and 'Completeness' that explain the -/// reasoning behind the corresponding scores. By default, is set to -/// . -/// -public sealed class RelevanceTruthAndCompletenessEvaluatorOptions(bool includeReasoning = false) -{ - /// - /// Gets the default options for . - /// - /// - /// is set to by default. - /// - public static RelevanceTruthAndCompletenessEvaluatorOptions Default { get; } = - new RelevanceTruthAndCompletenessEvaluatorOptions(); - - /// - /// Gets a value indicating whether the should include - /// s (with set to - /// ) as part of the returned - /// s for 'Relevance' 'Truth' and 'Completeness' to explain the reasoning behind the - /// corresponding scores. By default, is set to . 
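With this change the evaluator always generates reasoning, so the IncludeReasoning switch and the informational diagnostics it used to control are gone; the explanation now travels on each metric's Reason property instead. A minimal consumption sketch, assuming the ScenarioRun setup used in the existing integration tests (scenarioRun, promptMessage and responseMessage come from that setup):

    // No options object is needed any more; reasoning is always populated.
    IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator();

    EvaluationResult result = await scenarioRun.EvaluateAsync(promptMessage, responseMessage);

    // Each score is a NumericMetric whose Reason carries the explanation that was
    // previously reported via EvaluationDiagnostic.Informational.
    NumericMetric relevance = result.Get<NumericMetric>(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName);
    Console.WriteLine($"{relevance.Value}: {relevance.Reason}");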
- /// - public bool IncludeReasoning { get; } = includeReasoning; -} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css index 0ab7c21274b..24695e5565d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.css @@ -5,7 +5,6 @@ The .NET Foundation licenses this file to you under the MIT license. #root { margin: 0 auto; - padding: 2rem; + padding: 0rem 2rem 2rem 2rem; background-color: white; } - diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx index cc2215f8e8e..b745dc2fe2d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx @@ -1,6 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +import { useState } from 'react'; +import { Settings28Regular } from '@fluentui/react-icons'; +import { Drawer, DrawerBody, DrawerHeader, DrawerHeaderTitle, Switch } from '@fluentui/react-components'; import { makeStyles } from '@fluentui/react-components'; import './App.css'; import { ScoreNode } from './Summary'; @@ -12,20 +15,44 @@ type AppProperties = { }; const useStyles = makeStyles({ - footerText: { fontSize: '0.8rem', marginTop: '2rem' } -}) + header: { display: 'flex', justifyContent: 'space-between', alignItems: 'center', position: 'sticky', top: 0, backgroundColor: 'white', zIndex: 1 }, + footerText: { fontSize: '0.8rem', marginTop: '2rem' }, + closeButton: { position: 'absolute', top: '1.5rem', right: '1rem', cursor: 'pointer', fontSize: '2rem' }, + switchLabel: { fontSize: '1rem', paddingTop: '1rem' }, + drawerBody: { paddingTop: '1rem' }, +}); -function App({dataset, tree}:AppProperties) { +function App({ dataset, tree }: AppProperties) { const classes = useStyles(); + const [isSettingsOpen, setIsSettingsOpen] = useState(false); + const [renderMarkdown, setRenderMarkdown] = useState(true); + + const toggleSettings = () => setIsSettingsOpen(!isSettingsOpen); + const toggleRenderMarkdown = () => setRenderMarkdown(!renderMarkdown); + const closeSettings = () => setIsSettingsOpen(false); + return ( <> -

AI Evaluation Report

+
+

AI Evaluation Report

+ +
- +

Generated at {dataset.createdAt} by Microsoft.Extensions.AI.Evaluation.Reporting version {dataset.generatorVersion}

+ + + + Settings + × + + + Render markdown for conversations} /> + + - ) + ); } -export default App +export default App; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts index 1055df330df..3877deccb8d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts @@ -65,20 +65,24 @@ type BaseEvaluationMetric = { type MetricWithNoValue = BaseEvaluationMetric & { $type: "none"; + reason?: string; value: undefined; }; type NumericMetric = BaseEvaluationMetric & { $type: "numeric"; + reason?: string; value?: number; }; type BooleanMetric = BaseEvaluationMetric & { $type: "boolean"; + reason?: string; value?: boolean; }; type StringMetric = BaseEvaluationMetric & { $type: "string"; + reason?: string; value?: string; }; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx index 504674bcab4..738bd51bcba 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/MetricCard.tsx @@ -2,18 +2,27 @@ // The .NET Foundation licenses this file to you under the MIT license. import { makeStyles, mergeClasses, tokens, Tooltip } from "@fluentui/react-components"; -import { DismissCircle16Regular, ErrorCircleRegular, Info16Regular, InfoRegular, Warning16Regular, WarningRegular } from "@fluentui/react-icons"; +import { DismissCircle16Regular, Info16Regular, Warning16Regular } from "@fluentui/react-icons"; const useCardListStyles = makeStyles({ metricCardList: { display: 'flex', gap: '1rem', flexWrap: 'wrap' }, }); -export const MetricCardList = ({ scenario }: { scenario: ScenarioRunResult }) => { +export const MetricCardList = ({ scenario, onMetricSelect, selectedMetric }: { + scenario: ScenarioRunResult, + onMetricSelect: (metric: MetricType | null) => void, + selectedMetric: MetricType | null +}) => { const classes = useCardListStyles(); return (
{Object.values(scenario.evaluationResult.metrics).map((metric, index) => ( - + onMetricSelect(selectedMetric === metric ? null : metric)} + isSelected={selectedMetric === metric} + /> ))}
); @@ -21,12 +30,63 @@ export const MetricCardList = ({ scenario }: { scenario: ScenarioRunResult }) => const useCardStyles = makeStyles({ card: { - display: 'flex', flexDirection: 'column', alignItems: 'center', gap: '0.5rem', - padding: '.75rem', border: '1px solid #e0e0e0', borderRadius: '4px', - minWidth: '8rem' + display: 'flex', + flexDirection: 'column', + alignItems: 'center', + gap: '0.5rem', + padding: '.75rem', + border: '1px solid #e0e0e0', + borderRadius: '4px', + width: '12rem', + cursor: 'pointer', + transition: 'box-shadow 0.2s ease-in-out, outline 0.2s ease-in-out', + position: 'relative', + '&:hover': { + opacity: 0.9, + boxShadow: '0 2px 4px rgba(0, 0, 0, 0.1)' + } + }, + selectedCard: { + zIndex: 1, + boxShadow: '0 4px 8px rgba(0, 0, 0, 0.15)', + outline: `2px solid ${tokens.colorNeutralForeground3}`, + outlineOffset: '0px', + border: 'none' + }, + metricNameText: { + fontSize: '1rem', + fontWeight: 'normal', + width: '80%', + textAlign: 'center', + overflow: 'hidden', + textOverflow: 'ellipsis', + lineHeight: '1.2', + height: '1.2em', + display: "block", + whiteSpace: 'nowrap', + marginTop: '-0.5rem', + }, + iconPlaceholder: { + height: '4px', + width: '100%', + position: 'relative', + marginBottom: '0', + }, + metricIcon: { + position: 'absolute', + top: '-0.25rem', + right: '-0.25rem', + }, + metricValueText: { + fontSize: '1rem', + fontWeight: 'bold', + width: '80%', + textAlign: 'center', + overflow: 'hidden', + textOverflow: 'ellipsis', + whiteSpace: 'nowrap', + maxHeight: '1.2em', }, - metricText: { fontSize: '1rem', fontWeight: 'normal' }, - valueText: { fontSize: '1.5rem', fontWeight: 'bold' }, scoreFgDefault: { color: tokens.colorNeutralStrokeAccessible }, scoreFg0: { color: tokens.colorStatusDangerForeground1 }, scoreFg1: { color: tokens.colorStatusDangerForeground2 }, @@ -79,91 +139,87 @@ const useCardColors = (interpretation?: EvaluationMetricInterpretation) => { return { fg, bg }; }; -type MetricType = StringMetric | NumericMetric | BooleanMetric | MetricWithNoValue; - -export const MetricCard = ({ metric }: { metric: MetricType }) => { +export type MetricType = StringMetric | NumericMetric | BooleanMetric | MetricWithNoValue; - let renderValue: (metric: MetricType) => React.ReactNode; - switch (metric.$type) { - case "string": - renderValue = (metric: MetricType) => <>{metric?.value ?? "??"}; - break; - case "boolean": - renderValue = (metric: MetricType) => <>{ - !metric || metric.value === undefined || metric.value === null ? - '??' : - metric.value ? 'Pass' : 'Fail'}; - break; - case "numeric": - renderValue = (metric: MetricType) => <>{metric?.value ?? "??"}; - break; - case "none": - renderValue = () => <>None; - break; - default: - throw new Error(`Unknown metric type: ${metric["$type"]}`); - } +export const MetricCard = ({ + metric, + onClick, + isSelected +}: { + metric: MetricType, + onClick: () => void, + isSelected: boolean +}) => { + const getValue = (metric: MetricType): string => { + switch (metric.$type) { + case "string": + return metric?.value ?? "??"; + case "boolean": + return !metric || metric.value === undefined || metric.value === null ? + '??' : + metric.value ? 'Pass' : 'Fail'; + case "numeric": + return metric?.value?.toString() ?? 
"??"; + case "none": + return "None"; + default: + throw new Error(`Unknown metric type: ${metric["$type"]}`); + } + }; + const metricValue = getValue(metric); const classes = useCardStyles(); const { fg, bg } = useCardColors(metric.interpretation); - const hasReason = metric.interpretation?.reason != null; + + const hasReasons = metric.reason != null || metric.interpretation?.reason != null; const hasInformationalMessages = metric.diagnostics.some((d: EvaluationDiagnostic) => d.severity == "informational"); const hasWarningMessages = metric.diagnostics.some((d: EvaluationDiagnostic) => d.severity == "warning"); const hasErrorMessages = metric.diagnostics.some((d: EvaluationDiagnostic) => d.severity == "error"); - const supportsHover = hasReason || hasInformationalMessages || hasWarningMessages || hasErrorMessages; - const card = - (
-
{metric.name} { (hasErrorMessages && ) || - (hasWarningMessages && ) || - ((hasInformationalMessages || hasReason) && )}
-
{renderValue(metric)}
-
); - if (supportsHover) { - return ( }} - relationship="description"> - {card} - ); - } else { - return card; + + const cardClass = mergeClasses( + bg, + classes.card, + isSelected ? classes.selectedCard : undefined + ); + + let statusIcon = null; + let statusTooltip = ''; + + if (hasErrorMessages) { + statusIcon = ; + statusTooltip = 'This metric has errors. Click the card to view more details.'; + } else if (hasWarningMessages) { + statusIcon = ; + statusTooltip = 'This metric has warnings. Click the card to view more details.'; + } else if (hasInformationalMessages || hasReasons) { + statusIcon = ; + statusTooltip = 'This metric has additional information. Click the card to view more details.'; } -}; - -const useDetailStyles = makeStyles({ - diagError: { fontStyle: tokens.fontFamilyMonospace, color: tokens.colorStatusDangerForeground2 }, - diagWarn: { fontStyle: tokens.fontFamilyMonospace, color: tokens.colorStatusWarningForeground2 }, - diagInfo: { fontStyle: tokens.fontFamilyMonospace }, -}); - -export const MetricDetails = ({ metric }: { metric: MetricWithNoValue | NumericMetric | BooleanMetric | StringMetric }) => { - const classes = useDetailStyles(); - const reason = metric.interpretation?.reason; - const failed = metric.interpretation?.failed ?? false; - const informationalMessages = metric.diagnostics.filter((d: EvaluationDiagnostic) => d.severity == "informational").map((d: EvaluationDiagnostic) => d.message); - const hasInformationalMessages = informationalMessages.length > 0; - const warningMessages = metric.diagnostics.filter((d: EvaluationDiagnostic) => d.severity == "warning").map((d: EvaluationDiagnostic) => d.message); - const hasWarningMessages = warningMessages.length > 0; - const errorMessages = metric.diagnostics.filter((d: EvaluationDiagnostic) => d.severity == "error").map((d: EvaluationDiagnostic) => d.message); - const hasErrorMessages = errorMessages.length > 0; - return ( + + const tooltipContent = (
- {reason &&
- {failed ? -

{reason}

: -

{reason}

- } -
} - {hasErrorMessages &&
- {errorMessages.map((message: string, index: number) => -

{message}

)} -
} - {hasWarningMessages &&
- {warningMessages.map((message: string, index: number) => -

{message}

)} -
} - {hasInformationalMessages &&
- {informationalMessages.map((message: string, index: number) => -

{message}

)} -
} -
); -}; \ No newline at end of file +
Name: {metric.name}
+
Value: {metricValue}
+ + ); + + return ( + +
+
+ {statusIcon && ( + + {statusIcon} + + )} +
+
+ {metric.name} +
+
+ {metricValue} +
+
+
+ ); +}; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx index 7ed42c44b16..154bbcd42b2 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScenarioTree.tsx @@ -1,15 +1,22 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -import { makeStyles, Switch, tokens, Tree, TreeItem, TreeItemLayout, TreeItemValue, TreeOpenChangeData, TreeOpenChangeEvent } from "@fluentui/react-components"; -import { useState, useCallback } from "react"; -import { DefaultRootNodeName, ScoreNode, ScoreNodeType, getPromptDetails } from "./Summary"; +import React, { useState, useCallback } from "react"; +import { makeStyles, tokens, Tree, TreeItem, TreeItemLayout, TreeItemValue, TreeOpenChangeData, TreeOpenChangeEvent, mergeClasses } from "@fluentui/react-components"; +import { DefaultRootNodeName, ScoreNode, ScoreNodeType, getPromptDetails, ChatMessageDisplay } from "./Summary"; import { PassFailBar } from "./PassFailBar"; -import { MetricCardList } from "./MetricCard"; +import { MetricCardList, type MetricType } from "./MetricCard"; import ReactMarkdown from "react-markdown"; -import { ErrorCircleRegular } from "@fluentui/react-icons"; +import { DismissCircle16Regular, Info16Regular, Warning16Regular } from "@fluentui/react-icons"; +import { ChevronDown12Regular, ChevronRight12Regular } from '@fluentui/react-icons'; -const ScenarioLevel = ({ node, parentPath, isOpen }: { node: ScoreNode, parentPath: string, isOpen: (path: string) => boolean }) => { +const ScenarioLevel = ({ node, parentPath, isOpen, renderMarkdown }: { + node: ScoreNode, + parentPath: string, + isOpen: (path: string) => boolean, + renderMarkdown: boolean, +}) => { + node.collapseSingleChildNodes(); const path = `${parentPath}.${node.name}`; if (node.isLeafNode) { return @@ -19,7 +26,7 @@ const ScenarioLevel = ({ node, parentPath, isOpen }: { node: ScoreNode, parentPa - + @@ -31,14 +38,14 @@ const ScenarioLevel = ({ node, parentPath, isOpen }: { node: ScoreNode, parentPa {node.childNodes.map((n) => ( - + ))} ; } }; -export const ScenarioGroup = ({ node }: { node: ScoreNode }) => { +export const ScenarioGroup = ({ node, renderMarkdown }: { node: ScoreNode, renderMarkdown: boolean }) => { const [openItems, setOpenItems] = useState>(() => new Set()); const handleOpenChange = useCallback((_: TreeOpenChangeEvent, data: TreeOpenChangeData) => { setOpenItems(data.openItems); @@ -47,39 +54,116 @@ export const ScenarioGroup = ({ node }: { node: ScoreNode }) => { return ( - - ); + + ); }; -export const ScoreDetail = ({ scenario }: { scenario: ScenarioRunResult }) => { +export const ScoreDetail = ({ scenario, renderMarkdown }: { scenario: ScenarioRunResult, renderMarkdown: boolean }) => { const classes = useStyles(); - - const failureMessages = []; - for (const e of Object.values(scenario.evaluationResult.metrics)) { - if (e.interpretation && e.interpretation.failed) { - failureMessages.push(e.interpretation.reason || "Metric failed."); - } - for (const d of e.diagnostics) { - if (d.severity === "error") { - failureMessages.push(d.message); - } - } - } - const {history, response} = getPromptDetails(scenario.messages, scenario.modelResponse); + const [selectedMetric, 
setSelectedMetric] = useState<MetricType | null>(null);
+    const { messages } = getPromptDetails(scenario.messages, scenario.modelResponse);

    return (
- - {failureMessages && failureMessages.length > 0 && } - + + {selectedMetric && } +
); }; +export const MetricDetailsSection = ({ metric }: { metric: MetricType }) => { + const classes = useStyles(); + const [isExpanded, setIsExpanded] = useState(true); + + const reason = metric.reason; + const hasReason = reason != null; + const interpretationReason = metric.interpretation?.reason; + const hasInterpretationReason = interpretationReason != null; + const diagnostics = metric.diagnostics || []; + const hasDiagnostics = diagnostics.length > 0; + + if (!hasReason && !hasInterpretationReason && !hasDiagnostics) return null; + + return ( +
+
setIsExpanded(!isExpanded)}> + {isExpanded ? : } +

Metric Details: {metric.name}

+
+ + {isExpanded && ( +
+ {hasReason && ( +
+
Evaluation Reason
+
+ {reason} +
+
+ )} + + {hasInterpretationReason && ( +
+ {metric.interpretation?.failed ? +
Failure Reason
: +
Interpretation Reason
+ } +
+ {metric.interpretation?.failed ? + {interpretationReason} : + {interpretationReason} + } +
+
+ )} + + {hasDiagnostics && ( +
+
Diagnostics
+ +
+ )} +
+ )} +
+ ); +}; + +const DiagnosticsContent = ({ diagnostics }: { diagnostics: EvaluationDiagnostic[] }) => { + const classes = useStyles(); + + const errorDiagnostics = diagnostics.filter(d => d.severity === "error"); + const warningDiagnostics = diagnostics.filter(d => d.severity === "warning"); + const infoDiagnostics = diagnostics.filter(d => d.severity === "informational"); + + return ( + <> + {errorDiagnostics.map((diag, index) => ( +
+ {diag.message} +
+ ))} + {warningDiagnostics.map((diag, index) => ( +
+ {diag.message} +
+ ))} + {infoDiagnostics.map((diag, index) => ( +
+ {diag.message} +
+ ))} + + ); +}; + const useStyles = makeStyles({ headerContainer: { display: 'flex', alignItems: 'center', flexDirection: 'row', gap: '0.5rem' }, promptHint: { fontFamily: tokens.fontFamilyMonospace, opacity: 0.6, fontSize: '0.7rem', paddingLeft: '1rem', whiteSpace: 'nowrap' }, - score: { - fontSize: tokens.fontSizeBase100, - }, + score: { fontSize: tokens.fontSizeBase200 }, passFailBadge: { display: 'flex', flexDirection: 'row', @@ -91,44 +175,103 @@ const useStyles = makeStyles({ scenarioLabel: { whiteSpace: 'nowrap', fontWeight: '500', - }, + fontSize: tokens.fontSizeBase300, + display: 'flex', + gap: '0.5rem', + alignItems: 'center', + }, + separator: { + color: tokens.colorNeutralForeground4, + fontSize: tokens.fontSizeBase200, + fontWeight: '300', + padding: '0 0.125rem', + }, iterationArea: { marginTop: '1rem', marginBottom: '1rem', }, + section: { + marginTop: '0.75rem', + }, + sectionHeader: { + display: 'flex', + alignItems: 'center', + cursor: 'pointer', + userSelect: 'none', + marginBottom: '0.5rem', + }, + sectionHeaderText: { + margin: 0, + marginLeft: '0.5rem', + fontSize: tokens.fontSizeBase300, + fontWeight: '500', + }, + sectionSubHeader: { + fontSize: tokens.fontSizeBase300, + fontWeight: '500', + marginBottom: '0.25rem', + }, + sectionContent: { + marginBottom: '0.75rem', + }, failMessage: { color: tokens.colorStatusDangerForeground2, + marginBottom: '0.25rem', + }, + warningMessage: { + color: tokens.colorStatusWarningForeground2, + marginBottom: '0.25rem', + }, + infoMessage: { + color: tokens.colorNeutralForeground1, + marginBottom: '0.25rem', }, failContainer: { padding: '1rem', border: '1px solid #e0e0e0', backgroundColor: tokens.colorNeutralBackground2, + cursor: 'text', }, - promptBox: { - border: '1px solid #e0e0e0', - borderRadius: '4px', - padding: '1rem', - maxHeight: '20rem', - overflow: 'auto', + sectionContainer: { + display: 'flex', + flexDirection: 'column', + gap: '0.75rem', + padding: '0.75rem 0', + cursor: 'text', + position: 'relative', + '& pre': { + whiteSpace: 'pre-wrap', + wordWrap: 'break-word', + }, }, - promptTitleLine: { + messageRow: { display: 'flex', - flexDirection: 'row', - alignItems: 'center', + flexDirection: 'column', + width: '60rem', + position: 'relative', + }, + userMessageRow: { + marginLeft: '0', + }, + assistantMessageRow: { + marginLeft: '10rem', + }, + messageParticipantName: { + fontSize: tokens.fontSizeBase200, + marginBottom: '0.25rem', + color: tokens.colorNeutralForeground3, + paddingLeft: '0.5rem', + }, + messageBubble: { + padding: '0.75rem 1rem', + borderRadius: '12px', + overflow: 'hidden', + wordBreak: 'break-word', + width: '100%', + backgroundColor: tokens.colorNeutralBackground3, }, - promptTitle: { flexGrow: 1 }, }); -export const FailMessage = ({ messages }: { messages: string[] }) => { - const classes = useStyles(); - return
-

Failure Reasons

-
- {messages.map((msg) => <> {msg}
)} -
-
; -}; - const PassFailBadge = ({ pass, total }: { pass: number, total: number }) => { const classes = useStyles(); return (
@@ -154,41 +297,62 @@ const ScoreNodeHeader = ({ item, showPrompt }: { item: ScoreNode, showPrompt?: b break; } + const parts = item.name.split(' / '); + return (
-
{item.name}
+
+ {parts.map((part, index) => ( + + {part} + {index < parts.length - 1 && /} + + ))} +
{showPrompt && item.shortenedPrompt &&
{item.shortenedPrompt}
}
); }; -export const PromptDetails = ({ history, response }: { history: string, response: string }) => { +export const PromptDetails = ({ messages, renderMarkdown }: { + messages: ChatMessageDisplay[], + renderMarkdown: boolean +}) => { const classes = useStyles(); - const [renderPrompt, setRenderPrompt] = useState(true); - const onChangeRenderPrompt = useCallback((ev: React.ChangeEvent) => { - setRenderPrompt(ev.currentTarget.checked); - }, [setRenderPrompt]); - const [renderResponse, setRenderResponse] = useState(true); - const onChangeRenderResponse = useCallback((ev: React.ChangeEvent) => { - setRenderResponse(ev.currentTarget.checked); - }, [setRenderResponse]); - - return (
-
-

Prompt

- -
+ const [isExpanded, setIsExpanded] = useState(true); -
- {renderPrompt ? {history} :
{history}
} -
+ const isUserSide = (role: string) => role.toLowerCase() === 'user' || role.toLowerCase() === 'system'; -
-

Response

- -
-
- {renderResponse ? {response} :
{response}
} + return ( +
+
setIsExpanded(!isExpanded)}> + {isExpanded ? : } +

Conversation

+
+ + {isExpanded && ( +
+ {messages.map((message, index) => { + const isFromUserSide = isUserSide(message.role); + const messageRowClass = mergeClasses( + classes.messageRow, + isFromUserSide ? classes.userMessageRow : classes.assistantMessageRow + ); + + return ( +
+
{message.participantName}
+
+ {renderMarkdown ? + {message.content} : +
{message.content}
+ } +
+
+ ); + })} +
+ )}
-
); + ); }; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts index 8cef12ce4f1..f49db7a2a60 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Summary.ts @@ -65,7 +65,6 @@ export class ScoreNode { } aggregate() { - // Reset node to defaults before recalculating this.failed = false; this.numPassingIterations = 0; this.numFailingIterations = 0; @@ -84,7 +83,15 @@ export class ScoreNode { this.numPassingIterations = this.failed ? 0 : 1; this.numFailingIterations = this.failed ? 1 : 0; const lastMessage = this.scenario?.messages[this.scenario?.messages.length - 1]; - const {history} = getPromptDetails(lastMessage ? [lastMessage] : [], this.scenario?.modelResponse); + + const { messages } = getPromptDetails(lastMessage ? [lastMessage] : [], this.scenario?.modelResponse); + let history = ""; + if (messages.length === 1) { + history = messages[0].content; + } else if (messages.length > 1) { + history = messages.map(m => `[${m.participantName}] ${m.content}`).join("\n\n"); + } + this.shortenedPrompt = shortenPrompt(history); } else { for (const child of this.childNodes) { @@ -104,7 +111,22 @@ export class ScoreNode { } } - + collapseSingleChildNodes() { + if (this.isLeafNode) { + return; + } + + while (this.childNodes.length === 1) { + const onlyChild = this.childNodes[0]; + this.name += ` / ${onlyChild.name}`; + this.children = onlyChild.children; + this.scenario = onlyChild.scenario; + } + + for (const child of this.childNodes) { + child.collapseSingleChildNodes(); + } + } }; export const DefaultRootNodeName = "All Evaluations"; @@ -145,25 +167,42 @@ const isTextContent = (content: AIContent): content is TextContent => { return (content as TextContent).text !== undefined; }; -export const getPromptDetails = (messages: ChatMessage[], modelResponse?: ChatResponse): {history:string, response: string}=> { - let history: string = ""; - if (messages.length === 1) { - history = messages[0].contents.map(c => (c as TextContent).text).join("\n"); - } else if (messages.length > 1) { - const historyItems: string[] = []; - for (const m of messages) { +export type ChatMessageDisplay = { + role: string; + participantName: string; + content: string; +}; + +export const getPromptDetails = (messages: ChatMessage[], modelResponse?: ChatResponse): { messages: ChatMessageDisplay[] } => { + const chatMessages: ChatMessageDisplay[] = []; + + for (const m of messages) { + for (const c of m.contents) { + if (isTextContent(c)) { + const participantName = m.authorName ? `${m.authorName} (${m.role})` : m.role; + chatMessages.push({ + role: m.role, + participantName: participantName, + content: c.text + }); + } + } + } + + if (modelResponse?.messages) { + for (const m of modelResponse.messages) { for (const c of m.contents) { if (isTextContent(c)) { - const historyItem = m.authorName - ? `[${m.authorName} (${m.role})] ${c.text}` : `[${m.role}] ${c.text}`; - historyItems.push(historyItem); + const participantName = m.authorName ? 
`${m.authorName} (${m.role})` : m.role || 'Assistant'; + chatMessages.push({ + role: m.role, + participantName: participantName, + content: c.text + }); } } } - history = historyItems.join("\n\n"); } - const response: string = modelResponse?.messages.map(m => m.contents.map(c => (c as TextContent).text).join("\n") ?? "").join("\n") ?? ""; - - return { history, response }; -}; \ No newline at end of file + return { messages: chatMessages }; +}; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs index bc71408ffa2..0edb9f8b0b4 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/BooleanMetric.cs @@ -9,4 +9,9 @@ namespace Microsoft.Extensions.AI.Evaluation; ///
/// The name of the . /// The value of the . -public sealed class BooleanMetric(string name, bool? value = null) : EvaluationMetric(name, value); +/// +/// An optional string that can be used to provide some commentary around the result represented by this +/// . +/// +public sealed class BooleanMetric(string name, bool? value = null, string? reason = null) + : EvaluationMetric(name, value, reason); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs index 78bb6831486..038599963af 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs @@ -15,17 +15,27 @@ namespace Microsoft.Extensions.AI.Evaluation; /// A base class that represents the result of an evaluation. /// /// The name of the . +/// +/// An optional string that can be used to provide some commentary around the result represented by this +/// . +/// [JsonDerivedType(typeof(NumericMetric), "numeric")] [JsonDerivedType(typeof(BooleanMetric), "boolean")] [JsonDerivedType(typeof(StringMetric), "string")] [JsonDerivedType(typeof(EvaluationMetric), "none")] -public class EvaluationMetric(string name) +public class EvaluationMetric(string name, string? reason = null) { /// /// Gets or sets the name of the . /// public string Name { get; set; } = name; + /// + /// Gets or sets a string that can optionally be used to provide some commentary around the result represented by + /// this . + /// + public string? Reason { get; set; } = reason; + /// /// Gets or sets an that identifies whether the result of the /// evaluation represented by the current is considered good or bad, passed or diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs index f0d6eea9d10..d2745069bc5 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric{T}.cs @@ -25,8 +25,12 @@ public class EvaluationMetric : EvaluationMetric /// /// The name of the . /// The value of the . - protected EvaluationMetric(string name, T? value) - : base(name) + /// + /// An optional string that can be used to provide some commentary around the result represented by this + /// . + /// + protected EvaluationMetric(string name, T? value, string? reason = null) + : base(name, reason) { Value = value; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs index 35dec86ca63..2a0a07c2193 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/NumericMetric.cs @@ -20,4 +20,9 @@ namespace Microsoft.Extensions.AI.Evaluation; /// /// The name of the . /// The value of the . -public sealed class NumericMetric(string name, double? value = null) : EvaluationMetric(name, value); +/// +/// An optional string that can be used to provide some commentary around the result represented by this +/// . +/// +public sealed class NumericMetric(string name, double? value = null, string? 
reason = null) + : EvaluationMetric(name, value, reason); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs index b80c16fbbd8..97fd10921bc 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/StringMetric.cs @@ -12,4 +12,9 @@ namespace Microsoft.Extensions.AI.Evaluation; /// /// The name of the . /// The value of the . -public sealed class StringMetric(string name, string? value = null) : EvaluationMetric(name, value); +/// +/// An optional string that can be used to provide some commentary around the result represented by this +/// . +/// +public sealed class StringMetric(string name, string? value = null, string? reason = null) + : EvaluationMetric(name, value, reason); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs index dbfdebc529c..8307dc38591 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/EndToEndTests.cs @@ -33,8 +33,7 @@ static EndToEndTests() if (Settings.Current.Configured) { - var options = new RelevanceTruthAndCompletenessEvaluatorOptions(includeReasoning: true); - IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator(options); + IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator(); IEvaluator coherenceEvaluator = new CoherenceEvaluator(); IEvaluator fluencyEvaluator = new FluencyEvaluator(); @@ -82,9 +81,9 @@ await _reportingConfiguration.CreateScenarioRunAsync( NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Diagnostics.Single().Message)); - Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Diagnostics.Single().Message)); - Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Diagnostics.Single().Message)); + Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Reason)); + Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Reason)); + Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Reason)); NumericMetric coherence = result.Get(CoherenceEvaluator.CoherenceMetricName); Assert.True(coherence.Value >= 4); @@ -133,9 +132,9 @@ await _reportingConfiguration.CreateScenarioRunAsync( NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Diagnostics.Single().Message)); - Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Diagnostics.Single().Message)); - Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Diagnostics.Single().Message)); + Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Reason)); + Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Reason)); + 
Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Reason)); NumericMetric coherence = result.Get(CoherenceEvaluator.CoherenceMetricName); Assert.True(coherence.Value >= 4); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/RelevanceTruthAndCompletenessEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/RelevanceTruthAndCompletenessEvaluatorTests.cs deleted file mode 100644 index 8b479ea57cf..00000000000 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/RelevanceTruthAndCompletenessEvaluatorTests.cs +++ /dev/null @@ -1,133 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Threading.Tasks; -using Microsoft.Extensions.AI.Evaluation.Quality; -using Microsoft.Extensions.AI.Evaluation.Reporting; -using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; -using Microsoft.TestUtilities; -using Xunit; - -namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; - -public class RelevanceTruthAndCompletenessEvaluatorTests -{ - private static readonly ChatOptions _chatOptions; - private static readonly ReportingConfiguration? _reportingConfigurationWithoutReasoning; - private static readonly ReportingConfiguration? _reportingConfigurationWithReasoning; - - static RelevanceTruthAndCompletenessEvaluatorTests() - { - _chatOptions = - new ChatOptions - { - Temperature = 0.0f, - ResponseFormat = ChatResponseFormat.Text - }; - - if (Settings.Current.Configured) - { - IEvaluator rtcEvaluatorWithoutReasoning = new RelevanceTruthAndCompletenessEvaluator(); - - _reportingConfigurationWithoutReasoning = - DiskBasedReportingConfiguration.Create( - storageRootPath: Settings.Current.StorageRootPath, - evaluators: [rtcEvaluatorWithoutReasoning], - chatConfiguration: Setup.CreateChatConfiguration(), - executionName: Constants.Version); - - var options = new RelevanceTruthAndCompletenessEvaluatorOptions(includeReasoning: true); - IEvaluator rtcEvaluatorWithReasoning = new RelevanceTruthAndCompletenessEvaluator(options); - - _reportingConfigurationWithReasoning = - DiskBasedReportingConfiguration.Create( - storageRootPath: Settings.Current.StorageRootPath, - evaluators: [rtcEvaluatorWithReasoning], - chatConfiguration: Setup.CreateChatConfiguration(), - executionName: Constants.Version); - } - } - - [ConditionalFact] - public async Task WithoutReasoning() - { - SkipIfNotConfigured(); - - await using ScenarioRun scenarioRun = - await _reportingConfigurationWithoutReasoning.CreateScenarioRunAsync( - scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(RelevanceTruthAndCompletenessEvaluatorTests)}.{nameof(WithoutReasoning)}"); - - IChatClient chatClient = scenarioRun.ChatConfiguration!.ChatClient; - - var messages = new List(); - string prompt = @"What is the molecular formula of ammonia?"; - ChatMessage promptMessage = prompt.ToUserMessage(); - messages.Add(promptMessage); - - ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptions); - ChatMessage responseMessage = response.Messages.Single(); - Assert.NotNull(responseMessage.Text); - - EvaluationResult result = await scenarioRun.EvaluateAsync(promptMessage, responseMessage); - - Assert.False(result.ContainsDiagnostics(d => d.Severity >= 
EvaluationDiagnosticSeverity.Informational)); - - NumericMetric relevance = result.Get(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName); - NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); - NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - - Assert.True(relevance.Value >= 4); - Assert.True(truth.Value >= 4); - Assert.True(completeness.Value >= 4); - } - - [ConditionalFact] - public async Task WithReasoning() - { - SkipIfNotConfigured(); - - await using ScenarioRun scenarioRun = - await _reportingConfigurationWithReasoning.CreateScenarioRunAsync( - scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(RelevanceTruthAndCompletenessEvaluatorTests)}.{nameof(WithReasoning)}"); - - IChatClient chatClient = scenarioRun.ChatConfiguration!.ChatClient; - - var messages = new List(); - string prompt = @"What is the molecular formula of glucose?"; - ChatMessage promptMessage = prompt.ToUserMessage(); - messages.Add(promptMessage); - - ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptions); - ChatMessage responseMessage = response.Messages.Single(); - Assert.NotNull(responseMessage.Text); - - EvaluationResult result = await scenarioRun.EvaluateAsync(promptMessage, responseMessage); - - Assert.True(result.ContainsDiagnostics(d => d.Severity == EvaluationDiagnosticSeverity.Informational)); - Assert.False(result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning)); - - NumericMetric relevance = result.Get(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName); - NumericMetric truth = result.Get(RelevanceTruthAndCompletenessEvaluator.TruthMetricName); - NumericMetric completeness = result.Get(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName); - - Assert.True(relevance.Value >= 4, string.Format("Relevance - Reasoning: {0}", relevance.Diagnostics.Single().Message)); - Assert.True(truth.Value >= 4, string.Format("Truth - Reasoning: {0}", truth.Diagnostics.Single().Message)); - Assert.True(completeness.Value >= 4, string.Format("Completeness - Reasoning: {0}", completeness.Diagnostics.Single().Message)); - } - - [MemberNotNull(nameof(_reportingConfigurationWithReasoning))] - [MemberNotNull(nameof(_reportingConfigurationWithoutReasoning))] - private static void SkipIfNotConfigured() - { - if (!Settings.Current.Configured) - { - throw new SkipTestException("Test is not configured"); - } - - Assert.NotNull(_reportingConfigurationWithReasoning); - Assert.NotNull(_reportingConfigurationWithoutReasoning); - } -} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs index 338532e5a3d..01241b5760b 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs @@ -151,7 +151,8 @@ public async Task ResultWithBooleanMetric() var metricA = new BooleanMetric("Metric with value false", false); var metricB = new BooleanMetric("Metric with value true", true); var metricC = new BooleanMetric("Metric without value"); - evaluator.TestMetrics = [metricA, metricB, metricC]; + var metricD = new BooleanMetric("Metric with reason", false, reason: "The reason"); + evaluator.TestMetrics = [metricA, metricB, metricC, metricD]; await using ScenarioRun scenarioRun = await 
reportingConfiguration.CreateScenarioRunAsync( @@ -163,6 +164,7 @@ await reportingConfiguration.CreateScenarioRunAsync( Assert.Null(metricA.Interpretation); Assert.Null(metricB.Interpretation); Assert.Null(metricC.Interpretation); + Assert.Null(metricD.Interpretation); Assert.False(result.ContainsDiagnostics()); } @@ -176,7 +178,8 @@ public async Task ResultWithBooleanMetricAndInterpretation() var metricA = new BooleanMetric("Metric with value false", false); var metricB = new BooleanMetric("Metric with value true", true); var metricC = new BooleanMetric("Metric without value"); - evaluator.TestMetrics = [metricA, metricB, metricC]; + var metricD = new BooleanMetric("Metric with reason", false, reason: "The reason"); + evaluator.TestMetrics = [metricA, metricB, metricC, metricD]; await using ScenarioRun scenarioRun = await reportingConfiguration.CreateScenarioRunAsync( @@ -190,6 +193,8 @@ await reportingConfiguration.CreateScenarioRunAsync( Assert.NotNull(metricB.Interpretation); Assert.True(metricB.Interpretation!.Failed); Assert.Null(metricC.Interpretation); + Assert.NotNull(metricD.Interpretation); + Assert.False(metricD.Interpretation!.Failed); Assert.False(result.ContainsDiagnostics()); } @@ -221,9 +226,9 @@ public async Task ResultWithStringMetric() var metricF = new StringMetric("Measurement System: Nautical", "Nautical"); var metricG = new StringMetric("Measurement System: Astronomical", "Astronomical"); var metricH = new StringMetric("Measurement System: Multiple", "Multiple"); - var metricI = new StringMetric("Measurement System: Blah", "Blah"); - var metricJ = new StringMetric("Measurement System: Empty", ""); - var metricK = new StringMetric("Measurement System: Null"); + var metricI = new StringMetric("Measurement System: Blah", "Blah", reason: "Value was unexpected"); + var metricJ = new StringMetric("Measurement System: Empty", "", reason: "Value was empty"); + var metricK = new StringMetric("Measurement System: Null", reason: "Value was null"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH, metricI, metricJ, metricK]; @@ -276,9 +281,9 @@ public async Task ResultWithStringMetricAndInterpretation() var metricF = new StringMetric("Measurement System: Nautical", "Nautical"); var metricG = new StringMetric("Measurement System: Astronomical", "Astronomical"); var metricH = new StringMetric("Measurement System: Multiple", "Multiple"); - var metricI = new StringMetric("Measurement System: Blah", "Blah"); - var metricJ = new StringMetric("Measurement System: Empty", ""); - var metricK = new StringMetric("Measurement System: Null"); + var metricI = new StringMetric("Measurement System: Blah", "Blah", reason: "Value was unexpected"); + var metricJ = new StringMetric("Measurement System: Empty", "", reason: "Value was empty"); + var metricK = new StringMetric("Measurement System: Null", reason: "Value was null"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH, metricI, metricJ, metricK]; @@ -322,14 +327,14 @@ public async Task ResultWithNumericMetrics() var evaluator = new TestEvaluator(); ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator); - var metricA = new NumericMetric("Metric with value 0", 0); - var metricB = new NumericMetric("Metric with value 1", 1); - var metricC = new NumericMetric("Metric with value 2", 2); - var metricD = new NumericMetric("Metric with value 3", 3); - var metricE = new NumericMetric("Metric with value 4", 4); - var 
metricF = new NumericMetric("Metric with value 5", 5); - var metricG = new NumericMetric("Metric with value 6", 6); - var metricH = new NumericMetric("Metric with no value"); + var metricA = new NumericMetric("Metric with value 0", 0, reason: "Because of reason A"); + var metricB = new NumericMetric("Metric with value 1", 1, reason: "Because of reason B"); + var metricC = new NumericMetric("Metric with value 2", 2, reason: "Because of reason C"); + var metricD = new NumericMetric("Metric with value 3", 3, reason: "Because of reason D"); + var metricE = new NumericMetric("Metric with value 4", 4, reason: "Because of reason E"); + var metricF = new NumericMetric("Metric with value 5", 5, reason: "Because of reason F"); + var metricG = new NumericMetric("Metric with value 6", 6, reason: "Because of reason G"); + var metricH = new NumericMetric("Metric with no value", reason: "Because of reason H"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH]; await using ScenarioRun scenarioRun = @@ -357,14 +362,14 @@ public async Task ResultWithNumericMetricsAndInterpretation() var evaluator = new TestEvaluator(); ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator); - var metricA = new NumericMetric("Metric with value 0", 0); - var metricB = new NumericMetric("Metric with value 1", 1); - var metricC = new NumericMetric("Metric with value 2", 2); - var metricD = new NumericMetric("Metric with value 3", 3); - var metricE = new NumericMetric("Metric with value 4", 4); - var metricF = new NumericMetric("Metric with value 5", 5); - var metricG = new NumericMetric("Metric with value 6", 6); - var metricH = new NumericMetric("Metric with no value"); + var metricA = new NumericMetric("Metric with value 0", 0, reason: "Because of reason A"); + var metricB = new NumericMetric("Metric with value 1", 1, reason: "Because of reason B"); + var metricC = new NumericMetric("Metric with value 2", 2, reason: "Because of reason C"); + var metricD = new NumericMetric("Metric with value 3", 3, reason: "Because of reason D"); + var metricE = new NumericMetric("Metric with value 4", 4, reason: "Because of reason E"); + var metricF = new NumericMetric("Metric with value 5", 5, reason: "Because of reason F"); + var metricG = new NumericMetric("Metric with value 6", 6, reason: "Because of reason G"); + var metricH = new NumericMetric("Metric with no value", reason: "Because of reason H"); evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH]; await using ScenarioRun scenarioRun = @@ -405,23 +410,28 @@ public async Task ResultWithDiagnosticsOnUninterpretedMetrics() metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics"); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.Reason = "Reason for metric 2"; var metric3 = new EvaluationMetric("Metric with error diagnostics only"); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.Reason = "Reason for metric 3"; HashSet 
allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only"); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only"); metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; @@ -452,23 +462,28 @@ public async Task ResultWithDiagnosticsOnFailingMetrics() metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics"); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.Reason = "Reason for metric 2"; var metric3 = new EvaluationMetric("Metric with error diagnostics only"); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only"); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only"); metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; @@ -505,23 +520,28 @@ public async Task ResultWithDiagnosticsOnPassingMetrics() metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics", value: true); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.Reason = "Reason for metric 2"; var metric3 = new NumericMetric("Metric with error diagnostics only", value: 5); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only", value: "A"); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only", value: 4); metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5];
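The optional reason parameter added here flows through the BooleanMetric, NumericMetric and StringMetric constructors into EvaluationMetric.Reason, independently of any diagnostics attached to the metric, and the reporting UI reads it via the new reason field on the metric types. A small sketch of the shapes these tests exercise (metric names and reason strings below are illustrative):

    // A reason can be supplied at construction time...
    var relevance = new NumericMetric("Relevance", value: 4, reason: "Most of the response addresses the question.");
    var grounded = new BooleanMetric("Grounded", value: true, reason: "All claims are supported by the supplied context.");

    // ...or set later, and it coexists with diagnostics rather than replacing them.
    var fluency = new NumericMetric("Fluency", value: 5);
    fluency.Reason = "No grammatical or syntactic errors were found.";
    fluency.AddDiagnostic(EvaluationDiagnostic.Informational("Evaluated with the default prompt."));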