@@ -7,93 +7,7 @@ public partial class RelevanceTruthAndCompletenessEvaluator
 {
     private static class Prompts
     {
-        internal static string BuildEvaluationPrompt(string userQuery, string modelResponse, string history)
-        {
-#pragma warning disable S103 // Lines should not be too long
-            return
-                $$"""
-                Read the History, User Query, and Model Response below and produce your response as a single JSON object.
-                Do not include any other text in your response besides the JSON object.
-
-                The JSON object should have the following format. However, do not include any markdown tags in your
-                response. Your response should start with an open curly brace and end with a closing curly brace for the
-                JSON.
-                ```
-                {
-                    "relevance": 1,
-                    "truth": 1,
-                    "completeness": 1
-                }
-                ```
-
-                -----
-
-                History: {{history}}
-
-                -----
-
-                User Query: {{userQuery}}
-
-                -----
-
-                Model Response: {{modelResponse}}
-
-                -----
-
-                That's the History, User Query, and Model Response you will rate. Now, in 3 Steps, you will evaluate the Model Response on 3 criteria.
-
-                -----
-
-                Step 1: Rate the relevance of the response.
-
-                Regardless of truth of statements, how much of the response is directly related to the request?
-
-                1 = None of the response is at all related
-                2 = A little of the response is directly related, or the response is a little bit related
-                3 = Some of the response is related, or the response is somewhat related
-                4 = Most of the response is related, or the response is mostly related
-                5 = 100% of the response is 100% related to the request
-
-                Record your response as the value of the "relevance" property in the JSON output.
-
-                -----
-
-                Step 2: Rate the truth of the response.
-
-                Read the History, Query, and Model Response again.
-
-                Regardless of relevance, how true are the verifiable statements in the response?
-
-                1 = The entire response is totally false
-                2 = A little of the response is true, or the response is a little bit true
-                3 = Some of the response is true, or the response is somewhat true
-                4 = Most of the response is true, or the response is mostly true
-                5 = 100% of the response is 100% true
-
-                Record your response as the value of the "truth" property in the JSON output.
-
-                -----
-
-                Step 3: Rate the completeness of the response.
-
-                Read the History, Query, and Model Response again.
-
-                Regardless of whether the statements made in the response are true, how many of the points necessary to address the request, does the response contain?
-
-                1 = The response omits all points that are necessary to address the request.
-                2 = The response includes a little of the points that are necessary to address the request.
-                3 = The response includes some of the points that are necessary to address the request.
-                4 = The response includes most of the points that are necessary to address the request.
-                5 = The response includes all points that are necessary to address the request. For explain tasks, nothing is left unexplained. For improve tasks, I looked for all potential improvements, and none were left out. For fix tasks, the response purports to get the user all the way to a fixed state (regardless of whether it actually works). For "do task" responses, it does everything requested.
-
-                Record your response as the value of the "completeness" property in the JSON output.
-
-                -----
-                """;
-#pragma warning restore S103
-        }
-
-        internal static string BuildEvaluationPromptWithReasoning(
+        internal static string BuildEvaluationPrompt(
             string userQuery,
             string modelResponse,
             string history)
@@ -23,11 +23,10 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality;
 /// <remarks>
 /// <see cref="RelevanceTruthAndCompletenessEvaluator"/> returns three <see cref="NumericMetric"/>s that contain scores
 /// for 'Relevance', 'Truth' and 'Completeness' respectively. Each score is a number between 1 and 5, with 1 indicating
-/// a poor score, and 5 indicating an excellent score.
+/// a poor score, and 5 indicating an excellent score. Each returned score is also accompanied by a
+/// <see cref="EvaluationMetric.Reason"/> that provides an explanation for the score.
 /// </remarks>
-/// <param name="options">Options for <see cref="RelevanceTruthAndCompletenessEvaluator"/>.</param>
-public sealed partial class RelevanceTruthAndCompletenessEvaluator(
-    RelevanceTruthAndCompletenessEvaluatorOptions? options = null) : ChatConversationEvaluator
+public sealed partial class RelevanceTruthAndCompletenessEvaluator : ChatConversationEvaluator
 {
     /// <summary>
     /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
@@ -61,9 +60,6 @@ public sealed partial class RelevanceTruthAndCompletenessEvaluator(
             ResponseFormat = ChatResponseFormat.Json
         };
 
-    private readonly RelevanceTruthAndCompletenessEvaluatorOptions _options =
-        options ?? RelevanceTruthAndCompletenessEvaluatorOptions.Default;
-
     /// <inheritdoc/>
     protected override EvaluationResult InitializeResult()
     {
@@ -101,17 +97,7 @@ userRequest is not null
 
         string renderedHistory = builder.ToString();
 
-        string prompt =
-            _options.IncludeReasoning
-                ? Prompts.BuildEvaluationPromptWithReasoning(
-                    renderedUserRequest,
-                    renderedModelResponse,
-                    renderedHistory)
-                : Prompts.BuildEvaluationPrompt(
-                    renderedUserRequest,
-                    renderedModelResponse,
-                    renderedHistory);
-
+        string prompt = Prompts.BuildEvaluationPrompt(renderedUserRequest, renderedModelResponse, renderedHistory);
         return prompt;
     }
 
@@ -192,23 +178,23 @@ void UpdateResult(Rating rating)
             relevance.Interpretation = relevance.InterpretScore();
             if (!string.IsNullOrWhiteSpace(rating.RelevanceReasoning))
             {
-                relevance.AddDiagnostic(EvaluationDiagnostic.Informational(rating.RelevanceReasoning!));
+                relevance.Reason = rating.RelevanceReasoning!;
             }
 
             NumericMetric truth = result.Get<NumericMetric>(TruthMetricName);
             truth.Value = rating.Truth;
             truth.Interpretation = truth.InterpretScore();
             if (!string.IsNullOrWhiteSpace(rating.TruthReasoning))
             {
-                truth.AddDiagnostic(EvaluationDiagnostic.Informational(rating.TruthReasoning!));
+                truth.Reason = rating.TruthReasoning!;
             }
 
             NumericMetric completeness = result.Get<NumericMetric>(CompletenessMetricName);
             completeness.Value = rating.Completeness;
             completeness.Interpretation = completeness.InterpretScore();
             if (!string.IsNullOrWhiteSpace(rating.CompletenessReasoning))
             {
-                completeness.AddDiagnostic(EvaluationDiagnostic.Informational(rating.CompletenessReasoning!));
+                completeness.Reason = rating.CompletenessReasoning!;
             }
 
             if (!string.IsNullOrWhiteSpace(rating.Error))

This file was deleted.
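For context, since the evaluator now surfaces the model's explanations through EvaluationMetric.Reason instead of informational diagnostics, here is a minimal consumption sketch. It is not part of this diff: it assumes an EvaluationResult already produced by a prior evaluation call, and it assumes the evaluator's public RelevanceMetricName, TruthMetricName, and CompletenessMetricName properties are the metric-name constants referenced in the changed code.

```csharp
// Sketch only: assumes an EvaluationResult produced by RelevanceTruthAndCompletenessEvaluator
// is already available, and that the *MetricName properties below are the evaluator's
// public metric-name constants.
using System;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.Quality;

internal static class RtcReportPrinter
{
    internal static void PrintScores(EvaluationResult result)
    {
        string[] metricNames =
        [
            RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName,
            RelevanceTruthAndCompletenessEvaluator.TruthMetricName,
            RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName,
        ];

        foreach (string name in metricNames)
        {
            // Each metric is a NumericMetric with a 1-5 score.
            NumericMetric metric = result.Get<NumericMetric>(name);

            // After this change, the explanation for each score is carried on
            // EvaluationMetric.Reason rather than attached as an informational diagnostic.
            Console.WriteLine($"{name}: {metric.Value}");
            Console.WriteLine($"  Reason: {metric.Reason}");
        }
    }
}
```

These are the same reason strings that the HTML report picks up via the new optional reason field on its metric types (see the TypeScript changes below).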

@@ -5,7 +5,6 @@ The .NET Foundation licenses this file to you under the MIT license.
 
 #root {
     margin: 0 auto;
-    padding: 2rem;
+    padding: 0rem 2rem 2rem 2rem;
     background-color: white;
 }
 
@@ -1,6 +1,9 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+import { useState } from 'react';
+import { Settings28Regular } from '@fluentui/react-icons';
+import { Drawer, DrawerBody, DrawerHeader, DrawerHeaderTitle, Switch } from '@fluentui/react-components';
 import { makeStyles } from '@fluentui/react-components';
 import './App.css';
 import { ScoreNode } from './Summary';
@@ -12,20 +15,44 @@ type AppProperties = {
 };
 
 const useStyles = makeStyles({
-    footerText: { fontSize: '0.8rem', marginTop: '2rem' }
-})
+    header: { display: 'flex', justifyContent: 'space-between', alignItems: 'center', position: 'sticky', top: 0, backgroundColor: 'white', zIndex: 1 },
+    footerText: { fontSize: '0.8rem', marginTop: '2rem' },
+    closeButton: { position: 'absolute', top: '1.5rem', right: '1rem', cursor: 'pointer', fontSize: '2rem' },
+    switchLabel: { fontSize: '1rem', paddingTop: '1rem' },
+    drawerBody: { paddingTop: '1rem' },
+});
 
-function App({dataset, tree}:AppProperties) {
+function App({ dataset, tree }: AppProperties) {
     const classes = useStyles();
+    const [isSettingsOpen, setIsSettingsOpen] = useState(false);
+    const [renderMarkdown, setRenderMarkdown] = useState(true);
+
+    const toggleSettings = () => setIsSettingsOpen(!isSettingsOpen);
+    const toggleRenderMarkdown = () => setRenderMarkdown(!renderMarkdown);
+    const closeSettings = () => setIsSettingsOpen(false);
+
     return (
         <>
-            <h1>AI Evaluation Report</h1>
+            <div className={classes.header}>
+                <h1>AI Evaluation Report</h1>
+                <Settings28Regular onClick={toggleSettings} style={{ cursor: 'pointer' }} />
+            </div>
 
-            <ScenarioGroup node={tree} />
+            <ScenarioGroup node={tree} renderMarkdown={renderMarkdown} />
 
             <p className={classes.footerText}>Generated at {dataset.createdAt} by Microsoft.Extensions.AI.Evaluation.Reporting version {dataset.generatorVersion}</p>
+
+            <Drawer open={isSettingsOpen} onOpenChange={toggleSettings} position='end'>
+                <DrawerHeader>
+                    <DrawerHeaderTitle>Settings</DrawerHeaderTitle>
+                    <span className={classes.closeButton} onClick={closeSettings}>&times;</span>
+                </DrawerHeader>
+                <DrawerBody className={classes.drawerBody}>
+                    <Switch checked={renderMarkdown} onChange={toggleRenderMarkdown} label={<span className={classes.switchLabel}>Render markdown for conversations</span>} />
+                </DrawerBody>
+            </Drawer>
         </>
-    )
+    );
 }
 
-export default App
+export default App;
@@ -65,20 +65,24 @@ type BaseEvaluationMetric = {
 
 type MetricWithNoValue = BaseEvaluationMetric & {
     $type: "none";
+    reason?: string;
     value: undefined;
 };
 
 type NumericMetric = BaseEvaluationMetric & {
     $type: "numeric";
+    reason?: string;
     value?: number;
 };
 
 type BooleanMetric = BaseEvaluationMetric & {
     $type: "boolean";
+    reason?: string;
     value?: boolean;
 };
 
 type StringMetric = BaseEvaluationMetric & {
     $type: "string";
+    reason?: string;
     value?: string;
 };