@@ -7,93 +7,7 @@ public partial class RelevanceTruthAndCompletenessEvaluator
 {
     private static class Prompts
     {
-        internal static string BuildEvaluationPrompt(string userQuery, string modelResponse, string history)
-        {
-#pragma warning disable S103 // Lines should not be too long
-            return
-                $$"""
-                Read the History, User Query, and Model Response below and produce your response as a single JSON object.
-                Do not include any other text in your response besides the JSON object.
-
-                The JSON object should have the following format. However, do not include any markdown tags in your
-                response. Your response should start with an open curly brace and end with a closing curly brace for the
-                JSON.
-                ```
-                {
-                    "relevance": 1,
-                    "truth": 1,
-                    "completeness": 1
-                }
-                ```
-
-                -----
-
-                History: {{history}}
-
-                -----
-
-                User Query: {{userQuery}}
-
-                -----
-
-                Model Response: {{modelResponse}}
-
-                -----
-
-                That's the History, User Query, and Model Response you will rate. Now, in 3 Steps, you will evaluate the Model Response on 3 criteria.
-
-                -----
-
-                Step 1: Rate the relevance of the response.
-
-                Regardless of truth of statements, how much of the response is directly related to the request?
-
-                1 = None of the response is at all related
-                2 = A little of the response is directly related, or the response is a little bit related
-                3 = Some of the response is related, or the response is somewhat related
-                4 = Most of the response is related, or the response is mostly related
-                5 = 100% of the response is 100% related to the request
-
-                Record your response as the value of the "relevance" property in the JSON output.
-
-                -----
-
-                Step 2: Rate the truth of the response.
-
-                Read the History, Query, and Model Response again.
-
-                Regardless of relevance, how true are the verifiable statements in the response?
-
-                1 = The entire response is totally false
-                2 = A little of the response is true, or the response is a little bit true
-                3 = Some of the response is true, or the response is somewhat true
-                4 = Most of the response is true, or the response is mostly true
-                5 = 100% of the response is 100% true
-
-                Record your response as the value of the "truth" property in the JSON output.
-
-                -----
-
-                Step 3: Rate the completeness of the response.
-
-                Read the History, Query, and Model Response again.
-
-                Regardless of whether the statements made in the response are true, how many of the points necessary to address the request, does the response contain?
-
-                1 = The response omits all points that are necessary to address the request.
-                2 = The response includes a little of the points that are necessary to address the request.
-                3 = The response includes some of the points that are necessary to address the request.
-                4 = The response includes most of the points that are necessary to address the request.
-                5 = The response includes all points that are necessary to address the request. For explain tasks, nothing is left unexplained. For improve tasks, I looked for all potential improvements, and none were left out. For fix tasks, the response purports to get the user all the way to a fixed state (regardless of whether it actually works). For "do task" responses, it does everything requested.
-
-                Record your response as the value of the "completeness" property in the JSON output.
-
-                -----
-                """;
-#pragma warning restore S103
-        }
-
-        internal static string BuildEvaluationPromptWithReasoning(
+        internal static string BuildEvaluationPrompt(
             string userQuery,
             string modelResponse,
             string history)
@@ -23,11 +23,10 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality;
 /// <remarks>
 /// <see cref="RelevanceTruthAndCompletenessEvaluator"/> returns three <see cref="NumericMetric"/>s that contain scores
 /// for 'Relevance', 'Truth' and 'Completeness' respectively. Each score is a number between 1 and 5, with 1 indicating
-/// a poor score, and 5 indicating an excellent score.
+/// a poor score, and 5 indicating an excellent score. Each returned score is also accompanied by a
+/// <see cref="EvaluationMetric.Reason"/> that provides an explanation for the score.
 /// </remarks>
-/// <param name="options">Options for <see cref="RelevanceTruthAndCompletenessEvaluator"/>.</param>
-public sealed partial class RelevanceTruthAndCompletenessEvaluator(
-    RelevanceTruthAndCompletenessEvaluatorOptions? options = null) : ChatConversationEvaluator
+public sealed partial class RelevanceTruthAndCompletenessEvaluator : ChatConversationEvaluator
 {
     /// <summary>
     /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
@@ -61,9 +60,6 @@ public sealed partial class RelevanceTruthAndCompletenessEvaluator(
             ResponseFormat = ChatResponseFormat.Json
         };
 
-    private readonly RelevanceTruthAndCompletenessEvaluatorOptions _options =
-        options ?? RelevanceTruthAndCompletenessEvaluatorOptions.Default;
-
     /// <inheritdoc/>
     protected override EvaluationResult InitializeResult()
     {
@@ -101,17 +97,7 @@ userRequest is not null
 
         string renderedHistory = builder.ToString();
 
-        string prompt =
-            _options.IncludeReasoning
-                ? Prompts.BuildEvaluationPromptWithReasoning(
-                    renderedUserRequest,
-                    renderedModelResponse,
-                    renderedHistory)
-                : Prompts.BuildEvaluationPrompt(
-                    renderedUserRequest,
-                    renderedModelResponse,
-                    renderedHistory);
-
+        string prompt = Prompts.BuildEvaluationPrompt(renderedUserRequest, renderedModelResponse, renderedHistory);
         return prompt;
     }
 
@@ -192,23 +178,23 @@ void UpdateResult(Rating rating)
             relevance.Interpretation = relevance.InterpretScore();
             if (!string.IsNullOrWhiteSpace(rating.RelevanceReasoning))
             {
-                relevance.AddDiagnostic(EvaluationDiagnostic.Informational(rating.RelevanceReasoning!));
+                relevance.Reason = rating.RelevanceReasoning!;
             }
 
             NumericMetric truth = result.Get<NumericMetric>(TruthMetricName);
             truth.Value = rating.Truth;
             truth.Interpretation = truth.InterpretScore();
             if (!string.IsNullOrWhiteSpace(rating.TruthReasoning))
             {
-                truth.AddDiagnostic(EvaluationDiagnostic.Informational(rating.TruthReasoning!));
+                truth.Reason = rating.TruthReasoning!;
             }
 
             NumericMetric completeness = result.Get<NumericMetric>(CompletenessMetricName);
             completeness.Value = rating.Completeness;
             completeness.Interpretation = completeness.InterpretScore();
             if (!string.IsNullOrWhiteSpace(rating.CompletenessReasoning))
             {
-                completeness.AddDiagnostic(EvaluationDiagnostic.Informational(rating.CompletenessReasoning!));
+                completeness.Reason = rating.CompletenessReasoning!;
             }
 
             if (!string.IsNullOrWhiteSpace(rating.Error))

This file was deleted.
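For context, since the evaluator now surfaces the model's explanations through EvaluationMetric.Reason instead of informational diagnostics, here is a minimal consumption sketch. It is not part of this diff: it assumes an EvaluationResult already produced by a prior evaluation call, and it assumes the evaluator's public RelevanceMetricName, TruthMetricName, and CompletenessMetricName properties are the metric-name constants referenced in the changed code.

```csharp
// Sketch only: assumes an EvaluationResult produced by RelevanceTruthAndCompletenessEvaluator
// is already available, and that the *MetricName properties below are the evaluator's
// public metric-name constants.
using System;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.Quality;

internal static class RtcReportPrinter
{
    internal static void PrintScores(EvaluationResult result)
    {
        string[] metricNames =
        [
            RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName,
            RelevanceTruthAndCompletenessEvaluator.TruthMetricName,
            RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName,
        ];

        foreach (string name in metricNames)
        {
            // Each metric is a NumericMetric with a 1-5 score.
            NumericMetric metric = result.Get<NumericMetric>(name);

            // After this change, the explanation for each score is carried on
            // EvaluationMetric.Reason rather than attached as an informational diagnostic.
            Console.WriteLine($"{name}: {metric.Value}");
            Console.WriteLine($"  Reason: {metric.Reason}");
        }
    }
}
```

These are the same reason strings that the HTML report picks up via the new optional reason field on its metric types (see the TypeScript changes below).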

@@ -5,7 +5,6 @@ The .NET Foundation licenses this file to you under the MIT license.
 
 #root {
     margin: 0 auto;
-    padding: 2rem;
+    padding: 0rem 2rem 2rem 2rem;
     background-color: white;
 }
 
@@ -1,6 +1,9 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+import { useState } from 'react';
+import { Settings28Regular } from '@fluentui/react-icons';
+import { Drawer, DrawerBody, DrawerHeader, DrawerHeaderTitle, Switch } from '@fluentui/react-components';
 import { makeStyles } from '@fluentui/react-components';
 import './App.css';
 import { ScoreNode } from './Summary';
@@ -12,20 +15,44 @@ type AppProperties = {
 };
 
 const useStyles = makeStyles({
-    footerText: { fontSize: '0.8rem', marginTop: '2rem' }
-})
+    header: { display: 'flex', justifyContent: 'space-between', alignItems: 'center', position: 'sticky', top: 0, backgroundColor: 'white', zIndex: 1 },
+    footerText: { fontSize: '0.8rem', marginTop: '2rem' },
+    closeButton: { position: 'absolute', top: '1.5rem', right: '1rem', cursor: 'pointer', fontSize: '2rem' },
+    switchLabel: { fontSize: '1rem', paddingTop: '1rem' },
+    drawerBody: { paddingTop: '1rem' },
+});
 
-function App({dataset, tree}:AppProperties) {
+function App({ dataset, tree }: AppProperties) {
     const classes = useStyles();
+    const [isSettingsOpen, setIsSettingsOpen] = useState(false);
+    const [renderMarkdown, setRenderMarkdown] = useState(true);
+
+    const toggleSettings = () => setIsSettingsOpen(!isSettingsOpen);
+    const toggleRenderMarkdown = () => setRenderMarkdown(!renderMarkdown);
+    const closeSettings = () => setIsSettingsOpen(false);
+
     return (
         <>
-            <h1>AI Evaluation Report</h1>
+            <div className={classes.header}>
+                <h1>AI Evaluation Report</h1>
+                <Settings28Regular onClick={toggleSettings} style={{ cursor: 'pointer' }} />
+            </div>
 
-            <ScenarioGroup node={tree} />
+            <ScenarioGroup node={tree} renderMarkdown={renderMarkdown} />
 
             <p className={classes.footerText}>Generated at {dataset.createdAt} by Microsoft.Extensions.AI.Evaluation.Reporting version {dataset.generatorVersion}</p>
+
+            <Drawer open={isSettingsOpen} onOpenChange={toggleSettings} position='end'>
+                <DrawerHeader>
+                    <DrawerHeaderTitle>Settings</DrawerHeaderTitle>
+                    <span className={classes.closeButton} onClick={closeSettings}>&times;</span>
+                </DrawerHeader>
+                <DrawerBody className={classes.drawerBody}>
+                    <Switch checked={renderMarkdown} onChange={toggleRenderMarkdown} label={<span className={classes.switchLabel}>Render markdown for conversations</span>} />
+                </DrawerBody>
+            </Drawer>
         </>
-    )
+    );
 }
 
-export default App
+export default App;
@@ -65,20 +65,24 @@ type BaseEvaluationMetric = {
 
 type MetricWithNoValue = BaseEvaluationMetric & {
     $type: "none";
+    reason?: string;
     value: undefined;
 };
 
 type NumericMetric = BaseEvaluationMetric & {
     $type: "numeric";
+    reason?: string;
     value?: number;
 };
 
 type BooleanMetric = BaseEvaluationMetric & {
     $type: "boolean";
+    reason?: string;
     value?: boolean;
 };
 
 type StringMetric = BaseEvaluationMetric & {
     $type: "string";
+    reason?: string;
     value?: string;
 };