From 32e53febfad072e52a78692b3129fce79a69c57e Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Wed, 2 Jul 2025 15:26:46 -0400 Subject: [PATCH 1/6] Add reporting tests that show NLP results. --- .../BLEUEvaluator.cs | 2 +- .../F1Evaluator.cs | 2 +- .../GLEUEvaluator.cs | 2 +- ...ons.AI.Evaluation.Integration.Tests.csproj | 1 + .../NLPEvaluatorTests.cs | 138 ++++++++++++++++++ 5 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index e1419bd630e..f3030ec7cfb 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -86,7 +86,7 @@ public ValueTask EvaluateAsync( }); metric.Value = score; - string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.AddOrUpdateContext(context); metric.Interpretation = metric.Interpret(); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs index b0806be6d66..e070577c448 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs @@ -77,7 +77,7 @@ public ValueTask EvaluateAsync( }); metric.Value = score; - string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.AddOrUpdateContext(context); metric.Interpretation = metric.Interpret(); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs index 0c9805ee108..60df30879a4 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs @@ -86,7 +86,7 @@ public ValueTask EvaluateAsync( }); metric.Value = score; - string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.AddOrUpdateContext(context); metric.Interpretation = metric.Interpret(); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj index c08667ff421..6e3332ebca6 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj @@ -28,6 +28,7 @@ + diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs new file mode 100644 index 00000000000..387bf4d968a --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs @@ -0,0 +1,138 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable CA2016 // Forward the 'CancellationToken' parameter to methods that take it. +#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.NLP; +using Microsoft.Extensions.AI.Evaluation.Reporting; +using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; +using Microsoft.Extensions.AI.Evaluation.Tests; +using Microsoft.TestUtilities; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; + +[Experimental("AIEVAL001")] +public class NLPEvaluatorTests +{ + private static readonly ChatOptions? _chatOptions; + private static readonly ReportingConfiguration? _nlpReportingConfiguration; + + static NLPEvaluatorTests() + { + if (Settings.Current.Configured) + { + IEvaluator bleuEvaluator = new BLEUEvaluator(); + IEvaluator gleuEvaluator = new GLEUEvaluator(); + IEvaluator f1Evaluator = new F1Evaluator(); + + _nlpReportingConfiguration = + DiskBasedReportingConfiguration.Create( + storageRootPath: Settings.Current.StorageRootPath, + evaluators: [bleuEvaluator, gleuEvaluator, f1Evaluator], + executionName: Constants.Version, + tags: []); + } + } + + [ConditionalFact] + public async Task ExactMatch() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _nlpReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(ExactMatch)}"); + + var referenceText = "The quick brown fox jumps over the lazy dog."; + var bleuContext = new BLEUEvaluatorContext(referenceText); + var gleuContext = new GLEUEvaluatorContext(referenceText); + var f1Context = new F1EvaluatorContext(referenceText); + + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "The quick brown fox jumps over the lazy dog.")); + + EvaluationResult result = await scenarioRun.EvaluateAsync(response, [bleuContext, gleuContext, f1Context]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(3, result.Metrics.Count); + Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _)); + } + + [ConditionalFact] + public async Task Unmatched() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _nlpReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(Unmatched)}"); + + var referenceText = "The quick brown fox jumps over the lazy dog."; + var bleuContext = new BLEUEvaluatorContext(referenceText); + var gleuContext = new GLEUEvaluatorContext(referenceText); + var f1Context = new F1EvaluatorContext(referenceText); + + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "What is the meaning of life?")); + + EvaluationResult result = await scenarioRun.EvaluateAsync(response, [bleuContext, gleuContext, f1Context]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(3, result.Metrics.Count); + Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _)); + } + + [ConditionalFact] + public async Task AdditionalContextIsNotPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _nlpReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(AdditionalContextIsNotPassed)}"); + + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "What is the meaning of life?")); + + EvaluationResult result = await scenarioRun.EvaluateAsync(response); + + Assert.True( + result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(3, result.Metrics.Count); + Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? bleu)); + Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? gleu)); + Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? f1)); + + Assert.Null(bleu.Context); + Assert.Null(gleu.Context); + Assert.Null(f1.Context); + + } + + [MemberNotNull(nameof(_nlpReportingConfiguration))] + private static void SkipIfNotConfigured() + { + if (!Settings.Current.Configured) + { + throw new SkipTestException("Test is not configured"); + } + + Assert.NotNull(_nlpReportingConfiguration); + } +} From b59747421bfc071b1cc131ac317b56433942ac88 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Wed, 2 Jul 2025 15:28:49 -0400 Subject: [PATCH 2/6] Cleanup analyzer errors. --- .../NLPEvaluatorTests.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs index 387bf4d968a..ef4d0e58215 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs @@ -4,15 +4,12 @@ #pragma warning disable CA2016 // Forward the 'CancellationToken' parameter to methods that take it. #pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. -using System; -using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Linq; using System.Threading.Tasks; using Microsoft.Extensions.AI.Evaluation.NLP; using Microsoft.Extensions.AI.Evaluation.Reporting; using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; -using Microsoft.Extensions.AI.Evaluation.Tests; using Microsoft.TestUtilities; using Xunit; @@ -21,7 +18,6 @@ namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; [Experimental("AIEVAL001")] public class NLPEvaluatorTests { - private static readonly ChatOptions? _chatOptions; private static readonly ReportingConfiguration? _nlpReportingConfiguration; static NLPEvaluatorTests() From 02d472c8c384bebc07a6f6661ab50af5e33bb2fa Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Wed, 2 Jul 2025 15:45:24 -0400 Subject: [PATCH 3/6] Add global tags for NLP --- .../NLPEvaluatorTests.cs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs index ef4d0e58215..d4a5b9795d1 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs @@ -4,6 +4,7 @@ #pragma warning disable CA2016 // Forward the 'CancellationToken' parameter to methods that take it. #pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. +using System; using System.Diagnostics.CodeAnalysis; using System.Linq; using System.Threading.Tasks; @@ -24,6 +25,11 @@ static NLPEvaluatorTests() { if (Settings.Current.Configured) { + string version = $"Product Version: {Constants.Version}"; + string date = $"Date: {DateTime.UtcNow:dddd, dd MMMM yyyy}"; + string projectName = $"Project: Integration Tests"; + string testClass = $"Test Class: {nameof(NLPEvaluatorTests)}"; + IEvaluator bleuEvaluator = new BLEUEvaluator(); IEvaluator gleuEvaluator = new GLEUEvaluator(); IEvaluator f1Evaluator = new F1Evaluator(); @@ -33,7 +39,7 @@ static NLPEvaluatorTests() storageRootPath: Settings.Current.StorageRootPath, evaluators: [bleuEvaluator, gleuEvaluator, f1Evaluator], executionName: Constants.Version, - tags: []); + tags: [version, date, projectName, testClass]); } } @@ -51,9 +57,7 @@ await _nlpReportingConfiguration.CreateScenarioRunAsync( var gleuContext = new GLEUEvaluatorContext(referenceText); var f1Context = new F1EvaluatorContext(referenceText); - var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "The quick brown fox jumps over the lazy dog.")); - - EvaluationResult result = await scenarioRun.EvaluateAsync(response, [bleuContext, gleuContext, f1Context]); + EvaluationResult result = await scenarioRun.EvaluateAsync(referenceText, [bleuContext, gleuContext, f1Context]); Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), @@ -79,9 +83,7 @@ await _nlpReportingConfiguration.CreateScenarioRunAsync( var gleuContext = new GLEUEvaluatorContext(referenceText); var f1Context = new F1EvaluatorContext(referenceText); - var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "What is the meaning of life?")); - - EvaluationResult result = await scenarioRun.EvaluateAsync(response, [bleuContext, gleuContext, f1Context]); + EvaluationResult result = await scenarioRun.EvaluateAsync("What is the meaning of life?", [bleuContext, gleuContext, f1Context]); Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), @@ -102,9 +104,7 @@ public async Task AdditionalContextIsNotPassed() await _nlpReportingConfiguration.CreateScenarioRunAsync( scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(AdditionalContextIsNotPassed)}"); - var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "What is the meaning of life?")); - - EvaluationResult result = await scenarioRun.EvaluateAsync(response); + EvaluationResult result = await scenarioRun.EvaluateAsync("What is the meaning of life?"); Assert.True( result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), From 84ecf9b617cb8efdcd4dd84a39abc0a520a009ef Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Wed, 2 Jul 2025 15:55:31 -0400 Subject: [PATCH 4/6] Add more precision to the evaluator timing --- .../EvaluationMetricExtensions.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs index d3012030cec..534f5e300f7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs @@ -177,7 +177,7 @@ public static void AddOrUpdateChatMetadata( if (duration is not null) { - string durationText = $"{duration.Value.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.Value.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); } } From f353a5d6f532d085fd5db2b0b2c727cd02085acd Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Wed, 2 Jul 2025 15:56:55 -0400 Subject: [PATCH 5/6] More tags --- .../NLPEvaluatorTests.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs index d4a5b9795d1..498938cb0ae 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs @@ -29,6 +29,7 @@ static NLPEvaluatorTests() string date = $"Date: {DateTime.UtcNow:dddd, dd MMMM yyyy}"; string projectName = $"Project: Integration Tests"; string testClass = $"Test Class: {nameof(NLPEvaluatorTests)}"; + string usesContext = $"Feature: Context"; IEvaluator bleuEvaluator = new BLEUEvaluator(); IEvaluator gleuEvaluator = new GLEUEvaluator(); @@ -39,7 +40,7 @@ static NLPEvaluatorTests() storageRootPath: Settings.Current.StorageRootPath, evaluators: [bleuEvaluator, gleuEvaluator, f1Evaluator], executionName: Constants.Version, - tags: [version, date, projectName, testClass]); + tags: [version, date, projectName, testClass, usesContext]); } } From 49881687d74faf1be1faa8c405901d6799e02792 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Wed, 2 Jul 2025 16:34:40 -0400 Subject: [PATCH 6/6] Add another partial match test --- .../NLPEvaluatorTests.cs | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs index 498938cb0ae..a4f3b75045a 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs @@ -70,6 +70,33 @@ await _nlpReportingConfiguration.CreateScenarioRunAsync( Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _)); } + [ConditionalFact] + public async Task PartialMatch() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _nlpReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(PartialMatch)}"); + + var referenceText = "The quick brown fox jumps over the lazy dog."; + var bleuContext = new BLEUEvaluatorContext(referenceText); + var gleuContext = new GLEUEvaluatorContext(referenceText); + var f1Context = new F1EvaluatorContext(referenceText); + + var similarText = "The brown fox quickly jumps over a lazy dog."; + EvaluationResult result = await scenarioRun.EvaluateAsync(similarText, [bleuContext, gleuContext, f1Context]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(3, result.Metrics.Count); + Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _)); + } + [ConditionalFact] public async Task Unmatched() { @@ -84,7 +111,7 @@ await _nlpReportingConfiguration.CreateScenarioRunAsync( var gleuContext = new GLEUEvaluatorContext(referenceText); var f1Context = new F1EvaluatorContext(referenceText); - EvaluationResult result = await scenarioRun.EvaluateAsync("What is the meaning of life?", [bleuContext, gleuContext, f1Context]); + EvaluationResult result = await scenarioRun.EvaluateAsync("What is life's meaning?", [bleuContext, gleuContext, f1Context]); Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),