From 7707cd8e459a82154b45f55abedfdaf0e5e39a5d Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Thu, 1 May 2025 15:14:16 -0700 Subject: [PATCH] Fix keys for EvaluationResult.Metrics dictionary to reflect the correct metric names for Safety evaluators This is an unfortunate regression that was introduced during a recent refactoring. The metrics returned from the Azure AI Foundry Evaluation service have different names than the ones we use in the Safety library. We translate the EvaluationMetric.Name name of the metrics returned by the service to the more display friendly names before returning the metrics to the caller. While the returned metrics were correctly patched up, the EvaluationResult.Metrics dictionary still stored metrics by the original names returned by the service. Unfortunately, this means EvaluationResult.Get now throws an exception when trying to fetch metric with name ViolenceEvaluator.ViolenceMetricName. The fix would be to patch up the keys in the dictionary as well. --- .../ContentSafetyEvaluator.cs | 13 ++- .../ContentSafetyService.cs | 2 +- .../ProtectedMaterialEvaluator.cs | 2 +- .../QualityEvaluatorTests.cs | 56 +++++++++- .../SafetyEvaluatorTests.cs | 101 ++++++++++++++++-- 5 files changed, 156 insertions(+), 18 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs index b42631c3c06..e69728a2880 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs @@ -151,12 +151,13 @@ await TimingHelper.ExecuteWithTimingAsync(() => string annotationResult = annotationResponse.Text; EvaluationResult result = ContentSafetyService.ParseAnnotationResult(annotationResult); - UpdateMetrics(); + EvaluationResult updatedResult = UpdateMetrics(); + return updatedResult; - return result; - - void UpdateMetrics() + EvaluationResult UpdateMetrics() { + EvaluationResult updatedResult = new EvaluationResult(); + foreach (EvaluationMetric metric in result.Metrics.Values) { string contentSafetyServiceMetricName = metric.Name; @@ -185,7 +186,11 @@ void UpdateMetrics() // metric.LogJsonData(payload); // metric.LogJsonData(annotationResult); #pragma warning restore S125 + + updatedResult.Metrics.Add(metric.Name, metric); } + + return updatedResult; } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs index 291952d761b..af72638dd6c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs @@ -120,7 +120,7 @@ internal static EvaluationResult ParseAnnotationResult(string annotationResponse } } - result.Metrics[metric.Name] = metric; + result.Metrics.Add(metric.Name, metric); } return result; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs index 25c99306c32..8def2e8a078 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs @@ -100,7 +100,7 @@ await EvaluateContentSafetyAsync( foreach (EvaluationMetric imageMetric in imageResult.Metrics.Values) { - result.Metrics[imageMetric.Name] = imageMetric; + result.Metrics.Add(imageMetric.Name, imageMetric); } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs index 63b9897abbf..90f7a2b29aa 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs @@ -18,6 +18,7 @@ namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; +[Experimental("AIEVAL001")] public class QualityEvaluatorTests { private static readonly ChatOptions? _chatOptions; @@ -47,9 +48,7 @@ static QualityEvaluatorTests() string temperature = $"Temperature: {_chatOptions.Temperature}"; string usesContext = $"Feature: Context"; -#pragma warning disable AIEVAL001 IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator(); -#pragma warning restore AIEVAL001 IEvaluator coherenceEvaluator = new CoherenceEvaluator(); IEvaluator fluencyEvaluator = new FluencyEvaluator(); @@ -101,6 +100,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _)); } [ConditionalFact] @@ -132,6 +139,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _)); #if NET }); #else @@ -161,6 +176,17 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync( Assert.True( result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(4, result.Metrics.Count); + Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness)); + Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence)); + Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness)); + Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval)); + + Assert.Null(groundedness.Context); + Assert.Null(equivalence.Context); + Assert.Null(completeness.Context); + Assert.Null(retrieval.Context); } [ConditionalFact] @@ -224,6 +250,32 @@ await scenarioRun.EvaluateAsync( groundingContextForGroundednessEvaluator, groundTruthForCompletenessEvaluator, retrievedContextChunksForRetrievalEvaluator]); + + Assert.Equal(4, result.Metrics.Count); + Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness)); + Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence)); + Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness)); + Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval)); + + Assert.True( + groundedness.Context?.Count is 1 && + groundedness.Context.TryGetValue(GroundednessEvaluatorContext.GroundingContextName, out EvaluationContext? context1) && + ReferenceEquals(context1, groundingContextForGroundednessEvaluator)); + + Assert.True( + equivalence.Context?.Count is 1 && + equivalence.Context.TryGetValue(EquivalenceEvaluatorContext.GroundTruthContextName, out EvaluationContext? context2) && + ReferenceEquals(context2, baselineResponseForEquivalenceEvaluator)); + + Assert.True( + completeness.Context?.Count is 1 && + completeness.Context.TryGetValue(CompletenessEvaluatorContext.GroundTruthContextName, out EvaluationContext? context3) && + ReferenceEquals(context3, groundTruthForCompletenessEvaluator)); + + Assert.True( + retrieval.Context?.Count is 1 && + retrieval.Context.TryGetValue(RetrievalEvaluatorContext.RetrievedContextChunksContextName, out EvaluationContext? context4) && + ReferenceEquals(context4, retrievedContextChunksForRetrievalEvaluator)); } [MemberNotNull(nameof(_qualityReportingConfiguration))] diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs index 9a9ac055ec3..be6e08c1f43 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs @@ -153,17 +153,33 @@ At its furthest point (conjunction), Mars is about 250 million miles from Earth. The distance varies due to the elliptical orbits of both planets. """; - IEnumerable additionalContext = - [ - new GroundednessProEvaluatorContext(groundingContext), - new UngroundedAttributesEvaluatorContext(groundingContext) - ]; + GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext); + UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext); + IEnumerable additionalContext = [groundednessProContext, ungroundedAttributesContext]; EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response, additionalContext); Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro)); + Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); + + Assert.True( + groundednessPro.Context?.Count is 1 && + groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) && + ReferenceEquals(context1, groundednessProContext)); + + Assert.True( + ungroundedAttributes.Context?.Count is 1 && + ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) && + ReferenceEquals(context2, ungroundedAttributesContext)); } [ConditionalFact] @@ -212,17 +228,33 @@ At its closest (opposition), Jupiter is about 365 million miles away. At its furthest (conjunction), it can be approximately 601 million miles away. """; - IEnumerable additionalContext = - [ - new GroundednessProEvaluatorContext(groundingContext), - new UngroundedAttributesEvaluatorContext(groundingContext) - ]; + GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext); + UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext); + IEnumerable additionalContext = [groundednessProContext, ungroundedAttributesContext]; EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response2, additionalContext); Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro)); + Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); + + Assert.True( + groundednessPro.Context?.Count is 1 && + groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) && + ReferenceEquals(context1, groundednessProContext)); + + Assert.True( + ungroundedAttributes.Context?.Count is 1 && + ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) && + ReferenceEquals(context2, ungroundedAttributesContext)); } [ConditionalFact] @@ -250,6 +282,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -277,6 +318,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -317,6 +367,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -370,6 +429,15 @@ These distances are approximate and can vary slightly depending on the specific Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -396,6 +464,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -434,6 +505,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -465,6 +539,13 @@ await _mixedQualityAndSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(5, result.Metrics.Count); + Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); } [MemberNotNull(nameof(_contentSafetyReportingConfiguration))]