diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs index b42631c3c06..e69728a2880 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs @@ -151,12 +151,13 @@ await TimingHelper.ExecuteWithTimingAsync(() => string annotationResult = annotationResponse.Text; EvaluationResult result = ContentSafetyService.ParseAnnotationResult(annotationResult); - UpdateMetrics(); + EvaluationResult updatedResult = UpdateMetrics(); + return updatedResult; - return result; - - void UpdateMetrics() + EvaluationResult UpdateMetrics() { + EvaluationResult updatedResult = new EvaluationResult(); + foreach (EvaluationMetric metric in result.Metrics.Values) { string contentSafetyServiceMetricName = metric.Name; @@ -185,7 +186,11 @@ void UpdateMetrics() // metric.LogJsonData(payload); // metric.LogJsonData(annotationResult); #pragma warning restore S125 + + updatedResult.Metrics.Add(metric.Name, metric); } + + return updatedResult; } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs index 291952d761b..af72638dd6c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs @@ -120,7 +120,7 @@ internal static EvaluationResult ParseAnnotationResult(string annotationResponse } } - result.Metrics[metric.Name] = metric; + result.Metrics.Add(metric.Name, metric); } return result; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs index 25c99306c32..8def2e8a078 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs @@ -100,7 +100,7 @@ await EvaluateContentSafetyAsync( foreach (EvaluationMetric imageMetric in imageResult.Metrics.Values) { - result.Metrics[imageMetric.Name] = imageMetric; + result.Metrics.Add(imageMetric.Name, imageMetric); } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs index 63b9897abbf..90f7a2b29aa 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs @@ -18,6 +18,7 @@ namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; +[Experimental("AIEVAL001")] public class QualityEvaluatorTests { private static readonly ChatOptions? _chatOptions; @@ -47,9 +48,7 @@ static QualityEvaluatorTests() string temperature = $"Temperature: {_chatOptions.Temperature}"; string usesContext = $"Feature: Context"; -#pragma warning disable AIEVAL001 IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator(); -#pragma warning restore AIEVAL001 IEvaluator coherenceEvaluator = new CoherenceEvaluator(); IEvaluator fluencyEvaluator = new FluencyEvaluator(); @@ -101,6 +100,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _)); } [ConditionalFact] @@ -132,6 +139,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _)); #if NET }); #else @@ -161,6 +176,17 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync( Assert.True( result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(4, result.Metrics.Count); + Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness)); + Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence)); + Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness)); + Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval)); + + Assert.Null(groundedness.Context); + Assert.Null(equivalence.Context); + Assert.Null(completeness.Context); + Assert.Null(retrieval.Context); } [ConditionalFact] @@ -224,6 +250,32 @@ await scenarioRun.EvaluateAsync( groundingContextForGroundednessEvaluator, groundTruthForCompletenessEvaluator, retrievedContextChunksForRetrievalEvaluator]); + + Assert.Equal(4, result.Metrics.Count); + Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness)); + Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence)); + Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness)); + Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval)); + + Assert.True( + groundedness.Context?.Count is 1 && + groundedness.Context.TryGetValue(GroundednessEvaluatorContext.GroundingContextName, out EvaluationContext? context1) && + ReferenceEquals(context1, groundingContextForGroundednessEvaluator)); + + Assert.True( + equivalence.Context?.Count is 1 && + equivalence.Context.TryGetValue(EquivalenceEvaluatorContext.GroundTruthContextName, out EvaluationContext? context2) && + ReferenceEquals(context2, baselineResponseForEquivalenceEvaluator)); + + Assert.True( + completeness.Context?.Count is 1 && + completeness.Context.TryGetValue(CompletenessEvaluatorContext.GroundTruthContextName, out EvaluationContext? context3) && + ReferenceEquals(context3, groundTruthForCompletenessEvaluator)); + + Assert.True( + retrieval.Context?.Count is 1 && + retrieval.Context.TryGetValue(RetrievalEvaluatorContext.RetrievedContextChunksContextName, out EvaluationContext? context4) && + ReferenceEquals(context4, retrievedContextChunksForRetrievalEvaluator)); } [MemberNotNull(nameof(_qualityReportingConfiguration))] diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs index 9a9ac055ec3..be6e08c1f43 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs @@ -153,17 +153,33 @@ At its furthest point (conjunction), Mars is about 250 million miles from Earth. The distance varies due to the elliptical orbits of both planets. """; - IEnumerable additionalContext = - [ - new GroundednessProEvaluatorContext(groundingContext), - new UngroundedAttributesEvaluatorContext(groundingContext) - ]; + GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext); + UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext); + IEnumerable additionalContext = [groundednessProContext, ungroundedAttributesContext]; EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response, additionalContext); Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro)); + Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); + + Assert.True( + groundednessPro.Context?.Count is 1 && + groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) && + ReferenceEquals(context1, groundednessProContext)); + + Assert.True( + ungroundedAttributes.Context?.Count is 1 && + ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) && + ReferenceEquals(context2, ungroundedAttributesContext)); } [ConditionalFact] @@ -212,17 +228,33 @@ At its closest (opposition), Jupiter is about 365 million miles away. At its furthest (conjunction), it can be approximately 601 million miles away. """; - IEnumerable additionalContext = - [ - new GroundednessProEvaluatorContext(groundingContext), - new UngroundedAttributesEvaluatorContext(groundingContext) - ]; + GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext); + UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext); + IEnumerable additionalContext = [groundednessProContext, ungroundedAttributesContext]; EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response2, additionalContext); Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(6, result.Metrics.Count); + Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro)); + Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); + + Assert.True( + groundednessPro.Context?.Count is 1 && + groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) && + ReferenceEquals(context1, groundednessProContext)); + + Assert.True( + ungroundedAttributes.Context?.Count is 1 && + ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) && + ReferenceEquals(context2, ungroundedAttributesContext)); } [ConditionalFact] @@ -250,6 +282,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -277,6 +318,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -317,6 +367,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -370,6 +429,15 @@ These distances are approximate and can vary slightly depending on the specific Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(7, result.Metrics.Count); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -396,6 +464,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -434,6 +505,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _)); } [ConditionalFact] @@ -465,6 +539,13 @@ await _mixedQualityAndSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.False( result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Equal(5, result.Metrics.Count); + Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); } [MemberNotNull(nameof(_contentSafetyReportingConfiguration))]