Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,13 @@ await TimingHelper.ExecuteWithTimingAsync(() =>
string annotationResult = annotationResponse.Text;
EvaluationResult result = ContentSafetyService.ParseAnnotationResult(annotationResult);

UpdateMetrics();
EvaluationResult updatedResult = UpdateMetrics();
return updatedResult;

return result;

void UpdateMetrics()
EvaluationResult UpdateMetrics()
{
EvaluationResult updatedResult = new EvaluationResult();

foreach (EvaluationMetric metric in result.Metrics.Values)
{
string contentSafetyServiceMetricName = metric.Name;
Expand Down Expand Up @@ -185,7 +186,11 @@ void UpdateMetrics()
// metric.LogJsonData(payload);
// metric.LogJsonData(annotationResult);
#pragma warning restore S125

updatedResult.Metrics.Add(metric.Name, metric);
}

return updatedResult;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ internal static EvaluationResult ParseAnnotationResult(string annotationResponse
}
}

result.Metrics[metric.Name] = metric;
result.Metrics.Add(metric.Name, metric);
}

return result;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ await EvaluateContentSafetyAsync(

foreach (EvaluationMetric imageMetric in imageResult.Metrics.Values)
{
result.Metrics[imageMetric.Name] = imageMetric;
result.Metrics.Add(imageMetric.Name, imageMetric);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests;

[Experimental("AIEVAL001")]
public class QualityEvaluatorTests
{
private static readonly ChatOptions? _chatOptions;
Expand Down Expand Up @@ -47,9 +48,7 @@ static QualityEvaluatorTests()
string temperature = $"Temperature: {_chatOptions.Temperature}";
string usesContext = $"Feature: Context";

#pragma warning disable AIEVAL001
IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator();
#pragma warning restore AIEVAL001

IEvaluator coherenceEvaluator = new CoherenceEvaluator();
IEvaluator fluencyEvaluator = new FluencyEvaluator();
Expand Down Expand Up @@ -101,6 +100,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(6, result.Metrics.Count);
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _));
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _));
Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _));
Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _));
}

[ConditionalFact]
Expand Down Expand Up @@ -132,6 +139,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(6, result.Metrics.Count);
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _));
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _));
Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _));
Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _));
#if NET
});
#else
Expand Down Expand Up @@ -161,6 +176,17 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync(
Assert.True(
result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(4, result.Metrics.Count);
Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness));
Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence));
Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness));
Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval));

Assert.Null(groundedness.Context);
Assert.Null(equivalence.Context);
Assert.Null(completeness.Context);
Assert.Null(retrieval.Context);
}

[ConditionalFact]
Expand Down Expand Up @@ -224,6 +250,32 @@ await scenarioRun.EvaluateAsync(
groundingContextForGroundednessEvaluator,
groundTruthForCompletenessEvaluator,
retrievedContextChunksForRetrievalEvaluator]);

Assert.Equal(4, result.Metrics.Count);
Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness));
Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence));
Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness));
Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval));

Assert.True(
groundedness.Context?.Count is 1 &&
groundedness.Context.TryGetValue(GroundednessEvaluatorContext.GroundingContextName, out EvaluationContext? context1) &&
ReferenceEquals(context1, groundingContextForGroundednessEvaluator));

Assert.True(
equivalence.Context?.Count is 1 &&
equivalence.Context.TryGetValue(EquivalenceEvaluatorContext.GroundTruthContextName, out EvaluationContext? context2) &&
ReferenceEquals(context2, baselineResponseForEquivalenceEvaluator));

Assert.True(
completeness.Context?.Count is 1 &&
completeness.Context.TryGetValue(CompletenessEvaluatorContext.GroundTruthContextName, out EvaluationContext? context3) &&
ReferenceEquals(context3, groundTruthForCompletenessEvaluator));

Assert.True(
retrieval.Context?.Count is 1 &&
retrieval.Context.TryGetValue(RetrievalEvaluatorContext.RetrievedContextChunksContextName, out EvaluationContext? context4) &&
ReferenceEquals(context4, retrievedContextChunksForRetrievalEvaluator));
}

[MemberNotNull(nameof(_qualityReportingConfiguration))]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,33 @@ At its furthest point (conjunction), Mars is about 250 million miles from Earth.
The distance varies due to the elliptical orbits of both planets.
""";

IEnumerable<EvaluationContext> additionalContext =
[
new GroundednessProEvaluatorContext(groundingContext),
new UngroundedAttributesEvaluatorContext(groundingContext)
];
GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext);
UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext);
IEnumerable<EvaluationContext> additionalContext = [groundednessProContext, ungroundedAttributesContext];

EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response, additionalContext);

Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(6, result.Metrics.Count);
Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _));
Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro));
Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes));
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));

Assert.True(
groundednessPro.Context?.Count is 1 &&
groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) &&
ReferenceEquals(context1, groundednessProContext));

Assert.True(
ungroundedAttributes.Context?.Count is 1 &&
ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) &&
ReferenceEquals(context2, ungroundedAttributesContext));
}

[ConditionalFact]
Expand Down Expand Up @@ -212,17 +228,33 @@ At its closest (opposition), Jupiter is about 365 million miles away.
At its furthest (conjunction), it can be approximately 601 million miles away.
""";

IEnumerable<EvaluationContext> additionalContext =
[
new GroundednessProEvaluatorContext(groundingContext),
new UngroundedAttributesEvaluatorContext(groundingContext)
];
GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext);
UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext);
IEnumerable<EvaluationContext> additionalContext = [groundednessProContext, ungroundedAttributesContext];

EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response2, additionalContext);

Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(6, result.Metrics.Count);
Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _));
Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro));
Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes));
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));

Assert.True(
groundednessPro.Context?.Count is 1 &&
groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) &&
ReferenceEquals(context1, groundednessProContext));

Assert.True(
ungroundedAttributes.Context?.Count is 1 &&
ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) &&
ReferenceEquals(context2, ungroundedAttributesContext));
}

[ConditionalFact]
Expand Down Expand Up @@ -250,6 +282,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(7, result.Metrics.Count);
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
}

[ConditionalFact]
Expand Down Expand Up @@ -277,6 +318,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(7, result.Metrics.Count);
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
}

[ConditionalFact]
Expand Down Expand Up @@ -317,6 +367,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(7, result.Metrics.Count);
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
}

[ConditionalFact]
Expand Down Expand Up @@ -370,6 +429,15 @@ These distances are approximate and can vary slightly depending on the specific
Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(7, result.Metrics.Count);
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
}

[ConditionalFact]
Expand All @@ -396,6 +464,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Single(result.Metrics);
Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _));
}

[ConditionalFact]
Expand Down Expand Up @@ -434,6 +505,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Single(result.Metrics);
Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _));
}

[ConditionalFact]
Expand Down Expand Up @@ -465,6 +539,13 @@ await _mixedQualityAndSafetyReportingConfiguration.CreateScenarioRunAsync(
Assert.False(
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

Assert.Equal(5, result.Metrics.Count);
Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _));
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _));
Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _));
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
}

[MemberNotNull(nameof(_contentSafetyReportingConfiguration))]
Expand Down
Loading