diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md index 580facd6294..dfc15311489 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md @@ -6,7 +6,7 @@ * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. * [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural -language processing tasks. Evaluators currently include BLEU score, with more planned. +language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index 8ce43d48e52..bfe1b9af589 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System; using System.Collections.Generic; using System.Globalization; using System.Linq; @@ -77,10 +78,10 @@ public ValueTask EvaluateAsync( return new ValueTask(result); } - var (score, duration) = TimingHelper.ExecuteWithTiming(() => + (double score, TimeSpan duration) = TimingHelper.ExecuteWithTiming(() => { - var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference)); - var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text); + string[][] references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference).ToArray()).ToArray(); + string[] hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text).ToArray(); return BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4); }); @@ -88,7 +89,7 @@ public ValueTask EvaluateAsync( string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.AddOrUpdateContext(context); - metric.Interpretation = NLPScoreInterpretation.Interpret(metric); + metric.Interpretation = metric.Interpret(); return new ValueTask(result); } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs index 320b20e9116..4085355db92 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs @@ -24,10 +24,10 @@ public sealed class BLEUEvaluatorContext : EvaluationContext /// Gets the unique that is used for /// . /// - public static string BLEUContextName => "BLEU Context"; + public static string ReferencesContextName => "References (BLEU)"; /// - /// Gets the reference responses against which the provided model response will be scored. + /// Gets the references against which the provided response will be scored. /// /// /// The measures the degree to which the response being evaluated is similar to @@ -41,8 +41,8 @@ public sealed class BLEUEvaluatorContext : EvaluationContext /// /// The reference responses against which the response that is being evaluated is compared. /// - public BLEUEvaluatorContext(params string[] references) - : this(references as IEnumerable) + public BLEUEvaluatorContext(IEnumerable references) + : this(references.ToArray()) { } @@ -52,11 +52,11 @@ public BLEUEvaluatorContext(params string[] references) /// /// The reference responses against which the response that is being evaluated is compared. /// - public BLEUEvaluatorContext(IEnumerable references) + public BLEUEvaluatorContext(params string[] references) : base( - name: BLEUContextName, + name: ReferencesContextName, contents: [.. references.Select(c => new TextContent(c))]) { - References = [.. 
references]; + References = references; } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs index c7420d0be7a..b5ffb0ba3d2 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs @@ -16,7 +16,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; /// internal static class BLEUAlgorithm { - internal static int ClosestRefLength(IEnumerable> references, int hypLength) + internal static int ClosestRefLength(string[][] references, int hypLength) { if (!references.Any()) { @@ -27,7 +27,7 @@ internal static int ClosestRefLength(IEnumerable> references int smallestDiff = int.MaxValue; foreach (var reference in references) { - int refLength = reference.Count(); + int refLength = reference.Length; int diff = Math.Abs(refLength - hypLength); if (diff < smallestDiff || (diff == smallestDiff && refLength < closestRefLength)) @@ -55,27 +55,27 @@ internal static double BrevityPenalty(int closestRefLength, int hypLength) return Math.Exp(1 - ((double)closestRefLength / hypLength)); } - internal static RationalNumber ModifiedPrecision(IEnumerable> references, IEnumerable hypothesis, int n = 1) + internal static RationalNumber ModifiedPrecision(string[][] references, string[] hypothesis, int n = 1) { if (n <= 0) { Throw.ArgumentOutOfRangeException(nameof(n), $"`{nameof(n)}` must be greater than zero."); } - if (!references.Any() || !hypothesis.Any()) + if (references.Length == 0 || hypothesis.Length == 0) { return RationalNumber.Zero; } - var hyp = hypothesis.CreateNGrams(n); - var hypCounts = new MatchCounter>(hyp); + List> hypGrams = hypothesis.CreateNGrams(n); + MatchCounter> hypCounts = new(hypGrams); Dictionary, int> maxCounts = []; foreach (var rf in references) { - IEnumerable> refGrams = rf.CreateNGrams(n); - var refCounts = new MatchCounter>(refGrams); + List> refGrams = rf.CreateNGrams(n); + MatchCounter> refCounts = new(refGrams); foreach (var ct in refCounts) { @@ -123,25 +123,28 @@ internal static double[] EqualWeights(int n) } double[] weights = new double[n]; +#if NET8_0_OR_GREATER + Array.Fill(weights, 1.0 / n); +#else for (int i = 0; i < n; i++) { weights[i] = 1.0 / n; } - +#endif return weights; } internal static readonly double[] DefaultBLEUWeights = EqualWeights(4); - internal static double SentenceBLEU(IEnumerable> references, IEnumerable hypothesis, + internal static double SentenceBLEU(string[][] references, string[] hypothesis, double[]? weights = null, Func? 
smoothingFunction = null) { - if (references == null || !references.Any()) + if (references == null || references.Length == 0) { Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty."); } - if (hypothesis == null || !hypothesis.Any()) + if (hypothesis == null || hypothesis.Length == 0) { Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty."); } @@ -171,7 +174,7 @@ internal static double SentenceBLEU(IEnumerable> references, precisionValues[i] = prec; } - int hypLen = hypothesis.Count(); + int hypLen = hypothesis.Length; int closestRefLength = ClosestRefLength(references, hypLen); double brevityPenalty = BrevityPenalty(closestRefLength, hypLen); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/F1Algorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/F1Algorithm.cs new file mode 100644 index 00000000000..cfc077e11a0 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/F1Algorithm.cs @@ -0,0 +1,44 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; + +/// +/// F1 score for a response is the ratio of the number of shared words between the generated response +/// and the reference response. Python implementation reference +/// https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py. +/// +internal static class F1Algorithm +{ + public static double CalculateF1Score(string[] groundTruth, string[] response) + { + if (groundTruth == null || groundTruth.Length == 0) + { + Throw.ArgumentNullException(nameof(groundTruth), $"'{nameof(groundTruth)}' cannot be null or empty."); + } + + if (response == null || response.Length == 0) + { + Throw.ArgumentNullException(nameof(response), $"'{nameof(response)}' cannot be null or empty."); + } + + MatchCounter referenceTokens = new(groundTruth); + MatchCounter predictionTokens = new(response); + MatchCounter commonTokens = referenceTokens.Intersect(predictionTokens); + int numCommonTokens = commonTokens.Sum(); + + if (numCommonTokens == 0) + { + return 0.0; // F1 score is 0 if there are no common tokens + } + else + { + double precision = (double)numCommonTokens / response.Length; + double recall = (double)numCommonTokens / groundTruth.Length; + double f1 = (2.0 * precision * recall) / (precision + recall); + return f1; + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/GLEUAlgorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/GLEUAlgorithm.cs new file mode 100644 index 00000000000..cd25b9beb5d --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/GLEUAlgorithm.cs @@ -0,0 +1,66 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; + +/// +/// Google-BLEU (GLEU) algorithm implementation for evaluating the quality of a response. +/// Python implementation reference: https://www.nltk.org/api/nltk.translate.gleu_score.html. 
+/// +internal static class GLEUAlgorithm +{ + internal static double SentenceGLEU(string[][] references, string[] hypothesis, int minN = 1, int maxN = 4) + { + if (references == null || references.Length == 0) + { + Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty."); + } + + if (hypothesis == null || hypothesis.Length == 0) + { + Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty."); + } + + MatchCounter> hypNGrams = new(hypothesis.CreateAllNGrams(minN, maxN)); + int truePosFalsePos = hypNGrams.Sum(); + + List<(int, int)> hypCounts = []; + foreach (var reference in references) + { + MatchCounter> refNGrams = new(reference.CreateAllNGrams(minN, maxN)); + int truePosFalseNeg = refNGrams.Sum(); + + MatchCounter> overlapNGrams = hypNGrams.Intersect(refNGrams); + int truePos = overlapNGrams.Sum(); + + int nAll = Math.Max(truePosFalsePos, truePosFalseNeg); + + if (nAll > 0) + { + hypCounts.Add((truePos, nAll)); + } + } + + int corpusNMatch = 0; + int corpusNAll = 0; + + foreach (var (truePos, nAll) in hypCounts) + { + corpusNMatch += truePos; + corpusNAll += nAll; + } + + if (corpusNAll == 0) + { + return 0.0; + } + else + { + return (double)corpusNMatch / corpusNAll; + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs index bbca2252057..b54c67d14a2 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs @@ -53,7 +53,26 @@ public void AddRange(IEnumerable items) } } - public string ToDebugString() => string.Concat(_counts.Select(v => $"{v.Key}: {v.Value}, ")); + public MatchCounter Intersect(MatchCounter other) + { + _ = Throw.IfNull(other, nameof(other)); + var intersection = new MatchCounter(); + + (Dictionary smaller, Dictionary larger) = + _counts.Count < other._counts.Count ? (_counts, other._counts) : (other._counts, _counts); + + foreach (var kvp in smaller) + { + if (larger.TryGetValue(kvp.Key, out int otherCount)) + { + intersection._counts[kvp.Key] = Math.Min(kvp.Value, otherCount); + } + } + + return intersection; + } + + public string ToDebugString() => string.Join(",", _counts.Select(v => $"{v.Key}: {v.Value}")); public IEnumerator> GetEnumerator() => _counts.GetEnumerator(); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs index 149d3820328..bde63f74c73 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs @@ -3,7 +3,6 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.Shared.Diagnostics; namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; @@ -14,12 +13,16 @@ internal static class NGramExtensions public static NGram CreateNGram(this ReadOnlySpan values) where T : IEquatable => new(values); + internal static List> CreateNGrams(this T[] input, int n) + where T : IEquatable + => CreateNGrams((ReadOnlySpan)input, n); + /// /// Create a sequence of n-grams from the input sequence. /// /// The input sequence of items. /// The size of each n-gram. 
- internal static IEnumerable> CreateNGrams(this IEnumerable input, int n) + internal static List> CreateNGrams(this ReadOnlySpan input, int n) where T : IEquatable { if (n <= 0) @@ -27,15 +30,56 @@ internal static IEnumerable> CreateNGrams(this IEnumerable input, Throw.ArgumentOutOfRangeException(nameof(n), $"'{nameof(n)}' must be greater than zero."); } - T[] output = [.. input.Take(n)]; + List> nGrams = []; + + ReadOnlySpan next = input.Slice(0, Math.Min(n, input.Length)); - while (output.Length == n) + while (next.Length == n) { - yield return new NGram(output); + nGrams.Add(new NGram(next)); - input = input.Skip(1); - output = [.. input.Take(n)]; + input = input.Slice(1); + next = input.Slice(0, Math.Min(n, input.Length)); } + + return nGrams; } + internal static List> CreateAllNGrams(this T[] input, int minN, int maxN = -1) + where T : IEquatable + => CreateAllNGrams((ReadOnlySpan)input, minN, maxN); + + /// + /// Create a sequence of all n-grams from the input sequence from minN to maxN. + /// + /// The input sequence of items. + /// The minimum size of n-gram. + /// The maximum size of n-gram. If not specified, the default is to include up to length of the input. + internal static List> CreateAllNGrams(this ReadOnlySpan input, int minN, int maxN = -1) + where T : IEquatable + { + _ = Throw.IfLessThanOrEqual(minN, 0, nameof(minN)); + + if (maxN < 0) + { + maxN = input.Length; // Update to use Length instead of Count() + } + else if (maxN < minN) + { + Throw.ArgumentOutOfRangeException(nameof(maxN), $"'{nameof(maxN)}' must be greater than or equal to '{nameof(minN)}'."); + } + + List> nGrams = []; + + for (int i = 0; i <= input.Length - minN; i++) + { + for (int s = minN; s <= maxN && s <= input.Length - i; s++) + { + nGrams.Add(new NGram(input.Slice(i, s))); + } + } + + return nGrams; + } } + diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NLPScoreInterpretation.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/ScoreInterpretationExtensions.cs similarity index 90% rename from src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NLPScoreInterpretation.cs rename to src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/ScoreInterpretationExtensions.cs index 4ef1d08b468..9fe6df64452 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NLPScoreInterpretation.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/ScoreInterpretationExtensions.cs @@ -3,9 +3,9 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; -internal static class NLPScoreInterpretation +internal static class ScoreInterpretationExtensions { - internal static EvaluationMetricInterpretation Interpret(NumericMetric metric) + internal static EvaluationMetricInterpretation Interpret(this NumericMetric metric) { // Many NLP scores range from 0.0 to 1.0, where: // - 0.0 means no match at all, diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs new file mode 100644 index 00000000000..b0806be6d66 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs @@ -0,0 +1,88 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.NLP; + +/// +/// An that evaluates the quality of a response produced by an AI model by comparing +/// it to a reference response using the F1 scoring algorithm. F1 score is the ratio of the number of shared +/// words between the generated response and the reference response. +/// +/// +/// +/// The computes the F1 score of a response ("hypothesis") in relation to a ground-truth reference +/// supplied by . The score is returned in a +/// with a value between 0.0 and 1.0 where 0.0 represents no match at all and 1.0 indicates a perfect match. +/// By default, the score is interpreted with a pass/fail cutoff of 0.5. So a score of 0.5 or higher is +/// passing and a score below 0.5 is failing. +/// +/// +public sealed class F1Evaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . + /// + public static string F1MetricName => "F1"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [F1MetricName]; + + /// + public ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + + var metric = new NumericMetric(F1MetricName); + var result = new EvaluationResult(metric); + + if (string.IsNullOrWhiteSpace(modelResponse.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + + return new ValueTask(result); + } + + if (additionalContext?.OfType().FirstOrDefault() + is not F1EvaluatorContext context) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"A value of type '{nameof(F1EvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + + return new ValueTask(result); + } + + (double score, TimeSpan duration) = TimingHelper.ExecuteWithTiming(() => + { + string[] reference = SimpleWordTokenizer.WordTokenize(context.GroundTruth).ToArray(); + string[] hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text).ToArray(); + return F1Algorithm.CalculateF1Score(reference, hypothesis); + }); + + metric.Value = score; + string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); + metric.AddOrUpdateContext(context); + metric.Interpretation = metric.Interpret(); + + return new ValueTask(result); + } + +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1EvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1EvaluatorContext.cs new file mode 100644 index 00000000000..d6dafcc3c6a --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1EvaluatorContext.cs @@ -0,0 +1,49 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable S3604 +// S3604: Member initializer values should not be redundant. 
+// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary +// constructor syntax. + +namespace Microsoft.Extensions.AI.Evaluation.NLP; + +/// +/// Contextual information that the uses to compute the F1 score for a response. +/// +/// +/// measures the F1 score of a response compared to a reference response that is supplied via +/// . F1 is a metric used to evaluate the quality of machine-generated text. It is the ratio +/// of the number of shared words between the generated response and the reference response. +/// +public sealed class F1EvaluatorContext : EvaluationContext +{ + /// + /// Gets the unique that is used for + /// . + /// + public static string GroundTruthContextName => "Ground Truth (F1)"; + + /// + /// Gets the reference response against which the provided response will be scored. + /// + /// + /// The measures the degree to which the response being evaluated is similar to + /// the response supplied via . The metric will be reported as an F1 score. + /// + public string GroundTruth { get; } + + /// + /// Initializes a new instance of the class. + /// + /// + /// The reference response against which the provided response will be scored. + /// + public F1EvaluatorContext(string groundTruth) + : base( + name: GroundTruthContextName, + content: groundTruth) + { + GroundTruth = groundTruth; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs new file mode 100644 index 00000000000..d33ed07f5cb --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs @@ -0,0 +1,97 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.NLP; + +/// +/// An that evaluates the quality of a response produced by an AI model by comparing +/// it to a reference response using the GLEU (Google-BLEU) algorithm. The GLEU evaluator measures the similarity +/// between the generated response and one or more reference responses using n-gram overlap. +/// +/// +/// +/// The computes the GLEU score of a response ("hypothesis") compared to one or more references +/// supplied via . The score is returned in a +/// with a value between 0.0 and 1.0 where 0.0 represents no match at all and 1.0 indicates a perfect match. +/// By default, the score is interpreted with a pass/fail cutoff of 0.5. So a score of 0.5 or higher is +/// passing and a score below 0.5 is failing. +/// +/// +public sealed class GLEUEvaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . + /// + public static string GLEUMetricName => "GLEU"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [GLEUMetricName]; + + /// + public ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? 
additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + + var metric = new NumericMetric(GLEUMetricName); + var result = new EvaluationResult(metric); + + if (string.IsNullOrWhiteSpace(modelResponse.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + + return new ValueTask(result); + } + + if (additionalContext?.OfType().FirstOrDefault() + is not GLEUEvaluatorContext context) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"A value of type '{nameof(GLEUEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + + return new ValueTask(result); + } + + if (context.References.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied '{nameof(GLEUEvaluatorContext)}' did not contain any '{nameof(GLEUEvaluatorContext.References)}'.")); + + return new ValueTask(result); + } + + (double score, TimeSpan duration) = TimingHelper.ExecuteWithTiming(() => + { + string[][] references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference).ToArray()).ToArray(); + string[] hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text).ToArray(); + return GLEUAlgorithm.SentenceGLEU(references, hypothesis); + }); + + metric.Value = score; + string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); + metric.AddOrUpdateContext(context); + metric.Interpretation = metric.Interpret(); + + return new ValueTask(result); + } + +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluatorContext.cs new file mode 100644 index 00000000000..b41b1f80f42 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluatorContext.cs @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable S3604 +// S3604: Member initializer values should not be redundant. +// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary +// constructor syntax. + +using System.Collections.Generic; +using System.Linq; + +namespace Microsoft.Extensions.AI.Evaluation.NLP; + +/// +/// Contextual information that the uses to compute the GLEU score for a response. +/// +/// +/// measures the GLEU score of a response compared to one or more reference responses +/// supplied via . GLEU (Google-BLEU) is a metric used to evaluate the quality of machine-generated text. +/// +public sealed class GLEUEvaluatorContext : EvaluationContext +{ + /// + /// Gets the unique that is used for + /// . + /// + public static string ReferencesContextName => "References (GLEU)"; + + /// + /// Gets the references against which the provided response will be scored. + /// + /// + /// The measures the degree to which the response being evaluated is similar to + /// the responses supplied via . The metric will be reported as a GLEU score. + /// + public IReadOnlyList References { get; } + + /// + /// Initializes a new instance of the class. + /// + /// + /// The reference responses against which the response that is being evaluated is compared. 
+ /// + public GLEUEvaluatorContext(IEnumerable references) + : this(references.ToArray()) + { + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// The reference responses against which the response that is being evaluated is compared. + /// + public GLEUEvaluatorContext(params string[] references) + : base( + name: ReferencesContextName, + contents: [.. references.Select(c => new TextContent(c))]) + { + References = references; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj index 0bab1cf7fb0..12e7cebb957 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj @@ -17,6 +17,7 @@ true + true diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md index 580facd6294..dfc15311489 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md @@ -6,7 +6,7 @@ * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. * [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural -language processing tasks. Evaluators currently include BLEU score, with more planned. +language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. 
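Taken together, the files above give BLEU, GLEU and F1 the same shape: an evaluator plus a context type that carries the reference text. The following is a minimal usage sketch of the two new evaluators; it mirrors the call pattern used in the unit tests later in this diff, and the sample strings, the top-level-statements console host and the `AIEVAL001` suppression are illustrative assumptions rather than part of this change.

```csharp
#pragma warning disable AIEVAL001 // The NLP evaluators are experimental and subject to change.

using System;
using System.Linq;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.NLP;

// Hypothetical model output and reference text, for illustration only.
var response = new ChatResponse(
    new ChatMessage(ChatRole.Assistant, "The quick brown fox jumps over the lazy dog."));
string groundTruth = "The quick brown fox jumps over the lazy dog.";

// F1Evaluator scores the response against a single ground-truth reference.
var f1Evaluator = new F1Evaluator();
EvaluationResult f1Result =
    await f1Evaluator.EvaluateAsync(response, null, [new F1EvaluatorContext(groundTruth)]);
NumericMetric f1Metric = f1Result.Metrics.Values.OfType<NumericMetric>().Single();
Console.WriteLine($"{f1Metric.Name}: {f1Metric.Value}");

// GLEUEvaluator scores the response against one or more references.
var gleuEvaluator = new GLEUEvaluator();
EvaluationResult gleuResult =
    await gleuEvaluator.EvaluateAsync(response, null, [new GLEUEvaluatorContext(groundTruth)]);
NumericMetric gleuMetric = gleuResult.Metrics.Values.OfType<NumericMetric>().Single();
Console.WriteLine($"{gleuMetric.Name}: {gleuMetric.Value}");
```

Both metrics fall between 0.0 and 1.0 and are interpreted with the default pass/fail cutoff of 0.5 described in the evaluator doc comments above.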
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs index c8dcbc996b7..cf4a9b17004 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Linq; namespace Microsoft.Extensions.AI.Evaluation.Quality; @@ -38,7 +39,7 @@ public sealed class IntentResolutionEvaluatorContext : EvaluationContext /// are defined as s. Any other definitions will be ignored. /// /// - public IntentResolutionEvaluatorContext(IEnumerable toolDefinitions) + public IntentResolutionEvaluatorContext(params AITool[] toolDefinitions) : base(name: IntentResolutionContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) { ToolDefinitions = [.. toolDefinitions]; @@ -57,8 +58,8 @@ public IntentResolutionEvaluatorContext(IEnumerable toolDefinitions) /// are defined as s. Any other definitions will be ignored. /// /// - public IntentResolutionEvaluatorContext(params AITool[] toolDefinitions) - : this(toolDefinitions as IEnumerable) + public IntentResolutionEvaluatorContext(IEnumerable toolDefinitions) + : this(toolDefinitions.ToArray()) { } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md index 580facd6294..dfc15311489 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md @@ -6,7 +6,7 @@ * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. * [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural -language processing tasks. Evaluators currently include BLEU score, with more planned. +language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. 
* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluatorContext.cs index 1b3f94bcdf9..50c80f42fa6 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluatorContext.cs @@ -41,12 +41,12 @@ public sealed class RetrievalEvaluatorContext : EvaluationContext /// /// The context chunks that were retrieved in response to the user request being evaluated. /// - public RetrievalEvaluatorContext(IEnumerable retrievedContextChunks) + public RetrievalEvaluatorContext(params string[] retrievedContextChunks) : base( name: RetrievedContextChunksContextName, contents: [.. retrievedContextChunks.Select(c => new TextContent(c))]) { - RetrievedContextChunks = [.. retrievedContextChunks]; + RetrievedContextChunks = retrievedContextChunks; } /// @@ -55,8 +55,8 @@ public RetrievalEvaluatorContext(IEnumerable retrievedContextChunks) /// /// The context chunks that were retrieved in response to the user request being evaluated. /// - public RetrievalEvaluatorContext(params string[] retrievedContextChunks) - : this(retrievedContextChunks as IEnumerable) + public RetrievalEvaluatorContext(IEnumerable retrievedContextChunks) + : this(retrievedContextChunks.ToArray()) { } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs index 3d54ed74dab..4557f2536d2 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Linq; namespace Microsoft.Extensions.AI.Evaluation.Quality; @@ -39,7 +40,7 @@ public sealed class TaskAdherenceEvaluatorContext : EvaluationContext /// are defined as s. Any other definitions will be ignored. /// /// - public TaskAdherenceEvaluatorContext(IEnumerable toolDefinitions) + public TaskAdherenceEvaluatorContext(params AITool[] toolDefinitions) : base(name: TaskAdherenceContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) { ToolDefinitions = [.. toolDefinitions]; @@ -58,8 +59,8 @@ public TaskAdherenceEvaluatorContext(IEnumerable toolDefinitions) /// are defined as s. Any other definitions will be ignored. 
/// /// - public TaskAdherenceEvaluatorContext(params AITool[] toolDefinitions) - : this(toolDefinitions as IEnumerable) + public TaskAdherenceEvaluatorContext(IEnumerable toolDefinitions) + : this(toolDefinitions.ToArray()) { } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs index d25e586163a..79ebc923d6c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Linq; namespace Microsoft.Extensions.AI.Evaluation.Quality; @@ -40,7 +41,7 @@ public sealed class ToolCallAccuracyEvaluatorContext : EvaluationContext /// are defined as s. Any other definitions will be ignored. /// /// - public ToolCallAccuracyEvaluatorContext(IEnumerable toolDefinitions) + public ToolCallAccuracyEvaluatorContext(params AITool[] toolDefinitions) : base(name: ToolCallAccuracyContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) { ToolDefinitions = [.. toolDefinitions]; @@ -59,8 +60,8 @@ public ToolCallAccuracyEvaluatorContext(IEnumerable toolDefinitions) /// are defined as s. Any other definitions will be ignored. /// /// - public ToolCallAccuracyEvaluatorContext(params AITool[] toolDefinitions) - : this(toolDefinitions as IEnumerable) + public ToolCallAccuracyEvaluatorContext(IEnumerable toolDefinitions) + : this(toolDefinitions.ToArray()) { } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md index 580facd6294..dfc15311489 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md @@ -6,7 +6,7 @@ * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. * [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural -language processing tasks. Evaluators currently include BLEU score, with more planned. +language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. 
* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/README.md index c21e2a299ad..dfc15311489 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/README.md @@ -5,6 +5,8 @@ * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. +* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural +language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. 
@@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation dotnet add package Microsoft.Extensions.AI.Evaluation.Quality dotnet add package Microsoft.Extensions.AI.Evaluation.Safety dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +dotnet add package Microsoft.Extensions.AI.Evaluation.NLP ``` Or directly in the C# project file: @@ -28,6 +31,7 @@ Or directly in the C# project file: + ``` diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md index e135ed24cfe..9bf406ba052 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md @@ -6,7 +6,7 @@ * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. * [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural -language processing tasks. Evaluators currently include BLEU score, with more planned. +language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md index 580facd6294..dfc15311489 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md @@ -6,7 +6,7 @@ * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. 
* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. * [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural -language processing tasks. Evaluators currently include BLEU score, with more planned. +language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs index 1b029dc4a37..9260a688cc4 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System; -using System.Collections.Generic; using System.Linq; using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; @@ -15,8 +14,8 @@ public class BLEUAlgorithmTests [Fact] public void ModifiedPrecisionTests() { - IEnumerable> references = ["the cat is on the mat".Split(' '), "there is a cat on the mat".Split(' ')]; - IEnumerable hypothesis = "the the the the the the the".Split(' '); + string[][] references = ["the cat is on the mat".Split(' '), "there is a cat on the mat".Split(' ')]; + string[] hypothesis = "the the the the the the the".Split(' '); RationalNumber prec = ModifiedPrecision(references, hypothesis, 1); Assert.Equal(0.2857, prec.ToDouble(), 4); @@ -36,8 +35,8 @@ public void ModifiedPrecisionTests() "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '), "It is the practical guide for the army always to heed the directions of the party".Split(' '), ]; - IEnumerable hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' '); - IEnumerable hypothesis2 = "It is to insure the troops forever hearing the activity guidebook that party direct".Split(' '); + string[] hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' '); + string[] hypothesis2 = "It is to insure the troops forever hearing the activity guidebook that party direct".Split(' '); prec = ModifiedPrecision(references, hypothesis1, 1); Assert.Equal(0.9444, prec.ToDouble(), 4); prec = ModifiedPrecision(references, hypothesis2, 1); @@ -76,63 +75,63 @@ public void SmoothingMethod4Tests(int[] num_denom, int hypLen, double[] vals) [Fact] public void TestBrevityPenalty() { - IEnumerable> references = [ - Enumerable.Repeat("a", 11), - Enumerable.Repeat("a", 8), + string[][] references = [ + [.. Enumerable.Repeat("a", 11)], + [.. Enumerable.Repeat("a", 8)], ]; - IEnumerable hypothesis = Enumerable.Repeat("a", 7); + string[] hypothesis = [.. Enumerable.Repeat("a", 7)]; int hypLength = hypothesis.Count(); int closestRefLength = ClosestRefLength(references, hypLength); double brevityPenalty = BrevityPenalty(closestRefLength, hypLength); Assert.Equal(0.8669, brevityPenalty, 4); references = [ - Enumerable.Repeat("a", 11), - Enumerable.Repeat("a", 8), - Enumerable.Repeat("a", 6), - Enumerable.Repeat("a", 7), + [.. Enumerable.Repeat("a", 11)], + [.. Enumerable.Repeat("a", 8)], + [.. Enumerable.Repeat("a", 6)], + [.. Enumerable.Repeat("a", 7)], ]; - hypothesis = Enumerable.Repeat("a", 7); + hypothesis = [.. Enumerable.Repeat("a", 7)]; hypLength = hypothesis.Count(); closestRefLength = ClosestRefLength(references, hypLength); brevityPenalty = BrevityPenalty(closestRefLength, hypLength); Assert.Equal(1.0, brevityPenalty, 4); references = [ - Enumerable.Repeat("a", 28), - Enumerable.Repeat("a", 28), + [.. Enumerable.Repeat("a", 28)], + [.. Enumerable.Repeat("a", 28)], ]; - hypothesis = Enumerable.Repeat("a", 12); + hypothesis = [.. Enumerable.Repeat("a", 12)]; hypLength = hypothesis.Count(); closestRefLength = ClosestRefLength(references, hypLength); brevityPenalty = BrevityPenalty(closestRefLength, hypLength); Assert.Equal(0.26359, brevityPenalty, 4); references = [ - Enumerable.Repeat("a", 13), - Enumerable.Repeat("a", 2), + [.. Enumerable.Repeat("a", 13)], + [.. Enumerable.Repeat("a", 2)], ]; - hypothesis = Enumerable.Repeat("a", 12); + hypothesis = [.. 
Enumerable.Repeat("a", 12)]; hypLength = hypothesis.Count(); closestRefLength = ClosestRefLength(references, hypLength); brevityPenalty = BrevityPenalty(closestRefLength, hypLength); Assert.Equal(0.9200, brevityPenalty, 4); references = [ - Enumerable.Repeat("a", 13), - Enumerable.Repeat("a", 11), + [.. Enumerable.Repeat("a", 13)], + [.. Enumerable.Repeat("a", 11)], ]; - hypothesis = Enumerable.Repeat("a", 12); + hypothesis = [.. Enumerable.Repeat("a", 12)]; hypLength = hypothesis.Count(); closestRefLength = ClosestRefLength(references, hypLength); brevityPenalty = BrevityPenalty(closestRefLength, hypLength); Assert.Equal(1.0, brevityPenalty, 4); references = [ - Enumerable.Repeat("a", 11), - Enumerable.Repeat("a", 13), + [.. Enumerable.Repeat("a", 11)], + [.. Enumerable.Repeat("a", 13)], ]; - hypothesis = Enumerable.Repeat("a", 12); + hypothesis = [.. Enumerable.Repeat("a", 12)]; hypLength = hypothesis.Count(); closestRefLength = ClosestRefLength(references, hypLength); brevityPenalty = BrevityPenalty(closestRefLength, hypLength); @@ -143,8 +142,8 @@ public void TestBrevityPenalty() [Fact] public void TestZeroMatches() { - IEnumerable> references = ["The candidate has no alignment to any of the references".Split(' '),]; - IEnumerable hypothesis = "John loves Mary".Split(' '); + string[][] references = ["The candidate has no alignment to any of the references".Split(' '),]; + string[] hypothesis = "John loves Mary".Split(' '); double score = SentenceBLEU(references, hypothesis, EqualWeights(hypothesis.Count())); Assert.Equal(0.0, score, 4); @@ -153,8 +152,8 @@ public void TestZeroMatches() [Fact] public void TestFullMatches() { - IEnumerable> references = ["John loves Mary".Split(' '),]; - IEnumerable hypothesis = "John loves Mary".Split(' '); + string[][] references = ["John loves Mary".Split(' '),]; + string[] hypothesis = "John loves Mary".Split(' '); double score = SentenceBLEU(references, hypothesis, EqualWeights(hypothesis.Count())); Assert.Equal(1.0, score, 4); @@ -163,8 +162,8 @@ public void TestFullMatches() [Fact] public void TestPartialMatchesHypothesisLongerThanReference() { - IEnumerable> references = ["John loves Mary".Split(' '),]; - IEnumerable hypothesis = "John loves Mary who loves Mike".Split(' '); + string[][] references = ["John loves Mary".Split(' '),]; + string[] hypothesis = "John loves Mary who loves Mike".Split(' '); double score = SentenceBLEU(references, hypothesis); Assert.Equal(0, score, 4); @@ -173,12 +172,12 @@ public void TestPartialMatchesHypothesisLongerThanReference() [Fact] public void TestSentenceBLEUExampleA() { - IEnumerable> references = [ + string[][] references = [ "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '), "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '), "It is the practical guide for the army always to heed the directions of the party".Split(' ') ]; - IEnumerable hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' '); + string[] hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' '); double score = SentenceBLEU(references, hypothesis); Assert.Equal(0.5046, score, 4); @@ -188,10 +187,10 @@ public void TestSentenceBLEUExampleA() [Fact] public void TestSentenceBLEUExampleB() { - IEnumerable> references = [ + string[][] references = [ "he was interested in world history because he read the 
book".Split(' '), ]; - IEnumerable hypothesis = "he read the book because he was interested in world history".Split(' '); + string[] hypothesis = "he read the book because he was interested in world history".Split(' '); double score = SentenceBLEU(references, hypothesis); Assert.Equal(0.74009, score, 4); @@ -200,12 +199,12 @@ public void TestSentenceBLEUExampleB() [Fact] public void TestSentenceBLEUExampleAWithWordTokenizer() { - IEnumerable> references = [ - SimpleWordTokenizer.WordTokenize("It is a guide to action that ensures that the military will forever heed Party commands"), - SimpleWordTokenizer.WordTokenize("It is the guiding principle which guarantees the military forces always being under the command of the Party"), - SimpleWordTokenizer.WordTokenize("It is the practical guide for the army always to heed the directions of the party") + string[][] references = [ + SimpleWordTokenizer.WordTokenize("It is a guide to action that ensures that the military will forever heed Party commands").ToArray(), + SimpleWordTokenizer.WordTokenize("It is the guiding principle which guarantees the military forces always being under the command of the Party").ToArray(), + SimpleWordTokenizer.WordTokenize("It is the practical guide for the army always to heed the directions of the party").ToArray(), ]; - IEnumerable hypothesis = SimpleWordTokenizer.WordTokenize("It is a guide to action which ensures that the military always obeys the commands of the party"); + string[] hypothesis = SimpleWordTokenizer.WordTokenize("It is a guide to action which ensures that the military always obeys the commands of the party").ToArray(); double score = SentenceBLEU(references, hypothesis); Assert.Equal(0.5046, score, 4); @@ -215,10 +214,10 @@ public void TestSentenceBLEUExampleAWithWordTokenizer() [Fact] public void TestSentenceBLEUExampleBWithWordTokenizer() { - IEnumerable> references = [ - SimpleWordTokenizer.WordTokenize("he was interested in world history because he read the book"), + string[][] references = [ + SimpleWordTokenizer.WordTokenize("he was interested in world history because he read the book").ToArray(), ]; - IEnumerable hypothesis = SimpleWordTokenizer.WordTokenize("he read the book because he was interested in world history"); + string[] hypothesis = SimpleWordTokenizer.WordTokenize("he read the book because he was interested in world history").ToArray(); double score = SentenceBLEU(references, hypothesis); Assert.Equal(0.74009, score, 4); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/F1EvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/F1EvaluatorTests.cs new file mode 100644 index 00000000000..52a87badf41 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/F1EvaluatorTests.cs @@ -0,0 +1,93 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.NLP; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; + +#pragma warning disable AIEVAL001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. 
+
+public class F1EvaluatorTests
+{
+    [Fact]
+    public async Task ReturnsPerfectScoreForIdenticalText()
+    {
+        var evaluator = new F1Evaluator();
+        var context = new F1EvaluatorContext("The quick brown fox jumps over the lazy dog.");
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "The quick brown fox jumps over the lazy dog."));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(F1Evaluator.F1MetricName, metric.Name);
+        Assert.Equal(1.0, (double)metric!.Value!, 4);
+        Assert.NotNull(metric.Interpretation);
+        Assert.Equal(EvaluationRating.Exceptional, metric.Interpretation.Rating);
+        Assert.False(metric.Interpretation.Failed);
+    }
+
+    [Fact]
+    public async Task ReturnsLowScoreForCompletelyDifferentText()
+    {
+        var evaluator = new F1Evaluator();
+        var context = new F1EvaluatorContext("The quick brown fox jumps over the lazy dog.");
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Completely unrelated sentence."));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(F1Evaluator.F1MetricName, metric.Name);
+        Assert.Equal(0.1429, (double)metric!.Value!, 4);
+        Assert.NotNull(metric.Interpretation);
+        Assert.Equal(EvaluationRating.Unacceptable, metric.Interpretation.Rating);
+        Assert.True(metric.Interpretation.Failed);
+    }
+
+    [Fact]
+    public async Task ReturnsErrorDiagnosticIfNoContext()
+    {
+        var evaluator = new F1Evaluator();
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Some text."));
+        var result = await evaluator.EvaluateAsync(response, null, null);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(F1Evaluator.F1MetricName, metric.Name);
+        Assert.NotNull(metric.Diagnostics);
+        Assert.Contains(metric.Diagnostics, d => d.Severity == EvaluationDiagnosticSeverity.Error);
+    }
+
+    [Theory]
+    [InlineData("the cat is on the mat",
+        "the the the the the the the", 0.30769)]
+    [InlineData("It is a guide to action that ensures that the military will forever heed Party commands",
+        "It is a guide to action which ensures that the military always obeys the commands of the party", 0.70589)]
+    [InlineData("It is the practical guide for the army always to heed the directions of the party",
+        "It is to insure the troops forever hearing the activity guidebook that party direct", 0.4000)]
+    public async Task SampleCases(string reference, string hypothesis, double score)
+    {
+        var evaluator = new F1Evaluator();
+        var context = new F1EvaluatorContext(reference);
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, hypothesis));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(F1Evaluator.F1MetricName, metric.Name);
+        Assert.Equal(score, (double)metric!.Value!, 4);
+    }
+
+    [Fact]
+    public async Task ReturnsErrorDiagnosticIfEmptyResponse()
+    {
+        var evaluator = new F1Evaluator();
+        var context = new F1EvaluatorContext("Reference text.");
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, ""));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(F1Evaluator.F1MetricName, metric.Name);
+        Assert.NotNull(metric.Diagnostics);
+        Assert.Contains(metric.Diagnostics, d => d.Severity == EvaluationDiagnosticSeverity.Error);
+    }
+
+}
diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/GLEUAlgorithmTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/GLEUAlgorithmTests.cs
new file mode 100644
index 00000000000..794b85c6595
--- /dev/null
+++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/GLEUAlgorithmTests.cs
@@ -0,0 +1,120 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Linq;
+using Microsoft.Extensions.AI.Evaluation.NLP.Common;
+using Xunit;
+using static Microsoft.Extensions.AI.Evaluation.NLP.Common.GLEUAlgorithm;
+
+namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests;
+
+public class GLEUAlgorithmTests
+{
+    [Fact]
+    public void TestZeroMatches()
+    {
+        string[][] references = ["The candidate has no alignment to any of the references".Split(' '),];
+        string[] hypothesis = "John loves Mary".Split(' ');
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.0, score, 4);
+    }
+
+    [Fact]
+    public void TestFullMatches()
+    {
+        string[][] references = ["John loves Mary".Split(' '),];
+        string[] hypothesis = "John loves Mary".Split(' ');
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(1.0, score, 4);
+    }
+
+    [Fact]
+    public void TestSentenceGLEUExampleA()
+    {
+        string[][] references = [
+            "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '),
+            "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '),
+            "It is the practical guide for the army always to heed the directions of the party".Split(' ')
+        ];
+        string[] hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' ');
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.2778, score, 4);
+    }
+
+    [Fact]
+    public void TestSentenceGLEUMilitaryExampleA()
+    {
+        string[][] references = [
+            "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '),
+        ];
+        string[] hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' ');
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.43939, score, 4);
+    }
+
+    [Fact]
+    public void TestSentenceGLEUMilitaryExampleB()
+    {
+        string[][] references = [
+            "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '),
+        ];
+        string[] hypothesis = "It is to insure the troops forever hearing the activity guidebook that party direct".Split(' ');
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.12069, score, 4);
+    }
+
+    [Fact]
+    public void TestSentenceGLEUExampleB()
+    {
+        string[][] references = [
+            "he was interested in world history because he read the book".Split(' '),
+        ];
+        string[] hypothesis = "he read the book because he was interested in world history".Split(' ');
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.7895, score, 4);
+    }
+
+    [Fact]
+    public void TestSentenceGLEUExampleAWithWordTokenizer()
+    {
+        string[][] references = [
+            SimpleWordTokenizer.WordTokenize("It is a guide to action that ensures that the military will forever heed Party commands").ToArray(),
+            SimpleWordTokenizer.WordTokenize("It is the guiding principle which guarantees the military forces always being under the command of the Party").ToArray(),
+            SimpleWordTokenizer.WordTokenize("It is the practical guide for the army always to heed the directions of the party").ToArray(),
+        ];
+        string[] hypothesis = SimpleWordTokenizer.WordTokenize("It is a guide to action which ensures that the military always obeys the commands of the party").ToArray();
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.2980, score, 4);
+
+    }
+
+    [Fact]
+    public void TestSentenceGLEUExampleBWithWordTokenizer()
+    {
+        string[][] references = [
+            SimpleWordTokenizer.WordTokenize("he was interested in world history because he read the book").ToArray(),
+        ];
+        string[] hypothesis = SimpleWordTokenizer.WordTokenize("he read the book because he was interested in world history").ToArray();
+
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.7895, score, 4);
+    }
+
+    [Fact]
+    public void TestSentenceGLEUCatExample()
+    {
+        string[][] references = [
+            "the cat is on the mat".Split(' '),
+        ];
+        string[] hypothesis = "the the the the the the the".Split(' ');
+        double score = SentenceGLEU(references, hypothesis);
+        Assert.Equal(0.0909, score, 4);
+    }
+}
diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/GLEUEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/GLEUEvaluatorTests.cs
new file mode 100644
index 00000000000..f27d11e4e2e
--- /dev/null
+++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/GLEUEvaluatorTests.cs
@@ -0,0 +1,113 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI.Evaluation.NLP;
+using Xunit;
+
+namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests;
+
+#pragma warning disable AIEVAL001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+
+public class GLEUEvaluatorTests
+{
+    [Fact]
+    public async Task ReturnsPerfectScoreForIdenticalText()
+    {
+        var evaluator = new GLEUEvaluator();
+        var context = new GLEUEvaluatorContext("The quick brown fox jumps over the lazy dog.");
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "The quick brown fox jumps over the lazy dog."));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(GLEUEvaluator.GLEUMetricName, metric.Name);
+        Assert.Equal(1.0, (double)metric!.Value!, 4);
+        Assert.NotNull(metric.Interpretation);
+        Assert.Equal(EvaluationRating.Exceptional, metric.Interpretation.Rating);
+        Assert.False(metric.Interpretation.Failed);
+    }
+
+    [Fact]
+    public async Task ReturnsLowScoreForCompletelyDifferentText()
+    {
+        var evaluator = new GLEUEvaluator();
+        var context = new GLEUEvaluatorContext("The quick brown fox jumps over the lazy dog.");
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Completely unrelated sentence."));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(GLEUEvaluator.GLEUMetricName, metric.Name);
+        Assert.Equal(0.02939, (double)metric!.Value!, 4);
+        Assert.NotNull(metric.Interpretation);
+        Assert.Equal(EvaluationRating.Unacceptable, metric.Interpretation.Rating);
+        Assert.True(metric.Interpretation.Failed);
+    }
+
+    [Fact]
+    public async Task ReturnsErrorDiagnosticIfNoContext()
+    {
+        var evaluator = new GLEUEvaluator();
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Some text."));
+        var result = await evaluator.EvaluateAsync(response, null, null);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(GLEUEvaluator.GLEUMetricName, metric.Name);
+        Assert.NotNull(metric.Diagnostics);
+        Assert.Contains(metric.Diagnostics, d => d.Severity == EvaluationDiagnosticSeverity.Error);
+    }
+
+    [Theory]
+    [InlineData("the cat is on the mat",
+        "the the the the the the the", 0.0909)]
+    [InlineData("It is a guide to action that ensures that the military will forever heed Party commands",
+        "It is a guide to action which ensures that the military always obeys the commands of the party", 0.4545)]
+    [InlineData("It is the practical guide for the army always to heed the directions of the party",
+        "It is to insure the troops forever hearing the activity guidebook that party direct", 0.12069)]
+    public async Task SampleCases(string reference, string hypothesis, double score)
+    {
+        var evaluator = new GLEUEvaluator();
+        var context = new GLEUEvaluatorContext(reference);
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, hypothesis));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(GLEUEvaluator.GLEUMetricName, metric.Name);
+        Assert.Equal(score, (double)metric!.Value!, 4);
+    }
+
+    [Fact]
+    public async Task MultipleReferences()
+    {
+        string[] references = [
+            "It is a guide to action that ensures that the military will forever heed Party commands",
+            "It is the guiding principle which guarantees the military forces always being under the command of the Party",
+            "It is the practical guide for the army always to heed the directions of the party",
+        ];
+        string hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party";
+
+        var evaluator = new GLEUEvaluator();
+        var context = new GLEUEvaluatorContext(references);
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, hypothesis));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(GLEUEvaluator.GLEUMetricName, metric.Name);
+        Assert.Equal(0.29799, (double)metric!.Value!, 4);
+    }
+
+    [Fact]
+    public async Task ReturnsErrorDiagnosticIfEmptyResponse()
+    {
+        var evaluator = new GLEUEvaluator();
+        var context = new GLEUEvaluatorContext("Reference text.");
+        var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, ""));
+        var result = await evaluator.EvaluateAsync(response, null, [context]);
+        var metric = Assert.Single(result.Metrics.Values) as NumericMetric;
+        Assert.NotNull(metric);
+        Assert.Equal(GLEUEvaluator.GLEUMetricName, metric.Name);
+        Assert.NotNull(metric.Diagnostics);
+        Assert.Contains(metric.Diagnostics, d => d.Severity == EvaluationDiagnosticSeverity.Error);
+    }
+
+}
diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs
index 9c2a5b68900..71765ca3eff 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs
@@ -1,6 +1,7 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Collections.Generic;
 using System.Linq;
 using Microsoft.Extensions.AI.Evaluation.NLP.Common;
 using Xunit;
@@ -62,4 +63,21 @@ public void ToDebugString_FormatsCorrectly()
         Assert.Contains("x: 2", str);
         Assert.Contains("y: 1", str);
     }
+
+    [Fact]
+    public void Intersect_ReturnsCorrectIntersection()
+    {
+        MatchCounter<int> counter1 = new(new[] { 1, 2, 2, 3 });
+        MatchCounter<int> counter2 = new(new[] { 2, 2, 4 });
+
+        MatchCounter<int> intersection = counter1.Intersect(counter2);
+        Dictionary<int, int> dict = intersection.ToDictionary(kv => kv.Key, kv => kv.Value);
+        Assert.Equal(2, dict[2]);
+        Assert.Equal(2, intersection.Sum());
+
+        intersection = counter2.Intersect(counter1);
+        dict = intersection.ToDictionary(kv => kv.Key, kv => kv.Value);
+        Assert.Equal(2, dict[2]);
+        Assert.Equal(2, intersection.Sum());
+    }
 }
diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs
index d782c3c8f88..6c0aefbb02a 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs
@@ -2,7 +2,6 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System;
-using System.Collections.Generic;
 using System.Linq;
 using Microsoft.Extensions.AI.Evaluation.NLP.Common;
 using Xunit;
@@ -60,21 +59,55 @@ public void NGramBuilder_Create_Works()
     }
 
     [Fact]
-    public void NGramGenerationNoPadding()
+    public void CreateNGrams()
     {
-        int[] input = [1, 2, 3, 4, 5];
+        Assert.Throws(() => new int[0].CreateNGrams(-1).ToList());
 
-        IEnumerable<NGram<int>> result = input.CreateNGrams(1);
-        List<NGram<int>> expected = [[1], [2], [3], [4], [5]];
-        Assert.True(result.SequenceEqual(expected));
+        ReadOnlySpan<int> data = [1, 2, 3];
 
-        result = input.CreateNGrams(2);
-        expected = [[1, 2], [2, 3], [3, 4], [4, 5]];
-        Assert.True(result.SequenceEqual(expected));
+        var nGram = data.CreateNGrams(1);
+        Assert.Equal([[1], [2], [3]], nGram);
 
-        result = input.CreateNGrams(3);
-        expected = [[1, 2, 3], [2, 3, 4], [3, 4, 5]];
-        Assert.True(result.SequenceEqual(expected));
+        nGram = data.CreateNGrams(2);
+        Assert.Equal([[1, 2], [2, 3]], nGram);
+
+        nGram = data.CreateNGrams(3);
+        Assert.Equal([[1, 2, 3]], nGram);
+
+        nGram = data.CreateNGrams(4);
+        Assert.Equal([], nGram);
     }
 
+    [Fact]
+    public void CreateAllNGrams()
+    {
+        Assert.Throws(() => new int[0].CreateAllNGrams(-1).ToList());
+
+        Assert.Throws(() => new int[0].CreateAllNGrams(0).ToList());
+
+        Assert.Throws(() => new int[0].CreateAllNGrams(1, 0).ToList());
+
+        ReadOnlySpan<int> arr = [1, 2, 3];
+
+        var nGram = arr.CreateAllNGrams(1).ToList();
+        Assert.Equal([[1], [1, 2], [1, 2, 3], [2], [2, 3], [3]], nGram);
+
+        nGram = arr.CreateAllNGrams(2).ToList();
+        Assert.Equal([[1, 2], [1, 2, 3], [2, 3]], nGram);
+
+        nGram = arr.CreateAllNGrams(3).ToList();
+        Assert.Equal([[1, 2, 3]], nGram);
+
+        nGram = arr.CreateAllNGrams(3, 5).ToList();
+        Assert.Equal([[1, 2, 3]], nGram);
+
+        nGram = arr.CreateAllNGrams(1, 2).ToList();
+        Assert.Equal([[1], [1, 2], [2], [2, 3], [3]], nGram);
+
+        nGram = arr.CreateAllNGrams(1, 1).ToList();
+        Assert.Equal([[1], [2], [3]], nGram);
+
+        nGram = arr.CreateAllNGrams(4).ToList();
+        Assert.Equal([], nGram);
+    }
 }
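Note (not part of the diff): for reviewers who want to try the new evaluators outside the test project, the sketch below mirrors only the public surface exercised by the tests above (GLEUEvaluator, GLEUEvaluatorContext, the EvaluateAsync overload the tests call with a null second argument, and NumericMetric). The using directives, the GleuSketch class name, and the console output are illustrative assumptions, not part of this change.

// Minimal usage sketch, assuming the API shapes shown in the tests above.
using System;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.NLP;

#pragma warning disable AIEVAL001 // Experimental evaluator surface, suppressed the same way the tests do.

internal static class GleuSketch
{
    public static async Task RunAsync()
    {
        var evaluator = new GLEUEvaluator();

        // One or more reference texts that the response will be scored against.
        var context = new GLEUEvaluatorContext(
            "It is a guide to action that ensures that the military will forever heed Party commands");

        var response = new ChatResponse(new ChatMessage(
            ChatRole.Assistant,
            "It is a guide to action which ensures that the military always obeys the commands of the party"));

        // The tests pass null for the chat configuration; GLEU is computed locally and needs no LLM calls.
        EvaluationResult result = await evaluator.EvaluateAsync(response, null, [context]);

        // The evaluator reports a single NumericMetric with an interpretation (rating and pass/fail).
        var metric = result.Metrics.Values.OfType<NumericMetric>().Single();
        Console.WriteLine($"{metric.Name}: {metric.Value} ({metric.Interpretation?.Rating})");
    }
}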