From f5670be68bc75da955928f319956b0dabd6707a5 Mon Sep 17 00:00:00 2001
From: Peter Waldschmidt
Date: Mon, 30 Jun 2025 15:19:29 -0400
Subject: [PATCH] Use arrays in place of IEnumerable for internal processing.

---
 .../BLEUEvaluator.cs                          |  5 +-
 .../BLEUEvaluatorContext.cs                   |  7 +-
 .../Common/BLEUAlgorithm.cs                   | 27 +++---
 .../Common/MatchCounter.cs                    |  2 +-
 .../Common/NGramExtensions.cs                 | 19 ++--
 ...rosoft.Extensions.AI.Evaluation.NLP.csproj |  1 +
 .../BLEUAlgorithmTests.cs                     | 87 +++++++++----------
 .../NGramTests.cs                             |  2 +-
 8 files changed, 81 insertions(+), 69 deletions(-)

diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs
index 8ce43d48e52..a1aeff2d452 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs
@@ -1,6 +1,7 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System;
 using System.Collections.Generic;
 using System.Globalization;
 using System.Linq;
@@ -79,8 +80,8 @@ public ValueTask<EvaluationResult> EvaluateAsync(
 
         var (score, duration) = TimingHelper.ExecuteWithTiming(() =>
         {
-            var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference));
-            var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text);
+            var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference).ToArray()).ToArray();
+            var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text).ToArray();
             return BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4);
         });
 
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs
index 320b20e9116..5c44dcd3c63 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs
@@ -41,8 +41,8 @@ public sealed class BLEUEvaluatorContext : EvaluationContext
     /// <param name="references">
     /// The reference responses against which the response that is being evaluated is compared.
     /// </param>
-    public BLEUEvaluatorContext(params string[] references)
-        : this(references as IEnumerable<string>)
+    public BLEUEvaluatorContext(IEnumerable<string> references)
+        : this(references.ToArray())
     {
     }
 
@@ -52,11 +52,12 @@ public BLEUEvaluatorContext(params string[] references)
     /// <param name="references">
     /// The reference responses against which the response that is being evaluated is compared.
     /// </param>
-    public BLEUEvaluatorContext(IEnumerable<string> references)
+    public BLEUEvaluatorContext(params string[] references)
         : base(
             name: BLEUContextName,
             contents: [.. references.Select(c => new TextContent(c))])
     {
         References = [.. references];
     }
+
 }
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs
index c7420d0be7a..05001d2d380 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs
@@ -16,7 +16,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;
 /// </summary>
 internal static class BLEUAlgorithm
 {
-    internal static int ClosestRefLength(IEnumerable<IEnumerable<string>> references, int hypLength)
+    internal static int ClosestRefLength(string[][] references, int hypLength)
     {
         if (!references.Any())
         {
@@ -27,7 +27,7 @@ internal static int ClosestRefLength(IEnumerable<IEnumerable<string>> references
         int smallestDiff = int.MaxValue;
         foreach (var reference in references)
         {
-            int refLength = reference.Count();
+            int refLength = reference.Length;
             int diff = Math.Abs(refLength - hypLength);
 
             if (diff < smallestDiff || (diff == smallestDiff && refLength < closestRefLength))
@@ -55,26 +55,26 @@ internal static double BrevityPenalty(int closestRefLength, int hypLength)
         return Math.Exp(1 - ((double)closestRefLength / hypLength));
     }
 
-    internal static RationalNumber ModifiedPrecision(IEnumerable<IEnumerable<string>> references, IEnumerable<string> hypothesis, int n = 1)
+    internal static RationalNumber ModifiedPrecision(string[][] references, string[] hypothesis, int n = 1)
     {
         if (n <= 0)
         {
             Throw.ArgumentOutOfRangeException(nameof(n), $"`{nameof(n)}` must be greater than zero.");
         }
 
-        if (!references.Any() || !hypothesis.Any())
+        if (references.Length == 0 || hypothesis.Length == 0)
         {
             return RationalNumber.Zero;
         }
 
-        var hyp = hypothesis.CreateNGrams(n);
-        var hypCounts = new MatchCounter<NGram<string>>(hyp);
+        var hypGrams = hypothesis.AsSpan().CreateNGrams(n);
+        var hypCounts = new MatchCounter<NGram<string>>(hypGrams);
 
         Dictionary<NGram<string>, int> maxCounts = [];
 
         foreach (var rf in references)
         {
-            IEnumerable<NGram<string>> refGrams = rf.CreateNGrams(n);
+            IEnumerable<NGram<string>> refGrams = rf.AsSpan().CreateNGrams(n);
             var refCounts = new MatchCounter<NGram<string>>(refGrams);
 
             foreach (var ct in refCounts)
@@ -123,25 +123,28 @@ internal static double[] EqualWeights(int n)
         }
 
         double[] weights = new double[n];
+#if NET8_0_OR_GREATER
+        Array.Fill(weights, 1.0 / n);
+#else
         for (int i = 0; i < n; i++)
         {
             weights[i] = 1.0 / n;
         }
-
+#endif
         return weights;
     }
 
     internal static readonly double[] DefaultBLEUWeights = EqualWeights(4);
 
-    internal static double SentenceBLEU(IEnumerable<IEnumerable<string>> references, IEnumerable<string> hypothesis,
+    internal static double SentenceBLEU(string[][] references, string[] hypothesis,
         double[]? weights = null, Func? smoothingFunction = null)
     {
-        if (references == null || !references.Any())
+        if (references == null || references.Length == 0)
         {
             Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty.");
         }
 
-        if (hypothesis == null || !hypothesis.Any())
+        if (hypothesis == null || hypothesis.Length == 0)
         {
             Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty.");
         }
@@ -171,7 +174,7 @@ internal static double SentenceBLEU(IEnumerable<IEnumerable<string>> references,
             precisionValues[i] = prec;
         }
 
-        int hypLen = hypothesis.Count();
+        int hypLen = hypothesis.Length;
         int closestRefLength = ClosestRefLength(references, hypLen);
 
         double brevityPenalty = BrevityPenalty(closestRefLength, hypLen);
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs
index bbca2252057..13731b34789 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs
@@ -53,7 +53,7 @@ public void AddRange(IEnumerable<T> items)
         }
     }
 
-    public string ToDebugString() => string.Concat(_counts.Select(v => $"{v.Key}: {v.Value}, "));
+    public string ToDebugString() => string.Join(",", _counts.Select(v => $"{v.Key}: {v.Value}"));
 
     public IEnumerator<KeyValuePair<T, int>> GetEnumerator() => _counts.GetEnumerator();
 
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs
index 149d3820328..467d0dc0160 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs
@@ -3,7 +3,6 @@
 
 using System;
 using System.Collections.Generic;
-using System.Linq;
 using Microsoft.Shared.Diagnostics;
 
 namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;
@@ -14,12 +13,16 @@ internal static class NGramExtensions
     public static NGram<T> CreateNGram<T>(this ReadOnlySpan<T> values) where T : IEquatable<T>
         => new(values);
 
+    internal static IEnumerable<NGram<T>> CreateNGrams<T>(this Span<T> input, int n)
+        where T : IEquatable<T>
+        => CreateNGrams((ReadOnlySpan<T>)input, n);
+
     /// <summary>
     /// Create a sequence of n-grams from the input sequence.
    /// </summary>
     /// <param name="input">The input sequence of items.</param>
     /// <param name="n">The size of each n-gram.</param>
-    internal static IEnumerable<NGram<T>> CreateNGrams<T>(this IEnumerable<T> input, int n)
+    internal static List<NGram<T>> CreateNGrams<T>(this ReadOnlySpan<T> input, int n)
         where T : IEquatable<T>
     {
         if (n <= 0)
@@ -27,15 +30,19 @@ internal static IEnumerable<NGram<T>> CreateNGrams<T>(this IEnumerable<T> input,
             Throw.ArgumentOutOfRangeException(nameof(n), $"'{nameof(n)}' must be greater than zero.");
         }
 
-        T[] output = [.. input.Take(n)];
+        List<NGram<T>> nGrams = [];
+
+        ReadOnlySpan<T> output = input.Slice(0, Math.Min(n, input.Length));
 
         while (output.Length == n)
         {
-            yield return new NGram<T>(output);
+            nGrams.Add(new NGram<T>(output));
 
-            input = input.Skip(1);
-            output = [.. input.Take(n)];
+            input = input.Slice(1);
+            output = input.Slice(0, Math.Min(n, input.Length));
         }
+
+        return nGrams;
     }
 }
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj
index 0bab1cf7fb0..12e7cebb957 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj
@@ -17,6 +17,7 @@
     true
+    true
 
diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs
index 1b029dc4a37..9260a688cc4 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs
@@ -2,7 +2,6 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System;
-using System.Collections.Generic;
 using System.Linq;
 using Microsoft.Extensions.AI.Evaluation.NLP.Common;
 using Xunit;
@@ -15,8 +14,8 @@ public class BLEUAlgorithmTests
     [Fact]
     public void ModifiedPrecisionTests()
     {
-        IEnumerable<IEnumerable<string>> references = ["the cat is on the mat".Split(' '), "there is a cat on the mat".Split(' ')];
-        IEnumerable<string> hypothesis = "the the the the the the the".Split(' ');
+        string[][] references = ["the cat is on the mat".Split(' '), "there is a cat on the mat".Split(' ')];
+        string[] hypothesis = "the the the the the the the".Split(' ');
 
         RationalNumber prec = ModifiedPrecision(references, hypothesis, 1);
         Assert.Equal(0.2857, prec.ToDouble(), 4);
@@ -36,8 +35,8 @@ public void ModifiedPrecisionTests()
             "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '),
             "It is the practical guide for the army always to heed the directions of the party".Split(' '),
         ];
-        IEnumerable<string> hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' ');
-        IEnumerable<string> hypothesis2 = "It is to insure the troops forever hearing the activity guidebook that party direct".Split(' ');
+        string[] hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' ');
+        string[] hypothesis2 = "It is to insure the troops forever hearing the activity guidebook that party direct".Split(' ');
         prec = ModifiedPrecision(references, hypothesis1, 1);
         Assert.Equal(0.9444, prec.ToDouble(), 4);
         prec = ModifiedPrecision(references, hypothesis2, 1);
@@ -76,63 +75,63 @@ public void SmoothingMethod4Tests(int[] num_denom, int hypLen, double[] vals)
     [Fact]
     public void TestBrevityPenalty()
     {
-        IEnumerable<IEnumerable<string>> references = [
-            Enumerable.Repeat("a", 11),
-            Enumerable.Repeat("a", 8),
+        string[][] references = [
+            [.. Enumerable.Repeat("a", 11)],
+            [.. Enumerable.Repeat("a", 8)],
         ];
-        IEnumerable<string> hypothesis = Enumerable.Repeat("a", 7);
+        string[] hypothesis = [.. Enumerable.Repeat("a", 7)];
         int hypLength = hypothesis.Count();
         int closestRefLength = ClosestRefLength(references, hypLength);
         double brevityPenalty = BrevityPenalty(closestRefLength, hypLength);
         Assert.Equal(0.8669, brevityPenalty, 4);
 
         references = [
-            Enumerable.Repeat("a", 11),
-            Enumerable.Repeat("a", 8),
-            Enumerable.Repeat("a", 6),
-            Enumerable.Repeat("a", 7),
+            [.. Enumerable.Repeat("a", 11)],
+            [.. Enumerable.Repeat("a", 8)],
+            [.. Enumerable.Repeat("a", 6)],
+            [.. Enumerable.Repeat("a", 7)],
         ];
-        hypothesis = Enumerable.Repeat("a", 7);
+        hypothesis = [.. Enumerable.Repeat("a", 7)];
         hypLength = hypothesis.Count();
         closestRefLength = ClosestRefLength(references, hypLength);
         brevityPenalty = BrevityPenalty(closestRefLength, hypLength);
         Assert.Equal(1.0, brevityPenalty, 4);
 
         references = [
-            Enumerable.Repeat("a", 28),
-            Enumerable.Repeat("a", 28),
+            [.. Enumerable.Repeat("a", 28)],
+            [.. Enumerable.Repeat("a", 28)],
         ];
-        hypothesis = Enumerable.Repeat("a", 12);
+        hypothesis = [.. Enumerable.Repeat("a", 12)];
         hypLength = hypothesis.Count();
         closestRefLength = ClosestRefLength(references, hypLength);
         brevityPenalty = BrevityPenalty(closestRefLength, hypLength);
         Assert.Equal(0.26359, brevityPenalty, 4);
 
         references = [
-            Enumerable.Repeat("a", 13),
-            Enumerable.Repeat("a", 2),
+            [.. Enumerable.Repeat("a", 13)],
+            [.. Enumerable.Repeat("a", 2)],
         ];
-        hypothesis = Enumerable.Repeat("a", 12);
+        hypothesis = [.. Enumerable.Repeat("a", 12)];
         hypLength = hypothesis.Count();
         closestRefLength = ClosestRefLength(references, hypLength);
         brevityPenalty = BrevityPenalty(closestRefLength, hypLength);
         Assert.Equal(0.9200, brevityPenalty, 4);
 
         references = [
-            Enumerable.Repeat("a", 13),
-            Enumerable.Repeat("a", 11),
+            [.. Enumerable.Repeat("a", 13)],
+            [.. Enumerable.Repeat("a", 11)],
         ];
-        hypothesis = Enumerable.Repeat("a", 12);
+        hypothesis = [.. Enumerable.Repeat("a", 12)];
         hypLength = hypothesis.Count();
         closestRefLength = ClosestRefLength(references, hypLength);
         brevityPenalty = BrevityPenalty(closestRefLength, hypLength);
         Assert.Equal(1.0, brevityPenalty, 4);
 
         references = [
-            Enumerable.Repeat("a", 11),
-            Enumerable.Repeat("a", 13),
+            [.. Enumerable.Repeat("a", 11)],
+            [.. Enumerable.Repeat("a", 13)],
         ];
-        hypothesis = Enumerable.Repeat("a", 12);
+        hypothesis = [.. Enumerable.Repeat("a", 12)];
         hypLength = hypothesis.Count();
         closestRefLength = ClosestRefLength(references, hypLength);
         brevityPenalty = BrevityPenalty(closestRefLength, hypLength);
@@ -143,8 +142,8 @@ public void TestBrevityPenalty()
     [Fact]
     public void TestZeroMatches()
     {
-        IEnumerable<IEnumerable<string>> references = ["The candidate has no alignment to any of the references".Split(' '),];
-        IEnumerable<string> hypothesis = "John loves Mary".Split(' ');
+        string[][] references = ["The candidate has no alignment to any of the references".Split(' '),];
+        string[] hypothesis = "John loves Mary".Split(' ');
 
         double score = SentenceBLEU(references, hypothesis, EqualWeights(hypothesis.Count()));
         Assert.Equal(0.0, score, 4);
@@ -153,8 +152,8 @@ public void TestZeroMatches()
     [Fact]
     public void TestFullMatches()
     {
-        IEnumerable<IEnumerable<string>> references = ["John loves Mary".Split(' '),];
-        IEnumerable<string> hypothesis = "John loves Mary".Split(' ');
+        string[][] references = ["John loves Mary".Split(' '),];
+        string[] hypothesis = "John loves Mary".Split(' ');
 
         double score = SentenceBLEU(references, hypothesis, EqualWeights(hypothesis.Count()));
         Assert.Equal(1.0, score, 4);
@@ -163,8 +162,8 @@ public void TestFullMatches()
     [Fact]
     public void TestPartialMatchesHypothesisLongerThanReference()
     {
-        IEnumerable<IEnumerable<string>> references = ["John loves Mary".Split(' '),];
-        IEnumerable<string> hypothesis = "John loves Mary who loves Mike".Split(' ');
+        string[][] references = ["John loves Mary".Split(' '),];
+        string[] hypothesis = "John loves Mary who loves Mike".Split(' ');
 
         double score = SentenceBLEU(references, hypothesis);
         Assert.Equal(0, score, 4);
@@ -173,12 +172,12 @@ public void TestPartialMatchesHypothesisLongerThanReference()
     [Fact]
     public void TestSentenceBLEUExampleA()
     {
-        IEnumerable<IEnumerable<string>> references = [
+        string[][] references = [
             "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '),
             "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '),
             "It is the practical guide for the army always to heed the directions of the party".Split(' ')
         ];
-        IEnumerable<string> hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' ');
+        string[] hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' ');
 
         double score = SentenceBLEU(references, hypothesis);
         Assert.Equal(0.5046, score, 4);
@@ -188,10 +187,10 @@ public void TestSentenceBLEUExampleA()
     [Fact]
     public void TestSentenceBLEUExampleB()
     {
-        IEnumerable<IEnumerable<string>> references = [
+        string[][] references = [
             "he was interested in world history because he read the book".Split(' '),
         ];
-        IEnumerable<string> hypothesis = "he read the book because he was interested in world history".Split(' ');
+        string[] hypothesis = "he read the book because he was interested in world history".Split(' ');
 
         double score = SentenceBLEU(references, hypothesis);
         Assert.Equal(0.74009, score, 4);
@@ -200,12 +199,12 @@ public void TestSentenceBLEUExampleB()
     [Fact]
     public void TestSentenceBLEUExampleAWithWordTokenizer()
     {
-        IEnumerable<IEnumerable<string>> references = [
-            SimpleWordTokenizer.WordTokenize("It is a guide to action that ensures that the military will forever heed Party commands"),
-            SimpleWordTokenizer.WordTokenize("It is the guiding principle which guarantees the military forces always being under the command of the Party"),
-            SimpleWordTokenizer.WordTokenize("It is the practical guide for the army always to heed the directions of the party")
+        string[][] references = [
+            SimpleWordTokenizer.WordTokenize("It is a guide to action that ensures that the military will forever heed Party commands").ToArray(),
+            SimpleWordTokenizer.WordTokenize("It is the guiding principle which guarantees the military forces always being under the command of the Party").ToArray(),
+            SimpleWordTokenizer.WordTokenize("It is the practical guide for the army always to heed the directions of the party").ToArray(),
         ];
-        IEnumerable<string> hypothesis = SimpleWordTokenizer.WordTokenize("It is a guide to action which ensures that the military always obeys the commands of the party");
+        string[] hypothesis = SimpleWordTokenizer.WordTokenize("It is a guide to action which ensures that the military always obeys the commands of the party").ToArray();
 
         double score = SentenceBLEU(references, hypothesis);
         Assert.Equal(0.5046, score, 4);
@@ -215,10 +214,10 @@ public void TestSentenceBLEUExampleAWithWordTokenizer()
     [Fact]
     public void TestSentenceBLEUExampleBWithWordTokenizer()
     {
-        IEnumerable<IEnumerable<string>> references = [
-            SimpleWordTokenizer.WordTokenize("he was interested in world history because he read the book"),
+        string[][] references = [
+            SimpleWordTokenizer.WordTokenize("he was interested in world history because he read the book").ToArray(),
         ];
-        IEnumerable<string> hypothesis = SimpleWordTokenizer.WordTokenize("he read the book because he was interested in world history");
+        string[] hypothesis = SimpleWordTokenizer.WordTokenize("he read the book because he was interested in world history").ToArray();
 
         double score = SentenceBLEU(references, hypothesis);
         Assert.Equal(0.74009, score, 4);
diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs
index d782c3c8f88..3c049efcea9 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs
@@ -62,7 +62,7 @@ public void NGramBuilder_Create_Works()
     [Fact]
     public void NGramGenerationNoPadding()
     {
-        int[] input = [1, 2, 3, 4, 5];
+        ReadOnlySpan<int> input = [1, 2, 3, 4, 5];
         IEnumerable<NGram<int>> result = input.CreateNGrams(1);
 
         List<NGram<int>> expected = [[1], [2], [3], [4], [5]];
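Note (illustrative, not part of the patch): a minimal sketch of how the array-based entry points introduced by this change are exercised end to end. It uses only the internal APIs visible above (SimpleWordTokenizer.WordTokenize, BLEUAlgorithm.SentenceBLEU, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4); the enclosing class, the sample sentences, and the using directives are assumptions for the sketch.

    using System.Linq;
    using Microsoft.Extensions.AI.Evaluation.NLP.Common;

    internal static class BLEUUsageSketch
    {
        internal static double Score()
        {
            // Tokenize each reference and the hypothesis once, up front, into arrays,
            // mirroring the flow in BLEUEvaluator.EvaluateAsync after this patch.
            string[][] references =
            [
                SimpleWordTokenizer.WordTokenize("It is a guide to action that ensures that the military will forever heed Party commands").ToArray(),
            ];
            string[] hypothesis =
                SimpleWordTokenizer.WordTokenize("It is a guide to action which ensures that the military always obeys the commands of the party").ToArray();

            // Arrays let the algorithm read lengths and slice spans directly instead of re-enumerating IEnumerable inputs.
            return BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4);
        }
    }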