From e8fffb3f7b6d96e9c5c52fada87555bcc277d989 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Tue, 1 Jul 2025 13:58:48 -0400 Subject: [PATCH] Update tokenizer alg to use is [.] --- .../Common/SimpleWordTokenizer.cs | 9 ++------- .../SimpleTokenizerTests.cs | 2 ++ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs index 4f4717852bd..322ea4cedd6 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs @@ -76,18 +76,13 @@ public static IEnumerable WordTokenize(ReadOnlyMemory text) } // Join hyphenated words - if (span[0] == '-' && - span.Length > 1 && - span[1] == '\n') + if (span is ['-', '\n', ..]) { text = text.Slice(2); continue; } - if (span[0] == '-' && - span.Length > 2 && - span[1] == '\r' && - span[2] == '\n') + if (span is ['-', '\r', '\n', ..]) { text = text.Slice(3); continue; diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs index 3451a6c38c9..5766c2e7fc0 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs @@ -20,6 +20,8 @@ public class SimpleTokenizerTests [InlineData("word1-word2", new[] { "WORD1", "-", "WORD2" })] [InlineData("word1 - word2", new[] { "WORD1", "-", "WORD2" })] [InlineData("word1-\n word2", new[] { "WORD1", "WORD2" })] + [InlineData("word1-", new[] { "WORD1", "-" })] + [InlineData("word1&", new[] { "WORD1", "&" })] [InlineData("word1-\r\n word2", new[] { "WORD1", "WORD2" })] [InlineData("word1-\r\nword2", new[] { "WORD1WORD2" })] [InlineData("word1-\nword2", new[] { "WORD1WORD2" })]