dotnet
diff --git a/‎THIRD-PARTY-NOTICES.TXT‎
Lines changed: 19 additions & 0 deletions b/‎THIRD-PARTY-NOTICES.TXT‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎src/Microsoft.ML.Tokenizers/EncodedToken.cs‎
Lines changed: 4 additions & 1 deletion b/‎src/Microsoft.ML.Tokenizers/EncodedToken.cs‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs‎
Lines changed: 3 additions & 3 deletions b/‎src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/Microsoft.ML.Tokenizers/Model/BertTokenizer.cs‎
Lines changed: 729 additions & 0 deletions b/‎src/Microsoft.ML.Tokenizers/Model/BertTokenizer.cs‎
Lines changed: 729 additions & 0 deletions
diff --git a/‎src/Microsoft.ML.Tokenizers/Model/WordPieceTokenizer.cs‎
Lines changed: 858 additions & 0 deletions b/‎src/Microsoft.ML.Tokenizers/Model/WordPieceTokenizer.cs‎
Lines changed: 858 additions & 0 deletions
diff --git a/‎src/Microsoft.ML.Tokenizers/Normalizer/BertNormalizer.cs‎
Lines changed: 200 additions & 0 deletions b/‎src/Microsoft.ML.Tokenizers/Normalizer/BertNormalizer.cs‎
Lines changed: 200 additions & 0 deletions
diff --git a/‎src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs‎
Lines changed: 57 additions & 5 deletions b/‎src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs‎
Lines changed: 57 additions & 5 deletions
@@ -152,6 +152,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 
+License notice for WordPiece and Bert tokenizers
+------------------------------------------------
+
+https://github.com/huggingface/transformers/blob/8e3e145b427196e014f37aa42ba890b9bc94275e/src/transformers/models/bert/tokenization_bert.py#L2
+
+Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
 License notice for BitUtility
 ------------------------------------------
 
 
@@ -10,7 +10,7 @@ namespace Microsoft.ML.Tokenizers
     /// Represent the token produced from the tokenization process containing the token substring,
     /// the id associated to the token substring, and the offset mapping to the original string.
     /// </summary>
-    public readonly struct EncodedToken
+    public readonly struct EncodedToken : IEquatable<EncodedToken>
     {
         /// <summary>
         /// Gets the Id value associated to the token.
@@ -39,5 +39,8 @@ public EncodedToken(int id, string value, Range offset)
             Offset = offset;
             Value = value;
         }
+
+        /// <summary>
+        /// Indicates whether this token is equal to another <see cref="EncodedToken"/>.
+        /// Two tokens are equal when their Id, Value, and Offset are all equal.
+        /// </summary>
+        /// <param name="other">The token to compare with this instance.</param>
+        /// <returns><see langword="true"/> when all three members match; otherwise, <see langword="false"/>.</returns>
+        // NOTE(review): no Equals(object)/GetHashCode overrides are visible in this hunk - confirm they exist
+        // (or add them) so that hashing and boxed comparisons agree with this equality.
+        public bool Equals(EncodedToken other) => Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);
     }
 }
@@ -87,7 +87,7 @@ private set
         /// <param name="vocabFile">The JSON file path containing the dictionary of string keys and their ids.</param>
         /// <param name="mergesFile">The file path containing the tokens's pairs list.</param>
         public static BpeTokenizer Create(string vocabFile, string? mergesFile)
-            => Create(vocabFile, mergesFile, preTokenizer: PreTokenizer.CreateWhiteSpace(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
+            => Create(vocabFile, mergesFile, preTokenizer: PreTokenizer.CreateWordOrNonWordPreTokenizer(), normalizer: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
 
         /// <summary>
         /// Create a new Bpe tokenizer object to use for text encoding.
@@ -131,7 +131,7 @@ public static BpeTokenizer Create(
         /// <param name="vocabStream">The JSON stream containing the dictionary of string keys and their ids.</param>
         /// <param name="mergesStream">The stream containing the tokens's pairs list.</param>
         public static BpeTokenizer Create(Stream vocabStream, Stream? mergesStream)
-            => Create(vocabStream, mergesStream, preTokenizer: PreTokenizer.CreateWhiteSpace(), normalizer: null, addedTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
+            => Create(vocabStream, mergesStream, preTokenizer: PreTokenizer.CreateWordOrNonWordPreTokenizer(), normalizer: null, addedTokens: null, unknownToken: null, continuingSubwordPrefix: null, endOfWordSuffix: null, fuseUnknownTokens: false);
 
         /// <summary>
         /// Create a new Bpe tokenizer object to use for text encoding.
@@ -225,7 +225,7 @@ private BpeTokenizer(
             FuseUnknownTokens = fuseUnknownTokens;
             ContinuingSubwordPrefix = continuingSubwordPrefix;
             EndOfWordSuffix = endOfWordSuffix;
-            _preTokenizer = preTokenizer ?? PreTokenizer.CreateWhiteSpace(); // Default to WhiteSpace pre-tokenizer
+            _preTokenizer = preTokenizer ?? PreTokenizer.CreateWordOrNonWordPreTokenizer(); // Default to WordOrNonWord pre-tokenizer
             _normalizer = normalizer;
 
             _vocab = vocab ?? new Dictionary<StringSpanOrdinalKey, int>();
 
@@ -0,0 +1,200 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Buffers;
+using System.Diagnostics;
+using System.Globalization;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Text;
+
+namespace Microsoft.ML.Tokenizers
+{
+    /// <summary>
+    /// Normalizer that performs the Bert model normalization.
+    /// </summary>
+    internal sealed class BertNormalizer : Normalizer
+    {
+        private readonly bool _doLowerCase;
+        private readonly bool _tokenizeChineseChars;
+        private readonly bool _stripAccents;
+
+        /// <summary>
+        /// Normalize the input string.
+        /// </summary>
+        /// <remarks>
+        /// The text is processed one code point at a time (surrogate pairs are kept together):
+        /// U+0000, U+FFFD and control characters are removed; tab, line feed, carriage return and
+        /// space separators are replaced by a single ' '; when accent stripping is enabled the text
+        /// is first decomposed (FormD) and non-spacing and spacing combining marks are dropped; when lower casing is
+        /// enabled uppercase letters are lower-cased with the invariant culture; when Chinese
+        /// character tokenization is enabled every CJK ideograph is surrounded by spaces.
+        /// The result is recomposed with FormC before it is returned.
+        /// </remarks>
+        /// <param name="original">The input string to normalize.</param>
+        /// <returns>The normalized string, or an empty string when the input is null or empty.</returns>
+        public override string Normalize(string original)
+        {
+            if (string.IsNullOrEmpty(original))
+            {
+                return string.Empty;
+            }
+
+            if (_stripAccents)
+            {
+                // Decompose first so accents become separate combining marks that the loop below can drop.
+                original = original.Normalize(NormalizationForm.FormD);
+            }
+
+            Span<char> casingBuffer = stackalloc char[10];
+            char[] buffer = ArrayPool<char>.Shared.Rent(original.Length);
+            int index = 0;
+
+            try
+            {
+                for (int i = 0; i < original.Length; i++)
+                {
+                    char c = original[i];
+
+                    if (c == '\u0000' || c == '\uFFFD')
+                    {
+                        continue;
+                    }
+
+                    // Tab, line feed and carriage return belong to the Control category, but they separate
+                    // words. Map them to a space instead of deleting them so neighbouring words are not merged.
+                    if (c is '\t' or '\n' or '\r')
+                    {
+                        InsertChar(ref buffer, ref index, ' ');
+                        continue;
+                    }
+
+                    // inc is 1 when the current position starts a valid surrogate pair, so the pair is consumed as a unit.
+                    int inc = 0;
+                    int codePoint = (int)c;
+                    if (char.IsHighSurrogate(c) && i + 1 < original.Length && char.IsLowSurrogate(original[i + 1]))
+                    {
+                        codePoint = char.ConvertToUtf32(c, original[i + 1]);
+                        inc = 1;
+                    }
+
+                    UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(original, i);
+
+                    if (category == UnicodeCategory.Control)
+                    {
+                        i += inc;
+                        continue;
+                    }
+
+                    if (category == UnicodeCategory.SpaceSeparator)
+                    {
+                        InsertChar(ref buffer, ref index, ' ');
+                        i += inc;
+                        continue;
+                    }
+
+                    if (_stripAccents && category is UnicodeCategory.NonSpacingMark or UnicodeCategory.SpacingCombiningMark)
+                    {
+                        i += inc;
+                        continue;
+                    }
+
+                    if (_doLowerCase && category == UnicodeCategory.UppercaseLetter)
+                    {
+                        int length = original.AsSpan().Slice(i, inc + 1).ToLowerInvariant(casingBuffer);
+                        Debug.Assert(length > 0);
+
+                        InsertSpan(ref buffer, ref index, casingBuffer.Slice(0, length));
+
+                        i += inc;
+                        continue;
+                    }
+
+                    if (_tokenizeChineseChars && IsChineseChar(codePoint))
+                    {
+                        // Surround the ideograph with spaces so it ends up as a token of its own.
+                        InsertChar(ref buffer, ref index, ' ');
+                        InsertChar(ref buffer, ref index, c);
+                        if (inc > 0)
+                        {
+                            InsertChar(ref buffer, ref index, original[i + 1]);
+                        }
+                        InsertChar(ref buffer, ref index, ' ');
+
+                        i += inc;
+                        continue;
+                    }
+
+                    InsertChar(ref buffer, ref index, c);
+                    if (inc > 0)
+                    {
+                        InsertChar(ref buffer, ref index, original[i + 1]);
+                    }
+                    i += inc;
+                }
+
+                return index == 0 ? string.Empty : new string(buffer, 0, index).Normalize(NormalizationForm.FormC);
+            }
+            finally
+            {
+                // string.Normalize throws on invalid Unicode (for example a lone surrogate); return the
+                // rented array on that path too. 'buffer' is passed by ref to the insert helpers, so this
+                // returns whichever array is current at this point.
+                ArrayPool<char>.Shared.Return(buffer);
+            }
+        }
+
+        /// <summary>
+        /// Normalize the characters contained in the given span.
+        /// </summary>
+        /// <param name="original">The input character span to normalize.</param>
+        /// <returns>The normalized string; an empty string when the span is empty.</returns>
+        public override string Normalize(ReadOnlySpan<char> original)
+            // The span is materialized as a string so that the string overload does the actual work.
+            => original.IsEmpty ? string.Empty : Normalize(original.ToString());
+
+        /// <summary>
+        /// Creates a <see cref="BertNormalizer"/> configured with the given normalization switches.
+        /// </summary>
+        /// <param name="doLowerCase">Whether to lowercase the input.</param>
+        /// <param name="tokenizeChineseChars">Whether to tokenize Chinese characters.</param>
+        /// <param name="stripAccents">Whether to strip accents from the input.</param>
+        public BertNormalizer(bool doLowerCase, bool tokenizeChineseChars, bool stripAccents)
+            => (_doLowerCase, _tokenizeChineseChars, _stripAccents) = (doLowerCase, tokenizeChineseChars, stripAccents);
+
+        /// <summary>
+        /// Writes <paramref name="c"/> at position <paramref name="index"/> of <paramref name="buffer"/>
+        /// and advances <paramref name="index"/> by one.
+        /// </summary>
+        /// <param name="buffer">The destination array; replaced through the ref when it has to grow.</param>
+        /// <param name="index">The write position; incremented after the character is stored.</param>
+        /// <param name="c">The character to append.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void InsertChar(ref char[] buffer, ref int index, char c)
+        {
+            bool hasRoom = index < buffer.Length;
+            if (!hasRoom)
+            {
+                // Ask for some slack beyond the write position so the next few inserts do not grow again.
+                Helpers.ArrayPoolGrow(ref buffer, index + 40);
+            }
+
+            buffer[index++] = c;
+        }
+
+        /// <summary>
+        /// Copies <paramref name="chars"/> into <paramref name="buffer"/> starting at <paramref name="index"/>
+        /// and advances <paramref name="index"/> by the number of characters copied.
+        /// </summary>
+        /// <param name="buffer">The destination array; replaced through the ref when it has to grow.</param>
+        /// <param name="index">The write position; increased by <c>chars.Length</c> after the copy.</param>
+        /// <param name="chars">The characters to append.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void InsertSpan(ref char[] buffer, ref int index, Span<char> chars)
+        {
+            // Grow only when the incoming characters do not fit. The size is based on chars.Length
+            // (not buffer.Length), otherwise the test is always true and the buffer is regrown on every call.
+            // NOTE(review): assumes Helpers.ArrayPoolGrow yields an array of at least the requested size - confirm.
+            if (index + chars.Length > buffer.Length)
+            {
+                Helpers.ArrayPoolGrow(ref buffer, index + chars.Length + 10);
+            }
+
+            chars.CopyTo(buffer.AsSpan(index));
+            index += chars.Length;
+        }
+
+        /// <summary>
+        /// Checks whether CP is the codepoint of a CJK character.
+        /// This defines a "chinese character" as anything in the CJK Unicode block:
+        ///   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        /// </summary>
+        /// <param name="codePoint">The codepoint to check.</param>
+        /// <remarks>
+        /// The CJK Unicode block is NOT all Japanese and Korean characters,
+        /// despite its name. The modern Korean Hangul alphabet is a different block,
+        /// as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        /// space-separated words, so they are not treated specially and handled
+        /// like all of the other languages.
+        /// </remarks>
+        /// <returns>True if the codepoint is a CJK character, false otherwise.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsChineseChar(int codePoint)
+        {
+            // The guard must be inclusive: 0x3400 is the first code point of the lowest range tested below.
+            // Each range test uses the unsigned-subtraction trick: (uint)(cp - lo) <= (uint)(hi - lo)
+            // is true exactly when lo <= cp <= hi.
+            return (codePoint >= 0x3400) && // Quick check to exit early if the codepoint is outside of the CJK range
+               (((uint)(codePoint - 0x3400) <= (uint)(0x4DBF - 0x3400)) ||
+                ((uint)(codePoint - 0xF900) <= (uint)(0xFAFF - 0xF900)) ||
+                ((uint)(codePoint - 0x4E00) <= (uint)(0x9FFF - 0x4E00)) ||
+                ((uint)(codePoint - 0x20000) <= (uint)(0x2A6DF - 0x20000)) ||
+                ((uint)(codePoint - 0x2A700) <= (uint)(0x2B73F - 0x2A700)) ||
+                ((uint)(codePoint - 0x2B740) <= (uint)(0x2B81F - 0x2B740)) ||
+                ((uint)(codePoint - 0x2B820) <= (uint)(0x2CEAF - 0x2B820)) ||
+                ((uint)(codePoint - 0x2F800) <= (uint)(0x2FA1F - 0x2F800)));
+        }
+    }
+}
@@ -40,8 +40,61 @@ public abstract partial class PreTokenizer
             }
         }
 
-        private const string WhiteSpacePattern = /*lang=regex*/ @"\w+|[^\w\s]+";
+        // Matches either a run of word characters or a single punctuation character.
+        private const string WhiteSpaceOrPunctuationPattern = /*lang=regex*/ @"\w+|[\p{P}]";
+        // Lazily created instance handed out when no special tokens are supplied.
+        private static PreTokenizer? _whiteSpaceOrPunctuationPreTokenizer;
+#if NET7_0_OR_GREATER
+        [GeneratedRegex(WhiteSpaceOrPunctuationPattern)]
+        private static partial Regex WhiteSpaceOrPunctuationRegex();
+#else
+        // NOTE(review): this fallback constructs a new compiled Regex on every call - consider caching it in a static field.
+        private static Regex WhiteSpaceOrPunctuationRegex() => new Regex(WhiteSpaceOrPunctuationPattern, RegexOptions.Compiled);
+#endif
+
+        /// <summary>
+        /// Creates a <see cref="PreTokenizer"/> that splits the text at whitespace or punctuation characters,
+        /// producing runs of word characters and individual punctuation characters.
+        /// </summary>
+        /// <param name="specialTokensEncoder">The dictionary containing the special tokens and their corresponding ids.</param>
+        /// <returns>
+        /// The pre-tokenizer that splits the text at the whitespace or punctuation characters. A single cached
+        /// instance is returned when <paramref name="specialTokensEncoder"/> is null; otherwise a new instance is created.
+        /// </returns>
+        public static PreTokenizer CreateWhiteSpaceOrPunctuationPreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
+        {
+            if (specialTokensEncoder is not null)
+            {
+                // Special tokens are caller specific, so a dedicated instance is built each time.
+                return new RegexPreTokenizer(WhiteSpaceOrPunctuationRegex(), specialTokensEncoder);
+            }
+
+            // No special tokens: create the shared whitespace-or-punctuation instance on first use and reuse it afterwards.
+            return _whiteSpaceOrPunctuationPreTokenizer ??= new RegexPreTokenizer(WhiteSpaceOrPunctuationRegex(), null);
+        }
+
+        // Matches either a run of word characters or a run of characters that are neither word characters nor whitespace.
+        private const string WordOrNonWordPattern = /*lang=regex*/ @"\w+|[^\w\s]+";
+        // Lazily created instance handed out when no special tokens are supplied.
+        private static PreTokenizer? _wordOrNonWordPreTokenizer;
+
+#if NET7_0_OR_GREATER
+        [GeneratedRegex(WordOrNonWordPattern)]
+        private static partial Regex WordOrNonWordRegex();
+#else
+        // NOTE(review): this fallback constructs a new compiled Regex on every call - consider caching it in a static field.
+        private static Regex WordOrNonWordRegex() => new Regex(WordOrNonWordPattern, RegexOptions.Compiled);
+#endif
+
+        /// <summary>
+        /// Creates a <see cref="PreTokenizer"/> that splits the text at the word or non-word boundary.
+        /// A word is a run of alphabetic, numeric, and underscore characters.
+        /// </summary>
+        /// <param name="specialTokensEncoder">The dictionary containing the special tokens and their corresponding ids.</param>
+        /// <returns>
+        /// The pre-tokenizer that splits the text at the word boundary. A single cached instance is returned
+        /// when <paramref name="specialTokensEncoder"/> is null; otherwise a new instance is created.
+        /// </returns>
+        public static PreTokenizer CreateWordOrNonWordPreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
+        {
+            if (specialTokensEncoder is not null)
+            {
+                // Special tokens are caller specific, so a dedicated instance is built each time.
+                return new RegexPreTokenizer(WordOrNonWordRegex(), specialTokensEncoder);
+            }
+
+            // No special tokens: create the shared word-or-non-word instance on first use and reuse it afterwards.
+            return _wordOrNonWordPreTokenizer ??= new RegexPreTokenizer(WordOrNonWordRegex(), null);
+        }
+
+        private const string WhiteSpacePattern = @"\S+";
         private static PreTokenizer? _whiteSpacePreTokenizer;
+
 #if NET7_0_OR_GREATER
         [GeneratedRegex(WhiteSpacePattern)]
         private static partial Regex WhiteSpaceRegex();
@@ -50,12 +103,11 @@ public abstract partial class PreTokenizer
 #endif
 
         /// <summary>
-        /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the word boundary.
-        /// The word is a set of alphabet, numeric, and underscore characters.
+        /// Create a new instance of the <see cref="PreTokenizer"/> class which split the text at the white spaces.
         /// </summary>
         /// <param name="specialTokensEncoder">The dictionary containing the special tokens and their corresponding ids.</param>
-        /// <returns>The pre-tokenizer that splits the text at the word boundary.</returns>
-        public static PreTokenizer CreateWhiteSpace(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
+        /// <returns>The pre-tokenizer that splits the text at the white spaces.</returns>
+        public static PreTokenizer CreateWhiteSpacePreTokenizer(IReadOnlyDictionary<string, int>? specialTokensEncoder = null)
         {
             if (specialTokensEncoder is null)
             {
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ namespace Microsoft.ML.Tokenizers`
`10`	`10`	`/// Represent the token produced from the tokenization process containing the token substring,`
`11`	`11`	`/// the id associated to the token substring, and the offset mapping to the original string.`
`12`	`12`	`/// </summary>`
`13`		`- public readonly struct EncodedToken`
	`13`	`+ public readonly struct EncodedToken : IEquatable<EncodedToken>`
`14`	`14`	`{`
`15`	`15`	`/// <summary>`
`16`	`16`	`/// Gets the Id value associated to the token.`
`@@ -39,5 +39,8 @@ public EncodedToken(int id, string value, Range offset)`
`39`	`39`	`Offset = offset;`
`40`	`40`	`Value = value;`
`41`	`41`	`}`
	`42`	`+`
	`43`	`+ /// inherited`
	`44`	`+ public bool Equals(EncodedToken other) => Id == other.Id && Value == other.Value && Offset.Equals(other.Offset);`
`42`	`45`	`}`
`43`	`46`	`}`