Tokenizer's Interfaces Cleanup (#7001)

* Tokenizer's Interfaces Cleanup
* Address the feedback
* Optimization
dotnet · Feb 16, 2024 · 4635a86 · 4635a86
1 parent 64523e8
commit 4635a86
Show file tree

Hide file tree

Showing 11 changed files with 470 additions and 226 deletions.
diff --git a/src/Microsoft.ML.Tokenizers/Model/BPE.cs b/src/Microsoft.ML.Tokenizers/Model/BPE.cs
@@ -95,6 +95,7 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st
 
             (Dictionary<string, int>? vocab1, Vec<(string, string)> merges) = ReadFile(vocabFile, mergesFile);
             Vocab = vocab1 ?? new Dictionary<string, int>();
+            Cache = new Cache<string, Word>();
 
             VocabReverse = new();
 
@@ -146,23 +147,33 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st
         /// Tokenize a sequence string to a list of tokens.
         /// </summary>
         /// <param name="sequence">The sequence to tokenize.</param>
+        /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
         /// <returns>The list of tokens generated from the sequence tokenization.</returns>
-        public override IReadOnlyList<Token> Tokenize(string sequence)
+        public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialToken = false)
         {
             if (sequence.Length == 0)
             {
                 return EmptyTokensList;
             }
 
-            if (!Dropout.HasValue)
-            {
-                return TokenizeWithCache(sequence);
-            }
+            return TokenizeWithCache(sequence);
+        }
 
-            Word word = MergeWord(sequence);
+        /// <summary>
+        /// Tokenize a split sequence string to a list of Ids and add them to the accumulatedIds list.
+        /// </summary>
+        /// <param name="sequence">The sequence to tokenize.</param>
+        /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
+        /// <param name="accumulatedIds">The list of accumulated tokenized Ids.</param>
+        public override void TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds) => TokenizeToIdsWithCache(sequence, accumulatedIds);
 
-            return WordToTokens(ref word);
-        }
+        /// <summary>
+        /// Get the number of tokens that the input sequence will be encoded to.
+        /// </summary>
+        /// <param name="sequence">The text to tokenize.</param>
+        /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
+        /// <returns>The number of tokens that the input sequence will be encoded to.</returns>
+        public override int CountTokens(string sequence, bool isSpecialToken) => TokenizeToIdsWithCache(sequence, null);
 
         /// <summary>
         /// Map the token to tokenized Id.
@@ -195,14 +206,6 @@ public override IReadOnlyList<Token> Tokenize(string sequence)
             return null;
         }
 
-        /// <summary>
-        /// Map the tokenized Id to the token.
-        /// </summary>
-        /// <param name="id">The Id to map to the token.</param>
-        /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the decoding.</param>
-        /// <returns>The mapped token of the Id.</returns>
-        public override string? IdToString(int id, bool skipSpecialTokens = false) => throw new NotImplementedException();
-
         /// <summary>
         /// Gets the dictionary mapping tokens to Ids.
         /// </summary>
@@ -332,7 +335,7 @@ internal string CharToString(char c)
 
         internal Word MergeWord(string w)
         {
-            Word word = Word.WithCapacity((int)w.Length);
+            Word word = Word.WithCapacity(w.Length);
             (int Id, int Len)? unk = null;
             int i = 0;
 
@@ -344,7 +347,7 @@ internal Word MergeWord(string w)
                 if (Char.IsHighSurrogate(w[i]) && i < w.Length - 1 && Char.IsLowSurrogate(w[i + 1]))
                 {
                     length = 2;
-                    s = w.Substring(i, (int)length);
+                    s = w.Substring(i, length);
                 }
                 else
                 {
@@ -403,7 +406,7 @@ internal Word MergeWord(string w)
                     }
                 }
 
-                i += (int)length;
+                i += length;
             }
 
             if (unk.HasValue)
@@ -415,45 +418,59 @@ internal Word MergeWord(string w)
             return word;
         }
 
-        // internal Word.Enumerator WordToTokens(Word word) => word.GetIterator(VocabReverse);
-        internal List<Token> WordToTokens(ref Word word)
+        internal List<Token> WordToTokens(ref Word word) => word.ToTokens(VocabReverse);
+
+        internal List<Token> TokenizeWithCache(string sequence)
         {
-            List<Token> tokens = new(word.SymbolsCount);
+            Word word;
+            if (Cache is not null)
+            {
+                if (Cache.TryGet(sequence, out word))
+                {
+                    return WordToTokens(ref word);
+                }
 
-            foreach (Token token in word.GetIterator(VocabReverse))
+                word = MergeWord(sequence);
+                Cache.Set(sequence, word);
+            }
+            else
             {
-                tokens.Add(token);
+                word = MergeWord(sequence);
             }
 
-            return tokens;
+            return WordToTokens(ref word);
         }
 
-        internal List<Token> TokenizeWithCache(string sequence)
+        internal int WordToIds(ref Word word, IList<int>? accumulatedIds)
         {
-            if (Cache is not null)
+            if (accumulatedIds is not null)
             {
-                Word? hit = Cache.Get(sequence);
-                if (hit.HasValue)
-                {
-                    Word w = hit.Value;
-                    return WordToTokens(ref w);
-                }
+                word.PopulateIds(accumulatedIds);
             }
 
-            Word word = MergeWord(sequence);
-            List<Token> tokens = WordToTokens(ref word);
+            return word.SymbolsCount;
+        }
+
+        internal int TokenizeToIdsWithCache(string sequence, IList<int>? accumulatedIds)
+        {
+            Word word;
 
             if (Cache is not null)
             {
+                if (Cache.TryGet(sequence, out Word hit))
+                {
+                    return WordToIds(ref hit, accumulatedIds);
+                }
+
+                word = MergeWord(sequence);
                 Cache.Set(sequence, word);
             }
+            else
+            {
+                word = MergeWord(sequence);
+            }
 
-            return tokens;
-        }
-
-        public override bool IsValidChar(char ch)
-        {
-            throw new NotImplementedException();
+            return WordToIds(ref word, accumulatedIds);
         }
 
         internal static readonly List<Token> EmptyTokensList = new();

diff --git a/src/Microsoft.ML.Tokenizers/Model/Cache.cs b/src/Microsoft.ML.Tokenizers/Model/Cache.cs
@@ -9,14 +9,14 @@
 
 namespace Microsoft.ML.Tokenizers
 {
-    internal sealed class Cache<TKey, TValue> where TKey : notnull
+    internal sealed class Cache<TKey, TValue> where TKey : notnull where TValue : notnull
     {
         internal Cache() : this(Bpe.DefaultCacheCapacity) { }
 
         internal Cache(int capacity)
         {
             Capacity = capacity;
-            Map = new Dictionary<TKey, TValue>((int)Capacity);
+            Map = new Dictionary<TKey, TValue>(Capacity);
         }
 
         private readonly ReaderWriterLockSlim _cacheLock = new ReaderWriterLockSlim();
@@ -25,7 +25,7 @@ internal Cache(int capacity)
 
         internal int Capacity { get; set; }
 
-        internal void Fresh() => Map = new Dictionary<TKey, TValue>((int)Capacity);
+        internal void Fresh() => Map = new Dictionary<TKey, TValue>(Capacity);
 
         internal void Clear()
         {
@@ -56,27 +56,22 @@ internal List<TValue> GetValues(IEnumerable<TKey> keys)
             return values;
         }
 
-        internal TValue? Get(TKey key)
+        internal bool TryGet(TKey key, out TValue value)
         {
             _cacheLock.EnterReadLock();
             try
             {
-                if (Map.TryGetValue(key, out TValue? value))
-                {
-                    return value;
-                }
+                return Map.TryGetValue(key, out value!);
             }
             finally { _cacheLock.ExitReadLock(); }
-
-            return default;
         }
 
-        internal void SetValues(IEnumerable<(TKey, TValue)> enteries)
+        internal void SetValues(IEnumerable<(TKey, TValue)> entries)
         {
             _cacheLock.EnterWriteLock();
             try
             {
-                foreach ((TKey, TValue) entry in enteries)
+                foreach ((TKey, TValue) entry in entries)
                 {
                     if (Capacity <= Map.Count)
                     {