dotnet · tarekgh · Feb 18, 2024 · Feb 17, 2024
diff --git a/src/Microsoft.ML.Tokenizers/AssemblyInfo.cs b/src/Microsoft.ML.Tokenizers/AssemblyInfo.cs
@@ -0,0 +1,7 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if NET5_0_OR_GREATER // the attribute below is compiled only for targets that define this symbol
+[module: System.Runtime.CompilerServices.SkipLocalsInit] // module-wide target; NOTE(review): presumably why AllowUnsafeBlocks is enabled in the csproj - confirm
+#endif // NET5_0_OR_GREATER
diff --git a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
@@ -5,6 +5,7 @@
     <TargetFrameworks>netstandard2.0;net8.0</TargetFrameworks>
     <Nullable>enable</Nullable>
     <PackageDescription>Microsoft.ML.Tokenizers contains the implmentation of the tokenization used in the NLP transforms.</PackageDescription>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
 
   <ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">

diff --git a/src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs b/src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs
@@ -8,10 +8,7 @@
 using System.Diagnostics;
 using System.IO;
 using System.Linq;
-using System.Runtime.CompilerServices;
-using System.Text;
 using System.Text.Json;
-using System.Text.Json.Serialization;
 
 namespace Microsoft.ML.Tokenizers
 {
@@ -27,7 +24,7 @@ public sealed class EnglishRoberta : Model
         private readonly IReadOnlyDictionary<char, char> _byteToUnicode;
         private readonly IReadOnlyDictionary<char, char> _unicodeToByte;
         private readonly string[] _charToString;
-        private readonly Cache<string, IReadOnlyList<Token>> _cache;
+        private readonly Cache<string, List<Token>> _cache;
 
         /// <summary>
         /// Construct tokenizer object to use with the English Robert model.
@@ -72,7 +69,7 @@ public EnglishRoberta(string vocabularyPath, string mergePath, string highestOcc
             }
 
             _unicodeToByte = _byteToUnicode.Reverse();
-            _cache = new Cache<string, IReadOnlyList<Token>>();
+            _cache = new Cache<string, List<Token>>();
         }
 
         /// <summary>
@@ -110,7 +107,7 @@ public EnglishRoberta(Stream vocabularyStream, Stream mergeStream, Stream highes
             }
 
             _unicodeToByte = _byteToUnicode.Reverse();
-            _cache = new Cache<string, IReadOnlyList<Token>>();
+            _cache = new Cache<string, List<Token>>();
         }
 
         //
@@ -226,17 +223,17 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok
             {
                 ArrayPool<char>.Shared.Return(token);
                 ArrayPool<int>.Shared.Return(indexMapping);
-                return Bpe.EmptyTokensList;
+                return Array.Empty<Token>();
             }
 
-            if (_cache.TryGet(sequence, out IReadOnlyList<Token>? hit))
+            if (_cache.TryGet(sequence, out List<Token>? hit))
             {
                 ArrayPool<char>.Shared.Return(token);
                 ArrayPool<int>.Shared.Return(indexMapping);
                 return ModifyTokenListOffsets(hit, indexMapping);
             }
 
-            IReadOnlyList<Token> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
+            List<Token> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
             _cache.Set(sequence, result);
             ArrayPool<char>.Shared.Return(token);
             ArrayPool<int>.Shared.Return(indexMapping);
@@ -261,7 +258,7 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok
 
         private int TokenizeToIds(string sequence, IList<int>? accumulatedIds)
         {
-            if (_cache.TryGet(sequence, out IReadOnlyList<Token>? hit))
+            if (_cache.TryGet(sequence, out List<Token>? hit))
             {
                 if (accumulatedIds is not null)
                 {
@@ -299,7 +296,7 @@ private int TokenizeToIds(string sequence, IList<int>? accumulatedIds)
                 return 0;
             }
 
-            IReadOnlyList<Token> result = EncodeToTokens(token.Slice(0, newTokenIndex), indexMapping);
+            List<Token> result = EncodeToTokens(token.Slice(0, newTokenIndex), indexMapping);
             _cache.Set(sequence, result);
             return result.Count;
         }