-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Closed
Closed
Copy link
Labels
bugSomething isn't workingSomething isn't working
Description
If the data isn't found in the cache, nothing is added to the list.
machinelearning/src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs
Lines 262 to 305 in 4635a86
| private int TokenizeToIds(string sequence, IList<int>? accumulatedIds) | |
| { | |
| if (_cache.TryGet(sequence, out IReadOnlyList<Token>? hit)) | |
| { | |
| if (accumulatedIds is not null) | |
| { | |
| foreach (var t in hit) | |
| { | |
| accumulatedIds.Add(t.Id); | |
| } | |
| } | |
| return hit.Count; | |
| } | |
| Span<char> token = stackalloc char[100]; | |
| Span<int> indexMapping = stackalloc int[100]; | |
| if (sequence.Length > 100) | |
| { | |
| token = new char[sequence.Length].AsSpan(); | |
| indexMapping = new int[sequence.Length].AsSpan(); | |
| } | |
| int newTokenIndex = 0; | |
| for (int i = 0; i < sequence.Length; i++) | |
| { | |
| if (_byteToUnicode.TryGetValue(sequence[i], out var value)) | |
| { | |
| token[newTokenIndex] = value; | |
| indexMapping[newTokenIndex] = i; | |
| newTokenIndex++; | |
| } | |
| } | |
| if (newTokenIndex == 0) | |
| { | |
| return 0; | |
| } | |
| IReadOnlyList<Token> result = EncodeToTokens(token.Slice(0, newTokenIndex), indexMapping); | |
| _cache.Set(sequence, result); | |
| return result.Count; | |
| } |
Metadata
Metadata
Assignees
Labels
bugSomething isn't workingSomething isn't working