Tokenizer's Interfaces Cleanup #7001

Merged: 3 commits, Feb 16, 2024
Changes from all commits
99 changes: 58 additions & 41 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs
@@ -95,6 +95,7 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st

(Dictionary<string, int>? vocab1, Vec<(string, string)> merges) = ReadFile(vocabFile, mergesFile);
Vocab = vocab1 ?? new Dictionary<string, int>();
Cache = new Cache<string, Word>();

VocabReverse = new();

@@ -146,23 +147,33 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st
/// Tokenize a sequence string to a list of tokens.
/// </summary>
/// <param name="sequence">The sequence to tokenize.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <returns>The list of tokens generated from the sequence tokenization.</returns>
public override IReadOnlyList<Token> Tokenize(string sequence)
public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialToken = false)
{
if (sequence.Length == 0)
{
return EmptyTokensList;
}

if (!Dropout.HasValue)
{
return TokenizeWithCache(sequence);
}
return TokenizeWithCache(sequence);
}

Word word = MergeWord(sequence);
/// <summary>
/// Tokenize a split sequence string to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="sequence">The sequence to split.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="accumulatedIds">The list of accumulated tokenized Ids.</param>
public override void TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds) => TokenizeToIdsWithCache(sequence, accumulatedIds);

return WordToTokens(ref word);
}
/// <summary>
/// Get the number of tokens that the input sequence will be encoded to.
/// </summary>
/// <param name="sequence">The text to tokenize.</param>
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
/// <returns>The number of tokens that the input sequence will be encoded to.</returns>
public override int CountTokens(string sequence, bool isSpecialToken) => TokenizeToIdsWithCache(sequence, null);

/// <summary>
/// Map the token to tokenized Id.
@@ -195,14 +206,6 @@ public override IReadOnlyList<Token> Tokenize(string sequence)
return null;
}

/// <summary>
/// Map the tokenized Id to the token.
/// </summary>
/// <param name="id">The Id to map to the token.</param>
/// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the decoding.</param>
/// <returns>The mapped token of the Id.</returns>
public override string? IdToString(int id, bool skipSpecialTokens = false) => throw new NotImplementedException();

/// <summary>
/// Gets the dictionary mapping tokens to Ids.
/// </summary>
@@ -332,7 +335,7 @@ internal string CharToString(char c)

internal Word MergeWord(string w)
{
Word word = Word.WithCapacity((int)w.Length);
Word word = Word.WithCapacity(w.Length);
(int Id, int Len)? unk = null;
int i = 0;

@@ -344,7 +347,7 @@ internal Word MergeWord(string w)
if (Char.IsHighSurrogate(w[i]) && i < w.Length - 1 && Char.IsLowSurrogate(w[i + 1]))
{
length = 2;
s = w.Substring(i, (int)length);
s = w.Substring(i, length);
}
else
{
@@ -403,7 +406,7 @@ internal Word MergeWord(string w)
}
}

i += (int)length;
i += length;
}

if (unk.HasValue)
@@ -415,45 +418,59 @@ internal Word MergeWord(string w)
return word;
}

// internal Word.Enumerator WordToTokens(Word word) => word.GetIterator(VocabReverse);
internal List<Token> WordToTokens(ref Word word)
internal List<Token> WordToTokens(ref Word word) => word.ToTokens(VocabReverse);

internal List<Token> TokenizeWithCache(string sequence)
{
List<Token> tokens = new(word.SymbolsCount);
Word word;
if (Cache is not null)
{
if (Cache.TryGet(sequence, out word))
{
return WordToTokens(ref word);
}

foreach (Token token in word.GetIterator(VocabReverse))
word = MergeWord(sequence);
Cache.Set(sequence, word);
}
else
{
tokens.Add(token);
word = MergeWord(sequence);
}

return tokens;
return WordToTokens(ref word);
Comment on lines +423 to +441

Member:

Can this whole method just be:

List<Token> result = new();
TokenizeToIdsWithCache(sequence, result);
return result;

?

Member Author (@tarekgh, Feb 15, 2024):

No. TokenizeToIdsWithCache(sequence, result) fills only the Ids, removing the overhead of building the whole tokens. Note the difference between the implementations of TokenizeToIdsWithCache and TokenizeWithCache: the first calls WordToIds while the second calls WordToTokens.
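To make the author's point concrete, here is a minimal editorial sketch of the two paths (not part of the diff; bpe stands for a hypothetical Bpe instance, and both methods are internal to the library):

// TokenizeWithCache -> WordToTokens: materializes a Token object per symbol.
List<Token> tokens = bpe.TokenizeWithCache("hello");

// TokenizeToIdsWithCache -> WordToIds: appends raw ids to a caller-owned list,
// or, when passed null, just returns the symbol count (as CountTokens does).
List<int> ids = new();
int count = bpe.TokenizeToIdsWithCache("hello", ids);
int countOnly = bpe.TokenizeToIdsWithCache("hello", null);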

}

internal List<Token> TokenizeWithCache(string sequence)
internal int WordToIds(ref Word word, IList<int>? accumulatedIds)
{
if (Cache is not null)
if (accumulatedIds is not null)
{
Word? hit = Cache.Get(sequence);
if (hit.HasValue)
{
Word w = hit.Value;
return WordToTokens(ref w);
}
word.PopulateIds(accumulatedIds);
}

Word word = MergeWord(sequence);
List<Token> tokens = WordToTokens(ref word);
return word.SymbolsCount;
}

internal int TokenizeToIdsWithCache(string sequence, IList<int>? accumulatedIds)
{
Word word;

if (Cache is not null)
{
if (Cache.TryGet(sequence, out Word hit))
{
return WordToIds(ref hit, accumulatedIds);
}

word = MergeWord(sequence);
Cache.Set(sequence, word);
}
else
{
word = MergeWord(sequence);
}

return tokens;
}

public override bool IsValidChar(char ch)
{
throw new NotImplementedException();
return WordToIds(ref word, accumulatedIds);
}

internal static readonly List<Token> EmptyTokensList = new();
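Taken together, the BPE.cs changes reshape the model's public surface: Tokenize gains an isSpecialToken flag, TokenizeToIds appends ids into a caller-owned buffer, and CountTokens sizes an encoding without materializing it. A minimal usage sketch, assuming a hypothetical vocab/merges pair on disk (note that Bpe, as a model, is given an already split sequence rather than raw text):

using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

// Hypothetical file paths; any compatible BPE vocab/merges pair would do.
Bpe bpe = new Bpe("vocab.json", "merges.txt");

// Full tokens, when the caller needs the token text and ids:
IReadOnlyList<Token> tokens = bpe.Tokenize("hello");

// Ids only, appended into a caller-owned list, skipping Token allocation:
List<int> ids = new();
bpe.TokenizeToIds("hello", isSpecialToken: false, ids);

// Count only, without materializing tokens or ids:
int count = bpe.CountTokens("hello", isSpecialToken: false);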
19 changes: 7 additions & 12 deletions src/Microsoft.ML.Tokenizers/Model/Cache.cs
@@ -9,14 +9,14 @@

namespace Microsoft.ML.Tokenizers
{
internal sealed class Cache<TKey, TValue> where TKey : notnull
internal sealed class Cache<TKey, TValue> where TKey : notnull where TValue : notnull
{
internal Cache() : this(Bpe.DefaultCacheCapacity) { }

internal Cache(int capacity)
{
Capacity = capacity;
Map = new Dictionary<TKey, TValue>((int)Capacity);
Map = new Dictionary<TKey, TValue>(Capacity);
}

private readonly ReaderWriterLockSlim _cacheLock = new ReaderWriterLockSlim();
@@ -25,7 +25,7 @@ internal Cache(int capacity)

internal int Capacity { get; set; }

internal void Fresh() => Map = new Dictionary<TKey, TValue>((int)Capacity);
internal void Fresh() => Map = new Dictionary<TKey, TValue>(Capacity);

internal void Clear()
{
@@ -56,27 +56,22 @@ internal List<TValue> GetValues(IEnumerable<TKey> keys)
return values;
}

internal TValue? Get(TKey key)
internal bool TryGet(TKey key, out TValue value)
{
_cacheLock.EnterReadLock();
try
{
if (Map.TryGetValue(key, out TValue? value))
{
return value;
}
return Map.TryGetValue(key, out value!);
}
finally { _cacheLock.ExitReadLock(); }

return default;
}

internal void SetValues(IEnumerable<(TKey, TValue)> enteries)
internal void SetValues(IEnumerable<(TKey, TValue)> entries)
{
_cacheLock.EnterWriteLock();
try
{
foreach ((TKey, TValue) entry in enteries)
foreach ((TKey, TValue) entry in entries)
{
if (Capacity <= Map.Count)
{
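The Cache.cs change swaps the nullable Get for the TryGet pattern and constrains TValue to notnull, so a lookup is a single dictionary probe under the read lock and call sites no longer wrap struct values such as Word in a Nullable<T>. A standalone sketch of the pattern under those assumptions (capacity handling and eviction omitted):

using System.Collections.Generic;
using System.Threading;

var cache = new MiniCache<string, int>();
cache.Set("hello", 42);
if (cache.TryGet("hello", out int id))
{
    // A hit yields the value directly; no HasValue/Value unwrapping needed.
    System.Console.WriteLine(id);
}

// Simplified stand-in for the library's internal Cache<TKey, TValue>.
internal sealed class MiniCache<TKey, TValue> where TKey : notnull where TValue : notnull
{
    private readonly ReaderWriterLockSlim _lock = new();
    private readonly Dictionary<TKey, TValue> _map = new();

    internal bool TryGet(TKey key, out TValue value)
    {
        _lock.EnterReadLock();
        try { return _map.TryGetValue(key, out value!); }
        finally { _lock.ExitReadLock(); }
    }

    internal void Set(TKey key, TValue value)
    {
        _lock.EnterWriteLock();
        try { _map[key] = value; }
        finally { _lock.ExitWriteLock(); }
    }
}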