Skip to content

Commit ab8d54b

Browse files
committed
Minor Bpe cleanup
1 parent 8cd8bb7 commit ab8d54b

File tree

1 file changed

+12
-7
lines changed
  • src/Microsoft.ML.Tokenizers/Model

1 file changed

+12
-7
lines changed

src/Microsoft.ML.Tokenizers/Model/BPE.cs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,17 +56,17 @@ private set
5656
/// <summary>
5757
/// A prefix to be used for every subword that is not a beginning-of-word
5858
/// </summary>
59-
public string? ContinuingSubwordPrefix { get; private set; }
59+
public string? ContinuingSubwordPrefix { get; }
6060

6161
/// <summary>
6262
/// An optional suffix to characterize an end-of-word sub-word
6363
/// </summary>
64-
public string? EndOfWordSuffix { get; private set; }
64+
public string? EndOfWordSuffix { get; }
6565

6666
/// <summary>
6767
/// Gets a value indicating whether multiple consecutive unknown tokens get fused
6868
/// </summary>
69-
public bool FuseUnknownTokens { get; private set; }
69+
public bool FuseUnknownTokens { get; }
7070

7171

7272
/// <summary>
@@ -146,6 +146,11 @@ private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, stri
146146
throw new InvalidOperationException($"Trying to merge a token '{mergeValues.b}' which not exist in the vocabulary.");
147147
}
148148

149+
if (mergeValues.b.Length <= prefixLen)
150+
{
151+
throw new InvalidOperationException($"The merge value '{mergeValues.b}' is too short to be merged with a prefix of length {prefixLen}. This implies that the merge file is either damaged or missing the prefix in its entries.");
152+
}
153+
149154
string newToken = $"{mergeValues.a}{mergeValues.b.Substring(prefixLen)}";
150155
if (!_vocab.TryGetValue(newToken, out int newId))
151156
{
@@ -252,19 +257,19 @@ internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadModelData(
252257
private readonly Dictionary<string, int> _vocab;
253258

254259
/// Contains the mapping between Pairs and their (rank, newId).
255-
internal Dictionary<Pair<int>, (int, int)> Merges { get; set; }
260+
internal Dictionary<Pair<int>, (int, int)> Merges { get; }
256261

257262
/// Contains the cache for optimizing the encoding step.
258-
internal Cache<string, Word>? Cache { get; set; }
263+
internal Cache<string, Word>? Cache { get; }
259264

260265
internal static readonly int DefaultCacheCapacity = 10_000;
261266

262267
/// Reversed vocabulary, to rebuild the text.
263-
internal SortedDictionary<int, string> VocabReverse { get; set; }
268+
internal SortedDictionary<int, string> VocabReverse { get; }
264269

265270
/// Dropout probability for merges. 0 = no dropout is the default. At 1.0, tokenization will
266271
/// perform no merges, so the result will just be characters.
267-
internal float? Dropout { get; set; }
272+
internal float? Dropout { get; }
268273

269274
/// Converts the merges strings (for example from `merges.txt` file) with the format
270275
/// "{pair_a} {pair_b}" into the format expected by the BPE struct

0 commit comments

Comments
 (0)