We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 344af32 · commit c305c38 (Copy full SHA for c305c38)
src/tokenizers.js
@@ -2559,7 +2559,11 @@ export class PreTrainedTokenizer extends Callable {
2559
2560
2561
this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
2562
- this.added_tokens.map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`).join('|')
+ this.added_tokens
2563
+ // Sort by length (desc) to avoid early partial matches
2564
+ .toSorted((a, b) => b.content.length - a.content.length)
2565
+ .map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`)
2566
+ .join('|')
2567
) : null;
2568
2569
// Set mask token if present (otherwise will be undefined, which is fine)
0 commit comments