Skip to content

Commit c305c38

Browse files
committed
Sort added tokens by length to avoid early partial matches
1 parent 344af32 commit c305c38

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

src/tokenizers.js

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2559,7 +2559,11 @@ export class PreTrainedTokenizer extends Callable {
2559 2559
2560 2560
2561 2561   this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
2562      - this.added_tokens.map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`).join('|')
     2562 + this.added_tokens
     2563 + // Sort by length (desc) to avoid early partial matches
     2564 + .toSorted((a, b) => b.content.length - a.content.length)
     2565 + .map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`)
     2566 + .join('|')
2563 2567   ) : null;
2564 2568
2565 2569   // Set mask token if present (otherwise will be undefined, which is fine)

0 commit comments

Comments
 (0)