Skip to content

Commit

Permalink
Fix a tokenizer crash
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 10, 2024
1 parent 67ebd7c commit 3db0b71
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 4 deletions.
10 changes: 7 additions & 3 deletions src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def split(self, text: str, flags: Flags = Flags.FULL) -> Iterable[str]:
continue

# Split on "_" (have to do this b/c "\w" matches it)
new_splits = curr.split("_")
new_splits = curr.split("_", maxsplit=1)
if len(new_splits) > 1:
if VERBOSE: # pragma: no cover
debug("it's stuff with underscores")
Expand Down Expand Up @@ -310,9 +310,13 @@ def _sub_js_escape(self, splits, cursor):
# Terminal backslash
splits.pop(cursor)
return cursor
else: # curr == "\\"
curr = splits[cursor_limit]
elif (curr := splits[cursor_limit]):
# Regular escape
cursor_limit += 1
else:
# Backslash followed by something that's been split away?
splits[cursor] = SPLIT
return cursor_limit

# Store what we want at `splits[cursor:cursor_limit]` in `result`.
match curr[0]:
Expand Down
19 changes: 18 additions & 1 deletion tests/test_splitter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from dom_tokenizers.pre_tokenizers.splitter import TextSplitter
from dom_tokenizers.pre_tokenizers.splitter import TextSplitter, SPLIT


@pytest.mark.parametrize(
Expand Down Expand Up @@ -73,6 +73,9 @@ def test_first_split_re(text, expect_splits):
(r"hello\world", ["hello", "world"]), # not valid => not handled
("hello\\", ["hello"]),
("\\hello", ["hello"]),
("_ \\_", []),
("_ \\_a", ["a"]),
("_ \\_ b", ["b"]),
# Javascript unicode escapes
(r"hello\u0020world", ["hello", "world"]),
Expand Down Expand Up @@ -201,6 +204,20 @@ def test_prefixed_hex(text, expect_tokens):
assert list(TextSplitter().split(text)) == expect_tokens


def test_sub_js_escape_crasher():
    """Check `_sub_js_escape()` copes with the input `["\\", ""]`.

    Feeding `["\\", ""]` to `_sub_js_escape()` used to raise an
    `IndexError`.  That crash has been fixed, and so has the upstream
    error that produced this sequence in the first place.  As a result
    none of the regular `TextSplitter().split()` tests reach this code
    path any more, so this testcase calls the method directly to keep
    it covered.
    """
    splitter = TextSplitter()
    pieces = ["\\", ""]

    # A lone backslash followed by an empty split: the returned cursor
    # is expected to move past the backslash...
    next_cursor = splitter._sub_js_escape(pieces, 0)
    assert next_cursor == 1

    # ...and the backslash itself is expected to be replaced in place
    # by the SPLIT marker, leaving the empty string untouched.
    assert pieces == [SPLIT, ""]


@pytest.mark.parametrize(
"text,expect_tokens",
(("That\u2019s all we know.",
Expand Down

0 comments on commit 3db0b71

Please sign in to comment.