Skip to content

Commit

Permalink
Migrate base64_symbol_indexes wrapper to vec64
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jul 3, 2024
1 parent d63987c commit 5a67578
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies = [
"python-magic", # XXX review
"tokenizers",
"unidecode", # XXX review
"vec64>0.0.2",
"vec64>0.0.5",
]

[project.urls]
Expand Down
12 changes: 0 additions & 12 deletions src/dom_tokenizers/internal/base64.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from base64 import b64decode as _b64decode, _bytes_from_decode_data, binascii

from vec64 import base64_symbol_indexes as _base64_symbol_indexes


def b64decode(s, *args, **kwargs) -> bytes:
fix_padding = kwargs.pop("fix_padding", False)
Expand All @@ -16,13 +14,3 @@ def b64decode(s, *args, **kwargs) -> bytes:
n = len(t) & 3
t += b"AA=="[n:]
return _b64decode(t, *args, **kwargs)


def base64_symbol_indexes(text: str) -> bytes:
    # Thin wrapper around the vec64 function of the same name: pass the
    # input straight through, and retry with an encoded copy if the
    # first attempt raises UnicodeEncodeError.
    try:
        indexes = _base64_symbol_indexes(text)
    except UnicodeEncodeError:
        # Encode with the default codec, substituting a replacement
        # marker for anything that cannot be encoded, then try again
        # with the resulting bytes.
        encoded = text.encode(errors="replace")
        indexes = _base64_symbol_indexes(encoded)
    return indexes


# Reuse the wrapped function's docstring for this wrapper.
base64_symbol_indexes.__doc__ = _base64_symbol_indexes.__doc__
3 changes: 2 additions & 1 deletion src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
import magic

from unidecode import unidecode
from vec64 import base64_symbol_indexes

from ..internal import json
from ..internal.base64 import b64decode, base64_symbol_indexes
from ..internal.base64 import b64decode
from .base64 import base64_probability

logger = logging.getLogger(__name__)
Expand Down

0 comments on commit 5a67578

Please sign in to comment.