|
16 | 16 | from .. import _utils, usage |
17 | 17 | from .._utils import PeekableAsyncStream |
18 | 18 | from ..messages import ( |
19 | | - AudioUrl, |
20 | 19 | BinaryContent, |
21 | | - ImageUrl, |
22 | 20 | ModelMessage, |
23 | 21 | ModelRequest, |
24 | 22 | ModelResponse, |
@@ -345,18 +343,19 @@ def _estimate_usage(messages: Iterable[ModelMessage]) -> usage.Usage: |
345 | 343 | def _estimate_string_tokens(content: str | Sequence[UserContent]) -> int: |
346 | 344 | if not content: |
347 | 345 | return 0 |
| 346 | + |
348 | 347 | if isinstance(content, str): |
349 | | - return len(re.split(r'[\s",.:]+', content.strip())) |
350 | | - else: |
351 | | - tokens = 0 |
352 | | - for part in content: |
353 | | - if isinstance(part, str): |
354 | | - tokens += len(re.split(r'[\s",.:]+', part.strip())) |
355 | | - # TODO(Marcelo): We need to study how we can estimate the tokens for these types of content. |
356 | | - if isinstance(part, (AudioUrl, ImageUrl)): |
357 | | - tokens += 0 |
358 | | - elif isinstance(part, BinaryContent): |
359 | | - tokens += len(part.data) |
360 | | - else: |
361 | | - tokens += 0 |
362 | | - return tokens |
| 348 | + return len(_TOKEN_SPLIT_RE.split(content.strip())) |
| 349 | + |
| 350 | + tokens = 0 |
| 351 | + for part in content: |
| 352 | + if isinstance(part, str): |
| 353 | + tokens += len(_TOKEN_SPLIT_RE.split(part.strip())) |
| 354 | + elif isinstance(part, BinaryContent): |
| 355 | + tokens += len(part.data) |
| 356 | + # TODO(Marcelo): We need to study how we can estimate the tokens for AudioUrl or ImageUrl. |
| 357 | + |
| 358 | + return tokens |
| 359 | + |
| 360 | + |
| 361 | +_TOKEN_SPLIT_RE = re.compile(r'[\s",.:]+') |
0 commit comments