fix: Avoid throwing an error when Part.text is empty in modality content checks

PiperOrigin-RevId: 650788670
happy-qiao authored and copybara-github committed Jul 9, 2024
1 parent fcdcc11 commit bbd4a49
Showing 2 changed files with 30 additions and 7 deletions.
28 changes: 22 additions & 6 deletions tests/unit/vertexai/test_tokenization.py
@@ -16,8 +16,8 @@
 import hashlib
 import io
 import os
-import tempfile
 import shutil
+import tempfile
 from typing import List
 from unittest import mock
 from vertexai.generative_models import Content, Image, Part
@@ -27,8 +27,11 @@
     get_tokenizer_for_model,
 )
 import pytest
-from sentencepiece import sentencepiece_model_pb2
 import sentencepiece as spm
+from sentencepiece import sentencepiece_model_pb2
+from google.cloud.aiplatform_v1beta1.types import (
+    content as gapic_content_types,
+)
 
 _TOKENIZER_NAME = "google/gemma"
 _MODEL_NAME = "gemini-1.5-pro"
@@ -63,9 +66,14 @@
             [
                 Part.from_text(_SENTENCE_1),
                 Part.from_text(_SENTENCE_2),
+                Part.from_text(_EMPTY_SENTENCE),
             ],
-            [_SENTENCE_1, _SENTENCE_2],
-            [_TOKENS_MAP[_SENTENCE_1]["ids"], _TOKENS_MAP[_SENTENCE_2]["ids"]],
+            [_SENTENCE_1, _SENTENCE_2, _EMPTY_SENTENCE],
+            [
+                _TOKENS_MAP[_SENTENCE_1]["ids"],
+                _TOKENS_MAP[_SENTENCE_2]["ids"],
+                _TOKENS_MAP[_EMPTY_SENTENCE]["ids"],
+            ],
         ),
         (
             Content(role="user", parts=[Part.from_text(_SENTENCE_1)]),
@@ -78,10 +86,15 @@
                 parts=[
                     Part.from_text(_SENTENCE_1),
                     Part.from_text(_SENTENCE_2),
+                    Part.from_text(_EMPTY_SENTENCE),
                 ],
             ),
-            [_SENTENCE_1, _SENTENCE_2],
-            [_TOKENS_MAP[_SENTENCE_1]["ids"], _TOKENS_MAP[_SENTENCE_2]["ids"]],
+            [_SENTENCE_1, _SENTENCE_2, _EMPTY_SENTENCE],
+            [
+                _TOKENS_MAP[_SENTENCE_1]["ids"],
+                _TOKENS_MAP[_SENTENCE_2]["ids"],
+                _TOKENS_MAP[_EMPTY_SENTENCE]["ids"],
+            ],
         ),
         (
             [
@@ -128,6 +141,9 @@
 
 
 _LIST_OF_UNSUPPORTED_CONTENTS = [
+    gapic_content_types.Part(
+        video_metadata=gapic_content_types.VideoMetadata(start_offset="10s")
+    ),
     Part.from_uri("gs://bucket/object", mime_type="mime_type"),
     Part.from_data(b"inline_data_bytes", mime_type="mime_type"),
     Part.from_dict({"function_call": {"name": "test_function_call"}}),
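
The added parametrize rows cover an empty text part: it should tokenize to an empty id list instead of raising. A minimal sketch of the behavior under test, assuming _EMPTY_SENTENCE is the empty string and assuming the preview tokenization API and its import path (not shown in this diff):

from vertexai.generative_models import Content, Part
from vertexai.preview.tokenization import get_tokenizer_for_model  # assumed import path

tokenizer = get_tokenizer_for_model("gemini-1.5-pro")

# Before this commit, the empty Part tripped the text-only check and raised
# ValueError("Tokenizers do not support non-text content types.").
result = tokenizer.count_tokens(
    Content(role="user", parts=[Part.from_text("hello world"), Part.from_text("")])
)
print(result.total_tokens)  # the empty part contributes zero tokens
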
9 changes: 8 additions & 1 deletion vertexai/tokenization/_tokenizers.py
@@ -33,6 +33,7 @@
 )
 from google.cloud.aiplatform_v1beta1.types import (
     content as gapic_content_types,
+    tool as gapic_tool_types,
 )
 
 
@@ -120,7 +121,13 @@ def _assert_text_only_content_types_sequence(
 
 def _assert_text_only_gapic_part(value: gapic_content_types.Part):
     """Asserts that the gapic content part is a text content type."""
-    if not value.text:
+    if (
+        gapic_content_types.FileData() != value.file_data
+        or gapic_content_types.Blob() != value.inline_data
+        or gapic_tool_types.FunctionCall() != value.function_call
+        or gapic_tool_types.FunctionResponse() != value.function_response
+        or gapic_content_types.VideoMetadata() != value.video_metadata
+    ):
         raise ValueError("Tokenizers do not support non-text content types.")
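
Why comparing against default instances works: with proto-plus messages, reading an unset message field returns its default instance, so a part whose only populated field is text (even empty text) compares equal to the defaults on every non-text field. The old check, `if not value.text:`, rejected empty text because `not ""` is truthy. A minimal sketch, not part of the commit, using only the gapic types shown above:

from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types

empty_text_part = gapic_content_types.Part(text="")
blob_part = gapic_content_types.Part(
    inline_data=gapic_content_types.Blob(mime_type="image/png", data=b"\x89PNG")
)

# Unset message fields read back as default instances, so the new check
# passes for the empty-text part...
assert gapic_content_types.Blob() == empty_text_part.inline_data
# ...while a part carrying a real non-text payload still fails it and raises.
assert gapic_content_types.Blob() != blob_part.inline_data

# The old check could not make this distinction:
assert not empty_text_part.text  # falsy, so it raised even for text-only input
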
