Skip to content

Commit

Permalink
Support any string based ID
Browse files Browse the repository at this point in the history
* Previously, string-based IDs that weren't convertible to hex would
  throw an exception. This fixes that issue by hashing such strings
  with MD5 (a 128-bit digest) and converting the MD5 hex digest into a UUID
* Update Weaviate to 1.17.3
  • Loading branch information
samos123 committed Feb 7, 2023
1 parent c955e61 commit 555ae3a
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ COPY destination_weaviate ./destination_weaviate
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.1.0
LABEL io.airbyte.version=0.1.1
LABEL io.airbyte.name=airbyte/destination-weaviate
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import re
import uuid
import hashlib
from typing import Any, Mapping

from airbyte_cdk.models import ConfiguredAirbyteCatalog
Expand Down Expand Up @@ -40,16 +41,23 @@ def hex_to_int(hex_str: str) -> int:
return 0


def is_uuid_string(uuid_string):
    """Check whether ``uuid_string`` is a hyphenated UUID string.

    The pattern requires the 8-4-4-4-12 hexadecimal layout, a version
    nibble in 0-5 and a variant nibble of 0, 8, 9, a or b.

    :param uuid_string: candidate string to test.
    :return: the ``re.Match`` object when the whole string is a UUID,
        otherwise ``None`` (callers only rely on truthiness).
    """
    uuid_pattern = "^[0-9a-f]{8}-[0-9a-f]{4}-[0-5][0-9a-f]{3}-[089ab][0-9a-f]{3}-[0-9a-f]{12}$"
    # fullmatch: with re.match, ``$`` also matches just before a trailing
    # newline, letting through a string that uuid.UUID() then rejects.
    # IGNORECASE: uuid.UUID() parses upper-case hex digits, so upper-case
    # UUIDs must be recognised here too.
    return re.fullmatch(uuid_pattern, uuid_string, re.IGNORECASE)


def generate_id(record_id: Any) -> uuid.UUID:
    """Derive a deterministic UUID from a record identifier.

    Resolution order:

    * ``int``: used directly as the UUID's integer value.
    * ``str`` accepted by ``is_uuid_string``: parsed as a UUID.
    * ``str`` for which ``hex_to_int`` yields a positive integer: that
      integer becomes the UUID's integer value.
    * any other ``str``: hashed with MD5; the 32-character hex digest
      becomes the UUID, so equal strings always map to equal UUIDs.

    :param record_id: identifier taken from the incoming record.
    :return: the derived UUID. For values that are neither ``int`` nor
        ``str`` the function falls through and returns ``None``.
    :raises ValueError: propagated from ``uuid.UUID`` when an integer is
        outside the 128-bit range.
    """
    if isinstance(record_id, int):
        return uuid.UUID(int=record_id)

    if isinstance(record_id, str):
        if is_uuid_string(record_id):
            return uuid.UUID(record_id)

        # Convert once and reuse the result instead of parsing twice.
        id_int = hex_to_int(record_id)
        if id_int > 0:
            return uuid.UUID(int=id_int)

        # MD5 is used only to obtain a stable 128-bit value from arbitrary
        # text, not for any security purpose.
        hex_string = hashlib.md5(record_id.encode("UTF-8")).hexdigest()
        return uuid.UUID(hex=hex_string)


def get_schema_from_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> Mapping[str, Mapping[str, str]]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def setup_teardown(config: Mapping):
pass

docker_client.containers.run(
"semitechnologies/weaviate:1.16.1", detach=True, environment=env_vars, name=name,
"semitechnologies/weaviate:1.17.3", detach=True, environment=env_vars, name=name,
ports={8080: ('127.0.0.1', 8081)}
)
time.sleep(0.5)
Expand Down Expand Up @@ -403,6 +403,33 @@ def test_id_starting_with_underscore(config: Mapping, client: Client):
assert actual.get("id") == str(uuid.UUID(int=int(data.get("_id"), 16))), "UUID should be created for _id field"


def test_id_with_text_string(config: Mapping, client: Client):
    """A non-hex, non-UUID string id must be stored under a stable MD5-derived UUID."""
    # Imported locally so the test does not depend on a module-level import.
    import hashlib

    stream_name = "article"
    stream_schema = {"type": "object", "properties": {
        "title": {"type": "string"},
        "id": {"type": "string"}
    }}
    catalog = create_catalog(stream_name, stream_schema)
    first_state_message = _state({"state": "1"})
    data = {"title": "test1", "id": "not a real id"}
    first_record_chunk = [_record(stream_name, data)]

    destination = DestinationWeaviate()

    expected_states = [first_state_message]
    output_states = list(
        destination.write(
            config, catalog, [*first_record_chunk, first_state_message]
        )
    )
    assert expected_states == output_states, "Checkpoint state messages were expected from the destination"

    class_name = stream_to_class_name(stream_name)
    assert count_objects(client, class_name) == 1, "There should be only 1 object of in Weaviate"
    actual = get_objects(client, class_name)[0]
    # Pin the exact id: a truthiness check would also pass for a random id,
    # hiding a regression in the deterministic string -> UUID mapping.
    expected_id = str(uuid.UUID(hex=hashlib.md5(data.get("id").encode("UTF-8")).hexdigest()))
    assert actual.get("id") == expected_id, "UUID should be derived from the MD5 hash of the text id"


def test_id_custom_field_name(config: Mapping, client: Client):
# This is common scenario from mongoDB
stream_name = "article"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from setuptools import find_packages, setup

MAIN_REQUIREMENTS = ["airbyte-cdk", "weaviate-client==3.9.0"]
MAIN_REQUIREMENTS = ["airbyte-cdk", "weaviate-client==3.11.0"]

TEST_REQUIREMENTS = ["pytest~=6.2", "docker"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,6 @@ def test_generate_id():
assert generate_id("0x1") == uuid.UUID(int=1)
assert generate_id(1) == uuid.UUID(int=1)
assert generate_id("123e4567-e89b-12d3-a456-426614174000") == uuid.UUID("123e4567-e89b-12d3-a456-426614174000")
assert generate_id("123e4567e89b12d3a456426614174000") == uuid.UUID("123e4567-e89b-12d3-a456-426614174000")
for i in range(10):
assert generate_id("this should be using md5") == uuid.UUID("802a479a-190e-92c8-8340-d687c860f53d")

0 comments on commit 555ae3a

Please sign in to comment.