Skip to content

Commit

Permalink
AirbyteLib: improve json schema type detection (airbytehq#35263)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronsteers authored and jatinyadav-cc committed Feb 26, 2024
1 parent 04c9eea commit de03e69
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 6 deletions.
25 changes: 20 additions & 5 deletions airbyte-lib/airbyte_lib/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class SQLTypeConversionError(Exception):
"""An exception to be raised when a type conversion fails."""


def _get_airbyte_type(
json_schema_property_def: dict[str, str | dict],
def _get_airbyte_type( # noqa: PLR0911 # Too many return statements
json_schema_property_def: dict[str, str | dict | list],
) -> tuple[str, str | None]:
"""Get the airbyte type and subtype from a JSON schema property definition.
Expand All @@ -45,6 +45,13 @@ def _get_airbyte_type(
json_schema_type = json_schema_property_def.get("type", None)
json_schema_format = json_schema_property_def.get("format", None)

# If json_schema_type is a list containing exactly one non-"null" entry, use that entry as the type.
# Connectors often declare types this way (e.g. ["null", "string"]) to indicate a field may be unset.
if isinstance(json_schema_type, list):
non_null_types = [t for t in json_schema_type if t != "null"]
if len(non_null_types) == 1:
json_schema_type = non_null_types[0]

if json_schema_type == "string":
if json_schema_format == "date":
return "date", None
Expand All @@ -58,9 +65,17 @@ def _get_airbyte_type(
if json_schema_type in ["string", "number", "boolean", "integer"]:
return cast(str, json_schema_type), None

if json_schema_type == "object" and "properties" in json_schema_property_def:
if json_schema_type == "object":
return "object", None

if json_schema_type == "array":
items_def = json_schema_property_def.get("items", None)
if isinstance(items_def, dict):
subtype, _ = _get_airbyte_type(items_def)
return "array", subtype

return "array", None

err_msg = f"Could not determine airbyte type from JSON schema type: {json_schema_property_def}"
raise SQLTypeConversionError(err_msg)

Expand All @@ -81,11 +96,11 @@ def get_failover_type() -> sqlalchemy.types.TypeEngine:

def to_sql_type(
self,
json_schema_property_def: dict[str, str | dict],
json_schema_property_def: dict[str, str | dict | list],
) -> sqlalchemy.types.TypeEngine:
"""Convert a value to a SQL type."""
try:
airbyte_type, airbyte_subtype = _get_airbyte_type(json_schema_property_def)
airbyte_type, _ = _get_airbyte_type(json_schema_property_def)
return self.conversion_map[airbyte_type]()
except SQLTypeConversionError:
print(f"Could not determine airbyte type from JSON schema: {json_schema_property_def}")
Expand Down
1 change: 1 addition & 0 deletions airbyte-lib/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ ignore = [
"PIE790", # Allow unnecessary 'pass' (sometimes useful for readability)
"PERF203", # exception handling in loop
"S", # flake8-bandit (noisy, security related)
"SIM910", # Allow "None" as second argument to Dict.get(). "Explicit is better than implicit."
"TD002", # Require author for TODOs
"TRIO", # flake8-trio (opinionated, noisy)
"INP001", # Dir 'examples' is part of an implicit namespace package. Add an __init__.py.
Expand Down
54 changes: 53 additions & 1 deletion airbyte-lib/tests/unit_tests/test_type_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@

import pytest
from sqlalchemy import types
from airbyte_lib.types import SQLTypeConverter
from airbyte_lib.types import SQLTypeConverter, _get_airbyte_type


@pytest.mark.parametrize(
"json_schema_property_def, expected_sql_type",
[
({"type": "string"}, types.VARCHAR),
({"type": ["boolean", "null"]}, types.BOOLEAN),
({"type": ["null", "boolean"]}, types.BOOLEAN),
({"type": "string"}, types.VARCHAR),
({"type": ["null", "string"]}, types.VARCHAR),
({"type": "boolean"}, types.BOOLEAN),
({"type": "string", "format": "date"}, types.DATE),
({"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"}, types.TIMESTAMP),
Expand All @@ -25,3 +30,50 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type):
converter = SQLTypeConverter()
sql_type = converter.to_sql_type(json_schema_property_def)
assert isinstance(sql_type, expected_sql_type)


@pytest.mark.parametrize(
    "json_schema_property_def, expected_airbyte_type",
    [
        # Plain scalar types.
        ({"type": "string"}, "string"),
        ({"type": "boolean"}, "boolean"),
        ({"type": "integer"}, "integer"),
        ({"type": "number"}, "number"),
        # Nullable declarations: the non-null entry wins, whatever its position.
        ({"type": ["boolean", "null"]}, "boolean"),
        ({"type": ["null", "boolean"]}, "boolean"),
        ({"type": ["null", "string"]}, "string"),
        # String formats, optionally refined by an explicit "airbyte_type".
        ({"type": "string", "format": "date"}, "date"),
        (
            {"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"},
            "timestamp_without_timezone",
        ),
        (
            {"type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone"},
            "timestamp_with_timezone",
        ),
        (
            {"type": "string", "format": "time", "airbyte_type": "time_without_timezone"},
            "time_without_timezone",
        ),
        (
            {"type": "string", "format": "time", "airbyte_type": "time_with_timezone"},
            "time_with_timezone",
        ),
        # An explicit "airbyte_type" overrides the JSON schema number type.
        ({"type": "number", "airbyte_type": "integer"}, "integer"),
        # Container types without further detail.
        ({"type": "array"}, "array"),
        ({"type": "object"}, "object"),
    ],
)
def test_to_airbyte_type(json_schema_property_def, expected_airbyte_type):
    """Check the primary type returned by ``_get_airbyte_type``.

    Only the first element of the returned pair is compared; the subtype is
    ignored here.
    """
    airbyte_type, _ = _get_airbyte_type(json_schema_property_def)
    assert airbyte_type == expected_airbyte_type


@pytest.mark.parametrize(
    "json_schema_property_def, expected_airbyte_type, expected_airbyte_subtype",
    [
        # Definitions without "items" carry no subtype.
        ({"type": "string"}, "string", None),
        ({"type": "number"}, "number", None),
        ({"type": "array"}, "array", None),
        ({"type": "object"}, "object", None),
        # Arrays with an "items" definition report the item type as the subtype.
        ({"type": "array", "items": {"type": ["null", "string"]}}, "array", "string"),
        ({"type": "array", "items": {"type": ["boolean"]}}, "array", "boolean"),
    ],
)
def test_to_airbyte_subtype(
    json_schema_property_def,
    expected_airbyte_type,
    expected_airbyte_subtype,
):
    """Check both the type and the subtype returned by ``_get_airbyte_type``."""
    actual_type, actual_subtype = _get_airbyte_type(json_schema_property_def)
    expected = (expected_airbyte_type, expected_airbyte_subtype)
    assert (actual_type, actual_subtype) == expected

0 comments on commit de03e69

Please sign in to comment.