diff --git a/airbyte-lib/airbyte_lib/types.py b/airbyte-lib/airbyte_lib/types.py index fa4f39737343..c133b090c347 100644 --- a/airbyte-lib/airbyte_lib/types.py +++ b/airbyte-lib/airbyte_lib/types.py @@ -31,8 +31,8 @@ class SQLTypeConversionError(Exception): """An exception to be raised when a type conversion fails.""" -def _get_airbyte_type( - json_schema_property_def: dict[str, str | dict], +def _get_airbyte_type( # noqa: PLR0911 # Too many return statements + json_schema_property_def: dict[str, str | dict | list], ) -> tuple[str, str | None]: """Get the airbyte type and subtype from a JSON schema property definition. @@ -45,6 +45,13 @@ def _get_airbyte_type( json_schema_type = json_schema_property_def.get("type", None) json_schema_format = json_schema_property_def.get("format", None) + # if json_schema_type is an array of two strings with one of them being null, pick the other one + # this strategy is often used by connectors to indicate a field might not be set all the time + if isinstance(json_schema_type, list): + non_null_types = [t for t in json_schema_type if t != "null"] + if len(non_null_types) == 1: + json_schema_type = non_null_types[0] + if json_schema_type == "string": if json_schema_format == "date": return "date", None @@ -58,9 +65,17 @@ def _get_airbyte_type( if json_schema_type in ["string", "number", "boolean", "integer"]: return cast(str, json_schema_type), None - if json_schema_type == "object" and "properties" in json_schema_property_def: + if json_schema_type == "object": return "object", None + if json_schema_type == "array": + items_def = json_schema_property_def.get("items", None) + if isinstance(items_def, dict): + subtype, _ = _get_airbyte_type(items_def) + return "array", subtype + + return "array", None + err_msg = f"Could not determine airbyte type from JSON schema type: {json_schema_property_def}" raise SQLTypeConversionError(err_msg) @@ -81,11 +96,11 @@ def get_failover_type() -> sqlalchemy.types.TypeEngine: def to_sql_type( self, - json_schema_property_def: dict[str, str | dict], + json_schema_property_def: dict[str, str | dict | list], ) -> sqlalchemy.types.TypeEngine: """Convert a value to a SQL type.""" try: - airbyte_type, airbyte_subtype = _get_airbyte_type(json_schema_property_def) + airbyte_type, _ = _get_airbyte_type(json_schema_property_def) return self.conversion_map[airbyte_type]() except SQLTypeConversionError: print(f"Could not determine airbyte type from JSON schema: {json_schema_property_def}") diff --git a/airbyte-lib/pyproject.toml b/airbyte-lib/pyproject.toml index d367368bd17a..0101f8b49e18 100644 --- a/airbyte-lib/pyproject.toml +++ b/airbyte-lib/pyproject.toml @@ -135,6 +135,7 @@ ignore = [ "PIE790", # Allow unnecssary 'pass' (sometimes useful for readability) "PERF203", # exception handling in loop "S", # flake8-bandit (noisy, security related) + "SIM910", # Allow "None" as second argument to Dict.get(). "Explicit is better than implicit." "TD002", # Require author for TODOs "TRIO", # flake8-trio (opinionated, noisy) "INP001", # Dir 'examples' is part of an implicit namespace package. Add an __init__.py. diff --git a/airbyte-lib/tests/unit_tests/test_type_translation.py b/airbyte-lib/tests/unit_tests/test_type_translation.py index 80c1e611c662..cb5f59f7feba 100644 --- a/airbyte-lib/tests/unit_tests/test_type_translation.py +++ b/airbyte-lib/tests/unit_tests/test_type_translation.py @@ -2,12 +2,17 @@ import pytest from sqlalchemy import types -from airbyte_lib.types import SQLTypeConverter +from airbyte_lib.types import SQLTypeConverter, _get_airbyte_type + @pytest.mark.parametrize( "json_schema_property_def, expected_sql_type", [ ({"type": "string"}, types.VARCHAR), + ({"type": ["boolean", "null"]}, types.BOOLEAN), + ({"type": ["null", "boolean"]}, types.BOOLEAN), + ({"type": "string"}, types.VARCHAR), + ({"type": ["null", "string"]}, types.VARCHAR), ({"type": "boolean"}, types.BOOLEAN), ({"type": "string", "format": "date"}, types.DATE), ({"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"}, types.TIMESTAMP), @@ -25,3 +30,50 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type): converter = SQLTypeConverter() sql_type = converter.to_sql_type(json_schema_property_def) assert isinstance(sql_type, expected_sql_type) + + +@pytest.mark.parametrize( + "json_schema_property_def, expected_airbyte_type", + [ + ({"type": "string"}, "string"), + ({"type": ["boolean", "null"]}, "boolean"), + ({"type": ["null", "boolean"]}, "boolean"), + ({"type": "string"}, "string"), + ({"type": ["null", "string"]}, "string"), + ({"type": "boolean"}, "boolean"), + ({"type": "string", "format": "date"}, "date"), + ({"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"}, "timestamp_without_timezone"), + ({"type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone"}, "timestamp_with_timezone"), + ({"type": "string", "format": "time", "airbyte_type": "time_without_timezone"}, "time_without_timezone"), + ({"type": "string", "format": "time", "airbyte_type": "time_with_timezone"}, "time_with_timezone"), + ({"type": "integer"}, "integer"), + ({"type": "number", "airbyte_type": "integer"}, "integer"), + ({"type": "number"}, "number"), + ({"type": "array"}, "array"), + ({"type": "object"}, "object"), + ], +) +def test_to_airbyte_type(json_schema_property_def, expected_airbyte_type): + airbyte_type, _ = _get_airbyte_type(json_schema_property_def) + assert airbyte_type == expected_airbyte_type + + +@pytest.mark.parametrize( + "json_schema_property_def, expected_airbyte_type, expected_airbyte_subtype", + [ + ({"type": "string"}, "string", None), + ({"type": "number"}, "number", None), + ({"type": "array"}, "array", None), + ({"type": "object"}, "object", None), + ({"type": "array", "items": {"type": ["null", "string"]}}, "array", "string"), + ({"type": "array", "items": {"type": ["boolean"]}}, "array", "boolean"), + ], +) +def test_to_airbyte_subtype( + json_schema_property_def, + expected_airbyte_type, + expected_airbyte_subtype, +): + airbyte_type, subtype = _get_airbyte_type(json_schema_property_def) + assert airbyte_type == expected_airbyte_type + assert subtype == expected_airbyte_subtype