diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/avro_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/avro_format.py index a3c361d74150..6f9a4cf2b2d3 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/avro_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/avro_format.py @@ -10,8 +10,8 @@ class AvroFormat(BaseModel): filetype: Literal["avro"] = "avro" # This option is not recommended, but necessary for backwards compatibility - decimal_as_float: bool = Field( - title="Convert Decimal Fields to Floats", - description="Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.", - default=False, + double_as_string: bool = Field( + title="Convert Double Fields to Strings", + description="Whether to convert double fields to strings. There is a loss of precision when converting decimals to floats, so this is recommended.", + default=True, ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py index 91e1cf4eb08b..f9f0faf3c5e8 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -66,7 +66,7 @@ async def infer_schema( def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]: if isinstance(avro_field, str) and avro_field in AVRO_TYPE_TO_JSON_TYPE: # Legacy behavior to retain backwards compatibility. Long term we should always represent doubles as strings - if avro_field == "double" and avro_format.decimal_as_float: + if avro_field == "double" and not avro_format.double_as_string: return {"type": "number"} return {"type": AVRO_TYPE_TO_JSON_TYPE[avro_field]} if isinstance(avro_field, Mapping): @@ -152,10 +152,18 @@ def file_read_mode(self) -> FileReadMode: @staticmethod def _to_output_value(avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any) -> Any: if not isinstance(record_type, Mapping): - if record_type == "double" and not avro_format.decimal_as_float: + if record_type == "double" and avro_format.double_as_string: return str(record_value) return record_value if record_type.get("logicalType") == "uuid": return uuid.UUID(bytes=record_value) + elif record_type.get("logicalType") == "decimal": + return str(record_value) + elif record_type.get("logicalType") == "date": + return record_value.isoformat() + elif record_type.get("logicalType") == "local-timestamp-millis": + return record_value.isoformat(sep="T", timespec="milliseconds") + elif record_type.get("logicalType") == "local-timestamp-micros": + return record_value.isoformat(sep="T", timespec="microseconds") else: return record_value diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_avro_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_avro_parser.py index 40684985abec..d72cd77d888c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_avro_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_avro_parser.py @@ -2,12 +2,15 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # +import datetime +import uuid + import pytest from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat from airbyte_cdk.sources.file_based.file_types import AvroParser _default_avro_format = AvroFormat() -_decimal_as_float_avro_format = AvroFormat(decimal_as_float=True) +_double_as_numer_avro_format = AvroFormat(double_as_string=False) @pytest.mark.parametrize( @@ -20,7 +23,7 @@ pytest.param(_default_avro_format, "long", {"type": "integer"}, None, id="test_long"), pytest.param(_default_avro_format, "float", {"type": "number"}, None, id="test_float"), pytest.param(_default_avro_format, "double", {"type": "string"}, None, id="test_double"), - pytest.param(_decimal_as_float_avro_format, "double", {"type": "number"}, None, id="test_double_as_float"), + pytest.param(_double_as_numer_avro_format, "double", {"type": "number"}, None, id="test_double_as_float"), pytest.param(_default_avro_format, "bytes", {"type": "string"}, None, id="test_bytes"), pytest.param(_default_avro_format, "string", {"type": "string"}, None, id="test_string"), pytest.param(_default_avro_format, "void", None, ValueError, id="test_invalid_type"), @@ -168,3 +171,39 @@ def test_convert_primitive_avro_type_to_json(avro_format, avro_type, expected_js else: actual_json_type = AvroParser._convert_avro_type_to_json(avro_format, "field_name", avro_type) assert actual_json_type == expected_json_type + + +@pytest.mark.parametrize( + "avro_format, record_type, record_value, expected_value", [ + pytest.param(_double_as_numer_avro_format, "boolean", True, True, id="test_boolean"), + pytest.param(_double_as_numer_avro_format, "int", 123, 123, id="test_int"), + pytest.param(_double_as_numer_avro_format, "long", 123, 123, id="test_long"), + pytest.param(_double_as_numer_avro_format, "float", 123.456, 123.456, id="test_float"), + pytest.param(_default_avro_format, "double", 123.456, "123.456", id="test_double_default_config"), + pytest.param(_double_as_numer_avro_format, "double", 123.456, 123.456, id="test_double_as_number"), + pytest.param(_double_as_numer_avro_format, "bytes", b"hello world", b"hello world", id="test_bytes"), + pytest.param(_double_as_numer_avro_format, "string", "hello world", "hello world", id="test_string"), + pytest.param(_double_as_numer_avro_format, {"logicalType": "decimal"}, 3.1415, "3.1415", id="test_decimal"), + pytest.param(_double_as_numer_avro_format, {"logicalType": "uuid"}, b"abcdefghijklmnop", uuid.UUID(bytes=b"abcdefghijklmnop"), id="test_uuid"), + pytest.param(_double_as_numer_avro_format, + {"logicalType": "date"}, + datetime.date(2023, 8, 7), + "2023-08-07", + id="test_date"), + pytest.param(_double_as_numer_avro_format, {"logicalType": "time-millis"}, 70267068, 70267068, id="test_time_millis"), + pytest.param(_double_as_numer_avro_format, {"logicalType": "time-micros"}, 70267068, 70267068, id="test_time_micros"), + pytest.param(_double_as_numer_avro_format, + {"logicalType": "local-timestamp-millis"}, + datetime.datetime(2023, 8, 7, 19, 31, 7, 68000, tzinfo=datetime.timezone.utc), + "2023-08-07T19:31:07.068+00:00", + id="test_timestamp_millis"), + pytest.param(_double_as_numer_avro_format, + {"logicalType": "local-timestamp-micros"}, + datetime.datetime(2023, 8, 7, 19, 31, 7, 68000, tzinfo=datetime.timezone.utc), + "2023-08-07T19:31:07.068000+00:00", + id="test_timestamo_micros"), + ] +) +def test_to_output_value(avro_format, record_type, record_value, expected_value): + parser = AvroParser() + assert parser._to_output_value(avro_format, record_type, record_value) == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py index a0602bc7bc60..02d4cfb93b73 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py @@ -398,7 +398,7 @@ "drummer": "George Daniel", }, "col_fixed": "\x12\x34\x56\x78", - "col_decimal": 1234.56789, + "col_decimal": "1234.56789", "col_uuid": "123e4567-e89b-12d3-a456-426655440000", "col_date": "2022-05-29", "col_time_millis": "06:00:00.456000", @@ -623,9 +623,9 @@ ) ).build() -avro_file_with_decimal_as_float_scenario = ( +avro_file_with_double_as_number_scenario = ( TestScenarioBuilder() - .set_name("avro_file_with_decimal_as_float_stream") + .set_name("avro_file_with_double_as_number_stream") .set_config( { "streams": [ @@ -637,7 +637,7 @@ "format": { "avro": { "filetype": "avro", - "decimal_as_float": True + "double_as_string": False } } } diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index c5714cc570b3..8d5d58fc3290 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -128,10 +128,10 @@ ], "type": "string" }, - "decimal_as_float": { - "title": "Convert Decimal Fields to Floats", - "description": "Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.", - "default": False, + "double_as_string": { + "title": "Convert Double Fields to Strings", + "description": "Whether to convert double fields to strings. There is a loss of precision when converting decimals to floats, so this is recommended.", + "default": True, "type": "boolean" } } diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py index f6116e482b3c..7ac5a915151c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -16,7 +16,7 @@ from pytest import LogCaptureFixture from unit_tests.sources.file_based.scenarios.avro_scenarios import ( avro_all_types_scenario, - avro_file_with_decimal_as_float_scenario, + avro_file_with_double_as_number_scenario, multiple_avro_combine_schema_scenario, multiple_streams_avro_scenario, single_avro_scenario, @@ -194,7 +194,7 @@ avro_all_types_scenario, multiple_avro_combine_schema_scenario, multiple_streams_avro_scenario, - avro_file_with_decimal_as_float_scenario, + avro_file_with_double_as_number_scenario, csv_newline_in_values_not_quoted_scenario, csv_autogenerate_column_names_scenario, ]