Skip to content

Commit

Permalink
feat(python): add schema conversion of FixedSizeBinaryArray and Fixed…
Browse files Browse the repository at this point in the history
…SizeListType (#2005)

# Description
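Map `FixedSizeBinaryType` to the variable-length binary type (`pa.binary()`),
and route `FixedSizeListType` through the regular list conversion, since Delta
does not support fixed-size arrays.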

# Related Issue(s)
None.

# Documentation
N/A

# Minimal Example
I noticed this error when making consecutive calls to `write_deltalake` on the same table, like so:
```
import deltalake as dl
import pyarrow as pa

schema = pa.schema([
    ("field_a", pa.binary(4)),
    # To simulate fix, switch this line to: ("field_a", pa.binary()),
])
table = pa.Table.from_pylist(
    [
        {"field_a": val.to_bytes(4, "little")}
        for val in range(0, 100)
    ],
    schema=schema
)

# This works
dl.write_deltalake(
    "bad_table",
    data=table,
    mode="append",
)

# This fails
dl.write_deltalake(
    "bad_table",
    data=table,
    mode="append",
)
```
with error:
```
ValueError: Schema of data does not match table schema
Data schema:
field_a: fixed_size_binary[4]
Table Schema:
field_a: binary
```

---------

Co-authored-by: Jakub Filipek <jakub@overland.ai>
Co-authored-by: Jakub Filipek <jfilipek@proton.me>
  • Loading branch information
3 people authored Jan 7, 2024
1 parent 25040b8 commit a86cf66
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 1 deletion.
4 changes: 3 additions & 1 deletion python/deltalake/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,16 @@ def _convert_pa_schema_to_delta(

def dtype_to_delta_dtype(dtype: pa.DataType) -> pa.DataType:
# Handle nested types
if isinstance(dtype, (pa.LargeListType, pa.ListType)):
if isinstance(dtype, (pa.LargeListType, pa.ListType, pa.FixedSizeListType)):
return list_to_delta_dtype(dtype)
elif isinstance(dtype, pa.StructType):
return struct_to_delta_dtype(dtype)
elif isinstance(dtype, pa.TimestampType):
return pa.timestamp(
"us"
) # TODO(ion): propagate also timezone information during write once we can properly read TZ in delta schema
elif type(dtype) is pa.FixedSizeBinaryType:
return pa.binary()
try:
return dtype_map[dtype]
except KeyError:
Expand Down
2 changes: 2 additions & 0 deletions python/stubs/pyarrow/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ DataType: Any
ListType: Any
StructType: Any
MapType: Any
FixedSizeListType: Any
FixedSizeBinaryType: Any
schema: Any
map_: Any
list_: Any
Expand Down
6 changes: 6 additions & 0 deletions python/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,16 @@ def test_delta_schema():
[
pa.field("some_int", pa.uint32(), nullable=True),
pa.field("some_string", pa.string(), nullable=False),
pa.field("some_fixed_binary", pa.binary(5), nullable=False),
pa.field("some_decimal", pa.decimal128(10, 2), nullable=False),
]
),
pa.schema(
[
pa.field("some_int", pa.int32(), nullable=True),
pa.field("some_string", pa.string(), nullable=False),
pa.field("some_fixed_binary", pa.binary(), nullable=False),
pa.field("some_decimal", pa.decimal128(10, 2), nullable=False),
]
),
False,
Expand Down Expand Up @@ -293,13 +297,15 @@ def test_delta_schema():
pa.schema(
[
("some_list", pa.list_(pa.string())),
("some_fixed_list_int", pa.list_(pa.uint32(), 5)),
("some_list_binary", pa.list_(pa.binary())),
("some_string", pa.large_string()),
]
),
pa.schema(
[
("some_list", pa.large_list(pa.large_string())),
("some_fixed_list_int", pa.large_list(pa.int32())),
("some_list_binary", pa.large_list(pa.large_binary())),
("some_string", pa.large_string()),
]
Expand Down

0 comments on commit a86cf66

Please sign in to comment.