Skip to content

Commit

Permalink
[Data] Don't convert variable-length byte data to Numpy array (#35638) (
Browse files Browse the repository at this point in the history
#35703)

Closes #35586
See #35586 (comment)

Numpy treats variable length byte data as zero-terminated bytes. So if there are zero bytes encoded into the bytestring itself, those will be discarded.

Instead, per recommendation in apache/arrow#26470, it seems that variable length bytes should be treated as python objects.

---------

Signed-off-by: amogkam <amogkamsetty@yahoo.com>
  • Loading branch information
amogkam authored May 24, 2023
1 parent b931fc7 commit 9c60c79
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
8 changes: 8 additions & 0 deletions python/ray/data/_internal/numpy_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any:
for e in udf_return_col:
if isinstance(e, np.ndarray):
shapes.add((e.dtype, e.shape))
elif isinstance(e, bytes):
# Don't convert variable length binary data to Numpy arrays as it
# treats zero encoding as termination by default.
# Per recommendation from
# https://github.com/apache/arrow/issues/26470,
# we use object dtype.
# https://github.com/ray-project/ray/issues/35586#issuecomment-1558148261
has_object = True
elif not np.isscalar(e):
has_object = True
if has_object or len(shapes) > 1:
Expand Down
9 changes: 9 additions & 0 deletions python/ray/data/tests/test_numpy_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,15 @@ def test_scalar_arrays(ray_start_regular_shared):
assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.int64))


def test_bytes(ray_start_regular_shared):
"""Tests that bytes are converted to object dtype instead of zero-terminated."""
data = b"\x1a\n\x00\n\x1a"
ds = ray.data.range(1, parallelism=1)
ds = ds.map(lambda x: {"output": data})
output = ds.take_batch()["output"]
assert_structure_equals(output, np.array([b"\x1a\n\x00\n\x1a"], dtype=object))


def test_scalar_array_like(ray_start_regular_shared):
data = torch.Tensor([1, 2, 3])
ds = ray.data.range(2, parallelism=1)
Expand Down

0 comments on commit 9c60c79

Please sign in to comment.