[Data] Don't convert variable-length byte data to Numpy array (#35638) (

#35703) Closes #35586 See #35586 (comment) Numpy treats variable length byte data as zero-terminated bytes. So if there are zero bytes encoded into the bytestring itself, those will be discarded. Instead, per recommendation in apache/arrow#26470, it seems that variable length bytes should be treated as python objects. --------- Signed-off-by: amogkam <amogkamsetty@yahoo.com>
ray-project · May 24, 2023 · 9c60c79 · 9c60c79
1 parent b931fc7
commit 9c60c79
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 0 deletions.
diff --git a/python/ray/data/_internal/numpy_support.py b/python/ray/data/_internal/numpy_support.py
@@ -75,6 +75,14 @@ def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any:
             for e in udf_return_col:
                 if isinstance(e, np.ndarray):
                     shapes.add((e.dtype, e.shape))
+                elif isinstance(e, bytes):
+                    # Don't convert variable length binary data to Numpy arrays as it
+                    # treats zero encoding as termination by default.
+                    # Per recommendation from
+                    # https://github.com/apache/arrow/issues/26470,
+                    # we use object dtype.
+                    # https://github.com/ray-project/ray/issues/35586#issuecomment-1558148261
+                    has_object = True
                 elif not np.isscalar(e):
                     has_object = True
             if has_object or len(shapes) > 1:

diff --git a/python/ray/data/tests/test_numpy_support.py b/python/ray/data/tests/test_numpy_support.py
@@ -110,6 +110,15 @@ def test_scalar_arrays(ray_start_regular_shared):
     assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.int64))
 
 
+def test_bytes(ray_start_regular_shared):
+    """Tests that bytes are converted to object dtype instead of zero-terminated."""
+    data = b"\x1a\n\x00\n\x1a"
+    ds = ray.data.range(1, parallelism=1)
+    ds = ds.map(lambda x: {"output": data})
+    output = ds.take_batch()["output"]
+    assert_structure_equals(output, np.array([b"\x1a\n\x00\n\x1a"], dtype=object))
+
+
 def test_scalar_array_like(ray_start_regular_shared):
     data = torch.Tensor([1, 2, 3])
     ds = ray.data.range(2, parallelism=1)