Skip to content

Commit

Permalink
Revert "[Datasets] Add support for string tensor columns in `ArrowTensorArray` and `ArrowVariableShapedTensorArray` (ray-project#31817)" (ray-project#32123)
Browse files Browse the repository at this point in the history

This reverts commit 1fdf24e.

Signed-off-by: Edward Oakes <ed.nmi.oakes@gmail.com>
  • Loading branch information
scottjlee authored and edoakes committed Mar 22, 2023
1 parent dce8180 commit 58f4e9f
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 134 deletions.
76 changes: 0 additions & 76 deletions python/ray/air/tests/test_tensor_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,6 @@ def test_arrow_scalar_tensor_array_roundtrip_boolean():
np.testing.assert_array_equal(out, arr)


def test_arrow_scalar_tensor_array_roundtrip_string():
    """A fixed-shape string ndarray survives an ArrowTensorArray round-trip."""
    names = np.array(
        [
            ["Philip", "Fry"],
            ["Leela", "Turanga"],
            ["Hubert", "Farnsworth"],
            ["Lrrr", ""],
        ]
    )
    tensor_arr = ArrowTensorArray.from_numpy(names)
    # The resulting Arrow type must be a valid pyarrow DataType.
    assert isinstance(tensor_arr.type, pa.DataType)
    assert len(tensor_arr) == len(names)
    roundtripped = tensor_arr.to_numpy()
    np.testing.assert_array_equal(roundtripped, names)


def test_scalar_tensor_array_roundtrip():
arr = np.arange(10)
ta = TensorArray(arr)
Expand Down Expand Up @@ -155,24 +139,6 @@ def test_arrow_variable_shaped_tensor_array_roundtrip_boolean():
np.testing.assert_array_equal(o, a)


def test_arrow_variable_shaped_tensor_array_roundtrip_string():
    """A ragged string ndarray survives an ArrowVariableShapedTensorArray round-trip."""
    ragged_names = np.array(
        [
            ["Philip", "J", "Fry"],
            ["Leela", "Turanga"],
            ["Professor", "Hubert", "J", "Farnsworth"],
            ["Lrrr"],
        ],
        dtype=object,
    )
    tensor_arr = ArrowVariableShapedTensorArray.from_numpy(ragged_names)
    # Variable-shaped input must yield the variable-shaped extension type.
    assert isinstance(tensor_arr.type, ArrowVariableShapedTensorType)
    assert len(tensor_arr) == len(ragged_names)
    roundtripped = tensor_arr.to_numpy()
    # Rows are object-dtype subarrays, so compare element-wise.
    for actual, expected in zip(roundtripped, ragged_names):
        np.testing.assert_array_equal(actual, expected)


def test_arrow_variable_shaped_tensor_array_roundtrip_contiguous_optimization():
# Test that a roundtrip on slices of an already-contiguous 1D base array does not
# create any unnecessary copies.
Expand Down Expand Up @@ -223,46 +189,6 @@ def test_arrow_variable_shaped_tensor_array_slice():
np.testing.assert_array_equal(o, e)


def test_arrow_variable_shaped_string_tensor_array_slice():
    """Indexing and slicing an ArrowVariableShapedTensorArray of strings
    matches indexing/slicing the source NumPy object array.

    Fix: the original only checked indices [0, 1, 2], silently skipping the
    last row (index 3); we now cover every index via range(len(arr)).
    """
    arr = np.array(
        [
            ["Philip", "J", "Fry"],
            ["Leela", "Turanga"],
            ["Professor", "Hubert", "J", "Farnsworth"],
            ["Lrrr"],
        ],
        dtype=object,
    )
    ata = ArrowVariableShapedTensorArray.from_numpy(arr)
    assert isinstance(ata.type, ArrowVariableShapedTensorType)
    assert len(ata) == len(arr)
    # Per-element access: every row, including the last one.
    for i in range(len(arr)):
        np.testing.assert_array_equal(ata[i], arr[i])
    # Every contiguous slice of the 4-element array.
    slices = [
        slice(0, 1),
        slice(1, 2),
        slice(2, 3),
        slice(3, 4),
        slice(0, 2),
        slice(1, 3),
        slice(2, 4),
        slice(0, 3),
        slice(1, 4),
        slice(0, 4),
    ]
    for slice_ in slices:
        ata_slice = ata[slice_]
        ata_slice_np = ata_slice.to_numpy()
        arr_slice = arr[slice_]
        # Check for equivalent dtypes and shapes.
        assert ata_slice_np.dtype == arr_slice.dtype
        assert ata_slice_np.shape == arr_slice.shape
        # Iteration over tensor array slices triggers NumPy conversion.
        for o, e in zip(ata_slice, arr_slice):
            np.testing.assert_array_equal(o, e)


def test_variable_shaped_tensor_array_roundtrip():
shapes = [(2, 2), (3, 3), (4, 4)]
cumsum_sizes = np.cumsum([0] + [np.prod(shape) for shape in shapes[:-1]])
Expand Down Expand Up @@ -558,8 +484,6 @@ def test_arrow_variable_shaped_tensor_array_getitem(chunked):
([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float32),
([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float16),
([[False, True], [True, False], [True, True], [False, False]], None),
([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], None),
([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], np.str_),
],
)
def test_arrow_tensor_array_slice(test_arr, dtype):
Expand Down
85 changes: 27 additions & 58 deletions python/ray/air/util/tensor_extensions/arrow.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import itertools
import sys
from typing import Iterable, Optional, Tuple, List, Sequence, Union

from pkg_resources._vendor.packaging.version import parse as parse_version
Expand Down Expand Up @@ -310,15 +309,6 @@ def from_numpy(
# We only natively support C-contiguous ndarrays.
arr = np.ascontiguousarray(arr)
pa_dtype = pa.from_numpy_dtype(arr.dtype)
if pa.types.is_string(pa_dtype):
if arr.dtype.byteorder == ">" or (
arr.dtype.byteorder == "=" and sys.byteorder == "big"
):
raise ValueError(
"Only little-endian string tensors are supported, but got: ",
arr.dtype,
)
pa_dtype = pa.binary(arr.dtype.itemsize)
outer_len = arr.shape[0]
element_shape = arr.shape[1:]
total_num_items = arr.size
Expand All @@ -331,7 +321,6 @@ def from_numpy(
# NOTE: Arrow expects LSB bit-packed ordering.
# NOTE: This creates a copy.
arr = np.packbits(arr, bitorder="little")

data_buffer = pa.py_buffer(arr)
data_array = pa.Array.from_buffers(
pa_dtype, total_num_items, [None, data_buffer]
Expand Down Expand Up @@ -436,12 +425,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
arr = np.unpackbits(arr, bitorder="little")
# Interpret buffer as boolean array.
return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)
# Special handling of binary/string types. Assumes unicode string tensor columns
if pa.types.is_fixed_size_binary(value_type):
NUM_BYTES_PER_UNICODE_CHAR = 4
ext_dtype = np.dtype(
f"<U{value_type.byte_width // NUM_BYTES_PER_UNICODE_CHAR}"
)
return np.ndarray(shape, dtype=ext_dtype, buffer=data_buffer, offset=offset)

def to_numpy(self, zero_copy_only: bool = True):
Expand Down Expand Up @@ -721,14 +704,6 @@ def from_numpy(
f"arrays: {types_and_shapes}"
)
pa_dtype = pa.from_numpy_dtype(dtype)
if pa.types.is_string(pa_dtype):
if dtype.byteorder == ">" or (
dtype.byteorder == "=" and sys.byteorder == "big"
):
raise ValueError(
"Only little-endian string tensors are supported, but got: ", dtype
)
pa_dtype = pa.binary(dtype.itemsize)
if dtype.type is np.bool_:
# NumPy doesn't represent boolean arrays as bit-packed, so we manually
# bit-pack the booleans before handing the buffer off to Arrow.
Expand Down Expand Up @@ -783,8 +758,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
data = self.storage.field("data")
shapes = self.storage.field("shape")
value_type = data.type.value_type
ext_dtype = value_type.to_pandas_dtype()
shape = shapes[index].as_py()
if pa.types.is_boolean(value_type):
# Arrow boolean array buffers are bit-packed, with 8 entries per byte,
# and are accessed via bit offsets.
Expand All @@ -793,43 +766,39 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
# We assume all other array types are accessed via byte array
# offsets.
buffer_item_width = value_type.bit_width // 8

shape = shapes[index].as_py()
offset = data.offsets[index].as_py()
data_offset = buffer_item_width * offset
data_buffer = data.buffers()[3]

if pa.types.is_boolean(value_type):
# Special handling for boolean arrays, since Arrow bit-packs boolean arrays
# while NumPy does not.
# Cast as uint8 array and let NumPy unpack into a boolean view.
# Offset into uint8 array, where each element is a bucket for 8 booleans.
byte_bucket_offset = data_offset // 8
# Offset for a specific boolean, within a uint8 array element.
bool_offset = data_offset % 8
# The number of uint8 array elements (buckets) that our slice spans.
# Note that, due to the offset for a specific boolean, the slice can span
# byte boundaries even if it contains less than 8 booleans.
num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8)
# Construct the uint8 array view on the buffer.
arr = np.ndarray(
(num_boolean_byte_buckets,),
dtype=np.uint8,
if not pa.types.is_boolean(value_type):
return np.ndarray(
shape,
dtype=value_type.to_pandas_dtype(),
buffer=data_buffer,
offset=byte_bucket_offset,
)
# Unpack into a byte per boolean, using LSB bit-packed ordering.
arr = np.unpackbits(arr, bitorder="little")
# Interpret buffer as boolean array.
return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)
# Special handling of binary/string types. Assumes unicode string tensor columns
if pa.types.is_fixed_size_binary(value_type):
NUM_BYTES_PER_UNICODE_CHAR = 4
ext_dtype = np.dtype(
f"<U{value_type.byte_width // NUM_BYTES_PER_UNICODE_CHAR}"
offset=data_offset,
)
return np.ndarray(
shape, dtype=ext_dtype, buffer=data_buffer, offset=data_offset
# Special handling for boolean arrays, since Arrow bit-packs boolean arrays
# while NumPy does not.
# Cast as uint8 array and let NumPy unpack into a boolean view.
# Offset into uint8 array, where each element is a bucket for 8 booleans.
byte_bucket_offset = data_offset // 8
# Offset for a specific boolean, within a uint8 array element.
bool_offset = data_offset % 8
# The number of uint8 array elements (buckets) that our slice spans.
# Note that, due to the offset for a specific boolean, the slice can span byte
# boundaries even if it contains less than 8 booleans.
num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8)
# Construct the uint8 array view on the buffer.
arr = np.ndarray(
(num_boolean_byte_buckets,),
dtype=np.uint8,
buffer=data_buffer,
offset=byte_bucket_offset,
)
# Unpack into a byte per boolean, using LSB bit-packed ordering.
arr = np.unpackbits(arr, bitorder="little")
# Interpret buffer as boolean array.
return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)

def to_numpy(self, zero_copy_only: bool = True):
"""
Expand Down

0 comments on commit 58f4e9f

Please sign in to comment.