From 58f4e9fa9bf444118c5791f2d695e885c7fe9be6 Mon Sep 17 00:00:00 2001 From: Scott Lee Date: Tue, 31 Jan 2023 11:54:27 -0800 Subject: [PATCH] Revert "[Datasets] Add support for string tensor columns in `ArrowTensorArray` and `ArrowVariableShapedTensorArray` (#31817)" (#32123) This reverts commit 1fdf24eb81173cdea037b07ebda37f1e66f0ca04. Signed-off-by: Edward Oakes --- python/ray/air/tests/test_tensor_extension.py | 76 ----------------- .../ray/air/util/tensor_extensions/arrow.py | 85 ++++++------------- 2 files changed, 27 insertions(+), 134 deletions(-) diff --git a/python/ray/air/tests/test_tensor_extension.py b/python/ray/air/tests/test_tensor_extension.py index 935f2680c621..811116a82105 100644 --- a/python/ray/air/tests/test_tensor_extension.py +++ b/python/ray/air/tests/test_tensor_extension.py @@ -49,22 +49,6 @@ def test_arrow_scalar_tensor_array_roundtrip_boolean(): np.testing.assert_array_equal(out, arr) -def test_arrow_scalar_tensor_array_roundtrip_string(): - arr = np.array( - [ - ["Philip", "Fry"], - ["Leela", "Turanga"], - ["Hubert", "Farnsworth"], - ["Lrrr", ""], - ] - ) - ata = ArrowTensorArray.from_numpy(arr) - assert isinstance(ata.type, pa.DataType) - assert len(ata) == len(arr) - out = ata.to_numpy() - np.testing.assert_array_equal(out, arr) - - def test_scalar_tensor_array_roundtrip(): arr = np.arange(10) ta = TensorArray(arr) @@ -155,24 +139,6 @@ def test_arrow_variable_shaped_tensor_array_roundtrip_boolean(): np.testing.assert_array_equal(o, a) -def test_arrow_variable_shaped_tensor_array_roundtrip_string(): - arr = np.array( - [ - ["Philip", "J", "Fry"], - ["Leela", "Turanga"], - ["Professor", "Hubert", "J", "Farnsworth"], - ["Lrrr"], - ], - dtype=object, - ) - ata = ArrowVariableShapedTensorArray.from_numpy(arr) - assert isinstance(ata.type, ArrowVariableShapedTensorType) - assert len(ata) == len(arr) - out = ata.to_numpy() - for o, a in zip(out, arr): - np.testing.assert_array_equal(o, a) - - def test_arrow_variable_shaped_tensor_array_roundtrip_contiguous_optimization(): # Test that a roundtrip on slices of an already-contiguous 1D base array does not # create any unnecessary copies. @@ -223,46 +189,6 @@ def test_arrow_variable_shaped_tensor_array_slice(): np.testing.assert_array_equal(o, e) -def test_arrow_variable_shaped_string_tensor_array_slice(): - arr = np.array( - [ - ["Philip", "J", "Fry"], - ["Leela", "Turanga"], - ["Professor", "Hubert", "J", "Farnsworth"], - ["Lrrr"], - ], - dtype=object, - ) - ata = ArrowVariableShapedTensorArray.from_numpy(arr) - assert isinstance(ata.type, ArrowVariableShapedTensorType) - assert len(ata) == len(arr) - indices = [0, 1, 2] - for i in indices: - np.testing.assert_array_equal(ata[i], arr[i]) - slices = [ - slice(0, 1), - slice(1, 2), - slice(2, 3), - slice(3, 4), - slice(0, 2), - slice(1, 3), - slice(2, 4), - slice(0, 3), - slice(1, 4), - slice(0, 4), - ] - for slice_ in slices: - ata_slice = ata[slice_] - ata_slice_np = ata_slice.to_numpy() - arr_slice = arr[slice_] - # Check for equivalent dtypes and shapes. - assert ata_slice_np.dtype == arr_slice.dtype - assert ata_slice_np.shape == arr_slice.shape - # Iteration over tensor array slices triggers NumPy conversion. - for o, e in zip(ata_slice, arr_slice): - np.testing.assert_array_equal(o, e) - - def test_variable_shaped_tensor_array_roundtrip(): shapes = [(2, 2), (3, 3), (4, 4)] cumsum_sizes = np.cumsum([0] + [np.prod(shape) for shape in shapes[:-1]]) @@ -558,8 +484,6 @@ def test_arrow_variable_shaped_tensor_array_getitem(chunked): ([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float32), ([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float16), ([[False, True], [True, False], [True, True], [False, False]], None), - ([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], None), - ([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], np.str_), ], ) def test_arrow_tensor_array_slice(test_arr, dtype): diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/air/util/tensor_extensions/arrow.py index b6d2681de031..7412e2d30c23 100644 --- a/python/ray/air/util/tensor_extensions/arrow.py +++ b/python/ray/air/util/tensor_extensions/arrow.py @@ -1,5 +1,4 @@ import itertools -import sys from typing import Iterable, Optional, Tuple, List, Sequence, Union from pkg_resources._vendor.packaging.version import parse as parse_version @@ -310,15 +309,6 @@ def from_numpy( # We only natively support C-contiguous ndarrays. arr = np.ascontiguousarray(arr) pa_dtype = pa.from_numpy_dtype(arr.dtype) - if pa.types.is_string(pa_dtype): - if arr.dtype.byteorder == ">" or ( - arr.dtype.byteorder == "=" and sys.byteorder == "big" - ): - raise ValueError( - "Only little-endian string tensors are supported, but got: ", - arr.dtype, - ) - pa_dtype = pa.binary(arr.dtype.itemsize) outer_len = arr.shape[0] element_shape = arr.shape[1:] total_num_items = arr.size @@ -331,7 +321,6 @@ def from_numpy( # NOTE: Arrow expects LSB bit-packed ordering. # NOTE: This creates a copy. arr = np.packbits(arr, bitorder="little") - data_buffer = pa.py_buffer(arr) data_array = pa.Array.from_buffers( pa_dtype, total_num_items, [None, data_buffer] @@ -436,12 +425,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False): arr = np.unpackbits(arr, bitorder="little") # Interpret buffer as boolean array. return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset) - # Special handling of binary/string types. Assumes unicode string tensor columns - if pa.types.is_fixed_size_binary(value_type): - NUM_BYTES_PER_UNICODE_CHAR = 4 - ext_dtype = np.dtype( - f"" or ( - dtype.byteorder == "=" and sys.byteorder == "big" - ): - raise ValueError( - "Only little-endian string tensors are supported, but got: ", dtype - ) - pa_dtype = pa.binary(dtype.itemsize) if dtype.type is np.bool_: # NumPy doesn't represent boolean arrays as bit-packed, so we manually # bit-pack the booleans before handing the buffer off to Arrow. @@ -783,8 +758,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False): data = self.storage.field("data") shapes = self.storage.field("shape") value_type = data.type.value_type - ext_dtype = value_type.to_pandas_dtype() - shape = shapes[index].as_py() if pa.types.is_boolean(value_type): # Arrow boolean array buffers are bit-packed, with 8 entries per byte, # and are accessed via bit offsets. @@ -793,43 +766,39 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False): # We assume all other array types are accessed via byte array # offsets. buffer_item_width = value_type.bit_width // 8 - + shape = shapes[index].as_py() offset = data.offsets[index].as_py() data_offset = buffer_item_width * offset data_buffer = data.buffers()[3] - - if pa.types.is_boolean(value_type): - # Special handling for boolean arrays, since Arrow bit-packs boolean arrays - # while NumPy does not. - # Cast as uint8 array and let NumPy unpack into a boolean view. - # Offset into uint8 array, where each element is a bucket for 8 booleans. - byte_bucket_offset = data_offset // 8 - # Offset for a specific boolean, within a uint8 array element. - bool_offset = data_offset % 8 - # The number of uint8 array elements (buckets) that our slice spans. - # Note that, due to the offset for a specific boolean, the slice can span - # byte boundaries even if it contains less than 8 booleans. - num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8) - # Construct the uint8 array view on the buffer. - arr = np.ndarray( - (num_boolean_byte_buckets,), - dtype=np.uint8, + if not pa.types.is_boolean(value_type): + return np.ndarray( + shape, + dtype=value_type.to_pandas_dtype(), buffer=data_buffer, - offset=byte_bucket_offset, - ) - # Unpack into a byte per boolean, using LSB bit-packed ordering. - arr = np.unpackbits(arr, bitorder="little") - # Interpret buffer as boolean array. - return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset) - # Special handling of binary/string types. Assumes unicode string tensor columns - if pa.types.is_fixed_size_binary(value_type): - NUM_BYTES_PER_UNICODE_CHAR = 4 - ext_dtype = np.dtype( - f"