Revert "[Datasets] Add support for string tensor columns in ArrowTensorArray" #32123

Merged: 1 commit, Jan 31, 2023
76 changes: 0 additions & 76 deletions python/ray/air/tests/test_tensor_extension.py
@@ -49,22 +49,6 @@ def test_arrow_scalar_tensor_array_roundtrip_boolean():
     np.testing.assert_array_equal(out, arr)
 
 
-def test_arrow_scalar_tensor_array_roundtrip_string():
-    arr = np.array(
-        [
-            ["Philip", "Fry"],
-            ["Leela", "Turanga"],
-            ["Hubert", "Farnsworth"],
-            ["Lrrr", ""],
-        ]
-    )
-    ata = ArrowTensorArray.from_numpy(arr)
-    assert isinstance(ata.type, pa.DataType)
-    assert len(ata) == len(arr)
-    out = ata.to_numpy()
-    np.testing.assert_array_equal(out, arr)
-
-
 def test_scalar_tensor_array_roundtrip():
     arr = np.arange(10)
     ta = TensorArray(arr)
@@ -155,24 +139,6 @@ def test_arrow_variable_shaped_tensor_array_roundtrip_boolean():
         np.testing.assert_array_equal(o, a)
 
 
-def test_arrow_variable_shaped_tensor_array_roundtrip_string():
-    arr = np.array(
-        [
-            ["Philip", "J", "Fry"],
-            ["Leela", "Turanga"],
-            ["Professor", "Hubert", "J", "Farnsworth"],
-            ["Lrrr"],
-        ],
-        dtype=object,
-    )
-    ata = ArrowVariableShapedTensorArray.from_numpy(arr)
-    assert isinstance(ata.type, ArrowVariableShapedTensorType)
-    assert len(ata) == len(arr)
-    out = ata.to_numpy()
-    for o, a in zip(out, arr):
-        np.testing.assert_array_equal(o, a)
-
-
 def test_arrow_variable_shaped_tensor_array_roundtrip_contiguous_optimization():
     # Test that a roundtrip on slices of an already-contiguous 1D base array does not
     # create any unnecessary copies.
@@ -223,46 +189,6 @@ def test_arrow_variable_shaped_tensor_array_slice():
             np.testing.assert_array_equal(o, e)
 
 
-def test_arrow_variable_shaped_string_tensor_array_slice():
-    arr = np.array(
-        [
-            ["Philip", "J", "Fry"],
-            ["Leela", "Turanga"],
-            ["Professor", "Hubert", "J", "Farnsworth"],
-            ["Lrrr"],
-        ],
-        dtype=object,
-    )
-    ata = ArrowVariableShapedTensorArray.from_numpy(arr)
-    assert isinstance(ata.type, ArrowVariableShapedTensorType)
-    assert len(ata) == len(arr)
-    indices = [0, 1, 2]
-    for i in indices:
-        np.testing.assert_array_equal(ata[i], arr[i])
-    slices = [
-        slice(0, 1),
-        slice(1, 2),
-        slice(2, 3),
-        slice(3, 4),
-        slice(0, 2),
-        slice(1, 3),
-        slice(2, 4),
-        slice(0, 3),
-        slice(1, 4),
-        slice(0, 4),
-    ]
-    for slice_ in slices:
-        ata_slice = ata[slice_]
-        ata_slice_np = ata_slice.to_numpy()
-        arr_slice = arr[slice_]
-        # Check for equivalent dtypes and shapes.
-        assert ata_slice_np.dtype == arr_slice.dtype
-        assert ata_slice_np.shape == arr_slice.shape
-        # Iteration over tensor array slices triggers NumPy conversion.
-        for o, e in zip(ata_slice, arr_slice):
-            np.testing.assert_array_equal(o, e)
-
-
 def test_variable_shaped_tensor_array_roundtrip():
     shapes = [(2, 2), (3, 3), (4, 4)]
     cumsum_sizes = np.cumsum([0] + [np.prod(shape) for shape in shapes[:-1]])
@@ -558,8 +484,6 @@ def test_arrow_variable_shaped_tensor_array_getitem(chunked):
         ([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float32),
         ([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float16),
         ([[False, True], [True, False], [True, True], [False, False]], None),
-        ([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], None),
-        ([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], np.str_),
     ],
 )
 def test_arrow_tensor_array_slice(test_arr, dtype):
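Note: the roundtrip pattern these deleted tests followed still applies to the dtypes that remain supported. A minimal sketch with a numeric dtype, assuming the import path shown in this diff:

import numpy as np
from ray.air.util.tensor_extensions.arrow import ArrowTensorArray

# Fixed-shape numeric tensor column: the same from_numpy/to_numpy roundtrip
# exercised by the removed string tests, which continues to work post-revert.
arr = np.arange(12).reshape(4, 3)
ata = ArrowTensorArray.from_numpy(arr)
assert len(ata) == len(arr)
np.testing.assert_array_equal(ata.to_numpy(), arr)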
85 changes: 27 additions & 58 deletions python/ray/air/util/tensor_extensions/arrow.py
@@ -1,5 +1,4 @@
 import itertools
-import sys
 from typing import Iterable, Optional, Tuple, List, Sequence, Union
 
 from pkg_resources._vendor.packaging.version import parse as parse_version
@@ -310,15 +309,6 @@ def from_numpy(
             # We only natively support C-contiguous ndarrays.
             arr = np.ascontiguousarray(arr)
         pa_dtype = pa.from_numpy_dtype(arr.dtype)
-        if pa.types.is_string(pa_dtype):
-            if arr.dtype.byteorder == ">" or (
-                arr.dtype.byteorder == "=" and sys.byteorder == "big"
-            ):
-                raise ValueError(
-                    "Only little-endian string tensors are supported, but got: ",
-                    arr.dtype,
-                )
-            pa_dtype = pa.binary(arr.dtype.itemsize)
         outer_len = arr.shape[0]
         element_shape = arr.shape[1:]
         total_num_items = arr.size
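Note: to make the deleted mapping concrete, here is a standalone sketch of the dtype arithmetic it relied on (illustrative values, not part of this diff). NumPy unicode dtypes store little-endian UCS-4 code points, 4 bytes per character, so the reverted code stored each element as fixed-size binary of the full itemsize:

import numpy as np
import pyarrow as pa

arr = np.array([["ab", "cd"]])
assert arr.dtype == np.dtype("<U2")       # little-endian, 2 chars per element
assert arr.dtype.itemsize == 8            # 2 chars * 4 bytes (UCS-4)
pa_dtype = pa.binary(arr.dtype.itemsize)  # the deleted branch produced pa.binary(8)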
@@ -331,7 +321,6 @@ def from_numpy(
             # NOTE: Arrow expects LSB bit-packed ordering.
             # NOTE: This creates a copy.
             arr = np.packbits(arr, bitorder="little")
-
         data_buffer = pa.py_buffer(arr)
         data_array = pa.Array.from_buffers(
             pa_dtype, total_num_items, [None, data_buffer]
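Note: the LSB bit-packing used above can be sanity-checked in isolation; a minimal sketch (not part of the diff):

import numpy as np

bools = np.array([True, False, True, True, False, False, False, False])
packed = np.packbits(bools, bitorder="little")
assert packed[0] == 0b00001101  # the LSB holds the first boolean
# Unpacking with the same bit order recovers the original booleans.
unpacked = np.unpackbits(packed, bitorder="little").astype(bool)
np.testing.assert_array_equal(unpacked, bools)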
@@ -436,12 +425,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
             arr = np.unpackbits(arr, bitorder="little")
             # Interpret buffer as boolean array.
             return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)
-        # Special handling of binary/string types. Assumes unicode string tensor columns
-        if pa.types.is_fixed_size_binary(value_type):
-            NUM_BYTES_PER_UNICODE_CHAR = 4
-            ext_dtype = np.dtype(
-                f"<U{value_type.byte_width // NUM_BYTES_PER_UNICODE_CHAR}"
-            )
         return np.ndarray(shape, dtype=ext_dtype, buffer=data_buffer, offset=offset)
 
     def to_numpy(self, zero_copy_only: bool = True):
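Note: the deleted branch derived a NumPy unicode width from the fixed-size binary byte width. A standalone sketch of that arithmetic (illustrative, not part of the diff):

import numpy as np

# NumPy unicode ("<U") dtypes store UCS-4 code points: 4 bytes per character.
dt = np.dtype("<U5")
assert dt.itemsize == 20  # 5 characters * 4 bytes
# The removed read path inverted this: byte_width // 4 -> "<U5".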
@@ -721,14 +704,6 @@ def from_numpy(
                     f"arrays: {types_and_shapes}"
                 )
         pa_dtype = pa.from_numpy_dtype(dtype)
-        if pa.types.is_string(pa_dtype):
-            if dtype.byteorder == ">" or (
-                dtype.byteorder == "=" and sys.byteorder == "big"
-            ):
-                raise ValueError(
-                    "Only little-endian string tensors are supported, but got: ", dtype
-                )
-            pa_dtype = pa.binary(dtype.itemsize)
         if dtype.type is np.bool_:
             # NumPy doesn't represent boolean arrays as bit-packed, so we manually
             # bit-pack the booleans before handing the buffer off to Arrow.
@@ -783,8 +758,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
         data = self.storage.field("data")
         shapes = self.storage.field("shape")
         value_type = data.type.value_type
-        ext_dtype = value_type.to_pandas_dtype()
-        shape = shapes[index].as_py()
         if pa.types.is_boolean(value_type):
             # Arrow boolean array buffers are bit-packed, with 8 entries per byte,
             # and are accessed via bit offsets.
@@ -793,43 +766,39 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
             # We assume all other array types are accessed via byte array
             # offsets.
             buffer_item_width = value_type.bit_width // 8
 
+        shape = shapes[index].as_py()
         offset = data.offsets[index].as_py()
         data_offset = buffer_item_width * offset
         data_buffer = data.buffers()[3]
 
-        if pa.types.is_boolean(value_type):
-            # Special handling for boolean arrays, since Arrow bit-packs boolean arrays
-            # while NumPy does not.
-            # Cast as uint8 array and let NumPy unpack into a boolean view.
-            # Offset into uint8 array, where each element is a bucket for 8 booleans.
-            byte_bucket_offset = data_offset // 8
-            # Offset for a specific boolean, within a uint8 array element.
-            bool_offset = data_offset % 8
-            # The number of uint8 array elements (buckets) that our slice spans.
-            # Note that, due to the offset for a specific boolean, the slice can span
-            # byte boundaries even if it contains less than 8 booleans.
-            num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8)
-            # Construct the uint8 array view on the buffer.
-            arr = np.ndarray(
-                (num_boolean_byte_buckets,),
-                dtype=np.uint8,
-                buffer=data_buffer,
-                offset=byte_bucket_offset,
-            )
-            # Unpack into a byte per boolean, using LSB bit-packed ordering.
-            arr = np.unpackbits(arr, bitorder="little")
-            # Interpret buffer as boolean array.
-            return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)
-        # Special handling of binary/string types. Assumes unicode string tensor columns
-        if pa.types.is_fixed_size_binary(value_type):
-            NUM_BYTES_PER_UNICODE_CHAR = 4
-            ext_dtype = np.dtype(
-                f"<U{value_type.byte_width // NUM_BYTES_PER_UNICODE_CHAR}"
-            )
-        return np.ndarray(
-            shape, dtype=ext_dtype, buffer=data_buffer, offset=data_offset
-        )
+        if not pa.types.is_boolean(value_type):
+            return np.ndarray(
+                shape,
+                dtype=value_type.to_pandas_dtype(),
+                buffer=data_buffer,
+                offset=data_offset,
+            )
+        # Special handling for boolean arrays, since Arrow bit-packs boolean arrays
+        # while NumPy does not.
+        # Cast as uint8 array and let NumPy unpack into a boolean view.
+        # Offset into uint8 array, where each element is a bucket for 8 booleans.
+        byte_bucket_offset = data_offset // 8
+        # Offset for a specific boolean, within a uint8 array element.
+        bool_offset = data_offset % 8
+        # The number of uint8 array elements (buckets) that our slice spans.
+        # Note that, due to the offset for a specific boolean, the slice can span byte
+        # boundaries even if it contains less than 8 booleans.
+        num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8)
+        # Construct the uint8 array view on the buffer.
+        arr = np.ndarray(
+            (num_boolean_byte_buckets,),
+            dtype=np.uint8,
+            buffer=data_buffer,
+            offset=byte_bucket_offset,
+        )
+        # Unpack into a byte per boolean, using LSB bit-packed ordering.
+        arr = np.unpackbits(arr, bitorder="little")
+        # Interpret buffer as boolean array.
+        return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)
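Note: a worked example of the boolean offset arithmetic above, with illustrative values (not taken from the diff):

# Suppose the flattened slice starts at boolean number 10 in the buffer.
data_offset = 10                       # offset measured in booleans (bits)
byte_bucket_offset = data_offset // 8  # 1: skip one full uint8 bucket
bool_offset = data_offset % 8          # 2: bit position within that bucket
num_elements = 5                       # np.prod(shape) for the slice
num_boolean_byte_buckets = 1 + ((bool_offset + num_elements - 1) // 8)
assert num_boolean_byte_buckets == 1   # bits 2..6 fit in a single byte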

def to_numpy(self, zero_copy_only: bool = True):
"""