Skip to content

Commit

Permalink
Revert "[Datasets] Add support for string tensor columns in `ArrowTensorArray` and `ArrowVariableShapedTensorArray` (ray-project#31817)" (ray-project#32123)
Browse files Browse the repository at this point in the history

This reverts commit 1fdf24e.

Signed-off-by: Edward Oakes <ed.nmi.oakes@gmail.com>
  • Loading branch information
scottjlee authored and edoakes committed Mar 22, 2023
1 parent dce8180 commit 58f4e9f
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 134 deletions.
76 changes: 0 additions & 76 deletions python/ray/air/tests/test_tensor_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,6 @@ def test_arrow_scalar_tensor_array_roundtrip_boolean():
np.testing.assert_array_equal(out, arr)


def test_arrow_scalar_tensor_array_roundtrip_string():
    """A fixed-shape string ndarray survives an ArrowTensorArray round-trip."""
    names = np.array(
        [
            ["Philip", "Fry"],
            ["Leela", "Turanga"],
            ["Hubert", "Farnsworth"],
            ["Lrrr", ""],
        ]
    )
    tensor_arr = ArrowTensorArray.from_numpy(names)
    # The resulting Arrow type must be a valid pyarrow DataType.
    assert isinstance(tensor_arr.type, pa.DataType)
    assert len(tensor_arr) == len(names)
    roundtripped = tensor_arr.to_numpy()
    np.testing.assert_array_equal(roundtripped, names)


def test_scalar_tensor_array_roundtrip():
arr = np.arange(10)
ta = TensorArray(arr)
Expand Down Expand Up @@ -155,24 +139,6 @@ def test_arrow_variable_shaped_tensor_array_roundtrip_boolean():
np.testing.assert_array_equal(o, a)


def test_arrow_variable_shaped_tensor_array_roundtrip_string():
    """A ragged string ndarray survives an ArrowVariableShapedTensorArray round-trip."""
    ragged_names = np.array(
        [
            ["Philip", "J", "Fry"],
            ["Leela", "Turanga"],
            ["Professor", "Hubert", "J", "Farnsworth"],
            ["Lrrr"],
        ],
        dtype=object,
    )
    tensor_arr = ArrowVariableShapedTensorArray.from_numpy(ragged_names)
    # Variable-shaped input must yield the variable-shaped extension type.
    assert isinstance(tensor_arr.type, ArrowVariableShapedTensorType)
    assert len(tensor_arr) == len(ragged_names)
    roundtripped = tensor_arr.to_numpy()
    # Rows are object-dtype subarrays, so compare element-wise.
    for actual, expected in zip(roundtripped, ragged_names):
        np.testing.assert_array_equal(actual, expected)


def test_arrow_variable_shaped_tensor_array_roundtrip_contiguous_optimization():
# Test that a roundtrip on slices of an already-contiguous 1D base array does not
# create any unnecessary copies.
Expand Down Expand Up @@ -223,46 +189,6 @@ def test_arrow_variable_shaped_tensor_array_slice():
np.testing.assert_array_equal(o, e)


def test_arrow_variable_shaped_string_tensor_array_slice():
    """Indexing and slicing an ArrowVariableShapedTensorArray of strings
    matches indexing/slicing the source NumPy object array.

    Fix: the original only checked indices [0, 1, 2], silently skipping the
    last row (index 3); we now cover every index via range(len(arr)).
    """
    arr = np.array(
        [
            ["Philip", "J", "Fry"],
            ["Leela", "Turanga"],
            ["Professor", "Hubert", "J", "Farnsworth"],
            ["Lrrr"],
        ],
        dtype=object,
    )
    ata = ArrowVariableShapedTensorArray.from_numpy(arr)
    assert isinstance(ata.type, ArrowVariableShapedTensorType)
    assert len(ata) == len(arr)
    # Per-element access: every row, including the last one.
    for i in range(len(arr)):
        np.testing.assert_array_equal(ata[i], arr[i])
    # Every contiguous slice of the 4-element array.
    slices = [
        slice(0, 1),
        slice(1, 2),
        slice(2, 3),
        slice(3, 4),
        slice(0, 2),
        slice(1, 3),
        slice(2, 4),
        slice(0, 3),
        slice(1, 4),
        slice(0, 4),
    ]
    for slice_ in slices:
        ata_slice = ata[slice_]
        ata_slice_np = ata_slice.to_numpy()
        arr_slice = arr[slice_]
        # Check for equivalent dtypes and shapes.
        assert ata_slice_np.dtype == arr_slice.dtype
        assert ata_slice_np.shape == arr_slice.shape
        # Iteration over tensor array slices triggers NumPy conversion.
        for o, e in zip(ata_slice, arr_slice):
            np.testing.assert_array_equal(o, e)


def test_variable_shaped_tensor_array_roundtrip():
shapes = [(2, 2), (3, 3), (4, 4)]
cumsum_sizes = np.cumsum([0] + [np.prod(shape) for shape in shapes[:-1]])
Expand Down Expand Up @@ -558,8 +484,6 @@ def test_arrow_variable_shaped_tensor_array_getitem(chunked):
([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float32),
([[1.5, 2.5], [3.3, 4.2], [5.2, 6.9], [7.6, 8.1]], np.float16),
([[False, True], [True, False], [True, True], [False, False]], None),
([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], None),
([["Aa", "Bb"], ["Cc", "Dd"], ["Ee", "Ff"], ["Gg", "Hh"]], np.str_),
],
)
def test_arrow_tensor_array_slice(test_arr, dtype):
Expand Down
85 changes: 27 additions & 58 deletions python/ray/air/util/tensor_extensions/arrow.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import itertools
import sys
from typing import Iterable, Optional, Tuple, List, Sequence, Union

from pkg_resources._vendor.packaging.version import parse as parse_version
Expand Down Expand Up @@ -310,15 +309,6 @@ def from_numpy(
# We only natively support C-contiguous ndarrays.
arr = np.ascontiguousarray(arr)
pa_dtype = pa.from_numpy_dtype(arr.dtype)
if pa.types.is_string(pa_dtype):
if arr.dtype.byteorder == ">" or (
arr.dtype.byteorder == "=" and sys.byteorder == "big"
):
raise ValueError(
"Only little-endian string tensors are supported, but got: ",
arr.dtype,
)
pa_dtype = pa.binary(arr.dtype.itemsize)
outer_len = arr.shape[0]
element_shape = arr.shape[1:]
total_num_items = arr.size
Expand All @@ -331,7 +321,6 @@ def from_numpy(
# NOTE: Arrow expects LSB bit-packed ordering.
# NOTE: This creates a copy.
arr = np.packbits(arr, bitorder="little")

data_buffer = pa.py_buffer(arr)
data_array = pa.Array.from_buffers(
pa_dtype, total_num_items, [None, data_buffer]
Expand Down Expand Up @@ -436,12 +425,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
arr = np.unpackbits(arr, bitorder="little")
# Interpret buffer as boolean array.
return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)
# Special handling of binary/string types. Assumes unicode string tensor columns
if pa.types.is_fixed_size_binary(value_type):
NUM_BYTES_PER_UNICODE_CHAR = 4
ext_dtype = np.dtype(
f"<U{value_type.byte_width // NUM_BYTES_PER_UNICODE_CHAR}"
)
return np.ndarray(shape, dtype=ext_dtype, buffer=data_buffer, offset=offset)

def to_numpy(self, zero_copy_only: bool = True):
Expand Down Expand Up @@ -721,14 +704,6 @@ def from_numpy(
f"arrays: {types_and_shapes}"
)
pa_dtype = pa.from_numpy_dtype(dtype)
if pa.types.is_string(pa_dtype):
if dtype.byteorder == ">" or (
dtype.byteorder == "=" and sys.byteorder == "big"
):
raise ValueError(
"Only little-endian string tensors are supported, but got: ", dtype
)
pa_dtype = pa.binary(dtype.itemsize)
if dtype.type is np.bool_:
# NumPy doesn't represent boolean arrays as bit-packed, so we manually
# bit-pack the booleans before handing the buffer off to Arrow.
Expand Down Expand Up @@ -783,8 +758,6 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
data = self.storage.field("data")
shapes = self.storage.field("shape")
value_type = data.type.value_type
ext_dtype = value_type.to_pandas_dtype()
shape = shapes[index].as_py()
if pa.types.is_boolean(value_type):
# Arrow boolean array buffers are bit-packed, with 8 entries per byte,
# and are accessed via bit offsets.
Expand All @@ -793,43 +766,39 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
# We assume all other array types are accessed via byte array
# offsets.
buffer_item_width = value_type.bit_width // 8

shape = shapes[index].as_py()
offset = data.offsets[index].as_py()
data_offset = buffer_item_width * offset
data_buffer = data.buffers()[3]

if pa.types.is_boolean(value_type):
# Special handling for boolean arrays, since Arrow bit-packs boolean arrays
# while NumPy does not.
# Cast as uint8 array and let NumPy unpack into a boolean view.
# Offset into uint8 array, where each element is a bucket for 8 booleans.
byte_bucket_offset = data_offset // 8
# Offset for a specific boolean, within a uint8 array element.
bool_offset = data_offset % 8
# The number of uint8 array elements (buckets) that our slice spans.
# Note that, due to the offset for a specific boolean, the slice can span
# byte boundaries even if it contains less than 8 booleans.
num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8)
# Construct the uint8 array view on the buffer.
arr = np.ndarray(
(num_boolean_byte_buckets,),
dtype=np.uint8,
if not pa.types.is_boolean(value_type):
return np.ndarray(
shape,
dtype=value_type.to_pandas_dtype(),
buffer=data_buffer,
offset=byte_bucket_offset,
)
# Unpack into a byte per boolean, using LSB bit-packed ordering.
arr = np.unpackbits(arr, bitorder="little")
# Interpret buffer as boolean array.
return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)
# Special handling of binary/string types. Assumes unicode string tensor columns
if pa.types.is_fixed_size_binary(value_type):
NUM_BYTES_PER_UNICODE_CHAR = 4
ext_dtype = np.dtype(
f"<U{value_type.byte_width // NUM_BYTES_PER_UNICODE_CHAR}"
offset=data_offset,
)
return np.ndarray(
shape, dtype=ext_dtype, buffer=data_buffer, offset=data_offset
# Special handling for boolean arrays, since Arrow bit-packs boolean arrays
# while NumPy does not.
# Cast as uint8 array and let NumPy unpack into a boolean view.
# Offset into uint8 array, where each element is a bucket for 8 booleans.
byte_bucket_offset = data_offset // 8
# Offset for a specific boolean, within a uint8 array element.
bool_offset = data_offset % 8
# The number of uint8 array elements (buckets) that our slice spans.
# Note that, due to the offset for a specific boolean, the slice can span byte
# boundaries even if it contains less than 8 booleans.
num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8)
# Construct the uint8 array view on the buffer.
arr = np.ndarray(
(num_boolean_byte_buckets,),
dtype=np.uint8,
buffer=data_buffer,
offset=byte_bucket_offset,
)
# Unpack into a byte per boolean, using LSB bit-packed ordering.
arr = np.unpackbits(arr, bitorder="little")
# Interpret buffer as boolean array.
return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset)

def to_numpy(self, zero_copy_only: bool = True):
"""
Expand Down

0 comments on commit 58f4e9f

Please sign in to comment.