Skip to content

Commit

Permalink
work
Browse files Browse the repository at this point in the history
  • Loading branch information
rok committed Mar 4, 2024
1 parent 6f41497 commit 28be4b6
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 48 deletions.
2 changes: 1 addition & 1 deletion cpp/src/arrow/extension/variable_shape_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType {
uniform_shape_(std::move(uniform_shape)) {}

std::string extension_name() const override { return "arrow.variable_shape_tensor"; }
std::string ToString() const override;
std::string ToString() const;

/// Number of dimensions of tensor elements
int32_t ndim() const { return ndim_; }
Expand Down
18 changes: 12 additions & 6 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -4422,8 +4422,9 @@ cdef class VariableShapeTensorArray(ExtensionArray):
numpy_type = obj[0].dtype
arrow_type = from_numpy_dtype(numpy_type)
ndim = obj[0].ndim
permutations = [(-np.array(o.strides)).argsort() for o in obj]
permutations = [(-np.array(o.strides)).argsort(kind="stable") for o in obj]
permutation = permutations[0]
shapes = [np.take(o.shape, permutation) for o in obj]

if not all([o.dtype == numpy_type for o in obj]):
raise TypeError('All numpy arrays must have matching dtype.')
Expand All @@ -4434,13 +4435,18 @@ cdef class VariableShapeTensorArray(ExtensionArray):
if not all([np.array_equal(p, permutation) for p in permutations]):
raise ValueError('All numpy arrays must have matching permutation.')

for shape in shapes:
if len(shape) < 2:
raise ValueError(
"Cannot convert 1D array or scalar to fixed shape tensor array")
if np.prod(shape) == 0:
raise ValueError("Expected a non-empty ndarray")

values = array([np.ravel(o, order="K") for o in obj], list_(arrow_type))
shapes = array([np.take(o.shape, permutation)
for o in obj], list_(int32(), list_size=ndim))
struct_arr=StructArray.from_arrays([shapes, values], names=["shape", "data"])
shapes = array(shapes, list_(int32(), list_size=ndim))
struct_arr = StructArray.from_arrays([shapes, values], names=["shape", "data"])

typ=variable_shape_tensor(arrow_type, ndim, permutation=permutation)
return ExtensionArray.from_storage(typ, struct_arr)
return ExtensionArray.from_storage(variable_shape_tensor(arrow_type, ndim, permutation=permutation), struct_arr)

cdef dict _array_classes = {
_Type_NA: NullArray,
Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1095,12 +1095,17 @@ cdef class VariableShapeTensorScalar(ExtensionScalar):
Convert variable shape tensor extension scalar to a numpy array.
The conversion is zero-copy if data is primitive numeric and without nulls.
Returns
-------
numpy.ndarray
"""
return self.to_tensor().to_numpy()

def to_tensor(self):
"""
Convert variable shape tensor extension scalar to a pyarrow.Tensor.
Returns
-------
tensor : pyarrow.Tensor
Expand Down
105 changes: 64 additions & 41 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import sys

import numpy as np
from numpy.lib.stride_tricks import as_strided
import pyarrow as pa
from pyarrow.vendored.version import Version

Expand Down Expand Up @@ -1369,8 +1370,7 @@ def test_variable_shape_tensor_type():


@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
def test_tensor_class_methods(value_type):
from numpy.lib.stride_tricks import as_strided
def test_fixed_shape_tensor_class_methods(value_type):
arrow_type = pa.from_numpy_dtype(value_type)

tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3])
Expand Down Expand Up @@ -1421,8 +1421,7 @@ def test_tensor_class_methods(value_type):


@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
def test_tensor_array_from_numpy(value_type):
from numpy.lib.stride_tricks import as_strided
def test_fixed_shape_tensor_array_from_numpy(value_type):
arrow_type = pa.from_numpy_dtype(value_type)

arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
Expand Down Expand Up @@ -1552,43 +1551,67 @@ def test_variable_shape_tensor_class_methods(value_type):
[], dtype=value_type).reshape(shapes[1].as_py()))


@pytest.mark.parametrize("value_type", (np.int8, np.int32, np.int64, np.float64))
def test_variable_shape_tensor_strided(value_type):
    """Check strided numpy inputs against variable shape tensor arrays.

    For every strided view the test builds the extension array twice: once
    by hand from a ``shape``/``data`` struct storage array, and once through
    ``VariableShapeTensorArray.from_numpy_ndarray``. In both cases the first
    element must convert back to the expected ndarray and to an equal
    ``pa.Tensor``.
    """
    from numpy.lib.stride_tricks import as_strided

    itemsize = value_type().itemsize
    arrow_type = pa.from_numpy_dtype(value_type())
    flat = np.arange(1, 13, dtype=value_type)

    def check_first_element(tensor_array, expected):
        # Both conversion paths of the scalar have to agree with `expected`.
        element = tensor_array[0]
        np.testing.assert_array_equal(element.to_numpy_ndarray(), expected)
        np.testing.assert_array_equal(
            element.to_tensor(), pa.Tensor.from_numpy(expected))

    # Pairs of (strided input view, ndarray expected after the round trip).
    cases = [
        (as_strided(flat, shape=(12,), strides=(itemsize,)), flat),
        (
            as_strided(flat, shape=(3, 4), strides=(itemsize * 4, itemsize)),
            flat.reshape(3, 4),
        ),
        # TODO: strides are not correctly handled for non-C layouts
        # (
        #     as_strided(vals, shape=(3, 4), strides=(bw, bw * 3)),
        #     np.array([[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]], value_type())
        # )
    ]

    for strided, expected in cases:
        ndim = len(strided.shape)
        # Dimension order sorted by decreasing stride.
        permutation = (-np.array(strided.strides)).argsort()
        tensor_type = pa.variable_shape_tensor(
            arrow_type, ndim, permutation=permutation)

        # Hand-built storage: one fixed-size shape list plus the flat values.
        shape_field = pa.field("shape", pa.list_(pa.int32(), ndim))
        data_field = pa.field("data", pa.list_(arrow_type))
        storage = pa.StructArray.from_arrays(
            [
                pa.array([strided.shape], type=shape_field.type),
                pa.array([flat.tolist()], type=data_field.type),
            ],
            fields=[shape_field, data_field],
        )

        check_first_element(
            pa.ExtensionArray.from_storage(tensor_type, storage), expected)
        check_first_element(
            pa.VariableShapeTensorArray.from_numpy_ndarray([strided]), expected)
@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
def test_variable_shape_tensor_array_from_numpy(value_type):
    """Exercise ``VariableShapeTensorArray.from_numpy_ndarray``.

    Covers the resulting extension type (value type, ndim, permutation),
    rejection of inputs whose permutation, ndim or dtype differ, round trips
    of C-contiguous and custom-strided inputs, and rejection of 1D, scalar
    and empty ndarrays.
    """
    from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray
    arrow_type = pa.from_numpy_dtype(value_type)
    nested = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]

    # A single C-ordered input yields the identity permutation.
    c_order = np.array(nested, dtype=value_type, order="C")
    converted = from_numpy([c_order])
    assert isinstance(converted.type, pa.VariableShapeTensorType)
    assert converted.type.value_type == arrow_type
    assert converted.type.ndim == 3
    assert converted.type.permutation == [0, 1, 2]

    # Inputs that disagree on layout, rank or dtype are rejected.
    f_order = np.array(nested, dtype=value_type, order="F")
    with pytest.raises(ValueError, match="numpy arrays must have matching permutation"):
        from_numpy([f_order, c_order])
    with pytest.raises(ValueError, match="numpy arrays must have matching ndim"):
        from_numpy([c_order.reshape((12, 1)), c_order])
    with pytest.raises(TypeError, match="numpy arrays must have matching dtype"):
        from_numpy([c_order.astype(np.int32()), c_order])

    flat = np.array(list(range(1, 13)), dtype=value_type)
    itemsize = value_type.itemsize

    # Contiguous 3D input.
    contiguous = flat.reshape(1, 3, 4)
    converted = from_numpy([contiguous])
    assert converted.type.ndim == 3
    assert converted.type.permutation == [0, 1, 2]
    assert converted[0].to_tensor() == pa.Tensor.from_numpy(contiguous)

    # 4D view whose two innermost dimensions are swapped in memory.
    swapped = as_strided(
        flat,
        shape=(1, 2, 3, 2),
        strides=(itemsize * 12, itemsize * 6, itemsize, itemsize * 3),
    )
    converted = from_numpy([swapped])
    assert converted.type.ndim == 4
    assert converted.type.permutation == [0, 1, 3, 2]
    assert converted[0].to_tensor() == pa.Tensor.from_numpy(swapped)

    # Round trip back to numpy for a contiguous 4D input.
    converted = from_numpy([flat.reshape(1, 2, 3, 2)])
    expected = np.array(
        [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type)
    np.testing.assert_array_equal(converted[0].to_numpy_ndarray(), expected)

    # 1D, scalar and empty-1D inputs all fail the dimensionality check.
    empty = np.array([], dtype=value_type)
    too_few_dims = (
        np.array(list(range(1, 13)), dtype=value_type),
        np.array(1, dtype=value_type),
        empty.reshape((0)),
    )
    for candidate in too_few_dims:
        with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"):
            from_numpy([candidate])

    # Multi-dimensional inputs with any zero-length dimension are rejected.
    for empty_shape in ((0, 3, 2), (3, 0, 2)):
        with pytest.raises(ValueError, match="Expected a non-empty ndarray"):
            from_numpy([empty.reshape(empty_shape)])


@pytest.mark.parametrize("tensor_type", (
Expand Down

0 comments on commit 28be4b6

Please sign in to comment.