Skip to content

Commit

Permalink
Create Tensor from scalar
Browse files Browse the repository at this point in the history
  • Loading branch information
rok committed Nov 16, 2023
1 parent 8cd5cf7 commit 4a491a1
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 21 deletions.
21 changes: 10 additions & 11 deletions cpp/src/arrow/extension/fixed_shape_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ namespace arrow {

namespace internal {

Status ComputeStrides(const FixedWidthType& type, const std::vector<int64_t>& shape,
Status ComputeStrides(const std::shared_ptr<DataType>& value_type,
const std::vector<int64_t>& shape,
const std::vector<int64_t>& permutation,
std::vector<int64_t>* strides) {
auto fixed_width_type = internal::checked_pointer_cast<FixedWidthType>(value_type);
if (permutation.empty()) {
return internal::ComputeRowMajorStrides(type, shape, strides);
return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides);
}

const int byte_width = type.byte_width();
const int byte_width = value_type->byte_width();

int64_t remaining = 0;
if (!shape.empty() && shape.front() > 0) {
Expand Down Expand Up @@ -319,13 +320,12 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const {
permutation.insert(permutation.begin(), 1, 0);

std::vector<int64_t> tensor_strides;
auto value_type = internal::checked_pointer_cast<FixedWidthType>(ext_arr->value_type());
std::shared_ptr<DataType> type = ext_arr->value_type();
ARROW_RETURN_NOT_OK(
internal::ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides));
internal::ComputeStrides(type, shape, permutation, &tensor_strides));
ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten());
ARROW_ASSIGN_OR_RAISE(
auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape,
tensor_strides, dim_names));
ARROW_ASSIGN_OR_RAISE(auto tensor, Tensor::Make(type, buffers->data()->buffers[1],
shape, tensor_strides, dim_names));
return tensor;
}

Expand All @@ -348,9 +348,8 @@ Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(

const std::vector<int64_t>& FixedShapeTensorType::strides() {
if (strides_.empty()) {
auto value_type = internal::checked_pointer_cast<FixedWidthType>(this->value_type_);
std::vector<int64_t> tensor_strides;
ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), this->shape(),
ARROW_CHECK_OK(internal::ComputeStrides(this->value_type_, this->shape(),
this->permutation(), &tensor_strides));
strides_ = tensor_strides;
}
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/extension/fixed_shape_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ namespace arrow {
namespace internal {

ARROW_EXPORT
Status ComputeStrides(const FixedWidthType& type, const std::vector<int64_t>& shape,
Status ComputeStrides(const std::shared_ptr<DataType>& value_type,
const std::vector<int64_t>& shape,
const std::vector<int64_t>& permutation,
std::vector<int64_t>* strides);

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/extension/tensor_extension_array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,11 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) {
ASSERT_EQ(t->shape(), (std::vector<int64_t>{2, 3, 1}));
ASSERT_EQ(t->strides(), (std::vector<int64_t>{24, 8, 8}));

ASSERT_OK_AND_ASSIGN(auto sc, ext_array->GetScalar(0));

auto vt = internal::checked_pointer_cast<VariableShapeTensorType>(sc->type);
auto it = vt->value_type();

std::vector<int64_t> shape = {2, 3, 1};
std::vector<int64_t> strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1,
sizeof(int64_t) * 1};
Expand Down
15 changes: 7 additions & 8 deletions cpp/src/arrow/extension/variable_shape_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ const Result<std::shared_ptr<Tensor>> VariableShapeTensorArray::GetTensor(
const int64_t i) const {
auto ext_arr = internal::checked_pointer_cast<StructArray>(this->storage());
auto ext_type = internal::checked_pointer_cast<VariableShapeTensorType>(this->type());
auto value_type =
internal::checked_pointer_cast<FixedWidthType>(ext_type->value_type());
auto value_type = ext_type->value_type();
auto ndim = ext_type->ndim();
auto dim_names = ext_type->dim_names();
auto shapes =
Expand All @@ -57,16 +56,16 @@ const Result<std::shared_ptr<Tensor>> VariableShapeTensorArray::GetTensor(

std::vector<int64_t> strides;
// TODO: optimize ComputeStrides for non-uniform tensors
ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape,
ext_type->permutation(), &strides));
ARROW_CHECK_OK(
internal::ComputeStrides(value_type, shape, ext_type->permutation(), &strides));

auto list_arr =
std::static_pointer_cast<ListArray>(ext_arr->field(1))->value_slice(i)->data();
auto bw = value_type->byte_width();
auto buffer =
SliceBuffer(list_arr->buffers[1], list_arr->offset * bw, list_arr->length * bw);
auto byte_width = value_type->byte_width();
auto buffer = SliceBuffer(list_arr->buffers[1], list_arr->offset * byte_width,
list_arr->length * byte_width);

return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names);
return Tensor::Make(value_type, buffer, shape, strides, dim_names);
}

bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
void set_chunksize(int64_t chunksize)

cdef cppclass CTensor" arrow::Tensor":
CTensor(const shared_ptr[CDataType]& type,
const shared_ptr[CBuffer]& data,
const vector[int64_t]& shape,
const vector[int64_t]& strides,
const vector[c_string]& dim_names)
shared_ptr[CDataType] type()
shared_ptr[CBuffer] data()

Expand Down Expand Up @@ -2691,6 +2696,11 @@ cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extens
const vector[int64_t] permutation()
const vector[c_string] dim_names()

cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::internal" nogil:
cdef CStatus ComputeStrides(const shared_ptr[CDataType]& value_type,
const vector[int64_t]& shape,
const vector[int64_t]& permutation,
vector[int64_t]* strides)

cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
cdef enum CCompressionType" arrow::Compression::type":
Expand Down
28 changes: 27 additions & 1 deletion python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,7 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)


class VariableShapeTensorScalar(ExtensionScalar):
cdef class VariableShapeTensorScalar(ExtensionScalar):
"""
Concrete class for variable shape tensor extension scalar.
"""
Expand All @@ -1047,6 +1047,32 @@ class VariableShapeTensorScalar(ExtensionScalar):
raise ValueError(
'Only non-permuted tensors can be converted to numpy tensors.')

def to_tensor(self):
    """
    Convert variable shape tensor extension scalar to a pyarrow.Tensor.

    The shape is read from the first field of the storage struct, the
    strides are computed from the value type, the shape and the type's
    permutation, and the data buffer is restricted to the bytes that
    belong to this scalar's own values.

    Returns
    -------
    tensor : pyarrow.Tensor
    """
    cdef:
        shared_ptr[CTensor] ctensor
        shared_ptr[CBuffer] data
        vector[int64_t] strides
        vector[c_string] dim_names

        shared_ptr[CVariableShapeTensorType] typ = static_pointer_cast[CVariableShapeTensorType, CDataType](
            self.wrapped.get().type)

        shared_ptr[CDataType] ty = typ.get().value_type()
        vector[int64_t] shape = self.value[0].values.to_pylist()
        vector[int64_t] permutation = self.type.permutation

    # The values array of this scalar is a slice of the child array shared
    # by every element of the parent array. Array.buffers() exposes the
    # whole underlying buffer, so the slice's offset and length have to be
    # applied explicitly; otherwise every scalar would yield the data of
    # the first element.
    values = self.value[1].values
    byte_width = values.type.bit_width // 8
    data = pyarrow_unwrap_buffer(
        values.buffers()[1].slice(values.offset * byte_width,
                                  len(values) * byte_width))

    for name in self.type.dim_names:
        dim_names.push_back(tobytes(name))

    check_status(ComputeStrides(ty, shape, permutation, &strides))
    ctensor = make_shared[CTensor](ty, data, shape, strides, dim_names)

    return pyarrow_wrap_tensor(ctensor)


cdef dict _scalar_classes = {
_Type_BOOL: BooleanScalar,
Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1439,6 +1439,13 @@ def test_variable_shape_tensor_class_methods(value_type):
np.testing.assert_array_equal(arr[0].to_numpy_ndarray(), expected_0)
np.testing.assert_array_equal(arr[1].to_numpy_ndarray(), expected_1)

assert arr[0].to_tensor().equals(
pa.Tensor.from_numpy(expected_0, dim_names=["H", "W"]))

# TODO: due to wrong offset this would return [[1], [2]] instead of [[7], [8]]
assert arr[1].to_tensor().equals(
pa.Tensor.from_numpy(expected_1, dim_names=["H", "W"]))


@pytest.mark.parametrize("tensor_type", (
pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
Expand Down

0 comments on commit 4a491a1

Please sign in to comment.