diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 6f3b8e75a20d0..8521d500f5c05 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -283,18 +283,55 @@ struct ConvertColumnsToTensorVisitor {
   }
 };
 
+template <typename Out>
+struct ConvertColumnsToTensorRowMajorVisitor {
+  Out*& out_values;
+  const ArrayData& in_data;
+  int num_cols;
+  int col_idx;
+
+  template <typename T>
+  Status Visit(const T&) {
+    if constexpr (is_numeric(T::type_id)) {
+      using In = typename T::c_type;
+      auto in_values = ArraySpan(in_data).GetSpan<In>(1, in_data.length);
+
+      if (in_data.null_count == 0) {
+        for (int64_t i = 0; i < in_data.length; ++i) {
+          out_values[i * num_cols + col_idx] = static_cast<Out>(in_values[i]);
+        }
+      } else {
+        for (int64_t i = 0; i < in_data.length; ++i) {
+          out_values[i * num_cols + col_idx] =
+              in_data.IsNull(i) ? static_cast<Out>(NAN) : static_cast<Out>(in_values[i]);
+        }
+      }
+      return Status::OK();
+    }
+    Unreachable();
+  }
+};
+
 template <typename DataType>
-inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) {
+inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out,
+                                   bool row_major) {
   using CType = typename arrow::TypeTraits<DataType>::CType;
   auto* out_values = reinterpret_cast<CType*>(out);
 
+  int i = 0;
   for (const auto& column : batch.columns()) {
-    ConvertColumnsToTensorVisitor<CType> visitor{out_values, *column->data()};
-    DCHECK_OK(VisitTypeInline(*column->type(), &visitor));
+    if (row_major) {
+      ConvertColumnsToTensorRowMajorVisitor<CType> visitor{out_values, *column->data(),
+                                                           batch.num_columns(), i++};
+      DCHECK_OK(VisitTypeInline(*column->type(), &visitor));
+    } else {
+      ConvertColumnsToTensorVisitor<CType> visitor{out_values, *column->data()};
+      DCHECK_OK(VisitTypeInline(*column->type(), &visitor));
+    }
   }
 }
 
-Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(bool null_to_nan,
+Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(bool null_to_nan, bool row_major,
                                                       MemoryPool* pool) const {
   if (num_columns() == 0) {
     return Status::TypeError(
@@ -362,35 +399,35 @@ Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(bool null_to_nan,
   // Copy data
   switch (result_type->id()) {
     case Type::UINT8:
-      ConvertColumnsToTensor<UInt8Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<UInt8Type>(*this, result->mutable_data(), row_major);
       break;
     case Type::UINT16:
    case Type::HALF_FLOAT:
-      ConvertColumnsToTensor<UInt16Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<UInt16Type>(*this, result->mutable_data(), row_major);
       break;
    case Type::UINT32:
-      ConvertColumnsToTensor<UInt32Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<UInt32Type>(*this, result->mutable_data(), row_major);
       break;
    case Type::UINT64:
-      ConvertColumnsToTensor<UInt64Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<UInt64Type>(*this, result->mutable_data(), row_major);
       break;
    case Type::INT8:
-      ConvertColumnsToTensor<Int8Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<Int8Type>(*this, result->mutable_data(), row_major);
       break;
    case Type::INT16:
-      ConvertColumnsToTensor<Int16Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<Int16Type>(*this, result->mutable_data(), row_major);
       break;
    case Type::INT32:
-      ConvertColumnsToTensor<Int32Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<Int32Type>(*this, result->mutable_data(), row_major);
       break;
    case Type::INT64:
-      ConvertColumnsToTensor<Int64Type>(*this, result->mutable_data());
+      ConvertColumnsToTensor<Int64Type>(*this, result->mutable_data(), row_major);
       break;
    case Type::FLOAT:
-      ConvertColumnsToTensor<FloatType>(*this, result->mutable_data());
+      ConvertColumnsToTensor<FloatType>(*this, result->mutable_data(), row_major);
       break;
    case Type::DOUBLE:
-      ConvertColumnsToTensor<DoubleType>(*this, result->mutable_data());
+      ConvertColumnsToTensor<DoubleType>(*this, result->mutable_data(), row_major);
       break;
     default:
       return Status::TypeError("DataType is not supported: ", result_type->ToString());
@@ -401,11 +438,17 @@ Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(bool null_to_nan,
       internal::checked_cast<const FixedWidthType&>(*result_type);
   std::vector<int64_t> shape = {num_rows(), num_columns()};
   std::vector<int64_t> strides;
-  ARROW_RETURN_NOT_OK(
-      internal::ComputeColumnMajorStrides(fixed_width_type, shape, &strides));
-  ARROW_ASSIGN_OR_RAISE(auto tensor,
-                        Tensor::Make(result_type, std::move(result), shape, strides));
+  std::shared_ptr<Tensor> tensor;
+  if (row_major) {
+    ARROW_RETURN_NOT_OK(
+        internal::ComputeRowMajorStrides(fixed_width_type, shape, &strides));
+  } else {
+    ARROW_RETURN_NOT_OK(
+        internal::ComputeColumnMajorStrides(fixed_width_type, shape, &strides));
+  }
+  ARROW_ASSIGN_OR_RAISE(tensor,
+                        Tensor::Make(result_type, std::move(result), shape, strides));
 
   return tensor;
 }
diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
index 5202ff4abfa0b..cd647a88abd97 100644
--- a/cpp/src/arrow/record_batch.h
+++ b/cpp/src/arrow/record_batch.h
@@ -87,10 +87,12 @@ class ARROW_EXPORT RecordBatch {
   /// Generated Tensor will have column-major layout.
   ///
   /// \param[in] null_to_nan if true, convert nulls to NaN
+  /// \param[in] row_major if true, create row-major Tensor else column-major Tensor
   /// \param[in] pool the memory pool to allocate the tensor buffer
   /// \return the resulting Tensor
   Result<std::shared_ptr<Tensor>> ToTensor(
-      bool null_to_nan = false, MemoryPool* pool = default_memory_pool()) const;
+      bool null_to_nan = false, bool row_major = true,
+      MemoryPool* pool = default_memory_pool()) const;
 
   /// \brief Construct record batch from struct array
   ///
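For an R-row, C-column batch of w-byte elements, `ComputeRowMajorStrides` yields `{C * w, w}` while `ComputeColumnMajorStrides` yields `{w, R * w}`. A minimal sketch of the two layouts from the Python side, assuming a pyarrow build that includes this change:

```python
import numpy as np
import pyarrow as pa

# Five rows, two float64 columns: element width w = 8 bytes.
batch = pa.RecordBatch.from_arrays(
    [pa.array([1.0, 2.0, 3.0, 4.0, 5.0]),
     pa.array([10.0, 20.0, 30.0, 40.0, 50.0])],
    names=["a", "b"])

row = batch.to_tensor()                 # row-major, the new default
col = batch.to_tensor(row_major=False)  # the previous column-major layout

assert row.strides == (16, 8)   # {C * w, w} = {2 * 8, 8}
assert col.strides == (8, 40)   # {w, R * w} = {8, 5 * 8}
# Both layouts hold the same logical values.
np.testing.assert_array_equal(row.to_numpy(), col.to_numpy())
```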
diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc
index 95f601465b440..daf7109075eab 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -685,14 +685,20 @@ TEST_F(TestRecordBatch, ToTensorEmptyBatch) {
   ASSERT_OK_AND_ASSIGN(std::shared_ptr<RecordBatch> empty,
                        RecordBatch::MakeEmpty(schema));
 
-  ASSERT_OK_AND_ASSIGN(auto tensor, empty->ToTensor());
-  ASSERT_OK(tensor->Validate());
+  ASSERT_OK_AND_ASSIGN(auto tensor_column,
+                       empty->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
+  ASSERT_OK(tensor_column->Validate());
+
+  ASSERT_OK_AND_ASSIGN(auto tensor_row, empty->ToTensor());
+  ASSERT_OK(tensor_row->Validate());
 
   const std::vector<int64_t> strides = {4, 4};
   const std::vector<int64_t> shape = {0, 2};
 
-  EXPECT_EQ(strides, tensor->strides());
-  EXPECT_EQ(shape, tensor->shape());
+  EXPECT_EQ(strides, tensor_column->strides());
+  EXPECT_EQ(shape, tensor_column->shape());
+  EXPECT_EQ(strides, tensor_row->strides());
+  EXPECT_EQ(shape, tensor_row->shape());
 
   auto batch_no_columns =
       RecordBatch::Make(::arrow::schema({}), 10, std::vector<std::shared_ptr<Array>>{});
@@ -715,6 +721,19 @@ void CheckTensor(const std::shared_ptr<Tensor>& tensor, const int size,
   EXPECT_TRUE(tensor->is_contiguous());
 }
 
+template <typename DataType>
+void CheckTensorRowMajor(const std::shared_ptr<Tensor>& tensor, const int size,
+                         const std::vector<int64_t> shape,
+                         const std::vector<int64_t> strides) {
+  EXPECT_EQ(size, tensor->size());
+  EXPECT_EQ(TypeTraits<DataType>::type_singleton(), tensor->type());
+  EXPECT_EQ(shape, tensor->shape());
+  EXPECT_EQ(strides, tensor->strides());
+  EXPECT_TRUE(tensor->is_row_major());
+  EXPECT_FALSE(tensor->is_column_major());
+  EXPECT_TRUE(tensor->is_contiguous());
+}
+
 TEST_F(TestRecordBatch, ToTensorSupportedNaN) {
   const int length = 9;
@@ -729,7 +748,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) {
 
   auto batch = RecordBatch::Make(schema, length, {a0, a1});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor());
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       batch->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
   ASSERT_OK(tensor->Validate());
 
   std::vector<int64_t> shape = {9, 2};
@@ -759,7 +779,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
 
   auto batch = RecordBatch::Make(schema, length, {a0, a1});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       batch->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
   ASSERT_OK(tensor->Validate());
 
   std::vector<int64_t> shape = {9, 2};
@@ -774,6 +795,19 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
 
   CheckTensor<DoubleType>(tensor, 18, shape, f_strides);
 
+  ASSERT_OK_AND_ASSIGN(auto tensor_row, batch->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor_row->Validate());
+
+  std::vector<int64_t> strides = {f64_size * shape[1], f64_size};
+  std::shared_ptr<Tensor> tensor_expected_row = TensorFromJSON(
+      float64(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]",
+      shape, strides);
+
+  EXPECT_FALSE(tensor_expected_row->Equals(*tensor_row));
+  EXPECT_TRUE(tensor_expected_row->Equals(*tensor_row, EqualOptions().nans_equal(true)));
+
+  CheckTensorRowMajor<DoubleType>(tensor_row, 18, shape, strides);
+
   // int32 -> float64
   auto f2 = field("f2", int32());
@@ -783,7 +817,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
   auto a2 = ArrayFromJSON(int32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]");
   auto batch1 = RecordBatch::Make(schema1, length, {a0, a2});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor1, batch1->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK_AND_ASSIGN(auto tensor1,
+                       batch1->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
   ASSERT_OK(tensor1->Validate());
 
   EXPECT_FALSE(tensor_expected->Equals(*tensor1));
@@ -791,6 +826,14 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
 
   CheckTensor<DoubleType>(tensor1, 18, shape, f_strides);
 
+  ASSERT_OK_AND_ASSIGN(auto tensor1_row, batch1->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor1_row->Validate());
+
+  EXPECT_FALSE(tensor_expected_row->Equals(*tensor1_row));
+  EXPECT_TRUE(tensor_expected_row->Equals(*tensor1_row, EqualOptions().nans_equal(true)));
+
+  CheckTensorRowMajor<DoubleType>(tensor1_row, 18, shape, strides);
+
   // int8 -> float32
   auto f3 = field("f3", int8());
   auto f4 = field("f4", int8());
@@ -802,7 +845,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
   auto a4 = ArrayFromJSON(int8(), "[10, 20, 30, 40, null, 60, 70, 80, 90]");
   auto batch2 = RecordBatch::Make(schema2, length, {a3, a4});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor2, batch2->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK_AND_ASSIGN(auto tensor2,
+                       batch2->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
   ASSERT_OK(tensor2->Validate());
 
   const int64_t f32_size = sizeof(float);
@@ -815,6 +859,20 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
   EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true)));
 
   CheckTensor<FloatType>(tensor2, 18, shape, f_strides_2);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor2_row, batch2->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor2_row->Validate());
+
+  std::vector<int64_t> strides_2 = {f32_size * shape[1], f32_size};
+  std::shared_ptr<Tensor> tensor2_expected_row = TensorFromJSON(
+      float32(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]",
+      shape, strides_2);
+
+  EXPECT_FALSE(tensor2_expected_row->Equals(*tensor2_row));
+  EXPECT_TRUE(
+      tensor2_expected_row->Equals(*tensor2_row, EqualOptions().nans_equal(true)));
+
+  CheckTensorRowMajor<FloatType>(tensor2_row, 18, shape, strides_2);
 }
 
 TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) {
@@ -833,7 +891,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) {
   auto schema = ::arrow::schema(fields);
   auto batch = RecordBatch::Make(schema, length, {a0});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor());
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       batch->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
   ASSERT_OK(tensor->Validate());
 
   std::vector<int64_t> shape = {9, 1};
@@ -850,7 +909,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) {
   auto schema1 = ::arrow::schema(fields1);
   auto batch1 = RecordBatch::Make(schema1, length, {a0, a1});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor1, batch1->ToTensor());
+  ASSERT_OK_AND_ASSIGN(auto tensor1,
+                       batch1->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
   ASSERT_OK(tensor1->Validate());
 
   std::vector<int64_t> shape1 = {9, 2};
@@ -875,7 +935,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) {
   auto schema2 = ::arrow::schema(fields2);
   auto batch2 = RecordBatch::Make(schema2, length, {a0, a1, a2});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor2, batch2->ToTensor());
+  ASSERT_OK_AND_ASSIGN(auto tensor2,
+                       batch2->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
   ASSERT_OK(tensor2->Validate());
 
   std::vector<int64_t> shape2 = {9, 3};
@@ -920,11 +981,11 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMixedFloat16) {
 }
 
 template <typename DataType>
-class TestBatchToTensor : public ::testing::Test {};
+class TestBatchToTensorColumnMajor : public ::testing::Test {};
 
-TYPED_TEST_SUITE_P(TestBatchToTensor);
+TYPED_TEST_SUITE_P(TestBatchToTensorColumnMajor);
 
-TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {
+TYPED_TEST_P(TestBatchToTensorColumnMajor, SupportedTypes) {
   using DataType = TypeParam;
   using c_data_type = typename DataType::c_type;
   const int unit_size = sizeof(c_data_type);
@@ -947,7 +1008,8 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {
 
   auto batch = RecordBatch::Make(schema, length, {a0, a1, a2});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor());
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       batch->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
   ASSERT_OK(tensor->Validate());
 
   std::vector<int64_t> shape = {9, 3};
@@ -964,7 +1026,8 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {
   // Test offsets
   auto batch_slice = batch->Slice(1);
 
-  ASSERT_OK_AND_ASSIGN(auto tensor_sliced, batch_slice->ToTensor());
+  ASSERT_OK_AND_ASSIGN(auto tensor_sliced,
+                       batch_slice->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
   ASSERT_OK(tensor_sliced->Validate());
 
   std::vector<int64_t> shape_sliced = {8, 3};
@@ -980,7 +1043,9 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {
 
   auto batch_slice_1 = batch->Slice(1, 5);
 
-  ASSERT_OK_AND_ASSIGN(auto tensor_sliced_1, batch_slice_1->ToTensor());
+  ASSERT_OK_AND_ASSIGN(
+      auto tensor_sliced_1,
+      batch_slice_1->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
   ASSERT_OK(tensor_sliced_1->Validate());
 
   std::vector<int64_t> shape_sliced_1 = {5, 3};
@@ -994,18 +1059,107 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {
 
   CheckTensor<DataType>(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1);
 }
 
-REGISTER_TYPED_TEST_SUITE_P(TestBatchToTensor, SupportedTypes);
+REGISTER_TYPED_TEST_SUITE_P(TestBatchToTensorColumnMajor, SupportedTypes);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt8, TestBatchToTensorColumnMajor, UInt8Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt16, TestBatchToTensorColumnMajor, UInt16Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt32, TestBatchToTensorColumnMajor, UInt32Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt64, TestBatchToTensorColumnMajor, UInt64Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Int8, TestBatchToTensorColumnMajor, Int8Type);
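The typed suites here assert `is_row_major()` / `is_column_major()` on every supported primitive type. A rough Python analogue of the same layout check (an illustration, not part of the patch) leans on NumPy's contiguity flags, on the assumption that `Tensor.to_numpy` returns a zero-copy view that keeps the tensor's strides:

```python
import numpy as np
import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array(np.arange(9, dtype=np.int32)),
     pa.array(np.arange(9, dtype=np.int32) * 10)],
    names=["f0", "f1"])

row_np = batch.to_tensor().to_numpy()
col_np = batch.to_tensor(row_major=False).to_numpy()

# A (9, 2) array can only be contiguous in one order.
assert row_np.flags["C_CONTIGUOUS"] and not row_np.flags["F_CONTIGUOUS"]
assert col_np.flags["F_CONTIGUOUS"] and not col_np.flags["C_CONTIGUOUS"]
np.testing.assert_array_equal(row_np, col_np)
```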
+INSTANTIATE_TYPED_TEST_SUITE_P(Int16, TestBatchToTensorColumnMajor, Int16Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Int32, TestBatchToTensorColumnMajor, Int32Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Int64, TestBatchToTensorColumnMajor, Int64Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Float16, TestBatchToTensorColumnMajor, HalfFloatType);
+INSTANTIATE_TYPED_TEST_SUITE_P(Float32, TestBatchToTensorColumnMajor, FloatType);
+INSTANTIATE_TYPED_TEST_SUITE_P(Float64, TestBatchToTensorColumnMajor, DoubleType);
+
+template <typename DataType>
+class TestBatchToTensorRowMajor : public ::testing::Test {};
+
+TYPED_TEST_SUITE_P(TestBatchToTensorRowMajor);
+
+TYPED_TEST_P(TestBatchToTensorRowMajor, SupportedTypes) {
+  using DataType = TypeParam;
+  using c_data_type = typename DataType::c_type;
+  const int unit_size = sizeof(c_data_type);
+
+  const int length = 9;
+
+  auto f0 = field("f0", TypeTraits<DataType>::type_singleton());
+  auto f1 = field("f1", TypeTraits<DataType>::type_singleton());
+  auto f2 = field("f2", TypeTraits<DataType>::type_singleton());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1, f2};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ArrayFromJSON(TypeTraits<DataType>::type_singleton(),
+                          "[1, 2, 3, 4, 5, 6, 7, 8, 9]");
+  auto a1 = ArrayFromJSON(TypeTraits<DataType>::type_singleton(),
+                          "[10, 20, 30, 40, 50, 60, 70, 80, 90]");
+  auto a2 = ArrayFromJSON(TypeTraits<DataType>::type_singleton(),
+                          "[100, 100, 100, 100, 100, 100, 100, 100, 100]");
+
+  auto batch = RecordBatch::Make(schema, length, {a0, a1, a2});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor());
+  ASSERT_OK(tensor->Validate());
+
+  std::vector<int64_t> shape = {9, 3};
+  std::vector<int64_t> strides = {unit_size * shape[1], unit_size};
+  std::shared_ptr<Tensor> tensor_expected =
+      TensorFromJSON(TypeTraits<DataType>::type_singleton(),
+                     "[1, 10, 100, 2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, "
+                     "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]",
+                     shape, strides);
+
+  EXPECT_TRUE(tensor_expected->Equals(*tensor));
+  CheckTensorRowMajor<DataType>(tensor, 27, shape, strides);
+
+  // Test offsets
+  auto batch_slice = batch->Slice(1);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor_sliced, batch_slice->ToTensor());
+  ASSERT_OK(tensor_sliced->Validate());
+
+  std::vector<int64_t> shape_sliced = {8, 3};
+  std::vector<int64_t> strides_sliced = {unit_size * shape[1], unit_size};
+  std::shared_ptr<Tensor> tensor_expected_sliced =
+      TensorFromJSON(TypeTraits<DataType>::type_singleton(),
+                     "[2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, "
+                     "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]",
+                     shape_sliced, strides_sliced);
+
+  EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced));
+  CheckTensorRowMajor<DataType>(tensor_sliced, 24, shape_sliced, strides_sliced);
+
+  auto batch_slice_1 = batch->Slice(1, 5);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor_sliced_1, batch_slice_1->ToTensor());
+  ASSERT_OK(tensor_sliced_1->Validate());
+
+  std::vector<int64_t> shape_sliced_1 = {5, 3};
+  std::vector<int64_t> strides_sliced_1 = {unit_size * shape_sliced_1[1], unit_size};
+  std::shared_ptr<Tensor> tensor_expected_sliced_1 =
+      TensorFromJSON(TypeTraits<DataType>::type_singleton(),
+                     "[2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, 60, 100]",
+                     shape_sliced_1, strides_sliced_1);
+
+  EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1));
+  CheckTensorRowMajor<DataType>(tensor_sliced_1, 15, shape_sliced_1, strides_sliced_1);
+}
 
-INSTANTIATE_TYPED_TEST_SUITE_P(UInt8, TestBatchToTensor, UInt8Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(UInt16, TestBatchToTensor, UInt16Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(UInt32, TestBatchToTensor, UInt32Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(UInt64, TestBatchToTensor, UInt64Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(Int8, TestBatchToTensor, Int8Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(Int16, TestBatchToTensor, Int16Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(Int32, TestBatchToTensor, Int32Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(Int64, TestBatchToTensor, Int64Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(Float16, TestBatchToTensor, HalfFloatType);
-INSTANTIATE_TYPED_TEST_SUITE_P(Float32, TestBatchToTensor, FloatType);
-INSTANTIATE_TYPED_TEST_SUITE_P(Float64, TestBatchToTensor, DoubleType);
+REGISTER_TYPED_TEST_SUITE_P(TestBatchToTensorRowMajor, SupportedTypes);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt8, TestBatchToTensorRowMajor, UInt8Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt16, TestBatchToTensorRowMajor, UInt16Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt32, TestBatchToTensorRowMajor, UInt32Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(UInt64, TestBatchToTensorRowMajor, UInt64Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Int8, TestBatchToTensorRowMajor, Int8Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Int16, TestBatchToTensorRowMajor, Int16Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Int32, TestBatchToTensorRowMajor, Int32Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Int64, TestBatchToTensorRowMajor, Int64Type);
+INSTANTIATE_TYPED_TEST_SUITE_P(Float16, TestBatchToTensorRowMajor, HalfFloatType);
+INSTANTIATE_TYPED_TEST_SUITE_P(Float32, TestBatchToTensorRowMajor, FloatType);
+INSTANTIATE_TYPED_TEST_SUITE_P(Float64, TestBatchToTensorRowMajor, DoubleType);
 
 }  // namespace arrow
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index aa50dd189a82d..a35919579541a 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -984,7 +984,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CRecordBatch] Slice(int64_t offset)
         shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length)
 
-        CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, CMemoryPool* pool) const
+        CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, c_bool row_major,
+                                              CMemoryPool* pool) const
 
     cdef cppclass CRecordBatchWithMetadata" arrow::RecordBatchWithMetadata":
         shared_ptr[CRecordBatch] batch
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 54fda1da7dcaf..6b3c7d0b56266 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -3389,20 +3389,24 @@ cdef class RecordBatch(_Tabular):
                 deref(c_record_batch).ToStructArray())
         return pyarrow_wrap_array(c_array)
 
-    def to_tensor(self, c_bool null_to_nan=False, MemoryPool memory_pool=None):
+    def to_tensor(self, c_bool null_to_nan=False, c_bool row_major=True, MemoryPool memory_pool=None):
         """
         Convert to a :class:`~pyarrow.Tensor`.
 
         RecordBatches that can be converted have fields of type signed or unsigned
-        integer or float, including all bit-widths. RecordBatches with validity bitmask
-        for any of the arrays can be converted with ``null_to_nan`` turned to ``True``.
-        In this case null values are converted to NaN and signed or unsigned integer
-        type arrays are promoted to appropriate float type.
+        integer or float, including all bit-widths.
+
+        ``null_to_nan`` is ``False`` by default and the method will raise an error if
+        any nulls are present. RecordBatches with nulls can be converted by setting
+        ``null_to_nan`` to ``True``. In that case null values are converted to ``NaN``
+        and integer type arrays are promoted to the appropriate float type.
 
         Parameters
         ----------
         null_to_nan : bool, default False
             Whether to write null values in the result as ``NaN``.
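The two keywords compose: with ``null_to_nan=True``, nulls become ``NaN`` and integer columns are promoted to a float type wide enough to hold them, independent of the layout chosen. A hedged sketch of the promotion rule stated in the tests further below (``uint16 + int16 + float32 = float64``), again assuming a pyarrow build with this patch:

```python
import numpy as np
import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3], type=pa.uint16()),
     pa.array([-1, -2, -3], type=pa.int16()),
     pa.array([0.5, None, 2.5], type=pa.float32())],
    names=["a", "b", "c"])

# uint16 + int16 + float32 promote to float64; the null becomes NaN.
t = batch.to_tensor(null_to_nan=True)
assert t.type == pa.float64()
expected = np.array([[1.0, -1.0, 0.5],
                     [2.0, -2.0, np.nan],
                     [3.0, -3.0, 2.5]])
np.testing.assert_equal(t.to_numpy(), expected)  # treats NaN == NaN
```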
+        row_major : bool, default True
+            Whether the resulting Tensor is row-major or column-major
         memory_pool : MemoryPool, default None
             For memory allocations, if required, otherwise use default pool
@@ -3424,13 +3428,29 @@ cdef class RecordBatch(_Tabular):
         a: [1,2,3,4,null]
         b: [10,20,30,40,null]
 
+        Convert a RecordBatch to row-major Tensor with null values
+        written as ``NaN``s
+
         >>> batch.to_tensor(null_to_nan=True)
         <pyarrow.Tensor>
         type: double
         shape: (5, 2)
-        strides: (8, 40)
-
+        strides: (16, 8)
         >>> batch.to_tensor(null_to_nan=True).to_numpy()
+        array([[ 1., 10.],
+               [ 2., 20.],
+               [ 3., 30.],
+               [ 4., 40.],
+               [nan, nan]])
+
+        Convert a RecordBatch to column-major Tensor
+
+        >>> batch.to_tensor(null_to_nan=True, row_major=False)
+        <pyarrow.Tensor>
+        type: double
+        shape: (5, 2)
+        strides: (8, 40)
+        >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy()
         array([[ 1., 10.],
                [ 2., 20.],
                [ 3., 30.],
@@ -3446,7 +3466,7 @@ cdef class RecordBatch(_Tabular):
         with nogil:
             c_tensor = GetResultValue(
                 deref(c_record_batch).ToTensor(null_to_nan,
-                                               pool))
+                                               row_major, pool))
 
         return pyarrow_wrap_tensor(c_tensor)
 
     def _export_to_c(self, out_ptr, out_schema_ptr=0):
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 8e30574188763..31d34058b61ef 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -926,36 +926,46 @@ def test_recordbatch_to_tensor_uniform_type(typ):
             pa.array(arr3, type=pa.from_numpy_dtype(typ)),
         ], ["a", "b", "c"]
     )
-    result = batch.to_tensor()
-    x = np.array([arr1, arr2, arr3], typ).transpose()
+    result = batch.to_tensor(row_major=False)
+    x = np.column_stack([arr1, arr2, arr3]).astype(typ, order="F")
     expected = pa.Tensor.from_numpy(x)
+    check_tensors(result, expected, pa.from_numpy_dtype(typ), 27)
 
+    result = batch.to_tensor()
+    x = np.column_stack([arr1, arr2, arr3]).astype(typ, order="C")
+    expected = pa.Tensor.from_numpy(x)
     check_tensors(result, expected, pa.from_numpy_dtype(typ), 27)
 
     # Test offset
     batch1 = batch.slice(1)
-    result = batch1.to_tensor()
-
     arr1 = [2, 3, 4, 5, 6, 7, 8, 9]
     arr2 = [20, 30, 40, 50, 60, 70, 80, 90]
     arr3 = [100, 100, 100, 100, 100, 100, 100, 100]
 
-    x = np.array([arr1, arr2, arr3], typ).transpose()
+    result = batch1.to_tensor(row_major=False)
+    x = np.column_stack([arr1, arr2, arr3]).astype(typ, order="F")
     expected = pa.Tensor.from_numpy(x)
+    check_tensors(result, expected, pa.from_numpy_dtype(typ), 24)
 
+    result = batch1.to_tensor()
+    x = np.column_stack([arr1, arr2, arr3]).astype(typ, order="C")
+    expected = pa.Tensor.from_numpy(x)
     check_tensors(result, expected, pa.from_numpy_dtype(typ), 24)
 
     batch2 = batch.slice(1, 5)
-    result = batch2.to_tensor()
-
     arr1 = [2, 3, 4, 5, 6]
     arr2 = [20, 30, 40, 50, 60]
     arr3 = [100, 100, 100, 100, 100]
 
-    x = np.array([arr1, arr2, arr3], typ).transpose()
+    result = batch2.to_tensor(row_major=False)
+    x = np.column_stack([arr1, arr2, arr3]).astype(typ, order="F")
     expected = pa.Tensor.from_numpy(x)
+    check_tensors(result, expected, pa.from_numpy_dtype(typ), 15)
 
+    result = batch2.to_tensor()
+    x = np.column_stack([arr1, arr2, arr3]).astype(typ, order="C")
+    expected = pa.Tensor.from_numpy(x)
     check_tensors(result, expected, pa.from_numpy_dtype(typ), 15)
@@ -970,11 +980,15 @@ def test_recordbatch_to_tensor_uniform_float_16():
             pa.array(np.array(arr3, dtype=np.float16), type=pa.float16()),
         ], ["a", "b", "c"]
     )
-    result = batch.to_tensor()
-    x = np.array([arr1, arr2, arr3], np.float16).transpose()
+    result = batch.to_tensor(row_major=False)
+    x = np.column_stack([arr1, arr2, arr3]).astype(np.float16, order="F")
     expected = pa.Tensor.from_numpy(x)
+    check_tensors(result, expected, pa.float16(), 27)
 
+    result = batch.to_tensor()
+    x = np.column_stack([arr1, arr2, arr3]).astype(np.float16, order="C")
+    expected = pa.Tensor.from_numpy(x)
     check_tensors(result, expected, pa.float16(), 27)
@@ -989,11 +1003,15 @@ def test_recordbatch_to_tensor_mixed_type():
             pa.array(arr2, type=pa.int16()),
         ], ["a", "b"]
     )
-    result = batch.to_tensor()
-    x = np.array([arr1, arr2], np.int32).transpose()
+    result = batch.to_tensor(row_major=False)
+    x = np.column_stack([arr1, arr2]).astype(np.int32, order="F")
     expected = pa.Tensor.from_numpy(x)
+    check_tensors(result, expected, pa.int32(), 18)
 
+    result = batch.to_tensor()
+    x = np.column_stack([arr1, arr2]).astype(np.int32, order="C")
+    expected = pa.Tensor.from_numpy(x)
     check_tensors(result, expected, pa.int32(), 18)
 
     # uint16 + int16 + float32 = float64
@@ -1004,9 +1022,18 @@ def test_recordbatch_to_tensor_mixed_type():
             pa.array(arr3, type=pa.float32()),
         ], ["a", "b", "c"]
     )
-    result = batch.to_tensor()
+    result = batch.to_tensor(row_major=False)
+    x = np.column_stack([arr1, arr2, arr3]).astype(np.float64, order="F")
+    expected = pa.Tensor.from_numpy(x)
 
-    x = np.array([arr1, arr2, arr3], np.float64).transpose()
+    np.testing.assert_equal(result.to_numpy(), x)
+    assert result.size == 27
+    assert result.type == pa.float64()
+    assert result.shape == expected.shape
+    assert result.strides == expected.strides
+
+    result = batch.to_tensor()
+    x = np.column_stack([arr1, arr2, arr3]).astype(np.float64, order="C")
     expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
@@ -1044,9 +1071,8 @@ def test_recordbatch_to_tensor_nan():
             pa.array(arr2, type=pa.float32()),
         ], ["a", "b"]
     )
-    result = batch.to_tensor()
-
-    x = np.array([arr1, arr2], np.float32).transpose()
+    result = batch.to_tensor(row_major=False)
+    x = np.column_stack([arr1, arr2]).astype(np.float32, order="F")
     expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
@@ -1071,9 +1097,8 @@ def test_recordbatch_to_tensor_null():
     ):
         batch.to_tensor()
 
-    result = batch.to_tensor(null_to_nan=True)
-
-    x = np.array([arr1, arr2], np.float64).transpose()
+    result = batch.to_tensor(null_to_nan=True, row_major=False)
+    x = np.column_stack([arr1, arr2]).astype(np.float64, order="F")
     expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
@@ -1090,7 +1115,7 @@ def test_recordbatch_to_tensor_null():
         ], ["a", "b"]
     )
 
-    result = batch.to_tensor(null_to_nan=True)
+    result = batch.to_tensor(null_to_nan=True, row_major=False)
 
     np.testing.assert_equal(result.to_numpy(), x)
     assert result.size == 18
@@ -1106,9 +1131,8 @@ def test_recordbatch_to_tensor_null():
         ], ["a", "b"]
    )
 
-    result = batch.to_tensor(null_to_nan=True)
-
-    x = np.array([arr1, arr2], np.float32).transpose()
+    result = batch.to_tensor(null_to_nan=True, row_major=False)
+    x = np.column_stack([arr1, arr2]).astype(np.float32, order="F")
     expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
@@ -1127,7 +1151,7 @@ def test_recordbatch_to_tensor_empty():
     )
     result = batch.to_tensor()
 
-    x = np.array([[], []], np.float32).transpose()
+    x = np.column_stack([[], []]).astype(np.float32, order="F")
     expected = pa.Tensor.from_numpy(x)
 
     assert result.size == expected.size
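One aside on the final hunk: `np.column_stack([[], []])` produces a `(0, 2)` float64 array (NumPy's default dtype for empty inputs), which is why the expected value still needs the `astype(np.float32, order="F")`. A quick check of that assumption:

```python
import numpy as np

# Each empty list becomes a (0,) float64 array, stacked into (0, 2).
x = np.column_stack([[], []])
assert x.shape == (0, 2) and x.dtype == np.float64

x32 = x.astype(np.float32, order="F")
assert x32.dtype == np.float32 and x32.shape == (0, 2)
```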