diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index e0eb3f1181a5c..8521d500f5c05 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -283,6 +283,35 @@ struct ConvertColumnsToTensorVisitor {
   }
 };
 
+template <typename Out>
+struct ConvertColumnsToTensorRowMajorVisitor {
+  Out*& out_values;
+  const ArrayData& in_data;
+  int num_cols;
+  int col_idx;
+
+  template <typename T>
+  Status Visit(const T&) {
+    if constexpr (is_numeric(T::type_id)) {
+      using In = typename T::c_type;
+      auto in_values = ArraySpan(in_data).GetSpan<In>(1, in_data.length);
+
+      if (in_data.null_count == 0) {
+        for (int64_t i = 0; i < in_data.length; ++i) {
+          out_values[i * num_cols + col_idx] = static_cast<Out>(in_values[i]);
+        }
+      } else {
+        for (int64_t i = 0; i < in_data.length; ++i) {
+          out_values[i * num_cols + col_idx] =
+              in_data.IsNull(i) ? static_cast<Out>(NAN) : static_cast<Out>(in_values[i]);
+        }
+      }
+      return Status::OK();
+    }
+    Unreachable();
+  }
+};
+
 template <typename Out>
 inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out,
                                    bool row_major) {
@@ -302,7 +331,7 @@ inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out,
   }
 }
 
-Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(bool null_to_nan,
+Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(bool null_to_nan, bool row_major,
                                                       MemoryPool* pool) const {
   if (num_columns() == 0) {
     return Status::TypeError(
diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
index 5202ff4abfa0b..cd647a88abd97 100644
--- a/cpp/src/arrow/record_batch.h
+++ b/cpp/src/arrow/record_batch.h
@@ -87,10 +87,12 @@ class ARROW_EXPORT RecordBatch {
   /// Generated Tensor will have column-major layout.
   ///
   /// \param[in] null_to_nan if true, convert nulls to NaN
+  /// \param[in] row_major if true, create a row-major Tensor, else column-major
   /// \param[in] pool the memory pool to allocate the tensor buffer
   /// \return the resulting Tensor
   Result<std::shared_ptr<Tensor>> ToTensor(
-      bool null_to_nan = false, MemoryPool* pool = default_memory_pool()) const;
+      bool null_to_nan = false, bool row_major = true,
+      MemoryPool* pool = default_memory_pool()) const;
 
   /// \brief Construct record batch from struct array
   ///
diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc
index 2557db8f624ee..daf7109075eab 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -779,7 +779,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
 
   auto batch = RecordBatch::Make(schema, length, {a0, a1});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       batch->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
   ASSERT_OK(tensor->Validate());
 
   std::vector<int64_t> shape = {9, 2};
@@ -794,6 +795,19 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
 
   CheckTensor<DoubleType>(tensor, 18, shape, f_strides);
 
+  ASSERT_OK_AND_ASSIGN(auto tensor_row, batch->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor_row->Validate());
+
+  std::vector<int64_t> strides = {f64_size * shape[1], f64_size};
+  std::shared_ptr<Tensor> tensor_expected_row = TensorFromJSON(
+      float64(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]",
+      shape, strides);
+
+  EXPECT_FALSE(tensor_expected_row->Equals(*tensor_row));
+  EXPECT_TRUE(tensor_expected_row->Equals(*tensor_row, EqualOptions().nans_equal(true)));
+
+  CheckTensorRowMajor<DoubleType>(tensor_row, 18, shape, strides);
+
   // int32 -> float64
   auto f2 = field("f2", int32());
 
@@ -803,7 +817,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
   auto a2 = ArrayFromJSON(int32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]");
   auto batch1 = RecordBatch::Make(schema1, length, {a0, a2});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor1, batch1->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK_AND_ASSIGN(auto tensor1,
+                       batch1->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
   ASSERT_OK(tensor1->Validate());
 
   EXPECT_FALSE(tensor_expected->Equals(*tensor1));
@@ -811,6 +826,14 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
 
   CheckTensor<DoubleType>(tensor1, 18, shape, f_strides);
 
+  ASSERT_OK_AND_ASSIGN(auto tensor1_row, batch1->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor1_row->Validate());
+
+  EXPECT_FALSE(tensor_expected_row->Equals(*tensor1_row));
+  EXPECT_TRUE(tensor_expected_row->Equals(*tensor1_row, EqualOptions().nans_equal(true)));
+
+  CheckTensorRowMajor<DoubleType>(tensor1_row, 18, shape, strides);
+
   // int8 -> float32
   auto f3 = field("f3", int8());
   auto f4 = field("f4", int8());
@@ -822,7 +845,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
   auto a4 = ArrayFromJSON(int8(), "[10, 20, 30, 40, null, 60, 70, 80, 90]");
   auto batch2 = RecordBatch::Make(schema2, length, {a3, a4});
 
-  ASSERT_OK_AND_ASSIGN(auto tensor2, batch2->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK_AND_ASSIGN(auto tensor2,
+                       batch2->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
   ASSERT_OK(tensor2->Validate());
 
   const int64_t f32_size = sizeof(float);
@@ -835,6 +859,20 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) {
   EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true)));
 
   CheckTensor<FloatType>(tensor2, 18, shape, f_strides_2);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor2_row, batch2->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor2_row->Validate());
+
+  std::vector<int64_t> strides_2 = {f32_size * shape[1], f32_size};
+  std::shared_ptr<Tensor> tensor2_expected_row = TensorFromJSON(
+      float32(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]",
+      shape, strides_2);
+
+  EXPECT_FALSE(tensor2_expected_row->Equals(*tensor2_row));
+  EXPECT_TRUE(
+      tensor2_expected_row->Equals(*tensor2_row, EqualOptions().nans_equal(true)));
+
+  CheckTensorRowMajor<FloatType>(tensor2_row, 18, shape, strides_2);
 }
 
 TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) {
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index aebcf96fe3861..f461513e8b3cf 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -985,7 +985,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CRecordBatch] Slice(int64_t offset)
         shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length)
 
-        CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, CMemoryPool* pool) const
+        CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, c_bool row_major,
+                                              CMemoryPool* pool) const
 
     cdef cppclass CRecordBatchWithMetadata" arrow::RecordBatchWithMetadata":
         shared_ptr[CRecordBatch] batch
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index e82bf81c25c09..379bb82ea6ede 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -3469,20 +3469,24 @@ cdef class RecordBatch(_Tabular):
                 deref(c_record_batch).ToStructArray())
         return pyarrow_wrap_array(c_array)
 
-    def to_tensor(self, c_bool null_to_nan=False, MemoryPool memory_pool=None):
+    def to_tensor(self, c_bool null_to_nan=False, c_bool row_major=True, MemoryPool memory_pool=None):
         """
         Convert to a :class:`~pyarrow.Tensor`.
 
         RecordBatches that can be converted have fields of type signed or unsigned
-        integer or float, including all bit-widths. RecordBatches with validity bitmask
-        for any of the arrays can be converted with ``null_to_nan`` turned to ``True``.
-        In this case null values are converted to NaN and signed or unsigned integer
-        type arrays are promoted to appropriate float type.
+        integer or float, including all bit-widths.
+
+        ``null_to_nan`` is ``False`` by default and this method will raise an error in
+        case any nulls are present. RecordBatches with nulls can be converted with
+        ``null_to_nan`` set to ``True``. In this case null values are converted to
+        ``NaN`` and integer type arrays are promoted to the appropriate float type.
 
         Parameters
         ----------
         null_to_nan : bool, default False
             Whether to write null values in the result as ``NaN``.
+        row_major : bool, default True
+            Whether the resulting Tensor is row-major or column-major.
         memory_pool : MemoryPool, default None
             For memory allocations, if required, otherwise use default pool
 
@@ -3504,13 +3508,29 @@ cdef class RecordBatch(_Tabular):
         a: [1,2,3,4,null]
         b: [10,20,30,40,null]
 
+        Convert a RecordBatch to row-major Tensor with null values
+        written as ``NaN``s
+
         >>> batch.to_tensor(null_to_nan=True)
         <pyarrow.Tensor>
         type: double
         shape: (5, 2)
-        strides: (8, 40)
-
+        strides: (16, 8)
         >>> batch.to_tensor(null_to_nan=True).to_numpy()
+        array([[ 1., 10.],
+               [ 2., 20.],
+               [ 3., 30.],
+               [ 4., 40.],
+               [nan, nan]])
+
+        Convert a RecordBatch to column-major Tensor
+
+        >>> batch.to_tensor(null_to_nan=True, row_major=False)
+        <pyarrow.Tensor>
+        type: double
+        shape: (5, 2)
+        strides: (8, 40)
+        >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy()
         array([[ 1., 10.],
                [ 2., 20.],
                [ 3., 30.],
@@ -3526,7 +3546,7 @@ cdef class RecordBatch(_Tabular):
         with nogil:
             c_tensor = GetResultValue(
                 deref(c_record_batch).ToTensor(null_to_nan,
-                                               pool))
+                                               row_major, pool))
         return pyarrow_wrap_tensor(c_tensor)
 
     def _export_to_c(self, out_ptr, out_schema_ptr=0):
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index fce457cc4003a..a58010d083e92 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -1139,9 +1139,8 @@ def test_recordbatch_to_tensor_null():
     ):
         batch.to_tensor()
 
-    result = batch.to_tensor(null_to_nan=True)
-
-    x = np.array([arr1, arr2], np.float64).transpose()
+    result = batch.to_tensor(null_to_nan=True, row_major=False)
+    x = np.column_stack([arr1, arr2]).astype(np.float64, order="F")
    expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
@@ -1158,7 +1157,7 @@ def test_recordbatch_to_tensor_null():
         ],
         ["a", "b"]
     )
-    result = batch.to_tensor(null_to_nan=True)
+    result = batch.to_tensor(null_to_nan=True, row_major=False)
 
     np.testing.assert_equal(result.to_numpy(), x)
     assert result.size == 18
@@ -1174,9 +1173,8 @@ def test_recordbatch_to_tensor_null():
         ],
         ["a", "b"]
     )
-    result = batch.to_tensor(null_to_nan=True)
-
-    x = np.array([arr1, arr2], np.float32).transpose()
+    result = batch.to_tensor(null_to_nan=True, row_major=False)
+    x = np.column_stack([arr1, arr2]).astype(np.float32, order="F")
     expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
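
For reference, a quick usage sketch of the new ``row_major`` argument (not part of the patch; it mirrors the table.pxi doctest above). The two layouts hold the same logical values and differ only in their strides:

    import numpy as np
    import pyarrow as pa

    batch = pa.record_batch(
        [pa.array([1, 2, 3, 4, None]), pa.array([10, 20, 30, 40, None])],
        names=["a", "b"],
    )

    # Nulls become NaN; the integer columns are promoted to float64.
    row = batch.to_tensor(null_to_nan=True)                   # row-major (new default)
    col = batch.to_tensor(null_to_nan=True, row_major=False)  # column-major

    assert row.strides == (16, 8)   # float64: row stride = itemsize * num_columns
    assert col.strides == (8, 40)   # column stride = itemsize * num_rows

    # Identical values either way (assert_equal treats matching NaNs as equal).
    np.testing.assert_equal(row.to_numpy(), col.to_numpy())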