From b66d17fe8309b125e6da040b0e38e95815117a4a Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 29 Mar 2023 11:26:37 +0530 Subject: [PATCH 01/16] feat: Support for Arrow-Velox conversion --- pyvelox/pyvelox.h | 15 +++++++++++++++ pyvelox/test/test_vector.py | 31 +++++++++++++++++++++++++++++++ setup.py | 1 + 3 files changed, 47 insertions(+) diff --git a/pyvelox/pyvelox.h b/pyvelox/pyvelox.h index 48c40f906220..80819f102f1d 100644 --- a/pyvelox/pyvelox.h +++ b/pyvelox/pyvelox.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "folly/json.h" @@ -451,6 +452,20 @@ static void addVectorBindings( std::move(baseVector), PyVeloxContext::getSingletonInstance().pool()); }); + + m.def("export_to_arrow", [](VectorPtr& inputVector){ + auto arrowArray = new ArrowArray(); + facebook::velox::exportToArrow(inputVector, *arrowArray); + inputVector.reset(); + return reinterpret_cast(arrowArray); + }); + + m.def("import_from_arrow", [](uintptr_t arrowArrayPtr, uintptr_t arrowSchemaPtr){ + auto arrowArray = reinterpret_cast(arrowArrayPtr); + auto arrowSchema = reinterpret_cast(arrowSchemaPtr); + return importFromArrowAsOwner(*arrowSchema, *arrowArray); + }); + } static void addExpressionBindings( diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index 74a352aafcc6..9383fe999ebc 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pyarrow as pa +from pyarrow.cffi import ffi import unittest import pyvelox.pyvelox as pv @@ -273,3 +275,32 @@ def test_slice(self): with self.assertRaises(NotImplementedError): e = a[3:8:3] + + def test_export_to_arrow(self): + vector = pv.from_list([1, 2, 3]) + vector_ptr = pv.export_to_arrow(vector) + + self.assertTrue(isinstance(vector_ptr, int)) + + arrow_arr = pa.Array._import_from_c(vector_ptr, pa.int64()) + + self.assertEqual(arrow_arr.type, pa.int64()) + self.assertEqual(len(arrow_arr), 3) + self.assertListEqual(arrow_arr.tolist(), [1,2,3]) + + def test_import_from_arrow(self): + c_schema = ffi.new("struct ArrowSchema*") + schema_ptr = int(ffi.cast("uintptr_t",c_schema)) + + c_array = ffi.new("struct ArrowArray*") + array_ptr = int(ffi.cast("uintptr_t", c_array)) + + arr = pa.array([1, 2, 3], type=pa.int32()) + arr._export_to_c(array_ptr, schema_ptr) + + velox_vector = pv.import_from_arrow(array_ptr, schema_ptr) + + self.assertEqual(velox_vector.size(), 3) + self.assertTrue(velox_vector.dtype(), pv.IntegerType()) + for i in range(0, 3): + self.assertEqual(velox_vector[i], i+1) diff --git a/setup.py b/setup.py index 7f18d8e1f1db..fa64f1b0b77e 100644 --- a/setup.py +++ b/setup.py @@ -173,6 +173,7 @@ def build_extension(self, ext): "tabulate", "typing-inspect", ], + tests_require=['pyarrow'], python_requires=">=3.7", classifiers=[ "Intended Audience :: Developers", From d52ecfa4a29fedcb234eec4d4850ad4063c0600f Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 29 Mar 2023 19:18:26 +0530 Subject: [PATCH 02/16] fix: passing default memory pool while converting --- pyvelox/pyvelox.h | 20 ++++++++++++-------- pyvelox/test/test_vector.py | 6 +++--- setup.py | 2 +- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pyvelox/pyvelox.h b/pyvelox/pyvelox.h index 80819f102f1d..24e78c521320 100644 --- a/pyvelox/pyvelox.h +++ b/pyvelox/pyvelox.h @@ -28,8 +28,8 @@ #include #include #include -#include #include +#include #include "folly/json.h" #include "context.h" @@ -453,18 +453,22 @@ static void addVectorBindings( PyVeloxContext::getSingletonInstance().pool()); }); - m.def("export_to_arrow", [](VectorPtr& inputVector){ + m.def("export_to_arrow", [](VectorPtr& inputVector) { auto arrowArray = new ArrowArray(); - facebook::velox::exportToArrow(inputVector, *arrowArray); + std::shared_ptr pool_{facebook::velox::memory::getDefaultMemoryPool()}; + facebook::velox::exportToArrow(inputVector, *arrowArray, pool_.get()); inputVector.reset(); return reinterpret_cast(arrowArray); }); - m.def("import_from_arrow", [](uintptr_t arrowArrayPtr, uintptr_t arrowSchemaPtr){ - auto arrowArray = reinterpret_cast(arrowArrayPtr); - auto arrowSchema = reinterpret_cast(arrowSchemaPtr); - return importFromArrowAsOwner(*arrowSchema, *arrowArray); - }); + m.def( + "import_from_arrow", + [](uintptr_t arrowArrayPtr, uintptr_t arrowSchemaPtr) { + auto arrowArray = reinterpret_cast(arrowArrayPtr); + auto arrowSchema = reinterpret_cast(arrowSchemaPtr); + std::shared_ptr pool_{facebook::velox::memory::getDefaultMemoryPool()}; + return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_.get()); + }); } diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index 9383fe999ebc..d15cbc24f143 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -286,11 +286,11 @@ def test_export_to_arrow(self): self.assertEqual(arrow_arr.type, pa.int64()) self.assertEqual(len(arrow_arr), 3) - self.assertListEqual(arrow_arr.tolist(), [1,2,3]) + self.assertListEqual(arrow_arr.tolist(), [1, 2, 3]) def test_import_from_arrow(self): c_schema = ffi.new("struct ArrowSchema*") - schema_ptr = int(ffi.cast("uintptr_t",c_schema)) + schema_ptr = int(ffi.cast("uintptr_t", c_schema)) c_array = ffi.new("struct ArrowArray*") array_ptr = int(ffi.cast("uintptr_t", c_array)) @@ -303,4 +303,4 @@ def test_import_from_arrow(self): self.assertEqual(velox_vector.size(), 3) self.assertTrue(velox_vector.dtype(), pv.IntegerType()) for i in range(0, 3): - self.assertEqual(velox_vector[i], i+1) + self.assertEqual(velox_vector[i], i + 1) diff --git a/setup.py b/setup.py index fa64f1b0b77e..aac64bf43a03 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,7 @@ def build_extension(self, ext): "tabulate", "typing-inspect", ], - tests_require=['pyarrow'], + tests_require=["pyarrow"], python_requires=">=3.7", classifiers=[ "Intended Audience :: Developers", From 6c4b5ad1c0f8e5ff5dc06616da4e665e38dba327 Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Tue, 4 Apr 2023 19:50:26 +0200 Subject: [PATCH 03/16] format --- pyvelox/pyvelox.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyvelox/pyvelox.h b/pyvelox/pyvelox.h index 24e78c521320..576884397abd 100644 --- a/pyvelox/pyvelox.h +++ b/pyvelox/pyvelox.h @@ -455,7 +455,8 @@ static void addVectorBindings( m.def("export_to_arrow", [](VectorPtr& inputVector) { auto arrowArray = new ArrowArray(); - std::shared_ptr pool_{facebook::velox::memory::getDefaultMemoryPool()}; + std::shared_ptr pool_{ + facebook::velox::memory::getDefaultMemoryPool()}; facebook::velox::exportToArrow(inputVector, *arrowArray, pool_.get()); inputVector.reset(); return reinterpret_cast(arrowArray); @@ -466,10 +467,10 @@ static void addVectorBindings( [](uintptr_t arrowArrayPtr, uintptr_t arrowSchemaPtr) { auto arrowArray = reinterpret_cast(arrowArrayPtr); auto arrowSchema = reinterpret_cast(arrowSchemaPtr); - std::shared_ptr pool_{facebook::velox::memory::getDefaultMemoryPool()}; + std::shared_ptr pool_{ + facebook::velox::memory::getDefaultMemoryPool()}; return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_.get()); }); - } static void addExpressionBindings( From d93409d0efc2f3a994e621a0236ae52334080115 Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Tue, 4 Apr 2023 19:50:44 +0200 Subject: [PATCH 04/16] use extras_require for test deps --- .github/workflows/build_pyvelox.yml | 1 + Makefile | 5 +++-- setup.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_pyvelox.yml b/.github/workflows/build_pyvelox.yml index 362b289d63fa..bb01d8e804ce 100644 --- a/.github/workflows/build_pyvelox.yml +++ b/.github/workflows/build_pyvelox.yml @@ -127,6 +127,7 @@ jobs: cp -R /host${{ github.workspace }}/.ccache /output/.ccache && ccache -s CIBW_ENVIRONMENT_PASS_LINUX: CCACHE_DIR BUILD_VERSION + CIBW_TEST_EXTRAS: "tests" CIBW_TEST_COMMAND: "cd {project}/pyvelox && python -m unittest -v" CIBW_TEST_SKIP: "*macos*" CCACHE_DIR: "${{ matrix.os != 'macos-11' && '/output' || github.workspace }}/.ccache" diff --git a/Makefile b/Makefile index f161e93a7f40..18f2ae18a7ab 100644 --- a/Makefile +++ b/Makefile @@ -159,7 +159,8 @@ python-clean: DEBUG=1 ${PYTHON_EXECUTABLE} setup.py clean python-build: - DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} setup.py develop + DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) -python-test: python-build +python-test: + $(MAKE) python-build extras="[tests]" DEBUG=1 ${PYTHON_EXECUTABLE} -m unittest -v diff --git a/setup.py b/setup.py index aac64bf43a03..662ce4c129f5 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,7 @@ def build_extension(self, ext): "tabulate", "typing-inspect", ], - tests_require=["pyarrow"], + extras_require={"tests": ["pyarrow"]}, python_requires=">=3.7", classifiers=[ "Intended Audience :: Developers", From 2aa27b0baed4fd018d3c3f4007b67f059ecf5641 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 25 Apr 2023 04:27:21 +0530 Subject: [PATCH 05/16] feat: separate module for the conversion functions --- pyvelox/CMakeLists.txt | 2 +- pyvelox/conversion.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++ pyvelox/conversion.h | 35 +++++++++++++++++++++++++++++++++ pyvelox/pyvelox.cpp | 8 ++++++++ pyvelox/pyvelox.h | 19 ------------------ 5 files changed, 88 insertions(+), 20 deletions(-) create mode 100644 pyvelox/conversion.cpp create mode 100644 pyvelox/conversion.h diff --git a/pyvelox/CMakeLists.txt b/pyvelox/CMakeLists.txt index 3b40a43176c2..d536c35da059 100644 --- a/pyvelox/CMakeLists.txt +++ b/pyvelox/CMakeLists.txt @@ -17,7 +17,7 @@ if(VELOX_BUILD_PYTHON_PACKAGE) include_directories(SYSTEM ${CMAKE_SOURCE_DIR}) add_definitions(-DCREATE_PYVELOX_MODULE -DVELOX_DISABLE_GOOGLETEST) # Define our Python module: - pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp) + pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp conversion.cpp) # Link with Velox: target_link_libraries( pyvelox diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp new file mode 100644 index 000000000000..2e74ff9fb7d1 --- /dev/null +++ b/pyvelox/conversion.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "conversion.h" +#include +#include + +namespace facebook::velox::py { + +namespace py = pybind11; + +void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { + m.def("export_to_arrow", [](VectorPtr& inputVector) { + auto arrowArray = new ArrowArray(); + std::shared_ptr pool_{ + facebook::velox::memory::addDefaultLeafMemoryPool()}; + facebook::velox::exportToArrow(inputVector, *arrowArray, pool_.get()); + return reinterpret_cast(arrowArray); + }); + + m.def( + "import_from_arrow", + [](uintptr_t arrowArrayPtr, uintptr_t arrowSchemaPtr) { + auto arrowArray = reinterpret_cast(arrowArrayPtr); + auto arrowSchema = reinterpret_cast(arrowSchemaPtr); + std::shared_ptr pool_{ + facebook::velox::memory::addDefaultLeafMemoryPool()}; + return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_.get()); + }); +} +} // namespace facebook::velox::py diff --git a/pyvelox/conversion.h b/pyvelox/conversion.h new file mode 100644 index 000000000000..7deaa7806677 --- /dev/null +++ b/pyvelox/conversion.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + + +namespace facebook::velox::py { + +namespace py = pybind11; + +/// Adds bindings for arrow-velox conversion functions to module m. +/// +/// @param m Module to add bindings to. +/// @param asModuleLocalDefinitions If true then these bindings are only +/// visible inside the module. Refer to +/// https://pybind11.readthedocs.io/en/stable/advanced/classes.html#module-local-class-bindings +/// for further details. +void addConversionBindings(py::module& m, bool asModuleLocalDefinitions = true); + +} // namespace facebook::velox::py diff --git a/pyvelox/pyvelox.cpp b/pyvelox/pyvelox.cpp index 838bd7557049..6fd3f351d862 100644 --- a/pyvelox/pyvelox.cpp +++ b/pyvelox/pyvelox.cpp @@ -15,7 +15,11 @@ */ #include "pyvelox.h" +<<<<<<< HEAD #include "serde.h" +======= +#include "conversion.h" +>>>>>>> d2251b96 (feat: separate module for the conversion functions) #include "signatures.h" namespace facebook::velox::py { @@ -293,7 +297,11 @@ PYBIND11_MODULE(pyvelox, m) { addVeloxBindings(m); addSignatureBindings(m); +<<<<<<< HEAD addSerdeBindings(m); +======= + addConversionBindings(m); +>>>>>>> d2251b96 (feat: separate module for the conversion functions) m.attr("__version__") = "dev"; } #endif diff --git a/pyvelox/pyvelox.h b/pyvelox/pyvelox.h index 576884397abd..8775a39a10cc 100644 --- a/pyvelox/pyvelox.h +++ b/pyvelox/pyvelox.h @@ -452,25 +452,6 @@ static void addVectorBindings( std::move(baseVector), PyVeloxContext::getSingletonInstance().pool()); }); - - m.def("export_to_arrow", [](VectorPtr& inputVector) { - auto arrowArray = new ArrowArray(); - std::shared_ptr pool_{ - facebook::velox::memory::getDefaultMemoryPool()}; - facebook::velox::exportToArrow(inputVector, *arrowArray, pool_.get()); - inputVector.reset(); - return reinterpret_cast(arrowArray); - }); - - m.def( - "import_from_arrow", - [](uintptr_t arrowArrayPtr, uintptr_t arrowSchemaPtr) { - auto arrowArray = reinterpret_cast(arrowArrayPtr); - auto arrowSchema = reinterpret_cast(arrowSchemaPtr); - std::shared_ptr pool_{ - facebook::velox::memory::getDefaultMemoryPool()}; - return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_.get()); - }); } static void addExpressionBindings( From 44271a2aef294868e39838b73a589a0685149a4e Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 25 Apr 2023 04:57:39 +0530 Subject: [PATCH 06/16] feat: import_from_arrow now can work only with the pyarrow.Array object --- pyvelox/conversion.cpp | 7 ++++--- pyvelox/test/test_vector.py | 10 +--------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp index 2e74ff9fb7d1..57eed68d669b 100644 --- a/pyvelox/conversion.cpp +++ b/pyvelox/conversion.cpp @@ -33,9 +33,10 @@ void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { m.def( "import_from_arrow", - [](uintptr_t arrowArrayPtr, uintptr_t arrowSchemaPtr) { - auto arrowArray = reinterpret_cast(arrowArrayPtr); - auto arrowSchema = reinterpret_cast(arrowSchemaPtr); + [](py::object inputArrowArray) { + auto arrowArray = new ArrowArray(); + auto arrowSchema = new ArrowSchema(); + inputArrowArray.attr("_export_to_c")(reinterpret_cast(arrowArray), reinterpret_cast(arrowSchema)); std::shared_ptr pool_{ facebook::velox::memory::addDefaultLeafMemoryPool()}; return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_.get()); diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index d15cbc24f143..3df2fb260ff2 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -289,16 +289,8 @@ def test_export_to_arrow(self): self.assertListEqual(arrow_arr.tolist(), [1, 2, 3]) def test_import_from_arrow(self): - c_schema = ffi.new("struct ArrowSchema*") - schema_ptr = int(ffi.cast("uintptr_t", c_schema)) - - c_array = ffi.new("struct ArrowArray*") - array_ptr = int(ffi.cast("uintptr_t", c_array)) - arr = pa.array([1, 2, 3], type=pa.int32()) - arr._export_to_c(array_ptr, schema_ptr) - - velox_vector = pv.import_from_arrow(array_ptr, schema_ptr) + velox_vector = pv.import_from_arrow(arr) self.assertEqual(velox_vector.size(), 3) self.assertTrue(velox_vector.dtype(), pv.IntegerType()) From 3ff09cde6e39e6402933dbd441c65da803398fe3 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 25 Apr 2023 05:14:48 +0530 Subject: [PATCH 07/16] feat: add roundtrip testing for conversion functions --- pyvelox/test/test_vector.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index 3df2fb260ff2..96fc5faabfd6 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -13,10 +13,9 @@ # limitations under the License. import pyarrow as pa -from pyarrow.cffi import ffi +import pyvelox.pyvelox as pv import unittest -import pyvelox.pyvelox as pv class TestVeloxVector(unittest.TestCase): @@ -296,3 +295,14 @@ def test_import_from_arrow(self): self.assertTrue(velox_vector.dtype(), pv.IntegerType()) for i in range(0, 3): self.assertEqual(velox_vector[i], i + 1) + + def test_roundtrip_conversion(self): + vector = pv.from_list([1, 2, 3]) + vector_ptr = pv.export_to_arrow(vector) + arrow_arr = pa.Array._import_from_c(vector_ptr, pa.int64()) + + velox_vector = pv.import_from_arrow(arrow_arr) + self.assertEqual(velox_vector.size(), 3) + self.assertTrue(velox_vector.dtype(), pv.IntegerType()) + for i in range(0, 3): + self.assertEqual(velox_vector[i], i + 1) From 193179194febdceb96d3ad83f4d34eb8d5a7992d Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 25 Apr 2023 10:15:23 +0530 Subject: [PATCH 08/16] feat: using smart pointers for conversion memory movement --- pyvelox/conversion.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp index 57eed68d669b..67f2a4b9e99a 100644 --- a/pyvelox/conversion.cpp +++ b/pyvelox/conversion.cpp @@ -24,19 +24,19 @@ namespace py = pybind11; void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { m.def("export_to_arrow", [](VectorPtr& inputVector) { - auto arrowArray = new ArrowArray(); + auto arrowArray = std::make_shared(); std::shared_ptr pool_{ facebook::velox::memory::addDefaultLeafMemoryPool()}; facebook::velox::exportToArrow(inputVector, *arrowArray, pool_.get()); - return reinterpret_cast(arrowArray); + return reinterpret_cast(arrowArray.get()); }); m.def( "import_from_arrow", [](py::object inputArrowArray) { - auto arrowArray = new ArrowArray(); - auto arrowSchema = new ArrowSchema(); - inputArrowArray.attr("_export_to_c")(reinterpret_cast(arrowArray), reinterpret_cast(arrowSchema)); + auto arrowArray = std::make_unique(); + auto arrowSchema = std::make_unique(); + inputArrowArray.attr("_export_to_c")(reinterpret_cast(arrowArray.get()), reinterpret_cast(arrowSchema.get())); std::shared_ptr pool_{ facebook::velox::memory::addDefaultLeafMemoryPool()}; return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_.get()); From 5f42b1e97d140510906044b1a846577109bc1847 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 15 May 2023 14:11:27 +0530 Subject: [PATCH 09/16] feat: pv.export_to_arrow() should return a pyarrow.array object and not memory address --- pyvelox/conversion.cpp | 10 ++++++++-- pyvelox/test/test_vector.py | 17 ++++++----------- setup.py | 1 + 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp index 67f2a4b9e99a..be0dd0d2ba78 100644 --- a/pyvelox/conversion.cpp +++ b/pyvelox/conversion.cpp @@ -24,11 +24,17 @@ namespace py = pybind11; void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { m.def("export_to_arrow", [](VectorPtr& inputVector) { - auto arrowArray = std::make_shared(); + auto arrowArray = std::make_unique(); std::shared_ptr pool_{ facebook::velox::memory::addDefaultLeafMemoryPool()}; facebook::velox::exportToArrow(inputVector, *arrowArray, pool_.get()); - return reinterpret_cast(arrowArray.get()); + + auto arrowSchema = std::make_unique(); + facebook::velox::exportToArrow(inputVector, *arrowSchema); + + py::module arrow_module = py::module::import("pyarrow"); + py::object array_class = arrow_module.attr("Array"); + return array_class.attr("_import_from_c")(reinterpret_cast(arrowArray.get()), reinterpret_cast(arrowSchema.get())); }); m.def( diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index 96fc5faabfd6..d66594f394c6 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -277,15 +277,11 @@ def test_slice(self): def test_export_to_arrow(self): vector = pv.from_list([1, 2, 3]) - vector_ptr = pv.export_to_arrow(vector) + array = pv.export_to_arrow(vector) - self.assertTrue(isinstance(vector_ptr, int)) - - arrow_arr = pa.Array._import_from_c(vector_ptr, pa.int64()) - - self.assertEqual(arrow_arr.type, pa.int64()) - self.assertEqual(len(arrow_arr), 3) - self.assertListEqual(arrow_arr.tolist(), [1, 2, 3]) + self.assertEqual(array.type, pa.int64()) + self.assertEqual(len(array), 3) + self.assertListEqual(array.tolist(), [1, 2, 3]) def test_import_from_arrow(self): arr = pa.array([1, 2, 3], type=pa.int32()) @@ -298,10 +294,9 @@ def test_import_from_arrow(self): def test_roundtrip_conversion(self): vector = pv.from_list([1, 2, 3]) - vector_ptr = pv.export_to_arrow(vector) - arrow_arr = pa.Array._import_from_c(vector_ptr, pa.int64()) + array = pv.export_to_arrow(vector) - velox_vector = pv.import_from_arrow(arrow_arr) + velox_vector = pv.import_from_arrow(array) self.assertEqual(velox_vector.size(), 3) self.assertTrue(velox_vector.dtype(), pv.IntegerType()) for i in range(0, 3): diff --git a/setup.py b/setup.py index 662ce4c129f5..8ec26c7d3aab 100644 --- a/setup.py +++ b/setup.py @@ -172,6 +172,7 @@ def build_extension(self, ext): "typing", "tabulate", "typing-inspect", + "pyarrow", ], extras_require={"tests": ["pyarrow"]}, python_requires=">=3.7", From 703d84623372acafc73a668ff8aa8c9825c3a300 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 16 May 2023 15:29:41 +0530 Subject: [PATCH 10/16] fix: moved MemoryPool instance to util header, updated tests --- pyvelox/conversion.cpp | 11 +++---- pyvelox/pyvelox.h | 2 +- pyvelox/test/test_vector.py | 63 +++++++++++++++++++++++------------ pyvelox/util.h | 66 +++++++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 28 deletions(-) create mode 100644 pyvelox/util.h diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp index be0dd0d2ba78..f59634970d9b 100644 --- a/pyvelox/conversion.cpp +++ b/pyvelox/conversion.cpp @@ -15,6 +15,7 @@ */ #include "conversion.h" +#include "util.h" #include #include @@ -25,9 +26,8 @@ namespace py = pybind11; void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { m.def("export_to_arrow", [](VectorPtr& inputVector) { auto arrowArray = std::make_unique(); - std::shared_ptr pool_{ - facebook::velox::memory::addDefaultLeafMemoryPool()}; - facebook::velox::exportToArrow(inputVector, *arrowArray, pool_.get()); + auto pool_ = PyVeloxContext::getInstance().pool(); + facebook::velox::exportToArrow(inputVector, *arrowArray, pool_); auto arrowSchema = std::make_unique(); facebook::velox::exportToArrow(inputVector, *arrowSchema); @@ -43,9 +43,8 @@ void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { auto arrowArray = std::make_unique(); auto arrowSchema = std::make_unique(); inputArrowArray.attr("_export_to_c")(reinterpret_cast(arrowArray.get()), reinterpret_cast(arrowSchema.get())); - std::shared_ptr pool_{ - facebook::velox::memory::addDefaultLeafMemoryPool()}; - return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_.get()); + auto pool_ = PyVeloxContext::getInstance().pool(); + return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_); }); } } // namespace facebook::velox::py diff --git a/pyvelox/pyvelox.h b/pyvelox/pyvelox.h index 8775a39a10cc..329b6b5b8b02 100644 --- a/pyvelox/pyvelox.h +++ b/pyvelox/pyvelox.h @@ -29,8 +29,8 @@ #include #include #include -#include #include "folly/json.h" +#include "util.h" #include "context.h" diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index d66594f394c6..b436297b162f 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -276,28 +276,49 @@ def test_slice(self): e = a[3:8:3] def test_export_to_arrow(self): - vector = pv.from_list([1, 2, 3]) - array = pv.export_to_arrow(vector) - - self.assertEqual(array.type, pa.int64()) - self.assertEqual(len(array), 3) - self.assertListEqual(array.tolist(), [1, 2, 3]) + test_cases = [ + ([1, 2, 3], pa.int64()), + ([1.1, 2.2, 3.3], pa.float64()), + (['ab', 'bc', 'ca'], pa.string()) + ] + for data, expected_type in test_cases: + with self.subTest(data=data): + vector = pv.from_list(data) + array = pv.export_to_arrow(vector) + + self.assertEqual(array.type, expected_type) + self.assertEqual(len(array), len(data)) + self.assertListEqual(array.tolist(), data) def test_import_from_arrow(self): - arr = pa.array([1, 2, 3], type=pa.int32()) - velox_vector = pv.import_from_arrow(arr) - - self.assertEqual(velox_vector.size(), 3) - self.assertTrue(velox_vector.dtype(), pv.IntegerType()) - for i in range(0, 3): - self.assertEqual(velox_vector[i], i + 1) + test_cases = [ + ([11, 26, 31], pa.int64(), pv.IntegerType()), + ([0.1, 2.5, 3.9], pa.float64(), pv.DoubleType()), + (['az', 'by', 'cx'], pa.string(), pv.VarcharType()) + ] + for data, dtype, expected_type in test_cases: + with self.subTest(data=data): + array = pa.array(data, type=dtype) + velox_vector = pv.import_from_arrow(array) + + self.assertEqual(velox_vector.size(), len(data)) + self.assertTrue(velox_vector.dtype(), expected_type) + for i in range(0, len(data)): + self.assertEqual(velox_vector[i], data[i]) def test_roundtrip_conversion(self): - vector = pv.from_list([1, 2, 3]) - array = pv.export_to_arrow(vector) - - velox_vector = pv.import_from_arrow(array) - self.assertEqual(velox_vector.size(), 3) - self.assertTrue(velox_vector.dtype(), pv.IntegerType()) - for i in range(0, 3): - self.assertEqual(velox_vector[i], i + 1) + test_cases = [ + ([41, 92, 13], pv.IntegerType()), + ([17.19, 22.25, 13.3], pv.DoubleType()), + (['aa1', 'bb2', 'cc3'], pv.VarcharType()) + ] + for data, expected_type in test_cases: + with self.subTest(data=data): + vector = pv.from_list(data) + array = pv.export_to_arrow(vector) + + velox_vector = pv.import_from_arrow(array) + self.assertEqual(velox_vector.size(), len(data)) + self.assertTrue(velox_vector.dtype(), expected_type) + for i in range(0, len(data)): + self.assertEqual(velox_vector[i], data[i]) diff --git a/pyvelox/util.h b/pyvelox/util.h new file mode 100644 index 000000000000..778aa7d8772f --- /dev/null +++ b/pyvelox/util.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace facebook::velox::py { + +struct PyVeloxContext { + PyVeloxContext() = default; + PyVeloxContext(const PyVeloxContext&) = delete; + PyVeloxContext(const PyVeloxContext&&) = delete; + PyVeloxContext& operator=(const PyVeloxContext&) = delete; + PyVeloxContext& operator=(const PyVeloxContext&&) = delete; + + static inline PyVeloxContext& getInstance() { + if (!instance_) { + instance_ = std::make_unique(); + } + return *instance_.get(); + } + + facebook::velox::memory::MemoryPool* pool() { + return pool_.get(); + } + facebook::velox::core::QueryCtx* queryCtx() { + return queryCtx_.get(); + } + facebook::velox::core::ExecCtx* execCtx() { + return execCtx_.get(); + } + + static inline void cleanup() { + if (instance_) { + instance_.reset(); + } + } + + private: + std::shared_ptr pool_ = + facebook::velox::memory::addDefaultLeafMemoryPool(); + std::shared_ptr queryCtx_ = + std::make_shared(); + std::unique_ptr execCtx_ = + std::make_unique( + pool_.get(), + queryCtx_.get()); + + static inline std::unique_ptr instance_; +}; +} From ea6c049262a165d34012a945ba7e09679cd4f551 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 31 May 2023 12:42:43 +0530 Subject: [PATCH 11/16] fix: format-check --- pyvelox/conversion.cpp | 30 ++++++++++++++++-------------- pyvelox/conversion.h | 1 - pyvelox/pyvelox.cpp | 8 +------- pyvelox/test/test_vector.py | 7 +++---- pyvelox/util.h | 2 +- 5 files changed, 21 insertions(+), 27 deletions(-) diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp index f59634970d9b..bd79b6e70977 100644 --- a/pyvelox/conversion.cpp +++ b/pyvelox/conversion.cpp @@ -14,17 +14,17 @@ * limitations under the License. */ -#include "conversion.h" -#include "util.h" +#include "conversion.h" #include #include +#include "util.h" namespace facebook::velox::py { namespace py = pybind11; void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { - m.def("export_to_arrow", [](VectorPtr& inputVector) { + m.def("export_to_arrow", [](VectorPtr& inputVector) { auto arrowArray = std::make_unique(); auto pool_ = PyVeloxContext::getInstance().pool(); facebook::velox::exportToArrow(inputVector, *arrowArray, pool_); @@ -33,18 +33,20 @@ void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { facebook::velox::exportToArrow(inputVector, *arrowSchema); py::module arrow_module = py::module::import("pyarrow"); - py::object array_class = arrow_module.attr("Array"); - return array_class.attr("_import_from_c")(reinterpret_cast(arrowArray.get()), reinterpret_cast(arrowSchema.get())); + py::object array_class = arrow_module.attr("Array"); + return array_class.attr("_import_from_c")( + reinterpret_cast(arrowArray.get()), + reinterpret_cast(arrowSchema.get())); }); - m.def( - "import_from_arrow", - [](py::object inputArrowArray) { - auto arrowArray = std::make_unique(); - auto arrowSchema = std::make_unique(); - inputArrowArray.attr("_export_to_c")(reinterpret_cast(arrowArray.get()), reinterpret_cast(arrowSchema.get())); - auto pool_ = PyVeloxContext::getInstance().pool(); - return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_); - }); + m.def("import_from_arrow", [](py::object inputArrowArray) { + auto arrowArray = std::make_unique(); + auto arrowSchema = std::make_unique(); + inputArrowArray.attr("_export_to_c")( + reinterpret_cast(arrowArray.get()), + reinterpret_cast(arrowSchema.get())); + auto pool_ = PyVeloxContext::getInstance().pool(); + return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_); + }); } } // namespace facebook::velox::py diff --git a/pyvelox/conversion.h b/pyvelox/conversion.h index 7deaa7806677..0b0e2d394bc1 100644 --- a/pyvelox/conversion.h +++ b/pyvelox/conversion.h @@ -18,7 +18,6 @@ #include - namespace facebook::velox::py { namespace py = pybind11; diff --git a/pyvelox/pyvelox.cpp b/pyvelox/pyvelox.cpp index 6fd3f351d862..495cff70e7ff 100644 --- a/pyvelox/pyvelox.cpp +++ b/pyvelox/pyvelox.cpp @@ -15,11 +15,8 @@ */ #include "pyvelox.h" -<<<<<<< HEAD -#include "serde.h" -======= #include "conversion.h" ->>>>>>> d2251b96 (feat: separate module for the conversion functions) +#include "serde.h" #include "signatures.h" namespace facebook::velox::py { @@ -297,11 +294,8 @@ PYBIND11_MODULE(pyvelox, m) { addVeloxBindings(m); addSignatureBindings(m); -<<<<<<< HEAD addSerdeBindings(m); -======= addConversionBindings(m); ->>>>>>> d2251b96 (feat: separate module for the conversion functions) m.attr("__version__") = "dev"; } #endif diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index b436297b162f..31d673cbe5e4 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -17,7 +17,6 @@ import unittest - class TestVeloxVector(unittest.TestCase): def test_inheritance(self): v1 = pv.from_list([1, 2, 3]) @@ -279,7 +278,7 @@ def test_export_to_arrow(self): test_cases = [ ([1, 2, 3], pa.int64()), ([1.1, 2.2, 3.3], pa.float64()), - (['ab', 'bc', 'ca'], pa.string()) + (["ab", "bc", "ca"], pa.string()), ] for data, expected_type in test_cases: with self.subTest(data=data): @@ -294,7 +293,7 @@ def test_import_from_arrow(self): test_cases = [ ([11, 26, 31], pa.int64(), pv.IntegerType()), ([0.1, 2.5, 3.9], pa.float64(), pv.DoubleType()), - (['az', 'by', 'cx'], pa.string(), pv.VarcharType()) + (["az", "by", "cx"], pa.string(), pv.VarcharType()), ] for data, dtype, expected_type in test_cases: with self.subTest(data=data): @@ -310,7 +309,7 @@ def test_roundtrip_conversion(self): test_cases = [ ([41, 92, 13], pv.IntegerType()), ([17.19, 22.25, 13.3], pv.DoubleType()), - (['aa1', 'bb2', 'cc3'], pv.VarcharType()) + (["aa1", "bb2", "cc3"], pv.VarcharType()), ] for data, expected_type in test_cases: with self.subTest(data=data): diff --git a/pyvelox/util.h b/pyvelox/util.h index 778aa7d8772f..ca2629d531ac 100644 --- a/pyvelox/util.h +++ b/pyvelox/util.h @@ -63,4 +63,4 @@ struct PyVeloxContext { static inline std::unique_ptr instance_; }; -} +} // namespace facebook::velox::py From 633fd0375b9ed9e63c30ee6568a871ae61f9cf5f Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 31 May 2023 12:52:07 +0530 Subject: [PATCH 12/16] fix: formatting on CMakeLists.txt --- pyvelox/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyvelox/CMakeLists.txt b/pyvelox/CMakeLists.txt index d536c35da059..92d4adbef213 100644 --- a/pyvelox/CMakeLists.txt +++ b/pyvelox/CMakeLists.txt @@ -17,7 +17,8 @@ if(VELOX_BUILD_PYTHON_PACKAGE) include_directories(SYSTEM ${CMAKE_SOURCE_DIR}) add_definitions(-DCREATE_PYVELOX_MODULE -DVELOX_DISABLE_GOOGLETEST) # Define our Python module: - pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp conversion.cpp) + pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp + conversion.cpp) # Link with Velox: target_link_libraries( pyvelox From f3108aab3d6bb9e5bb351bf2a78db722eb2de880 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 1 Jun 2023 23:22:07 +0530 Subject: [PATCH 13/16] fix: util header not required as instance struct was moved to context header --- pyvelox/conversion.cpp | 2 +- pyvelox/util.h | 66 ------------------------------------------ 2 files changed, 1 insertion(+), 67 deletions(-) delete mode 100644 pyvelox/util.h diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp index bd79b6e70977..03ec908695ab 100644 --- a/pyvelox/conversion.cpp +++ b/pyvelox/conversion.cpp @@ -17,7 +17,7 @@ #include "conversion.h" #include #include -#include "util.h" +#include "context.h" namespace facebook::velox::py { diff --git a/pyvelox/util.h b/pyvelox/util.h deleted file mode 100644 index ca2629d531ac..000000000000 --- a/pyvelox/util.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace facebook::velox::py { - -struct PyVeloxContext { - PyVeloxContext() = default; - PyVeloxContext(const PyVeloxContext&) = delete; - PyVeloxContext(const PyVeloxContext&&) = delete; - PyVeloxContext& operator=(const PyVeloxContext&) = delete; - PyVeloxContext& operator=(const PyVeloxContext&&) = delete; - - static inline PyVeloxContext& getInstance() { - if (!instance_) { - instance_ = std::make_unique(); - } - return *instance_.get(); - } - - facebook::velox::memory::MemoryPool* pool() { - return pool_.get(); - } - facebook::velox::core::QueryCtx* queryCtx() { - return queryCtx_.get(); - } - facebook::velox::core::ExecCtx* execCtx() { - return execCtx_.get(); - } - - static inline void cleanup() { - if (instance_) { - instance_.reset(); - } - } - - private: - std::shared_ptr pool_ = - facebook::velox::memory::addDefaultLeafMemoryPool(); - std::shared_ptr queryCtx_ = - std::make_shared(); - std::unique_ptr execCtx_ = - std::make_unique( - pool_.get(), - queryCtx_.get()); - - static inline std::unique_ptr instance_; -}; -} // namespace facebook::velox::py From 87e5f2b9c3b0d88da718a9d97e2023ab5e88de4e Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 1 Jun 2023 23:35:31 +0530 Subject: [PATCH 14/16] fix: using static method getSingletonInstance() --- pyvelox/conversion.cpp | 4 ++-- pyvelox/pyvelox.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp index 03ec908695ab..d5766865a682 100644 --- a/pyvelox/conversion.cpp +++ b/pyvelox/conversion.cpp @@ -26,7 +26,7 @@ namespace py = pybind11; void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { m.def("export_to_arrow", [](VectorPtr& inputVector) { auto arrowArray = std::make_unique(); - auto pool_ = PyVeloxContext::getInstance().pool(); + auto pool_ = PyVeloxContext::getSingletonInstance().pool(); facebook::velox::exportToArrow(inputVector, *arrowArray, pool_); auto arrowSchema = std::make_unique(); @@ -45,7 +45,7 @@ void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { inputArrowArray.attr("_export_to_c")( reinterpret_cast(arrowArray.get()), reinterpret_cast(arrowSchema.get())); - auto pool_ = PyVeloxContext::getInstance().pool(); + auto pool_ = PyVeloxContext::getSingletonInstance().pool(); return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_); }); } diff --git a/pyvelox/pyvelox.h b/pyvelox/pyvelox.h index 329b6b5b8b02..48c40f906220 100644 --- a/pyvelox/pyvelox.h +++ b/pyvelox/pyvelox.h @@ -30,7 +30,6 @@ #include #include #include "folly/json.h" -#include "util.h" #include "context.h" From 072244e1d9cf9c7125a94a6992d19557dbbfe95b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Sat, 3 Jun 2023 08:09:48 +0530 Subject: [PATCH 15/16] fix: format-check --- pyvelox/test/test_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index 31d673cbe5e4..c2a1d271844f 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -273,7 +273,7 @@ def test_slice(self): with self.assertRaises(NotImplementedError): e = a[3:8:3] - + def test_export_to_arrow(self): test_cases = [ ([1, 2, 3], pa.int64()), From 579010bdc8007dec97ecd28faec83a248ba37608 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 8 Jun 2023 22:31:41 +0530 Subject: [PATCH 16/16] fix: python build should have verbose output for passing CI checks --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 18f2ae18a7ab..294b3a4d0650 100644 --- a/Makefile +++ b/Makefile @@ -159,7 +159,7 @@ python-clean: DEBUG=1 ${PYTHON_EXECUTABLE} setup.py clean python-build: - DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) + DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) --verbose python-test: $(MAKE) python-build extras="[tests]"