diff --git a/cpp/src/.clang-format b/.clang-format similarity index 100% rename from cpp/src/.clang-format rename to .clang-format diff --git a/cpp/src/.clang-tidy b/.clang-tidy similarity index 100% rename from cpp/src/.clang-tidy rename to .clang-tidy diff --git a/cpp/src/.clang-tidy-ignore b/.clang-tidy-ignore similarity index 100% rename from cpp/src/.clang-tidy-ignore rename to .clang-tidy-ignore diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 315995ce7cb..93e9853df89 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -741,7 +741,8 @@ endif (UNIX) if (${CLANG_FORMAT_FOUND}) # runs clang format and updates files in place. add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'` + `find ${CMAKE_CURRENT_SOURCE_DIR}/../python -name \\*.cc -or -name \\*.h`) # runs clang format and exits with a non-zero exit code if any files need to be reformatted add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0 diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index a5444c236bc..dc6ccd20259 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,7 +18,7 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, +from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable, CDataType, CStatus, Type, MemoryPool) cimport pyarrow.includes.libarrow_io as arrow_io @@ -39,6 +39,9 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, PyObject* py_ref, PyObject** out) + CStatus ConvertTableToPandas(const shared_ptr[CTable]& table, + int nthreads, PyObject** out) + MemoryPool* get_memory_pool() diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 2f7d4309e45..93755578884 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -430,6 +430,32 @@ cdef class RecordBatch: return result +cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads): + cdef: + PyObject* result_obj + CColumn* col + int i + + from pandas.core.internals import BlockManager, make_block + from pandas import RangeIndex + + check_status(pyarrow.ConvertTableToPandas(table, nthreads, &result_obj)) + + result = PyObject_to_object(result_obj) + + blocks = [] + for block_arr, placement_arr in result: + blocks.append(make_block(block_arr, placement=placement_arr)) + + names = [] + for i in range(table.get().num_columns()): + col = table.get().column(i).get() + names.append(frombytes(col.name())) + + axes = [names, RangeIndex(table.get().num_rows())] + return BlockManager(blocks, axes) + + cdef class Table: """ A collection of top-level named, equal length Arrow arrays. @@ -584,7 +610,7 @@ cdef class Table: table.init(c_table) return table - def to_pandas(self): + def to_pandas(self, nthreads=1, block_based=True): """ Convert the arrow::Table to a pandas DataFrame @@ -599,17 +625,21 @@ cdef class Table: import pandas as pd - names = [] - data = [] - for i in range(self.table.num_columns()): - col = self.table.column(i) - column = self.column(i) - check_status(pyarrow.ConvertColumnToPandas( - col, column, &arr)) - names.append(frombytes(col.get().name())) - data.append(PyObject_to_object(arr)) - - return pd.DataFrame(dict(zip(names, data)), columns=names) + if block_based: + mgr = table_to_blockmanager(self.sp_table, nthreads) + return pd.DataFrame(mgr) + else: + names = [] + data = [] + for i in range(self.table.num_columns()): + col = self.table.column(i) + column = self.column(i) + check_status(pyarrow.ConvertColumnToPandas( + col, column, &arr)) + names.append(frombytes(col.get().name())) + data.append(PyObject_to_object(arr)) + + return pd.DataFrame(dict(zip(names, data)), columns=names) @property def name(self): diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 2a13944b35c..fb7475f0c94 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -44,16 +44,16 @@ static inline bool IsPyInteger(PyObject* obj) { class ScalarVisitor { public: - ScalarVisitor() : - total_count_(0), - none_count_(0), - bool_count_(0), - int_count_(0), - date_count_(0), - timestamp_count_(0), - float_count_(0), - binary_count_(0), - unicode_count_(0) {} + ScalarVisitor() + : total_count_(0), + none_count_(0), + bool_count_(0), + int_count_(0), + date_count_(0), + timestamp_count_(0), + float_count_(0), + binary_count_(0), + unicode_count_(0) {} void Visit(PyObject* obj) { ++total_count_; @@ -100,9 +100,7 @@ class ScalarVisitor { } } - int64_t total_count() const { - return total_count_; - } + int64_t total_count() const { return total_count_; } private: int64_t total_count_; @@ -123,17 +121,14 @@ static constexpr int MAX_NESTING_LEVELS = 32; class SeqVisitor { public: - SeqVisitor() : - max_nesting_level_(0) { + SeqVisitor() : max_nesting_level_(0) { memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); } - Status Visit(PyObject* obj, int level=0) { + Status Visit(PyObject* obj, int level = 0) { Py_ssize_t size = PySequence_Size(obj); - if (level > max_nesting_level_) { - max_nesting_level_ = level; - } + if (level > max_nesting_level_) { max_nesting_level_ = level; } for (int64_t i = 0; i < size; ++i) { // TODO(wesm): Error checking? @@ -188,9 +183,7 @@ class SeqVisitor { int max_observed_level() const { int result = 0; for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { - if (nesting_histogram_[i] > 0) { - result = i; - } + if (nesting_histogram_[i] > 0) { result = i; } } return result; } @@ -198,9 +191,7 @@ class SeqVisitor { int num_nesting_levels() const { int result = 0; for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { - if (nesting_histogram_[i] > 0) { - ++result; - } + if (nesting_histogram_[i] > 0) { ++result; } } return result; } @@ -214,8 +205,8 @@ class SeqVisitor { }; // Non-exhaustive type inference -static Status InferArrowType(PyObject* obj, int64_t* size, - std::shared_ptr* out_type) { +static Status InferArrowType( + PyObject* obj, int64_t* size, std::shared_ptr* out_type) { *size = PySequence_Size(obj); if (PyErr_Occurred()) { // Not a sequence @@ -224,9 +215,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size, } // For 0-length sequences, refuse to guess - if (*size == 0) { - *out_type = arrow::null(); - } + if (*size == 0) { *out_type = arrow::null(); } SeqVisitor seq_visitor; RETURN_NOT_OK(seq_visitor.Visit(obj)); @@ -234,9 +223,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size, *out_type = seq_visitor.GetType(); - if (*out_type == nullptr) { - return Status::TypeError("Unable to determine data type"); - } + if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); } return Status::OK(); } @@ -337,7 +324,8 @@ class TimestampConverter : public TypedConverter { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { - PyDateTime_DateTime* pydatetime = reinterpret_cast(item.obj()); + PyDateTime_DateTime* pydatetime = + reinterpret_cast(item.obj()); struct tm datetime = {0}; datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900; datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1; @@ -462,6 +450,7 @@ class ListConverter : public TypedConverter { } return Status::OK(); } + protected: std::shared_ptr value_converter_; }; @@ -496,8 +485,8 @@ Status ListConverter::Init(const std::shared_ptr& builder) { builder_ = builder; typed_builder_ = static_cast(builder.get()); - value_converter_ = GetConverter(static_cast( - builder->type().get())->value_type()); + value_converter_ = + GetConverter(static_cast(builder->type().get())->value_type()); if (value_converter_ == nullptr) { return Status::NotImplemented("value type not implemented"); } @@ -521,8 +510,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { std::shared_ptr converter = GetConverter(type); if (converter == nullptr) { std::stringstream ss; - ss << "No type converter implemented for " - << type->ToString(); + ss << "No type converter implemented for " << type->ToString(); return Status::NotImplemented(ss.str()); } @@ -536,4 +524,4 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { return builder->Finish(out); } -} // namespace pyarrow +} // namespace pyarrow diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h index 2ddfdaaf441..1ff36945c88 100644 --- a/python/src/pyarrow/adapters/builtin.h +++ b/python/src/pyarrow/adapters/builtin.h @@ -40,6 +40,6 @@ namespace pyarrow { PYARROW_EXPORT arrow::Status ConvertPySequence(PyObject* obj, std::shared_ptr* out); -} // namespace pyarrow +} // namespace pyarrow -#endif // PYARROW_ADAPTERS_BUILTIN_H +#endif // PYARROW_ADAPTERS_BUILTIN_H diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 38f3b6f5248..899eb5519d5 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -28,10 +28,13 @@ #include #include #include +#include #include "arrow/api.h" -#include "arrow/util/bit-util.h" #include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" #include "pyarrow/common.h" #include "pyarrow/config.h" @@ -40,10 +43,13 @@ namespace pyarrow { using arrow::Array; +using arrow::ChunkedArray; using arrow::Column; using arrow::Field; using arrow::DataType; using arrow::Status; +using arrow::Table; +using arrow::Type; namespace BitUtil = arrow::BitUtil; @@ -51,8 +57,7 @@ namespace BitUtil = arrow::BitUtil; // Serialization template -struct npy_traits { -}; +struct npy_traits {}; template <> struct npy_traits { @@ -60,21 +65,17 @@ struct npy_traits { using TypeClass = arrow::BooleanType; static constexpr bool supports_nulls = false; - static inline bool isnull(uint8_t v) { - return false; - } + static inline bool isnull(uint8_t v) { return false; } }; -#define NPY_INT_DECL(TYPE, CapType, T) \ - template <> \ - struct npy_traits { \ - typedef T value_type; \ - using TypeClass = arrow::CapType##Type; \ - \ - static constexpr bool supports_nulls = false; \ - static inline bool isnull(T v) { \ - return false; \ - } \ +#define NPY_INT_DECL(TYPE, CapType, T) \ + template <> \ + struct npy_traits { \ + typedef T value_type; \ + using TypeClass = arrow::CapType##Type; \ + \ + static constexpr bool supports_nulls = false; \ + static inline bool isnull(T v) { return false; } \ }; NPY_INT_DECL(INT8, Int8, int8_t); @@ -93,9 +94,7 @@ struct npy_traits { static constexpr bool supports_nulls = true; - static inline bool isnull(float v) { - return v != v; - } + static inline bool isnull(float v) { return v != v; } }; template <> @@ -105,9 +104,7 @@ struct npy_traits { static constexpr bool supports_nulls = true; - static inline bool isnull(double v) { - return v != v; - } + static inline bool isnull(double v) { return v != v; } }; template <> @@ -135,18 +132,14 @@ struct npy_traits { template class ArrowSerializer { public: - ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) : - pool_(pool), - arr_(arr), - mask_(mask) { + ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) + : pool_(pool), arr_(arr), mask_(mask) { length_ = PyArray_SIZE(arr_); } Status Convert(std::shared_ptr* out); - int stride() const { - return PyArray_STRIDES(arr_)[0]; - } + int stride() const { return PyArray_STRIDES(arr_)[0]; } Status InitNullBitmap() { int null_bytes = BitUtil::BytesForBits(length_); @@ -215,9 +208,7 @@ class ArrowSerializer { const int32_t length = PyBytes_GET_SIZE(obj); s = string_builder.Append(PyBytes_AS_STRING(obj), length); Py_DECREF(obj); - if (!s.ok()) { - return s; - } + if (!s.ok()) { return s; } } else if (PyBytes_Check(obj)) { have_bytes = true; const int32_t length = PyBytes_GET_SIZE(obj); @@ -230,8 +221,8 @@ class ArrowSerializer { if (have_bytes) { const auto& arr = static_cast(*out->get()); - *out = std::make_shared(arr.length(), arr.offsets(), - arr.data(), arr.null_count(), arr.null_bitmap()); + *out = std::make_shared( + arr.length(), arr.offsets(), arr.data(), arr.null_count(), arr.null_bitmap()); } return Status::OK(); } @@ -259,8 +250,7 @@ class ArrowSerializer { } } - *out = std::make_shared(length_, data, null_count, - null_bitmap_); + *out = std::make_shared(length_, data, null_count, null_bitmap_); return Status::OK(); } @@ -321,26 +311,27 @@ inline Status ArrowSerializer::MakeDataType(std::shared_ptr* out } template <> -inline Status ArrowSerializer::MakeDataType(std::shared_ptr* out) { +inline Status ArrowSerializer::MakeDataType( + std::shared_ptr* out) { PyArray_Descr* descr = PyArray_DESCR(arr_); auto date_dtype = reinterpret_cast(descr->c_metadata); arrow::TimestampType::Unit unit; switch (date_dtype->meta.base) { - case NPY_FR_s: - unit = arrow::TimestampType::Unit::SECOND; - break; - case NPY_FR_ms: - unit = arrow::TimestampType::Unit::MILLI; - break; - case NPY_FR_us: - unit = arrow::TimestampType::Unit::MICRO; - break; - case NPY_FR_ns: - unit = arrow::TimestampType::Unit::NANO; - break; - default: - return Status::Invalid("Unknown NumPy datetime unit"); + case NPY_FR_s: + unit = arrow::TimestampType::Unit::SECOND; + break; + case NPY_FR_ms: + unit = arrow::TimestampType::Unit::MILLI; + break; + case NPY_FR_us: + unit = arrow::TimestampType::Unit::MICRO; + break; + case NPY_FR_ns: + unit = arrow::TimestampType::Unit::NANO; + break; + default: + return Status::Invalid("Unknown NumPy datetime unit"); } out->reset(new arrow::TimestampType(unit)); @@ -351,9 +342,7 @@ template inline Status ArrowSerializer::Convert(std::shared_ptr* out) { typedef npy_traits traits; - if (mask_ != nullptr || traits::supports_nulls) { - RETURN_NOT_OK(InitNullBitmap()); - } + if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); } int64_t null_count = 0; if (mask_ != nullptr) { @@ -429,9 +418,7 @@ inline Status ArrowSerializer::Convert(std::shared_ptr* out) template inline Status ArrowSerializer::ConvertData() { // TODO(wesm): strided arrays - if (is_strided()) { - return Status::Invalid("no support for strided data yet"); - } + if (is_strided()) { return Status::Invalid("no support for strided data yet"); } data_ = std::make_shared(arr_); return Status::OK(); @@ -439,9 +426,7 @@ inline Status ArrowSerializer::ConvertData() { template <> inline Status ArrowSerializer::ConvertData() { - if (is_strided()) { - return Status::Invalid("no support for strided data yet"); - } + if (is_strided()) { return Status::Invalid("no support for strided data yet"); } int nbytes = BitUtil::BytesForBits(length_); auto buffer = std::make_shared(pool_); @@ -453,9 +438,7 @@ inline Status ArrowSerializer::ConvertData() { memset(bitmap, 0, nbytes); for (int i = 0; i < length_; ++i) { - if (values[i] > 0) { - BitUtil::SetBit(bitmap, i); - } + if (values[i] > 0) { BitUtil::SetBit(bitmap, i); } } data_ = buffer; @@ -468,29 +451,24 @@ inline Status ArrowSerializer::ConvertData() { return Status::TypeError("NYI"); } +#define TO_ARROW_CASE(TYPE) \ + case NPY_##TYPE: { \ + ArrowSerializer converter(pool, arr, mask); \ + RETURN_NOT_OK(converter.Convert(out)); \ + } break; -#define TO_ARROW_CASE(TYPE) \ - case NPY_##TYPE: \ - { \ - ArrowSerializer converter(pool, arr, mask); \ - RETURN_NOT_OK(converter.Convert(out)); \ - } \ - break; - -Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, - std::shared_ptr* out) { +Status PandasMaskedToArrow( + arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr* out) { PyArrayObject* arr = reinterpret_cast(ao); PyArrayObject* mask = nullptr; - if (mo != nullptr) { - mask = reinterpret_cast(mo); - } + if (mo != nullptr) { mask = reinterpret_cast(mo); } if (PyArray_NDIM(arr) != 1) { return Status::Invalid("only handle 1-dimensional arrays"); } - switch(PyArray_DESCR(arr)->type_num) { + switch (PyArray_DESCR(arr)->type_num) { TO_ARROW_CASE(BOOL); TO_ARROW_CASE(INT8); TO_ARROW_CASE(INT16); @@ -506,15 +484,13 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, TO_ARROW_CASE(OBJECT); default: std::stringstream ss; - ss << "unsupported type " << PyArray_DESCR(arr)->type_num - << std::endl; + ss << "unsupported type " << PyArray_DESCR(arr)->type_num << std::endl; return Status::NotImplemented(ss.str()); } return Status::OK(); } -Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, - std::shared_ptr* out) { +Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr* out) { return PandasMaskedToArrow(pool, ao, nullptr, out); } @@ -522,28 +498,27 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, // Deserialization template -struct arrow_traits { -}; +struct arrow_traits {}; template <> struct arrow_traits { static constexpr int npy_type = NPY_BOOL; static constexpr bool supports_nulls = false; static constexpr bool is_boolean = true; - static constexpr bool is_pandas_numeric_not_nullable = false; - static constexpr bool is_pandas_numeric_nullable = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = false; }; -#define INT_DECL(TYPE) \ - template <> \ - struct arrow_traits { \ - static constexpr int npy_type = NPY_##TYPE; \ - static constexpr bool supports_nulls = false; \ - static constexpr double na_value = NAN; \ - static constexpr bool is_boolean = false; \ - static constexpr bool is_pandas_numeric_not_nullable = true; \ - static constexpr bool is_pandas_numeric_nullable = false; \ - typedef typename npy_traits::value_type T; \ +#define INT_DECL(TYPE) \ + template <> \ + struct arrow_traits { \ + static constexpr int npy_type = NPY_##TYPE; \ + static constexpr bool supports_nulls = false; \ + static constexpr double na_value = NAN; \ + static constexpr bool is_boolean = false; \ + static constexpr bool is_numeric_not_nullable = true; \ + static constexpr bool is_numeric_nullable = false; \ + typedef typename npy_traits::value_type T; \ }; INT_DECL(INT8); @@ -561,8 +536,8 @@ struct arrow_traits { static constexpr bool supports_nulls = true; static constexpr float na_value = NAN; static constexpr bool is_boolean = false; - static constexpr bool is_pandas_numeric_not_nullable = false; - static constexpr bool is_pandas_numeric_nullable = true; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; typedef typename npy_traits::value_type T; }; @@ -572,19 +547,21 @@ struct arrow_traits { static constexpr bool supports_nulls = true; static constexpr double na_value = NAN; static constexpr bool is_boolean = false; - static constexpr bool is_pandas_numeric_not_nullable = false; - static constexpr bool is_pandas_numeric_nullable = true; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; typedef typename npy_traits::value_type T; }; +static constexpr int64_t kPandasTimestampNull = std::numeric_limits::min(); + template <> struct arrow_traits { static constexpr int npy_type = NPY_DATETIME; static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = std::numeric_limits::min(); + static constexpr int64_t na_value = kPandasTimestampNull; static constexpr bool is_boolean = false; - static constexpr bool is_pandas_numeric_not_nullable = false; - static constexpr bool is_pandas_numeric_nullable = true; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; typedef typename npy_traits::value_type T; }; @@ -592,10 +569,10 @@ template <> struct arrow_traits { static constexpr int npy_type = NPY_DATETIME; static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = std::numeric_limits::min(); + static constexpr int64_t na_value = kPandasTimestampNull; static constexpr bool is_boolean = false; - static constexpr bool is_pandas_numeric_not_nullable = false; - static constexpr bool is_pandas_numeric_nullable = true; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; typedef typename npy_traits::value_type T; }; @@ -604,18 +581,39 @@ struct arrow_traits { static constexpr int npy_type = NPY_OBJECT; static constexpr bool supports_nulls = true; static constexpr bool is_boolean = false; - static constexpr bool is_pandas_numeric_not_nullable = false; - static constexpr bool is_pandas_numeric_nullable = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = false; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; + static constexpr bool is_boolean = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = false; }; +template +struct WrapBytes {}; -static inline PyObject* make_pystring(const uint8_t* data, int32_t length) { +template <> +struct WrapBytes { + static inline PyObject* Wrap(const uint8_t* data, int64_t length) { #if PY_MAJOR_VERSION >= 3 - return PyUnicode_FromStringAndSize(reinterpret_cast(data), length); + return PyUnicode_FromStringAndSize(reinterpret_cast(data), length); #else - return PyString_FromStringAndSize(reinterpret_cast(data), length); + return PyString_FromStringAndSize(reinterpret_cast(data), length); #endif -} + } +}; + +template <> +struct WrapBytes { + static inline PyObject* Wrap(const uint8_t* data, int64_t length) { + return PyBytes_FromStringAndSize(reinterpret_cast(data), length); + } +}; inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) { if (type == NPY_DATETIME) { @@ -645,20 +643,169 @@ inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) } } -template -class ArrowDeserializer { - public: - ArrowDeserializer(const std::shared_ptr& col, PyObject* py_ref) : - col_(col), py_ref_(py_ref) {} +template +inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + // Upcast to double, set NaN as appropriate - Status Convert(PyObject** out) { - const std::shared_ptr data = col_->data(); + for (int i = 0; i < arr->length(); ++i) { + *out_values++ = prim_arr->IsNull(i) ? NAN : in_values[i]; + } + } +} - RETURN_NOT_OK(ConvertValues(data)); - *out = reinterpret_cast(out_); +template +inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + memcpy(out_values, in_values, sizeof(T) * arr->length()); + out_values += arr->length(); + } +} - return Status::OK(); +template +inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + for (int32_t i = 0; i < arr->length(); ++i) { + *out_values = in_values[i]; + } + } +} + +static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto bool_arr = static_cast(arr.get()); + + for (int64_t i = 0; i < arr->length(); ++i) { + if (bool_arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values++ = Py_None; + } else if (bool_arr->Value(i)) { + // True + Py_INCREF(Py_True); + *out_values++ = Py_True; + } else { + // False + Py_INCREF(Py_False); + *out_values++ = Py_False; + } + } + } + return Status::OK(); +} + +static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto bool_arr = static_cast(arr.get()); + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = static_cast(bool_arr->Value(i)); + } + } +} + +template +inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + for (int c = 0; c < data.num_chunks(); c++) { + auto arr = static_cast(data.chunk(c).get()); + + const uint8_t* data_ptr; + int32_t length; + const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr->length(); ++i) { + if (has_nulls && arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + data_ptr = arr->GetValue(i, &length); + *out_values = WrapBytes::Wrap(data_ptr, length); + if (*out_values == nullptr) { + return Status::UnknownError("String initialization failed"); + } + } + ++out_values; + } + } + return Status::OK(); +} + +template +inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + const uint8_t* valid_bits = arr->null_bitmap_data(); + + if (arr->null_count() > 0) { + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? na_value : in_values[i]; + } + } else { + memcpy(out_values, in_values, sizeof(T) * arr->length()); + out_values += arr->length(); + } } +} + +template +inline void ConvertNumericNullableCast( + const ChunkedArray& data, OutType na_value, OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = arr->IsNull(i) ? na_value : static_cast(in_values[i]); + } + } +} + +template +inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + for (int64_t i = 0; i < arr->length(); ++i) { + // There are 1000 * 60 * 60 * 24 = 86400000ms in a day + *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / 86400000; + } + } +} + +template +inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = arr->IsNull(i) ? kPandasTimestampNull + : (static_cast(in_values[i]) * SHIFT); + } + } +} + +class ArrowDeserializer { + public: + ArrowDeserializer(const std::shared_ptr& col, PyObject* py_ref) + : col_(col), data_(*col->data().get()), py_ref_(py_ref) {} Status AllocateOutput(int type) { PyAcquireGIL lock; @@ -676,20 +823,29 @@ class ArrowDeserializer { return Status::OK(); } - Status OutputFromData(int type, void* data) { + template + Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr arr) { + typedef typename arrow_traits::T T; + + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + // Zero-Copy. We can pass the data pointer directly to NumPy. + void* data = const_cast(in_values); + PyAcquireGIL lock; // Zero-Copy. We can pass the data pointer directly to NumPy. npy_intp dims[1] = {col_->length()}; - out_ = reinterpret_cast(PyArray_SimpleNewFromData(1, dims, - type, data)); + out_ = reinterpret_cast( + PyArray_SimpleNewFromData(1, dims, npy_type, data)); if (out_ == NULL) { // Error occurred, trust that SimpleNew set the error state return Status::OK(); } - set_numpy_metadata(type, col_->type().get(), out_); + set_numpy_metadata(npy_type, col_->type().get(), out_); if (PyArray_SetBaseObject(out_, py_ref_) == -1) { // Error occurred, trust that SetBaseObject set the error state @@ -705,317 +861,621 @@ class ArrowDeserializer { return Status::OK(); } - template - Status ConvertValuesZeroCopy(std::shared_ptr arr) { - typedef typename arrow_traits::T T; + // ---------------------------------------------------------------------- + // Allocate new array and deserialize. Can do a zero copy conversion for some + // types - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); + Status Convert(PyObject** out) { +#define CONVERT_CASE(TYPE) \ + case arrow::Type::TYPE: { \ + RETURN_NOT_OK(ConvertValues()); \ + } break; + + switch (col_->type()->type) { + CONVERT_CASE(BOOL); + CONVERT_CASE(INT8); + CONVERT_CASE(INT16); + CONVERT_CASE(INT32); + CONVERT_CASE(INT64); + CONVERT_CASE(UINT8); + CONVERT_CASE(UINT16); + CONVERT_CASE(UINT32); + CONVERT_CASE(UINT64); + CONVERT_CASE(FLOAT); + CONVERT_CASE(DOUBLE); + CONVERT_CASE(BINARY); + CONVERT_CASE(STRING); + CONVERT_CASE(DATE); + CONVERT_CASE(TIMESTAMP); + default: + return Status::NotImplemented("Arrow type reading not implemented"); + } - // Zero-Copy. We can pass the data pointer directly to NumPy. - void* data = const_cast(in_values); - int type = arrow_traits::npy_type; - RETURN_NOT_OK(OutputFromData(type, data)); +#undef CONVERT_CASE + *out = reinterpret_cast(out_); return Status::OK(); } - template + template inline typename std::enable_if< - (T2 != arrow::Type::DATE) & arrow_traits::is_pandas_numeric_nullable, Status>::type - ConvertValues(const std::shared_ptr& data) { - typedef typename arrow_traits::T T; - size_t chunk_offset = 0; + (TYPE != arrow::Type::DATE) & arrow_traits::is_numeric_nullable, Status>::type + ConvertValues() { + typedef typename arrow_traits::T T; + int npy_type = arrow_traits::npy_type; - if (data->num_chunks() == 1 && data->null_count() == 0) { - return ConvertValuesZeroCopy(data->chunk(0)); + if (data_.num_chunks() == 1 && data_.null_count() == 0) { + return ConvertValuesZeroCopy(npy_type, data_.chunk(0)); } - RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + RETURN_NOT_OK(AllocateOutput(npy_type)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + ConvertNumericNullable(data_, arrow_traits::na_value, out_values); - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; + return Status::OK(); + } - if (arr->null_count() > 0) { - for (int64_t i = 0; i < arr->length(); ++i) { - out_values[i] = arr->IsNull(i) ? arrow_traits::na_value : in_values[i]; - } - } else { - memcpy(out_values, in_values, sizeof(T) * arr->length()); - } + template + inline typename std::enable_if::type + ConvertValues() { + typedef typename arrow_traits::T T; + + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + ConvertDates(data_, arrow_traits::na_value, out_values); + return Status::OK(); + } - chunk_offset += arr->length(); + // Integer specialization + template + inline + typename std::enable_if::is_numeric_not_nullable, Status>::type + ConvertValues() { + typedef typename arrow_traits::T T; + int npy_type = arrow_traits::npy_type; + + if (data_.num_chunks() == 1 && data_.null_count() == 0) { + return ConvertValuesZeroCopy(npy_type, data_.chunk(0)); + } + + if (data_.null_count() > 0) { + RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + ConvertIntegerWithNulls(data_, out_values); + } else { + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + ConvertIntegerNoNullsSameType(data_, out_values); } return Status::OK(); } + // Boolean specialization + template + inline typename std::enable_if::is_boolean, Status>::type + ConvertValues() { + if (data_.null_count() > 0) { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + RETURN_NOT_OK(ConvertBooleanWithNulls(data_, out_values)); + } else { + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + ConvertBooleanNoNulls(data_, out_values); + } + return Status::OK(); + } + + // UTF8 strings + template + inline typename std::enable_if::type + ConvertValues() { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + return ConvertBinaryLike(data_, out_values); + } + template - inline typename std::enable_if< - T2 == arrow::Type::DATE, Status>::type - ConvertValues(const std::shared_ptr& data) { - typedef typename arrow_traits::T T; - size_t chunk_offset = 0; + inline typename std::enable_if::type + ConvertValues() { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + auto out_values = reinterpret_cast(PyArray_DATA(out_)); + return ConvertBinaryLike(data_, out_values); + } + + private: + std::shared_ptr col_; + const arrow::ChunkedArray& data_; + PyObject* py_ref_; + PyArrayObject* out_; +}; + +Status ConvertArrayToPandas( + const std::shared_ptr& arr, PyObject* py_ref, PyObject** out) { + static std::string dummy_name = "dummy"; + auto field = std::make_shared(dummy_name, arr->type()); + auto col = std::make_shared(field, arr); + return ConvertColumnToPandas(col, py_ref, out); +} - RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); +Status ConvertColumnToPandas( + const std::shared_ptr& col, PyObject* py_ref, PyObject** out) { + ArrowDeserializer converter(col, py_ref); + return converter.Convert(out); +} - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; +// ---------------------------------------------------------------------- +// pandas 0.x DataFrame conversion internals - for (int64_t i = 0; i < arr->length(); ++i) { - // There are 1000 * 60 * 60 * 24 = 86400000ms in a day - out_values[i] = arr->IsNull(i) ? arrow_traits::na_value : in_values[i] / 86400000; - } +class PandasBlock { + public: + enum type { + OBJECT, + UINT8, + INT8, + UINT16, + INT16, + UINT32, + INT32, + UINT64, + INT64, + FLOAT, + DOUBLE, + BOOL, + DATETIME, + CATEGORICAL + }; + + PandasBlock(int64_t num_rows, int num_columns) + : num_rows_(num_rows), num_columns_(num_columns) {} + + virtual Status Allocate() = 0; + virtual Status WriteNext(const std::shared_ptr& col, int64_t placement) = 0; - chunk_offset += arr->length(); + PyObject* block_arr() { return block_arr_.obj(); } + + PyObject* placement_arr() { return placement_arr_.obj(); } + + protected: + Status AllocateNDArray(int npy_type) { + PyAcquireGIL lock; + + npy_intp block_dims[2] = {num_columns_, num_rows_}; + PyObject* block_arr = PyArray_SimpleNew(2, block_dims, npy_type); + if (block_arr == NULL) { + // TODO(wesm): propagating Python exception + return Status::OK(); } + npy_intp placement_dims[1] = {num_columns_}; + PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); + if (placement_arr == NULL) { + // TODO(wesm): propagating Python exception + return Status::OK(); + } + + block_arr_.reset(block_arr); + placement_arr_.reset(placement_arr); + current_placement_index_ = 0; + + block_data_ = reinterpret_cast( + PyArray_DATA(reinterpret_cast(block_arr))); + + placement_data_ = reinterpret_cast( + PyArray_DATA(reinterpret_cast(placement_arr))); + return Status::OK(); } - // Integer specialization - template - inline typename std::enable_if< - arrow_traits::is_pandas_numeric_not_nullable, Status>::type - ConvertValues(const std::shared_ptr& data) { - typedef typename arrow_traits::T T; - size_t chunk_offset = 0; + int64_t num_rows_; + int num_columns_; + int current_placement_index_; - if (data->num_chunks() == 1 && data->null_count() == 0) { - return ConvertValuesZeroCopy(data->chunk(0)); - } + OwnedRef block_arr_; + uint8_t* block_data_; - if (data->null_count() > 0) { - RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); + // ndarray + OwnedRef placement_arr_; + int64_t* placement_data_; - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - // Upcast to double, set NaN as appropriate - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; + DISALLOW_COPY_AND_ASSIGN(PandasBlock); +}; - for (int i = 0; i < arr->length(); ++i) { - out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i]; - } +class ObjectBlock : public PandasBlock { + public: + using PandasBlock::PandasBlock; - chunk_offset += arr->length(); - } - } else { - RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + Status Allocate() override { return AllocateNDArray(NPY_OBJECT); } - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; + Status WriteNext(const std::shared_ptr& col, int64_t placement) override { + Type::type type = col->type()->type; - memcpy(out_values, in_values, sizeof(T) * arr->length()); + PyObject** out_buffer = + reinterpret_cast(block_data_) + current_placement_index_ * num_rows_; - chunk_offset += arr->length(); - } + const ChunkedArray& data = *col->data().get(); + + if (type == Type::BOOL) { + RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer)); + } else if (type == Type::BINARY) { + RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + } else if (type == Type::STRING) { + RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + } else { + std::stringstream ss; + ss << "Unsupported type for object array output: " << col->type()->ToString(); + return Status::NotImplemented(ss.str()); } + placement_data_[current_placement_index_++] = placement; return Status::OK(); } +}; - // Boolean specialization - template - inline typename std::enable_if< - arrow_traits::is_boolean, Status>::type - ConvertValues(const std::shared_ptr& data) { - size_t chunk_offset = 0; - PyAcquireGIL lock; +template +class IntBlock : public PandasBlock { + public: + using PandasBlock::PandasBlock; - if (data->null_count() > 0) { - RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + Status Allocate() override { + return AllocateNDArray(arrow_traits::npy_type); + } - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto bool_arr = static_cast(arr.get()); - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; - - for (int64_t i = 0; i < arr->length(); ++i) { - if (bool_arr->IsNull(i)) { - Py_INCREF(Py_None); - out_values[i] = Py_None; - } else if (bool_arr->Value(i)) { - // True - Py_INCREF(Py_True); - out_values[i] = Py_True; - } else { - // False - Py_INCREF(Py_False); - out_values[i] = Py_False; - } - } + Status WriteNext(const std::shared_ptr& col, int64_t placement) override { + Type::type type = col->type()->type; - chunk_offset += bool_arr->length(); - } - } else { - RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + C_TYPE* out_buffer = + reinterpret_cast(block_data_) + current_placement_index_ * num_rows_; - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto bool_arr = static_cast(arr.get()); - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; + const ChunkedArray& data = *col->data().get(); - for (int64_t i = 0; i < arr->length(); ++i) { - out_values[i] = static_cast(bool_arr->Value(i)); - } + if (type != ARROW_TYPE) { return Status::NotImplemented(col->type()->ToString()); } - chunk_offset += bool_arr->length(); - } + ConvertIntegerNoNullsSameType(data, out_buffer); + placement_data_[current_placement_index_++] = placement; + return Status::OK(); + } +}; + +using UInt8Block = IntBlock; +using Int8Block = IntBlock; +using UInt16Block = IntBlock; +using Int16Block = IntBlock; +using UInt32Block = IntBlock; +using Int32Block = IntBlock; +using UInt64Block = IntBlock; +using Int64Block = IntBlock; + +class Float32Block : public PandasBlock { + public: + using PandasBlock::PandasBlock; + + Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); } + + Status WriteNext(const std::shared_ptr& col, int64_t placement) override { + Type::type type = col->type()->type; + + if (type != Type::FLOAT) { return Status::NotImplemented(col->type()->ToString()); } + + float* out_buffer = + reinterpret_cast(block_data_) + current_placement_index_ * num_rows_; + + ConvertNumericNullable(*col->data().get(), NAN, out_buffer); + placement_data_[current_placement_index_++] = placement; + return Status::OK(); + } +}; + +class Float64Block : public PandasBlock { + public: + using PandasBlock::PandasBlock; + + Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); } + + Status WriteNext(const std::shared_ptr& col, int64_t placement) override { + Type::type type = col->type()->type; + + double* out_buffer = + reinterpret_cast(block_data_) + current_placement_index_ * num_rows_; + + const ChunkedArray& data = *col->data().get(); + +#define INTEGER_CASE(IN_TYPE) \ + ConvertIntegerWithNulls(data, out_buffer); \ + break; + + switch (type) { + case Type::UINT8: + INTEGER_CASE(uint8_t); + case Type::INT8: + INTEGER_CASE(int8_t); + case Type::UINT16: + INTEGER_CASE(uint16_t); + case Type::INT16: + INTEGER_CASE(int16_t); + case Type::UINT32: + INTEGER_CASE(uint32_t); + case Type::INT32: + INTEGER_CASE(int32_t); + case Type::UINT64: + INTEGER_CASE(uint64_t); + case Type::INT64: + INTEGER_CASE(int64_t); + case Type::FLOAT: + ConvertNumericNullableCast(data, NAN, out_buffer); + break; + case Type::DOUBLE: + ConvertNumericNullable(data, NAN, out_buffer); + break; + default: + return Status::NotImplemented(col->type()->ToString()); } +#undef INTEGER_CASE + + placement_data_[current_placement_index_++] = placement; return Status::OK(); } +}; - // UTF8 strings - template - inline typename std::enable_if< - T2 == arrow::Type::STRING, Status>::type - ConvertValues(const std::shared_ptr& data) { - size_t chunk_offset = 0; - PyAcquireGIL lock; +class BoolBlock : public PandasBlock { + public: + using PandasBlock::PandasBlock; - RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + Status Allocate() override { return AllocateNDArray(NPY_BOOL); } - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto string_arr = static_cast(arr.get()); - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; - - const uint8_t* data_ptr; - int32_t length; - if (data->null_count() > 0) { - for (int64_t i = 0; i < arr->length(); ++i) { - if (string_arr->IsNull(i)) { - Py_INCREF(Py_None); - out_values[i] = Py_None; - } else { - data_ptr = string_arr->GetValue(i, &length); - - out_values[i] = make_pystring(data_ptr, length); - if (out_values[i] == nullptr) { - return Status::UnknownError("String initialization failed"); - } - } - } + Status WriteNext(const std::shared_ptr& col, int64_t placement) override { + Type::type type = col->type()->type; + + if (type != Type::BOOL) { return Status::NotImplemented(col->type()->ToString()); } + + uint8_t* out_buffer = + reinterpret_cast(block_data_) + current_placement_index_ * num_rows_; + + ConvertBooleanNoNulls(*col->data().get(), out_buffer); + placement_data_[current_placement_index_++] = placement; + return Status::OK(); + } +}; + +class DatetimeBlock : public PandasBlock { + public: + using PandasBlock::PandasBlock; + + Status Allocate() override { + RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME)); + + PyAcquireGIL lock; + auto date_dtype = reinterpret_cast( + PyArray_DESCR(reinterpret_cast(block_arr_.obj()))->c_metadata); + date_dtype->meta.base = NPY_FR_ns; + return Status::OK(); + } + + Status WriteNext(const std::shared_ptr& col, int64_t placement) override { + Type::type type = col->type()->type; + + int64_t* out_buffer = + reinterpret_cast(block_data_) + current_placement_index_ * num_rows_; + + const ChunkedArray& data = *col.get()->data(); + + if (type == Type::DATE) { + // DateType is millisecond timestamp stored as int64_t + // TODO(wesm): Do we want to make sure to zero out the milliseconds? + ConvertDatetimeNanos(data, out_buffer); + } else if (type == Type::TIMESTAMP) { + auto ts_type = static_cast(col->type().get()); + + if (ts_type->unit == arrow::TimeUnit::NANO) { + ConvertNumericNullable(data, kPandasTimestampNull, out_buffer); + } else if (ts_type->unit == arrow::TimeUnit::MICRO) { + ConvertDatetimeNanos(data, out_buffer); + } else if (ts_type->unit == arrow::TimeUnit::MILLI) { + ConvertDatetimeNanos(data, out_buffer); + } else if (ts_type->unit == arrow::TimeUnit::SECOND) { + ConvertDatetimeNanos(data, out_buffer); } else { - for (int64_t i = 0; i < arr->length(); ++i) { - data_ptr = string_arr->GetValue(i, &length); - out_values[i] = make_pystring(data_ptr, length); - if (out_values[i] == nullptr) { - return Status::UnknownError("String initialization failed"); - } - } + return Status::NotImplemented("Unsupported time unit"); } - - chunk_offset += string_arr->length(); + } else { + return Status::NotImplemented(col->type()->ToString()); } + placement_data_[current_placement_index_++] = placement; return Status::OK(); } +}; - template - inline typename std::enable_if< - T2 == arrow::Type::BINARY, Status>::type - ConvertValues(const std::shared_ptr& data) { - size_t chunk_offset = 0; - PyAcquireGIL lock; +// class CategoricalBlock : public PandasBlock {}; - RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); +Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, + std::shared_ptr* block) { +#define BLOCK_CASE(NAME, TYPE) \ + case PandasBlock::NAME: \ + *block = std::make_shared(num_rows, num_columns); \ + break; - for (int c = 0; c < data->num_chunks(); c++) { - const std::shared_ptr arr = data->chunk(c); - auto binary_arr = static_cast(arr.get()); - auto out_values = reinterpret_cast(PyArray_DATA(out_)) + chunk_offset; - - const uint8_t* data_ptr; - int32_t length; - if (data->null_count() > 0) { - for (int64_t i = 0; i < arr->length(); ++i) { - if (binary_arr->IsNull(i)) { - Py_INCREF(Py_None); - out_values[i] = Py_None; - } else { - data_ptr = binary_arr->GetValue(i, &length); - - out_values[i] = PyBytes_FromStringAndSize( - reinterpret_cast(data_ptr), length); - if (out_values[i] == nullptr) { - return Status::UnknownError("String initialization failed"); - } - } - } + switch (type) { + BLOCK_CASE(OBJECT, ObjectBlock); + BLOCK_CASE(UINT8, UInt8Block); + BLOCK_CASE(INT8, Int8Block); + BLOCK_CASE(UINT16, UInt16Block); + BLOCK_CASE(INT16, Int16Block); + BLOCK_CASE(UINT32, UInt32Block); + BLOCK_CASE(INT32, Int32Block); + BLOCK_CASE(UINT64, UInt64Block); + BLOCK_CASE(INT64, Int64Block); + BLOCK_CASE(FLOAT, Float32Block); + BLOCK_CASE(DOUBLE, Float64Block); + BLOCK_CASE(BOOL, BoolBlock); + BLOCK_CASE(DATETIME, DatetimeBlock); + case PandasBlock::CATEGORICAL: + return Status::NotImplemented("categorical"); + } + +#undef BLOCK_CASE + + return (*block)->Allocate(); +} + +// Construct the exact pandas 0.x "BlockManager" memory layout +// +// * For each column determine the correct output pandas type +// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output +// * Allocate block placement arrays +// * Write Arrow columns out into each slice of memory; populate block +// * placement arrays as we go +class DataFrameBlockCreator { + public: + DataFrameBlockCreator(const std::shared_ptr& table) : table_(table) {} + + Status Convert(int nthreads, PyObject** output) { + column_types_.resize(table_->num_columns()); + type_counts_.clear(); + blocks_.clear(); + + RETURN_NOT_OK(CountColumnTypes()); + RETURN_NOT_OK(CreateBlocks()); + RETURN_NOT_OK(WriteTableToBlocks(nthreads)); + + return GetResultList(output); + } + + Status CountColumnTypes() { + for (int i = 0; i < table_->num_columns(); ++i) { + std::shared_ptr col = table_->column(i); + PandasBlock::type output_type; + + switch (col->type()->type) { + case Type::BOOL: + output_type = col->null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL; + break; + case Type::UINT8: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8; + break; + case Type::INT8: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8; + break; + case Type::UINT16: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16; + break; + case Type::INT16: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16; + break; + case Type::UINT32: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32; + break; + case Type::INT32: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32; + break; + case Type::INT64: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64; + break; + case Type::UINT64: + output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64; + break; + case Type::FLOAT: + output_type = PandasBlock::FLOAT; + break; + case Type::DOUBLE: + output_type = PandasBlock::DOUBLE; + break; + case Type::STRING: + case Type::BINARY: + output_type = PandasBlock::OBJECT; + break; + case Type::DATE: + output_type = PandasBlock::DATETIME; + break; + case Type::TIMESTAMP: + output_type = PandasBlock::DATETIME; + break; + default: + return Status::NotImplemented(col->type()->ToString()); + } + + auto it = type_counts_.find(output_type); + if (it != type_counts_.end()) { + // Increment count + it->second += 1; } else { - for (int64_t i = 0; i < arr->length(); ++i) { - data_ptr = binary_arr->GetValue(i, &length); - out_values[i] = PyBytes_FromStringAndSize( - reinterpret_cast(data_ptr), length); - if (out_values[i] == nullptr) { - return Status::UnknownError("String initialization failed"); - } - } + // Add key to map + type_counts_[output_type] = 1; } - chunk_offset += binary_arr->length(); + column_types_[i] = output_type; } + return Status::OK(); + } + Status CreateBlocks() { + for (const auto& it : type_counts_) { + PandasBlock::type type = static_cast(it.first); + std::shared_ptr block; + RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block)); + blocks_[type] = block; + } return Status::OK(); } - private: - std::shared_ptr col_; - PyObject* py_ref_; - PyArrayObject* out_; -}; + Status WriteTableToBlocks(int nthreads) { + if (nthreads > 1) { + return Status::NotImplemented("multithreading not yet implemented"); + } -#define FROM_ARROW_CASE(TYPE) \ - case arrow::Type::TYPE: \ - { \ - ArrowDeserializer converter(col, py_ref); \ - return converter.Convert(out); \ - } \ - break; + for (int i = 0; i < table_->num_columns(); ++i) { + std::shared_ptr col = table_->column(i); + PandasBlock::type output_type = column_types_[i]; -Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, - PyObject** out) { - static std::string dummy_name = "dummy"; - auto field = std::make_shared(dummy_name, arr->type()); - auto col = std::make_shared(field, arr); - return ConvertColumnToPandas(col, py_ref, out); -} + auto it = blocks_.find(output_type); + if (it == blocks_.end()) { return Status::KeyError("No block allocated"); } + RETURN_NOT_OK(it->second->WriteNext(col, i)); + } + return Status::OK(); + } -Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, - PyObject** out) { - switch(col->type()->type) { - FROM_ARROW_CASE(BOOL); - FROM_ARROW_CASE(INT8); - FROM_ARROW_CASE(INT16); - FROM_ARROW_CASE(INT32); - FROM_ARROW_CASE(INT64); - FROM_ARROW_CASE(UINT8); - FROM_ARROW_CASE(UINT16); - FROM_ARROW_CASE(UINT32); - FROM_ARROW_CASE(UINT64); - FROM_ARROW_CASE(FLOAT); - FROM_ARROW_CASE(DOUBLE); - FROM_ARROW_CASE(BINARY); - FROM_ARROW_CASE(STRING); - FROM_ARROW_CASE(DATE); - FROM_ARROW_CASE(TIMESTAMP); - default: - return Status::NotImplemented("Arrow type reading not implemented"); + Status GetResultList(PyObject** out) { + auto num_blocks = static_cast(blocks_.size()); + PyObject* result = PyList_New(num_blocks); + RETURN_IF_PYERROR(); + + int i = 0; + for (const auto& it : blocks_) { + const std::shared_ptr block = it.second; + + PyObject* item = PyTuple_New(2); + RETURN_IF_PYERROR(); + + PyObject* block_arr = block->block_arr(); + PyObject* placement_arr = block->placement_arr(); + Py_INCREF(block_arr); + Py_INCREF(placement_arr); + PyTuple_SET_ITEM(item, 0, block_arr); + PyTuple_SET_ITEM(item, 1, placement_arr); + + if (PyList_SET_ITEM(result, i++, item) < 0) { RETURN_IF_PYERROR(); } + } + *out = result; + return Status::OK(); } - return Status::OK(); + + private: + std::shared_ptr
table_; + std::vector column_types_; + + // block type -> type count + std::unordered_map type_counts_; + + // block type -> block + std::unordered_map> blocks_; +}; + +Status ConvertTableToPandas( + const std::shared_ptr
& table, int nthreads, PyObject** out) { + DataFrameBlockCreator helper(table); + return helper.Convert(nthreads, out); } -} // namespace pyarrow +} // namespace pyarrow diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index 532495dd792..60dadd473ad 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -33,27 +33,42 @@ class Array; class Column; class MemoryPool; class Status; +class Table; -} // namespace arrow +} // namespace arrow namespace pyarrow { PYARROW_EXPORT -arrow::Status ConvertArrayToPandas(const std::shared_ptr& arr, - PyObject* py_ref, PyObject** out); +arrow::Status ConvertArrayToPandas( + const std::shared_ptr& arr, PyObject* py_ref, PyObject** out); PYARROW_EXPORT -arrow::Status ConvertColumnToPandas(const std::shared_ptr& col, - PyObject* py_ref, PyObject** out); +arrow::Status ConvertColumnToPandas( + const std::shared_ptr& col, PyObject* py_ref, PyObject** out); + +struct PandasOptions { + bool strings_to_categorical; +}; + +// Convert a whole table as efficiently as possible to a pandas.DataFrame. +// +// The returned Python object is a list of tuples consisting of the exact 2D +// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. +// +// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) +PYARROW_EXPORT +arrow::Status ConvertTableToPandas( + const std::shared_ptr& table, int nthreads, PyObject** out); PYARROW_EXPORT arrow::Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr* out); PYARROW_EXPORT -arrow::Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, - std::shared_ptr* out); +arrow::Status PandasToArrow( + arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr* out); -} // namespace pyarrow +} // namespace pyarrow -#endif // PYARROW_ADAPTERS_PANDAS_H +#endif // PYARROW_ADAPTERS_PANDAS_H diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h index 6dbbc45d40c..f65cc097f54 100644 --- a/python/src/pyarrow/api.h +++ b/python/src/pyarrow/api.h @@ -23,4 +23,4 @@ #include "pyarrow/adapters/builtin.h" #include "pyarrow/adapters/pandas.h" -#endif // PYARROW_API_H +#endif // PYARROW_API_H diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc index fb4d3496ac7..8660ac8f0ce 100644 --- a/python/src/pyarrow/common.cc +++ b/python/src/pyarrow/common.cc @@ -73,7 +73,7 @@ arrow::MemoryPool* get_memory_pool() { PyBytesBuffer::PyBytesBuffer(PyObject* obj) : Buffer(reinterpret_cast(PyBytes_AS_STRING(obj)), - PyBytes_GET_SIZE(obj)), + PyBytes_GET_SIZE(obj)), obj_(obj) { Py_INCREF(obj_); } @@ -83,4 +83,4 @@ PyBytesBuffer::~PyBytesBuffer() { Py_DECREF(obj_); } -} // namespace pyarrow +} // namespace pyarrow diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index 7e3382634a7..639918d309f 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -24,7 +24,9 @@ #include "arrow/buffer.h" #include "arrow/util/macros.h" -namespace arrow { class MemoryPool; } +namespace arrow { +class MemoryPool; +} namespace pyarrow { @@ -34,27 +36,18 @@ class OwnedRef { public: OwnedRef() : obj_(nullptr) {} - OwnedRef(PyObject* obj) : - obj_(obj) {} + OwnedRef(PyObject* obj) : obj_(obj) {} - ~OwnedRef() { - Py_XDECREF(obj_); - } + ~OwnedRef() { Py_XDECREF(obj_); } void reset(PyObject* obj) { - if (obj_ != nullptr) { - Py_XDECREF(obj_); - } + if (obj_ != nullptr) { Py_XDECREF(obj_); } obj_ = obj; } - void release() { - obj_ = nullptr; - } + void release() { obj_ = nullptr; } - PyObject* obj() const{ - return obj_; - } + PyObject* obj() const { return obj_; } private: PyObject* obj_; @@ -78,13 +71,10 @@ struct PyObjectStringify { class PyGILGuard { public: - PyGILGuard() { - state_ = PyGILState_Ensure(); - } + PyGILGuard() { state_ = PyGILState_Ensure(); } + + ~PyGILGuard() { PyGILState_Release(state_); } - ~PyGILGuard() { - PyGILState_Release(state_); - } private: PyGILState_STATE state_; DISALLOW_COPY_AND_ASSIGN(PyGILGuard); @@ -108,8 +98,7 @@ PYARROW_EXPORT arrow::MemoryPool* get_memory_pool(); class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { public: - NumPyBuffer(PyArrayObject* arr) - : Buffer(nullptr, 0) { + NumPyBuffer(PyArrayObject* arr) : Buffer(nullptr, 0) { arr_ = arr; Py_INCREF(arr); @@ -118,9 +107,7 @@ class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { capacity_ = size_; } - virtual ~NumPyBuffer() { - Py_XDECREF(arr_); - } + virtual ~NumPyBuffer() { Py_XDECREF(arr_); } private: PyArrayObject* arr_; @@ -135,22 +122,17 @@ class PYARROW_EXPORT PyBytesBuffer : public arrow::Buffer { PyObject* obj_; }; - class PyAcquireGIL { public: - PyAcquireGIL() { - state_ = PyGILState_Ensure(); - } + PyAcquireGIL() { state_ = PyGILState_Ensure(); } - ~PyAcquireGIL() { - PyGILState_Release(state_); - } + ~PyAcquireGIL() { PyGILState_Release(state_); } private: PyGILState_STATE state_; DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL); }; -} // namespace pyarrow +} // namespace pyarrow -#endif // PYARROW_COMMON_H +#endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/config.cc b/python/src/pyarrow/config.cc index 730d2db99a5..e1002bf4fd1 100644 --- a/python/src/pyarrow/config.cc +++ b/python/src/pyarrow/config.cc @@ -21,8 +21,7 @@ namespace pyarrow { -void pyarrow_init() { -} +void pyarrow_init() {} PyObject* numpy_nan = nullptr; @@ -31,4 +30,4 @@ void pyarrow_set_numpy_nan(PyObject* obj) { numpy_nan = obj; } -} // namespace pyarrow +} // namespace pyarrow diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h index 82936b1a5f3..386ee4b1e25 100644 --- a/python/src/pyarrow/config.h +++ b/python/src/pyarrow/config.h @@ -24,7 +24,7 @@ #include "pyarrow/visibility.h" #if PY_MAJOR_VERSION >= 3 - #define PyString_Check PyUnicode_Check +#define PyString_Check PyUnicode_Check #endif namespace pyarrow { @@ -38,6 +38,6 @@ void pyarrow_init(); PYARROW_EXPORT void pyarrow_set_numpy_nan(PyObject* obj); -} // namespace pyarrow +} // namespace pyarrow -#endif // PYARROW_CONFIG_H +#endif // PYARROW_CONFIG_H diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index b42199c8e04..3f650326e09 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -23,36 +23,35 @@ using namespace arrow; namespace pyarrow { - -#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \ - case Type::NAME: \ - return FACTORY(); \ +#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \ + case Type::NAME: \ + return FACTORY(); \ break; std::shared_ptr GetPrimitiveType(Type::type type) { switch (type) { case Type::NA: return null(); - GET_PRIMITIVE_TYPE(UINT8, uint8); - GET_PRIMITIVE_TYPE(INT8, int8); - GET_PRIMITIVE_TYPE(UINT16, uint16); - GET_PRIMITIVE_TYPE(INT16, int16); - GET_PRIMITIVE_TYPE(UINT32, uint32); - GET_PRIMITIVE_TYPE(INT32, int32); - GET_PRIMITIVE_TYPE(UINT64, uint64); - GET_PRIMITIVE_TYPE(INT64, int64); - GET_PRIMITIVE_TYPE(DATE, date); + GET_PRIMITIVE_TYPE(UINT8, uint8); + GET_PRIMITIVE_TYPE(INT8, int8); + GET_PRIMITIVE_TYPE(UINT16, uint16); + GET_PRIMITIVE_TYPE(INT16, int16); + GET_PRIMITIVE_TYPE(UINT32, uint32); + GET_PRIMITIVE_TYPE(INT32, int32); + GET_PRIMITIVE_TYPE(UINT64, uint64); + GET_PRIMITIVE_TYPE(INT64, int64); + GET_PRIMITIVE_TYPE(DATE, date); case Type::TIMESTAMP: return arrow::timestamp(arrow::TimeUnit::MICRO); break; - GET_PRIMITIVE_TYPE(BOOL, boolean); - GET_PRIMITIVE_TYPE(FLOAT, float32); - GET_PRIMITIVE_TYPE(DOUBLE, float64); - GET_PRIMITIVE_TYPE(BINARY, binary); - GET_PRIMITIVE_TYPE(STRING, utf8); + GET_PRIMITIVE_TYPE(BOOL, boolean); + GET_PRIMITIVE_TYPE(FLOAT, float32); + GET_PRIMITIVE_TYPE(DOUBLE, float64); + GET_PRIMITIVE_TYPE(BINARY, binary); + GET_PRIMITIVE_TYPE(STRING, utf8); default: return nullptr; } } -} // namespace pyarrow +} // namespace pyarrow diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index 8334d974c02..788c3eedddf 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -31,6 +31,6 @@ using arrow::Type; PYARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); -} // namespace pyarrow +} // namespace pyarrow -#endif // PYARROW_HELPERS_H +#endif // PYARROW_HELPERS_H diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index 12f5ba0bf2b..ac1aa635b40 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -33,8 +33,7 @@ namespace pyarrow { // ---------------------------------------------------------------------- // Python file -PythonFile::PythonFile(PyObject* file) - : file_(file) { +PythonFile::PythonFile(PyObject* file) : file_(file) { Py_INCREF(file_); } @@ -81,8 +80,8 @@ Status PythonFile::Read(int64_t nbytes, PyObject** out) { } Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { - PyObject* py_data = PyBytes_FromStringAndSize( - reinterpret_cast(data), nbytes); + PyObject* py_data = + PyBytes_FromStringAndSize(reinterpret_cast(data), nbytes); ARROW_RETURN_NOT_OK(CheckPyError()); PyObject* result = PyObject_CallMethod(file_, "write", "(O)", py_data); @@ -102,7 +101,7 @@ Status PythonFile::Tell(int64_t* position) { // PyLong_AsLongLong can raise OverflowError ARROW_RETURN_NOT_OK(CheckPyError()); - return Status::OK(); + return Status::OK(); } // ---------------------------------------------------------------------- @@ -156,7 +155,8 @@ Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr* out) Status PyReadableFile::GetSize(int64_t* size) { PyGILGuard lock; - int64_t current_position;; + int64_t current_position; + ; ARROW_RETURN_NOT_OK(file_->Tell(¤t_position)); ARROW_RETURN_NOT_OK(file_->Seek(0, 2)); @@ -204,7 +204,7 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { PyBytesReader::PyBytesReader(PyObject* obj) : arrow::io::BufferReader(reinterpret_cast(PyBytes_AS_STRING(obj)), - PyBytes_GET_SIZE(obj)), + PyBytes_GET_SIZE(obj)), obj_(obj) { Py_INCREF(obj_); } diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h index e14aa8cfb27..fd3e7c08872 100644 --- a/python/src/pyarrow/io.h +++ b/python/src/pyarrow/io.h @@ -24,7 +24,9 @@ #include "pyarrow/config.h" #include "pyarrow/visibility.h" -namespace arrow { class MemoryPool; } +namespace arrow { +class MemoryPool; +} namespace pyarrow { @@ -92,6 +94,6 @@ class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader { // TODO(wesm): seekable output files -} // namespace pyarrow +} // namespace pyarrow #endif // PYARROW_IO_H diff --git a/python/src/pyarrow/numpy_interop.h b/python/src/pyarrow/numpy_interop.h index 882d287c7c5..6326527a674 100644 --- a/python/src/pyarrow/numpy_interop.h +++ b/python/src/pyarrow/numpy_interop.h @@ -53,6 +53,6 @@ inline int import_numpy() { return 0; } -} // namespace pyarrow +} // namespace pyarrow -#endif // PYARROW_NUMPY_INTEROP_H +#endif // PYARROW_NUMPY_INTEROP_H diff --git a/python/src/pyarrow/util/datetime.h b/python/src/pyarrow/util/datetime.h index b67accc388f..9ffa6910524 100644 --- a/python/src/pyarrow/util/datetime.h +++ b/python/src/pyarrow/util/datetime.h @@ -22,8 +22,8 @@ #include namespace pyarrow { - -inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { + +inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { struct tm date = {0}; date.tm_year = PyDateTime_GET_YEAR(pydate) - 1900; date.tm_mon = PyDateTime_GET_MONTH(pydate) - 1; @@ -35,6 +35,6 @@ inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { return lrint(difftime(mktime(&date), mktime(&epoch)) * 1000); } -} // namespace pyarrow +} // namespace pyarrow -#endif // PYARROW_UTIL_DATETIME_H +#endif // PYARROW_UTIL_DATETIME_H diff --git a/python/src/pyarrow/util/test_main.cc b/python/src/pyarrow/util/test_main.cc index 00139f36742..6fb7c0536ee 100644 --- a/python/src/pyarrow/util/test_main.cc +++ b/python/src/pyarrow/util/test_main.cc @@ -17,7 +17,7 @@ #include -int main(int argc, char **argv) { +int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); int ret = RUN_ALL_TESTS();