diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index 84aad82e2a9..7938d8473b6 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -57,7 +57,7 @@ set(ARROW_PYTHON_SRCS
   init.cc
   io.cc
   numpy_convert.cc
-  pandas_to_arrow.cc
+  numpy_to_arrow.cc
   python_to_arrow.cc
   pyarrow.cc
 )
@@ -100,7 +100,7 @@ install(FILES
   io.h
   numpy_convert.h
   numpy_interop.h
-  pandas_to_arrow.h
+  numpy_to_arrow.h
   python_to_arrow.h
   platform.h
   pyarrow.h
diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h
index 4ceb3f1a45d..a000ac5fa5a 100644
--- a/cpp/src/arrow/python/api.h
+++ b/cpp/src/arrow/python/api.h
@@ -25,7 +25,7 @@
 #include "arrow/python/helpers.h"
 #include "arrow/python/io.h"
 #include "arrow/python/numpy_convert.h"
-#include "arrow/python/pandas_to_arrow.h"
+#include "arrow/python/numpy_to_arrow.h"
 #include "arrow/python/python_to_arrow.h"

 #endif  // ARROW_PYTHON_API_H
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 747b872af0a..f9d7361e004 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -20,6 +20,7 @@
 #include <Python.h>
 #include <datetime.h>

+#include <limits>
 #include <sstream>
 #include <string>
@@ -359,7 +360,11 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
     if (PySequence_Check(obj)) {
       for (int64_t i = 0; i < size; ++i) {
         OwnedRef ref(PySequence_GetItem(obj, i));
-        RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        if (ref.obj() == Py_None) {
+          RETURN_NOT_OK(this->typed_builder_->AppendNull());
+        } else {
+          RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        }
       }
     } else if (PyObject_HasAttrString(obj, "__iter__")) {
       PyObject* iter = PyObject_GetIter(obj);
@@ -370,7 +375,11 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
       // consuming at size.
       while ((item = PyIter_Next(iter)) && i < size) {
         OwnedRef ref(item);
-        RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        if (ref.obj() == Py_None) {
+          RETURN_NOT_OK(this->typed_builder_->AppendNull());
+        } else {
+          RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        }
         ++i;
       }
       if (size != i) {
@@ -388,52 +397,136 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {

 class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      return Status::Invalid("NullConverter: passed non-None value");
-    }
+    return Status::Invalid("NullConverter: passed non-None value");
   }
 };

 class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      if (item.obj() == Py_True) {
-        return typed_builder_->Append(true);
-      } else {
-        return typed_builder_->Append(false);
-      }
+    return typed_builder_->Append(item.obj() == Py_True);
+  }
+};
+
+class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int8_t>::max() ||
+                            val < std::numeric_limits<int8_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
     }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<int8_t>(val));
+  }
+};
+
+class Int16Converter : public TypedConverterVisitor<Int16Builder, Int16Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int16_t>::max() ||
+                            val < std::numeric_limits<int16_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
+    }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<int16_t>(val));
+  }
+};
+
+class Int32Converter : public TypedConverterVisitor<Int32Builder, Int32Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int32_t>::max() ||
+                            val < std::numeric_limits<int32_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
+    }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<int32_t>(val));
   }
 };

 class Int64Converter : public TypedConverterVisitor<Int64Builder, Int64Converter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    int64_t val;
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
-      RETURN_IF_PYERROR();
-      return typed_builder_->Append(val);
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(val);
+  }
+};
+
+class UInt8Converter : public TypedConverterVisitor<UInt8Builder, UInt8Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint8_t>::max() ||
+                            val < std::numeric_limits<uint8_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
     }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<uint8_t>(val));
   }
 };

-class DateConverter : public TypedConverterVisitor<Date64Builder, DateConverter> {
+class UInt16Converter : public TypedConverterVisitor<UInt16Builder, UInt16Converter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
-      return typed_builder_->Append(PyDate_to_ms(pydate));
+    uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint16_t>::max() ||
+                            val < std::numeric_limits<uint16_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
     }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<uint16_t>(val));
+  }
+};
+
+class UInt32Converter : public TypedConverterVisitor<UInt32Builder, UInt32Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint32_t>::max() ||
+                            val < std::numeric_limits<uint32_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
+    }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<uint32_t>(val));
+  }
+};
+
+class UInt64Converter : public TypedConverterVisitor<UInt64Builder, UInt64Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(val);
+  }
+};
+
+class DateConverter : public TypedConverterVisitor<Date64Builder, DateConverter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    auto pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
+    return typed_builder_->Append(PyDate_to_ms(pydate));
   }
 };

@@ -441,27 +534,17 @@ class TimestampConverter
     : public TypedConverterVisitor<TimestampBuilder, TimestampConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      PyDateTime_DateTime* pydatetime =
-          reinterpret_cast<PyDateTime_DateTime*>(item.obj());
-      return typed_builder_->Append(PyDateTime_to_us(pydatetime));
-    }
+    auto pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj());
+    return typed_builder_->Append(PyDateTime_to_us(pydatetime));
   }
 };

 class DoubleConverter : public TypedConverterVisitor<DoubleBuilder, DoubleConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    double val;
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      val = PyFloat_AsDouble(item.obj());
-      RETURN_IF_PYERROR();
-      return typed_builder_->Append(val);
-    }
+    double val = PyFloat_AsDouble(item.obj());
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(val);
   }
 };

@@ -473,10 +556,7 @@ class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverter> {
     OwnedRef tmp;
     Py_ssize_t length;
     PyObject* obj = item.obj();
-    if (item.obj() == Py_None) {
-      RETURN_NOT_OK(typed_builder_->AppendNull());
-      return Status::OK();
-    } else if (PyUnicode_Check(item.obj())) {
+    if (PyUnicode_Check(item.obj())) {
       tmp.reset(PyUnicode_AsUTF8String(item.obj()));
       RETURN_IF_PYERROR();
       bytes_obj = tmp.obj();
@@ -504,10 +584,7 @@ class FixedWidthBytesConverter
     Py_ssize_t expected_length =
         std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type())
             ->byte_width();
-    if (item.obj() == Py_None) {
-      RETURN_NOT_OK(typed_builder_->AppendNull());
-      return Status::OK();
-    } else if (PyUnicode_Check(item.obj())) {
+    if (PyUnicode_Check(item.obj())) {
       tmp.reset(PyUnicode_AsUTF8String(item.obj()));
       RETURN_IF_PYERROR();
       bytes_obj = tmp.obj();
@@ -535,9 +612,7 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter> {
     Py_ssize_t length;
     PyObject* obj = item.obj();
-    if (obj == Py_None) {
-      return typed_builder_->AppendNull();
-    } else if (PyBytes_Check(obj)) {
+    if (PyBytes_Check(obj)) {
       tmp.reset(
           PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj)));
       RETURN_IF_PYERROR();
@@ -565,14 +640,10 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> {
   Status Init(ArrayBuilder* builder) override;

   inline Status AppendItem(const OwnedRef& item) override {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      RETURN_NOT_OK(typed_builder_->Append());
-      PyObject* item_obj = item.obj();
-      int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj));
-      return value_converter_->AppendData(item_obj, list_size);
-    }
+    RETURN_NOT_OK(typed_builder_->Append());
+    PyObject* item_obj = item.obj();
+    int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj));
+    return value_converter_->AppendData(item_obj, list_size);
   }

  protected:
@@ -584,16 +655,12 @@ class DecimalConverter
  public:
   inline Status AppendItem(const OwnedRef& item) {
     /// TODO(phillipc): Check for nan?
-    if (item.obj() != Py_None) {
-      std::string string;
-      RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string));
-
-      Decimal128 value;
-      RETURN_NOT_OK(Decimal128::FromString(string, &value));
-      return typed_builder_->Append(value);
-    }
+    std::string string;
+    RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string));

-    return typed_builder_->AppendNull();
+    Decimal128 value;
+    RETURN_NOT_OK(Decimal128::FromString(string, &value));
+    return typed_builder_->Append(value);
   }
 };

@@ -604,8 +671,22 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type) {
     return std::make_shared<NullConverter>();
   case Type::BOOL:
     return std::make_shared<BoolConverter>();
+  case Type::INT8:
+    return std::make_shared<Int8Converter>();
+  case Type::INT16:
+    return std::make_shared<Int16Converter>();
+  case Type::INT32:
+    return std::make_shared<Int32Converter>();
   case Type::INT64:
     return std::make_shared<Int64Converter>();
+  case Type::UINT8:
+    return std::make_shared<UInt8Converter>();
+  case Type::UINT16:
+    return std::make_shared<UInt16Converter>();
+  case Type::UINT32:
+    return std::make_shared<UInt32Converter>();
+  case Type::UINT64:
+    return std::make_shared<UInt64Converter>();
   case Type::DATE64:
     return std::make_shared<DateConverter>();
   case Type::TIMESTAMP:
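Note on the converter changes above: None handling now lives once in AppendData, and the new sized integer converters range-check before appending. A minimal sketch of the resulting Python-level behavior (assuming a build with this patch applied; the error text comes from the Status::Invalid message above):

    import pyarrow as pa

    # In-range values convert; None is handled in AppendData, so every
    # typed converter gets nulls for free.
    arr = pa.array([1, None, 127], type=pa.int8())
    assert arr.null_count == 1

    # Out-of-range values now raise instead of silently truncating.
    try:
        pa.array([1, 2, 300], type=pa.int8())
    except pa.ArrowInvalid:
        pass  # "Cannot coerce values to array type that would lose data"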
diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
similarity index 90%
rename from cpp/src/arrow/python/pandas_to_arrow.cc
rename to cpp/src/arrow/python/numpy_to_arrow.cc
index dc5b67f53e4..7151c94c0db 100644
--- a/cpp/src/arrow/python/pandas_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -19,10 +19,9 @@

 #define ARROW_NO_DEFAULT_MEMORY_POOL

+#include "arrow/python/numpy_to_arrow.h"
 #include "arrow/python/numpy_interop.h"

-#include "arrow/python/pandas_to_arrow.h"
-
 #include <cmath>
 #include <cstdint>
 #include <limits>
@@ -60,10 +59,14 @@ namespace py {

 using internal::NumPyTypeSize;

+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max();
+
 // ----------------------------------------------------------------------
 // Conversion utilities

-static inline bool PyFloat_isnan(const PyObject* obj) {
+namespace {
+
+inline bool PyFloat_isnan(const PyObject* obj) {
   if (PyFloat_Check(obj)) {
     double val = PyFloat_AS_DOUBLE(obj);
     return val != val;
@@ -71,11 +74,12 @@ static inline bool PyFloat_isnan(const PyObject* obj) {
     return false;
   }
 }
-static inline bool PandasObjectIsNull(const PyObject* obj) {
+
+inline bool PandasObjectIsNull(const PyObject* obj) {
   return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj);
 }

-static inline bool PyObject_is_string(const PyObject* obj) {
+inline bool PyObject_is_string(const PyObject* obj) {
 #if PY_MAJOR_VERSION >= 3
   return PyUnicode_Check(obj) || PyBytes_Check(obj);
 #else
@@ -83,14 +87,14 @@ static inline bool PyObject_is_string(const PyObject* obj) {
 #endif
 }

-static inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); }
+inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); }

-static inline bool PyObject_is_integer(const PyObject* obj) {
+inline bool PyObject_is_integer(const PyObject* obj) {
   return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj);
 }

 template <int TYPE>
-static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
+inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   typedef internal::npy_traits<TYPE> traits;
   typedef typename traits::value_type T;

@@ -109,7 +113,7 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
 }

 // Returns null count
-static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
+int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
   int64_t null_count = 0;

   Ndarray1DIndexer<uint8_t> mask_values(mask);
@@ -123,29 +127,6 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap
   return null_count;
 }

-template <int TYPE, typename BuilderType>
-static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) {
-  typedef internal::npy_traits<TYPE> traits;
-  typedef typename traits::value_type T;
-
-  // TODO(wesm): Vector append when not strided
-  Ndarray1DIndexer<T> values(array);
-  if (traits::supports_nulls) {
-    for (int64_t i = 0; i < values.size(); ++i) {
-      if (traits::isnull(values[i])) {
-        RETURN_NOT_OK(builder->AppendNull());
-      } else {
-        RETURN_NOT_OK(builder->Append(values[i]));
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < values.size(); ++i) {
-      RETURN_NOT_OK(builder->Append(values[i]));
-    }
-  }
-  return Status::OK();
-}
-
 Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
   if (PyArray_NDIM(numpy_array) != 1) {
     return Status::Invalid("only handle 1-dimensional arrays");
@@ -162,7 +143,7 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
   return Status::OK();
 }

-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max();
+}  // namespace

 /// Append as many string objects from NumPy arrays to a `StringBuilder` as we
 /// can fit
@@ -272,14 +253,15 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas

 // ----------------------------------------------------------------------
 // Conversion from NumPy-in-Pandas to Arrow

-class PandasConverter {
+class NumPyConverter {
  public:
-  PandasConverter(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                  const std::shared_ptr<DataType>& type)
+  NumPyConverter(MemoryPool* pool, PyObject* ao, PyObject* mo,
+                 const std::shared_ptr<DataType>& type, bool use_pandas_null_sentinels)
       : pool_(pool),
         type_(type),
         arr_(reinterpret_cast<PyArrayObject*>(ao)),
-        mask_(nullptr) {
+        mask_(nullptr),
+        use_pandas_null_sentinels_(use_pandas_null_sentinels) {
     if (mo != nullptr && mo != Py_None) {
       mask_ = reinterpret_cast<PyArrayObject*>(mo);
     }
@@ -291,6 +273,39 @@ class PandasConverter {
     return astrides[0] != PyArray_DESCR(arr_)->elsize;
   }

+  Status Convert();
+
+  const std::vector<std::shared_ptr<Array>>& result() const { return out_arrays_; }
+
+  template <typename T>
+  typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
+                              std::is_same<BooleanType, T>::value,
+                          Status>::type
+  Visit(const T& type) {
+    return VisitNative<T>();
+  }
+
+  Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
+  Status Visit(const Date64Type& type) { return VisitNative<Date64Type>(); }
+  Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
+  Status Visit(const Time32Type& type) { return VisitNative<Time32Type>(); }
+  Status Visit(const Time64Type& type) { return VisitNative<Time64Type>(); }
+
+  Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const FixedSizeBinaryType& type) {
+    return TypeNotImplemented(type.ToString());
+  }
+
+  Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); }
+
+ protected:
   Status InitNullBitmap() {
     int64_t null_bytes = BitUtil::BytesForBits(length_);
@@ -317,6 +332,32 @@ class PandasConverter {
     return Status::OK();
   }

+  template <int TYPE, typename BuilderType>
+  Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) {
+    typedef internal::npy_traits<TYPE> traits;
+    typedef typename traits::value_type T;
+
+    const bool null_sentinels_possible =
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    // TODO(wesm): Vector append when not strided
+    Ndarray1DIndexer<T> values(array);
+    if (null_sentinels_possible) {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        if (traits::isnull(values[i])) {
+          RETURN_NOT_OK(builder->AppendNull());
+        } else {
+          RETURN_NOT_OK(builder->Append(values[i]));
+        }
+      }
+    } else {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        RETURN_NOT_OK(builder->Append(values[i]));
+      }
+    }
+    return Status::OK();
+  }
+
   Status PushArray(const std::shared_ptr<ArrayData>& data) {
     std::shared_ptr<Array> result;
     RETURN_NOT_OK(MakeArray(data, &result));
@@ -328,7 +369,10 @@ class PandasConverter {
   Status VisitNative() {
     using traits = internal::arrow_traits<TYPE>;

-    if (mask_ != nullptr || traits::supports_nulls) {
+    const bool null_sentinels_possible =
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    if (mask_ != nullptr || null_sentinels_possible) {
       RETURN_NOT_OK(InitNullBitmap());
     }

@@ -338,7 +382,7 @@ class PandasConverter {
     int64_t null_count = 0;
     if (mask_ != nullptr) {
       null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
-    } else if (traits::supports_nulls) {
+    } else if (null_sentinels_possible) {
       // TODO(wesm): this presumes the NumPy C type and arrow C type are the
       // same
       null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
@@ -350,58 +394,17 @@ class PandasConverter {
     return PushArray(arr_data);
   }

-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
-                              std::is_same<BooleanType, T>::value,
-                          Status>::type
-  Visit(const T& type) {
-    return VisitNative<T>();
-  }
-
-  Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
-  Status Visit(const Date64Type& type) { return VisitNative<Date64Type>(); }
-  Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
-  Status Visit(const Time32Type& type) { return VisitNative<Time32Type>(); }
-  Status Visit(const Time64Type& type) { return VisitNative<Time64Type>(); }
-
   Status TypeNotImplemented(std::string type_name) {
     std::stringstream ss;
-    ss << "PandasConverter doesn't implement <" << type_name << "> conversion. ";
+    ss << "NumPyConverter doesn't implement <" << type_name << "> conversion. ";
     return Status::NotImplemented(ss.str());
   }

-  Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const FixedSizeBinaryType& type) {
-    return TypeNotImplemented(type.ToString());
-  }
-
-  Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Convert() {
-    if (PyArray_NDIM(arr_) != 1) {
-      return Status::Invalid("only handle 1-dimensional arrays");
-    }
-
-    if (type_ == nullptr) {
-      return Status::Invalid("Must pass data type");
-    }
-
-    // Visit the type to perform conversion
-    return VisitTypeInline(*type_, this);
-  }
-
-  const std::vector<std::shared_ptr<Array>>& result() const { return out_arrays_; }
-
   // ----------------------------------------------------------------------
   // Conversion logic for various object dtype arrays

+  Status ConvertObjects();
+
   template <int ITEM_TYPE, typename ArrowType>
   Status ConvertTypedLists(const std::shared_ptr<DataType>& type, ListBuilder* builder,
                            PyObject* list);
@@ -419,17 +422,17 @@ class PandasConverter {
                       PyObject* list);
   Status ConvertDecimals();
   Status ConvertTimes();
-  Status ConvertObjects();
   Status ConvertObjectsInfer();
   Status ConvertObjectsInferAndCast();

- protected:
   MemoryPool* pool_;
   std::shared_ptr<DataType> type_;
   PyArrayObject* arr_;
   PyArrayObject* mask_;
   int64_t length_;

+  bool use_pandas_null_sentinels_;
+
   // Used in visitor pattern
   std::vector<std::shared_ptr<Array>> out_arrays_;

@@ -437,6 +440,23 @@ class PandasConverter {
   uint8_t* null_bitmap_data_;
 };

+Status NumPyConverter::Convert() {
+  if (PyArray_NDIM(arr_) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  if (PyArray_DESCR(arr_)->type_num == NPY_OBJECT) {
+    return ConvertObjects();
+  }
+
+  if (type_ == nullptr) {
+    return Status::Invalid("Must pass data type for non-object arrays");
+  }
+
+  // Visit the type to perform conversion
+  return VisitTypeInline(*type_, this);
+}
+
 template <typename T, typename T2>
 void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) {
   // Passing input_data as non-const is a concession to PyObject*
@@ -482,7 +502,7 @@ static Status CastBuffer(const std::shared_ptr<Buffer>& input, const int64_t len
 }

 template <typename ArrowType>
-inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
   using traits = internal::arrow_traits<ArrowType::type_id>;
   using T = typename traits::T;

@@ -513,7 +533,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
 }

 template <>
-inline Status PandasConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
+inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
   // Handle LONGLONG->INT64 and other fun things
   int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
   int type_size = NumPyTypeSize(type_num_compat);
@@ -552,7 +572,7 @@ inline Status PandasConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>*
 }

 template <>
-inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
+inline Status NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
   int64_t nbytes = BitUtil::BytesForBits(length_);
   auto buffer = std::make_shared<PoolBuffer>(pool_);
   RETURN_NOT_OK(buffer->Resize(nbytes));
@@ -590,7 +610,7 @@ struct UnboxDate {
 };

 template <typename ArrowType>
-Status PandasConverter::ConvertDates() {
+Status NumPyConverter::ConvertDates() {
   PyAcquireGIL lock;

   using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
@@ -626,7 +646,7 @@ Status PandasConverter::ConvertDates() {
   return PushBuilderResult(&builder);
 }

-Status PandasConverter::ConvertDecimals() {
+Status NumPyConverter::ConvertDecimals() {
   PyAcquireGIL lock;

   // Import the decimal module and Decimal class
@@ -669,7 +689,7 @@ Status PandasConverter::ConvertDecimals() {
   return PushBuilderResult(&builder);
 }

-Status PandasConverter::ConvertTimes() {
+Status NumPyConverter::ConvertTimes() {
   // Convert array of datetime.time objects to Arrow
   PyAcquireGIL lock;
   PyDateTime_IMPORT;
@@ -697,7 +717,7 @@ Status PandasConverter::ConvertTimes() {
   return PushBuilderResult(&builder);
 }

-Status PandasConverter::ConvertObjectStrings() {
+Status NumPyConverter::ConvertObjectStrings() {
   PyAcquireGIL lock;

   // The output type at this point is inconclusive because there may be bytes
@@ -729,7 +749,7 @@ Status PandasConverter::ConvertObjectStrings() {
   return Status::OK();
 }

-Status PandasConverter::ConvertObjectFloats() {
+Status NumPyConverter::ConvertObjectFloats() {
   PyAcquireGIL lock;

   Ndarray1DIndexer<PyObject*> objects(arr_);
@@ -764,7 +784,7 @@ Status PandasConverter::ConvertObjectFloats() {
   return PushBuilderResult(&builder);
 }

-Status PandasConverter::ConvertObjectIntegers() {
+Status NumPyConverter::ConvertObjectIntegers() {
   PyAcquireGIL lock;

   Int64Builder builder(pool_);
@@ -799,7 +819,7 @@ Status PandasConverter::ConvertObjectIntegers() {
   return PushBuilderResult(&builder);
 }

-Status PandasConverter::ConvertObjectFixedWidthBytes(
+Status NumPyConverter::ConvertObjectFixedWidthBytes(
     const std::shared_ptr<DataType>& type) {
   PyAcquireGIL lock;

@@ -822,7 +842,7 @@ Status PandasConverter::ConvertObjectFixedWidthBytes(
   return Status::OK();
 }

-Status PandasConverter::ConvertBooleans() {
+Status NumPyConverter::ConvertBooleans() {
   PyAcquireGIL lock;

   Ndarray1DIndexer<PyObject*> objects(arr_);
@@ -864,7 +884,7 @@ Status PandasConverter::ConvertBooleans() {
   return Status::OK();
 }

-Status PandasConverter::ConvertObjectsInfer() {
+Status NumPyConverter::ConvertObjectsInfer() {
   Ndarray1DIndexer<PyObject*> objects;

   PyAcquireGIL lock;
@@ -912,7 +932,7 @@ Status PandasConverter::ConvertObjectsInfer() {
   return Status::OK();
 }

-Status PandasConverter::ConvertObjectsInferAndCast() {
+Status NumPyConverter::ConvertObjectsInferAndCast() {
   size_t position = out_arrays_.size();
   RETURN_NOT_OK(ConvertObjectsInfer());

@@ -932,7 +952,7 @@ Status PandasConverter::ConvertObjectsInferAndCast() {
   return Status::OK();
 }

-Status PandasConverter::ConvertObjects() {
+Status NumPyConverter::ConvertObjects() {
   // Python object arrays are annoying, since we could have one of:
   //
   // * Strings
@@ -1005,8 +1025,8 @@ Status LoopPySequence(PyObject* sequence, T func) {
 }

 template <int ITEM_TYPE, typename ArrowType>
-inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
-                                                 ListBuilder* builder, PyObject* list) {
+inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
+                                                ListBuilder* builder, PyObject* list) {
   typedef internal::npy_traits<ITEM_TYPE> traits;
   typedef typename traits::BuilderClass BuilderT;

@@ -1050,7 +1070,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
 }

 template <>
-inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
+inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
     const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
   PyAcquireGIL lock;

@@ -1091,7 +1111,7 @@ inline Status PandasConverter::ConvertTypedLists(
 }

 template <>
-inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
+inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
     const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
   PyAcquireGIL lock;
   // TODO: If there are bytes involed, convert to Binary representation
@@ -1145,8 +1165,8 @@ inline Status PandasConverter::ConvertTypedLists(
     return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type, builder, list); \
   }

-Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type,
-                                     ListBuilder* builder, PyObject* list) {
+Status NumPyConverter::ConvertLists(const std::shared_ptr<DataType>& type,
+                                    ListBuilder* builder, PyObject* list) {
   switch (type->id()) {
     LIST_CASE(NA, NPY_OBJECT, NullType)
     LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
@@ -1185,7 +1205,7 @@ Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type,
   }
 }

-Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
+Status NumPyConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
   std::unique_ptr<ArrayBuilder> array_builder;
   RETURN_NOT_OK(MakeBuilder(pool_, arrow::list(type), &array_builder));
   ListBuilder* list_builder = static_cast<ListBuilder*>(array_builder.get());
@@ -1193,20 +1213,13 @@ Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
   return PushBuilderResult(list_builder);
 }

-Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                     const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
-  PandasConverter converter(pool, ao, mo, type);
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
+                      bool use_pandas_null_sentinels,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out) {
+  NumPyConverter converter(pool, ao, mo, type, use_pandas_null_sentinels);
   RETURN_NOT_OK(converter.Convert());
-  *out = converter.result()[0];
-  DCHECK(*out);
-  return Status::OK();
-}
-
-Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                            const std::shared_ptr<DataType>& type,
-                            std::shared_ptr<ChunkedArray>* out) {
-  PandasConverter converter(pool, ao, mo, type);
-  RETURN_NOT_OK(converter.ConvertObjects());
+  DCHECK(converter.result()[0]);
   *out = std::make_shared<ChunkedArray>(converter.result());
   return Status::OK();
 }
diff --git a/cpp/src/arrow/python/pandas_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h
similarity index 70%
rename from cpp/src/arrow/python/pandas_to_arrow.h
rename to cpp/src/arrow/python/numpy_to_arrow.h
index 3e655ba3fee..4a70b4bc533 100644
--- a/cpp/src/arrow/python/pandas_to_arrow.h
+++ b/cpp/src/arrow/python/numpy_to_arrow.h
@@ -17,8 +17,8 @@

 // Converting from pandas memory representation to Arrow data structures

-#ifndef ARROW_PYTHON_PANDAS_TO_ARROW_H
-#define ARROW_PYTHON_PANDAS_TO_ARROW_H
+#ifndef ARROW_PYTHON_NUMPY_TO_ARROW_H
+#define ARROW_PYTHON_NUMPY_TO_ARROW_H

 #include "arrow/python/platform.h"

@@ -36,12 +36,8 @@ class Status;

 namespace py {

-ARROW_EXPORT
-Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                     const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
-
-/// Convert dtype=object arrays. If target data type is not known, pass a type
-/// with nullptr
+/// Convert NumPy arrays to Arrow. If target data type is not known, pass a
+/// type with nullptr
 ///
 /// \param[in] pool Memory pool for any memory allocations
 /// \param[in] ao an ndarray with the array data
@@ -49,11 +45,12 @@ Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
 /// \param[in] type
 /// \param[out] out a ChunkedArray, to accommodate chunked output
 ARROW_EXPORT
-Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                            const std::shared_ptr<DataType>& type,
-                            std::shared_ptr<ChunkedArray>* out);
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
+                      bool use_pandas_null_sentinels,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out);

 }  // namespace py
 }  // namespace arrow

-#endif  // ARROW_PYTHON_PANDAS_TO_ARROW_H
+#endif  // ARROW_PYTHON_NUMPY_TO_ARROW_H
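With the two C++ entry points collapsed into a single NdarrayToArrow, pandas null sentinels become an opt-in flag rather than a separate code path. A sketch of how this surfaces at the Python level (assuming this build; pa.array and its from_pandas flag are added later in this patch):

    import numpy as np
    import pyarrow as pa

    values = np.array([1.0, np.nan, 2.0])

    # use_pandas_null_sentinels=False: NaN stays a floating-point value
    assert pa.array(values).null_count == 0

    # use_pandas_null_sentinels=True: NaN is treated as null
    assert pa.array(values, from_pandas=True).null_count == 1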
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 0d76a35f4ed..ac069482274 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -36,7 +36,7 @@
                             time32, time64, timestamp, date32, date64,
                             float16, float32, float64,
                             binary, string, decimal,
-                            list_, struct, dictionary, field,
+                            list_, struct, dictionary, field, type_for_alias,
                             DataType, NAType,
                             Field,
                             Schema,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index eec6180165c..f402defc9b0 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -16,58 +16,161 @@
 # under the License.


-def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
-          size=None):
+cdef _sequence_to_array(object sequence, object size, DataType type,
+                        CMemoryPool* pool):
+    cdef shared_ptr[CArray] out
+    cdef int64_t c_size
+    if type is None:
+        with nogil:
+            check_status(ConvertPySequence(sequence, pool, &out))
+    else:
+        if size is None:
+            with nogil:
+                check_status(
+                    ConvertPySequence(
+                        sequence, pool, &out, type.sp_type
+                    )
+                )
+        else:
+            c_size = size
+            with nogil:
+                check_status(
+                    ConvertPySequence(
+                        sequence, pool, &out, type.sp_type, c_size
+                    )
+                )
+
+    return pyarrow_wrap_array(out)
+
+
+cdef _is_array_like(obj):
+    try:
+        import pandas
+        return isinstance(obj, (np.ndarray, pd.Series, pd.Index, Categorical))
+    except:
+        return isinstance(obj, np.ndarray)
+
+
+cdef _ndarray_to_array(object values, object mask, DataType type,
+                       c_bool use_pandas_null_sentinels,
+                       CMemoryPool* pool):
+    cdef shared_ptr[CChunkedArray] chunked_out
+    cdef shared_ptr[CDataType] c_type
+
+    dtype = values.dtype
+
+    if type is None and dtype != object:
+        with nogil:
+            check_status(NumPyDtypeToArrow(dtype, &c_type))
+
+    if type is not None:
+        c_type = type.sp_type
+
+    with nogil:
+        check_status(NdarrayToArrow(pool, values, mask,
+                                    use_pandas_null_sentinels,
+                                    c_type, &chunked_out))
+
+    if chunked_out.get().num_chunks() > 1:
+        return pyarrow_wrap_chunked_array(chunked_out)
+    else:
+        return pyarrow_wrap_array(chunked_out.get().chunk(0))
+
+
+cdef DataType _ensure_type(object type):
+    if type is None:
+        return None
+    elif not isinstance(type, DataType):
+        return type_for_alias(type)
+    else:
+        return type
+
+
+def array(object obj, type=None, mask=None,
+          MemoryPool memory_pool=None, size=None,
+          from_pandas=False):
     """
-    Create pyarrow.Array instance from a Python sequence
+    Create pyarrow.Array instance from a Python object

     Parameters
     ----------
-    sequence : sequence-like or iterable object of Python objects.
-        If both type and size are specified may be a single use iterable.
-    type : pyarrow.DataType, optional
-        If not passed, will be inferred from the data
+    obj : sequence, iterable, ndarray or Series
+        If both type and size are specified, may be a single-use iterable. If
+        not strongly-typed, the Arrow type will be inferred for the resulting
+        array
+    mask : array (boolean), optional
+        Indicate which values are null (True) or not null (False).
+    type : pyarrow.DataType
+        Explicit type to attempt to coerce to, otherwise will be inferred from
+        the data
     memory_pool : pyarrow.MemoryPool, optional
        If not passed, will allocate memory from the currently-set default
        memory pool
+
    size : int64, optional
        Size of the elements. If the imput is larger than size bail at this
        length. For iterators, if size is larger than the input iterator this
        will be treated as a "max size", but will involve an initial allocation
        of size followed by a resize to the actual size (so if you know the
        exact size specifying it correctly will give you better performance).
+    from_pandas : boolean, default False
+        Use pandas's semantics for inferring nulls from values in ndarray-like
+        data. If passed, the mask takes precedence, but if a value is unmasked
+        (not null) yet still null according to pandas semantics, then it is
+        null
+
+    Notes
+    -----
+    Localized timestamps will currently be returned as UTC (pandas's native
+    representation).  Timezone-naive data will be implicitly interpreted as
+    UTC.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import pyarrow as pa
+    >>> pa.array(pd.Series([1, 2]))
+    <pyarrow.lib.Int64Array object at 0x...>
+    [
+      1,
+      2
+    ]
+
+    >>> import numpy as np
+    >>> pa.array(pd.Series([1, 2]), np.array([0, 1],
+    ...                                      dtype=bool))
+    <pyarrow.lib.Int64Array object at 0x...>
+    [
+      1,
+      NA
+    ]

     Returns
     -------
-    array : pyarrow.Array
+    array : pyarrow.Array or pyarrow.ChunkedArray (if object data
+        overflowed binary storage)
     """
-    cdef:
-        shared_ptr[CArray] sp_array
-        CMemoryPool* pool
-        int64_t c_size
+    type = _ensure_type(type)
+    cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)

-    pool = maybe_unbox_memory_pool(memory_pool)
-    if type is None:
-        with nogil:
-            check_status(ConvertPySequence(sequence, pool, &sp_array))
-    else:
-        if size is None:
-            with nogil:
-                check_status(
-                    ConvertPySequence(
-                        sequence, pool, &sp_array, type.sp_type
-                    )
-                )
-        else:
-            c_size = size
-            with nogil:
-                check_status(
-                    ConvertPySequence(
-                        sequence, pool, &sp_array, type.sp_type, c_size
-                    )
-                )
+    if _is_array_like(obj):
+        if mask is not None:
+            mask = get_series_values(mask)
+
+        values = get_series_values(obj)

-    return pyarrow_wrap_array(sp_array)
+        if isinstance(values, Categorical):
+            return DictionaryArray.from_arrays(
+                values.codes, values.categories.values,
+                mask=mask, ordered=values.ordered,
+                memory_pool=memory_pool)
+        else:
+            values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
+                                                        type)
+            return _ndarray_to_array(values, mask, type, from_pandas, pool)
+    else:
+        if mask is not None:
+            raise ValueError("Masks only supported with ndarray-like inputs")
+        return _sequence_to_array(obj, size, type, pool)


 def _normalize_slice(object arrow_obj, slice key):
@@ -112,7 +215,7 @@ cdef class Array:
         with nogil:
             check_status(DebugPrint(deref(self.ap), 0))

-    def cast(self, DataType target_type, safe=True):
+    def cast(self, object target_type, safe=True):
         """
         Cast array values to another data type

@@ -130,42 +233,37 @@ cdef class Array:
         cdef:
             CCastOptions options
             shared_ptr[CArray] result
+            DataType type
+
+        type = _ensure_type(target_type)

         if not safe:
             options.allow_int_overflow = 1

         with nogil:
-            check_status(Cast(_context(), self.ap[0], target_type.sp_type,
+            check_status(Cast(_context(), self.ap[0], type.sp_type,
                               options, &result))

         return pyarrow_wrap_array(result)

     @staticmethod
-    def from_pandas(obj, mask=None, DataType type=None,
-                    timestamps_to_ms=False,
-                    MemoryPool memory_pool=None):
+    def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None):
         """
-        Convert pandas.Series to an Arrow Array.
+        Convert pandas.Series to an Arrow Array, using pandas's semantics about
+        what values indicate nulls. See pyarrow.array for more general
+        conversion from arrays or sequences to Arrow arrays

         Parameters
         ----------
-        series : pandas.Series or numpy.ndarray
-
-        mask : pandas.Series or numpy.ndarray, optional
-            boolean mask if the object is null (True) or valid (False)
-
+        sequence : ndarray, Index, or Series
+        mask : array (boolean), optional
+            Indicate which values are null (True) or not null (False)
         type : pyarrow.DataType
-            Explicit type to attempt to coerce to
-
-        timestamps_to_ms : bool, optional
-            Convert datetime columns to ms resolution. This is needed for
-            compatibility with other functionality like Parquet I/O which
-            only supports milliseconds.
-
-            .. deprecated:: 0.7.0
-
-        memory_pool: MemoryPool, optional
-            Specific memory pool to use to allocate the resulting Arrow array.
+            Explicit type to attempt to coerce to, otherwise will be inferred
+            from the data
+        memory_pool : pyarrow.MemoryPool, optional
+            If not passed, will allocate memory from the currently-set default
+            memory pool

         Notes
         -----
@@ -173,78 +271,13 @@ cdef class Array:
         representation).  Timezone-naive data will be implicitly interpreted as
         UTC.

-        Examples
-        --------
-
-        >>> import pandas as pd
-        >>> import pyarrow as pa
-        >>> pa.Array.from_pandas(pd.Series([1, 2]))
-        <pyarrow.lib.Int64Array object at 0x...>
-        [
-          1,
-          2
-        ]
-
-        >>> import numpy as np
-        >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1],
-        ...                      dtype=bool))
-        <pyarrow.lib.Int64Array object at 0x...>
-        [
-          1,
-          NA
-        ]
-
         Returns
         -------
         array : pyarrow.Array or pyarrow.ChunkedArray (if object data
-            overflowed binary storage)
+            overflows binary buffer)
         """
-        cdef:
-            shared_ptr[CArray] out
-            shared_ptr[CChunkedArray] chunked_out
-            shared_ptr[CDataType] c_type
-            CMemoryPool* pool
-
-        if mask is not None:
-            mask = get_series_values(mask)
-
-        values = get_series_values(obj)
-        pool = maybe_unbox_memory_pool(memory_pool)
-
-        if isinstance(values, Categorical):
-            return DictionaryArray.from_arrays(
-                values.codes, values.categories.values,
-                mask=mask, ordered=values.ordered,
-                memory_pool=memory_pool)
-        elif values.dtype == object:
-            # Object dtype undergoes a different conversion path as more type
-            # inference may be needed
-            if type is not None:
-                c_type = type.sp_type
-            with nogil:
-                check_status(PandasObjectsToArrow(
-                    pool, values, mask, c_type, &chunked_out))
-
-            if chunked_out.get().num_chunks() > 1:
-                return pyarrow_wrap_chunked_array(chunked_out)
-            else:
-                out = chunked_out.get().chunk(0)
-        else:
-            values, type = pdcompat.maybe_coerce_datetime64(
-                values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms)
-
-            if type is None:
-                dtype = values.dtype
-                with nogil:
-                    check_status(NumPyDtypeToArrow(dtype, &c_type))
-            else:
-                c_type = type.sp_type
-
-            with nogil:
-                check_status(PandasToArrow(
-                    pool, values, mask, c_type, &out))
-
-        return pyarrow_wrap_array(out)
+        return array(obj, mask=mask, type=type, memory_pool=memory_pool,
+                     from_pandas=True)

     property null_count:
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 5e6708871e6..fc17d1c06ae 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -766,13 +766,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:

     CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type)

-    CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo,
-                          const shared_ptr[CDataType]& type,
-                          shared_ptr[CArray]* out)
-
-    CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo,
-                                 const shared_ptr[CDataType]& type,
-                                 shared_ptr[CChunkedArray]* out)
+    CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
+                           c_bool use_pandas_null_sentinels,
+                           const shared_ptr[CDataType]& type,
+                           shared_ptr[CChunkedArray]* out)

     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
                             shared_ptr[CTensor]* out)
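The new pa.array factory above dispatches on its input: ndarray-like objects go through _ndarray_to_array (and may come back as a ChunkedArray), while plain sequences take the ConvertPySequence path, where masks are rejected. A short sketch of the two branches (assuming this build):

    import numpy as np
    import pyarrow as pa

    # ndarray-like input: the mask marks nulls (True = null)
    arr = pa.array(np.array([1, 2, 3]),
                   mask=np.array([False, True, False]))
    assert arr.null_count == 1

    # plain sequences do not accept a mask
    try:
        pa.array([1, 2, 3], mask=np.array([False, True, False]))
    except ValueError:
        pass  # "Masks only supported with ndarray-like inputs"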
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index d1e6f5a8096..be48aeb442d 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -203,7 +203,7 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
     }


-def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index):
+def dataframe_to_arrays(df, schema, preserve_index):
     names = []
     arrays = []
     index_columns = []
@@ -223,15 +223,13 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index):
             field = schema.field_by_name(name)
             type = getattr(field, "type", None)

-        array = pa.Array.from_pandas(
-            col, type=type, timestamps_to_ms=timestamps_to_ms
-        )
+        array = pa.array(col, from_pandas=True, type=type)
         arrays.append(array)
         names.append(name)
         types.append(array.type)

     for i, column in enumerate(index_columns):
-        array = pa.Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms)
+        array = pa.array(column)
         arrays.append(array)
         names.append(index_level_name(column, i))
         types.append(array.type)
@@ -242,25 +240,15 @@ def dataframe_to_arrays(df, schema, preserve_index):
     return names, arrays, metadata


-def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False):
-    if timestamps_to_ms:
-        import warnings
-        warnings.warn('timestamps_to_ms=True is deprecated', FutureWarning)
-
+def get_datetimetz_type(values, dtype, type_):
     from pyarrow.compat import DatetimeTZDtype

     if values.dtype.type != np.datetime64:
         return values, type_

-    coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]'
-
-    if coerce_ms:
-        values = values.astype('datetime64[ms]')
-        type_ = pa.timestamp('ms')
-
     if isinstance(dtype, DatetimeTZDtype):
         tz = dtype.tz
-        unit = 'ms' if coerce_ms else dtype.unit
+        unit = dtype.unit
         type_ = pa.timestamp(unit, tz)
     elif type_ is None:
         # Trust the NumPy dtype
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 3a847f77c4f..c37ed3b200e 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -348,10 +348,10 @@ cdef class StructValue(ArrayValue):

 cdef dict _scalar_classes = {
     _Type_BOOL: BooleanValue,
-    _Type_UINT8: Int8Value,
-    _Type_UINT16: Int16Value,
-    _Type_UINT32: Int32Value,
-    _Type_UINT64: Int64Value,
+    _Type_UINT8: UInt8Value,
+    _Type_UINT16: UInt16Value,
+    _Type_UINT32: UInt32Value,
+    _Type_UINT64: UInt64Value,
     _Type_INT8: Int8Value,
     _Type_INT16: Int16Value,
     _Type_INT32: Int32Value,
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 028797e45b8..e5422a5beca 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -575,7 +575,7 @@ cdef class RecordBatch:
         pyarrow.RecordBatch
         """
         names, arrays, metadata = pdcompat.dataframe_to_arrays(
-            df, False, schema, preserve_index
+            df, schema, preserve_index
         )
         return cls.from_arrays(arrays, names, metadata)

@@ -714,21 +714,13 @@ cdef class Table:
         return result

     @classmethod
-    def from_pandas(cls, df, bint timestamps_to_ms=False,
-                    Schema schema=None, bint preserve_index=True):
+    def from_pandas(cls, df, Schema schema=None, bint preserve_index=True):
         """
         Convert pandas.DataFrame to an Arrow Table

         Parameters
         ----------
         df : pandas.DataFrame
-        timestamps_to_ms : bool
-            Convert datetime columns to ms resolution. This is needed for
-            compability with other functionality like Parquet I/O which
-            only supports milliseconds.
-
-            .. deprecated:: 0.7.0
-
         schema : pyarrow.Schema, optional
             The expected schema of the Arrow Table. This can be used to
             indicate the type of columns if we cannot infer it automatically.
@@ -754,7 +746,6 @@ cdef class Table:
         """
         names, arrays, metadata = pdcompat.dataframe_to_arrays(
             df,
-            timestamps_to_ms=timestamps_to_ms,
             schema=schema,
             preserve_index=preserve_index
         )
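Because _ensure_type routes strings through type_for_alias, the dtype-style aliases used throughout the updated tests below are accepted anywhere a DataType is expected. A small sketch (assuming this build):

    import pyarrow as pa

    assert pa.type_for_alias('i4') == pa.int32()
    assert pa.int32() == 'int32'        # DataType compares equal to its alias

    arr = pa.array([1, 2, 3], type='int16')       # alias accepted by pa.array
    assert arr.type == pa.int16()
    assert arr.cast('int32').type == pa.int32()   # and by Array.cast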
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index f316417caaf..3bf392686f0 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -149,6 +149,14 @@ def test_array_factory_invalid_type():
         pa.array(arr)


+def test_array_ref_to_ndarray_base():
+    arr = np.array([1, 2, 3])
+
+    refcount = sys.getrefcount(arr)
+    arr2 = pa.array(arr)  # noqa
+    assert sys.getrefcount(arr) == (refcount + 1)
+
+
 def test_dictionary_from_numpy():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
@@ -170,8 +178,8 @@ def test_dictionary_from_boxed_arrays():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)

-    iarr = pa.Array.from_pandas(indices)
-    darr = pa.Array.from_pandas(dictionary)
+    iarr = pa.array(indices)
+    darr = pa.array(dictionary)

     d1 = pa.DictionaryArray.from_arrays(iarr, darr)

@@ -201,9 +209,9 @@ def test_dictionary_with_pandas():

 def test_list_from_arrays():
     offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
-    offsets = pa.Array.from_pandas(offsets_arr, type=pa.int32())
+    offsets = pa.array(offsets_arr, type='int32')
     pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
-    values = pa.array(pyvalues, type=pa.binary())
+    values = pa.array(pyvalues, type='binary')

     result = pa.ListArray.from_arrays(offsets, values)
     expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])
@@ -214,22 +222,22 @@ def test_list_from_arrays():

 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case

-    in_arr = pa.Array.from_pandas(in_data, type=in_type)
+    in_arr = pa.array(in_data, type=in_type)

     casted = in_arr.cast(out_type, safe=safe)
-    expected = pa.Array.from_pandas(out_data, type=out_type)
+    expected = pa.array(out_data, type=out_type)
     assert casted.equals(expected)


 def test_cast_integers_safe():
     safe_cases = [
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()),
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
     ]

@@ -237,13 +245,13 @@ def test_cast_integers_safe():
         _check_cast_case(case)

     unsafe_cases = [
-        (np.array([50000], dtype='i4'), pa.int32(), pa.int16()),
-        (np.array([70000], dtype='i4'), pa.int32(), pa.uint16()),
-        (np.array([-1], dtype='i4'), pa.int32(), pa.uint16()),
-        (np.array([50000], dtype='u2'), pa.uint16(), pa.int16())
+        (np.array([50000], dtype='i4'), 'int32', 'int16'),
+        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
+        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
+        (np.array([50000], dtype='u2'), 'uint16', 'int16')
     ]
     for in_data, in_type, out_type in unsafe_cases:
-        in_arr = pa.Array.from_pandas(in_data, type=in_type)
+        in_arr = pa.array(in_data, type=in_type)

         with pytest.raises(pa.ArrowInvalid):
             in_arr.cast(out_type)

@@ -252,11 +260,11 @@ def test_cast_integers_safe():
 def test_cast_integers_unsafe():
     # We let NumPy do the unsafe casting
     unsafe_cases = [
-        (np.array([50000], dtype='i4'), pa.int32(),
+        (np.array([50000], dtype='i4'), 'int32',
          np.array([50000], dtype='i2'), pa.int16()),
-        (np.array([70000], dtype='i4'), pa.int32(),
+        (np.array([70000], dtype='i4'), 'int32',
          np.array([70000], dtype='u2'), pa.uint16()),
-        (np.array([-1], dtype='i4'), pa.int32(),
+        (np.array([-1], dtype='i4'), 'int32',
          np.array([-1], dtype='u2'), pa.uint16()),
         (np.array([50000], dtype='u2'), pa.uint16(),
          np.array([50000], dtype='i2'), pa.int16())
@@ -315,3 +323,17 @@ def test_simple_type_construction():
 )
 def test_logical_type(type, expected):
     assert get_logical_type(type) == expected
+
+
+def test_array_conversions_no_sentinel_values():
+    arr = np.array([1, 2, 3, 4], dtype='int8')
+    refcount = sys.getrefcount(arr)
+    arr2 = pa.array(arr)  # noqa
+    assert sys.getrefcount(arr) == (refcount + 1)
+
+    assert arr2.type == 'int8'
+
+    arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'),
+                    type='float32')
+    assert arr3.type == 'float32'
+    assert arr3.null_count == 0
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 5d56cde7d48..182f3afc7e6 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -18,7 +18,7 @@

 from collections import OrderedDict
-from datetime import datetime, date, time
+from datetime import date, time
 import unittest
 import decimal
 import json
@@ -82,7 +82,7 @@ def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
         tm.assert_frame_equal(result, expected, check_dtype=check_dtype)

     def _check_series_roundtrip(self, s, type_=None):
-        arr = pa.Array.from_pandas(s, type=type_)
+        arr = pa.array(s, from_pandas=True, type=type_)

         result = pd.Series(arr.to_pandas(), name=s.name)
         if isinstance(arr.type, pa.TimestampType) and arr.type.tz is not None:
@@ -93,7 +93,7 @@ def _check_series_roundtrip(self, s, type_=None):

     def _check_array_roundtrip(self, values, expected=None, mask=None,
                                type=None):
-        arr = pa.Array.from_pandas(values, mask=mask, type=type)
+        arr = pa.array(values, from_pandas=True, mask=mask, type=type)
         result = arr.to_pandas()

         values_nulls = pd.isnull(values)
@@ -152,7 +152,7 @@ def test_float_nulls(self):

         for name, arrow_dtype in dtypes:
             values = np.random.randn(num_values).astype(name)
-            arr = pa.Array.from_pandas(values, null_mask)
+            arr = pa.array(values, from_pandas=True, mask=null_mask)
             arrays.append(arr)
             fields.append(pa.field(name, arrow_dtype))
             values[null_mask] = np.nan
@@ -223,7 +223,7 @@ def test_integer_with_nulls(self):

         for name in int_dtypes:
             values = np.random.randint(0, 100, size=num_values)
-            arr = pa.Array.from_pandas(values, null_mask)
+            arr = pa.array(values, mask=null_mask)
             arrays.append(arr)

             expected = values.astype('f8')
@@ -244,8 +244,8 @@ def test_array_from_pandas_type_cast(self):

         target_type = pa.int8()

-        result = pa.Array.from_pandas(arr, type=target_type)
-        expected = pa.Array.from_pandas(arr.astype('int8'))
+        result = pa.array(arr, type=target_type)
+        expected = pa.array(arr.astype('int8'))
         assert result.equals(expected)

     def test_boolean_no_nulls(self):
@@ -266,7 +266,7 @@ def test_boolean_nulls(self):
         mask = np.random.randint(0, 10, size=num_values) < 3
         values = np.random.randint(0, 10, size=num_values) < 5

-        arr = pa.Array.from_pandas(values, mask)
+        arr = pa.array(values, mask=mask)

         expected = values.astype(object)
         expected[mask] = None
@@ -292,7 +292,7 @@ def test_all_nulls_cast_numeric(self):
         arr = np.array([None], dtype=object)

         def _check_type(t):
-            a2 = pa.Array.from_pandas(arr, type=t)
+            a2 = pa.array(arr, type=t)
             assert a2.type == t
             assert a2[0].as_py() is None

@@ -325,7 +325,7 @@ def test_bytes_exceed_2gb(self):
         df = pd.DataFrame({
             'strings': np.array([val] * 4000, dtype=object)
         })
-        arr = pa.Array.from_pandas(df['strings'])
+        arr = pa.array(df['strings'])
         assert isinstance(arr, pa.ChunkedArray)
         assert arr.num_chunks == 2
         arr = None
@@ -365,19 +365,6 @@ def test_timestamps_notimezone_no_nulls(self):
             expected_schema=schema,
         )

-    def test_timestamps_to_ms_explicit_schema(self):
-        # ARROW-1328
-        df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]})
-        pa_type = pa.from_numpy_dtype(df['datetime'].dtype)
-
-        with tm.assert_produces_warning(FutureWarning,
-                                        check_stacklevel=False):
-            arr = pa.Array.from_pandas(df['datetime'], type=pa_type,
-                                       timestamps_to_ms=True)
-
-        tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'),
-                               arr.to_pandas())
-
     def test_timestamps_notimezone_nulls(self):
         df = pd.DataFrame({
             'datetime64': np.array([
@@ -450,11 +437,11 @@ def test_date_objects_typed(self):
         t32 = pa.date32()
         t64 = pa.date64()

-        a32 = pa.Array.from_pandas(arr, type=t32)
-        a64 = pa.Array.from_pandas(arr, type=t64)
+        a32 = pa.array(arr, type=t32)
+        a64 = pa.array(arr, type=t64)

-        a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32)
-        a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64)
+        a32_expected = pa.array(arr_i4, mask=mask, type=t32)
+        a64_expected = pa.array(arr_i8, mask=mask, type=t64)

         assert a32.equals(a32_expected)
         assert a64.equals(a64_expected)
@@ -481,8 +468,8 @@ def test_dates_from_integers(self):
         arr = np.array([17259, 17260, 17261], dtype='int32')
         arr2 = arr.astype('int64') * 86400000

-        a1 = pa.Array.from_pandas(arr, type=t1)
-        a2 = pa.Array.from_pandas(arr2, type=t2)
+        a1 = pa.array(arr, type=t1)
+        a2 = pa.array(arr2, type=t2)

         expected = date(2017, 4, 3)
         assert a1[0].as_py() == expected
@@ -520,7 +507,7 @@ def test_column_of_arrays_to_py(self):
             np.arange(1, dtype=dtype)
         ])
         type_ = pa.list_(pa.int8())
-        parr = pa.Array.from_pandas(arr, type=type_)
+        parr = pa.array(arr, type=type_)

         assert parr[0].as_py() == list(range(10))
         assert parr[1].as_py() == list(range(5))
@@ -592,7 +579,7 @@ def test_column_of_lists_strided(self):

     def test_nested_lists_all_none(self):
         data = np.array([[None, None], None], dtype=object)
-        arr = pa.Array.from_pandas(data)
+        arr = pa.array(data)
         expected = pa.array(list(data))
         assert arr.equals(expected)
         assert arr.type == pa.list_(pa.null())
@@ -600,7 +587,7 @@ def test_nested_lists_all_none(self):
         data2 = np.array([None, None, [None, None],
                           np.array([None, None], dtype=object)],
                          dtype=object)
-        arr = pa.Array.from_pandas(data2)
+        arr = pa.array(data2)
         expected = pa.array([None, None, [None, None], [None, None]])
         assert arr.equals(expected)

@@ -760,7 +747,7 @@ def test_pytime_from_pandas(self):
         t1 = pa.time64('us')

         aobjs = np.array(pytimes + [None], dtype=object)
-        parr = pa.Array.from_pandas(aobjs)
+        parr = pa.array(aobjs)
         assert parr.type == t1
         assert parr[0].as_py() == pytimes[0]
         assert parr[1].as_py() == pytimes[1]
@@ -775,18 +762,18 @@ def test_pytime_from_pandas(self):

         arr = np.array([_pytime_to_micros(v) for v in pytimes],
                        dtype='int64')

-        a1 = pa.Array.from_pandas(arr, type=pa.time64('us'))
+        a1 = pa.array(arr, type=pa.time64('us'))
         assert a1[0].as_py() == pytimes[0]

-        a2 = pa.Array.from_pandas(arr * 1000, type=pa.time64('ns'))
+        a2 = pa.array(arr * 1000, type=pa.time64('ns'))
         assert a2[0].as_py() == pytimes[0]

-        a3 = pa.Array.from_pandas((arr / 1000).astype('i4'),
-                                  type=pa.time32('ms'))
+        a3 = pa.array((arr / 1000).astype('i4'),
+                      type=pa.time32('ms'))
         assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)

-        a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'),
-                                  type=pa.time32('s'))
+        a4 = pa.array((arr / 1000000).astype('i4'),
+                      type=pa.time32('s'))
         assert a4[0].as_py() == pytimes[0].replace(microsecond=0)

     def test_arrow_time_to_pandas(self):
@@ -809,14 +796,14 @@ def test_arrow_time_to_pandas(self):

         null_mask = np.array([False, False, True], dtype=bool)

-        a1 = pa.Array.from_pandas(arr, mask=null_mask, type=pa.time64('us'))
-        a2 = pa.Array.from_pandas(arr * 1000, mask=null_mask,
-                                  type=pa.time64('ns'))
+        a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
+        a2 = pa.array(arr * 1000, mask=null_mask,
+                      type=pa.time64('ns'))

-        a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), mask=null_mask,
-                                  type=pa.time32('ms'))
-        a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), mask=null_mask,
-                                  type=pa.time32('s'))
+        a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
+                      type=pa.time32('ms'))
+        a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
+                      type=pa.time32('s'))

         names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
         batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
@@ -841,8 +828,8 @@ def test_arrow_time_to_pandas(self):

         tm.assert_frame_equal(df, expected_df)

-    def _check_numpy_array_roundtrip(self, np_array):
-        arr = pa.Array.from_pandas(np_array)
+    def _check_array_from_pandas_roundtrip(self, np_array):
+        arr = pa.array(np_array, from_pandas=True)
         result = arr.to_pandas()
         npt.assert_array_equal(result, np_array)

@@ -853,7 +840,7 @@ def test_numpy_datetime64_columns(self):
             '2006-01-13T12:34:56.432539784',
             '2010-08-13T05:46:57.437699912'],
             dtype='datetime64[ns]')
-        self._check_numpy_array_roundtrip(datetime64_ns)
+        self._check_array_from_pandas_roundtrip(datetime64_ns)

         datetime64_us = np.array([
             '2007-07-13T01:23:34.123456',
@@ -861,7 +848,7 @@ def test_numpy_datetime64_columns(self):
             '2006-01-13T12:34:56.432539',
             '2010-08-13T05:46:57.437699'],
             dtype='datetime64[us]')
-        self._check_numpy_array_roundtrip(datetime64_us)
+        self._check_array_from_pandas_roundtrip(datetime64_us)

         datetime64_ms = np.array([
             '2007-07-13T01:23:34.123',
@@ -869,7 +856,7 @@ def test_numpy_datetime64_columns(self):
             '2006-01-13T12:34:56.432',
             '2010-08-13T05:46:57.437'],
             dtype='datetime64[ms]')
-        self._check_numpy_array_roundtrip(datetime64_ms)
+        self._check_array_from_pandas_roundtrip(datetime64_ms)

         datetime64_s = np.array([
             '2007-07-13T01:23:34',
@@ -877,7 +864,7 @@ def test_numpy_datetime64_columns(self):
             '2006-01-13T12:34:56',
             '2010-08-13T05:46:57'],
             dtype='datetime64[s]')
-        self._check_numpy_array_roundtrip(datetime64_s)
+        self._check_array_from_pandas_roundtrip(datetime64_s)

         datetime64_d = np.array([
             '2007-07-13',
@@ -885,11 +872,11 @@ def test_numpy_datetime64_columns(self):
             '2006-01-15',
             '2010-08-19'],
             dtype='datetime64[D]')
-        self._check_numpy_array_roundtrip(datetime64_d)
+        self._check_array_from_pandas_roundtrip(datetime64_d)

     def test_all_nones(self):
         def _check_series(s):
-            converted = pa.Array.from_pandas(s)
+            converted = pa.array(s)
             assert isinstance(converted, pa.NullArray)
             assert len(converted) == 3
             assert converted.null_count == 3
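As the test updates show, Array.from_pandas is now a thin wrapper over the general factory with pandas null semantics switched on. Equivalent calls, as a sketch (assuming this build):

    import pandas as pd
    import pyarrow as pa

    s = pd.Series([1.0, None, 2.0])

    a = pa.Array.from_pandas(s)     # delegates to pa.array(..., from_pandas=True)
    b = pa.array(s, from_pandas=True)

    assert a.equals(b)
    assert a.null_count == 1        # None -> NaN -> null under pandas semantics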
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index eb23894f480..b0593fe885d 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -457,8 +457,26 @@ def test_column_of_arrays(tmpdir):

 @parquet
 def test_coerce_timestamps(tmpdir):
+    from collections import OrderedDict
     # ARROW-622
-    df, schema = dataframe_with_arrays()
+    arrays = OrderedDict()
+    fields = [pa.field('datetime64',
+                       pa.list_(pa.timestamp('ms')))]
+    arrays['datetime64'] = [
+        np.array(['2007-07-13T01:23:34.123456789',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+        None,
+        None,
+        np.array(['2007-07-13T02',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+    ]
+
+    df = pd.DataFrame(arrays)
+    schema = pa.schema(fields)

     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df, schema=schema)
@@ -497,41 +515,41 @@ def test_column_of_lists(tmpdir):
 def test_date_time_types():
     t1 = pa.date32()
     data1 = np.array([17259, 17260, 17261], dtype='int32')
-    a1 = pa.Array.from_pandas(data1, type=t1)
+    a1 = pa.array(data1, type=t1)

     t2 = pa.date64()
     data2 = data1.astype('int64') * 86400000
-    a2 = pa.Array.from_pandas(data2, type=t2)
+    a2 = pa.array(data2, type=t2)

     t3 = pa.timestamp('us')
     start = pd.Timestamp('2000-01-01').value / 1000
     data3 = np.array([start, start + 1, start + 2], dtype='int64')
-    a3 = pa.Array.from_pandas(data3, type=t3)
+    a3 = pa.array(data3, type=t3)

     t4 = pa.time32('ms')
     data4 = np.arange(3, dtype='i4')
-    a4 = pa.Array.from_pandas(data4, type=t4)
+    a4 = pa.array(data4, type=t4)

     t5 = pa.time64('us')
-    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)
+    a5 = pa.array(data4.astype('int64'), type=t5)

     t6 = pa.time32('s')
-    a6 = pa.Array.from_pandas(data4, type=t6)
+    a6 = pa.array(data4, type=t6)

     ex_t6 = pa.time32('ms')
-    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)
+    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

     t7 = pa.timestamp('ns')
     start = pd.Timestamp('2001-01-01').value
     data7 = np.array([start, start + 1000, start + 2000],
                      dtype='int64')
-    a7 = pa.Array.from_pandas(data7, type=t7)
+    a7 = pa.array(data7, type=t7)

     t7_us = pa.timestamp('us')
     start = pd.Timestamp('2001-01-01').value
     data7_us = np.array([start, start + 1000, start + 2000],
                         dtype='int64') // 1000
-    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)
+    a7_us = pa.array(data7_us, type=t7_us)

     table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                  ['date32', 'date64', 'timestamp[us]',
@@ -575,7 +593,7 @@ def _assert_unsupported(array):
             _write_table(table, buf, version="2.0")

     t7 = pa.time64('ns')
-    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)
+    a7 = pa.array(data4.astype('int64'), type=t7)

     _assert_unsupported(a7)
@@ -1295,7 +1313,7 @@ def test_large_table_int32_overflow():

     arr = np.ones(size, dtype='uint8')

-    parr = pa.Array.from_pandas(arr, type=pa.uint8())
+    parr = pa.array(arr, type=pa.uint8())
     table = pa.Table.from_arrays([parr], names=['one'])

     f = io.BytesIO()
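The test_coerce_timestamps change above exercises timestamp coercion on write; the user-facing knob is the coerce_timestamps argument to pyarrow.parquet.write_table. A minimal sketch (column name and values invented; a millisecond-precision value is chosen so the downcast is lossless):

    import io

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # pandas produces nanosecond-unit timestamps
    df = pd.DataFrame({'ts': [pd.Timestamp('2017-01-01 00:00:00.123')]})
    table = pa.Table.from_pandas(df, preserve_index=False)

    buf = io.BytesIO()
    # downcast the timestamp column to milliseconds on write; a coercion
    # that would truncate data raises instead of silently losing precision
    pq.write_table(table, buf, coerce_timestamps='ms')
    buf.seek(0)

    result = pq.read_table(buf)
    assert result.column(0).type == pa.timestamp('ms')
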
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 4bb6a5af7dc..c77be98054c 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -69,6 +69,56 @@ def test_type_list():

     assert str(l2) == 'list<my_item: string>'


+def test_type_comparisons():
+    val = pa.int32()
+    assert val == pa.int32()
+    assert val == 'int32'
+
+    with pytest.raises(TypeError):
+        val == 5
+
+
+def test_type_for_alias():
+    cases = [
+        ('i1', pa.int8()),
+        ('int8', pa.int8()),
+        ('i2', pa.int16()),
+        ('int16', pa.int16()),
+        ('i4', pa.int32()),
+        ('int32', pa.int32()),
+        ('i8', pa.int64()),
+        ('int64', pa.int64()),
+        ('u1', pa.uint8()),
+        ('uint8', pa.uint8()),
+        ('u2', pa.uint16()),
+        ('uint16', pa.uint16()),
+        ('u4', pa.uint32()),
+        ('uint32', pa.uint32()),
+        ('u8', pa.uint64()),
+        ('uint64', pa.uint64()),
+        ('f4', pa.float32()),
+        ('float32', pa.float32()),
+        ('f8', pa.float64()),
+        ('float64', pa.float64()),
+        ('date32', pa.date32()),
+        ('date64', pa.date64()),
+        ('string', pa.string()),
+        ('str', pa.string()),
+        ('binary', pa.binary()),
+        ('time32[s]', pa.time32('s')),
+        ('time32[ms]', pa.time32('ms')),
+        ('time64[us]', pa.time64('us')),
+        ('time64[ns]', pa.time64('ns')),
+        ('timestamp[s]', pa.timestamp('s')),
+        ('timestamp[ms]', pa.timestamp('ms')),
+        ('timestamp[us]', pa.timestamp('us')),
+        ('timestamp[ns]', pa.timestamp('ns')),
+    ]
+
+    for val, expected in cases:
+        assert pa.type_for_alias(val) == expected
+
+
 def test_type_string():
     t = pa.string()
     assert str(t) == 'string'
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index b298e740250..316e09a6efd 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -72,11 +72,19 @@ cdef class DataType:
     def __repr__(self):
         return '{0.__class__.__name__}({0})'.format(self)

-    def __richcmp__(DataType self, DataType other, int op):
+    def __richcmp__(DataType self, object other, int op):
+        cdef DataType other_type
+        if not isinstance(other, DataType):
+            if not isinstance(other, six.string_types):
+                raise TypeError(other)
+            other_type = type_for_alias(other)
+        else:
+            other_type = other
+
         if op == cp.Py_EQ:
-            return self.type.Equals(deref(other.type))
+            return self.type.Equals(deref(other_type.type))
         elif op == cp.Py_NE:
-            return not self.type.Equals(deref(other.type))
+            return not self.type.Equals(deref(other_type.type))
         else:
             raise TypeError('Invalid comparison')
@@ -922,6 +930,64 @@ def struct(fields):
     return pyarrow_wrap_data_type(struct_type)


+cdef dict _type_aliases = {
+    'null': null,
+    'i1': int8,
+    'int8': int8,
+    'i2': int16,
+    'int16': int16,
+    'i4': int32,
+    'int32': int32,
+    'i8': int64,
+    'int64': int64,
+    'u1': uint8,
+    'uint8': uint8,
+    'u2': uint16,
+    'uint16': uint16,
+    'u4': uint32,
+    'uint32': uint32,
+    'u8': uint64,
+    'uint64': uint64,
+    'f4': float32,
+    'float32': float32,
+    'f8': float64,
+    'float64': float64,
+    'string': string,
+    'str': string,
+    'utf8': string,
+    'binary': binary,
+    'date32': date32,
+    'date64': date64,
+    'time32[s]': time32('s'),
+    'time32[ms]': time32('ms'),
+    'time64[us]': time64('us'),
+    'time64[ns]': time64('ns'),
+    'timestamp[s]': timestamp('s'),
+    'timestamp[ms]': timestamp('ms'),
+    'timestamp[us]': timestamp('us'),
+    'timestamp[ns]': timestamp('ns'),
+}
+
+
+def type_for_alias(name):
+    """
+    Return DataType given a string alias if one exists
+
+    Returns
+    -------
+    type : DataType
+    """
+    name = name.lower()
+    try:
+        alias = _type_aliases[name]
+    except KeyError:
+        raise ValueError('No type alias for {0}'.format(name))
+
+    if isinstance(alias, DataType):
+        return alias
+    return alias()
+
+
 def schema(fields):
     """
     Construct pyarrow.Schema from collection of fields
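Taken together, the alias table and the relaxed __richcmp__ let strings stand in anywhere a DataType is compared. A minimal sketch of the behavior these changes introduce, as asserted by the tests above:

    import pyarrow as pa

    # string aliases resolve through the new alias table
    assert pa.type_for_alias('i4') == pa.int32()
    assert pa.type_for_alias('timestamp[ms]') == pa.timestamp('ms')

    # DataType comparisons now accept strings and resolve them the same way
    assert pa.int32() == 'i4'
    assert pa.timestamp('ms') == 'timestamp[ms]'

    # under this patch, anything that is neither a DataType nor a string
    # raises TypeError rather than silently comparing unequal
    try:
        pa.int32() == 5
    except TypeError:
        pass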