diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 17b2ec8b55069..1ef4e77e8fc50 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -25,6 +25,31 @@ #include <arrow-glib/array-builder.hpp> #include <arrow-glib/error.hpp> +template <typename BUILDER, typename VALUE> +gboolean +garrow_array_builder_append(GArrowArrayBuilder *builder, + VALUE value, + GError **error, + const gchar *context) +{ + auto arrow_builder = + static_cast<BUILDER>(garrow_array_builder_get_raw(builder)); + auto status = arrow_builder->Append(value); + return garrow_error_check(error, status, context); +} + +template <typename BUILDER> +gboolean +garrow_array_builder_append_null(GArrowArrayBuilder *builder, + GError **error, + const gchar *context) +{ + auto arrow_builder = + static_cast<BUILDER>(garrow_array_builder_get_raw(builder)); + auto status = arrow_builder->AppendNull(); + return garrow_error_check(error, status, context); +} + G_BEGIN_DECLS /** @@ -41,6 +66,11 @@ G_BEGIN_DECLS * #GArrowBooleanArrayBuilder is the class to create a new * #GArrowBooleanArray. * + * #GArrowIntArrayBuilder is the class to create a new integer + * array. Integer size is automatically chosen. It's recommended that + * you use this builder instead of a specific integer size builder such + * as #GArrowInt8ArrayBuilder. + * * #GArrowInt8ArrayBuilder is the class to create a new * #GArrowInt8Array. * @@ -190,16 +220,22 @@ garrow_array_builder_new(const std::shared_ptr<arrow::DataType> &type, /** * garrow_array_builder_finish: * @builder: A #GArrowArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): The built #GArrowArray. + * Returns: (transfer full): The built #GArrowArray on success, + * %NULL on error. */ GArrowArray * -garrow_array_builder_finish(GArrowArrayBuilder *builder) +garrow_array_builder_finish(GArrowArrayBuilder *builder, GError **error) { auto arrow_builder = garrow_array_builder_get_raw(builder); std::shared_ptr<arrow::Array> arrow_array; - arrow_builder->Finish(&arrow_array); - return garrow_array_new_raw(&arrow_array); + auto status = arrow_builder->Finish(&arrow_array); + if (garrow_error_check(error, status, "[array-builder][finish]")) { + return garrow_array_new_raw(&arrow_array); + } else { + return NULL; + } } @@ -244,12 +280,11 @@ garrow_boolean_array_builder_append(GArrowBooleanArrayBuilder *builder, gboolean value, GError **error) { - auto arrow_builder = - static_cast<arrow::BooleanBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(static_cast<bool>(value)); - return garrow_error_check(error, status, "[boolean-array-builder][append]"); + return garrow_array_builder_append<arrow::BooleanBuilder *> + (GARROW_ARRAY_BUILDER(builder), + static_cast<bool>(value), + error, + "[boolean-array-builder][append]"); } /** @@ -263,14 +298,83 @@ gboolean garrow_boolean_array_builder_append_null(GArrowBooleanArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::BooleanBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); + return garrow_array_builder_append_null<arrow::BooleanBuilder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[boolean-array-builder][append-null]"); +} - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[boolean-array-builder][append-null]"); + +G_DEFINE_TYPE(GArrowIntArrayBuilder, + garrow_int_array_builder, + GARROW_TYPE_ARRAY_BUILDER) + +static void +garrow_int_array_builder_init(GArrowIntArrayBuilder *builder) +{ +} + +static void +garrow_int_array_builder_class_init(GArrowIntArrayBuilderClass *klass) +{ +} + +/** + * garrow_int_array_builder_new: + * + * Returns: A newly created #GArrowIntArrayBuilder. + * + * Since: 0.6.0 + */ +GArrowIntArrayBuilder * +garrow_int_array_builder_new(void) +{ + auto memory_pool = arrow::default_memory_pool(); + auto arrow_builder = new arrow::AdaptiveIntBuilder(memory_pool); + auto builder = garrow_array_builder_new_raw(arrow_builder, + GARROW_TYPE_INT_ARRAY_BUILDER); + return GARROW_INT_ARRAY_BUILDER(builder); +} + +/** + * garrow_int_array_builder_append: + * @builder: A #GArrowIntArrayBuilder. + * @value: An int value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.6.0 + */ +gboolean +garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append<arrow::AdaptiveIntBuilder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int-array-builder][append]"); +} + +/** + * garrow_int_array_builder_append_null: + * @builder: A #GArrowIntArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.6.0 + */ +gboolean +garrow_int_array_builder_append_null(GArrowIntArrayBuilder *builder, + GError **error) +{ + return garrow_array_builder_append_null<arrow::AdaptiveIntBuilder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[int-array-builder][append-null]"); } @@ -315,12 +419,11 @@ garrow_int8_array_builder_append(GArrowInt8ArrayBuilder *builder, gint8 value, GError **error) { - auto arrow_builder = - static_cast<arrow::Int8Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int8-array-builder][append]"); + return garrow_array_builder_append<arrow::Int8Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int8-array-builder][append]"); } /** @@ -334,12 +437,10 @@ gboolean garrow_int8_array_builder_append_null(GArrowInt8ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::Int8Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int8-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::Int8Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[int8-array-builder][append-null]"); } @@ -384,12 +485,11 @@ garrow_uint8_array_builder_append(GArrowUInt8ArrayBuilder *builder, guint8 value, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt8Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint8-array-builder][append]"); + return garrow_array_builder_append<arrow::UInt8Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[uint8-array-builder][append]"); } /** @@ -403,12 +503,10 @@ gboolean garrow_uint8_array_builder_append_null(GArrowUInt8ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt8Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[uint8-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::UInt8Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint8-array-builder][append-null]"); } @@ -453,12 +551,11 @@ garrow_int16_array_builder_append(GArrowInt16ArrayBuilder *builder, gint16 value, GError **error) { - auto arrow_builder = - static_cast<arrow::Int16Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int16-array-builder][append]"); + return garrow_array_builder_append<arrow::Int16Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int16-array-builder][append]"); } /** @@ -472,12 +569,10 @@ gboolean garrow_int16_array_builder_append_null(GArrowInt16ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::Int16Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int16-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::Int16Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[int16-array-builder][append-null]"); } @@ -522,12 +617,11 @@ garrow_uint16_array_builder_append(GArrowUInt16ArrayBuilder *builder, guint16 value, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt16Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint16-array-builder][append]"); + return garrow_array_builder_append<arrow::UInt16Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[uint16-array-builder][append]"); } /** @@ -541,14 +635,10 @@ gboolean garrow_uint16_array_builder_append_null(GArrowUInt16ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt16Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[uint16-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::UInt16Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint16-array-builder][append-null]"); } @@ -593,12 +683,11 @@ garrow_int32_array_builder_append(GArrowInt32ArrayBuilder *builder, gint32 value, GError **error) { - auto arrow_builder = - static_cast<arrow::Int32Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int32-array-builder][append]"); + return garrow_array_builder_append<arrow::Int32Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int32-array-builder][append]"); } /** @@ -612,12 +701,10 @@ gboolean garrow_int32_array_builder_append_null(GArrowInt32ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::Int32Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int32-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::Int32Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[int32-array-builder][append-null]"); } @@ -662,12 +749,11 @@ garrow_uint32_array_builder_append(GArrowUInt32ArrayBuilder *builder, guint32 value, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt32Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint32-array-builder][append]"); + return garrow_array_builder_append<arrow::UInt32Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[uint32-array-builder][append]"); } /** @@ -681,14 +767,10 @@ gboolean garrow_uint32_array_builder_append_null(GArrowUInt32ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt32Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[uint32-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::UInt32Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint32-array-builder][append-null]"); } @@ -733,12 +815,11 @@ garrow_int64_array_builder_append(GArrowInt64ArrayBuilder *builder, gint64 value, GError **error) { - auto arrow_builder = - static_cast<arrow::Int64Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int64-array-builder][append]"); + return garrow_array_builder_append<arrow::Int64Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int64-array-builder][append]"); } /** @@ -752,12 +833,10 @@ gboolean garrow_int64_array_builder_append_null(GArrowInt64ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::Int64Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int64-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::Int64Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[int64-array-builder][append-null]"); } @@ -802,12 +881,11 @@ garrow_uint64_array_builder_append(GArrowUInt64ArrayBuilder *builder, guint64 value, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt64Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint64-array-builder][append]"); + return garrow_array_builder_append<arrow::UInt64Builder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[uint64-array-builder][append]"); } /** @@ -821,17 +899,10 @@ gboolean garrow_uint64_array_builder_append_null(GArrowUInt64ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::UInt64Builder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - if (status.ok()) { - return TRUE; - } else { - garrow_error_check(error, status, "[uint64-array-builder][append-null]"); - return FALSE; - } + return garrow_array_builder_append_null<arrow::UInt64Builder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint64-array-builder][append-null]"); } @@ -876,12 +947,11 @@ garrow_float_array_builder_append(GArrowFloatArrayBuilder *builder, gfloat value, GError **error) { - auto arrow_builder = - static_cast<arrow::FloatBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[float-array-builder][append]"); + return garrow_array_builder_append<arrow::FloatBuilder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[float-array-builder][append]"); } /** @@ -895,12 +965,10 @@ gboolean garrow_float_array_builder_append_null(GArrowFloatArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::FloatBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[float-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::FloatBuilder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[float-array-builder][append-null]"); } @@ -945,12 +1013,11 @@ garrow_double_array_builder_append(GArrowDoubleArrayBuilder *builder, gdouble value, GError **error) { - auto arrow_builder = - static_cast<arrow::DoubleBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[double-array-builder][append]"); + return garrow_array_builder_append<arrow::DoubleBuilder *> + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[double-array-builder][append]"); } /** @@ -964,14 +1031,10 @@ gboolean garrow_double_array_builder_append_null(GArrowDoubleArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::DoubleBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[double-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::DoubleBuilder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[double-array-builder][append-null]"); } @@ -1037,14 +1100,10 @@ gboolean garrow_binary_array_builder_append_null(GArrowBinaryArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::BinaryBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[binary-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::BinaryBuilder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[binary-array-builder][append-null]"); } @@ -1240,12 +1299,10 @@ gboolean garrow_list_array_builder_append_null(GArrowListArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::ListBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[list-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::ListBuilder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[list-array-builder][append-null]"); } /** @@ -1390,14 +1447,10 @@ gboolean garrow_struct_array_builder_append_null(GArrowStructArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast<arrow::StructBuilder *>( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[struct-array-builder][append-null]"); + return garrow_array_builder_append_null<arrow::StructBuilder *> + (GARROW_ARRAY_BUILDER(builder), + error, + "[struct-array-builder][append-null]"); } /** @@ -1450,59 +1503,60 @@ garrow_struct_array_builder_get_field_builders(GArrowStructArrayBuilder *builder G_END_DECLS GArrowArrayBuilder * -garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder) -{ - GType type; - - switch (arrow_builder->type()->id()) { - case arrow::Type::type::BOOL: - type = GARROW_TYPE_BOOLEAN_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT8: - type = GARROW_TYPE_UINT8_ARRAY_BUILDER; - break; - case arrow::Type::type::INT8: - type = GARROW_TYPE_INT8_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT16: - type = GARROW_TYPE_UINT16_ARRAY_BUILDER; - break; - case arrow::Type::type::INT16: - type = GARROW_TYPE_INT16_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT32: - type = GARROW_TYPE_UINT32_ARRAY_BUILDER; - break; - case arrow::Type::type::INT32: - type = GARROW_TYPE_INT32_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT64: - type = GARROW_TYPE_UINT64_ARRAY_BUILDER; - break; - case arrow::Type::type::INT64: - type = GARROW_TYPE_INT64_ARRAY_BUILDER; - break; - case arrow::Type::type::FLOAT: - type = GARROW_TYPE_FLOAT_ARRAY_BUILDER; - break; - case arrow::Type::type::DOUBLE: - type = GARROW_TYPE_DOUBLE_ARRAY_BUILDER; - break; - case arrow::Type::type::BINARY: - type = GARROW_TYPE_BINARY_ARRAY_BUILDER; - break; - case arrow::Type::type::STRING: - type = GARROW_TYPE_STRING_ARRAY_BUILDER; - break; - case arrow::Type::type::LIST: - type = GARROW_TYPE_LIST_ARRAY_BUILDER; - break; - case arrow::Type::type::STRUCT: - type = GARROW_TYPE_STRUCT_ARRAY_BUILDER; - break; - default: - type = GARROW_TYPE_ARRAY_BUILDER; - break;
+garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder, + GType type) +{ + if (type == G_TYPE_INVALID) { + switch (arrow_builder->type()->id()) { + case arrow::Type::type::BOOL: + type = GARROW_TYPE_BOOLEAN_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT8: + type = GARROW_TYPE_UINT8_ARRAY_BUILDER; + break; + case arrow::Type::type::INT8: + type = GARROW_TYPE_INT8_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT16: + type = GARROW_TYPE_UINT16_ARRAY_BUILDER; + break; + case arrow::Type::type::INT16: + type = GARROW_TYPE_INT16_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT32: + type = GARROW_TYPE_UINT32_ARRAY_BUILDER; + break; + case arrow::Type::type::INT32: + type = GARROW_TYPE_INT32_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT64: + type = GARROW_TYPE_UINT64_ARRAY_BUILDER; + break; + case arrow::Type::type::INT64: + type = GARROW_TYPE_INT64_ARRAY_BUILDER; + break; + case arrow::Type::type::FLOAT: + type = GARROW_TYPE_FLOAT_ARRAY_BUILDER; + break; + case arrow::Type::type::DOUBLE: + type = GARROW_TYPE_DOUBLE_ARRAY_BUILDER; + break; + case arrow::Type::type::BINARY: + type = GARROW_TYPE_BINARY_ARRAY_BUILDER; + break; + case arrow::Type::type::STRING: + type = GARROW_TYPE_STRING_ARRAY_BUILDER; + break; + case arrow::Type::type::LIST: + type = GARROW_TYPE_LIST_ARRAY_BUILDER; + break; + case arrow::Type::type::STRUCT: + type = GARROW_TYPE_STRUCT_ARRAY_BUILDER; + break; + default: + type = GARROW_TYPE_ARRAY_BUILDER; + break; + } } auto builder = diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index f5a8ac73d630a..613a5bad494d1 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -65,7 +65,8 @@ struct _GArrowArrayBuilderClass GType garrow_array_builder_get_type (void) G_GNUC_CONST; -GArrowArray *garrow_array_builder_finish (GArrowArrayBuilder *builder); +GArrowArray *garrow_array_builder_finish (GArrowArrayBuilder *builder, + GError **error); #define GARROW_TYPE_BOOLEAN_ARRAY_BUILDER \ @@ -119,6 +120,57 @@ gboolean garrow_boolean_array_builder_append_null(GArrowBooleanArrayBuilder *bui GError **error); +#define GARROW_TYPE_INT_ARRAY_BUILDER \ + (garrow_int_array_builder_get_type()) +#define GARROW_INT_ARRAY_BUILDER(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_INT_ARRAY_BUILDER, \ + GArrowIntArrayBuilder)) +#define GARROW_INT_ARRAY_BUILDER_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_INT_ARRAY_BUILDER, \ + GArrowIntArrayBuilderClass)) +#define GARROW_IS_INT_ARRAY_BUILDER(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_INT_ARRAY_BUILDER)) +#define GARROW_IS_INT_ARRAY_BUILDER_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_INT_ARRAY_BUILDER)) +#define GARROW_INT_ARRAY_BUILDER_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_INT_ARRAY_BUILDER, \ + GArrowIntArrayBuilderClass)) + +typedef struct _GArrowIntArrayBuilder GArrowIntArrayBuilder; +typedef struct _GArrowIntArrayBuilderClass GArrowIntArrayBuilderClass; + +/** + * GArrowIntArrayBuilder: + * + * It wraps `arrow::AdaptiveIntBuilder`. 
+ */ +struct _GArrowIntArrayBuilder +{ + /*< private >*/ + GArrowArrayBuilder parent_instance; +}; + +struct _GArrowIntArrayBuilderClass +{ + GArrowArrayBuilderClass parent_class; +}; + +GType garrow_int_array_builder_get_type(void) G_GNUC_CONST; + +GArrowIntArrayBuilder *garrow_int_array_builder_new(void); + +gboolean garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error); +gboolean garrow_int_array_builder_append_null(GArrowIntArrayBuilder *builder, + GError **error); + + #define GARROW_TYPE_INT8_ARRAY_BUILDER \ (garrow_int8_array_builder_get_type()) #define GARROW_INT8_ARRAY_BUILDER(obj) \ diff --git a/c_glib/arrow-glib/array-builder.hpp b/c_glib/arrow-glib/array-builder.hpp index e65ad005c12fd..bcdc58fd8844b 100644 --- a/c_glib/arrow-glib/array-builder.hpp +++ b/c_glib/arrow-glib/array-builder.hpp @@ -22,5 +22,6 @@ #include <arrow/api.h> #include <arrow-glib/array-builder.h> -GArrowArrayBuilder *garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder); +GArrowArrayBuilder *garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder, + GType type=G_TYPE_INVALID); arrow::ArrayBuilder *garrow_array_builder_get_raw(GArrowArrayBuilder *builder); diff --git a/c_glib/arrow-glib/array.cpp b/c_glib/arrow-glib/array.cpp index 30e51fb309f97..a3c45a890321c 100644 --- a/c_glib/arrow-glib/array.cpp +++ b/c_glib/arrow-glib/array.cpp @@ -557,6 +557,30 @@ garrow_boolean_array_get_value(GArrowBooleanArray *array, return static_cast<arrow::BooleanArray *>(arrow_array.get())->Value(i); } +/** + * garrow_boolean_array_get_values: + * @array: A #GArrowBooleanArray. + * @length: (out): The number of values. + * + * Returns: (array length=length): The raw boolean values. + * + * It should be freed with g_free() when no longer needed. + */ +gboolean * +garrow_boolean_array_get_values(GArrowBooleanArray *array, + gint64 *length) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_boolean_array = + std::static_pointer_cast<arrow::BooleanArray>(arrow_array); + *length = arrow_boolean_array->length(); + auto values = static_cast<gboolean *>(g_new(gboolean, *length)); + for (gint64 i = 0; i < *length; ++i) { + values[i] = arrow_boolean_array->Value(i); + } + return values; +} + G_DEFINE_TYPE(GArrowInt8Array, \ garrow_int8_array, \ diff --git a/c_glib/arrow-glib/array.h b/c_glib/arrow-glib/array.h index 1b2ba9fc1cc39..10b2279be4bbf 100644 --- a/c_glib/arrow-glib/array.h +++ b/c_glib/arrow-glib/array.h @@ -221,6 +221,8 @@ GArrowBooleanArray *garrow_boolean_array_new(gint64 length, gboolean garrow_boolean_array_get_value (GArrowBooleanArray *array, gint64 i); +gboolean *garrow_boolean_array_get_values(GArrowBooleanArray *array, + gint64 *length); #define GARROW_TYPE_INT8_ARRAY \ diff --git a/c_glib/configure.ac b/c_glib/configure.ac index d4e828ba55c1b..375f76efcdd51 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -79,6 +79,7 @@ else ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" ARROW_CFLAGS="-I${ARROW_INCLUDE_DIR}" + ARROW_LIBS="-L${ARROW_LIB_DIR} -larrow" AC_SUBST(ARROW_LIB_DIR) diff --git a/c_glib/example/build.c b/c_glib/example/build.c index 2722458acd5c4..8c6cf74d74815 100644 --- a/c_glib/example/build.c +++ b/c_glib/example/build.c @@ -47,7 +47,13 @@ main(int argc, char **argv) g_object_unref(builder); return EXIT_FAILURE; } - array = garrow_array_builder_finish(GARROW_ARRAY_BUILDER(builder)); + array = garrow_array_builder_finish(GARROW_ARRAY_BUILDER(builder), &error); + if (!array) { + g_print("failed to finish: %s\n", error->message); + g_error_free(error); + 
g_object_unref(builder); + return EXIT_FAILURE; + } g_object_unref(builder); } diff --git a/c_glib/example/go/write-batch.go b/c_glib/example/go/write-batch.go index cda09a9b4e8f7..9dbc3c00acc50 100644 --- a/c_glib/example/go/write-batch.go +++ b/c_glib/example/go/write-batch.go @@ -29,7 +29,11 @@ func BuildUInt8Array() *arrow.Array { for _, value := range []uint8{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt16Array() *arrow.Array { @@ -37,7 +41,11 @@ func BuildUInt16Array() *arrow.Array { for _, value := range []uint16{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt32Array() *arrow.Array { @@ -45,7 +53,11 @@ func BuildUInt32Array() *arrow.Array { for _, value := range []uint32{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt64Array() *arrow.Array { @@ -53,7 +65,11 @@ func BuildUInt64Array() *arrow.Array { for _, value := range []uint64{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt8Array() *arrow.Array { @@ -61,7 +77,11 @@ func BuildInt8Array() *arrow.Array { for _, value := range []int8{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt16Array() *arrow.Array { @@ -69,7 +89,11 @@ func BuildInt16Array() *arrow.Array { for _, value := range []int16{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt32Array() *arrow.Array { @@ -77,7 +101,11 @@ func BuildInt32Array() *arrow.Array { for _, value := range []int32{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt64Array() *arrow.Array { @@ -85,7 +113,11 @@ func BuildInt64Array() *arrow.Array { for _, value := range []int64{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildFloatArray() *arrow.Array { @@ -93,7 +125,11 @@ func BuildFloatArray() *arrow.Array { for _, value := range []float32{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildDoubleArray() *arrow.Array { @@ -101,7 +137,11 @@ func BuildDoubleArray() *arrow.Array { for _, value := range []float64{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func main() { diff --git a/c_glib/example/go/write-stream.go b/c_glib/example/go/write-stream.go index 20cb03ef2e324..244741e8cfeb0 100644 --- a/c_glib/example/go/write-stream.go +++ 
b/c_glib/example/go/write-stream.go @@ -29,7 +29,11 @@ func BuildUInt8Array() *arrow.Array { for _, value := range []uint8{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt16Array() *arrow.Array { @@ -37,7 +41,11 @@ func BuildUInt16Array() *arrow.Array { for _, value := range []uint16{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt32Array() *arrow.Array { @@ -45,7 +53,11 @@ func BuildUInt32Array() *arrow.Array { for _, value := range []uint32{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt64Array() *arrow.Array { @@ -53,7 +65,11 @@ func BuildUInt64Array() *arrow.Array { for _, value := range []uint64{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt8Array() *arrow.Array { @@ -61,7 +77,11 @@ func BuildInt8Array() *arrow.Array { for _, value := range []int8{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt16Array() *arrow.Array { @@ -69,7 +89,11 @@ func BuildInt16Array() *arrow.Array { for _, value := range []int16{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt32Array() *arrow.Array { @@ -77,7 +101,11 @@ func BuildInt32Array() *arrow.Array { for _, value := range []int32{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt64Array() *arrow.Array { @@ -85,7 +113,11 @@ func BuildInt64Array() *arrow.Array { for _, value := range []int64{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildFloatArray() *arrow.Array { @@ -93,7 +125,11 @@ func BuildFloatArray() *arrow.Array { for _, value := range []float32{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildDoubleArray() *arrow.Array { @@ -101,7 +137,11 @@ func BuildDoubleArray() *arrow.Array { for _, value := range []float64{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func main() { diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index f1bac47d6c7b9..3181c098c002b 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -21,6 +21,10 @@ def build_boolean_array(values) build_array(Arrow::BooleanArrayBuilder, values) end + def build_int_array(values) + build_array(Arrow::IntArrayBuilder, values) + end + def build_int8_array(values) 
build_array(Arrow::Int8ArrayBuilder, values) end diff --git a/c_glib/test/test-boolean-array.rb b/c_glib/test/test-boolean-array.rb index 43b83655638e3..622e4e90c482b 100644 --- a/c_glib/test/test-boolean-array.rb +++ b/c_glib/test/test-boolean-array.rb @@ -17,6 +17,7 @@ class TestBooleanArray < Test::Unit::TestCase include Helper::Buildable + include Helper::Omittable def test_new assert_equal(build_boolean_array([true, false, nil]), @@ -41,4 +42,14 @@ def test_value array = builder.finish assert_equal(true, array.get_value(0)) end + + def test_values + require_gi(3, 1, 9) + builder = Arrow::BooleanArrayBuilder.new + builder.append(true) + builder.append(false) + builder.append(true) + array = builder.finish + assert_equal([true, false, true], array.values) + end end diff --git a/c_glib/test/test-int-array-builder.rb b/c_glib/test/test-int-array-builder.rb new file mode 100644 index 0000000000000..e1a6c3b216597 --- /dev/null +++ b/c_glib/test/test-int-array-builder.rb @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestIntArrayBuilder < Test::Unit::TestCase + include Helper::Buildable + + def test_int8 + values = [-1, 2] + assert_equal(build_int_array([*values, nil]), + Arrow::Int8Array.new(3, + Arrow::Buffer.new(values.pack("c*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end + + def test_int16 + border_value = (2 ** (8 - 1)) + values = [-1, border_value] + assert_equal(build_int_array([*values, nil]), + Arrow::Int16Array.new(3, + Arrow::Buffer.new(values.pack("s*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end + + def test_int32 + border_value = (2 ** (16 - 1)) + values = [-1, border_value] + assert_equal(build_int_array([*values, nil]), + Arrow::Int32Array.new(3, + Arrow::Buffer.new(values.pack("l*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end + + def test_int64 + border_value = (2 ** (32 - 1)) + values = [-1, border_value] + assert_equal(build_int_array([*values, nil]), + Arrow::Int64Array.new(3, + Arrow::Buffer.new(values.pack("q*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end +end diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat index 04fe2ab62cbd4..6ebd22fc3e354 100644 --- a/ci/msvc-build.bat +++ b/ci/msvc-build.bat @@ -104,7 +104,6 @@ cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DPARQUET_BOOST_USE_SHARED=OFF ^ - -DPARQUET_ZLIB_VENDORED=off ^ -DPARQUET_BUILD_TESTS=off .. || exit /B cmake --build . 
--target INSTALL --config %CONFIGURATION% || exit /B popd diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index bf2d385d79d4d..7ab8e2eaa03ee 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -74,6 +74,10 @@ CONFIGURE_OPTIONS="--prefix=$ARROW_C_GLIB_INSTALL" if [ $TRAVIS_OS_NAME != "osx" ]; then CONFIGURE_OPTIONS="$CONFIGURE_OPTIONS --enable-gtk-doc" fi + +CONFIGURE_OPTIONS="$CONFIGURE_OPTIONS CFLAGS=-DARROW_NO_DEPRECATED_API" +CONFIGURE_OPTIONS="$CONFIGURE_OPTIONS CXXFLAGS=-DARROW_NO_DEPRECATED_API" + ./configure $CONFIGURE_OPTIONS make -j4 diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index d456d308c53e3..7418b7614ae34 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -70,12 +70,6 @@ if [ $only_library_mode == "yes" ]; then $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_TESTS=OFF \ -DARROW_BUILD_UTILITIES=OFF" -else - # Deactivate jemalloc on Linux builds. We check the jemalloc+Linux build - # also in the manylinux1 image. - CMAKE_LINUX_FLAGS="\ -$CMAKE_LINUX_FLAGS \ --DARROW_JEMALLOC=ON" fi # Use Ninja for faster builds when using toolchain diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 9135aaf38e4e7..66cd17d5ff212 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -92,7 +92,13 @@ python_version_tests() { conda install -y -q nomkl # Expensive dependencies install from Continuum package repo - conda install -y -q pip numpy pandas cython + conda install -y -q pip numpy pandas cython flake8 + + # Fail fast on style checks + flake8 pyarrow + + # Check Cython files with some checks turned off + flake8 --config=.flake8.cython pyarrow # Build C++ libraries rebuild_arrow_libraries @@ -114,14 +120,13 @@ python_version_tests() { PYARROW_PATH=$CONDA_PREFIX/lib/python$PYTHON_VERSION/site-packages/pyarrow python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet - pushd $ARROW_PYTHON_DIR - # Build documentation once - if [[ "$PYTHON_VERSION" == "3.6" ]] - then - conda install -y -q --file=doc/requirements.txt - python setup.py build_sphinx -s doc/source + if [ "$PYTHON_VERSION" == "3.6" ] && [ $TRAVIS_OS_NAME == "linux" ]; then + # Build documentation once + pushd $ARROW_PYTHON_DIR/doc + conda install -y -q --file=requirements.txt + sphinx-build -b html -d _build/doctrees -W source _build/html + popd fi - popd } # run tests for python 2.7 and 3.6 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 07b8e15b504e4..7d73d1ffff089 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -100,7 +100,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(ARROW_JEMALLOC "Build the Arrow jemalloc-based allocator" - ON) + OFF) option(ARROW_JEMALLOC_USE_SHARED "Rely on jemalloc shared libraries where relevant" @@ -114,6 +114,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Rely on boost shared libraries where relevant" ON) + option(ARROW_BOOST_VENDORED + "Use vendored Boost instead of existing Boost" + OFF) + option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) diff --git a/cpp/README.md b/cpp/README.md index 2f98b085115f5..0228faf7349c5 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -126,6 +126,14 @@ This project follows [Google's C++ Style Guide][3] with minor exceptions. We do not encourage anonymous namespaces and we relax the line length restriction to 90 characters. +### Memory Pools + +We provide a default memory pool with `arrow::default_memory_pool()`. 
As a +matter of convenience, some of the array builder classes have constructors +which use the default pool without explicitly passing it. You can disable these +constructors in your application (so that you are accounting properly for all +memory allocations) by defining `ARROW_NO_DEFAULT_MEMORY_POOL`. + ### Error Handling and Exceptions For error handling, we use `arrow::Status` values instead of throwing C++ @@ -149,6 +157,12 @@ constructors, the circumstances where they would are somewhat esoteric, and it is likely that an application would have encountered other more serious problems prior to having `std::bad_alloc` thrown in a constructor. +### Deprecations and API Changes + +We use the compiler definition `ARROW_NO_DEPRECATED_API` to disable APIs that +have been deprecated. It is a good practice to compile third party applications +with this flag to proactively catch and account for API changes. + ## Continuous Integration Pull requests are run through travis-ci for continuous integration. You can avoid diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index f32ad5425da35..94156d55801f1 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -2084,7 +2084,7 @@ PREDEFINED = __attribute__(x)= \ # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = ARROW_MEMORY_POOL_ARG # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have diff --git a/cpp/apidoc/Windows.md b/cpp/apidoc/Windows.md index 30b7b8f3ce210..774482ea1c4f3 100644 --- a/cpp/apidoc/Windows.md +++ b/cpp/apidoc/Windows.md @@ -187,14 +187,18 @@ Command line to build Arrow in Debug might look as following: cd cpp mkdir build cd build -cmake -G "Visual Studio 14 2015 Win64" -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=OFF -DCMAKE_BUILD_TYPE=Debug -DBOOST_ROOT=C:/local/boost_1_63_0 -DBOOST_LIBRARYDIR=C:/local/boost_1_63_0/lib64-msvc-14.0 .. +cmake -G "Visual Studio 14 2015 Win64" ^ + -DARROW_BOOST_USE_SHARED=OFF ^ + -DCMAKE_BUILD_TYPE=Debug ^ + -DBOOST_ROOT=C:/local/boost_1_63_0 ^ + -DBOOST_LIBRARYDIR=C:/local/boost_1_63_0/lib64-msvc-14.0 ^ + .. cmake --build . --config Debug ``` To get the latest build instructions, you can reference [msvc-build.bat][5], which is used by automated Appveyor builds. - [1]: https://conda.io/miniconda.html [2]: https://conda-forge.github.io/ [3]: http://cmder.net/ diff --git a/cpp/apidoc/index.md b/cpp/apidoc/index.md index 8389d16b4aa1b..ab9bbaa405abc 100644 --- a/cpp/apidoc/index.md +++ b/cpp/apidoc/index.md @@ -39,6 +39,7 @@ Table of Contents * How to access [HDFS](HDFS.md) * Tutorials * [Convert a vector of row-wise data into an Arrow table](tutorials/row_wise_conversion.md) + * [Using the Plasma In-Memory Object Store](tutorials/plasma.md) Getting Started --------------- diff --git a/cpp/apidoc/tutorials/plasma.md b/cpp/apidoc/tutorials/plasma.md new file mode 100644 index 0000000000000..9911546ed5c69 --- /dev/null +++ b/cpp/apidoc/tutorials/plasma.md @@ -0,0 +1,442 @@ + + +Using the Plasma In-Memory Object Store from C++ +================================================ + +Apache Arrow offers the ability to share your data structures among multiple +processes simultaneously through Plasma, an in-memory object store. + +Note that **the Plasma API is not stable**. + +Plasma clients are processes that run on the same machine as the object store. 
+They communicate with the object store over Unix domain sockets, and they read +and write data in the object store through shared memory. + +Plasma objects are immutable once they have been created. + +The following goes over the basics so you can begin using Plasma in your big +data applications. + +Starting the Plasma store +------------------------- + +To start running the Plasma object store so that clients may +connect and access the data, run the following command: + +``` +plasma_store -m 1000000000 -s /tmp/plasma +``` + +The `-m` flag specifies the size of the object store in bytes. The `-s` flag +specifies the path of the Unix domain socket that the store will listen at. + +Therefore, the above command initializes a Plasma store with up to 1 GB of memory +and sets the socket to `/tmp/plasma`. + +The Plasma store will remain available as long as the `plasma_store` process is +running in a terminal window. Messages, such as alerts for disconnecting +clients, may occasionally be output. To stop running the Plasma store, you +can press `Ctrl-C` in the terminal window. + +Alternatively, you can run the Plasma store in the background and ignore all +message output with the following terminal command: + +``` +plasma_store -m 1000000000 -s /tmp/plasma 1> /dev/null 2> /dev/null & +``` + +The Plasma store will instead run silently in the background. To stop running +the Plasma store in this case, issue the command below: + +``` +killall plasma_store +``` + +Creating a Plasma client +------------------------ + +Now that the Plasma object store is up and running, it is time to make a client +process connect to it. To use the Plasma object store as a client, your +application should initialize a `plasma::PlasmaClient` object and tell it to +connect to the socket specified when starting up the Plasma object store. + +```cpp +#include <plasma/client.h> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + // Disconnect the Plasma client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +Save this program in a file `test.cc` and compile it with + +``` +g++ test.cc `pkg-config --cflags --libs plasma` --std=c++11 +``` + +Note that multiple clients can be created within the same process. + +Note that a `PlasmaClient` object is **not thread safe**. + +If the Plasma store is still running, you can now execute the `a.out` executable +and the store will print something like + +``` +Disconnecting client on fd 5 +``` + +which shows that the client was successfully disconnected. + +Object IDs +---------- + +The Plasma object store uses twenty-byte identifiers for accessing objects +stored in shared memory. Each object in the Plasma store should be associated +with a unique ID. The Object ID is then a key that can be used by **any** client +to fetch that object from the Plasma store. + +Random generation of Object IDs is often good enough to ensure unique IDs: + +```cpp +// Randomly generate an Object ID. +ObjectID object_id = ObjectID::from_random(); +``` + +Now, any connected client that knows the object's Object ID can access the +same object from the Plasma object store. For easy transportation of Object IDs, +you can convert/serialize an Object ID into a binary string and back as +follows: + +```cpp +// From ObjectID to binary string +std::string id_string = object_id.binary(); + +// From binary string to ObjectID +ObjectID id_object = ObjectID::from_binary(id_string); +``` + +You can also get a human readable representation of ObjectIDs in the same +format that git uses for commit hashes by calling `ObjectID::hex`. + +Here is a test program you can run: + +```cpp +#include <plasma/client.h> +#include <iostream> +#include <string> + +using namespace plasma; + +int main(int argc, char** argv) { + ObjectID object_id1 = ObjectID::from_random(); + std::cout << "object_id1 is " << object_id1.hex() << std::endl; + + std::string id_string = object_id1.binary(); + ObjectID object_id2 = ObjectID::from_binary(id_string); + std::cout << "object_id2 is " << object_id2.hex() << std::endl; +} +``` + +Creating an Object +------------------ + +Now that you have learned about Object IDs that are used to refer to objects, +let's look at how objects can be stored in Plasma. + +Storing objects is a two-stage process. First a buffer is allocated with a call +to `Create`. Then it can be constructed in place by the client. Then it is made +immutable and shared with other clients via a call to `Seal`. + +The `Create` call blocks while the Plasma store allocates a buffer of the +appropriate size. The client will then map the buffer into its own address +space. At this point the object can be constructed in place using a pointer that +was written by the `Create` command. + +```cpp +int64_t data_size = 100; +// The address of the buffer allocated by the Plasma store will be written at +// this address. +uint8_t* data; +// Create a Plasma object by specifying its ID and size. +ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data)); +``` + +You can also specify metadata for the object; the third argument is the +metadata (as raw bytes) and the fourth argument is the size of the metadata. + +```cpp +// Create a Plasma object with metadata. +int64_t data_size = 100; +std::string metadata = "{'author': 'john'}"; +uint8_t* data; +client.Create(object_id, data_size, (uint8_t*) metadata.data(), metadata.size(), &data); +``` + +Now that we've obtained a pointer to our object's data, we can +write our data to it: + +```cpp +// Write some data for the Plasma object. +for (int64_t i = 0; i < data_size; i++) { + data[i] = static_cast<uint8_t>(i % 4); +} +``` + +When the client is done, the client **seals** the buffer, making the object +immutable, and making it available to other Plasma clients: + +```cpp +// Seal the object. This makes it available for all clients. +client.Seal(object_id); +``` + +Here is an example that combines all these features: + +```cpp +#include <plasma/client.h> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + // Create an object with a fixed ObjectID. + ObjectID object_id = ObjectID::from_binary("00000000000000000000"); + int64_t data_size = 1000; + uint8_t *data; + std::string metadata = "{'author': 'john'}"; + ARROW_CHECK_OK(client.Create(object_id, data_size, (uint8_t*) metadata.data(), metadata.size(), &data)); + // Write some data into the object. + for (int64_t i = 0; i < data_size; i++) { + data[i] = static_cast<uint8_t>(i % 4); + } + // Seal the object. + ARROW_CHECK_OK(client.Seal(object_id)); + // Disconnect the client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +This example can be compiled with + +``` +g++ create.cc `pkg-config --cflags --libs plasma` --std=c++11 -o create +``` + +To verify that an object exists in the Plasma object store, you can +call `PlasmaClient::Contains()` to check if an object has +been created and sealed for a given Object ID. Note that this function +will still return false if the object has been created, but not yet +sealed: + +```cpp +// Check if an object has been created and sealed. +bool has_object; +client.Contains(object_id, &has_object); +if (has_object) { + // Object has been created and sealed, proceed +} +``` + +Getting an Object +----------------- + +After an object has been sealed, any client who knows the Object ID can get +the object. To store the retrieved object contents, you should create an +`ObjectBuffer`, then call `PlasmaClient::Get()` as follows: + +```cpp +// Get from the Plasma store by Object ID. +ObjectBuffer object_buffer; +client.Get(&object_id, 1, -1, &object_buffer); +``` + +`PlasmaClient::Get()` isn't limited to fetching a single object +from the Plasma store at once. You can specify an array of Object IDs and +`ObjectBuffers` to fetch at once, so long as you also specify the +number of objects being fetched: + +```cpp +// Get two objects at once from the Plasma store. This function +// call will block until both objects have been fetched. +ObjectBuffer multiple_buffers[2]; +ObjectID multiple_ids[2] = {object_id1, object_id2}; +client.Get(multiple_ids, 2, -1, multiple_buffers); +``` + +Since `PlasmaClient::Get()` is a blocking function call, it may be +necessary to limit the amount of time the function is allowed to take +when trying to fetch from the Plasma store. You can pass in a timeout +in milliseconds when calling `PlasmaClient::Get()`. To use `PlasmaClient::Get()` +without a timeout, just pass in -1 like in the previous example calls: + +```cpp +// Make the function call give up fetching the object if it takes +// more than 100 milliseconds. +int64_t timeout = 100; +client.Get(&object_id, 1, timeout, &object_buffer); +``` + +Finally, to access the object, you can access the `data` and +`metadata` attributes of the `ObjectBuffer`. The `data` can be indexed +like any array: + +```cpp +// Access object data. +uint8_t* data = object_buffer.data; +int64_t data_size = object_buffer.data_size; + +// Access object metadata. +uint8_t* metadata = object_buffer.metadata; +int64_t metadata_size = object_buffer.metadata_size; + +// Index into data array. +uint8_t first_data_byte = data[0]; +``` + +Here is a longer example that shows these capabilities: + +```cpp +#include <plasma/client.h> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + ObjectID object_id = ObjectID::from_binary("00000000000000000000"); + ObjectBuffer object_buffer; + ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer)); + + // Retrieve object data. + uint8_t* data = object_buffer.data; + int64_t data_size = object_buffer.data_size; + + // Check that the data agrees with what was written in the other process. + for (int64_t i = 0; i < data_size; i++) { + ARROW_CHECK(data[i] == static_cast<uint8_t>(i % 4)); + } + + // Disconnect the client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +If you compile it with + +``` +g++ get.cc `pkg-config --cflags --libs plasma` --std=c++11 -o get +``` + +and run it with `./get`, all the assertions will pass if you run the `create` +example from above on the same Plasma store. + + +Object Lifetime Management +-------------------------- + +The Plasma store internally does reference counting to make sure objects that +are mapped into the address space of one of the clients with `PlasmaClient::Get` +remain accessible. To unmap objects from a client, call `PlasmaClient::Release`. +All objects that are mapped into a client's address space will automatically +be released when the client is disconnected from the store (this happens even +if the client process crashes or otherwise fails to call `Disconnect`). + +If a new object is created and there is not enough space in the Plasma store, +the store will evict the least recently used object (an object is in use if at +least one client has gotten it but not released it). + +Object notifications +-------------------- + +Additionally, you can arrange to have Plasma notify you when objects are +sealed in the object store. This can be especially handy when your +program is collaborating with other Plasma clients, and needs to know +when they make objects available. + +First, you can subscribe your current Plasma client to such notifications +by getting a file descriptor: + +```cpp +// Start receiving notifications into fd. +int fd; +ARROW_CHECK_OK(client.Subscribe(&fd)); +``` + +Once you have the file descriptor, you can have your current Plasma client +wait to receive the next object notification. Object notifications +include information such as Object ID, data size, and metadata size of +the next newly available object: + +```cpp +// Receive notification of the next newly available object. +// Notification information is stored in object_id, data_size, and metadata_size. +ObjectID object_id; +int64_t data_size; +int64_t metadata_size; +ARROW_CHECK_OK(client.GetNotification(fd, &object_id, &data_size, &metadata_size)); + +// Get the newly available object. +ObjectBuffer object_buffer; +ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer)); +``` + +Here is a full program that shows this capability: + +```cpp +#include <plasma/client.h> +#include <iostream> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + + int fd; + ARROW_CHECK_OK(client.Subscribe(&fd)); + + ObjectID object_id; + int64_t data_size; + int64_t metadata_size; + while (true) { + ARROW_CHECK_OK(client.GetNotification(fd, &object_id, &data_size, &metadata_size)); + + std::cout << "Received object notification for object_id = " + << object_id.hex() << ", with data_size = " << data_size + << ", and metadata_size = " << metadata_size << std::endl; + } + + // Disconnect the client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +If you compile it with + +``` +g++ subscribe.cc `pkg-config --cflags --libs plasma` --std=c++11 -o subscribe +``` + +and invoke `./create` and `./subscribe` while the Plasma store is running, +you can observe the new object arriving. 
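For reference, the pieces shown above can also be exercised end to end in a single process. Below is a minimal sketch (a hypothetical `roundtrip.cc`, not part of the tutorial's own examples) combining `Connect`, `Create`, `Seal`, `Get`, and `Disconnect` under the same pre-1.0 Plasma API used throughout the tutorial; the Object ID is arbitrary:

```cpp
#include <plasma/client.h>

using namespace plasma;

int main(int argc, char** argv) {
  // Start up and connect a Plasma client, as in the examples above.
  PlasmaClient client;
  ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY));

  // Write side: create, fill, and seal an object.
  ObjectID object_id = ObjectID::from_binary("11111111111111111111");
  int64_t data_size = 100;
  uint8_t* data;
  ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data));
  for (int64_t i = 0; i < data_size; i++) {
    data[i] = static_cast<uint8_t>(i % 4);
  }
  ARROW_CHECK_OK(client.Seal(object_id));

  // Read side: fetch the sealed object back and verify its contents.
  ObjectBuffer object_buffer;
  ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer));
  for (int64_t i = 0; i < object_buffer.data_size; i++) {
    ARROW_CHECK(object_buffer.data[i] == static_cast<uint8_t>(i % 4));
  }

  // Disconnect the client.
  ARROW_CHECK_OK(client.Disconnect());
}
```

It compiles the same way as the other examples:

```
g++ roundtrip.cc `pkg-config --cflags --libs plasma` --std=c++11 -o roundtrip
```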
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1271b8a4ab3f4..a888e92392db6 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -124,50 +124,100 @@ set(Boost_ADDITIONAL_VERSIONS "1.62.0" "1.61" "1.61.0" "1.62" "1.60.0" "1.60") - -if (ARROW_BOOST_USE_SHARED) - # Find shared Boost libraries. - set(Boost_USE_STATIC_LIBS OFF) - - if(MSVC) - # disable autolinking in boost - add_definitions(-DBOOST_ALL_NO_LIB) - - # force all boost libraries to dynamic link - add_definitions(-DBOOST_ALL_DYN_LINK) - endif() - +list(GET Boost_ADDITIONAL_VERSIONS 0 BOOST_LATEST_VERSION) +string(REPLACE "." "_" BOOST_LATEST_VERSION_IN_PATH ${BOOST_LATEST_VERSION}) +set(BOOST_LATEST_URL + "https://dl.bintray.com/boostorg/release/${BOOST_LATEST_VERSION}/source/boost_${BOOST_LATEST_VERSION_IN_PATH}.tar.gz") + +if (ARROW_BOOST_VENDORED) + set(BOOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/boost_ep-prefix/src/boost_ep") + set(BOOST_LIB_DIR "${BOOST_PREFIX}/stage/lib") + set(BOOST_BUILD_LINK "static") + set(BOOST_STATIC_SYSTEM_LIBRARY + "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(BOOST_STATIC_FILESYSTEM_LIBRARY + "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}") + set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}") if (ARROW_BOOST_HEADER_ONLY) - find_package(Boost) + set(BOOST_BUILD_PRODUCTS) + set(BOOST_CONFIGURE_COMMAND "") + set(BOOST_BUILD_COMMAND "") else() - find_package(Boost COMPONENTS system filesystem REQUIRED) + set(BOOST_BUILD_PRODUCTS + ${BOOST_SYSTEM_LIBRARY} + ${BOOST_FILESYSTEM_LIBRARY}) + set(BOOST_CONFIGURE_COMMAND + "./bootstrap.sh" + "--prefix=${BOOST_PREFIX}" + "--with-libraries=filesystem,system") if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) - set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_BUILD_VARIANT "debug") else() - set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) - set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_BUILD_VARIANT "release") endif() - set(BOOST_SYSTEM_LIBRARY boost_system_shared) - set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + set(BOOST_BUILD_COMMAND + "./b2" + "link=${BOOST_BUILD_LINK}" + "variant=${BOOST_BUILD_VARIANT}" + "cxxflags=-fPIC") endif() + ExternalProject_Add(boost_ep + URL ${BOOST_LATEST_URL} + BUILD_BYPRODUCTS ${BOOST_BUILD_PRODUCTS} + BUILD_IN_SOURCE 1 + CONFIGURE_COMMAND ${BOOST_CONFIGURE_COMMAND} + BUILD_COMMAND ${BOOST_BUILD_COMMAND} + INSTALL_COMMAND "" + ${EP_LOG_OPTIONS}) + set(Boost_INCLUDE_DIR "${BOOST_PREFIX}") + set(Boost_INCLUDE_DIRS "${BOOST_INCLUDE_DIR}") + add_dependencies(arrow_dependencies boost_ep) else() - # Find static boost headers and libs - # TODO Differentiate here between release and debug builds - set(Boost_USE_STATIC_LIBS ON) - if (ARROW_BOOST_HEADER_ONLY) - find_package(Boost) + if (ARROW_BOOST_USE_SHARED) + # Find shared Boost libraries. 
+ set(Boost_USE_STATIC_LIBS OFF) + + if(MSVC) + # disable autolinking in boost + add_definitions(-DBOOST_ALL_NO_LIB) + + # force all boost libraries to dynamic link + add_definitions(-DBOOST_ALL_DYN_LINK) + endif() + + if (ARROW_BOOST_HEADER_ONLY) + find_package(Boost REQUIRED) + else() + find_package(Boost COMPONENTS system filesystem REQUIRED) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) + set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + else() + set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + endif() + set(BOOST_SYSTEM_LIBRARY boost_system_shared) + set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + endif() else() - find_package(Boost COMPONENTS system filesystem REQUIRED) - if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) - set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + # Find static boost headers and libs + # TODO Differentiate here between release and debug builds + set(Boost_USE_STATIC_LIBS ON) + if (ARROW_BOOST_HEADER_ONLY) + find_package(Boost REQUIRED) else() - set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) - set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + find_package(Boost COMPONENTS system filesystem REQUIRED) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + else() + set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + endif() + set(BOOST_SYSTEM_LIBRARY boost_system_static) + set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) endif() - set(BOOST_SYSTEM_LIBRARY boost_system_static) - set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) endif() endif() @@ -648,7 +698,7 @@ if (ARROW_WITH_LZ4) if (MSVC) set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/visual/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/liblz4_static.lib") set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln) - set(LZ4_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose ${CMAKE_SOURCE_DIR}/build-support/lz4_msbuild_wholeprogramoptimization_param.patch) + set(LZ4_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/lz4_msbuild_wholeprogramoptimization_param.patch) else() set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a") set(LZ4_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-lz4-lib.sh) @@ -692,7 +742,7 @@ if (ARROW_WITH_ZSTD) if (MSVC) set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/libzstd_static.lib") set(ZSTD_BUILD_COMMAND BUILD_COMMAND msbuild ${ZSTD_BUILD_DIR}/build/VS2010/zstd.sln /t:Build /v:minimal /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /p:OutDir=${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/ /p:SolutionDir=${ZSTD_BUILD_DIR}/build/VS2010/ ) - set(ZSTD_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose ${CMAKE_SOURCE_DIR}/build-support/zstd_msbuild_wholeprogramoptimization_param.patch) + set(ZSTD_PATCH_COMMAND PATCH_COMMAND git --git-dir=. 
apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/zstd_msbuild_wholeprogramoptimization_param.patch) else() set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/lib/libzstd.a") set(ZSTD_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-zstd-lib.sh) diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc index 436ce9cf7c312..e94ba48d60840 100644 --- a/cpp/src/arrow/array-decimal-test.cc +++ b/cpp/src/arrow/array-decimal-test.cc @@ -37,7 +37,7 @@ class DecimalTestBase { auto type = std::make_shared(precision, 4); int byte_width = type->byte_width(); auto pool = default_memory_pool(); - auto builder = std::make_shared(pool, type); + auto builder = std::make_shared(type, pool); size_t null_count = 0; size_t size = draw.size(); diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 0efb51ccece0c..38aceb2d185bb 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -171,7 +171,7 @@ TEST_F(TestArray, TestIsNull) { TEST_F(TestArray, BuildLargeInMemoryArray) { const int64_t length = static_cast(std::numeric_limits::max()) + 1; - BooleanBuilder builder(default_memory_pool()); + BooleanBuilder builder; ASSERT_OK(builder.Reserve(length)); ASSERT_OK(builder.Advance(length)); @@ -754,9 +754,9 @@ TEST_F(TestStringArray, TestEmptyStringComparison) { } TEST_F(TestStringArray, CompareNullByteSlots) { - StringBuilder builder(default_memory_pool()); - StringBuilder builder2(default_memory_pool()); - StringBuilder builder3(default_memory_pool()); + StringBuilder builder; + StringBuilder builder2; + StringBuilder builder3; ASSERT_OK(builder.Append("foo")); ASSERT_OK(builder2.Append("foo")); @@ -795,7 +795,7 @@ TEST_F(TestStringArray, CompareNullByteSlots) { } TEST_F(TestStringArray, TestSliceGetString) { - StringBuilder builder(default_memory_pool()); + StringBuilder builder; ASSERT_OK(builder.Append("a")); ASSERT_OK(builder.Append("b")); @@ -958,7 +958,7 @@ TEST_F(TestBinaryArray, TestGetValue) { } TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { - BinaryBuilder builder(default_memory_pool(), arrow::binary()); + BinaryBuilder builder; string empty_string(""); for (int i = 0; i < 5; ++i) { @@ -1045,7 +1045,7 @@ void CheckSliceEquality() { using Traits = TypeTraits; using BuilderType = typename Traits::BuilderType; - BuilderType builder(default_memory_pool()); + BuilderType builder; vector strings = {"foo", "", "bar", "baz", "qux", ""}; vector is_null = {0, 1, 0, 1, 0, 0}; @@ -1102,7 +1102,7 @@ class TestFWBinaryArray : public ::testing::Test { void InitBuilder(int byte_width) { auto type = fixed_size_binary(byte_width); - builder_.reset(new FixedSizeBinaryBuilder(default_memory_pool(), type)); + builder_.reset(new FixedSizeBinaryBuilder(type, default_memory_pool())); } protected: @@ -1184,8 +1184,8 @@ TEST_F(TestFWBinaryArray, EqualsRangeEquals) { // Check that we don't compare data in null slots auto type = fixed_size_binary(4); - FixedSizeBinaryBuilder builder1(default_memory_pool(), type); - FixedSizeBinaryBuilder builder2(default_memory_pool(), type); + FixedSizeBinaryBuilder builder1(type); + FixedSizeBinaryBuilder builder2(type); ASSERT_OK(builder1.Append("foo1")); ASSERT_OK(builder1.AppendNull()); @@ -1209,7 +1209,7 @@ TEST_F(TestFWBinaryArray, EqualsRangeEquals) { TEST_F(TestFWBinaryArray, ZeroSize) { auto type = fixed_size_binary(0); - FixedSizeBinaryBuilder builder(default_memory_pool(), type); + FixedSizeBinaryBuilder builder(type); ASSERT_OK(builder.Append(nullptr)); ASSERT_OK(builder.Append(nullptr)); 
@@ -1233,7 +1233,7 @@ TEST_F(TestFWBinaryArray, ZeroSize) {
 
 TEST_F(TestFWBinaryArray, Slice) {
   auto type = fixed_size_binary(4);
-  FixedSizeBinaryBuilder builder(default_memory_pool(), type);
+  FixedSizeBinaryBuilder builder(type);
 
   vector<string> strings = {"foo1", "foo2", "foo3", "foo4", "foo5"};
   vector<uint8_t> is_null = {0, 1, 0, 0, 0};
@@ -1519,14 +1519,14 @@ TYPED_TEST(TestDictionaryBuilder, Basic) {
   ASSERT_OK(builder.Finish(&result));
 
   // Build expected data
-  NumericBuilder<TypeParam> dict_builder(default_memory_pool());
+  NumericBuilder<TypeParam> dict_builder;
   ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
   ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
   std::shared_ptr<Array> dict_array;
   ASSERT_OK(dict_builder.Finish(&dict_array));
   auto dtype = std::make_shared<DictionaryType>(int8(), dict_array);
 
-  Int8Builder int_builder(default_memory_pool());
+  Int8Builder int_builder;
   ASSERT_OK(int_builder.Append(0));
   ASSERT_OK(int_builder.Append(1));
   ASSERT_OK(int_builder.Append(0));
@@ -1538,8 +1538,8 @@
 }
 
 TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
-  NumericBuilder<TypeParam> builder(default_memory_pool());
-  // DictionaryBuilder<TypeParam> builder(default_memory_pool());
+  NumericBuilder<TypeParam> builder;
+  // DictionaryBuilder<TypeParam> builder;
   ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
   ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
   ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
@@ -1552,14 +1552,14 @@
   ASSERT_OK(dictionary_builder.Finish(&result));
 
   // Build expected data
-  NumericBuilder<TypeParam> dict_builder(default_memory_pool());
+  NumericBuilder<TypeParam> dict_builder;
   ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
   ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
   std::shared_ptr<Array> dict_array;
   ASSERT_OK(dict_builder.Finish(&dict_array));
   auto dtype = std::make_shared<DictionaryType>(int8(), dict_array);
 
-  Int8Builder int_builder(default_memory_pool());
+  Int8Builder int_builder;
   ASSERT_OK(int_builder.Append(0));
   ASSERT_OK(int_builder.Append(1));
   ASSERT_OK(int_builder.Append(0));
@@ -1577,8 +1577,8 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
   // Build the dictionary Array
   DictionaryBuilder<TypeParam> builder(default_memory_pool());
   // Build expected data
-  NumericBuilder<TypeParam> dict_builder(default_memory_pool());
-  Int16Builder int_builder(default_memory_pool());
+  NumericBuilder<TypeParam> dict_builder;
+  Int16Builder int_builder;
 
   // Fill with 1024 different values
   for (int64_t i = 0; i < 1024; i++) {
@@ -1619,14 +1619,14 @@ TEST(TestStringDictionaryBuilder, Basic) {
   ASSERT_OK(builder.Finish(&result));
 
   // Build expected data
-  StringBuilder str_builder(default_memory_pool());
+  StringBuilder str_builder;
   ASSERT_OK(str_builder.Append("test"));
   ASSERT_OK(str_builder.Append("test2"));
   std::shared_ptr<Array> str_array;
   ASSERT_OK(str_builder.Finish(&str_array));
   auto dtype = std::make_shared<DictionaryType>(int8(), str_array);
 
-  Int8Builder int_builder(default_memory_pool());
+  Int8Builder int_builder;
   ASSERT_OK(int_builder.Append(0));
   ASSERT_OK(int_builder.Append(1));
   ASSERT_OK(int_builder.Append(0));
@@ -1641,8 +1641,8 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) {
   // Build the dictionary Array
   StringDictionaryBuilder builder(default_memory_pool());
   // Build expected data
-  StringBuilder str_builder(default_memory_pool());
-  Int16Builder int_builder(default_memory_pool());
+  StringBuilder str_builder;
+  Int16Builder int_builder;
 
   // Fill with 1024 different values
   for (int64_t i = 0; i < 1024; i++) {
@@ -1881,15 +1881,18 @@ TEST(TestDictionary, Basics) {
   std::shared_ptr<DictionaryType> type1 =
       std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict));
-  DictionaryType type2(int16(), dict);
+
+  auto type2 =
+      std::dynamic_pointer_cast<DictionaryType>(::arrow::dictionary(int16(), dict, true));
 
   ASSERT_TRUE(int16()->Equals(type1->index_type()));
   ASSERT_TRUE(type1->dictionary()->Equals(dict));
 
-  ASSERT_TRUE(int16()->Equals(type2.index_type()));
-  ASSERT_TRUE(type2.dictionary()->Equals(dict));
+  ASSERT_TRUE(int16()->Equals(type2->index_type()));
+  ASSERT_TRUE(type2->dictionary()->Equals(dict));
 
-  ASSERT_EQ("dictionary<values=int32, indices=int16>", type1->ToString());
+  ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=0>", type1->ToString());
+  ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=1>", type2->ToString());
 }
 
 TEST(TestDictionary, Equals) {
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index ab0be7a0964c6..637eb2417fcfd 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -159,6 +159,11 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t le
       std::make_shared<ArrayData>(type, length, std::move(buffers), null_count, offset));
 }
 
+const uint8_t* PrimitiveArray::raw_values() const {
+  return raw_values_ +
+         offset() * static_cast<const FixedWidthType&>(*type()).bit_width() / 8;
+}
+
 template <typename T>
 NumericArray<T>::NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index a853f2bb5f93d..777fbe0b006b3 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -292,8 +292,8 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray {
   /// Does not account for any slice offset
   std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }
 
-  /// Does not account for any slice offset
-  const uint8_t* raw_values() const { return raw_values_; }
+  /// \brief Return pointer to start of raw data
+  const uint8_t* raw_values() const;
 
  protected:
   PrimitiveArray() {}
@@ -521,7 +521,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 
   int32_t byte_width() const { return byte_width_; }
 
-  const uint8_t* raw_values() const { return raw_values_; }
+  const uint8_t* raw_values() const { return raw_values_ + byte_width_ * data_->offset; }
 
   std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
@@ -567,7 +567,9 @@ class ARROW_EXPORT DecimalArray : public FlatArray {
   int32_t byte_width() const { return static_cast<const DecimalType&>(*type()).byte_width(); }
-  const uint8_t* raw_values() const { return raw_values_; }
+
+  /// \brief Return pointer to value data, accounting for any offset
+  const uint8_t* raw_values() const { return raw_values_ + byte_width() * data_->offset; }
 
  private:
   void SetData(const std::shared_ptr<ArrayData>& data);
diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc
index 13d7b20591dad..7ac7fe3bed533 100644
--- a/cpp/src/arrow/builder-benchmark.cc
+++ b/cpp/src/arrow/builder-benchmark.cc
@@ -30,7 +30,7 @@ static void BM_BuildPrimitiveArrayNoNulls(
   // 2 MiB block
   std::vector<int64_t> data(256 * 1024, 100);
   while (state.KeepRunning()) {
-    Int64Builder builder(default_memory_pool());
+    Int64Builder builder;
     for (int i = 0; i < kFinalSize; i++) {
       // Build up an array of 512 MiB in size
       ABORT_NOT_OK(builder.Append(data.data(), data.size(), nullptr));
@@ -66,7 +66,7 @@ static void BM_BuildAdaptiveIntNoNulls(
     data.push_back(i);
   }
   while (state.KeepRunning()) {
-    AdaptiveIntBuilder builder(default_memory_pool());
+    AdaptiveIntBuilder builder;
     for (int64_t i = 0; i < size; i += chunk_size) {
       // Build up an array of 512 MiB in size
       ABORT_NOT_OK(builder.Append(data.data() + i, chunk_size, nullptr));
@@ -85,7 +85,7 @@ static void BM_BuildAdaptiveIntNoNullsScalarAppend(
     data.push_back(i);
   }
   while (state.KeepRunning()) {
-    AdaptiveIntBuilder builder(default_memory_pool());
+ AdaptiveIntBuilder builder; for (int64_t i = 0; i < size; i++) { ABORT_NOT_OK(builder.Append(data[i])); } @@ -104,7 +104,7 @@ static void BM_BuildAdaptiveUIntNoNulls( data.push_back(i); } while (state.KeepRunning()) { - AdaptiveUIntBuilder builder(default_memory_pool()); + AdaptiveUIntBuilder builder; for (int64_t i = 0; i < size; i += chunk_size) { // Build up an array of 512 MiB in size ABORT_NOT_OK(builder.Append(data.data() + i, chunk_size, nullptr)); @@ -161,7 +161,7 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const std::string value = "1234567890"; while (state.KeepRunning()) { - BinaryBuilder builder(default_memory_pool()); + BinaryBuilder builder; for (int64_t i = 0; i < iterations; i++) { ABORT_NOT_OK(builder.Append(value)); } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 391204f566954..e2054dbfde688 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -27,6 +27,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" @@ -177,6 +178,17 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { length_ = new_length; } +// ---------------------------------------------------------------------- +// Null builder + +Status NullBuilder::Finish(std::shared_ptr* out) { + *out = std::make_shared(length_); + length_ = null_count_ = 0; + return Status::OK(); +} + +// ---------------------------------------------------------------------- + template Status PrimitiveBuilder::Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); @@ -262,7 +274,7 @@ template class PrimitiveBuilder; template class PrimitiveBuilder; AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool) - : ArrayBuilder(pool, int64()), data_(nullptr), raw_data_(nullptr), int_size_(1) {} + : ArrayBuilder(int64(), pool), data_(nullptr), raw_data_(nullptr), int_size_(1) {} Status AdaptiveIntBuilderBase::Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); @@ -612,13 +624,18 @@ Status AdaptiveUIntBuilder::ExpandIntSize(uint8_t new_int_size) { } BooleanBuilder::BooleanBuilder(MemoryPool* pool) - : ArrayBuilder(pool, boolean()), data_(nullptr), raw_data_(nullptr) {} + : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} -BooleanBuilder::BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type) +BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) : BooleanBuilder(pool) { DCHECK_EQ(Type::BOOL, type->id()); } +#ifndef ARROW_NO_DEPRECATED_API +BooleanBuilder::BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type) + : BooleanBuilder(type, pool) {} +#endif + Status BooleanBuilder::Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); data_ = std::make_shared(pool_); @@ -693,18 +710,25 @@ Status BooleanBuilder::Append(const uint8_t* values, int64_t length, // DictionaryBuilder template -DictionaryBuilder::DictionaryBuilder(MemoryPool* pool, - const std::shared_ptr& type) - : ArrayBuilder(pool, type), +DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : ArrayBuilder(type, pool), hash_table_(new PoolBuffer(pool)), hash_slots_(nullptr), - dict_builder_(pool, type), + dict_builder_(type, pool), values_builder_(pool) { if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); } } +#ifndef ARROW_NO_DEPRECATED_API +template +DictionaryBuilder::DictionaryBuilder(MemoryPool* pool, + 
const std::shared_ptr& type) + : DictionaryBuilder(type, pool) {} +#endif + template Status DictionaryBuilder::Init(int64_t elements) { RETURN_NOT_OK(ArrayBuilder::Init(elements)); @@ -931,11 +955,17 @@ template class DictionaryBuilder; // ---------------------------------------------------------------------- // DecimalBuilder -DecimalBuilder::DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type) - : FixedSizeBinaryBuilder(pool, type), + +DecimalBuilder::DecimalBuilder(const std::shared_ptr& type, MemoryPool* pool) + : FixedSizeBinaryBuilder(type, pool), sign_bitmap_(nullptr), sign_bitmap_data_(nullptr) {} +#ifndef ARROW_NO_DEPRECATED_API +DecimalBuilder::DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type) + : DecimalBuilder(type, pool) {} +#endif + template ARROW_EXPORT Status DecimalBuilder::Append(const decimal::Decimal& val) { DCHECK_EQ(sign_bitmap_, nullptr) << "sign_bitmap_ is not null"; @@ -1014,9 +1044,9 @@ Status DecimalBuilder::Finish(std::shared_ptr* out) { ListBuilder::ListBuilder(MemoryPool* pool, std::unique_ptr value_builder, const std::shared_ptr& type) - : ArrayBuilder(pool, - type ? type : std::static_pointer_cast( - std::make_shared(value_builder->type()))), + : ArrayBuilder(type ? type : std::static_pointer_cast( + std::make_shared(value_builder->type())), + pool), offsets_builder_(pool), value_builder_(std::move(value_builder)) {} @@ -1090,10 +1120,15 @@ ArrayBuilder* ListBuilder::value_builder() const { // ---------------------------------------------------------------------- // String and binary +BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} + +#ifndef ARROW_NO_DEPRECATED_API BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr& type) - : ArrayBuilder(pool, type), offsets_builder_(pool), value_data_builder_(pool) {} + : BinaryBuilder(type, pool) {} +#endif -BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(pool, binary()) {} +BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} Status BinaryBuilder::Init(int64_t elements) { DCHECK_LT(elements, std::numeric_limits::max()); @@ -1173,7 +1208,7 @@ const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { return value_data_builder_.data() + offset; } -StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {} +StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} Status StringBuilder::Finish(std::shared_ptr* out) { std::shared_ptr data; @@ -1186,12 +1221,18 @@ Status StringBuilder::Finish(std::shared_ptr* out) { // ---------------------------------------------------------------------- // Fixed width binary -FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(MemoryPool* pool, - const std::shared_ptr& type) - : ArrayBuilder(pool, type), +FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : ArrayBuilder(type, pool), byte_width_(static_cast(*type).byte_width()), byte_builder_(pool) {} +#ifndef ARROW_NO_DEPRECATED_API +FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(MemoryPool* pool, + const std::shared_ptr& type) + : FixedSizeBinaryBuilder(type, pool) {} +#endif + Status FixedSizeBinaryBuilder::Append(const uint8_t* value) { RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(true); @@ -1236,12 +1277,18 @@ Status FixedSizeBinaryBuilder::Finish(std::shared_ptr* out) { // 
---------------------------------------------------------------------- // Struct -StructBuilder::StructBuilder(MemoryPool* pool, const std::shared_ptr& type, +StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, std::vector>&& field_builders) - : ArrayBuilder(pool, type) { + : ArrayBuilder(type, pool) { field_builders_ = std::move(field_builders); } +#ifndef ARROW_NO_DEPRECATED_API +StructBuilder::StructBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::vector>&& field_builders) + : StructBuilder(type, pool, std::move(field_builders)) {} +#endif + Status StructBuilder::Finish(std::shared_ptr* out) { std::vector> fields(field_builders_.size()); for (size_t i = 0; i < field_builders_.size(); ++i) { @@ -1261,7 +1308,7 @@ Status StructBuilder::Finish(std::shared_ptr* out) { #define BUILDER_CASE(ENUM, BuilderType) \ case Type::ENUM: \ - out->reset(new BuilderType(pool, type)); \ + out->reset(new BuilderType(type, pool)); \ return Status::OK(); // Initially looked at doing this with vtables, but shared pointers makes it @@ -1271,26 +1318,30 @@ Status StructBuilder::Finish(std::shared_ptr* out) { Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out) { switch (type->id()) { - BUILDER_CASE(UINT8, UInt8Builder); - BUILDER_CASE(INT8, Int8Builder); - BUILDER_CASE(UINT16, UInt16Builder); - BUILDER_CASE(INT16, Int16Builder); - BUILDER_CASE(UINT32, UInt32Builder); - BUILDER_CASE(INT32, Int32Builder); - BUILDER_CASE(UINT64, UInt64Builder); - BUILDER_CASE(INT64, Int64Builder); - BUILDER_CASE(DATE32, Date32Builder); - BUILDER_CASE(DATE64, Date64Builder); - BUILDER_CASE(TIME32, Time32Builder); - BUILDER_CASE(TIME64, Time64Builder); - BUILDER_CASE(TIMESTAMP, TimestampBuilder); - BUILDER_CASE(BOOL, BooleanBuilder); - BUILDER_CASE(FLOAT, FloatBuilder); - BUILDER_CASE(DOUBLE, DoubleBuilder); - BUILDER_CASE(STRING, StringBuilder); - BUILDER_CASE(BINARY, BinaryBuilder); - BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); - BUILDER_CASE(DECIMAL, DecimalBuilder); + case Type::NA: { + out->reset(new NullBuilder(pool)); + return Status::OK(); + } + BUILDER_CASE(UINT8, UInt8Builder); + BUILDER_CASE(INT8, Int8Builder); + BUILDER_CASE(UINT16, UInt16Builder); + BUILDER_CASE(INT16, Int16Builder); + BUILDER_CASE(UINT32, UInt32Builder); + BUILDER_CASE(INT32, Int32Builder); + BUILDER_CASE(UINT64, UInt64Builder); + BUILDER_CASE(INT64, Int64Builder); + BUILDER_CASE(DATE32, Date32Builder); + BUILDER_CASE(DATE64, Date64Builder); + BUILDER_CASE(TIME32, Time32Builder); + BUILDER_CASE(TIME64, Time64Builder); + BUILDER_CASE(TIMESTAMP, TimestampBuilder); + BUILDER_CASE(BOOL, BooleanBuilder); + BUILDER_CASE(FLOAT, FloatBuilder); + BUILDER_CASE(DOUBLE, DoubleBuilder); + BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); + BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); + BUILDER_CASE(DECIMAL, DecimalBuilder); case Type::LIST: { std::unique_ptr value_builder; std::shared_ptr value_type = @@ -1309,18 +1360,21 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); values_builder.emplace_back(std::move(builder)); } - out->reset(new StructBuilder(pool, type, std::move(values_builder))); + out->reset(new StructBuilder(type, pool, std::move(values_builder))); return Status::OK(); } - default: - return Status::NotImplemented(type->ToString()); + default: { + std::stringstream ss; + ss << "MakeBuilder: cannot construct builder for type " << type->ToString(); + return 
Status::NotImplemented(ss.str()); + } } } #define DICTIONARY_BUILDER_CASE(ENUM, BuilderType) \ case Type::ENUM: \ - out->reset(new BuilderType(pool, type)); \ + out->reset(new BuilderType(type, pool)); \ return Status::OK(); Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, @@ -1343,8 +1397,84 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder); DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder); DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder); - // DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); - // DICTIONARY_BUILDER_CASE(DECIMAL, DecimalBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} + +#define DICTIONARY_ARRAY_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + builder = std::make_shared(type, pool); \ + RETURN_NOT_OK(static_cast(*builder).AppendArray(input)); \ + RETURN_NOT_OK(builder->Finish(out)); \ + return Status::OK(); + +Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool, + std::shared_ptr* out) { + const std::shared_ptr& type = input.data()->type; + std::shared_ptr builder; + switch (type->id()) { + DICTIONARY_ARRAY_CASE(UINT8, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT8, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(UINT16, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT16, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(UINT32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(UINT64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(DATE32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(DATE64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(TIME32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(TIME64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(TIMESTAMP, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(FLOAT, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(DOUBLE, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder); + DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} +#define DICTIONARY_COLUMN_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + builder = std::make_shared(type, pool); \ + chunks = input.data(); \ + for (auto chunk : chunks->chunks()) { \ + RETURN_NOT_OK(static_cast(*builder).AppendArray(*chunk)); \ + } \ + RETURN_NOT_OK(builder->Finish(&arr)); \ + *out = std::make_shared(input.name(), arr); \ + return Status::OK(); + +/// \brief Encodes a column to a suitable dictionary type +/// \param input Column to be encoded +/// \param pool MemoryPool to allocate the dictionary +/// \param out The new column +/// \return Status +Status EncodeColumnToDictionary(const Column& input, MemoryPool* pool, + std::shared_ptr* out) { + const std::shared_ptr& type = input.type(); + std::shared_ptr builder; + std::shared_ptr arr; + std::shared_ptr chunks; + switch (type->id()) { + DICTIONARY_COLUMN_CASE(UINT8, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT8, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(UINT16, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT16, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(UINT32, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT32, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(UINT64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(DATE32, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(DATE64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(TIME32, 
DictionaryBuilder); + DICTIONARY_COLUMN_CASE(TIME64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(TIMESTAMP, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(FLOAT, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(DOUBLE, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(STRING, StringDictionaryBuilder); + DICTIONARY_COLUMN_CASE(BINARY, BinaryDictionaryBuilder); default: return Status::NotImplemented(type->ToString()); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 009fd7ae47d19..46900fc7129c1 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -28,6 +28,7 @@ #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" @@ -53,6 +54,12 @@ struct Decimal; static constexpr int64_t kMinBuilderCapacity = 1 << 5; +#ifdef ARROW_NO_DEFAULT_MEMORY_POOL +#define ARROW_MEMORY_POOL_ARG pool +#else +#define ARROW_MEMORY_POOL_ARG pool = default_memory_pool() +#endif + /// Base class for all data array builders. // /// This class provides a facilities for incrementally building the null bitmap @@ -60,9 +67,9 @@ static constexpr int64_t kMinBuilderCapacity = 1 << 5; /// the null count. class ARROW_EXPORT ArrayBuilder { public: - explicit ArrayBuilder(MemoryPool* pool, const std::shared_ptr& type) - : pool_(pool), - type_(type), + explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) + : type_(type), + pool_(pool), null_bitmap_(nullptr), null_count_(0), null_bitmap_data_(nullptr), @@ -117,9 +124,8 @@ class ARROW_EXPORT ArrayBuilder { std::shared_ptr type() const { return type_; } protected: - MemoryPool* pool_; - std::shared_ptr type_; + MemoryPool* pool_; // When null_bitmap are first appended to the builder, the null bitmap is allocated std::shared_ptr null_bitmap_; @@ -157,13 +163,31 @@ class ARROW_EXPORT ArrayBuilder { DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG) : ArrayBuilder(null(), pool) {} + + Status AppendNull() { + ++null_count_; + ++length_; + return Status::OK(); + } + + Status Finish(std::shared_ptr* out) override; +}; + template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: using value_type = typename Type::c_type; + explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), data_(nullptr), raw_data_(nullptr) {} + +#ifndef ARROW_NO_DEPRECATED_API explicit PrimitiveBuilder(MemoryPool* pool, const std::shared_ptr& type) - : ArrayBuilder(pool, type), data_(nullptr), raw_data_(nullptr) {} + : PrimitiveBuilder(type, pool) {} +#endif using ArrayBuilder::Advance; @@ -210,8 +234,9 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { template explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : PrimitiveBuilder(pool, TypeTraits::type_singleton()) {} + typename std::enable_if::is_parameter_free, MemoryPool*>::type + ARROW_MEMORY_POOL_ARG) + : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} using PrimitiveBuilder::Append; using PrimitiveBuilder::Init; @@ -341,7 +366,7 @@ inline uint8_t ExpandedUIntSize(uint64_t val, uint8_t current_int_size) { class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { public: - explicit AdaptiveUIntBuilder(MemoryPool* pool); + explicit AdaptiveUIntBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); using ArrayBuilder::Advance; 
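With `ARROW_MEMORY_POOL_ARG` in place, every builder constructor above defaults its `MemoryPool*` parameter to `default_memory_pool()` unless `ARROW_NO_DEFAULT_MEMORY_POOL` is defined, which is what lets the updated tests and benchmarks in this patch write `Int64Builder builder;` with no pool argument. A minimal sketch of such a call site (illustrative only; `BuildExample` is a made-up name):

```cpp
#include <memory>

#include <arrow/array.h>
#include <arrow/builder.h>
#include <arrow/status.h>

// Build a small Int64 array without naming a MemoryPool;
// the builder falls back to arrow::default_memory_pool().
arrow::Status BuildExample(std::shared_ptr<arrow::Array>* out) {
  arrow::Int64Builder builder;
  arrow::Status st = builder.Append(1);
  if (st.ok()) st = builder.Append(2);
  if (st.ok()) st = builder.AppendNull();
  if (st.ok()) st = builder.Finish(out);
  return st;
}
```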
@@ -400,7 +425,7 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { public: - explicit AdaptiveIntBuilder(MemoryPool* pool); + explicit AdaptiveIntBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); using ArrayBuilder::Advance; @@ -459,8 +484,14 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: - explicit BooleanBuilder(MemoryPool* pool); + explicit BooleanBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); + + explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 explicit BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif using ArrayBuilder::Advance; @@ -574,8 +605,14 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { /// \brief Builder class for variable-length binary data class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { public: - explicit BinaryBuilder(MemoryPool* pool); - explicit BinaryBuilder(MemoryPool* pool, const std::shared_ptr& type); + explicit BinaryBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 + BinaryBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif + + BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); Status Append(const uint8_t* value, int32_t length); @@ -617,7 +654,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { class ARROW_EXPORT StringBuilder : public BinaryBuilder { public: using BinaryBuilder::BinaryBuilder; - explicit StringBuilder(MemoryPool* pool); + explicit StringBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); using BinaryBuilder::Append; @@ -631,7 +668,13 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { public: +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 FixedSizeBinaryBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif + + FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* ARROW_MEMORY_POOL_ARG); Status Append(const uint8_t* value); Status Append(const uint8_t* data, int64_t length, @@ -653,7 +696,13 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { public: + explicit DecimalBuilder(const std::shared_ptr& type, + MemoryPool* ARROW_MEMORY_POOL_ARG); + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 explicit DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif template ARROW_EXPORT Status Append(const decimal::Decimal& val); @@ -679,8 +728,14 @@ class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { /// called to maintain data-structure consistency. 
class ARROW_EXPORT StructBuilder : public ArrayBuilder { public: +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 StructBuilder(MemoryPool* pool, const std::shared_ptr& type, std::vector>&& field_builders); +#endif + + StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector>&& field_builders); Status Finish(std::shared_ptr* out) override; @@ -759,12 +814,20 @@ template class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { public: using Scalar = typename internal::DictionaryScalar::type; + + ~DictionaryBuilder() {} + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 explicit DictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif + + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); template explicit DictionaryBuilder( typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder(pool, TypeTraits::type_singleton()) {} + : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} /// \brief Append a scalar value Status Append(const Scalar& value); @@ -851,6 +914,21 @@ Status ARROW_EXPORT MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); +/// \brief Convert Array to encoded DictionaryArray form +/// +/// \param[in] input The Array to be encoded +/// \param[in] pool MemoryPool to allocate memory for the hash table +/// \param[out] out Array encoded to DictionaryArray +Status ARROW_EXPORT EncodeArrayToDictionary(const Array& input, MemoryPool* pool, + std::shared_ptr* out); + +/// \brief Convert a Column's data internally to DictionaryArray +/// +/// \param[in] input The ChunkedArray to be encoded +/// \param[in] pool MemoryPool to allocate memory for the hash table +/// \param[out] out Column with data converted to DictionaryArray +Status ARROW_EXPORT EncodeColumnToDictionary(const Column& input, MemoryPool* pool, + std::shared_ptr* out); } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index dda5fdd95d0c3..c01f190351044 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -231,11 +231,11 @@ class RangeEqualsVisitor { const uint8_t* right_data = nullptr; if (left.values()) { - left_data = left.raw_values() + left.offset() * width; + left_data = left.raw_values(); } if (right.values()) { - right_data = right.raw_values() + right.offset() * width; + right_data = right.raw_values(); } for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; @@ -265,11 +265,11 @@ class RangeEqualsVisitor { const uint8_t* right_data = nullptr; if (left.values()) { - left_data = left.raw_values() + left.offset() * width; + left_data = left.raw_values(); } if (right.values()) { - right_data = right.raw_values() + right.offset() * width; + right_data = right.raw_values(); } for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; @@ -352,10 +352,10 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r const uint8_t* right_data = nullptr; if (left.values()) { - left_data = left.values()->data() + left.offset() * byte_width; + left_data = left.raw_values(); } if (right.values()) { - right_data = right.values()->data() + right.offset() * byte_width; + right_data = right.raw_values(); } if (left.null_count() > 0) { @@ -399,10 +399,10 @@ static bool IsEqualDecimal(const DecimalArray& left, const DecimalArray& right) const uint8_t* right_data = nullptr; if (left.values()) { - left_data = left.values()->data(); + 
left_data = left.raw_values();
   }
 
   if (right.values()) {
-    right_data = right.values()->data();
+    right_data = right.raw_values();
   }
 
   const int32_t byte_width = left.byte_width();
@@ -769,7 +769,8 @@ class TypeEqualsVisitor {
   Status Visit(const DictionaryType& left) {
     const auto& right = static_cast<const DictionaryType&>(right_);
     result_ = left.index_type()->Equals(right.index_type()) &&
-              left.dictionary()->Equals(right.dictionary());
+              left.dictionary()->Equals(right.dictionary()) &&
+              (left.ordered() == right.ordered());
     return Status::OK();
   }
diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc
index 82e3ba8109c23..57d30f73baa29 100644
--- a/cpp/src/arrow/io/file.cc
+++ b/cpp/src/arrow/io/file.cc
@@ -118,33 +118,64 @@ namespace io {
 
 // ----------------------------------------------------------------------
 // Cross-platform file compatability layer
+
 #if defined(_MSC_VER)
+
 constexpr const char* kRangeExceptionError =
     "Range exception during wide-char string conversion";
+
+struct PlatformFilename {
+  static Status Init(const std::string& utf8_path, PlatformFilename* out) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf16_converter;
+
+    if (!utf8_path.empty()) {
+      try {
+        out->utf16_path = utf16_converter.from_bytes(utf8_path);
+      } catch (const std::range_error&) {
+        return Status::Invalid(kRangeExceptionError);
+      }
+    } else {
+      out->utf16_path = std::wstring();
+    }
+    out->utf8_path = utf8_path;
+    return Status::OK();
+  }
+
+  const char* data() const { return reinterpret_cast<const char*>(utf16_path.c_str()); }
+
+  const char* utf8_data() const { return utf8_path.c_str(); }
+
+  size_t length() const { return utf16_path.size(); }
+
+  std::string utf8_path;
+  std::wstring utf16_path;
+};
+
+#else
+
+struct PlatformFilename {
+  static Status Init(const std::string& utf8_path, PlatformFilename* out) {
+    out->utf8_path = utf8_path;
+    return Status::OK();
+  }
+
+  const char* data() const { return utf8_path.c_str(); }
+
+  const char* utf8_data() const { return data(); }
+
+  size_t length() const { return utf8_path.size(); }
+
+  std::string utf8_path;
+};
+
 #endif
 
-static inline Status CheckOpenResult(int ret, int errno_actual, const char* filename,
-                                     size_t filename_length) {
+static inline Status CheckOpenResult(int ret, int errno_actual,
+                                     const PlatformFilename& filename) {
   if (ret == -1) {
     // TODO: errno codes to strings
     std::stringstream ss;
-    ss << "Failed to open file: ";
-#if defined(_MSC_VER)
-    // using wchar_t
-
-    // this requires c++11
-    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
-    std::wstring wide_string(reinterpret_cast<const wchar_t*>(filename),
-                             filename_length / sizeof(wchar_t));
-    try {
-      std::string byte_string = converter.to_bytes(wide_string);
-      ss << byte_string;
-    } catch (const std::range_error&) {
-      ss << kRangeExceptionError;
-    }
-#else
-    ss << filename;
-#endif
+    ss << "Failed to open local file: " << filename.utf8_data();
     return Status::IOError(ss.str());
   }
   return Status::OK();
@@ -161,54 +192,27 @@ static inline int64_t lseek64_compat(int fd, int64_t pos, int whence) {
 #endif
 }
 
-#if defined(_MSC_VER)
-static inline Status ConvertToUtf16(const std::string& input, std::wstring* result) {
-  if (result == nullptr) {
-    return Status::Invalid("Pointer to result is not valid");
-  }
-
-  if (input.empty()) {
-    *result = std::wstring();
-    return Status::OK();
-  }
-
-  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf16_converter;
-  try {
-    *result = utf16_converter.from_bytes(input);
-  } catch (const std::range_error&) {
-    return Status::Invalid(kRangeExceptionError);
-  }
-  return Status::OK();
-}
-#endif
-
-static inline Status FileOpenReadable(const std::string& filename, int* fd) {
+static inline Status FileOpenReadable(const PlatformFilename& filename, int* fd) {
   int ret;
   errno_t errno_actual = 0;
 #if defined(_MSC_VER)
-  std::wstring wide_filename;
-  RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename));
-
-  errno_actual =
-      _wsopen_s(fd, wide_filename.c_str(), _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD);
+  errno_actual = _wsopen_s(fd, reinterpret_cast<const wchar_t*>(filename.data()),
+                           _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD);
   ret = *fd;
 #else
-  ret = *fd = open(filename.c_str(), O_RDONLY | O_BINARY);
+  ret = *fd = open(filename.data(), O_RDONLY | O_BINARY);
   errno_actual = errno;
 #endif
 
-  return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size());
+  return CheckOpenResult(ret, errno_actual, filename);
 }
 
-static inline Status FileOpenWriteable(const std::string& filename, bool write_only,
+static inline Status FileOpenWriteable(const PlatformFilename& filename, bool write_only,
                                        bool truncate, int* fd) {
   int ret;
   errno_t errno_actual = 0;
 
 #if defined(_MSC_VER)
-  std::wstring wide_filename;
-  RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename));
-
   int oflag = _O_CREAT | _O_BINARY;
   int pmode = _S_IWRITE;
   if (!write_only) {
@@ -225,7 +229,8 @@ static inline Status FileOpenWriteable(const std::string& filename, bool write_o
     oflag |= _O_RDWR;
   }
 
-  errno_actual = _wsopen_s(fd, wide_filename.c_str(), oflag, _SH_DENYNO, pmode);
+  errno_actual = _wsopen_s(fd, reinterpret_cast<const wchar_t*>(filename.data()), oflag,
+                           _SH_DENYNO, pmode);
   ret = *fd;
 
 #else
@@ -241,9 +246,9 @@ static inline Status FileOpenWriteable(const std::string& filename, bool write_o
     oflag |= O_RDWR;
   }
 
-  ret = *fd = open(filename.c_str(), oflag, ARROW_WRITE_SHMODE);
+  ret = *fd = open(filename.data(), oflag, ARROW_WRITE_SHMODE);
 #endif
-  return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size());
+  return CheckOpenResult(ret, errno_actual, filename);
 }
 
 static inline Status FileTell(int fd, int64_t* pos) {
@@ -352,8 +357,9 @@ class OSFile {
   ~OSFile() {}
 
   Status OpenWriteable(const std::string& path, bool append, bool write_only) {
-    RETURN_NOT_OK(FileOpenWriteable(path, write_only, !append, &fd_));
-    path_ = path;
+    RETURN_NOT_OK(PlatformFilename::Init(path, &path_));
+
+    RETURN_NOT_OK(FileOpenWriteable(path_, write_only, !append, &fd_));
    is_open_ = true;
    mode_ = write_only ?
FileMode::WRITE : FileMode::READWRITE; @@ -366,10 +372,11 @@ class OSFile { } Status OpenReadable(const std::string& path) { - RETURN_NOT_OK(FileOpenReadable(path, &fd_)); + RETURN_NOT_OK(PlatformFilename::Init(path, &path_)); + + RETURN_NOT_OK(FileOpenReadable(path_, &fd_)); RETURN_NOT_OK(FileGetSize(fd_, &size_)); - path_ = path; is_open_ = true; mode_ = FileMode::READ; return Status::OK(); @@ -408,14 +415,13 @@ class OSFile { int fd() const { return fd_; } bool is_open() const { return is_open_; } - const std::string& path() const { return path_; } int64_t size() const { return size_; } FileMode::type mode() const { return mode_; } protected: - std::string path_; + PlatformFilename path_; std::mutex lock_; diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index ba740f1e8f4a9..2a0e89ca325fa 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -40,10 +40,18 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { public: ~FileOutputStream(); - // When opening a new file, any existing file with the indicated path is - // truncated to 0 bytes, deleting any existing memory + /// \brief Open a local file for writing, truncating any existing file + /// \param[in] path with UTF8 encoding + /// \param[out] file a FileOutputStream instance + /// + /// When opening a new file, any existing file with the indicated path is + /// truncated to 0 bytes, deleting any existing memory static Status Open(const std::string& path, std::shared_ptr* file); + /// \brief Open a local file for writing + /// \param[in] path with UTF8 encoding + /// \param[in] append append to existing file, otherwise truncate to 0 bytes + /// \param[out] file a FileOutputStream instance static Status Open(const std::string& path, bool append, std::shared_ptr* file); @@ -68,10 +76,17 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile { public: ~ReadableFile(); - // Open file, allocate memory (if needed) from default memory pool + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[out] file ReadableFile instance + /// Open file, allocate memory (if needed) from default memory pool static Status Open(const std::string& path, std::shared_ptr* file); - // Open file with one's own memory pool for memory allocations + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[in] pool a MemoryPool for memory allocations + /// \param[out] file ReadableFile instance + /// Open file with one's own memory pool for memory allocations static Status Open(const std::string& path, MemoryPool* memory_pool, std::shared_ptr* file); diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc index 36c35700d6496..630356fa2af38 100644 --- a/cpp/src/arrow/io/io-file-test.cc +++ b/cpp/src/arrow/io/io-file-test.cc @@ -45,7 +45,7 @@ static bool FileExists(const std::string& path) { void InvalidParamHandler(const wchar_t* expr, const wchar_t* func, const wchar_t* source_file, unsigned int source_line, uintptr_t reserved) { - wprintf(L"Invalid parameter in funcion %s. Source: %s line %d expression %s", func, + wprintf(L"Invalid parameter in function %s. 
Source: %s line %d expression %s", func, source_file, source_line, expr); } #endif @@ -320,7 +320,12 @@ TEST_F(TestReadableFile, ReadAt) { } TEST_F(TestReadableFile, NonExistentFile) { - ASSERT_RAISES(IOError, ReadableFile::Open("0xDEADBEEF.txt", &file_)); + std::string path = "0xDEADBEEF.txt"; + Status s = ReadableFile::Open(path, &file_); + ASSERT_TRUE(s.IsIOError()); + + std::string message = s.message(); + ASSERT_NE(std::string::npos, message.find(path)); } class MyMemoryPool : public MemoryPool { diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 9cc61bced0619..76e52a0f78b9a 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -90,18 +90,22 @@ install(FILES writer.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/ipc") -if(MSVC) - set(UTIL_LINK_LIBS - arrow_static - ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_SYSTEM_LIBRARY}) +if (ARROW_BUILD_STATIC) + set(ARROW_UTIL_LIB arrow_static) else() + set(ARROW_UTIL_LIB arrow_shared) +endif() + +set(UTIL_LINK_LIBS + ${ARROW_UTIL_LIB} + ${BOOST_FILESYSTEM_LIBRARY} + ${BOOST_SYSTEM_LIBRARY}) + +if(NOT MSVC) set(UTIL_LINK_LIBS - arrow_static + ${UTIL_LINK_LIBS} pthread - ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_SYSTEM_LIBRARY} - dl) + ${CMAKE_DL_LIBS}) endif() if (ARROW_BUILD_UTILITIES) diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index b76b518788b91..e74a60dd48925 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -354,7 +354,7 @@ TEST_F(TestTableWriter, TimeTypes) { auto f1 = field("f1", time32(TimeUnit::MILLI)); auto f2 = field("f2", timestamp(TimeUnit::NANO)); auto f3 = field("f3", timestamp(TimeUnit::SECOND, "US/Los_Angeles")); - std::shared_ptr schema(new Schema({f0, f1, f2, f3})); + auto schema = ::arrow::schema({f0, f1, f2, f3}); std::vector values_vec = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr values; diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 35264fa02c5ba..ddb2e37288e46 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -225,7 +225,7 @@ void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, static const int kBufferSize = 10; static uint8_t buffer[kBufferSize]; static uint32_t seed = 0; - StringBuilder string_builder(default_memory_pool()); + StringBuilder string_builder; for (int i = 0; i < num_rows; ++i) { if (!is_valid[i]) { ASSERT_OK(string_builder.AppendNull()); @@ -247,8 +247,8 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) { auto v2_type = int32(); auto v3_type = utf8(); - std::shared_ptr schema( - new Schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)})); + auto schema = + ::arrow::schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)}); std::unique_ptr writer; ASSERT_OK(JsonWriter::Open(schema, &writer)); diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc index a88120a248d2d..a6da6377c0531 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc @@ -42,8 +42,7 @@ std::shared_ptr MakeRecordBatch(int64_t total_size, int64_t num_fie std::vector values; test::randint(length, 0, 100, &values); - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool, type); + typename TypeTraits::BuilderType builder(type, default_memory_pool()); for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { 
ABORT_NOT_OK(builder.Append(values[i])); diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index a6246c96f2d9a..045296163ea11 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -303,7 +303,7 @@ TEST_P(TestIpcRoundTrip, ZeroLengthArrays) { TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { auto CheckArray = [this](const std::shared_ptr& array) { auto f0 = field("f0", array->type()); - auto schema = std::shared_ptr(new Schema({f0})); + auto schema = ::arrow::schema({f0}); RecordBatch batch(schema, array->length(), {array}); auto sliced_batch = batch.Slice(0, 5); @@ -421,7 +421,7 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { auto f0 = field("f0", type); - *schema = std::shared_ptr(new Schema({f0})); + *schema = ::arrow::schema({f0}); std::vector> arrays = {array}; *batch = std::make_shared(*schema, batch_length, arrays); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 175d75b7d1e97..bc2b0d18e72c7 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -129,8 +129,7 @@ class SchemaWriter { writer_->Key("data"); // Make a dummy record batch. A bit tedious as we have to make a schema - auto schema = std::shared_ptr( - new Schema({arrow::field("dictionary", dictionary->type())})); + auto schema = ::arrow::schema({arrow::field("dictionary", dictionary->type())}); RecordBatch batch(schema, dictionary->length(), {dictionary}); RETURN_NOT_OK(WriteRecordBatch(batch, writer_)); writer_->EndObject(); @@ -977,7 +976,7 @@ class ArrayReader { std::is_base_of::value || std::is_base_of::value, Status>::type Visit(const T& type) { - typename TypeTraits::BuilderType builder(pool_, type_); + typename TypeTraits::BuilderType builder(type_, pool_); const auto& json_data = obj_->FindMember("DATA"); RETURN_NOT_ARRAY("DATA", json_data, *obj_); @@ -1046,7 +1045,7 @@ class ArrayReader { template typename std::enable_if::value, Status>::type Visit(const T& type) { - FixedSizeBinaryBuilder builder(pool_, type_); + FixedSizeBinaryBuilder builder(type_, pool_); const auto& json_data = obj_->FindMember("DATA"); RETURN_NOT_ARRAY("DATA", json_data, *obj_); diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index 20fd280db6de6..faf01a568483a 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -492,7 +492,8 @@ static DictionaryOffset GetDictionaryEncoding(FBB& fbb, const DictionaryType& ty auto index_type_offset = flatbuf::CreateInt(fbb, fw_index_type.bit_width(), true); // TODO(wesm): ordered dictionaries - return flatbuf::CreateDictionaryEncoding(fbb, dictionary_id, index_type_offset); + return flatbuf::CreateDictionaryEncoding(fbb, dictionary_id, index_type_offset, + type.ordered()); } static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, @@ -551,7 +552,7 @@ static Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr index_type; RETURN_NOT_OK(IntFromFlatbuffer(encoding->indexType(), &index_type)); - type = std::make_shared(index_type, dictionary); + type = ::arrow::dictionary(index_type, dictionary, encoding->isOrdered()); } *out = std::make_shared(field->name()->str(), type, field->nullable()); return Status::OK(); @@ -1034,7 +1035,7 @@ Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_mem } } - *out = std::make_shared(fields, metadata); + *out = ::arrow::schema(std::move(fields), metadata); 
return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index cb827372d21c4..c1e79d43cc61d 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -99,7 +99,7 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool std::shared_ptr* out) { std::shared_ptr data; RETURN_NOT_OK(test::MakeRandomInt32PoolBuffer(length, pool, &data)); - Int32Builder builder(pool, int32()); + Int32Builder builder(int32(), pool); if (include_nulls) { std::shared_ptr valid_bytes; RETURN_NOT_OK(test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes)); @@ -179,7 +179,7 @@ Status MakeBooleanBatchSized(const int length, std::shared_ptr* out // Make the schema auto f0 = field("f0", boolean()); auto f1 = field("f1", boolean()); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::shared_ptr a0, a1; RETURN_NOT_OK(MakeRandomBooleanArray(length, true, &a0)); @@ -196,7 +196,7 @@ Status MakeIntBatchSized(int length, std::shared_ptr* out) { // Make the schema auto f0 = field("f0", int32()); auto f1 = field("f1", int32()); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); // Example data std::shared_ptr a0, a1; @@ -237,7 +237,7 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { auto binary_type = binary(); auto f0 = field("f0", string_type); auto f1 = field("f1", binary_type); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::shared_ptr a0, a1; MemoryPool* pool = default_memory_pool(); @@ -259,7 +259,7 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { Status MakeNullRecordBatch(std::shared_ptr* out) { const int64_t length = 500; auto f0 = field("f0", null()); - std::shared_ptr schema(new Schema({f0})); + auto schema = ::arrow::schema({f0}); std::shared_ptr a0 = std::make_shared(length); out->reset(new RecordBatch(schema, length, {a0})); return Status::OK(); @@ -270,7 +270,7 @@ Status MakeListRecordBatch(std::shared_ptr* out) { auto f0 = field("f0", kListInt32); auto f1 = field("f1", kListListInt32); auto f2 = field("f2", int32()); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Example data @@ -293,7 +293,7 @@ Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { auto f0 = field("f0", kListInt32); auto f1 = field("f1", kListListInt32); auto f2 = field("f2", int32()); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Example data MemoryPool* pool = default_memory_pool(); @@ -313,7 +313,7 @@ Status MakeNonNullRecordBatch(std::shared_ptr* out) { auto f0 = field("f0", kListInt32); auto f1 = field("f1", kListListInt32); auto f2 = field("f2", int32()); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Example data MemoryPool* pool = default_memory_pool(); @@ -345,7 +345,7 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { } auto f0 = field("f0", type); - std::shared_ptr schema(new Schema({f0})); + auto schema = ::arrow::schema({f0}); std::vector> arrays = {array}; out->reset(new RecordBatch(schema, batch_length, arrays)); return Status::OK(); @@ -364,7 +364,7 @@ Status MakeStruct(std::shared_ptr* out) { {list_schema->field(0), list_schema->field(1), list_schema->field(2)})); auto f0 = field("non_null_struct", type); auto f1 = field("null_struct", type); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = 
::arrow::schema({f0, f1}); // construct individual nullable/non-nullable struct arrays std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); @@ -397,7 +397,7 @@ Status MakeUnion(std::shared_ptr* out) { auto f1 = field("sparse", sparse_type); auto f2 = field("dense", dense_type); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Create data std::vector> sparse_children(2); @@ -462,7 +462,7 @@ Status MakeDictionary(std::shared_ptr* out) { ArrayFromVector(dict2_values, &dict2); auto f0_type = arrow::dictionary(arrow::int32(), dict1); - auto f1_type = arrow::dictionary(arrow::int8(), dict1); + auto f1_type = arrow::dictionary(arrow::int8(), dict1, true); auto f2_type = arrow::dictionary(arrow::int32(), dict2); std::shared_ptr indices0, indices1, indices2; @@ -520,9 +520,9 @@ Status MakeDictionary(std::shared_ptr* out) { auto a4 = std::make_shared(f4_type, indices4); // construct batch - std::shared_ptr schema(new Schema( + auto schema = ::arrow::schema( {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type), - field("list of encoded string", f3_type), field("encoded list", f4_type)})); + field("list of encoded string", f3_type), field("encoded list", f4_type)}); std::vector> arrays = {a0, a1, a2, a3, a4}; @@ -560,8 +560,8 @@ Status MakeDictionaryFlat(std::shared_ptr* out) { auto a2 = std::make_shared(f2_type, indices2); // construct batch - std::shared_ptr schema(new Schema( - {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type)})); + auto schema = ::arrow::schema( + {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type)}); std::vector> arrays = {a0, a1, a2}; out->reset(new RecordBatch(schema, length, arrays)); @@ -572,7 +572,7 @@ Status MakeDates(std::shared_ptr* out) { std::vector is_valid = {true, true, true, false, true, true, true}; auto f0 = field("f0", date32()); auto f1 = field("f1", date64()); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::vector date32_values = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr date32_array; @@ -594,7 +594,7 @@ Status MakeTimestamps(std::shared_ptr* out) { auto f0 = field("f0", timestamp(TimeUnit::MILLI)); auto f1 = field("f1", timestamp(TimeUnit::NANO, "America/New_York")); auto f2 = field("f2", timestamp(TimeUnit::SECOND)); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); std::vector ts_values = {1489269000000, 1489270000000, 1489271000000, 1489272000000, 1489272000000, 1489273000000}; @@ -615,7 +615,7 @@ Status MakeTimes(std::shared_ptr* out) { auto f1 = field("f1", time64(TimeUnit::NANO)); auto f2 = field("f2", time32(TimeUnit::SECOND)); auto f3 = field("f3", time64(TimeUnit::NANO)); - std::shared_ptr schema(new Schema({f0, f1, f2, f3})); + auto schema = ::arrow::schema({f0, f1, f2, f3}); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; @@ -649,12 +649,12 @@ Status MakeFWBinary(std::shared_ptr* out) { std::vector is_valid = {true, true, true, false}; auto f0 = field("f0", fixed_size_binary(4)); auto f1 = field("f1", fixed_size_binary(0)); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::shared_ptr a1, a2; - FixedSizeBinaryBuilder b1(default_memory_pool(), f0->type()); - FixedSizeBinaryBuilder b2(default_memory_pool(), f1->type()); + FixedSizeBinaryBuilder b1(f0->type()); + FixedSizeBinaryBuilder b2(f1->type()); std::vector 
values1 = {"foo1", "foo2", "foo3", "foo4"}; AppendValues(is_valid, values1, &b1); diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 049f5a58a6841..a687a8fc0e703 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -89,7 +89,7 @@ TEST_F(TestPrettyPrint, FixedSizeBinaryType) { std::shared_ptr array; auto type = fixed_size_binary(3); - FixedSizeBinaryBuilder builder(default_memory_pool(), type); + FixedSizeBinaryBuilder builder(type); ASSERT_OK(builder.Append(values[0])); ASSERT_OK(builder.Append(values[1])); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 86f82fdbd8de5..23bef7bcae65d 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -22,14 +22,11 @@ #include "arrow/python/arrow_to_pandas.h" #include -#include #include #include #include -#include #include #include -#include #include #include @@ -42,6 +39,7 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/parallel.h" #include "arrow/visitor_inline.h" #include "arrow/python/builtin_convert.h" @@ -186,8 +184,8 @@ class PandasBlock { CATEGORICAL }; - PandasBlock(int64_t num_rows, int num_columns) - : num_rows_(num_rows), num_columns_(num_columns) {} + PandasBlock(PandasOptions options, int64_t num_rows, int num_columns) + : num_rows_(num_rows), num_columns_(num_columns), options_(options) {} virtual ~PandasBlock() {} virtual Status Allocate() = 0; @@ -255,6 +253,8 @@ class PandasBlock { OwnedRef block_arr_; uint8_t* block_data_; + PandasOptions options_; + // ndarray OwnedRef placement_arr_; int64_t* placement_data_; @@ -264,43 +264,44 @@ class PandasBlock { }; template -inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) { +inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& data, + double* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); // Upcast to double, set NaN as appropriate - for (int i = 0; i < arr->length(); ++i) { - *out_values++ = prim_arr->IsNull(i) ? NAN : static_cast(in_values[i]); + for (int i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? 
NAN : static_cast(in_values[i]); } } } template -inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) { +inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedArray& data, + T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); - memcpy(out_values, in_values, sizeof(T) * arr->length()); - out_values += arr->length(); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); + memcpy(out_values, in_values, sizeof(T) * arr.length()); + out_values += arr.length(); } } template -inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) { +inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& data, + OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); + for (int64_t i = 0; i < arr.length(); ++i) { *out_values = in_values[i]; } } } -static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) { +static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); @@ -324,7 +325,8 @@ static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_v return Status::OK(); } -static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) { +static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& data, + uint8_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); auto bool_arr = static_cast(arr.get()); @@ -335,7 +337,8 @@ static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) } template -inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) { +inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { @@ -365,7 +368,8 @@ inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) return Status::OK(); } -inline Status ConvertNulls(const ChunkedArray& data, PyObject** out_values) { +inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { std::shared_ptr arr = data.chunk(c); @@ -380,7 +384,8 @@ inline Status ConvertNulls(const ChunkedArray& data, PyObject** out_values) { return Status::OK(); } -inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_values) { +inline Status ConvertFixedSizeBinary(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { auto arr = static_cast(data.chunk(c).get()); @@ -410,7 +415,8 @@ inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_va return Status::OK(); } -inline Status ConvertStruct(const 
ChunkedArray& data, PyObject** out_values) { +inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; if (data.num_chunks() <= 0) { return Status::OK(); @@ -427,8 +433,8 @@ inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) { // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { PyObject* numpy_array; - RETURN_NOT_OK( - ConvertArrayToPandas(arr->field(static_cast(i)), nullptr, &numpy_array)); + RETURN_NOT_OK(ConvertArrayToPandas(options, arr->field(static_cast(i)), + nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } @@ -473,7 +479,7 @@ inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) { } template -inline Status ConvertListsLike(const std::shared_ptr& col, +inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr& col, PyObject** out_values) { const ChunkedArray& data = *col->data().get(); auto list_type = std::static_pointer_cast(col->type()); @@ -488,7 +494,7 @@ inline Status ConvertListsLike(const std::shared_ptr& col, // TODO(ARROW-489): Currently we don't have a Python reference for single columns. // Storing a reference to the whole Array would be to expensive. PyObject* numpy_array; - RETURN_NOT_OK(ConvertColumnToPandas(flat_column, nullptr, &numpy_array)); + RETURN_NOT_OK(ConvertColumnToPandas(options, flat_column, nullptr, &numpy_array)); PyAcquireGIL lock; @@ -520,19 +526,18 @@ inline Status ConvertListsLike(const std::shared_ptr& col, template inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - const uint8_t* valid_bits = arr->null_bitmap_data(); + const uint8_t* valid_bits = arr.null_bitmap_data(); - if (arr->null_count() > 0) { - for (int64_t i = 0; i < arr->length(); ++i) { + if (arr.null_count() > 0) { + for (int64_t i = 0; i < arr.length(); ++i) { *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? na_value : in_values[i]; } } else { - memcpy(out_values, in_values, sizeof(T) * arr->length()); - out_values += arr->length(); + memcpy(out_values, in_values, sizeof(T) * arr.length()); + out_values += arr.length(); } } } @@ -541,12 +546,11 @@ template inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType na_value, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? na_value : static_cast(in_values[i]); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? 
na_value : static_cast(in_values[i]); } } } @@ -554,19 +558,19 @@ inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType na_valu template inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? kPandasTimestampNull - : (static_cast(in_values[i]) * SHIFT); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? kPandasTimestampNull + : (static_cast(in_values[i]) * SHIFT); } } } template -static Status ConvertTimes(const ChunkedArray& data, PyObject** out_values) { +static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; PyAcquireGIL lock; @@ -635,7 +639,8 @@ Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, return Status::OK(); } -static Status ConvertDecimals(const ChunkedArray& data, PyObject** out_values) { +static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; OwnedRef decimal_ref; OwnedRef Decimal_ref; @@ -679,9 +684,9 @@ static Status ConvertDecimals(const ChunkedArray& data, PyObject** out_values) { return Status::OK(); } -#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ - case Type::ArrowEnum: \ - RETURN_NOT_OK((ConvertListsLike(col, out_buffer))); \ +#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ + case Type::ArrowEnum: \ + RETURN_NOT_OK((ConvertListsLike(options_, col, out_buffer))); \ break; class ObjectBlock : public PandasBlock { @@ -699,21 +704,21 @@ class ObjectBlock : public PandasBlock { const ChunkedArray& data = *col->data().get(); if (type == Type::BOOL) { - RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer)); + RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer)); } else if (type == Type::BINARY) { - RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::STRING) { - RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::FIXED_SIZE_BINARY) { - RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer)); + RETURN_NOT_OK(ConvertFixedSizeBinary(options_, data, out_buffer)); } else if (type == Type::TIME32) { - RETURN_NOT_OK(ConvertTimes(data, out_buffer)); + RETURN_NOT_OK(ConvertTimes(options_, data, out_buffer)); } else if (type == Type::TIME64) { - RETURN_NOT_OK(ConvertTimes(data, out_buffer)); + RETURN_NOT_OK(ConvertTimes(options_, data, out_buffer)); } else if (type == Type::DECIMAL) { - RETURN_NOT_OK(ConvertDecimals(data, out_buffer)); + RETURN_NOT_OK(ConvertDecimals(options_, data, out_buffer)); } else if (type == Type::NA) { - RETURN_NOT_OK(ConvertNulls(data, out_buffer)); + RETURN_NOT_OK(ConvertNulls(options_, data, out_buffer)); } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast(col->type()); switch (list_type->value_type()->id()) { @@ -738,7 +743,7 @@ class ObjectBlock : public PandasBlock { } } } else if (type == Type::STRUCT) { - RETURN_NOT_OK(ConvertStruct(data, out_buffer)); + 
RETURN_NOT_OK(ConvertStruct(options_, data, out_buffer)); } else { std::stringstream ss; ss << "Unsupported type for object array output: " << col->type()->ToString(); @@ -774,7 +779,7 @@ class IntBlock : public PandasBlock { return Status::NotImplemented(ss.str()); } - ConvertIntegerNoNullsSameType(data, out_buffer); + ConvertIntegerNoNullsSameType(options_, data, out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -827,8 +832,8 @@ class Float64Block : public PandasBlock { const ChunkedArray& data = *col->data().get(); -#define INTEGER_CASE(IN_TYPE) \ - ConvertIntegerWithNulls(data, out_buffer); \ +#define INTEGER_CASE(IN_TYPE) \ + ConvertIntegerWithNulls(options_, data, out_buffer); \ break; switch (type) { @@ -887,7 +892,7 @@ class BoolBlock : public PandasBlock { uint8_t* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - ConvertBooleanNoNulls(*col->data().get(), out_buffer); + ConvertBooleanNoNulls(options_, *col->data().get(), out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -952,8 +957,8 @@ class DatetimeBlock : public PandasBlock { class DatetimeTZBlock : public DatetimeBlock { public: - DatetimeTZBlock(const std::string& timezone, int64_t num_rows) - : DatetimeBlock(num_rows, 1), timezone_(timezone) {} + DatetimeTZBlock(PandasOptions options, const std::string& timezone, int64_t num_rows) + : DatetimeBlock(options, num_rows, 1), timezone_(timezone) {} // Like Categorical, the internal ndarray is 1-dimensional Status Allocate() override { return AllocateDatetime(1); } @@ -979,31 +984,32 @@ class DatetimeTZBlock : public DatetimeBlock { std::string timezone_; }; -template class CategoricalBlock : public PandasBlock { public: - explicit CategoricalBlock(int64_t num_rows) : PandasBlock(num_rows, 1) {} - Status Allocate() override { - constexpr int npy_type = internal::arrow_traits::npy_type; + explicit CategoricalBlock(PandasOptions options, MemoryPool* pool, int64_t num_rows) + : PandasBlock(options, num_rows, 1), pool_(pool) {} - if (!(npy_type == NPY_INT8 || npy_type == NPY_INT16 || npy_type == NPY_INT32 || - npy_type == NPY_INT64)) { - return Status::Invalid("Category indices must be signed integers"); - } - return AllocateNDArray(npy_type, 1); + Status Allocate() override { + return Status::NotImplemented( + "CategoricalBlock allocation happens when calling Write"); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { - using T = typename internal::arrow_traits::T; + template + Status WriteIndices(const std::shared_ptr& col) { + using TRAITS = internal::arrow_traits; + using T = typename TRAITS::T; + constexpr int npy_type = TRAITS::npy_type; + RETURN_NOT_OK(AllocateNDArray(npy_type, 1)); - T* out_values = reinterpret_cast(block_data_) + rel_placement * num_rows_; + // No relative placement offset because a single column + T* out_values = reinterpret_cast(block_data_); const ChunkedArray& data = *col->data().get(); for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); const auto& dict_arr = static_cast(*arr); + const auto& indices = static_cast(*dict_arr.indices()); auto in_values = reinterpret_cast(indices.raw_values()); @@ -1013,13 +1019,48 @@ class CategoricalBlock : public PandasBlock { } } - placement_data_[rel_placement] = abs_placement; + return Status::OK(); + } + + Status Write(const std::shared_ptr& col, int64_t abs_placement, + int64_t rel_placement) override { + std::shared_ptr 
converted_col; + if (options_.strings_to_categorical && + (col->type()->id() == Type::STRING || col->type()->id() == Type::BINARY)) { + RETURN_NOT_OK(EncodeColumnToDictionary(static_cast(*col), pool_, + &converted_col)); + } else { + converted_col = col; + } - auto dict_type = static_cast(col->type().get()); + const auto& dict_type = static_cast(*converted_col->type()); + + switch (dict_type.index_type()->id()) { + case Type::INT8: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + case Type::INT16: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + case Type::INT32: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + case Type::INT64: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + default: { + std::stringstream ss; + ss << "Categorical index type not supported: " + << dict_type.index_type()->ToString(); + return Status::NotImplemented(ss.str()); + } + } + placement_data_[rel_placement] = abs_placement; PyObject* dict; - RETURN_NOT_OK(ConvertArrayToPandas(dict_type->dictionary(), nullptr, &dict)); + RETURN_NOT_OK(ConvertArrayToPandas(options_, dict_type.dictionary(), nullptr, &dict)); dictionary_.reset(dict); + ordered_ = dict_type.ordered(); return Status::OK(); } @@ -1032,20 +1073,26 @@ class CategoricalBlock : public PandasBlock { PyDict_SetItemString(result, "dictionary", dictionary_.obj()); PyDict_SetItemString(result, "placement", placement_arr_.obj()); + PyObject* py_ordered = ordered_ ? Py_True : Py_False; + Py_INCREF(py_ordered); + PyDict_SetItemString(result, "ordered", py_ordered); + *output = result; return Status::OK(); } protected: + MemoryPool* pool_; OwnedRef dictionary_; + bool ordered_; }; -Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, - std::shared_ptr* block) { -#define BLOCK_CASE(NAME, TYPE) \ - case PandasBlock::NAME: \ - *block = std::make_shared(num_rows, num_columns); \ +Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows, + int num_columns, std::shared_ptr* block) { +#define BLOCK_CASE(NAME, TYPE) \ + case PandasBlock::NAME: \ + *block = std::make_shared(options, num_rows, num_columns); \ break; switch (type) { @@ -1071,36 +1118,94 @@ Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, return (*block)->Allocate(); } -static inline Status MakeCategoricalBlock(const std::shared_ptr& type, - int64_t num_rows, - std::shared_ptr* block) { - // All categoricals become a block with a single column - auto dict_type = static_cast(type.get()); - switch (dict_type->index_type()->id()) { +using BlockMap = std::unordered_map>; + +static Status GetPandasBlockType(const Column& col, const PandasOptions& options, + PandasBlock::type* output_type) { + switch (col.type()->id()) { + case Type::BOOL: + *output_type = col.null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL; + break; + case Type::UINT8: + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8; + break; case Type::INT8: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8; + break; + case Type::UINT16: + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16; break; case Type::INT16: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16; + break; + case Type::UINT32: + *output_type = col.null_count() > 0 ? 
PandasBlock::DOUBLE : PandasBlock::UINT32; break; case Type::INT32: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32; break; case Type::INT64: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64; + break; + case Type::UINT64: + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64; + break; + case Type::FLOAT: + *output_type = PandasBlock::FLOAT; + break; + case Type::DOUBLE: + *output_type = PandasBlock::DOUBLE; + break; + case Type::STRING: + case Type::BINARY: + if (options.strings_to_categorical) { + *output_type = PandasBlock::CATEGORICAL; + break; + } + case Type::NA: + case Type::FIXED_SIZE_BINARY: + case Type::STRUCT: + case Type::TIME32: + case Type::TIME64: + case Type::DECIMAL: + *output_type = PandasBlock::OBJECT; + break; + case Type::DATE32: + *output_type = PandasBlock::DATETIME; break; - default: { + case Type::DATE64: + *output_type = PandasBlock::DATETIME; + break; + case Type::TIMESTAMP: { + const auto& ts_type = static_cast(*col.type()); + if (ts_type.timezone() != "") { + *output_type = PandasBlock::DATETIME_WITH_TZ; + } else { + *output_type = PandasBlock::DATETIME; + } + } break; + case Type::LIST: { + auto list_type = std::static_pointer_cast(col.type()); + if (!ListTypeSupported(*list_type->value_type())) { + std::stringstream ss; + ss << "Not implemented type for list in DataFrameBlock: " + << list_type->value_type()->ToString(); + return Status::NotImplemented(ss.str()); + } + *output_type = PandasBlock::OBJECT; + } break; + case Type::DICTIONARY: + *output_type = PandasBlock::CATEGORICAL; + break; + default: std::stringstream ss; - ss << "Categorical index type not implemented: " - << dict_type->index_type()->ToString(); + ss << "No known equivalent Pandas block for Arrow data of type "; + ss << col.type()->ToString() << " is known."; return Status::NotImplemented(ss.str()); - } } - return (*block)->Allocate(); + return Status::OK(); } -using BlockMap = std::unordered_map>; - // Construct the exact pandas 0.x "BlockManager" memory layout // // * For each column determine the correct output pandas type @@ -1110,7 +1215,9 @@ using BlockMap = std::unordered_map>; // * placement arrays as we go class DataFrameBlockCreator { public: - explicit DataFrameBlockCreator(const std::shared_ptr& table) : table_(table) {} + explicit DataFrameBlockCreator(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool) + : table_(table), options_(options), pool_(pool) {} Status Convert(int nthreads, PyObject** output) { column_types_.resize(table_->num_columns()); @@ -1128,94 +1235,17 @@ class DataFrameBlockCreator { for (int i = 0; i < table_->num_columns(); ++i) { std::shared_ptr col = table_->column(i); PandasBlock::type output_type; - - Type::type column_type = col->type()->id(); - switch (column_type) { - case Type::BOOL: - output_type = col->null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL; - break; - case Type::UINT8: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8; - break; - case Type::INT8: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8; - break; - case Type::UINT16: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16; - break; - case Type::INT16: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16; - break; - case Type::UINT32: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32; - break; - case Type::INT32: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32; - break; - case Type::INT64: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64; - break; - case Type::UINT64: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64; - break; - case Type::FLOAT: - output_type = PandasBlock::FLOAT; - break; - case Type::DOUBLE: - output_type = PandasBlock::DOUBLE; - break; - case Type::NA: - case Type::STRING: - case Type::BINARY: - case Type::FIXED_SIZE_BINARY: - case Type::STRUCT: - case Type::TIME32: - case Type::TIME64: - case Type::DECIMAL: - output_type = PandasBlock::OBJECT; - break; - case Type::DATE32: - output_type = PandasBlock::DATETIME; - break; - case Type::DATE64: - output_type = PandasBlock::DATETIME; - break; - case Type::TIMESTAMP: { - const auto& ts_type = static_cast(*col->type()); - if (ts_type.timezone() != "") { - output_type = PandasBlock::DATETIME_WITH_TZ; - } else { - output_type = PandasBlock::DATETIME; - } - } break; - case Type::LIST: { - auto list_type = std::static_pointer_cast(col->type()); - if (!ListTypeSupported(*list_type->value_type())) { - std::stringstream ss; - ss << "Not implemented type for list in DataFrameBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); - } - output_type = PandasBlock::OBJECT; - } break; - case Type::DICTIONARY: - output_type = PandasBlock::CATEGORICAL; - break; - default: - std::stringstream ss; - ss << "No known equivalent Pandas block for Arrow data of type "; - ss << col->type()->ToString() << " is known."; - return Status::NotImplemented(ss.str()); - } + RETURN_NOT_OK(GetPandasBlockType(*col, options_, &output_type)); int block_placement = 0; std::shared_ptr block; if (output_type == PandasBlock::CATEGORICAL) { - RETURN_NOT_OK(MakeCategoricalBlock(col->type(), table_->num_rows(), &block)); + block = std::make_shared(options_, pool_, table_->num_rows()); categorical_blocks_[i] = block; } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { const auto& ts_type = static_cast(*col->type()); - block = std::make_shared(ts_type.timezone(), table_->num_rows()); + block = std::make_shared(options_, ts_type.timezone(), + table_->num_rows()); RETURN_NOT_OK(block->Allocate()); datetimetz_blocks_[i] = block; } else { @@ -1229,92 +1259,61 @@ class DataFrameBlockCreator { type_counts_[output_type] = 1; } } - column_types_[i] = 
output_type; column_block_placement_[i] = block_placement; } // Create normal non-categorical blocks - for (const auto& it : type_counts_) { + for (const auto& it : this->type_counts_) { PandasBlock::type type = static_cast<PandasBlock::type>(it.first); std::shared_ptr<PandasBlock> block; - RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block)); - blocks_[type] = block; + RETURN_NOT_OK( + MakeBlock(this->options_, type, this->table_->num_rows(), it.second, &block)); + this->blocks_[type] = block; } return Status::OK(); } - Status WriteTableToBlocks(int nthreads) { - auto WriteColumn = [this](int i) { - std::shared_ptr<Column> col = this->table_->column(i); - PandasBlock::type output_type = this->column_types_[i]; + Status GetBlock(int i, std::shared_ptr<PandasBlock>* block) { + PandasBlock::type output_type = this->column_types_[i]; - int rel_placement = this->column_block_placement_[i]; + if (output_type == PandasBlock::CATEGORICAL) { + auto it = this->categorical_blocks_.find(i); + if (it == this->blocks_.end()) { + return Status::KeyError("No categorical block allocated"); + } + *block = it->second; + } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { + auto it = this->datetimetz_blocks_.find(i); + if (it == this->datetimetz_blocks_.end()) { + return Status::KeyError("No datetimetz block allocated"); + } + *block = it->second; + } else { + auto it = this->blocks_.find(output_type); + if (it == this->blocks_.end()) { + return Status::KeyError("No block allocated"); + } + *block = it->second; + } + return Status::OK(); + } + Status WriteTableToBlocks(int nthreads) { + auto WriteColumn = [this](int i) { std::shared_ptr<PandasBlock> block; - if (output_type == PandasBlock::CATEGORICAL) { - auto it = this->categorical_blocks_.find(i); - if (it == this->blocks_.end()) { - return Status::KeyError("No categorical block allocated"); - } - block = it->second; - } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { - auto it = this->datetimetz_blocks_.find(i); - if (it == this->datetimetz_blocks_.end()) { - return Status::KeyError("No datetimetz block allocated"); - } - block = it->second; - } else { - auto it = this->blocks_.find(output_type); - if (it == this->blocks_.end()) { - return Status::KeyError("No block allocated"); - } - block = it->second; - } - return block->Write(col, i, rel_placement); + RETURN_NOT_OK(this->GetBlock(i, &block)); + return block->Write(this->table_->column(i), i, this->column_block_placement_[i]); }; - nthreads = std::min(nthreads, table_->num_columns()); - + int num_tasks = table_->num_columns(); + nthreads = std::min(nthreads, num_tasks); if (nthreads == 1) { - for (int i = 0; i < table_->num_columns(); ++i) { + for (int i = 0; i < num_tasks; ++i) { RETURN_NOT_OK(WriteColumn(i)); } } else { - std::vector<std::thread> thread_pool; - thread_pool.reserve(nthreads); - std::atomic<int> task_counter(0); - - std::mutex error_mtx; - bool error_occurred = false; - Status error; - - for (int thread_id = 0; thread_id < nthreads; ++thread_id) { - thread_pool.emplace_back( - [this, &error, &error_occurred, &error_mtx, &task_counter, &WriteColumn]() { - int column_num; - while (!error_occurred) { - column_num = task_counter.fetch_add(1); - if (column_num >= this->table_->num_columns()) { - break; - } - Status s = WriteColumn(column_num); - if (!s.ok()) { - std::lock_guard<std::mutex> lock(error_mtx); - error_occurred = true; - error = s; - break; - } - } - }); - } - for (auto&& thread : thread_pool) { - thread.join(); - } - - if (error_occurred) { - return error; - } + RETURN_NOT_OK(ParallelFor(nthreads, num_tasks, WriteColumn)); } return
Status::OK(); } @@ -1359,6 +1358,11 @@ class DataFrameBlockCreator { // block type -> type count std::unordered_map type_counts_; + PandasOptions options_; + + // Memory pool for dictionary encoding + MemoryPool* pool_; + // block type -> block BlockMap blocks_; @@ -1371,8 +1375,9 @@ class DataFrameBlockCreator { class ArrowDeserializer { public: - ArrowDeserializer(const std::shared_ptr& col, PyObject* py_ref) - : col_(col), data_(*col->data().get()), py_ref_(py_ref) {} + ArrowDeserializer(PandasOptions options, const std::shared_ptr& col, + PyObject* py_ref) + : col_(col), data_(*col->data().get()), options_(options), py_ref_(py_ref) {} Status AllocateOutput(int type) { PyAcquireGIL lock; @@ -1383,11 +1388,12 @@ class ArrowDeserializer { } template - Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr arr) { + Status ConvertValuesZeroCopy(PandasOptions options, int npy_type, + std::shared_ptr arr) { typedef typename internal::arrow_traits::T T; - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& prim_arr = static_cast(*arr); + auto in_values = reinterpret_cast(prim_arr.raw_values()); // Zero-Copy. We can pass the data pointer directly to NumPy. void* data = const_cast(in_values); @@ -1434,7 +1440,7 @@ class ArrowDeserializer { int npy_type = traits::npy_type; if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { - return ConvertValuesZeroCopy(npy_type, data_.chunk(0)); + return ConvertValuesZeroCopy(options_, npy_type, data_.chunk(0)); } RETURN_NOT_OK(AllocateOutput(npy_type)); @@ -1461,12 +1467,11 @@ class ArrowDeserializer { constexpr int64_t kShift = traits::npy_shift; for (int c = 0; c < data_.num_chunks(); c++) { - const std::shared_ptr arr = data_.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data_.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / kShift; + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? 
na_value : in_values[i] / kShift; } } return Status::OK(); @@ -1488,17 +1493,17 @@ class ArrowDeserializer { typedef typename traits::T T; if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { - return ConvertValuesZeroCopy(traits::npy_type, data_.chunk(0)); + return ConvertValuesZeroCopy(options_, traits::npy_type, data_.chunk(0)); } if (data_.null_count() > 0) { RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertIntegerWithNulls(data_, out_values); + ConvertIntegerWithNulls(options_, data_, out_values); } else { RETURN_NOT_OK(AllocateOutput(traits::npy_type)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertIntegerNoNullsSameType(data_, out_values); + ConvertIntegerNoNullsSameType(options_, data_, out_values); } return Status::OK(); @@ -1508,7 +1513,7 @@ class ArrowDeserializer { inline Status VisitObjects(FUNCTOR func) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - return func(data_, out_values); + return func(options_, data_, out_values); } // UTF8 strings @@ -1540,7 +1545,7 @@ class ArrowDeserializer { } else { RETURN_NOT_OK(AllocateOutput(internal::arrow_traits::npy_type)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertBooleanNoNulls(data_, out_values); + ConvertBooleanNoNulls(options_, data_, out_values); } return Status::OK(); } @@ -1548,7 +1553,7 @@ class ArrowDeserializer { Status Visit(const ListType& type) { #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ - return ConvertListsLike(col_, out_values); + return ConvertListsLike(options_, col_, out_values); RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); @@ -1578,8 +1583,7 @@ class ArrowDeserializer { } Status Visit(const DictionaryType& type) { - std::shared_ptr block; - RETURN_NOT_OK(MakeCategoricalBlock(col_->type(), col_->length(), &block)); + auto block = std::make_shared(options_, nullptr, col_->length()); RETURN_NOT_OK(block->Write(col_, 0, 0)); auto dict_type = static_cast(col_->type().get()); @@ -1593,7 +1597,8 @@ class ArrowDeserializer { // Release GIL before calling ConvertArrayToPandas, will be reacquired // there if needed lock.release(); - RETURN_NOT_OK(ConvertArrayToPandas(dict_type->dictionary(), nullptr, &dictionary)); + RETURN_NOT_OK( + ConvertArrayToPandas(options_, dict_type->dictionary(), nullptr, &dictionary)); lock.acquire(); PyDict_SetItemString(result_, "indices", block->block_arr()); @@ -1613,28 +1618,29 @@ class ArrowDeserializer { private: std::shared_ptr col_; const ChunkedArray& data_; + PandasOptions options_; PyObject* py_ref_; PyArrayObject* arr_; PyObject* result_; }; -Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, - PyObject** out) { +Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, + PyObject* py_ref, PyObject** out) { static std::string dummy_name = "dummy"; auto field = std::make_shared(dummy_name, arr->type()); auto col = std::make_shared(field, arr); - return ConvertColumnToPandas(col, py_ref, out); + return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, - PyObject** out) { - ArrowDeserializer converter(col, py_ref); +Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, + PyObject* py_ref, PyObject** out) { + ArrowDeserializer converter(options, col, py_ref); return 
converter.Convert(out); } -Status ConvertTableToPandas(const std::shared_ptr<Table>& table, int nthreads, - PyObject** out) { - DataFrameBlockCreator helper(table); +Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table, + int nthreads, MemoryPool* pool, PyObject** out) { + DataFrameBlockCreator helper(options, table, pool); return helper.Convert(nthreads, out); }
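For call sites outside the tests, the new signature in a nutshell: callers now pass a `PandasOptions` value and a `MemoryPool*`. A hedged sketch follows (the wrapper name is mine; GIL handling, as in python-test.cc below, is elided).

```cpp
#include <memory>
#include "arrow/memory_pool.h"
#include "arrow/python/arrow_to_pandas.h"
#include "arrow/table.h"

// Sketch of the new calling convention for ConvertTableToPandas.
arrow::Status TableToBlocks(const std::shared_ptr<arrow::Table>& table,
                            PyObject** out_blocks) {
  arrow::py::PandasOptions options;
  options.strings_to_categorical = false;  // set true to get pandas Categorical
  // The pool is used when strings_to_categorical triggers dictionary
  // encoding inside CategoricalBlock::Write.
  return arrow::py::ConvertTableToPandas(options, table, /*nthreads=*/4,
                                         arrow::default_memory_pool(), out_blocks);
}
```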
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 5a99274a33ee0..1d716a5c94fa6 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -39,18 +39,18 @@ class Table; namespace py { -ARROW_EXPORT -Status ConvertArrayToPandas(const std::shared_ptr<Array>& arr, PyObject* py_ref, - PyObject** out); - -ARROW_EXPORT -Status ConvertColumnToPandas(const std::shared_ptr<Column>& col, PyObject* py_ref, - PyObject** out); - struct PandasOptions { bool strings_to_categorical; }; +ARROW_EXPORT +Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr<Array>& arr, + PyObject* py_ref, PyObject** out); + +ARROW_EXPORT +Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column>& col, + PyObject* py_ref, PyObject** out); + // Convert a whole table as efficiently as possible to a pandas.DataFrame. // // The returned Python object is a list of tuples consisting of the exact 2D // // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) ARROW_EXPORT -Status ConvertTableToPandas(const std::shared_ptr<Table>& table, int nthreads, - PyObject** out); +Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table, + int nthreads, MemoryPool* pool, PyObject** out); } // namespace py } // namespace arrow
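What `strings_to_categorical` changes, in brief: with the flag set, STRING/BINARY columns are routed to a `CategoricalBlock` and dictionary-encoded on the fly instead of becoming object arrays. A paraphrase of the routing added in `GetPandasBlockType` (sketch only, not verbatim from the patch):

```cpp
#include "arrow/type.h"

// Sketch: the essence of the new block routing for string-like columns.
bool RoutesToCategorical(const arrow::DataType& type, bool strings_to_categorical) {
  switch (type.id()) {
    case arrow::Type::DICTIONARY:
      return true;  // already dictionary-encoded
    case arrow::Type::STRING:
    case arrow::Type::BINARY:
      return strings_to_categorical;  // otherwise falls through to OBJECT
    default:
      return false;
  }
}
```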
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 6eaa37fb8ca93..ccaf280b0a383 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -155,7 +155,7 @@ static constexpr int MAX_NESTING_LEVELS = 32; // SeqVisitor is used to infer the type. class SeqVisitor { public: - SeqVisitor() : max_nesting_level_(0) { + SeqVisitor() : max_nesting_level_(0), max_observed_level_(0) { memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); } @@ -217,24 +217,13 @@ class SeqVisitor { if (num_nesting_levels() > 1) { return Status::Invalid("Mixed nesting levels not supported"); // If the nesting goes deeper than the deepest scalar - } else if (max_observed_level() < max_nesting_level_) { + } else if (max_observed_level_ < max_nesting_level_) { return Status::Invalid("Mixed nesting levels not supported"); } } return Status::OK(); } - // Returns the deepest level which has scalar elements. - int max_observed_level() const { - int result = 0; - for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { - if (nesting_histogram_[i] > 0) { - result = i; - } - } - return result; - } - // Returns the number of nesting levels which have scalar elements. int num_nesting_levels() const { int result = 0; @@ -252,6 +241,8 @@ class SeqVisitor { // Deepest nesting level observed (regardless of scalars) int max_nesting_level_; + int max_observed_level_; + // Number of scalar elements at each nesting level. // (TODO: We really only need to know if a scalar is present, not the count). int nesting_histogram_[MAX_NESTING_LEVELS]; @@ -263,13 +254,15 @@ class SeqVisitor { } else if (PyDict_Check(item_ref.obj())) { return Status::NotImplemented("No type inference for dicts"); } else { - // We permit nulls at any level of nesting - if (item_ref.obj() == Py_None) { - // TODO - } else { + // We permit nulls at any level of nesting, but they aren't treated like + // other scalar values when checking for mixed nesting structure + if (item_ref.obj() != Py_None) { ++nesting_histogram_[level]; - return scalars_.Visit(item_ref.obj()); } + if (level > max_observed_level_) { + max_observed_level_ = level; + } + return scalars_.Visit(item_ref.obj()); } return Status::OK(); } @@ -392,6 +385,17 @@ class TypedConverterVisitor : public TypedConverter { virtual Status AppendItem(const OwnedRef& item) = 0; }; +class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> { + public: + inline Status AppendItem(const OwnedRef& item) { + if (item.obj() == Py_None) { + return typed_builder_->AppendNull(); + } else { + return Status::Invalid("NullConverter: passed non-None value"); + } + } +}; + class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter> { public: inline Status AppendItem(const OwnedRef& item) { @@ -530,14 +534,24 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter> const char* bytes; Py_ssize_t length; - if (item.obj() == Py_None) { + PyObject* obj = item.obj(); + if (obj == Py_None) { return typed_builder_->AppendNull(); - } else if (!PyUnicode_Check(item.obj())) { - return Status::Invalid("Non-unicode value encountered"); + } else if (PyBytes_Check(obj)) { + tmp.reset( + PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj))); + RETURN_IF_PYERROR(); + bytes_obj = obj; + } else if (!PyUnicode_Check(obj)) { + PyObjectStringify stringified(obj); + std::stringstream ss; + ss << "Non bytes/unicode value encountered: " <<
stringified.bytes; + return Status::Invalid(ss.str()); + } else { + tmp.reset(PyUnicode_AsUTF8String(obj)); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); } - tmp.reset(PyUnicode_AsUTF8String(item.obj())); - RETURN_IF_PYERROR(); - bytes_obj = tmp.obj(); // No error checking length = PyBytes_GET_SIZE(bytes_obj); @@ -606,6 +620,8 @@ class DecimalConverter // Dynamic constructor for sequence converters std::shared_ptr GetConverter(const std::shared_ptr& type) { switch (type->id()) { + case Type::NA: + return std::make_shared(); case Type::BOOL: return std::make_shared(); case Type::INT64: @@ -660,6 +676,7 @@ Status AppendPySequence(PyObject* obj, int64_t size, } Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out) { + PyAcquireGIL lock; std::shared_ptr type; int64_t size; RETURN_NOT_OK(InferArrowTypeAndSize(obj, &size, &type)); @@ -668,6 +685,7 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, const std::shared_ptr& type, int64_t size) { + PyAcquireGIL lock; // Handle NA / NullType case if (type->id() == Type::NA) { out->reset(new NullArray(size)); @@ -684,7 +702,10 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, const std::shared_ptr& type) { int64_t size; - RETURN_NOT_OK(InferArrowSize(obj, &size)); + { + PyAcquireGIL lock; + RETURN_NOT_OK(InferArrowSize(obj, &size)); + } return ConvertPySequence(obj, pool, out, type, size); } diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 95d63b8fecb5b..61192f313d29d 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -202,6 +202,8 @@ Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { #undef TO_ARROW_TYPE_CASE Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr* out) { + PyAcquireGIL lock; + if (!PyArray_Check(ao)) { return Status::TypeError("Did not pass ndarray object"); } @@ -234,6 +236,8 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr* } Status TensorToNdarray(const Tensor& tensor, PyObject* base, PyObject** out) { + PyAcquireGIL lock; + int type_num; RETURN_NOT_OK(GetNumPyType(*tensor.type(), &type_num)); PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num); diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc index 2fbed1b8fdf08..060fcb2453800 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -17,6 +17,8 @@ // Functions for pandas conversion via NumPy +#define ARROW_NO_DEFAULT_MEMORY_POOL + #include "arrow/python/numpy_interop.h" #include "arrow/python/pandas_to_arrow.h" @@ -586,7 +588,7 @@ Status PandasConverter::ConvertDecimals() { type_ = std::make_shared(precision, scale); const int bit_width = std::dynamic_pointer_cast(type_)->bit_width(); - DecimalBuilder builder(pool_, type_); + DecimalBuilder builder(type_, pool_); RETURN_NOT_OK(builder.Resize(length_)); for (int64_t i = 0; i < length_; ++i) { @@ -619,7 +621,7 @@ Status PandasConverter::ConvertTimes() { PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); // datetime.time stores microsecond resolution - Time64Builder builder(pool_, ::arrow::time64(TimeUnit::MICRO)); + Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_); RETURN_NOT_OK(builder.Resize(length_)); PyObject* obj; @@ -751,7 
+753,7 @@ Status PandasConverter::ConvertObjectFixedWidthBytes( // The output type at this point is inconclusive because there may be bytes // and unicode mixed in the object array - FixedSizeBinaryBuilder builder(pool_, type); + FixedSizeBinaryBuilder builder(type, pool_); RETURN_NOT_OK(builder.Resize(length_)); int64_t offset = 0; @@ -942,10 +944,6 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr return Status::NotImplemented("mask not supported in object conversions yet"); } - if (is_strided()) { - return Status::NotImplemented("strided arrays not implemented for lists"); - } - BuilderT* value_builder = static_cast(builder->value_builder()); auto foreach_item = [&](PyObject* object) { @@ -989,6 +987,47 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr return LoopPySequence(list, foreach_item); } +template <> +inline Status PandasConverter::ConvertTypedLists( + const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { + PyAcquireGIL lock; + + // TODO: mask not supported here + if (mask_ != nullptr) { + return Status::NotImplemented("mask not supported in object conversions yet"); + } + + auto value_builder = static_cast(builder->value_builder()); + + auto foreach_item = [&](PyObject* object) { + if (PandasObjectIsNull(object)) { + return builder->AppendNull(); + } else if (PyArray_Check(object)) { + auto numpy_array = reinterpret_cast(object); + RETURN_NOT_OK(builder->Append(true)); + + // TODO(uwe): Support more complex numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); + + for (int64_t i = 0; i < static_cast(PyArray_SIZE(numpy_array)); ++i) { + RETURN_NOT_OK(value_builder->AppendNull()); + } + return Status::OK(); + } else if (PyList_Check(object)) { + RETURN_NOT_OK(builder->Append(true)); + const Py_ssize_t size = PySequence_Size(object); + for (Py_ssize_t i = 0; i < size; ++i) { + RETURN_NOT_OK(value_builder->AppendNull()); + } + return Status::OK(); + } else { + return Status::TypeError("Unsupported Python type for list items"); + } + }; + + return LoopPySequence(list, foreach_item); +} + template <> inline Status PandasConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { @@ -1001,10 +1040,6 @@ inline Status PandasConverter::ConvertTypedLists( return Status::NotImplemented("mask not supported in object conversions yet"); } - if (is_strided()) { - return Status::NotImplemented("strided arrays not implemented for lists"); - } - auto value_builder = static_cast(builder->value_builder()); auto foreach_item = [&](PyObject* object) { @@ -1051,6 +1086,7 @@ inline Status PandasConverter::ConvertTypedLists( Status PandasConverter::ConvertLists(const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { switch (type->id()) { + LIST_CASE(NA, NPY_OBJECT, NullType) LIST_CASE(UINT8, NPY_UINT8, UInt8Type) LIST_CASE(INT8, NPY_INT8, Int8Type) LIST_CASE(UINT16, NPY_UINT16, UInt16Type) diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index b50699d1ae9d4..0d830127ee9b0 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -71,7 +71,7 @@ TEST(DecimalTest, TestPythonDecimalToString) { } TEST(PandasConversionTest, TestObjectBlockWriteFails) { - StringBuilder builder(default_memory_pool()); + StringBuilder builder; const char value[] = {'\xf1', '\0'}; for (int i = 0; i < 1000; ++i) { @@ -85,16 +85,16 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) { auto f2 = field("f2", 
utf8()); auto f3 = field("f3", utf8()); std::vector<std::shared_ptr<Field>> fields = {f1, f2, f3}; - std::vector<std::shared_ptr<Column>> cols = {std::make_shared<Column>(f1, arr), - std::make_shared<Column>(f2, arr), - std::make_shared<Column>(f3, arr)}; + std::vector<std::shared_ptr<Array>> cols = {arr, arr, arr}; auto schema = std::make_shared<Schema>(fields); auto table = std::make_shared<Table>(schema, cols); PyObject* out; Py_BEGIN_ALLOW_THREADS; - ASSERT_RAISES(UnknownError, ConvertTableToPandas(table, 2, &out)); + PandasOptions options; + MemoryPool* pool = default_memory_pool(); + ASSERT_RAISES(UnknownError, ConvertTableToPandas(options, table, 2, pool, &out)); Py_END_ALLOW_THREADS; }
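The test above now builds the Table straight from arrays; the new constructor (in the table.cc hunk further down) wraps each array in a `Column` using the matching schema field. Both spellings in the sketch below are equivalent after this patch; `schema` and `arr` stand in for any three-field schema and a compatible array.

```cpp
#include <memory>
#include <vector>
#include "arrow/table.h"

// Sketch: old column-wrapping spelling vs. the new array-based constructor.
std::shared_ptr<arrow::Table> MakeTableBothWays(
    const std::shared_ptr<arrow::Schema>& schema,
    const std::shared_ptr<arrow::Array>& arr) {
  // Old spelling: wrap every array in a Column by hand.
  std::vector<std::shared_ptr<arrow::Column>> columns = {
      std::make_shared<arrow::Column>(schema->field(0), arr),
      std::make_shared<arrow::Column>(schema->field(1), arr),
      std::make_shared<arrow::Column>(schema->field(2), arr)};
  auto via_columns = std::make_shared<arrow::Table>(schema, columns);

  // New spelling: hand over the arrays; the constructor builds the Columns.
  std::vector<std::shared_ptr<arrow::Array>> arrays = {arr, arr, arr};
  auto via_arrays = std::make_shared<arrow::Table>(schema, arrays);
  // The Ctors test below asserts via_columns->Equals(*via_arrays).
  return via_arrays;
}
```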
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index 8dba8c052e922..4b67492b7ed4a 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -214,7 +214,7 @@ class TestTable : public TestBase { }; TEST_F(TestTable, EmptySchema) { - auto empty_schema = shared_ptr<Schema>(new Schema({})); + auto empty_schema = ::arrow::schema({}); table_.reset(new Table(empty_schema, columns_)); ASSERT_OK(table_->ValidateColumns()); ASSERT_EQ(0, table_->num_rows()); @@ -230,6 +230,9 @@ TEST_F(TestTable, Ctors) { ASSERT_EQ(length, table_->num_rows()); ASSERT_EQ(3, table_->num_columns()); + auto array_ctor = std::make_shared<Table>
(schema_, arrays_); + ASSERT_TRUE(table_->Equals(*array_ctor)); + table_.reset(new Table(schema_, columns_, length)); ASSERT_OK(table_->ValidateColumns()); ASSERT_EQ(length, table_->num_rows()); @@ -373,18 +376,17 @@ TEST_F(TestTable, RemoveColumn) { std::shared_ptr<Table>
result; ASSERT_OK(table.RemoveColumn(0, &result)); - auto ex_schema = - std::shared_ptr<Schema>(new Schema({schema_->field(1), schema_->field(2)})); + auto ex_schema = ::arrow::schema({schema_->field(1), schema_->field(2)}); std::vector<std::shared_ptr<Column>> ex_columns = {table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.RemoveColumn(1, &result)); - ex_schema = std::shared_ptr<Schema>(new Schema({schema_->field(0), schema_->field(2)})); + ex_schema = ::arrow::schema({schema_->field(0), schema_->field(2)}); ex_columns = {table.column(0), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.RemoveColumn(2, &result)); - ex_schema = std::shared_ptr<Schema>(new Schema({schema_->field(0), schema_->field(1)})); + ex_schema = ::arrow::schema({schema_->field(0), schema_->field(1)}); ex_columns = {table.column(0), table.column(1)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); } @@ -410,27 +412,27 @@ TEST_F(TestTable, AddColumn) { // Add column 0 in different places ASSERT_OK(table.AddColumn(0, columns_[0], &result)); - auto ex_schema = std::shared_ptr<Schema>(new Schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)})); + auto ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); std::vector<std::shared_ptr<Column>> ex_columns = {table.column(0), table.column(0), table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(1, columns_[0], &result)); - ex_schema = std::shared_ptr<Schema>(new Schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)})); + ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); ex_columns = {table.column(0), table.column(0), table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(2, columns_[0], &result)); - ex_schema = std::shared_ptr<Schema>(new Schema( {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)})); + ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)}); ex_columns = {table.column(0), table.column(1), table.column(0), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(3, columns_[0], &result)); - ex_schema = std::shared_ptr<Schema>(new Schema( {schema_->field(0), schema_->field(1), schema_->field(2), schema_->field(0)})); + ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(1), schema_->field(2), schema_->field(0)}); ex_columns = {table.column(0), table.column(1), table.column(2), table.column(0)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); } @@ -470,7 +472,7 @@ TEST_F(TestRecordBatch, Validate) { auto f1 = field("f1", uint8()); auto f2 = field("f2", int16()); - auto schema = std::shared_ptr<Schema>(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); auto a0 = MakePrimitive(length); auto a1 = MakePrimitive(length);
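The table.cc hunk that follows makes `num_rows` default to -1 in both constructors; a negative value means "infer from the first column". The rule the two constructors share, extracted as a sketch:

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the row-count inference both Table constructors now apply.
// Column lengths are still cross-checked separately by ValidateColumns().
int64_t InferNumRows(int64_t num_rows, std::size_t num_columns,
                     int64_t first_column_length) {
  if (num_rows >= 0) {
    return num_rows;  // explicit count, e.g. a table with no materialized columns
  }
  return num_columns == 0 ? 0 : first_column_length;
}
```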
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 665ce2d84dea4..1f0c6d785448d 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -273,18 +273,38 @@ Status RecordBatch::Validate() const { // Table methods Table::Table(const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column>>& columns) + const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows) : schema_(schema), columns_(columns) { - if (columns.size() == 0) { - num_rows_ = 0; + if (num_rows < 0) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } } else { - num_rows_ = columns[0]->length(); + num_rows_ = num_rows; } } Table::Table(const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows) - : schema_(schema), columns_(columns), num_rows_(num_rows) {} + const std::vector<std::shared_ptr<Array>>& columns, int64_t num_rows) + : schema_(schema) { + if (num_rows < 0) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } + } else { + num_rows_ = num_rows; + } + + columns_.resize(columns.size()); + for (size_t i = 0; i < columns.size(); ++i) { + columns_[i] = std::make_shared<Column>(schema->field(static_cast<int>(i)), + columns[i]); + } +} std::shared_ptr<Table>
Table::ReplaceSchemaMetadata( const std::shared_ptr& metadata) const { diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 6afd618da043b..31ca97a37078c 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -192,16 +192,20 @@ class ARROW_EXPORT RecordBatch { // Immutable container of fixed-length columns conforming to a particular schema class ARROW_EXPORT Table { public: - // If columns is zero-length, the table's number of rows is zero + /// \brief Construct Table from schema and columns + /// If columns is zero-length, the table's number of rows is zero + /// \param schema + /// \param columns + /// \param num_rows number of rows in table, -1 (default) to infer from columns Table(const std::shared_ptr& schema, - const std::vector>& columns); + const std::vector>& columns, int64_t num_rows = -1); - // num_rows is a parameter to allow for tables of a particular size not - // having any materialized columns. Each column should therefore have the - // same length as num_rows -- you can validate this using - // Table::ValidateColumns + /// \brief Construct Table from schema and arrays + /// \param schema + /// \param arrays + /// \param num_rows number of rows in table, -1 (default) to infer from columns Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows); + const std::vector>& arrays, int64_t num_rows = -1); // Construct table from RecordBatch, but only if all of the batch schemas are // equal. Returns Status::Invalid if there is some problem diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 1a3376cee6053..711d2b04025c1 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -221,8 +221,7 @@ template void ArrayFromVector(const std::shared_ptr& type, const std::vector& is_valid, const std::vector& values, std::shared_ptr* out) { - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool, type); + typename TypeTraits::BuilderType builder(type, default_memory_pool()); for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { ASSERT_OK(builder.Append(values[i])); @@ -236,8 +235,7 @@ void ArrayFromVector(const std::shared_ptr& type, template void ArrayFromVector(const std::vector& is_valid, const std::vector& values, std::shared_ptr* out) { - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool); + typename TypeTraits::BuilderType builder; for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { ASSERT_OK(builder.Append(values[i])); @@ -250,8 +248,7 @@ void ArrayFromVector(const std::vector& is_valid, const std::vector void ArrayFromVector(const std::vector& values, std::shared_ptr* out) { - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool); + typename TypeTraits::BuilderType builder; for (size_t i = 0; i < values.size(); ++i) { ASSERT_OK(builder.Append(values[i])); } diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 6b86b4d2f1024..4ac5c85d480ed 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -97,15 +97,14 @@ TEST_F(TestSchema, Basics) { auto f2 = field("f2", utf8()); - vector> fields = {f0, f1, f2}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2}); ASSERT_EQ(3, schema->num_fields()); ASSERT_TRUE(f0->Equals(schema->field(0))); ASSERT_TRUE(f1->Equals(schema->field(1))); ASSERT_TRUE(f2->Equals(schema->field(2))); - auto schema2 = std::make_shared(fields); + auto schema2 = 
::arrow::schema({f0, f1, f2}); vector> fields3 = {f0, f1_optional, f2}; auto schema3 = std::make_shared(fields3); @@ -119,8 +118,7 @@ TEST_F(TestSchema, ToString) { auto f2 = field("f2", utf8()); auto f3 = field("f3", list(int16())); - vector> fields = {f0, f1, f2, f3}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2, f3}); std::string result = schema->ToString(); std::string expected = R"(f0: int32 @@ -137,8 +135,7 @@ TEST_F(TestSchema, GetFieldByName) { auto f2 = field("f2", utf8()); auto f3 = field("f3", list(int16())); - vector> fields = {f0, f1, f2, f3}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2, f3}); std::shared_ptr result; @@ -158,13 +155,12 @@ TEST_F(TestSchema, GetFieldIndex) { auto f2 = field("f2", utf8()); auto f3 = field("f3", list(int16())); - vector> fields = {f0, f1, f2, f3}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2, f3}); - ASSERT_EQ(0, schema->GetFieldIndex(fields[0]->name())); - ASSERT_EQ(1, schema->GetFieldIndex(fields[1]->name())); - ASSERT_EQ(2, schema->GetFieldIndex(fields[2]->name())); - ASSERT_EQ(3, schema->GetFieldIndex(fields[3]->name())); + ASSERT_EQ(0, schema->GetFieldIndex(f0->name())); + ASSERT_EQ(1, schema->GetFieldIndex(f1->name())); + ASSERT_EQ(2, schema->GetFieldIndex(f2->name())); + ASSERT_EQ(3, schema->GetFieldIndex(f3->name())); ASSERT_EQ(-1, schema->GetFieldIndex("not-found")); } @@ -172,10 +168,9 @@ TEST_F(TestSchema, TestMetadataConstruction) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); - vector> fields = {f0, f1, f2}; auto metadata = std::shared_ptr( new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); - auto schema = std::make_shared(fields, metadata); + auto schema = ::arrow::schema({f0, f1, f2}, metadata); ASSERT_TRUE(metadata->Equals(*schema->metadata())); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index b8489d44cdb00..4443e8d8a455b 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -236,7 +236,7 @@ std::shared_ptr DictionaryType::dictionary() const { return dictionary_; std::string DictionaryType::ToString() const { std::stringstream ss; ss << "dictionarytype()->ToString() - << ", indices=" << index_type_->ToString() << ">"; + << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">"; return ss.str(); } @@ -252,6 +252,10 @@ Schema::Schema(const std::vector>& fields, const std::shared_ptr& metadata) : fields_(fields), metadata_(metadata) {} +Schema::Schema(std::vector>&& fields, + const std::shared_ptr& metadata) + : fields_(std::move(fields)), metadata_(metadata) {} + bool Schema::Equals(const Schema& other) const { if (this == &other) { return true; @@ -343,6 +347,16 @@ std::string Schema::ToString() const { return buffer.str(); } +std::shared_ptr schema(const std::vector>& fields, + const std::shared_ptr& metadata) { + return std::make_shared(fields, metadata); +} + +std::shared_ptr schema(std::vector>&& fields, + const std::shared_ptr& metadata) { + return std::make_shared(std::move(fields), metadata); +} + // ---------------------------------------------------------------------- // Visitors and factory functions @@ -428,8 +442,9 @@ std::shared_ptr union_(const std::vector>& chil } std::shared_ptr dictionary(const std::shared_ptr& index_type, - const std::shared_ptr& dict_values) { - return std::make_shared(index_type, dict_values); + const std::shared_ptr& dict_values, + bool ordered) { + return 
std::make_shared<DictionaryType>(index_type, dict_values, ordered);
 }
 
 std::shared_ptr<Field> field(const std::string& name,
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 45d97fdb32bbc..4917ebb481368 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -708,6 +708,10 @@ class ARROW_EXPORT Schema {
  public:
   explicit Schema(const std::vector<std::shared_ptr<Field>>& fields,
                   const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
+
+  explicit Schema(std::vector<std::shared_ptr<Field>>&& fields,
+                  const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
+
   virtual ~Schema() = default;
 
   /// Returns true if all of the schema fields are equal
@@ -772,26 +776,56 @@ std::shared_ptr<DataType> ARROW_EXPORT timestamp(TimeUnit::type unit);
 std::shared_ptr<DataType> ARROW_EXPORT timestamp(TimeUnit::type unit,
                                                  const std::string& timezone);
 
+/// \brief Create an instance of 32-bit time type
 /// Unit can be either SECOND or MILLI
 std::shared_ptr<DataType> ARROW_EXPORT time32(TimeUnit::type unit);
 
+/// \brief Create an instance of 64-bit time type
 /// Unit can be either MICRO or NANO
 std::shared_ptr<DataType> ARROW_EXPORT time64(TimeUnit::type unit);
 
+/// \brief Create an instance of Struct type
 std::shared_ptr<DataType> ARROW_EXPORT struct_(
     const std::vector<std::shared_ptr<Field>>& fields);
 
+/// \brief Create an instance of Union type
 std::shared_ptr<DataType> ARROW_EXPORT union_(
     const std::vector<std::shared_ptr<Field>>& child_fields,
     const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
 
-std::shared_ptr<DataType> ARROW_EXPORT dictionary(
-    const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values);
-
+/// \brief Create an instance of Dictionary type
+std::shared_ptr<DataType> ARROW_EXPORT
+dictionary(const std::shared_ptr<DataType>& index_type,
+           const std::shared_ptr<Array>& values, bool ordered = false);
+
+/// \brief Create a Field instance
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param nullable whether the values are nullable, default true
+/// \param metadata any custom key-value metadata, default nullptr
 std::shared_ptr<Field> ARROW_EXPORT field(
     const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true,
     const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
 
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields
+/// \param metadata any custom key-value metadata, default nullptr
+/// \return schema shared_ptr to Schema
+std::shared_ptr<Schema> ARROW_EXPORT
+schema(const std::vector<std::shared_ptr<Field>>& fields,
+       const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
+
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields (rvalue reference)
+/// \param metadata any custom key-value metadata, default nullptr
+/// \return schema shared_ptr to Schema
+std::shared_ptr<Schema> ARROW_EXPORT
+schema(std::vector<std::shared_ptr<Field>>&& fields,
+       const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
+
 // ----------------------------------------------------------------------
 //
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 99c09bd6b7dca..0d06b6f6cb86e 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -42,6 +42,7 @@ class DictionaryArray;
 
 class NullType;
 class NullArray;
+class NullBuilder;
 
 class BooleanType;
 class BooleanArray;
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index 973b0e15c5434..f05eb56718f5f 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -31,6 +31,7 @@ struct TypeTraits {};
 template <>
 struct TypeTraits<NullType> {
   using ArrayType = NullArray;
+  using BuilderType = NullBuilder;
   constexpr static bool is_parameter_free = false;
 };
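A usage sketch for the factories documented above (not part of the patch; the function name is invented and the dictionary-values argument is assumed to be an already-built array such as ["low", "mid", "high"]):

// Hypothetical caller (not in the patch): an ordered dictionary type built
// with the new `ordered` flag, wrapped in a schema via the new schema() factory.
#include <memory>

#include "arrow/type.h"

std::shared_ptr<arrow::Schema> MakeSchema(
    const std::shared_ptr<arrow::Array>& dict_values) {
  auto level = arrow::dictionary(arrow::int8(), dict_values, /*ordered=*/true);
  return arrow::schema({arrow::field("id", arrow::int64()),
                        arrow::field("level", level)});
}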
diff --git a/cpp/src/arrow/util/cpu-info.cc b/cpp/src/arrow/util/cpu-info.cc
index b0667cb33ada4..d0a9a14fc60c0 100644
--- a/cpp/src/arrow/util/cpu-info.cc
+++ b/cpp/src/arrow/util/cpu-info.cc
@@ -31,7 +31,11 @@
 #endif
 
 #ifdef _WIN32
+#include <intrin.h>
 #include <windows.h>
+#include <array>
+#include <bitset>
+
 #endif
 
 #include
@@ -132,6 +136,46 @@ bool RetrieveCacheSize(int64_t* cache_sizes) {
   }
   return true;
 }
+
+bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name) {
+  if (!hardware_flags || !model_name) {
+    return false;
+  }
+  const int register_ECX_id = 1;
+  int highest_valid_id = 0;
+  int highest_extended_valid_id = 0;
+  std::bitset<32> features_ECX;
+  std::array<int, 4> cpu_info;
+
+  // Get highest valid id
+  __cpuid(cpu_info.data(), 0);
+  highest_valid_id = cpu_info[0];
+
+  if (highest_valid_id <= register_ECX_id) return false;
+
+  __cpuidex(cpu_info.data(), register_ECX_id, 0);
+  features_ECX = cpu_info[2];
+
+  // Get highest extended id
+  __cpuid(cpu_info.data(), 0x80000000);
+  highest_extended_valid_id = cpu_info[0];
+
+  // Retrieve CPU model name
+  if (highest_extended_valid_id >= 0x80000004) {
+    model_name->clear();
+    for (int i = 0x80000002; i <= 0x80000004; ++i) {
+      __cpuidex(cpu_info.data(), i, 0);
+      *model_name +=
+          std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info));
+    }
+  }
+
+  if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
+  if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
+  if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
+  if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT;
+  return true;
+}
 #endif
 
 void CpuInfo::Init() {
@@ -203,6 +247,7 @@ void CpuInfo::Init() {
   if (!RetrieveCacheSize(cache_sizes_)) {
     SetDefaultCacheSize();
   }
+  RetrieveCPUInfo(&hardware_flags_, &model_name_);
 #else
   SetDefaultCacheSize();
 #endif
diff --git a/cpp/src/arrow/util/parallel.h b/cpp/src/arrow/util/parallel.h
new file mode 100644
index 0000000000000..9fec000c3ed35
--- /dev/null
+++ b/cpp/src/arrow/util/parallel.h
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
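The new header below defines a `ParallelFor` helper that fans `num_tasks` task indices out over `nthreads` worker threads and returns the first non-OK `Status`. As a usage sketch (not part of the patch; the task body and thread count are invented for illustration):

// Hypothetical caller of the ParallelFor template defined just below:
// square eight values on four threads. Each task writes only its own index,
// so no extra synchronization is needed; any non-OK Status (none here)
// would be returned after all threads join.
#include <cstdint>
#include <vector>

#include "arrow/status.h"
#include "arrow/util/parallel.h"

arrow::Status SquareAll(std::vector<int64_t>* values) {
  return arrow::ParallelFor(
      /*nthreads=*/4, /*num_tasks=*/static_cast<int>(values->size()),
      [values](int i) {
        (*values)[i] *= (*values)[i];
        return arrow::Status::OK();
      });
}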
+
+#ifndef ARROW_UTIL_PARALLEL_H
+#define ARROW_UTIL_PARALLEL_H
+
+#include <atomic>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "arrow/status.h"
+
+namespace arrow {
+
+template <class FUNCTION>
+Status ParallelFor(int nthreads, int num_tasks, FUNCTION&& func) {
+  std::vector<std::thread> thread_pool;
+  thread_pool.reserve(nthreads);
+  std::atomic<int> task_counter(0);
+
+  std::mutex error_mtx;
+  bool error_occurred = false;
+  Status error;
+
+  for (int thread_id = 0; thread_id < nthreads; ++thread_id) {
+    thread_pool.emplace_back(
+        [&num_tasks, &task_counter, &error, &error_occurred, &error_mtx, &func]() {
+          int task_id;
+          while (!error_occurred) {
+            task_id = task_counter.fetch_add(1);
+            if (task_id >= num_tasks) {
+              break;
+            }
+            Status s = func(task_id);
+            if (!s.ok()) {
+              std::lock_guard<std::mutex> lock(error_mtx);
+              error_occurred = true;
+              error = s;
+              break;
+            }
+          }
+        });
+  }
+  for (auto&& thread : thread_pool) {
+    thread.join();
+  }
+  if (error_occurred) {
+    return error;
+  }
+  return Status::OK();
+}
+
+} // namespace arrow
+
+#endif
diff --git a/format/Layout.md b/format/Layout.md
index b62b1565a754b..334251103d732 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -62,7 +62,11 @@ Base requirements
 linearly in the nesting level
 * Capable of representing fully-materialized and decoded / decompressed [Parquet][5]
 data
-* All contiguous memory buffers are aligned at 64-byte boundaries and padded to a multiple of 64 bytes.
+* It is required to have all the contiguous memory buffers in an IPC payload
+  aligned at 8-byte boundaries. In other words, each buffer must start at
+  an aligned 8-byte offset.
+* The general recommendation is to align the buffers at a 64-byte boundary, but
+  this is not absolutely necessary.
 * Any relative type can have null slots
 * Arrays are immutable once created. Implementations can provide APIs to mutate
 an array, but applying mutations will require a new array data structure to
@@ -108,21 +112,23 @@ via byte swapping.
 
 ## Alignment and Padding
 
-As noted above, all buffers are intended to be aligned in memory at 64 byte
-boundaries and padded to a length that is a multiple of 64 bytes. The alignment
-requirement follows best practices for optimized memory access:
+As noted above, all buffers must be aligned in memory at 8-byte boundaries and padded
+to a length that is a multiple of 8 bytes. The alignment requirement follows best
+practices for optimized memory access:
 
 * Elements in numeric arrays will be guaranteed to be retrieved via aligned access.
 * On some architectures alignment can help limit partially used cache lines.
 * 64 byte alignment is recommended by the [Intel performance guide][2] for
-data-structures over 64 bytes (which will be a common case for Arrow Arrays).
+  data-structures over 64 bytes (which will be a common case for Arrow Arrays).
 
-Requiring padding to a multiple of 64 bytes allows for using [SIMD][4] instructions
+Recommending padding to a multiple of 64 bytes allows for using [SIMD][4] instructions
 consistently in loops without additional conditional checks.
-This should allow for simpler and more efficient code.
+This should allow for simpler, more efficient, and CPU cache-friendly code.
 The specific padding length was chosen because it matches the largest known
-SIMD instruction registers available as of April 2016 (Intel AVX-512).
In other +words, we can load the entire 64-byte buffer into a 512-bit wide SIMD register +and get data-level parallelism on all the columnar values packed into the 64-byte +buffer. Guaranteed padding can also allow certain compilers to generate more optimized code directly (e.g. One can safely use Intel's `-qopt-assume-safe-padding`). diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index 09886a6ffe316..6d17430c66966 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -406,12 +406,12 @@ public ByteBuffer internalNioBuffer(int index, int length) { @Override public ByteBuffer[] nioBuffers() { - return new ByteBuffer[]{nioBuffer()}; + return new ByteBuffer[] {nioBuffer()}; } @Override public ByteBuffer[] nioBuffers(int index, int length) { - return new ByteBuffer[]{nioBuffer(index, length)}; + return new ByteBuffer[] {nioBuffer(index, length)}; } @Override diff --git a/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java index 89329b2766357..5bd6b9fe37956 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java @@ -249,9 +249,9 @@ public long getPeakMemoryAllocation() { return peakAllocation.get(); } - public long getHeadroom(){ + public long getHeadroom() { long localHeadroom = allocationLimit.get() - locallyHeldMemory.get(); - if(parent == null){ + if (parent == null) { return localHeadroom; } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index 70ca1dc32a1b3..c528937bfdcaa 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -328,6 +328,7 @@ private void inc() { * Decrement the ledger's reference count. 
If the ledger is decremented to zero, this ledger * should release its * ownership back to the AllocationManager + * * @param decrement amout to decrease the reference count by * @return the new reference count */ diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java index be0ba77f5b2fa..2749b6fe030f4 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -171,7 +171,6 @@ public String getName() { @Override public ArrowBuf getEmpty() { - assertOpen(); return empty; } @@ -236,8 +235,6 @@ public ArrowBuf buffer(final int initialRequestSize) { } private ArrowBuf createEmpty() { - assertOpen(); - return new ArrowBuf(new AtomicInteger(), null, AllocationManager.EMPTY, null, null, 0, 0, true); } @@ -277,8 +274,7 @@ public ArrowBuf buffer(final int initialRequestSize, BufferManager manager) { throw new OutOfMemoryException(e); } throw e; - } - finally { + } finally { if (!success) { releaseBytes(actualRequestSize); } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java index 8a40441863889..b23a6e4bd8507 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java @@ -117,9 +117,8 @@ public interface BufferAllocator extends AutoCloseable { * Create an allocation reservation. A reservation is a way of building up * a request for a buffer whose size is not known in advance. See * - * @see AllocationReservation - * * @return the newly created reservation + * @see AllocationReservation */ public AllocationReservation newReservation(); @@ -128,6 +127,7 @@ public interface BufferAllocator extends AutoCloseable { * special because we don't * worry about them leaking or managing reference counts on them since they don't actually * point to any memory. + * * @return the empty buffer */ public ArrowBuf getEmpty(); @@ -136,6 +136,7 @@ public interface BufferAllocator extends AutoCloseable { * Return the name of this allocator. This is a human readable name that can help debugging. * Typically provides * coordinates about where this allocator was created + * * @return the name of the allocator */ public String getName(); @@ -145,6 +146,7 @@ public interface BufferAllocator extends AutoCloseable { * that an allocator is * over its limit, all consumers of that allocator should aggressively try to addrss the * overlimit situation. + * * @return whether or not this allocator (or one if its parents) is over its limits */ public boolean isOverLimit(); diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java index 0452dc9adf256..10a64cd984ea0 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java @@ -97,7 +97,7 @@ public synchronized void recordEvent(final String noteFormat, Object... args) { * includes the identifying string provided at construction time, and all the recorded * events with their stack traces. 
* - * @param sb {@link StringBuilder} to write to + * @param sb {@link StringBuilder} to write to * @param includeStackTrace whether to include the stacktrace of each event in the history */ public void buildHistory(final StringBuilder sb, boolean includeStackTrace) { @@ -106,8 +106,9 @@ public void buildHistory(final StringBuilder sb, boolean includeStackTrace) { /** * build the history and write it to sb - * @param sb output - * @param indent starting indent (usually "") + * + * @param sb output + * @param indent starting indent (usually "") * @param includeStackTrace whether to include the stacktrace of each event. */ public synchronized void buildHistory( diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java b/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java index 2624a4a047e7e..100be069fe6d4 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.memory; import static org.junit.Assert.assertEquals; diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java index 59b7be87e17be..76f2c501cf4c7 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.memory; import static org.junit.Assert.assertEquals; @@ -59,13 +60,13 @@ public void checkBuffers() { @Test public void test_privateMax() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final ArrowBuf arrowBuf1 = rootAllocator.buffer(MAX_ALLOCATION / 2); assertNotNull("allocation failed", arrowBuf1); - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("noLimits", 0, MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("noLimits", 0, MAX_ALLOCATION)) { final ArrowBuf arrowBuf2 = childAllocator.buffer(MAX_ALLOCATION / 2); assertNotNull("allocation failed", arrowBuf2); arrowBuf2.release(); @@ -75,11 +76,11 @@ public void test_privateMax() throws Exception { } } - @Test(expected=IllegalStateException.class) + @Test(expected = IllegalStateException.class) public void testRootAllocator_closeWithOutstanding() throws Exception { try { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final ArrowBuf arrowBuf = rootAllocator.buffer(512); assertNotNull("allocation failed", arrowBuf); } @@ -100,8 +101,8 @@ public void testRootAllocator_closeWithOutstanding() throws Exception { @Test public void testRootAllocator_getEmpty() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final ArrowBuf arrowBuf = rootAllocator.buffer(0); assertNotNull("allocation failed", arrowBuf); assertEquals("capacity was non-zero", 0, arrowBuf.capacity()); @@ -112,8 +113,8 @@ public 
void testRootAllocator_getEmpty() throws Exception { @Ignore // TODO(DRILL-2740) @Test(expected = IllegalStateException.class) public void testAllocator_unreleasedEmpty() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { @SuppressWarnings("unused") final ArrowBuf arrowBuf = rootAllocator.buffer(0); } @@ -121,8 +122,8 @@ public void testAllocator_unreleasedEmpty() throws Exception { @Test public void testAllocator_transferOwnership() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final BufferAllocator childAllocator1 = rootAllocator.newChildAllocator("changeOwnership1", 0, MAX_ALLOCATION); final BufferAllocator childAllocator2 = @@ -197,7 +198,7 @@ public void testRootAllocator_createChildAndUse() throws Exception { } } - @Test(expected=IllegalStateException.class) + @Test(expected = IllegalStateException.class) public void testRootAllocator_createChildDontClose() throws Exception { try { try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { @@ -232,22 +233,22 @@ private static void allocateAndFree(final BufferAllocator allocator) { final int nBufs = 8; final ArrowBuf[] arrowBufs = new ArrowBuf[nBufs]; - for(int i = 0; i < arrowBufs.length; ++i) { + for (int i = 0; i < arrowBufs.length; ++i) { ArrowBuf arrowBufi = allocator.buffer(MAX_ALLOCATION / nBufs); assertNotNull("allocation failed", arrowBufi); arrowBufs[i] = arrowBufi; } - for(ArrowBuf arrowBufi : arrowBufs) { + for (ArrowBuf arrowBufi : arrowBufs) { arrowBufi.release(); } } @Test public void testAllocator_manyAllocations() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("manyAllocations", 0, MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("manyAllocations", 0, MAX_ALLOCATION)) { allocateAndFree(childAllocator); } } @@ -255,10 +256,10 @@ public void testAllocator_manyAllocations() throws Exception { @Test public void testAllocator_overAllocate() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("overAllocate", 0, MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("overAllocate", 0, MAX_ALLOCATION)) { allocateAndFree(childAllocator); try { @@ -273,10 +274,10 @@ public void testAllocator_overAllocate() throws Exception { @Test public void testAllocator_overAllocateParent() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("overAllocateParent", 0, MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("overAllocateParent", 0, MAX_ALLOCATION)) { final ArrowBuf arrowBuf1 = rootAllocator.buffer(MAX_ALLOCATION / 2); assertNotNull("allocation failed", arrowBuf1); final ArrowBuf arrowBuf2 = 
childAllocator.buffer(MAX_ALLOCATION / 2); @@ -326,7 +327,7 @@ public void testAllocator_createSlices() throws Exception { try (final BufferAllocator childAllocator = rootAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { try (final BufferAllocator childAllocator2 = - childAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { + childAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { final ArrowBuf arrowBuf1 = childAllocator2.buffer(MAX_ALLOCATION / 8); @SuppressWarnings("unused") final ArrowBuf arrowBuf2 = arrowBuf1.slice(MAX_ALLOCATION / 16, MAX_ALLOCATION / 16); @@ -345,8 +346,8 @@ public void testAllocator_createSlices() throws Exception { @Test public void testAllocator_sliceRanges() throws Exception { // final AllocatorOwner allocatorOwner = new NamedOwner("sliceRanges"); - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { // Populate a buffer with byte values corresponding to their indices. final ArrowBuf arrowBuf = rootAllocator.buffer(256); assertEquals(256, arrowBuf.capacity()); @@ -362,7 +363,7 @@ public void testAllocator_sliceRanges() throws Exception { // assertEquals(256, slice3.capacity()); // assertEquals(256, slice3.writableBytes()); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { arrowBuf.writeByte(i); } assertEquals(0, arrowBuf.readerIndex()); @@ -373,18 +374,18 @@ public void testAllocator_sliceRanges() throws Exception { final ArrowBuf slice1 = (ArrowBuf) arrowBuf.slice(); assertEquals(0, slice1.readerIndex()); assertEquals(256, slice1.readableBytes()); - for(int i = 0; i < 10; ++i) { + for (int i = 0; i < 10; ++i) { assertEquals(i, slice1.readByte()); } assertEquals(256 - 10, slice1.readableBytes()); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { assertEquals((byte) i, slice1.getByte(i)); } final ArrowBuf slice2 = arrowBuf.slice(25, 25); assertEquals(0, slice2.readerIndex()); assertEquals(25, slice2.readableBytes()); - for(int i = 25; i < 50; ++i) { + for (int i = 25; i < 50; ++i) { assertEquals(i, slice2.readByte()); } @@ -404,32 +405,32 @@ public void testAllocator_sliceRanges() throws Exception { @Test public void testAllocator_slicesOfSlices() throws Exception { // final AllocatorOwner allocatorOwner = new NamedOwner("slicesOfSlices"); - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { // Populate a buffer with byte values corresponding to their indices. final ArrowBuf arrowBuf = rootAllocator.buffer(256); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { arrowBuf.writeByte(i); } // Slice it up. 
final ArrowBuf slice0 = arrowBuf.slice(0, arrowBuf.capacity()); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { assertEquals((byte) i, arrowBuf.getByte(i)); } final ArrowBuf slice10 = slice0.slice(10, arrowBuf.capacity() - 10); - for(int i = 10; i < 256; ++i) { + for (int i = 10; i < 256; ++i) { assertEquals((byte) i, slice10.getByte(i - 10)); } final ArrowBuf slice20 = slice10.slice(10, arrowBuf.capacity() - 20); - for(int i = 20; i < 256; ++i) { + for (int i = 20; i < 256; ++i) { assertEquals((byte) i, slice20.getByte(i - 20)); } - final ArrowBuf slice30 = slice20.slice(10, arrowBuf.capacity() - 30); - for(int i = 30; i < 256; ++i) { + final ArrowBuf slice30 = slice20.slice(10, arrowBuf.capacity() - 30); + for (int i = 30; i < 256; ++i) { assertEquals((byte) i, slice30.getByte(i - 30)); } @@ -556,8 +557,8 @@ public void testAllocator_transferShared() throws Exception { public void testAllocator_unclaimedReservation() throws Exception { try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { try (final BufferAllocator childAllocator1 = - rootAllocator.newChildAllocator("unclaimedReservation", 0, MAX_ALLOCATION)) { - try(final AllocationReservation reservation = childAllocator1.newReservation()) { + rootAllocator.newChildAllocator("unclaimedReservation", 0, MAX_ALLOCATION)) { + try (final AllocationReservation reservation = childAllocator1.newReservation()) { assertTrue(reservation.add(64)); } rootAllocator.verify(); diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java b/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java index 25357dc7b07ef..accd15eb71705 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.memory; import static org.junit.Assert.assertEquals; + import io.netty.buffer.ByteBuf; import org.apache.arrow.memory.BufferAllocator; diff --git a/java/pom.xml b/java/pom.xml index 14834c69867ce..b0621c5a41ea6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,8 +33,8 @@ 1.7.25 18.0 2 - 2.7.1 - + 2.7.9 + 2.7.1 false diff --git a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java index 24079b62da919..c53f0ea86935e 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java @@ -73,7 +73,9 @@ public void run() throws IOException { LOGGER.info("Closed connection with client"); } } catch (java.net.SocketException ex) { - if (!closed) throw ex; + if (!closed) { + throw ex; + } } finally { serverSocket.close(); LOGGER.info("Server closed."); diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java index 135d4921ed128..7d71b0b8f9d3f 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java @@ -44,6 +44,7 @@ public class FileRoundtrip { private final Options options; private final PrintStream out; private final PrintStream err; + FileRoundtrip(PrintStream out, PrintStream err) { this.out = out; this.err = err; diff --git a/java/tools/src/main/java/org/apache/arrow/tools/Integration.java b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java index 7d4c86f81670f..d2b35e65a8172 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/Integration.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java @@ -94,7 +94,7 @@ private File validateFile(String type, String fileName, boolean shouldExist) { } static void extractDictionaryEncodings(List fields, List encodings) { - for (Field field: fields) { + for (Field field : fields) { DictionaryEncoding encoding = field.getDictionary(); if (encoding != null) { encodings.add(encoding); @@ -214,7 +214,7 @@ public void execute(File arrowFile, File jsonFile) throws IOException { boolean hasMoreArrow = iterator.hasNext(); if (hasMoreJSON || hasMoreArrow) { throw new IllegalArgumentException("Unexpected RecordBatches. 
Total: " + totalBatches - + " J:" + hasMoreJSON + " " + + " J:" + hasMoreJSON + " " + "A:" + hasMoreArrow); } } diff --git a/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java b/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java index d7f6388721ebb..467965aff95d9 100644 --- a/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java +++ b/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java @@ -158,16 +158,16 @@ public void basicTest() throws InterruptedException, IOException { public void testFlatDictionary() throws IOException { DictionaryEncoding writeEncoding = new DictionaryEncoding(1L, false, null); try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - NullableIntVector writeVector = - new NullableIntVector( - "varchar", - new FieldType(true, MinorType.INT.getType(), writeEncoding, null), - allocator); - NullableVarCharVector writeDictionaryVector = - new NullableVarCharVector( - "dict", - FieldType.nullable(VARCHAR.getType()), - allocator)) { + NullableIntVector writeVector = + new NullableIntVector( + "varchar", + new FieldType(true, MinorType.INT.getType(), writeEncoding, null), + allocator); + NullableVarCharVector writeDictionaryVector = + new NullableVarCharVector( + "dict", + FieldType.nullable(VARCHAR.getType()), + allocator)) { writeVector.allocateNewSafe(); NullableIntVector.Mutator mutator = writeVector.getMutator(); mutator.set(0, 0); diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java index 5d92cd232efb3..61164ab6c9b3d 100644 --- a/java/vector/src/main/codegen/templates/FixedValueVectors.java +++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java @@ -111,6 +111,10 @@ public Mutator getMutator(){ return mutator; } + int getAllocationSize() { + return allocationSizeInBytes; + } + @Override public void setInitialCapacity(final int valueCount) { final long size = 1L * valueCount * ${type.width}; @@ -162,7 +166,7 @@ public void allocateNew(final int valueCount) { @Override public void reset() { - allocationSizeInBytes = INITIAL_VALUE_ALLOCATION; + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; allocationMonitor = 0; zeroVector(); super.reset(); diff --git a/java/vector/src/main/codegen/templates/VariableLengthVectors.java b/java/vector/src/main/codegen/templates/VariableLengthVectors.java index 2ad7d20de2651..70c7209947460 100644 --- a/java/vector/src/main/codegen/templates/VariableLengthVectors.java +++ b/java/vector/src/main/codegen/templates/VariableLengthVectors.java @@ -352,7 +352,6 @@ public void reset() { } public void reAlloc() { - offsetVector.reAlloc(); final long newAllocationSize = allocationSizeInBytes*2L; if (newAllocationSize > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Unable to expand the buffer. Max allowed buffer size is reached."); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java index 388eb9c447977..a5b5c9d1d6ad2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import com.google.common.base.Preconditions; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java index 15c3a0227c656..2a0f39d0cb59b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import org.apache.arrow.vector.complex.RepeatedFixedWidthVectorLike; @@ -27,32 +28,33 @@ public static void allocate(ValueVector v, int valueCount, int bytesPerValue) { allocate(v, valueCount, bytesPerValue, 5); } - public static void allocatePrecomputedChildCount(ValueVector v, int valueCount, int bytesPerValue, int childValCount){ - if(v instanceof FixedWidthVector) { + public static void allocatePrecomputedChildCount(ValueVector v, int valueCount, int bytesPerValue, int childValCount) { + if (v instanceof FixedWidthVector) { ((FixedWidthVector) v).allocateNew(valueCount); } else if (v instanceof VariableWidthVector) { ((VariableWidthVector) v).allocateNew(valueCount * bytesPerValue, valueCount); - } else if(v instanceof RepeatedFixedWidthVectorLike) { + } else if (v instanceof RepeatedFixedWidthVectorLike) { ((RepeatedFixedWidthVectorLike) v).allocateNew(valueCount, childValCount); - } else if(v instanceof RepeatedVariableWidthVectorLike) { + } else if (v instanceof RepeatedVariableWidthVectorLike) { ((RepeatedVariableWidthVectorLike) v).allocateNew(childValCount * bytesPerValue, valueCount, childValCount); } else { v.allocateNew(); } } - public static void allocate(ValueVector v, int valueCount, int bytesPerValue, int repeatedPerTop){ + public static void allocate(ValueVector v, int valueCount, int bytesPerValue, int repeatedPerTop) { allocatePrecomputedChildCount(v, valueCount, bytesPerValue, repeatedPerTop * valueCount); } /** * Allocates the exact amount if v is fixed width, otherwise falls back to dynamic allocation - * @param v value vector we are trying to allocate - * @param valueCount size we are trying to allocate + * + * @param v value vector we are trying to allocate + * @param valueCount size we are trying to allocate * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory */ public static void allocateNew(ValueVector v, int valueCount) { - if (v instanceof FixedWidthVector) { + if (v instanceof FixedWidthVector) { ((FixedWidthVector) v).allocateNew(valueCount); } else { v.allocateNew(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java index 0fea719da8815..01340f66c4095 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import java.util.ArrayList; @@ -59,8 +60,6 @@ public static List unload(List vectors) { return result; } - // TODO: Nullable vectors extend BaseDataValueVector but do not use the data field - // We should fix the inheritance tree protected ArrowBuf data; public BaseDataValueVector(String name, BufferAllocator allocator) { @@ -70,23 +69,11 @@ public BaseDataValueVector(String name, BufferAllocator allocator) { @Override public void clear() { - if (data != null) { - data.release(); - } + data.release(); data = allocator.getEmpty(); super.clear(); } - @Override - public void close() { - clear(); - if (data != null) { - data.release(); - data = null; - } - super.close(); - } - @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { return getTransferPair(ref, allocator); @@ -98,7 +85,7 @@ public ArrowBuf[] getBuffers(boolean clear) { if (getBufferSize() == 0) { out = new ArrowBuf[0]; } else { - out = new ArrowBuf[]{data}; + out = new ArrowBuf[] {data}; data.readerIndex(0); if (clear) { data.retain(1); @@ -137,5 +124,6 @@ public ArrowBuf unLoad() { * This method has a similar effect of allocateNew() without actually clearing and reallocating * the value vector. The purpose is to move the value vector to a "mutate" state */ - public void reset() {} + public void reset() { + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 6a75e6d6b9a24..598e578e55a6d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.util.Collections; @@ -67,7 +68,8 @@ public TransferPair getTransferPair(BufferAllocator allocator) { } public abstract static class BaseAccessor implements ValueVector.Accessor { - protected BaseAccessor() { } + protected BaseAccessor() { + } @Override public boolean isNull(int index) { @@ -80,7 +82,7 @@ public int getNullCount() { int nullCount = 0; for (int i = 0; i < getValueCount(); i++) { if (isNull(i)) { - nullCount ++; + nullCount++; } } return nullCount; @@ -88,14 +90,17 @@ public int getNullCount() { } public abstract static class BaseMutator implements ValueVector.Mutator { - protected BaseMutator() { } + protected BaseMutator() { + } @Override - public void generateTestData(int values) {} + public void generateTestData(int values) { + } //TODO: consider making mutator stateless(if possible) on another issue. 
@Override - public void reset() {} + public void reset() { + } } @Override @@ -104,7 +109,7 @@ public Iterator iterator() { } public static boolean checkBufRefs(final ValueVector vv) { - for(final ArrowBuf buffer : vv.getBuffers(false)) { + for (final ArrowBuf buffer : vv.getBuffers(false)) { if (buffer.refCnt() <= 0) { throw new IllegalStateException("zero refcount"); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index f34ef2c2a2244..e80ca829c667e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import org.apache.arrow.memory.BufferAllocator; @@ -115,7 +116,7 @@ int getSizeFromCount(int valueCount) { @Override public int getValueCapacity() { - return (int)Math.min((long)Integer.MAX_VALUE, data.capacity() * 8L); + return (int) Math.min((long) Integer.MAX_VALUE, data.capacity() * 8L); } private int getByteIndex(int index) { @@ -165,8 +166,7 @@ public void reset() { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param valueCount - * The number of values which can be contained within this vector. + * @param valueCount The number of values which can be contained within this vector. */ @Override public void allocateNew(int valueCount) { @@ -195,7 +195,7 @@ public void reAlloc() { throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); } - final int curSize = (int)newAllocationSize; + final int curSize = (int) newAllocationSize; final ArrowBuf newBuf = allocator.buffer(curSize); newBuf.setZero(0, newBuf.capacity()); newBuf.setBytes(0, data, 0, data.capacity()); @@ -287,7 +287,7 @@ public void splitAndTransferTo(int startIndex, int length, BitVector target) { target.data.setByte(byteSize - 1, ((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset)); } else { target.data.setByte(byteSize - 1, - (((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset) + (this.data.getByte(firstByte + byteSize) << (8 - offset)))); + (((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset) + (this.data.getByte(firstByte + byteSize) << (8 - offset)))); } } } @@ -342,15 +342,14 @@ public class Accessor extends BaseAccessor { /** * Get the byte holding the desired bit, then mask all other bits. Iff the result is 0, the bit was not set. * - * @param index - * position of the bit in the vector + * @param index position of the bit in the vector * @return 1 if set, otherwise 0 */ public final int get(int index) { int byteIndex = index >> 3; byte b = data.getByte(byteIndex); int bitIndex = index & 7; - return Long.bitCount(b & (1L << bitIndex)); + return Long.bitCount(b & (1L << bitIndex)); } @Override @@ -379,6 +378,7 @@ public final void get(int index, NullableBitHolder holder) { /** * Get the number nulls, this correspond to the number of bits set to 0 in the vector + * * @return the number of bits set to 0 */ @Override @@ -414,10 +414,8 @@ private Mutator() { /** * Set the bit at the given index to the specified value. 
* - * @param index - * position of the bit to set - * @param value - * value to set (either 1 or 0) + * @param index position of the bit to set + * @param value value to set (either 1 or 0) */ public final void set(int index, int value) { int byteIndex = byteIndex(index); @@ -448,8 +446,9 @@ public final void setToOne(int index) { /** * set count bits to 1 in data starting at firstBitIndex + * * @param firstBitIndex the index of the first bit to set - * @param count the number of bits to set + * @param count the number of bits to set */ public void setRangeToOne(int firstBitIndex, int count) { int starByteIndex = byteIndex(firstBitIndex); @@ -473,7 +472,7 @@ public void setRangeToOne(int firstBitIndex, int count) { final byte bitMask = (byte) (0xFFL << startByteBitIndex); currentByte |= bitMask; data.setByte(starByteIndex, currentByte); - ++ starByteIndex; + ++starByteIndex; } // fill in one full byte at a time @@ -518,28 +517,28 @@ final void set(int index, NullableBitHolder holder) { } public void setSafe(int index, int value) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } set(index, value); } public void setSafeToOne(int index) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } setToOne(index); } public void setSafe(int index, BitHolder holder) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } set(index, holder.value); } public void setSafe(int index, NullableBitHolder holder) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } set(index, holder.value); @@ -550,7 +549,7 @@ public final void setValueCount(int valueCount) { int currentValueCapacity = getValueCapacity(); BitVector.this.valueCount = valueCount; int idx = getSizeFromCount(valueCount); - while(valueCount > getValueCapacity()) { + while (valueCount > getValueCapacity()) { reAlloc(); } if (valueCount > 0 && currentValueCapacity > valueCount * 2) { @@ -564,7 +563,7 @@ public final void setValueCount(int valueCount) { @Override public final void generateTestData(int values) { boolean even = true; - for(int i = 0; i < values; i++, even = !even) { + for (int i = 0; i < values; i++, even = !even) { if (even) { set(i, 1); } @@ -576,10 +575,10 @@ public void generateTestDataAlt(int size) { setValueCount(size); boolean even = true; final int valueCount = getAccessor().getValueCount(); - for(int i = 0; i < valueCount; i++, even = !even) { - if(even){ + for (int i = 0; i < valueCount; i++, even = !even) { + if (even) { set(i, (byte) 1); - }else{ + } else { set(i, (byte) 0); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java index 3c8b3210d77ff..a0dbf2bdcf101 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import org.apache.arrow.vector.schema.ArrowFieldNode; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java index 6c2c8302a7b8b..af7a7912c72cc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.util.List; @@ -33,12 +34,14 @@ public interface FieldVector extends ValueVector { /** * Initializes the child vectors * to be later loaded with loadBuffers + * * @param children the schema */ void initializeChildrenFromFields(List children); /** * the returned list is the same size as the list passed to initializeChildrenFromFields + * * @return the children according to schema (empty for primitive types) */ List getChildrenFromFields(); @@ -46,13 +49,15 @@ public interface FieldVector extends ValueVector { /** * loads data in the vectors * (ownBuffers must be the same size as getFieldVectors()) - * @param fieldNode the fieldNode + * + * @param fieldNode the fieldNode * @param ownBuffers the buffers for this Field (own buffers only, children not included) */ void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers); /** * (same size as getFieldVectors() since it is their content) + * * @return the buffers containing the data for this vector (ready for reading) */ List getFieldBuffers(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java index 59057000bbca9..ec410fc089c9c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java @@ -15,21 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; -public interface FixedWidthVector extends ValueVector{ +public interface FixedWidthVector extends ValueVector { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param valueCount Number of values in the vector. + * @param valueCount Number of values in the vector. */ void allocateNew(int valueCount); -/** - * Zero out the underlying buffer backing this vector. - */ + /** + * Zero out the underlying buffer backing this vector. + */ void zeroVector(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java index b49e9167c2589..b2455e9e42b4b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; public interface NullableVector extends ValueVector { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java b/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java index b819c5d39e99c..1e0746aabaa61 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; public interface NullableVectorDefinitionSetter { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java b/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java index 6fdcda20480f8..54c0c591e2b92 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java @@ -42,6 +42,7 @@ public void doWork() { /** * Returns the value of schema-changed state, resetting the * schema-changed state to {@code false}. + * * @return the previous schema-changed state */ public boolean getSchemaChangedAndReset() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 3812c0b2fc319..0c95bcfcbd6b5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.io.Closeable; @@ -40,11 +41,11 @@ * There are a few "rules" around vectors: * *
- * <li>values need to be written in order (e.g. index 0, 1, 2, 5)</li>
- * <li>null vectors start with all values as null before writing anything</li>
- * <li>for variable width types, the offset vector should be all zeros before writing</li>
- * <li>you must call setValueCount before a vector can be read</li>
- * <li>you should never write to a vector once it has been read.</li>
+ *   <li>values need to be written in order (e.g. index 0, 1, 2, 5)</li>
+ *   <li>null vectors start with all values as null before writing anything</li>
+ *   <li>for variable width types, the offset vector should be all zeros before writing</li>
+ *   <li>you must call setValueCount before a vector can be read</li>
+ *   <li>you should never write to a vector once it has been read.</li>
* * Please note that the current implementation doesn't enforce those rules, hence we may find few places that @@ -58,12 +59,14 @@ public interface ValueVector extends Closeable, Iterable { /** * Allocate new buffers. ValueVector implements logic to determine how much to allocate. + * * @throws OutOfMemoryException Thrown if no memory can be allocated. */ void allocateNew() throws OutOfMemoryException; /** * Allocates new buffers. ValueVector implements logic to determine how much to allocate. + * * @return Returns true if allocation was successful. */ boolean allocateNewSafe(); @@ -78,12 +81,14 @@ public interface ValueVector extends Closeable, Iterable { /** * Set the initial record capacity + * * @param numRecords the initial record capacity. */ void setInitialCapacity(int numRecords); /** * Returns the maximum number of values that can be stored in this vector instance. + * * @return the maximum number of values that can be stored in this vector instance. */ int getValueCapacity(); @@ -101,6 +106,7 @@ public interface ValueVector extends Closeable, Iterable { /** * Get information about how this field is materialized. + * * @return the field corresponding to this vector */ Field getField(); @@ -109,6 +115,7 @@ public interface ValueVector extends Closeable, Iterable { /** * to transfer quota responsibility + * * @param allocator the target allocator * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new target vector of * the same type. @@ -121,6 +128,7 @@ public interface ValueVector extends Closeable, Iterable { /** * makes a new transfer pair used to transfer underlying buffers + * * @param target the target for the transfer * @return a new {@link org.apache.arrow.vector.util.TransferPair transfer pair} that is used to transfer underlying * buffers into the target vector. @@ -167,9 +175,9 @@ public interface ValueVector extends Closeable, Iterable { * Return the underlying buffers associated with this vector. Note that this doesn't impact the reference counts for * this buffer so it only should be used for in-context access. Also note that this buffer changes regularly thus * external classes shouldn't hold a reference to it (unless they change it). - * @param clear Whether to clear vector before returning; the buffers will still be refcounted; - * but the returned array will be the only reference to them * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted; + * but the returned array will be the only reference to them * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this vector instance. */ ArrowBuf[] getBuffers(boolean clear); @@ -181,8 +189,7 @@ interface Accessor { /** * Get the Java Object representation of the element at the specified position. Useful for testing. * - * @param index - * Index of the value to get + * @param index Index of the value to get * @return the friendly java type */ Object getObject(int index); @@ -211,7 +218,7 @@ interface Mutator { /** * Sets the number of values that is stored in this vector to the given value count. * - * @param valueCount value count to set. + * @param valueCount value count to set. */ void setValueCount(int valueCount); @@ -221,8 +228,8 @@ interface Mutator { void reset(); /** - * @deprecated this has nothing to do with value vector abstraction and should be removed. * @param values the number of values to generate + * @deprecated this has nothing to do with value vector abstraction and should be removed. 
*/ @Deprecated void generateTestData(int values); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java index ed164b548b5bd..04c00b7c8349c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java @@ -15,20 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; -public interface VariableWidthVector extends ValueVector{ +public interface VariableWidthVector extends ValueVector { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param totalBytes Desired size of the underlying data buffer. - * @param valueCount Number of values in the vector. + * @param totalBytes Desired size of the underlying data buffer. + * @param valueCount Number of values in the vector. */ void allocateNew(int totalBytes, int valueCount); /** * Provide the maximum amount of variable width bytes that can be stored in this vector. + * * @return the byte capacity of this vector */ int getByteCapacity(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index e640c7cb78418..58fc80bbba17c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static com.google.common.base.Preconditions.checkArgument; @@ -41,6 +42,7 @@ public class VectorLoader { /** * will create children in root based on schema + * * @param root the root to add vectors to based on schema */ public VectorLoader(VectorSchemaRoot root) { @@ -50,12 +52,13 @@ public VectorLoader(VectorSchemaRoot root) { /** * Loads the record batch in the vectors * will not close the record batch + * * @param recordBatch the batch to load */ public void load(ArrowRecordBatch recordBatch) { Iterator buffers = recordBatch.getBuffers().iterator(); Iterator nodes = recordBatch.getNodes().iterator(); - for (FieldVector fieldVector: root.getFieldVectors()) { + for (FieldVector fieldVector : root.getFieldVectors()) { loadBuffers(fieldVector, fieldVector.getField(), buffers, nodes); } root.setRowCount(recordBatch.getLength()); @@ -77,7 +80,7 @@ private void loadBuffers(FieldVector vector, Field field, Iterator buf vector.loadFieldBuffers(fieldNode, ownBuffers); } catch (RuntimeException e) { throw new IllegalArgumentException("Could not load buffers for field " + - field + ". error message: " + e.getMessage(), e); + field + ". error message: " + e.getMessage(), e); } List children = field.getChildren(); if (children.size() > 0) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java index 73deb0b3a426e..0c8868cad55b5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
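
The allocate, write, setValueCount, read contract spelled out in the ValueVector javadoc above is easiest to see end to end. A minimal sketch follows; the generated NullableIntVector class and its constructor are assumptions from this era's generated vectors, while the Accessor/Mutator split matches the interfaces in the hunk above.

    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.NullableIntVector;

    // Sketch only: NullableIntVector is assumed from the generated vectors.
    RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    NullableIntVector vector = new NullableIntVector("ints", allocator);
    vector.allocateNew();                     // allocate buffers before writing
    vector.getMutator().setSafe(0, 7);        // write values in order
    vector.getMutator().setSafe(1, 11);
    vector.getMutator().setValueCount(2);     // must precede any read
    int first = vector.getAccessor().get(0);  // 7; reading ends the write phase
    vector.close();
    allocator.close();
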
*/ + package org.apache.arrow.vector; import java.util.ArrayList; @@ -67,7 +68,7 @@ public static VectorSchemaRoot create(Schema schema, BufferAllocator allocator) } if (fieldVectors.size() != schema.getFields().size()) { throw new IllegalArgumentException("The root vector did not create the right number of children. found " + - fieldVectors.size() + " expected " + schema.getFields().size()); + fieldVectors.size() + " expected " + schema.getFields().size()); } return new VectorSchemaRoot(schema, fieldVectors, 0); } @@ -102,7 +103,7 @@ public void close() { ex = chain(ex, e); } } - if (ex!= null) { + if (ex != null) { throw ex; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java index 055857e956084..ada471e63e710 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import io.netty.buffer.ByteBuf; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java index 996524521cb68..f8385a7262a21 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.util.ArrayList; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index 01e22f2574346..cce73897718a3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
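
VectorSchemaRoot.create and VectorLoader.load above form the usual deserialization path: the loader walks the root's field vectors and the batch's nodes and buffers in lockstep. A hedged sketch; the schema and incoming batch are assumed to come from the transport layer, and, as the javadoc above notes, load does not close the batch.

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.vector.VectorLoader;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.schema.ArrowRecordBatch;
    import org.apache.arrow.vector.types.pojo.Schema;

    void loadOneBatch(Schema schema, ArrowRecordBatch batch, BufferAllocator allocator) {
      try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
        new VectorLoader(root).load(batch);  // also sets the row count on root
        int rows = root.getRowCount();
        // ... consume root's field vectors ...
      }  // caller still owns and must release `batch`
    }
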
*/ + package org.apache.arrow.vector; import java.util.Collections; @@ -42,10 +43,12 @@ public class ZeroVector implements FieldVector { private final TransferPair defaultPair = new TransferPair() { @Override - public void transfer() { } + public void transfer() { + } @Override - public void splitAndTransfer(int startIndex, int length) { } + public void splitAndTransfer(int startIndex, int length) { + } @Override public ValueVector getTo() { @@ -53,7 +56,8 @@ public ValueVector getTo() { } @Override - public void copyValueSafe(int from, int to) { } + public void copyValueSafe(int from, int to) { + } }; private final Accessor defaultAccessor = new Accessor() { @@ -80,22 +84,28 @@ public int getNullCount() { private final Mutator defaultMutator = new Mutator() { @Override - public void setValueCount(int valueCount) { } + public void setValueCount(int valueCount) { + } @Override - public void reset() { } + public void reset() { + } @Override - public void generateTestData(int values) { } + public void generateTestData(int values) { + } }; - public ZeroVector() { } + public ZeroVector() { + } @Override - public void close() { } + public void close() { + } @Override - public void clear() { } + public void clear() { + } @Override public Field getField() { @@ -144,7 +154,8 @@ public boolean allocateNewSafe() { } @Override - public void reAlloc() {} + public void reAlloc() { + } @Override public BufferAllocator getAllocator() { @@ -152,7 +163,8 @@ public BufferAllocator getAllocator() { } @Override - public void setInitialCapacity(int numRecords) { } + public void setInitialCapacity(int numRecords) { + } @Override public int getValueCapacity() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java index 2aeeca25f0e9e..db0ff86df47a9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.memory.BufferAllocator; @@ -58,6 +59,7 @@ public BufferAllocator getAllocator() { /** * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given field name if exists or null. + * * @param name the name of the child to return * @return the corresponding FieldVector */ @@ -68,9 +70,9 @@ public FieldVector getChild(String name) { /** * Clears out all underlying child vectors. */ - @Override + @Override public void close() { - for (ValueVector vector:(Iterable)this) { + for (ValueVector vector : (Iterable) this) { vector.close(); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java index 4b6d82cc8b291..26b0f90581ffc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex; import java.util.ArrayList; @@ -42,7 +43,7 @@ public abstract class AbstractMapVector extends AbstractContainerVector { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class); // Maintains a map with key as field name and value is the vector itself - private final MapWithOrdinal vectors = new MapWithOrdinal<>(); + private final MapWithOrdinal vectors = new MapWithOrdinal<>(); protected AbstractMapVector(String name, BufferAllocator allocator, CallBack callBack) { super(name, allocator, callBack); @@ -50,7 +51,7 @@ protected AbstractMapVector(String name, BufferAllocator allocator, CallBack cal @Override public void close() { - for(final ValueVector valueVector : vectors.values()) { + for (final ValueVector valueVector : vectors.values()) { valueVector.close(); } vectors.clear(); @@ -83,7 +84,7 @@ public boolean allocateNewSafe() { @Override public void reAlloc() { - for (final ValueVector v: vectors.values()) { + for (final ValueVector v : vectors.values()) { v.reAlloc(); } } @@ -94,27 +95,26 @@ public void reAlloc() { * * Execution takes place in the following order: *
- *   <li>
- *     if field is new, create and insert a new vector of desired type.
- *   </li>
- *   <li>
- *     if field exists and existing vector is of desired vector type, return the vector.
- *   </li>
- *   <li>
- *     if field exists and null filled, clear the existing vector; create and insert a new vector of desired type.
- *   </li>
- *   <li>
- *     otherwise, throw an {@link java.lang.IllegalStateException}
- *   </li>
+ *   <li>
+ *     if field is new, create and insert a new vector of desired type.
+ *   </li>
+ *   <li>
+ *     if field exists and existing vector is of desired vector type, return the vector.
+ *   </li>
+ *   <li>
+ *     if field exists and null filled, clear the existing vector; create and insert a new vector of desired type.
+ *   </li>
+ *   <li>
+ *     otherwise, throw an {@link java.lang.IllegalStateException}
+ *   </li>
 * </ul>
* * @param childName the name of the field * @param fieldType the type for the vector - * @param clazz class of expected vector type - * @param class type of expected vector type - * @throws java.lang.IllegalStateException raised if there is a hard schema change - * + * @param clazz class of expected vector type + * @param class type of expected vector type * @return resultant {@link org.apache.arrow.vector.ValueVector} + * @throws java.lang.IllegalStateException raised if there is a hard schema change */ @Override public T addOrGet(String childName, FieldType fieldType, Class clazz) { @@ -151,6 +151,7 @@ private boolean nullFilled(ValueVector vector) { /** * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given ordinal identifier. + * * @param id the ordinal of the child to return * @return the corresponding child */ @@ -161,7 +162,8 @@ public ValueVector getChildByOrdinal(int id) { /** * Returns a {@link org.apache.arrow.vector.ValueVector} instance of subtype of T corresponding to the given * field name if exists or null. - * @param name the name of the child to return + * + * @param name the name of the child to return * @param clazz the expected type of the child * @return the child corresponding to this name */ @@ -191,7 +193,8 @@ protected ValueVector add(String childName, FieldType fieldType) { * Inserts the vector with the given name if it does not exist else replaces it with the new value. * * Note that this method does not enforce any vector type check nor throws a schema change exception. - * @param name the name of the child to add + * + * @param name the name of the child to add * @param vector the vector to add as a child */ protected void putChild(String name, FieldVector vector) { @@ -200,8 +203,9 @@ protected void putChild(String name, FieldVector vector) { /** * Inserts the input vector into the map if it does not exist, replaces if it exists already - * @param name field name - * @param vector vector to be inserted + * + * @param name field name + * @param vector vector to be inserted */ protected void putVector(String name, FieldVector vector) { final ValueVector old = vectors.put( @@ -210,7 +214,7 @@ protected void putVector(String name, FieldVector vector) { ); if (old != null && old != vector) { logger.debug("Field [{}] mutated from [{}] to [{}]", name, old.getClass().getSimpleName(), - vector.getClass().getSimpleName()); + vector.getClass().getSimpleName()); } } @@ -298,7 +302,7 @@ public ArrowBuf[] getBuffers(boolean clear) { @Override public int getBufferSize() { - int actualBufSize = 0 ; + int actualBufSize = 0; for (final ValueVector v : vectors.values()) { for (final ArrowBuf buf : v.getBuffers(false)) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 3bfa8e7f7ce67..8e2877f892a64 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
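
The four-way behavior of addOrGet documented above is clearest at a call site. A sketch, assuming a MapVector named mapVector in scope, the generated NullableIntVector, and an illustrative field name:

    import org.apache.arrow.vector.NullableIntVector;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.FieldType;

    // New field: created and inserted. Existing field of the same type:
    // returned as-is. Existing but null-filled: cleared and replaced.
    // Anything else: IllegalStateException (the hard schema change case).
    NullableIntVector counts = mapVector.addOrGet(
        "count", FieldType.nullable(new ArrowType.Int(32, true)), NullableIntVector.class);
    counts.allocateNew();
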
*/ + package org.apache.arrow.vector.complex; import java.util.Collections; @@ -144,7 +145,7 @@ public void clear() { public ArrowBuf[] getBuffers(boolean clear) { final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), vector.getBuffers(false), ArrowBuf.class); if (clear) { - for (ArrowBuf buffer:buffers) { + for (ArrowBuf buffer : buffers) { buffer.retain(); } clear(); @@ -156,7 +157,7 @@ public ArrowBuf[] getBuffers(boolean clear) { * @return 1 if inner vector is explicitly set via #addOrGetVector else 0 */ public int size() { - return vector == DEFAULT_DATA_VECTOR ? 0:1; + return vector == DEFAULT_DATA_VECTOR ? 0 : 1; } public AddOrGetResult addOrGetVector(FieldType fieldType) { @@ -166,8 +167,8 @@ public AddOrGetResult addOrGetVector(FieldType fieldT // returned vector must have the same field created = true; if (callBack != null && - // not a schema change if changing from ZeroVector to ZeroVector - (fieldType.getType().getTypeID() != ArrowTypeID.Null)) { + // not a schema change if changing from ZeroVector to ZeroVector + (fieldType.getType().getTypeID() != ArrowTypeID.Null)) { callBack.doWork(); } } @@ -178,7 +179,7 @@ public AddOrGetResult addOrGetVector(FieldType fieldT throw new SchemaChangeRuntimeException(msg); } - return new AddOrGetResult<>((T)vector, created); + return new AddOrGetResult<>((T) vector, created); } protected void replaceDataVector(FieldVector v) { @@ -200,7 +201,7 @@ public int getInnerValueCount() { @Override public int getInnerValueCountAt(int index) { - return offsets.getAccessor().get(index+1) - offsets.getAccessor().get(index); + return offsets.getAccessor().get(index + 1) - offsets.getAccessor().get(index); } @Override @@ -222,15 +223,15 @@ public int startNewValue(int index) { offsets.reAlloc(); } int offset = offsets.getAccessor().get(index); - offsets.getMutator().setSafe(index+1, offset); - setValueCount(index+1); + offsets.getMutator().setSafe(index + 1, offset); + setValueCount(index + 1); return offset; } @Override public void setValueCount(int valueCount) { // TODO: populate offset end points - offsets.getMutator().setValueCount(valueCount == 0 ? 0 : valueCount+1); + offsets.getMutator().setValueCount(valueCount == 0 ? 0 : valueCount + 1); final int childValueCount = valueCount == 0 ? 0 : offsets.getAccessor().get(valueCount); vector.getMutator().setValueCount(childValueCount); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java index df699755770a5..a76fbbe11a1fb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.UInt4Vector; @@ -34,8 +35,8 @@ public EmptyValuePopulator(UInt4Vector offsets) { /** * Marks all values since the last set as empty. The last set value is obtained from underlying offsets vector. * - * @param lastIndex the last index (inclusive) in the offsets vector until which empty population takes place - * @throws java.lang.IndexOutOfBoundsException if lastIndex is negative or greater than offsets capacity. 
+ * @param lastIndex the last index (inclusive) in the offsets vector until which empty population takes place + * @throws java.lang.IndexOutOfBoundsException if lastIndex is negative or greater than offsets capacity. */ public void populate(int lastIndex) { if (lastIndex < 0) { @@ -48,7 +49,7 @@ public void populate(int lastIndex) { for (int i = lastSet; i < lastIndex; i++) { mutator.setSafe(i + 1, previousEnd); } - mutator.setValueCount(lastIndex+1); + mutator.setValueCount(lastIndex + 1); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index b6d938f3fd863..3f0f1b05b6733 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex; import static java.util.Collections.singletonList; @@ -240,7 +241,7 @@ public void clear() { public ArrowBuf[] getBuffers(boolean clear) { final ArrowBuf[] buffers = ObjectArrays.concat(bits.getBuffers(false), vector.getBuffers(false), ArrowBuf.class); if (clear) { - for (ArrowBuf buffer: buffers) { + for (ArrowBuf buffer : buffers) { buffer.retain(); } clear(); @@ -267,7 +268,7 @@ public AddOrGetResult addOrGetVector(FieldType type) // returned vector must have the same field if (!Objects.equals(vector.getField().getType(), type.getType())) { final String msg = String.format("Inner vector type mismatch. Requested type: [%s], actual type: [%s]", - type.getType(), vector.getField().getType()); + type.getType(), vector.getField().getType()); throw new SchemaChangeRuntimeException(msg); } @@ -301,7 +302,7 @@ public Object getObject(int index) { } final List vals = new JsonStringArrayList<>(listSize); final ValueVector.Accessor valuesAccessor = vector.getAccessor(); - for(int i = 0; i < listSize; i++) { + for (int i = 0; i < listSize; i++) { vals.add(valuesAccessor.getObject(index * listSize + i)); } return vals; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index fdeac3971657d..7b6b97a8ed997 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
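
The offset arithmetic running through BaseRepeatedValueVector and EmptyValuePopulator above is worth restating: value i of a repeated vector spans offsets[i] through offsets[i+1], so an inner value count is a difference of neighboring offsets, and populating "empty" values just pins every skipped offset to the previous end. A sketch of that backfill, assuming an allocated UInt4Vector named offsets, a target lastIndex, and the lastSet computation as an approximation of the original:

    // Mirrors EmptyValuePopulator.populate(lastIndex) from the hunk above.
    int lastSet = Math.max(offsets.getAccessor().getValueCount() - 1, 0);
    int previousEnd = offsets.getAccessor().get(lastSet);
    for (int i = lastSet; i < lastIndex; i++) {
      offsets.getMutator().setSafe(i + 1, previousEnd);  // cell i reads as empty
    }
    offsets.getMutator().setValueCount(lastIndex + 1);
    // Invariant afterwards, for each backfilled i:
    //   getInnerValueCountAt(i) == offsets.get(i + 1) - offsets.get(i) == 0
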
******************************************************************************/ + package org.apache.arrow.vector.complex; import static com.google.common.base.Preconditions.checkNotNull; @@ -199,7 +200,7 @@ public TransferImpl(ListVector to) { to.addOrGetVector(vector.getField().getFieldType()); } dataTransferPair = getDataVector().makeTransferPair(to.getDataVector()); - pairs = new TransferPair[] { bitsTransferPair, offsetsTransferPair, dataTransferPair }; + pairs = new TransferPair[] {bitsTransferPair, offsetsTransferPair, dataTransferPair}; } @Override @@ -316,9 +317,9 @@ public void clear() { @Override public ArrowBuf[] getBuffers(boolean clear) { final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), ObjectArrays.concat(bits.getBuffers(false), - vector.getBuffers(false), ArrowBuf.class), ArrowBuf.class); + vector.getBuffers(false), ArrowBuf.class), ArrowBuf.class); if (clear) { - for (ArrowBuf buffer:buffers) { + for (ArrowBuf buffer : buffers) { buffer.retain(); } clear(); @@ -351,7 +352,7 @@ public Object getObject(int index) { final int start = offsetsAccessor.get(index); final int end = offsetsAccessor.get(index + 1); final ValueVector.Accessor valuesAccessor = getDataVector().getAccessor(); - for(int i = start; i < end; i++) { + for (int i = start; i < end; i++) { vals.add(valuesAccessor.getObject(i)); } return vals; @@ -388,7 +389,7 @@ public int startNewValue(int index) { * End the current value * * @param index index of the value to end - * @param size number of elements in the list that was written + * @param size number of elements in the list that was written */ public void endValue(int index, int size) { offsets.getMutator().set(index + 1, offsets.getAccessor().get(index + 1) + size); @@ -414,7 +415,9 @@ public void setLastSet(int value) { lastSet = value; } - public int getLastSet() { return lastSet; } + public int getLastSet() { + return lastSet; + } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index bdd30f88f2cc2..d8d0964ab3792 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
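
ListVector's mutator brackets each list with startNewValue/endValue around writes into the data vector, and the accessor's getObject then replays offsets[i] through offsets[i+1], as the loops above show. A sketch writing [1, 2, 3] at index 0; listVector and its int data vector's mutator (dataMutator, a hypothetical handle to the child's mutator) are assumed in scope:

    ListVector.Mutator mutator = listVector.getMutator();
    int start = mutator.startNewValue(0);     // offset where this list begins
    for (int i = 0; i < 3; i++) {
      dataMutator.setSafe(start + i, i + 1);  // fill the child data vector
    }
    mutator.endValue(0, 3);                   // offsets[1] += 3 elements
    mutator.setValueCount(1);
    Object vals = listVector.getAccessor().getObject(0);  // -> [1, 2, 3]
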
*/ + package org.apache.arrow.vector.complex; import static com.google.common.base.Preconditions.checkNotNull; @@ -79,7 +80,7 @@ public FieldReader getReader() { transient private MapTransferPair ephPair; public void copyFromSafe(int fromIndex, int thisIndex, MapVector from) { - if(ephPair == null || ephPair.from != from) { + if (ephPair == null || ephPair.from != from) { ephPair = (MapTransferPair) from.makeTransferPair(this); } ephPair.copyValueSafe(fromIndex, thisIndex); @@ -107,7 +108,7 @@ public int getBufferSize() { return 0; } long buffer = 0; - for (final ValueVector v : (Iterable)this) { + for (final ValueVector v : (Iterable) this) { buffer += v.getBufferSize(); } @@ -148,7 +149,7 @@ public TransferPair getTransferPair(String ref, BufferAllocator allocator) { return new MapTransferPair(this, new MapVector(ref, allocator, fieldType, callBack), false); } - protected static class MapTransferPair implements TransferPair{ + protected static class MapTransferPair implements TransferPair { private final TransferPair[] pairs; private final MapVector from; private final MapVector to; @@ -165,7 +166,7 @@ protected MapTransferPair(MapVector from, MapVector to, boolean allocate) { int i = 0; FieldVector vector; - for (String child:from.getChildFieldNames()) { + for (String child : from.getChildFieldNames()) { int preSize = to.size(); vector = from.getChild(child); if (vector == null) { @@ -252,7 +253,7 @@ public class Accessor extends BaseValueVector.BaseAccessor { @Override public Object getObject(int index) { Map vv = new JsonStringHashMap<>(); - for (String child:getChildFieldNames()) { + for (String child : getChildFieldNames()) { ValueVector v = getChild(child); if (v != null && index < v.getAccessor().getValueCount()) { Object value = v.getAccessor().getObject(index); @@ -290,10 +291,12 @@ public void setValueCount(int valueCount) { } @Override - public void reset() { } + public void reset() { + } @Override - public void generateTestData(int values) { } + public void generateTestData(int values) { + } } @Override @@ -329,11 +332,11 @@ public void close() { valueCount = 0; super.close(); - } + } public void initializeChildrenFromFields(List children) { for (Field field : children) { - FieldVector vector = (FieldVector)this.add(field.getName(), field.getFieldType()); + FieldVector vector = (FieldVector) this.add(field.getName(), field.getFieldType()); vector.initializeChildrenFromFields(field.getChildren()); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index ee95fdef59401..e70a915561f8b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
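
MapVector.Accessor.getObject materializes row i by probing every child, as the loop above shows; only values that are non-null at that index land in the resulting map. A sketch of reading one row, with mapVector and the row index i assumed in scope:

    // The materialized view: a map keyed by child field name.
    Map<String, ?> row = (Map<String, ?>) mapVector.getAccessor().getObject(i);

    // The same traversal done by hand, mirroring the accessor above.
    for (String child : mapVector.getChildFieldNames()) {
      ValueVector v = mapVector.getChild(child);
      if (v != null && i < v.getAccessor().getValueCount()) {
        Object value = v.getAccessor().getObject(i);  // may still be null
      }
    }
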
*/ + package org.apache.arrow.vector.complex; import static com.google.common.base.Preconditions.checkNotNull; @@ -185,7 +186,7 @@ public void clear() { @Override - public int getBufferSize(){ + public int getBufferSize() { return super.getBufferSize() + bits.getBufferSize(); } @@ -229,7 +230,7 @@ public void reAlloc() { super.reAlloc(); } - public final class Accessor extends MapVector.Accessor { + public final class Accessor extends MapVector.Accessor { final BitVector.Accessor bAccessor = bits.getAccessor(); @Override @@ -257,7 +258,7 @@ public boolean isNull(int index) { return isSet(index) == 0; } - public int isSet(int index){ + public int isSet(int index) { return bAccessor.get(index); } @@ -265,15 +266,15 @@ public int isSet(int index){ public final class Mutator extends MapVector.Mutator implements NullableVectorDefinitionSetter { - private Mutator(){ + private Mutator() { } @Override - public void setIndexDefined(int index){ + public void setIndexDefined(int index) { bits.getMutator().setSafe(index, 1); } - public void setNull(int index){ + public void setNull(int index) { bits.getMutator().setSafe(index, 0); } @@ -285,13 +286,13 @@ public void setValueCount(int valueCount) { } @Override - public void generateTestData(int valueCount){ + public void generateTestData(int valueCount) { super.generateTestData(valueCount); bits.getMutator().generateTestDataAlt(valueCount); } @Override - public void reset(){ + public void reset() { bits.getMutator().setValueCount(0); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java index e1a4f36296987..f0a5174b2dce3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; public interface Positionable { public int getPosition(); + public void setPosition(int index); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java index 8b528b4ccab9b..4b19b9ffa0f9d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.AddOrGetResult; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java index 23850bc9034df..866883f532665 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; /** @@ -25,8 +26,8 @@ public interface RepeatedFixedWidthVectorLike { /** * Allocate a new memory space for this vector. 
Must be called prior to using the ValueVector. * - * @param valueCount Number of separate repeating groupings. - * @param innerValueCount Number of supported values in the vector. + * @param valueCount Number of separate repeating groupings. + * @param innerValueCount Number of supported values in the vector. */ void allocateNew(int valueCount, int innerValueCount); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java index de58eda0b11a2..91147c663f248 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.UInt4Vector; @@ -26,7 +27,6 @@ * A repeated vector contains values that may either be flat or nested. A value consists of zero or more cells(inner values). * Current design maintains data and offsets vectors. Each cell is stored in the data vector. Repeated vector * uses the offset vector to determine the sequence of cells pertaining to an individual value. - * */ public interface RepeatedValueVector extends ValueVector { @@ -51,6 +51,7 @@ public interface RepeatedValueVector extends ValueVector { interface RepeatedAccessor extends ValueVector.Accessor { /** * The result includes empty, null valued cells. + * * @return total number of cells that vector contains. */ int getInnerValueCount(); @@ -63,7 +64,7 @@ interface RepeatedAccessor extends ValueVector.Accessor { int getInnerValueCountAt(int index); /** - * @param index value index + * @param index value index * @return true if the value at the given index is empty, false otherwise. */ boolean isEmpty(int index); @@ -74,7 +75,7 @@ interface RepeatedMutator extends ValueVector.Mutator { /** * Starts a new value that is a container of cells. * - * @param index index of new value to start + * @param index index of new value to start * @return index into the child vector */ int startNewValue(int index); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java index 29f9d75c74671..1e4f54ea37209 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java @@ -15,20 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; public interface RepeatedVariableWidthVectorLike { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param totalBytes Desired size of the underlying data buffer. - * @param parentValueCount Number of separate repeating groupings. - * @param childValueCount Number of supported values in the vector. + * @param totalBytes Desired size of the underlying data buffer. + * @param parentValueCount Number of separate repeating groupings. + * @param childValueCount Number of supported values in the vector. 
*/ void allocateNew(int totalBytes, int parentValueCount, int childValueCount); /** * Provide the maximum amount of variable width bytes that can be stored int his vector. + * * @return the byte capacity */ int getByteCapacity(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java index 05a79d24295e4..627998045c93c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import java.util.Arrays; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java index d04fc1c022c05..1633b3ad09892 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.ValueVector; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java index 7c73c27ecff41..1eeced4598a55 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import java.util.Iterator; @@ -25,7 +26,7 @@ import org.apache.arrow.vector.holders.UnionHolder; -abstract class AbstractBaseReader implements FieldReader{ +abstract class AbstractBaseReader implements FieldReader { static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseReader.class); @@ -40,11 +41,11 @@ public int getPosition() { return index; } - public void setPosition(int index){ + public void setPosition(int index) { this.index = index; } - protected int idx(){ + protected int idx() { return index; } @@ -86,6 +87,6 @@ public void copyAsValue(UnionWriter writer) { @Override public void copyAsValue(ListWriter writer) { - ComplexCopier.copy(this, (FieldWriter)writer); + ComplexCopier.copy(this, (FieldWriter) writer); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java index 13a0a6bd9e28f..2f224fe3a5b7a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.writer.FieldWriter; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java index 6851d6d45d562..a6960238b9165 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.ListVector; @@ -38,13 +39,15 @@ public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWri private final boolean unionEnabled; private final NullableMapWriterFactory nullableMapWriterFactory; - private enum Mode { INIT, MAP, LIST }; + private enum Mode {INIT, MAP, LIST} + + ; - public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled, boolean caseSensitive){ + public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled, boolean caseSensitive) { this.name = name; this.container = container; this.unionEnabled = unionEnabled; - nullableMapWriterFactory = caseSensitive? NullableMapWriterFactory.getNullableCaseSensitiveMapWriterFactoryInstance() : + nullableMapWriterFactory = caseSensitive ? NullableMapWriterFactory.getNullableCaseSensitiveMapWriterFactoryInstance() : NullableMapWriterFactory.getNullableMapWriterFactoryInstance(); } @@ -52,7 +55,7 @@ public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled) this(name, container, unionEnabled, false); } - public ComplexWriterImpl(String name, MapVector container){ + public ComplexWriterImpl(String name, MapVector container) { this(name, container, false); } @@ -66,12 +69,12 @@ public int getValueCapacity() { return container.getValueCapacity(); } - private void check(Mode... modes){ + private void check(Mode... 
modes) { StateTool.check(mode, modes); } @Override - public void reset(){ + public void reset() { setPosition(0); } @@ -85,58 +88,58 @@ public void close() throws Exception { } @Override - public void clear(){ - switch(mode){ - case MAP: - mapRoot.clear(); - break; - case LIST: - listRoot.clear(); - break; + public void clear() { + switch (mode) { + case MAP: + mapRoot.clear(); + break; + case LIST: + listRoot.clear(); + break; } } @Override - public void setValueCount(int count){ - switch(mode){ - case MAP: - mapRoot.setValueCount(count); - break; - case LIST: - listRoot.setValueCount(count); - break; + public void setValueCount(int count) { + switch (mode) { + case MAP: + mapRoot.setValueCount(count); + break; + case LIST: + listRoot.setValueCount(count); + break; } } @Override - public void setPosition(int index){ + public void setPosition(int index) { super.setPosition(index); - switch(mode){ - case MAP: - mapRoot.setPosition(index); - break; - case LIST: - listRoot.setPosition(index); - break; + switch (mode) { + case MAP: + mapRoot.setPosition(index); + break; + case LIST: + listRoot.setPosition(index); + break; } } - public MapWriter directMap(){ + public MapWriter directMap() { Preconditions.checkArgument(name == null); - switch(mode){ + switch (mode) { - case INIT: - mapRoot = nullableMapWriterFactory.build((NullableMapVector) container); - mapRoot.setPosition(idx()); - mode = Mode.MAP; - break; + case INIT: + mapRoot = nullableMapWriterFactory.build((NullableMapVector) container); + mapRoot.setPosition(idx()); + mode = Mode.MAP; + break; - case MAP: - break; + case MAP: + break; - default: + default: check(Mode.INIT, Mode.MAP); } @@ -145,20 +148,20 @@ public MapWriter directMap(){ @Override public MapWriter rootAsMap() { - switch(mode){ + switch (mode) { - case INIT: - // TODO allow dictionaries in complex types - NullableMapVector map = container.addOrGetMap(name); - mapRoot = nullableMapWriterFactory.build(map); - mapRoot.setPosition(idx()); - mode = Mode.MAP; - break; + case INIT: + // TODO allow dictionaries in complex types + NullableMapVector map = container.addOrGetMap(name); + mapRoot = nullableMapWriterFactory.build(map); + mapRoot.setPosition(idx()); + mode = Mode.MAP; + break; - case MAP: - break; + case MAP: + break; - default: + default: check(Mode.INIT, Mode.MAP); } @@ -167,33 +170,33 @@ public MapWriter rootAsMap() { @Override public void allocate() { - if(mapRoot != null) { + if (mapRoot != null) { mapRoot.allocate(); - } else if(listRoot != null) { + } else if (listRoot != null) { listRoot.allocate(); } } @Override public ListWriter rootAsList() { - switch(mode){ - - case INIT: - int vectorCount = container.size(); - // TODO allow dictionaries in complex types - ListVector listVector = container.addOrGetList(name); - if (container.size() > vectorCount) { - listVector.allocateNew(); - } - listRoot = new UnionListWriter(listVector, nullableMapWriterFactory); - listRoot.setPosition(idx()); - mode = Mode.LIST; - break; - - case LIST: - break; - - default: + switch (mode) { + + case INIT: + int vectorCount = container.size(); + // TODO allow dictionaries in complex types + ListVector listVector = container.addOrGetList(name); + if (container.size() > vectorCount) { + listVector.allocateNew(); + } + listRoot = new UnionListWriter(listVector, nullableMapWriterFactory); + listRoot.setPosition(idx()); + mode = Mode.LIST; + break; + + case LIST: + break; + + default: check(Mode.INIT, Mode.MAP); } diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java index f8a9d4232aadc..0d860b6a04115 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.writer.BaseWriter; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java index 067716e8ea290..614c266acf147 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.MapVector; @@ -28,8 +29,8 @@ public class NullableMapReaderImpl extends SingleMapReaderImpl { private NullableMapVector nullableMapVector; public NullableMapReaderImpl(MapVector vector) { - super((NullableMapVector)vector); - this.nullableMapVector = (NullableMapVector)vector; + super((NullableMapVector) vector); + this.nullableMapVector = (NullableMapVector) vector; } @Override @@ -38,19 +39,19 @@ public Field getField() { } @Override - public void copyAsValue(MapWriter writer){ + public void copyAsValue(MapWriter writer) { NullableMapWriter impl = (NullableMapWriter) writer; impl.container.copyFromSafe(idx(), impl.idx(), nullableMapVector); } @Override - public void copyAsField(String name, MapWriter writer){ + public void copyAsField(String name, MapWriter writer) { NullableMapWriter impl = (NullableMapWriter) writer.map(name); impl.container.copyFromSafe(idx(), impl.idx(), nullableMapVector); } @Override - public boolean isSet(){ + public boolean isSet() { return !nullableMapVector.getAccessor().isNull(idx()); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java index d932cfb3e1287..d2dcb2374d0e7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java @@ -15,28 +15,29 @@ * See the License for the specific language governing permissions and * limitations under the License. 
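
ComplexWriterImpl's Mode state machine above (INIT, then MAP or LIST, enforced through StateTool.check) reduces to: choose a root representation once, then write through it. A sketch writing two rows of the form {n: ...}; the container vector and the scalar writer accessors (integer(...).writeInt) are assumptions from this era's writer API.

    ComplexWriterImpl writer = new ComplexWriterImpl("root", parentMapVector);
    MapWriter map = writer.rootAsMap();  // INIT -> MAP; rootAsList() would now fail check()
    for (int row = 0; row < 2; row++) {
      map.setPosition(row);
      map.start();
      map.integer("n").writeInt(row + 1);  // assumed scalar writer accessor
      map.end();
    }
    writer.setValueCount(2);
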
*/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.NullableMapVector; public class NullableMapWriterFactory { - private final boolean caseSensitive; - private static final NullableMapWriterFactory nullableMapWriterFactory = new NullableMapWriterFactory(false); - private static final NullableMapWriterFactory nullableCaseSensitiveWriterFactory = new NullableMapWriterFactory(true); + private final boolean caseSensitive; + private static final NullableMapWriterFactory nullableMapWriterFactory = new NullableMapWriterFactory(false); + private static final NullableMapWriterFactory nullableCaseSensitiveWriterFactory = new NullableMapWriterFactory(true); - public NullableMapWriterFactory(boolean caseSensitive) { - this.caseSensitive = caseSensitive; - } + public NullableMapWriterFactory(boolean caseSensitive) { + this.caseSensitive = caseSensitive; + } - public NullableMapWriter build(NullableMapVector container) { - return this.caseSensitive? new NullableCaseSensitiveMapWriter(container) : new NullableMapWriter(container); - } + public NullableMapWriter build(NullableMapVector container) { + return this.caseSensitive ? new NullableCaseSensitiveMapWriter(container) : new NullableMapWriter(container); + } - public static NullableMapWriterFactory getNullableMapWriterFactoryInstance() { - return nullableMapWriterFactory; - } + public static NullableMapWriterFactory getNullableMapWriterFactoryInstance() { + return nullableMapWriterFactory; + } - public static NullableMapWriterFactory getNullableCaseSensitiveMapWriterFactoryInstance() { - return nullableCaseSensitiveWriterFactory; - } + public static NullableMapWriterFactory getNullableCaseSensitiveMapWriterFactoryInstance() { + return nullableCaseSensitiveWriterFactory; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index d16718e75a701..9722196ed7cd2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.FieldVector; @@ -120,7 +121,7 @@ public void setPosition(int index) { protected FieldWriter getWriter(MinorType type) { if (state == State.UNION) { - ((UnionWriter)writer).getWriter(type); + ((UnionWriter) writer).getWriter(type); } else if (state == State.UNTYPED) { if (type == null) { // ??? 
@@ -132,7 +133,7 @@ protected FieldWriter getWriter(MinorType type) { writer.setPosition(position); } else if (type != this.type) { promoteToUnion(); - ((UnionWriter)writer).getWriter(type); + ((UnionWriter) writer).getWriter(type); } return writer; } @@ -157,7 +158,7 @@ private FieldWriter promoteToUnion() { } else if (listVector != null) { unionVector = listVector.promoteToUnion(); } - unionVector.addVector((FieldVector)tp.getTo()); + unionVector.addVector((FieldVector) tp.getTo()); writer = new UnionWriter(unionVector, nullableMapWriterFactory); writer.setPosition(idx()); for (int i = 0; i <= idx(); i++) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java index b8f58658eae15..f2b46ab98db7f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java @@ -17,6 +17,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex.impl; @@ -27,7 +28,7 @@ import org.apache.arrow.vector.types.Types.MinorType; @SuppressWarnings("unused") -public class SingleListReaderImpl extends AbstractFieldReader{ +public class SingleListReaderImpl extends AbstractFieldReader { private final String name; private final AbstractContainerVector container; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java index 48019093e387f..3ebd0cd7dd959 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
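
PromotableWriter above starts untyped, binds to the first concrete minor type it is asked for, and on any later, different type calls promoteToUnion and re-routes writes through a UnionWriter while replaying validity into it. A sketch of what triggers promotion; the list writer, the ArrowBuf holding UTF-8 bytes, and the exact write sequence are assumptions:

    listWriter.setPosition(0);
    listWriter.startList();
    listWriter.integer().writeInt(42);             // binds the writer to INT
    listWriter.varChar().writeVarChar(0, 3, buf);  // different type: promoteToUnion()
    listWriter.endList();                          // data vector is now a union
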
******************************************************************************/ + package org.apache.arrow.vector.complex.impl; @@ -31,7 +32,7 @@ import com.google.common.collect.Maps; @SuppressWarnings("unused") -public class SingleMapReaderImpl extends AbstractFieldReader{ +public class SingleMapReaderImpl extends AbstractFieldReader { private final MapVector vector; private final Map fields = Maps.newHashMap(); @@ -40,8 +41,8 @@ public SingleMapReaderImpl(MapVector vector) { this.vector = vector; } - private void setChildrenPosition(int index){ - for(FieldReader r : fields.values()){ + private void setChildrenPosition(int index) { + for (FieldReader r : fields.values()) { r.setPosition(index); } } @@ -52,13 +53,13 @@ public Field getField() { } @Override - public FieldReader reader(String name){ + public FieldReader reader(String name) { FieldReader reader = fields.get(name); - if(reader == null){ + if (reader == null) { ValueVector child = vector.getChild(name); - if(child == null){ + if (child == null) { reader = NullReader.INSTANCE; - }else{ + } else { reader = child.getReader(); } fields.put(name, reader); @@ -68,9 +69,9 @@ public FieldReader reader(String name){ } @Override - public void setPosition(int index){ + public void setPosition(int index) { super.setPosition(index); - for(FieldReader r : fields.values()){ + for (FieldReader r : fields.values()) { r.setPosition(index); } } @@ -91,18 +92,18 @@ public boolean isSet() { } @Override - public java.util.Iterator iterator(){ + public java.util.Iterator iterator() { return vector.fieldNameIterator(); } @Override - public void copyAsValue(MapWriter writer){ + public void copyAsValue(MapWriter writer) { SingleMapWriter impl = (SingleMapWriter) writer; impl.container.copyFromSafe(idx(), impl.idx(), vector); } @Override - public void copyAsField(String name, MapWriter writer){ + public void copyAsField(String name, MapWriter writer) { SingleMapWriter impl = (SingleMapWriter) writer.map(name); impl.container.copyFromSafe(idx(), impl.idx(), vector); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java index 515d4ab8ce907..f3e9b8773f25e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.ValueVector; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java index 2bd0ca87cd074..b98c36d2bf721 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
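
SingleMapReaderImpl above caches one FieldReader per child and hands back NullReader.INSTANCE for unknown names, so lookups never return null. A sketch of positional reads; the field names are illustrative and readInteger is assumed from the ScalarReader side of FieldReader:

    FieldReader mapReader = mapVector.getReader();
    mapReader.setPosition(5);                 // cascades to cached child readers
    Integer n = mapReader.reader("n").readInteger();
    boolean present = mapReader.reader("missing").isSet();  // false, via NullReader
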
******************************************************************************/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.UInt4Vector; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java index c4eb3dc739a49..df142c7819d7c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.reader; import org.apache.arrow.vector.complex.reader.BaseReader.ListReader; @@ -24,6 +25,5 @@ import org.apache.arrow.vector.complex.reader.BaseReader.ScalarReader; - public interface FieldReader extends MapReader, ListReader, ScalarReader, RepeatedMapReader, RepeatedListReader { } \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java index ecffe0bec0e84..a2a1f5d000a8a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.writer; import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; @@ -23,5 +24,6 @@ public interface FieldWriter extends MapWriter, ListWriter, ScalarWriter { void allocate(); + void clear(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java index 0c1cadfdafdbf..c2f692035946f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.dictionary; import java.util.Objects; @@ -53,8 +54,12 @@ public String toString() { @Override public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } Dictionary that = (Dictionary) o; return Objects.equals(encoding, that.encoding) && Objects.equals(dictionary, that.dictionary); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 0f49ce61f1cdf..7e20794cbbed2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ + package org.apache.arrow.vector.dictionary; import java.lang.reflect.InvocationTargetException; @@ -39,7 +40,7 @@ public class DictionaryEncoder { /** * Dictionary encodes a vector with a provided dictionary. The dictionary must contain all values in the vector. * - * @param vector vector to encode + * @param vector vector to encode * @param dictionary dictionary used for encoding * @return dictionary encoded vector */ @@ -55,7 +56,7 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { Field valueField = vector.getField(); FieldType indexFieldType = new FieldType(valueField.isNullable(), dictionary.getEncoding().getIndexType(), - dictionary.getEncoding(), valueField.getMetadata()); + dictionary.getEncoding(), valueField.getMetadata()); Field indexField = new Field(valueField.getName(), indexFieldType, null); // vector to hold our indices (dictionary encoded values) @@ -65,11 +66,11 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { // use reflection to pull out the set method // TODO implement a common interface for int vectors Method setter = null; - for (Class c: ImmutableList.of(int.class, long.class)) { + for (Class c : ImmutableList.of(int.class, long.class)) { try { setter = mutator.getClass().getMethod("set", int.class, c); break; - } catch(NoSuchMethodException e) { + } catch (NoSuchMethodException e) { // ignore } } @@ -108,7 +109,7 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { /** * Decodes a dictionary encoded array using the provided dictionary. * - * @param indices dictionary encoded values, must be int type + * @param indices dictionary encoded values, must be int type * @param dictionary dictionary used to decode the values * @return vector with values restored from dictionary */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java index 87516c9a8fc5b..a170cea21d273 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.dictionary; import java.util.HashMap; @@ -31,7 +32,7 @@ public static class MapDictionaryProvider implements DictionaryProvider { public MapDictionaryProvider(Dictionary... dictionaries) { this.map = new HashMap<>(); - for (Dictionary dictionary: dictionaries) { + for (Dictionary dictionary : dictionaries) { put(dictionary); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java index 90fb02b059707..e1b4d6a8b215e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
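
DictionaryEncoder.encode and decode above round-trip a vector through its dictionary; note that encode reflectively locates an int/long set method on the index vector's mutator, as the loop over ImmutableList.of(int.class, long.class) shows. A sketch; the Dictionary constructor argument order, the encoding id, and the dictionaryVector/valueVector names are assumptions:

    DictionaryEncoding encoding = new DictionaryEncoding(1L, false, null);  // id 1, unordered, default index type
    Dictionary dictionary = new Dictionary(dictionaryVector, encoding);

    ValueVector indices = DictionaryEncoder.encode(valueVector, dictionary);  // int-typed indices
    ValueVector roundTrip = DictionaryEncoder.decode(indices, dictionary);    // values restored

    // Writers resolve dictionaries by encoding id through a provider:
    DictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(dictionary);
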
*/ + package org.apache.arrow.vector.file; import org.apache.arrow.flatbuf.Block; @@ -64,19 +65,25 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } ArrowBlock other = (ArrowBlock) obj; - if (bodyLength != other.bodyLength) + if (bodyLength != other.bodyLength) { return false; - if (metadataLength != other.metadataLength) + } + if (metadataLength != other.metadataLength) { return false; - if (offset != other.offset) + } + if (offset != other.offset) { return false; + } return true; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java index f4d6ada932494..d711b9c6c1e26 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -117,7 +118,7 @@ private ArrowDictionaryBatch readDictionaryBatch(SeekableReadChannel in, ArrowBlock block, BufferAllocator allocator) throws IOException { LOGGER.debug(String.format("DictionaryRecordBatch at %d, metadata: %d, body: %d", - block.getOffset(), block.getMetadataLength(), block.getBodyLength())); + block.getOffset(), block.getMetadataLength(), block.getBodyLength())); in.setPosition(block.getOffset()); ArrowDictionaryBatch batch = MessageSerializer.deserializeDictionaryBatch(in, block, allocator); if (batch == null) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java index 23d210a3ee73b..06519bc49fd1c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -47,7 +48,7 @@ protected void endInternal(WriteChannel out, List records) throws IOException { long footerStart = out.getCurrentPosition(); out.write(new ArrowFooter(schema, dictionaries, records), false); - int footerLength = (int)(out.getCurrentPosition() - footerStart); + int footerLength = (int) (out.getCurrentPosition() - footerStart); if (footerLength <= 0) { throw new InvalidArrowFileException("invalid footer"); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java index 1c0008a9184a0..1e95321fdec5b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import static org.apache.arrow.vector.schema.FBSerializables.writeAllStructsToVector; @@ -48,7 +49,7 @@ public ArrowFooter(Footer footer) { Schema.convertSchema(footer.schema()), dictionaries(footer), recordBatches(footer) - ); + ); } private static List recordBatches(Footer footer) { @@ -112,28 +113,37 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } ArrowFooter other = (ArrowFooter) obj; if (dictionaries == null) { - if (other.dictionaries != null) + if (other.dictionaries != null) { return false; - } else if (!dictionaries.equals(other.dictionaries)) + } + } else if (!dictionaries.equals(other.dictionaries)) { return false; + } if (recordBatches == null) { - if (other.recordBatches != null) + if (other.recordBatches != null) { return false; - } else if (!recordBatches.equals(other.recordBatches)) + } + } else if (!recordBatches.equals(other.recordBatches)) { return false; + } if (schema == null) { - if (other.schema != null) + if (other.schema != null) { return false; - } else if (!schema.equals(other.schema)) + } + } else if (!schema.equals(other.schema)) { return false; + } return true; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java index 99ea96b3856d5..0d2da375295fe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java index f6b104145527c..646d6feeef086 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -94,12 +95,21 @@ public boolean loadNextBatch() throws IOException { ArrowMessageVisitor visitor = new ArrowMessageVisitor() { @Override public Boolean visit(ArrowDictionaryBatch message) { - try { load(message); } finally { message.close(); } + try { + load(message); + } finally { + message.close(); + } return true; } + @Override public Boolean visit(ArrowRecordBatch message) { - try { loader.load(message); } finally { message.close(); } + try { + loader.load(message); + } finally { + message.close(); + } return false; } }; @@ -119,13 +129,15 @@ public Boolean visit(ArrowRecordBatch message) { return readBatch; } - public long bytesRead() { return in.bytesRead(); } + public long bytesRead() { + return in.bytesRead(); + } @Override public void close() throws IOException { if (initialized) { root.close(); - for (Dictionary dictionary: dictionaries.values()) { + for (Dictionary dictionary : dictionaries.values()) { dictionary.getVector().close(); } } @@ -153,7 +165,7 @@ private void initialize() throws IOException { Map dictionaries = new HashMap<>(); // Convert fields with dictionaries to have the index type - for (Field field: originalSchema.getFields()) { + for (Field field : originalSchema.getFields()) { Field updated = DictionaryUtility.toMemoryFormat(field, allocator, dictionaries); fields.add(updated); vectors.add(updated.createVector(allocator)); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java index 3b37071382ff6..b35aba5426e4a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
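The visitor above is what lets loadNextBatch() consume dictionary batches transparently (returning true to keep reading) and stop once a record batch has been loaded. A sketch of the resulting read loop from the caller's side (an assumption-laden illustration: it presumes ArrowReader exposes the loaded data through getVectorSchemaRoot(), and process() stands in for application code):

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.file.ArrowFileReader;
import org.apache.arrow.vector.file.SeekableReadChannel;

static void readAll(String path) throws IOException {
  try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
       FileInputStream in = new FileInputStream(path);
       ArrowFileReader reader =
           new ArrowFileReader(new SeekableReadChannel(in.getChannel()), allocator)) {
    VectorSchemaRoot root = reader.getVectorSchemaRoot();
    // loadNextBatch() returns false once the stream is exhausted;
    // dictionary batches are absorbed internally by the visitor above.
    while (reader.loadNextBatch()) {
      process(root); // hypothetical consumer of the currently loaded batch
    }
  }
}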
*/ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -60,9 +61,9 @@ public abstract class ArrowWriter implements AutoCloseable { /** * Note: fields are not closed when the writer is closed * - * @param root the vectors to write to the output + * @param root the vectors to write to the output * @param provider where to find the dictionaries - * @param out the output where to write + * @param out the output where to write */ protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { this.unloader = new VectorUnloader(root); @@ -72,13 +73,13 @@ protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, Writab Set dictionaryIdsUsed = new HashSet<>(); // Convert fields with dictionaries to have dictionary type - for (Field field: root.getSchema().getFields()) { + for (Field field : root.getSchema().getFields()) { fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed)); } // Create a record batch for each dictionary this.dictionaries = new ArrayList<>(dictionaryIdsUsed.size()); - for (long id: dictionaryIdsUsed) { + for (long id : dictionaryIdsUsed) { Dictionary dictionary = provider.lookup(id); FieldVector vector = dictionary.getVector(); int count = vector.getAccessor().getValueCount(); @@ -105,7 +106,7 @@ public void writeBatch() throws IOException { protected void writeRecordBatch(ArrowRecordBatch batch) throws IOException { ArrowBlock block = MessageSerializer.serialize(out, batch); LOGGER.debug(String.format("RecordBatch at %d, metadata: %d, body: %d", - block.getOffset(), block.getMetadataLength(), block.getBodyLength())); + block.getOffset(), block.getMetadataLength(), block.getBodyLength())); recordBlocks.add(block); } @@ -114,7 +115,9 @@ public void end() throws IOException { ensureEnded(); } - public long bytesWritten() { return out.getCurrentPosition(); } + public long bytesWritten() { + return out.getCurrentPosition(); + } private void ensureStarted() throws IOException { if (!started) { @@ -128,7 +131,7 @@ private void ensureStarted() throws IOException { try { ArrowBlock block = MessageSerializer.serialize(out, batch); LOGGER.debug(String.format("DictionaryRecordBatch at %d, metadata: %d, body: %d", - block.getOffset(), block.getMetadataLength(), block.getBodyLength())); + block.getOffset(), block.getMetadataLength(), block.getBodyLength())); dictionaryBlocks.add(block); } finally { batch.close(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java b/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java index 3ec75dcb12a2b..607207f41b06c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
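For reference, the writer lifecycle set up here (lazy start via ensureStarted(), then writeBatch() and end()) looks like this from the caller's side; a minimal sketch using ArrowStreamWriter, whose constructors appear later in this diff:

import java.io.IOException;
import java.io.OutputStream;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.stream.ArrowStreamWriter;

static void writeOneBatch(VectorSchemaRoot root, DictionaryProvider provider,
                          OutputStream out) throws IOException {
  try (ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, out)) {
    // The schema and any dictionary batches are written lazily by
    // ensureStarted() on the first writeBatch() call.
    writer.writeBatch(); // serializes whatever is currently loaded in root
    writer.end();        // the stream writer then appends its end marker
  }
}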
*/ + package org.apache.arrow.vector.file; public class InvalidArrowFileException extends RuntimeException { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java index 87450e38f6852..b0eb8f3d84d9a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -37,11 +38,14 @@ public ReadChannel(ReadableByteChannel in) { this.in = in; } - public long bytesRead() { return bytesRead; } + public long bytesRead() { + return bytesRead; + } /** * Reads bytes into buffer until it is full (buffer.remaining() == 0). Returns the * number of bytes read which can be less than full if there are no more. + * * @param buffer The buffer to read to * @return the number of bytes read * @throws IOException if not enough bytes left to read */ @@ -51,9 +55,13 @@ public int readFully(ByteBuffer buffer) throws IOException { int totalRead = 0; while (buffer.remaining() != 0) { int read = in.read(buffer); - if (read < 0) return totalRead; + if (read < 0) { + return totalRead; + } totalRead += read; - if (read == 0) break; + if (read == 0) { + break; + } } this.bytesRead += totalRead; return totalRead; @@ -61,8 +69,9 @@ public int readFully(ByteBuffer buffer) throws IOException { /** * Reads up to len into buffer. Returns bytes read. + * * @param buffer the buffer to read to - * @param l the amount of bytes to read + * @param l the number of bytes to read * @return the number of bytes read * @throws IOException if not enough bytes left to read */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java index 914c3cb4b33a9..46bea1314da63 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -22,18 +23,18 @@ public class SeekableReadChannel extends ReadChannel { - private final SeekableByteChannel in; + private final SeekableByteChannel in; - public SeekableReadChannel(SeekableByteChannel in) { - super(in); - this.in = in; - } + public SeekableReadChannel(SeekableByteChannel in) { + super(in); + this.in = in; + } - public void setPosition(long position) throws IOException { - in.position(position); - } + public void setPosition(long position) throws IOException { + in.position(position); + } - public long size() throws IOException { - return in.size(); - } + public long size() throws IOException { + return in.size(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java index 42104d181a2d0..89c9d1f9b7a44 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
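A small sketch of the readFully() contract documented above (illustrative, not part of the patch): despite the @throws wording, the implementation reports a short count at end of stream rather than throwing. It assumes ReadChannel is usable in try-with-resources.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import org.apache.arrow.vector.file.ReadChannel;

static void readFullyDemo() throws IOException {
  byte[] data = {1, 2, 3};
  try (ReadChannel in = new ReadChannel(Channels.newChannel(new ByteArrayInputStream(data)))) {
    ByteBuffer buffer = ByteBuffer.allocate(8);
    int read = in.readFully(buffer); // 3: fewer bytes than requested, no exception
    long total = in.bytesRead();     // running total across reads, also 3 here
  }
}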
*/ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -62,7 +63,7 @@ public long writeZeros(int zeroCount) throws IOException { public long align() throws IOException { if (currentPosition % 8 != 0) { // align on 8 byte boundaries - return writeZeros(8 - (int)(currentPosition % 8)); + return writeZeros(8 - (int) (currentPosition % 8)); } return 0; } @@ -77,10 +78,10 @@ public long write(ByteBuffer buffer) throws IOException { public static byte[] intToBytes(int value) { byte[] outBuffer = new byte[4]; - outBuffer[3] = (byte)(value >>> 24); - outBuffer[2] = (byte)(value >>> 16); - outBuffer[1] = (byte)(value >>> 8); - outBuffer[0] = (byte)(value >>> 0); + outBuffer[3] = (byte) (value >>> 24); + outBuffer[2] = (byte) (value >>> 16); + outBuffer[1] = (byte) (value >>> 8); + outBuffer[0] = (byte) (value >>> 0); return outBuffer; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index 364d273fadae4..484a82fdaab67 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.file.json; import static com.fasterxml.jackson.core.JsonToken.END_ARRAY; @@ -114,7 +115,7 @@ public Schema start() throws JsonParseException, IOException { dictionaries = new HashMap<>(); // Convert fields with dictionaries to have the index type - for (Field field: originalSchema.getFields()) { + for (Field field : originalSchema.getFields()) { fields.add(DictionaryUtility.toMemoryFormat(field, allocator, dictionaries)); } this.schema = new Schema(fields, originalSchema.getCustomMetadata()); @@ -233,7 +234,7 @@ private void readVector(Field field, FieldVector vector) throws JsonParseExcepti BufferBacked innerVector = fieldInnerVectors.get(v); nextFieldIs(vectorType.getName()); readToken(START_ARRAY); - ValueVector valueVector = (ValueVector)innerVector; + ValueVector valueVector = (ValueVector) innerVector; valueVector.allocateNew(); Mutator mutator = valueVector.getMutator(); @@ -262,7 +263,7 @@ private void readVector(Field field, FieldVector vector) throws JsonParseExcepti readToken(END_ARRAY); } if (vector instanceof NullableMapVector) { - ((NullableMapVector)vector).valueCount = count; + ((NullableMapVector) vector).valueCount = count; } } readToken(END_OBJECT); @@ -278,96 +279,96 @@ private byte[] decodeHexSafe(String hexString) throws IOException { private void setValueFromParser(ValueVector valueVector, int i) throws IOException { switch (valueVector.getMinorType()) { - case BIT: - ((BitVector)valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 
1 : 0); - break; - case TINYINT: - ((TinyIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case SMALLINT: - ((SmallIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case INT: - ((IntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case BIGINT: - ((BigIntVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case UINT1: - ((UInt1Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT2: - ((UInt2Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT4: - ((UInt4Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT8: - ((UInt8Vector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case FLOAT4: - ((Float4Vector)valueVector).getMutator().set(i, parser.readValueAs(Float.class)); - break; - case FLOAT8: - ((Float8Vector)valueVector).getMutator().set(i, parser.readValueAs(Double.class)); - break; - case VARBINARY: - ((VarBinaryVector)valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); - break; - case VARCHAR: - ((VarCharVector)valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); - break; - case DATEDAY: - ((DateDayVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case DATEMILLI: - ((DateMilliVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESEC: - ((TimeSecVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case TIMEMILLI: - ((TimeMilliVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case TIMEMICRO: - ((TimeMicroVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMENANO: - ((TimeNanoVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPSEC: - ((TimeStampSecVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMILLI: - ((TimeStampMilliVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMICRO: - ((TimeStampMicroVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPNANO: - ((TimeStampNanoVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPSECTZ: - ((TimeStampSecTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMILLITZ: - ((TimeStampMilliTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMICROTZ: - ((TimeStampMicroTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPNANOTZ: - ((TimeStampNanoTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - default: - throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType()); + case BIT: + ((BitVector) valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 
1 : 0); + break; + case TINYINT: + ((TinyIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case SMALLINT: + ((SmallIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case INT: + ((IntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case BIGINT: + ((BigIntVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case UINT1: + ((UInt1Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT2: + ((UInt2Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT4: + ((UInt4Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT8: + ((UInt8Vector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case FLOAT4: + ((Float4Vector) valueVector).getMutator().set(i, parser.readValueAs(Float.class)); + break; + case FLOAT8: + ((Float8Vector) valueVector).getMutator().set(i, parser.readValueAs(Double.class)); + break; + case VARBINARY: + ((VarBinaryVector) valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); + break; + case VARCHAR: + ((VarCharVector) valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); + break; + case DATEDAY: + ((DateDayVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case DATEMILLI: + ((DateMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESEC: + ((TimeSecVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case TIMEMILLI: + ((TimeMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case TIMEMICRO: + ((TimeMicroVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMENANO: + ((TimeNanoVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPSEC: + ((TimeStampSecVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMILLI: + ((TimeStampMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMICRO: + ((TimeStampMicroVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPNANO: + ((TimeStampNanoVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPSECTZ: + ((TimeStampSecTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMILLITZ: + ((TimeStampMilliTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMICROTZ: + ((TimeStampMicroTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPNANOTZ: + ((TimeStampNanoTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + default: + throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType()); } } @Override public void close() throws IOException { parser.close(); - for (Dictionary dictionary: dictionaries.values()) { + for (Dictionary dictionary : dictionaries.values()) { dictionary.getVector().close(); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java index 
befa92251f0f9..a2229cef23150 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.file.json; import java.io.File; @@ -60,12 +61,15 @@ public class JsonFileWriter implements AutoCloseable { public static final class JSONWriteConfig { private final boolean pretty; + private JSONWriteConfig(boolean pretty) { this.pretty = pretty; } + private JSONWriteConfig() { this.pretty = false; } + public JSONWriteConfig pretty(boolean pretty) { return new JSONWriteConfig(pretty); } @@ -98,7 +102,7 @@ public void start(Schema schema, DictionaryProvider provider) throws IOException this.schema = schema; // Store original Schema to ensure batches written match // Convert fields with dictionaries to have dictionary type - for (Field field: schema.getFields()) { + for (Field field : schema.getFields()) { fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed)); } Schema updatedSchema = new Schema(fields, schema.getCustomMetadata()); @@ -117,7 +121,7 @@ public void start(Schema schema, DictionaryProvider provider) throws IOException private void writeDictionaryBatches(JsonGenerator generator, Set dictionaryIdsUsed, DictionaryProvider provider) throws IOException { generator.writeArrayFieldStart("dictionaries"); - for (Long id: dictionaryIdsUsed) { + for (Long id : dictionaryIdsUsed) { generator.writeStartObject(); generator.writeObjectField("id", id); @@ -170,7 +174,7 @@ private void writeVector(Field field, FieldVector vector) throws IOException { ArrowVectorType vectorType = vectorTypes.get(v); BufferBacked innerVector = fieldInnerVectors.get(v); generator.writeArrayFieldStart(vectorType.getName()); - ValueVector valueVector = (ValueVector)innerVector; + ValueVector valueVector = (ValueVector) innerVector; for (int i = 0; i < valueVector.getAccessor().getValueCount(); i++) { writeValueToGenerator(valueVector, i); } @@ -197,37 +201,37 @@ private void writeVector(Field field, FieldVector vector) throws IOException { private void writeValueToGenerator(ValueVector valueVector, int i) throws IOException { switch (valueVector.getMinorType()) { case DATEDAY: - generator.writeNumber(((DateDayVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((DateDayVector) valueVector).getAccessor().get(i)); break; case DATEMILLI: - generator.writeNumber(((DateMilliVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((DateMilliVector) valueVector).getAccessor().get(i)); break; case TIMESEC: - generator.writeNumber(((TimeSecVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeSecVector) valueVector).getAccessor().get(i)); break; case TIMEMILLI: - generator.writeNumber(((TimeMilliVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeMilliVector) valueVector).getAccessor().get(i)); break; case TIMEMICRO: - generator.writeNumber(((TimeMicroVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeMicroVector) valueVector).getAccessor().get(i)); break; case TIMENANO: - generator.writeNumber(((TimeNanoVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeNanoVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPSEC: - 
generator.writeNumber(((TimeStampSecVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampSecVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPMILLI: - generator.writeNumber(((TimeStampMilliVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampMilliVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPMICRO: - generator.writeNumber(((TimeStampMicroVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampMicroVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPNANO: - generator.writeNumber(((TimeStampNanoVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampNanoVector) valueVector).getAccessor().get(i)); break; case BIT: - generator.writeNumber(((BitVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((BitVector) valueVector).getAccessor().get(i)); break; case VARBINARY: String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i)); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java index 0f9310da55b79..9f923ee343cdc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; import org.apache.arrow.vector.complex.reader.FieldReader; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java index 83506cdc17549..d8b2317f8ff4f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java @@ -15,9 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; -public final class RepeatedListHolder implements ValueHolder{ +public final class RepeatedListHolder implements ValueHolder { public int start; public int end; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java index 85d782b381835..c400b4dc49a80 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java @@ -15,9 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.holders; -public final class RepeatedMapHolder implements ValueHolder{ +public final class RepeatedMapHolder implements ValueHolder { public int start; public int end; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java index b1b695e58a954..46cf4c8e8c712 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; import org.apache.arrow.vector.complex.reader.FieldReader; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java index 16777c806ec2d..4d012635e548a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java index 4e2e200d67645..d8c9e3001d0a5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import org.apache.arrow.flatbuf.Buffer; @@ -58,19 +59,25 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } ArrowBuffer other = (ArrowBuffer) obj; - if (offset != other.offset) + if (offset != other.offset) { return false; - if (page != other.page) + } + if (page != other.page) { return false; - if (size != other.size) + } + if (size != other.size) { return false; + } return true; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java index 901877b7058cd..635fa3fb42307 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.schema; import com.google.flatbuffers.FlatBufferBuilder; @@ -22,39 +23,48 @@ public class ArrowDictionaryBatch implements ArrowMessage { - private final long dictionaryId; - private final ArrowRecordBatch dictionary; - - public ArrowDictionaryBatch(long dictionaryId, ArrowRecordBatch dictionary) { - this.dictionaryId = dictionaryId; - this.dictionary = dictionary; - } - - public long getDictionaryId() { return dictionaryId; } - public ArrowRecordBatch getDictionary() { return dictionary; } - - @Override - public int writeTo(FlatBufferBuilder builder) { - int dataOffset = dictionary.writeTo(builder); - DictionaryBatch.startDictionaryBatch(builder); - DictionaryBatch.addId(builder, dictionaryId); - DictionaryBatch.addData(builder, dataOffset); - return DictionaryBatch.endDictionaryBatch(builder); - } - - @Override - public int computeBodyLength() { return dictionary.computeBodyLength(); } - - @Override - public T accepts(ArrowMessageVisitor visitor) { return visitor.visit(this); } - - @Override - public String toString() { - return "ArrowDictionaryBatch [dictionaryId=" + dictionaryId + ", dictionary=" + dictionary + "]"; - } - - @Override - public void close() { - dictionary.close(); - } + private final long dictionaryId; + private final ArrowRecordBatch dictionary; + + public ArrowDictionaryBatch(long dictionaryId, ArrowRecordBatch dictionary) { + this.dictionaryId = dictionaryId; + this.dictionary = dictionary; + } + + public long getDictionaryId() { + return dictionaryId; + } + + public ArrowRecordBatch getDictionary() { + return dictionary; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + int dataOffset = dictionary.writeTo(builder); + DictionaryBatch.startDictionaryBatch(builder); + DictionaryBatch.addId(builder, dictionaryId); + DictionaryBatch.addData(builder, dataOffset); + return DictionaryBatch.endDictionaryBatch(builder); + } + + @Override + public int computeBodyLength() { + return dictionary.computeBodyLength(); + } + + @Override + public T accepts(ArrowMessageVisitor visitor) { + return visitor.visit(this); + } + + @Override + public String toString() { + return "ArrowDictionaryBatch [dictionaryId=" + dictionaryId + ", dictionary=" + dictionary + "]"; + } + + @Override + public void close() { + dictionary.close(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java index 72ce982f2e7ee..3ed384ed7e280 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.schema; import org.apache.arrow.flatbuf.FieldNode; @@ -34,7 +35,7 @@ public ArrowFieldNode(int length, int nullCount) { @Override public int writeTo(FlatBufferBuilder builder) { - return FieldNode.createFieldNode(builder, (long)length, (long)nullCount); + return FieldNode.createFieldNode(builder, (long) length, (long) nullCount); } public int getNullCount() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java index d307428889b0f..f59b4b6c1721e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java @@ -15,16 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; public interface ArrowMessage extends FBSerializable, AutoCloseable { - public int computeBodyLength(); + public int computeBodyLength(); + + public T accepts(ArrowMessageVisitor visitor); - public T accepts(ArrowMessageVisitor visitor); + public static interface ArrowMessageVisitor { + public T visit(ArrowDictionaryBatch message); - public static interface ArrowMessageVisitor { - public T visit(ArrowDictionaryBatch message); - public T visit(ArrowRecordBatch message); - } + public T visit(ArrowRecordBatch message); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java index 6a716fa138a7a..d2f3782469597 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
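ArrowMessage relies on double dispatch: accepts() hands the concrete message back to a typed visitor, sparing callers the instanceof checks. A minimal custom visitor might look like this (a sketch; describe() is a hypothetical helper, not part of the patch):

import org.apache.arrow.vector.schema.ArrowDictionaryBatch;
import org.apache.arrow.vector.schema.ArrowMessage;
import org.apache.arrow.vector.schema.ArrowMessage.ArrowMessageVisitor;
import org.apache.arrow.vector.schema.ArrowRecordBatch;

// Labels a message by its concrete type without any instanceof checks.
static String describe(ArrowMessage message) {
  return message.accepts(new ArrowMessageVisitor<String>() {
    @Override
    public String visit(ArrowDictionaryBatch batch) {
      return "dictionary batch, id=" + batch.getDictionaryId();
    }

    @Override
    public String visit(ArrowRecordBatch batch) {
      return "record batch, body=" + batch.computeBodyLength() + " bytes";
    }
  });
}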
*/ + package org.apache.arrow.vector.schema; import static org.apache.arrow.vector.schema.FBSerializables.writeAllStructsToVector; @@ -36,10 +37,14 @@ public class ArrowRecordBatch implements ArrowMessage { private static final Logger LOGGER = LoggerFactory.getLogger(ArrowRecordBatch.class); - /** number of records */ + /** + * number of records + */ private final int length; - /** Nodes correspond to the pre-ordered flattened logical schema */ + /** + * Nodes correspond to the pre-ordered flattened logical schema + */ private final List nodes; private final List buffers; @@ -53,8 +58,8 @@ public ArrowRecordBatch(int length, List nodes, List b } /** - * @param length how many rows in this batch - * @param nodes field level info + * @param length how many rows in this batch + * @param nodes field level info * @param buffers will be retained until this recordBatch is closed */ public ArrowRecordBatch(int length, List nodes, List buffers, boolean alignBuffers) { @@ -119,7 +124,9 @@ public int writeTo(FlatBufferBuilder builder) { } @Override - public T accepts(ArrowMessageVisitor visitor) { return visitor.visit(this); } + public T accepts(ArrowMessageVisitor visitor) { + return visitor.visit(this); + } /** * releases the buffers diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java index 68da7052f2b8b..9d2fdfaafe4aa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import java.util.Map; @@ -34,10 +35,11 @@ public class ArrowVectorType { public static final ArrowVectorType TYPE = new ArrowVectorType(VectorType.TYPE); private static final Map typeByName; + static { - ArrowVectorType[] types = { DATA, OFFSET, VALIDITY, TYPE }; + ArrowVectorType[] types = {DATA, OFFSET, VALIDITY, TYPE}; Builder builder = ImmutableMap.builder(); - for (ArrowVectorType type: types) { + for (ArrowVectorType type : types) { builder.put(type.getName(), type); } typeByName = builder.build(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java index d23ed91948e5d..91d60ea995b89 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import com.google.flatbuffers.FlatBufferBuilder; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java index 31c17ad6df02b..ae5aa555e745e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.schema; import java.util.ArrayList; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 24840ec988ac3..29407bf1ab4e1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import static java.util.Arrays.asList; @@ -61,11 +62,13 @@ public class TypeLayout { public static TypeLayout getTypeLayout(final ArrowType arrowType) { TypeLayout layout = arrowType.accept(new ArrowTypeVisitor() { - @Override public TypeLayout visit(Int type) { + @Override + public TypeLayout visit(Int type) { return newFixedWidthTypeLayout(dataVector(type.getBitWidth())); } - @Override public TypeLayout visit(Union type) { + @Override + public TypeLayout visit(Union type) { List vectors; switch (type.getMode()) { case Dense: @@ -74,12 +77,12 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { validityVector(), typeVector(), offsetVector() // offset to find the vector - ); + ); break; case Sparse: vectors = asList( typeVector() // type of the value at the index or 0 if null - ); + ); break; default: throw new UnsupportedOperationException("Unsupported Union Mode: " + type.getMode()); @@ -87,64 +90,73 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { return new TypeLayout(vectors); } - @Override public TypeLayout visit(Struct type) { + @Override + public TypeLayout visit(Struct type) { List vectors = asList( validityVector() - ); + ); return new TypeLayout(vectors); } - @Override public TypeLayout visit(Timestamp type) { + @Override + public TypeLayout visit(Timestamp type) { return newFixedWidthTypeLayout(dataVector(64)); } - @Override public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + @Override + public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { List vectors = asList( validityVector(), offsetVector() - ); + ); return new TypeLayout(vectors); } - @Override public TypeLayout visit(FixedSizeList type) { + @Override + public TypeLayout visit(FixedSizeList type) { List vectors = asList( validityVector() - ); + ); return new TypeLayout(vectors); } - @Override public TypeLayout visit(FloatingPoint type) { + @Override + public TypeLayout visit(FloatingPoint type) { int bitWidth; switch (type.getPrecision()) { - case HALF: - bitWidth = 16; - break; - case SINGLE: - bitWidth = 32; - break; - case DOUBLE: - bitWidth = 64; - break; - default: - throw new UnsupportedOperationException("Unsupported Precision: " + type.getPrecision()); + case HALF: + bitWidth = 16; + break; + case SINGLE: + bitWidth = 32; + break; + case DOUBLE: + bitWidth = 64; + break; + default: + throw new UnsupportedOperationException("Unsupported Precision: " + type.getPrecision()); } return newFixedWidthTypeLayout(dataVector(bitWidth)); } - @Override public TypeLayout visit(Decimal type) { + @Override + public TypeLayout visit(Decimal type) { // TODO: check size return newFixedWidthTypeLayout(dataVector(64)); // actually depends on the type fields } - @Override public TypeLayout visit(Bool type) { + @Override + public TypeLayout visit(Bool type) { return newFixedWidthTypeLayout(booleanVector()); } - @Override public TypeLayout 
visit(Binary type) { + @Override + public TypeLayout visit(Binary type) { return newVariableWidthTypeLayout(); } - @Override public TypeLayout visit(Utf8 type) { + @Override + public TypeLayout visit(Utf8 type) { return newVariableWidthTypeLayout(); } @@ -178,12 +190,12 @@ public TypeLayout visit(Time type) { @Override public TypeLayout visit(Interval type) { // TODO: check size switch (type.getUnit()) { - case DAY_TIME: - return newFixedWidthTypeLayout(dataVector(64)); - case YEAR_MONTH: - return newFixedWidthTypeLayout(dataVector(64)); - default: - throw new UnsupportedOperationException("Unknown unit " + type.getUnit()); + case DAY_TIME: + return newFixedWidthTypeLayout(dataVector(64)); + case YEAR_MONTH: + return newFixedWidthTypeLayout(dataVector(64)); + default: + throw new UnsupportedOperationException("Unknown unit " + type.getUnit()); } } @@ -228,12 +240,15 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } TypeLayout other = (TypeLayout) obj; return vectors.equals(other.vectors); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java index 2073795b2a199..0871baf38edaa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import static org.apache.arrow.vector.schema.ArrowVectorType.DATA; @@ -48,16 +49,16 @@ public static VectorLayout offsetVector() { public static VectorLayout dataVector(int typeBitWidth) { switch (typeBitWidth) { - case 8: - return VALUES_8; - case 16: - return VALUES_16; - case 32: - return VALUES_32; - case 64: - return VALUES_64; - default: - throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); + case 8: + return VALUES_8; + case 16: + return VALUES_16; + case 32: + return VALUES_32; + case 64: + return VALUES_64; + default: + throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); } } @@ -81,7 +82,7 @@ public static VectorLayout byteVector() { private VectorLayout(@JsonProperty("type") ArrowVectorType type, @JsonProperty("typeBitWidth") int typeBitWidth) { super(); this.type = Preconditions.checkNotNull(type); - this.typeBitWidth = (short)typeBitWidth; + this.typeBitWidth = (short) typeBitWidth; if (typeBitWidth <= 0) { throw new IllegalArgumentException("bitWidth invalid: " + typeBitWidth); } @@ -111,18 +112,22 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } VectorLayout other = (VectorLayout) obj; return type.equals(other.type) && (typeBitWidth == other.typeBitWidth); } @Override - public int writeTo(FlatBufferBuilder builder) {; + public int writeTo(FlatBufferBuilder builder) { return org.apache.arrow.flatbuf.VectorLayout.createVectorLayout(builder, typeBitWidth, type.getType()); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java
b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java index 641978a516ae4..5b6300076b6c2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.stream; import java.io.IOException; @@ -33,31 +34,33 @@ */ public class ArrowStreamReader extends ArrowReader { - /** - * Constructs a streaming read, reading bytes from 'in'. Non-blocking. - * @param in the stream to read from - * @param allocator to allocate new buffers - */ - public ArrowStreamReader(ReadableByteChannel in, BufferAllocator allocator) { - super(new ReadChannel(in), allocator); - } - - public ArrowStreamReader(InputStream in, BufferAllocator allocator) { - this(Channels.newChannel(in), allocator); - } - - /** - * Reads the schema message from the beginning of the stream. - * @param in to allocate new buffers - * @return the deserialized arrow schema - */ - @Override - protected Schema readSchema(ReadChannel in) throws IOException { - return MessageSerializer.deserializeSchema(in); - } - - @Override - protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException { - return MessageSerializer.deserializeMessageBatch(in, allocator); - } + /** + * Constructs a streaming reader, reading bytes from 'in'. Non-blocking. + * + * @param in the stream to read from + * @param allocator to allocate new buffers + */ + public ArrowStreamReader(ReadableByteChannel in, BufferAllocator allocator) { + super(new ReadChannel(in), allocator); + } + + public ArrowStreamReader(InputStream in, BufferAllocator allocator) { + this(Channels.newChannel(in), allocator); + } + + /** + * Reads the schema message from the beginning of the stream. + * + * @param in the channel to read the schema from + * @return the deserialized arrow schema + */ + @Override + protected Schema readSchema(ReadChannel in) throws IOException { + return MessageSerializer.deserializeSchema(in); + } + + @Override + protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException { + return MessageSerializer.deserializeMessageBatch(in, allocator); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java index ea29cd99804c8..b854cd2bb6e74 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ + package org.apache.arrow.vector.stream; import org.apache.arrow.memory.BufferAllocator; @@ -35,22 +36,23 @@ public class ArrowStreamWriter extends ArrowWriter { - public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, OutputStream out) { - this(root, provider, Channels.newChannel(out)); - } + public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, OutputStream out) { + this(root, provider, Channels.newChannel(out)); + } - public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { - super(root, provider, out); - } + public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { + super(root, provider, out); + } - @Override - protected void startInternal(WriteChannel out) throws IOException {} + @Override + protected void startInternal(WriteChannel out) throws IOException { + } - @Override - protected void endInternal(WriteChannel out, - Schema schema, - List dictionaries, - List records) throws IOException { - out.writeIntLittleEndian(0); - } + @Override + protected void endInternal(WriteChannel out, + Schema schema, + List dictionaries, + List records) throws IOException { + out.writeIntLittleEndian(0); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java index 05ad92ded1d52..a70d029389427 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.stream; import java.io.IOException; @@ -48,29 +49,30 @@ * Utility class for serializing Messages. Messages are all serialized in a similar way. * 1. 4 byte little endian message header prefix * 2. FB serialized Message: This includes the length of the serialized - * body and the type of the message. + * body and the type of the message. * 3. Serialized message. * * For schema messages, the serialization is simply the FB serialized Schema. * * For RecordBatch messages the serialization is: - * 1. 4 byte little endian batch metadata header - * 2. FB serialized RowBatch - * 3. Padding to align to 8 byte boundary. - * 4. serialized RowBatch buffers. + * 1. 4 byte little endian batch metadata header + * 2. FB serialized RowBatch + * 3. Padding to align to 8 byte boundary. + * 4. serialized RowBatch buffers. */ public class MessageSerializer { public static int bytesToInt(byte[] bytes) { return ((bytes[3] & 255) << 24) + - ((bytes[2] & 255) << 16) + - ((bytes[1] & 255) << 8) + - ((bytes[0] & 255) << 0); + ((bytes[2] & 255) << 16) + + ((bytes[1] & 255) << 8) + + ((bytes[0] & 255) << 0); } /** * Serialize a schema object. - * @param out where to write the schema + * + * @param out where to write the schema * @param schema the object to serialize to out * @return the resulting size of the serialized schema * @throws IOException if something went wrong */ @@ -86,6 +88,7 @@ public static long serialize(WriteChannel out, Schema schema) throws IOException /** * Deserializes a schema object. Format is from serialize().
+ * * @param in the channel to deserialize from * @return the deserialized object * @throws IOException if something went wrong @@ -106,13 +109,14 @@ public static Schema deserializeSchema(ReadChannel in) throws IOException { /** * Serializes an ArrowRecordBatch. Returns the offset and length of the written batch. - * @param out where to write the batch + * + * @param out where to write the batch * @param batch the object to serialize to out * @return the serialized block metadata * @throws IOException if something went wrong */ public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch) - throws IOException { + throws IOException { long start = out.getCurrentPosition(); int bodyLength = batch.computeBodyLength(); @@ -125,9 +129,9 @@ public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch) int metadataLength = serializedMessage.remaining(); // calculate alignment bytes so that metadata length points to the correct location after alignment - int padding = (int)((start + metadataLength + 4) % 8); + int padding = (int) ((start + metadataLength + 4) % 8); if (padding != 0) { - metadataLength += (8 - padding); + metadataLength += (8 - padding); } out.writeIntLittleEndian(metadataLength); @@ -152,7 +156,7 @@ public static long writeBatchBuffers(WriteChannel out, ArrowRecordBatch batch) t ArrowBuffer layout = buffersLayout.get(i); long startPosition = bufferStart + layout.getOffset(); if (startPosition != out.getCurrentPosition()) { - out.writeZeros((int)(startPosition - out.getCurrentPosition())); + out.writeZeros((int) (startPosition - out.getCurrentPosition())); } out.write(buffer); if (out.getCurrentPosition() != startPosition + layout.getSize()) { @@ -165,9 +169,10 @@ public static long writeBatchBuffers(WriteChannel out, ArrowRecordBatch batch) t /** * Deserializes a RecordBatch - * @param in the channel to deserialize from + * + * @param in the channel to deserialize from * @param message the object to deserialize to - * @param alloc to allocate buffers + * @param alloc to allocate buffers * @return the deserialized object * @throws IOException if something went wrong */ @@ -188,14 +193,15 @@ public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, Message me /** * Deserializes a RecordBatch knowing the size of the entire message up front. This * minimizes the number of reads to the underlying stream.
- * @param in the channel to deserialize from + * + * @param in the channel to deserialize from * @param block the object to deserialize to * @param alloc to allocate buffers * @return the deserialized object * @throws IOException if something went wrong */ public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, ArrowBlock block, - BufferAllocator alloc) throws IOException { + BufferAllocator alloc) throws IOException { // Metadata length contains integer prefix plus byte padding long totalLen = block.getMetadataLength() + block.getBodyLength(); @@ -223,37 +229,38 @@ public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, ArrowBlock // Deserializes a record batch given the Flatbuffer metadata and in-memory body public static ArrowRecordBatch deserializeRecordBatch(RecordBatch recordBatchFB, - ArrowBuf body) throws IOException { + ArrowBuf body) throws IOException { // Now read the body int nodesLength = recordBatchFB.nodesLength(); List nodes = new ArrayList<>(); for (int i = 0; i < nodesLength; ++i) { FieldNode node = recordBatchFB.nodes(i); - if ((int)node.length() != node.length() || - (int)node.nullCount() != node.nullCount()) { + if ((int) node.length() != node.length() || + (int) node.nullCount() != node.nullCount()) { throw new IOException("Cannot currently deserialize record batches with " + - "node length larger than Int.MAX_VALUE"); + "node length larger than Int.MAX_VALUE"); } - nodes.add(new ArrowFieldNode((int)node.length(), (int)node.nullCount())); + nodes.add(new ArrowFieldNode((int) node.length(), (int) node.nullCount())); } List buffers = new ArrayList<>(); for (int i = 0; i < recordBatchFB.buffersLength(); ++i) { Buffer bufferFB = recordBatchFB.buffers(i); - ArrowBuf vectorBuffer = body.slice((int)bufferFB.offset(), (int)bufferFB.length()); + ArrowBuf vectorBuffer = body.slice((int) bufferFB.offset(), (int) bufferFB.length()); buffers.add(vectorBuffer); } - if ((int)recordBatchFB.length() != recordBatchFB.length()) { + if ((int) recordBatchFB.length() != recordBatchFB.length()) { throw new IOException("Cannot currently deserialize record batches over 2GB"); } ArrowRecordBatch arrowRecordBatch = - new ArrowRecordBatch((int)recordBatchFB.length(), nodes, buffers); + new ArrowRecordBatch((int) recordBatchFB.length(), nodes, buffers); body.release(); return arrowRecordBatch; } /** * Serializes a dictionary ArrowRecordBatch. Returns the offset and length of the written batch.
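 * <p>Illustrative sketch, not part of this patch: a dictionary batch pairs a
 * dictionary id with an ordinary record batch, so given an existing
 * {@code recordBatch} and a WriteChannel {@code out} one could write:
 * <pre>{@code
 * ArrowDictionaryBatch dictBatch = new ArrowDictionaryBatch(1L, recordBatch);
 * ArrowBlock block = MessageSerializer.serialize(out, dictBatch);
 * }</pre>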
- * @param out where to serialize + * + * @param out where to serialize * @param batch the batch to serialize * @return the metadata of the serialized block * @throws IOException if something went wrong @@ -290,15 +297,16 @@ public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch) /** * Deserializes a DictionaryBatch. - * @param in where to read from + * + * @param in where to read from * @param message the message metadata to deserialize - * @param alloc the allocator for new buffers + * @param alloc the allocator for new buffers * @return the corresponding dictionary batch * @throws IOException if something went wrong */ public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in, - Message message, - BufferAllocator alloc) throws IOException { + Message message, + BufferAllocator alloc) throws IOException { DictionaryBatch dictionaryBatchFB = (DictionaryBatch) message.header(new DictionaryBatch()); int bodyLength = (int) message.bodyLength(); @@ -315,7 +323,8 @@ public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in, /** * Deserializes a DictionaryBatch knowing the size of the entire message up front. This * minimizes the number of reads to the underlying stream. - * @param in where to read from + * + * @param in where to read from * @param block block metadata for deserializing * @param alloc to allocate new buffers * @return the corresponding dictionary batch @@ -345,7 +354,7 @@ public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in, // Now read the body final ArrowBuf body = buffer.slice(block.getMetadataLength(), - (int) totalLen - block.getMetadataLength()); + (int) totalLen - block.getMetadataLength()); ArrowRecordBatch recordBatch = deserializeRecordBatch(dictionaryBatchFB.data(), body); return new ArrowDictionaryBatch(dictionaryBatchFB.id(), recordBatch); } @@ -359,22 +368,26 @@ public static ArrowMessage deserializeMessageBatch(ReadChannel in, BufferAllocat } switch (message.headerType()) { - case MessageHeader.RecordBatch: return deserializeRecordBatch(in, message, alloc); - case MessageHeader.DictionaryBatch: return deserializeDictionaryBatch(in, message, alloc); - default: throw new IOException("Unexpected message header type " + message.headerType()); + case MessageHeader.RecordBatch: + return deserializeRecordBatch(in, message, alloc); + case MessageHeader.DictionaryBatch: + return deserializeDictionaryBatch(in, message, alloc); + default: + throw new IOException("Unexpected message header type " + message.headerType()); } } /** * Serializes a message header. - * @param builder to write the flatbuf to - * @param headerType headerType field + * + * @param builder to write the flatbuf to + * @param headerType headerType field * @param headerOffset header offset field - * @param bodyLength body length field + * @param bodyLength body length field * @return the corresponding ByteBuffer */ public static ByteBuffer serializeMessage(FlatBufferBuilder builder, byte headerType, - int headerOffset, int bodyLength) { + int headerOffset, int bodyLength) { Message.startMessage(builder); Message.addHeaderType(builder, headerType); Message.addHeader(builder, headerOffset); @@ -387,9 +400,13 @@ public static ByteBuffer serializeMessage(FlatBufferBuilder builder, byte header private static Message deserializeMessage(ReadChannel in) throws IOException { // Read the message size. There is an i32 little endian prefix.
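    // Illustrative aside, not part of the patch: bytesToInt(byte[]) near the top of
    // this class decodes that prefix by hand; for a 4-byte array bytes, java.nio's
    //   ByteBuffer.wrap(bytes).order(java.nio.ByteOrder.LITTLE_ENDIAN).getInt()
    // yields the same little-endian i32 value.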
ByteBuffer buffer = ByteBuffer.allocate(4); - if (in.readFully(buffer) != 4) return null; + if (in.readFully(buffer) != 4) { + return null; + } int messageLength = bytesToInt(buffer.array()); - if (messageLength == 0) return null; + if (messageLength == 0) { + return null; + } buffer = ByteBuffer.allocate(messageLength); if (in.readFully(buffer) != messageLength) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java b/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java index e5beebffde9e4..003d3cdbceb3b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types; public enum DateUnit { @@ -22,6 +23,7 @@ public enum DateUnit { MILLISECOND(org.apache.arrow.flatbuf.DateUnit.MILLISECOND); private static final DateUnit[] valuesByFlatbufId = new DateUnit[DateUnit.values().length]; + static { for (DateUnit v : DateUnit.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java b/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java index 3206969fb7ead..ec253287b261d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types; import org.apache.arrow.flatbuf.Precision; @@ -25,6 +26,7 @@ public enum FloatingPointPrecision { DOUBLE(Precision.DOUBLE); private static final FloatingPointPrecision[] valuesByFlatbufId = new FloatingPointPrecision[FloatingPointPrecision.values().length]; + static { for (FloatingPointPrecision v : FloatingPointPrecision.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java b/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java index b3ddf1fe497de..a8157f19e1a69 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ + package org.apache.arrow.vector.types; public enum IntervalUnit { @@ -22,6 +23,7 @@ public enum IntervalUnit { DAY_TIME(org.apache.arrow.flatbuf.IntervalUnit.DAY_TIME); private static final IntervalUnit[] valuesByFlatbufId = new IntervalUnit[IntervalUnit.values().length]; + static { for (IntervalUnit v : IntervalUnit.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java b/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java index cea9866965854..1da9321fcc4ee 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types; public enum TimeUnit { @@ -24,6 +25,7 @@ public enum TimeUnit { NANOSECOND(org.apache.arrow.flatbuf.TimeUnit.NANOSECOND); private static final TimeUnit[] valuesByFlatbufId = new TimeUnit[TimeUnit.values().length]; + static { for (TimeUnit v : TimeUnit.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 6591a4b16da16..c57dd6dafe9e6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.types; import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; @@ -531,73 +532,83 @@ public final ArrowType getType() { public static MinorType getMinorTypeForArrowType(ArrowType arrowType) { return arrowType.accept(new ArrowTypeVisitor() { - @Override public MinorType visit(Null type) { + @Override + public MinorType visit(Null type) { return MinorType.NULL; } - @Override public MinorType visit(Struct type) { + @Override + public MinorType visit(Struct type) { return MinorType.MAP; } - @Override public MinorType visit(List type) { + @Override + public MinorType visit(List type) { return MinorType.LIST; } - @Override public MinorType visit(FixedSizeList type) { + @Override + public MinorType visit(FixedSizeList type) { return MinorType.FIXED_SIZE_LIST; } - @Override public MinorType visit(Union type) { + @Override + public MinorType visit(Union type) { return MinorType.UNION; } @Override public MinorType visit(Int type) { switch (type.getBitWidth()) { - case 8: - return type.getIsSigned() ? MinorType.TINYINT : MinorType.UINT1; - case 16: - return type.getIsSigned() ? MinorType.SMALLINT : MinorType.UINT2; - case 32: - return type.getIsSigned() ? MinorType.INT : MinorType.UINT4; - case 64: - return type.getIsSigned() ? MinorType.BIGINT : MinorType.UINT8; - default: - throw new IllegalArgumentException("only 8, 16, 32, 64 supported: " + type); + case 8: + return type.getIsSigned() ? MinorType.TINYINT : MinorType.UINT1; + case 16: + return type.getIsSigned() ? MinorType.SMALLINT : MinorType.UINT2; + case 32: + return type.getIsSigned() ? MinorType.INT : MinorType.UINT4; + case 64: + return type.getIsSigned() ? 
MinorType.BIGINT : MinorType.UINT8; + default: + throw new IllegalArgumentException("only 8, 16, 32, 64 supported: " + type); } } @Override public MinorType visit(FloatingPoint type) { switch (type.getPrecision()) { - case HALF: - throw new UnsupportedOperationException("NYI: " + type); - case SINGLE: - return MinorType.FLOAT4; - case DOUBLE: - return MinorType.FLOAT8; - default: - throw new IllegalArgumentException("unknown precision: " + type); + case HALF: + throw new UnsupportedOperationException("NYI: " + type); + case SINGLE: + return MinorType.FLOAT4; + case DOUBLE: + return MinorType.FLOAT8; + default: + throw new IllegalArgumentException("unknown precision: " + type); } } - @Override public MinorType visit(Utf8 type) { + @Override + public MinorType visit(Utf8 type) { return MinorType.VARCHAR; } - @Override public MinorType visit(Binary type) { + @Override + public MinorType visit(Binary type) { return MinorType.VARBINARY; } - @Override public MinorType visit(Bool type) { + @Override + public MinorType visit(Bool type) { return MinorType.BIT; } - @Override public MinorType visit(Decimal type) { + @Override + public MinorType visit(Decimal type) { return MinorType.DECIMAL; } - @Override public MinorType visit(Date type) { + @Override + public MinorType visit(Date type) { switch (type.getUnit()) { case DAY: return MinorType.DATEDAY; @@ -608,7 +619,8 @@ public MinorType visit(FloatingPoint type) { } } - @Override public MinorType visit(Time type) { + @Override + public MinorType visit(Time type) { switch (type.getUnit()) { case SECOND: return MinorType.TIMESEC; @@ -623,7 +635,8 @@ public MinorType visit(FloatingPoint type) { } } - @Override public MinorType visit(Timestamp type) { + @Override + public MinorType visit(Timestamp type) { String tz = type.getTimezone(); switch (type.getUnit()) { case SECOND: @@ -642,12 +655,12 @@ public MinorType visit(FloatingPoint type) { @Override public MinorType visit(Interval type) { switch (type.getUnit()) { - case DAY_TIME: - return MinorType.INTERVALDAY; - case YEAR_MONTH: - return MinorType.INTERVALYEAR; - default: - throw new IllegalArgumentException("unknown unit: " + type); + case DAY_TIME: + return MinorType.INTERVALDAY; + case YEAR_MONTH: + return MinorType.INTERVALYEAR; + default: + throw new IllegalArgumentException("unknown unit: " + type); } } }); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java b/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java index 8e957bc0b6e34..231e85fb2c726 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ + package org.apache.arrow.vector.types; public enum UnionMode { @@ -22,6 +23,7 @@ public enum UnionMode { Dense(org.apache.arrow.flatbuf.UnionMode.Dense); private static final UnionMode[] valuesByFlatbufId = new UnionMode[UnionMode.values().length]; + static { for (UnionMode v : UnionMode.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java index 8a0224d5564b0..2657532fa9be6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types.pojo; import java.util.Objects; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index a8f2ae5fbab8c..48e71a976c0e8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.types.pojo; @@ -166,7 +167,7 @@ public int getField(FlatBufferBuilder builder) { int layoutOffset = org.apache.arrow.flatbuf.Field.createLayoutVector(builder, buffersData); int[] metadataOffsets = new int[getMetadata().size()]; Iterator> metadataIterator = getMetadata().entrySet().iterator(); - for (int i = 0; i < metadataOffsets.length; i ++) { + for (int i = 0; i < metadataOffsets.length; i++) { Entry kv = metadataIterator.next(); int keyOffset = builder.createString(kv.getKey()); int valueOffset = builder.createString(kv.getValue()); @@ -239,11 +240,11 @@ public boolean equals(Object obj) { } Field that = (Field) obj; return Objects.equals(this.name, that.name) && - Objects.equals(this.isNullable(), that.isNullable()) && - Objects.equals(this.getType(), that.getType()) && - Objects.equals(this.getDictionary(), that.getDictionary()) && - Objects.equals(this.getMetadata(), that.getMetadata()) && - Objects.equals(this.children, that.children); + Objects.equals(this.isNullable(), that.isNullable()) && + Objects.equals(this.getType(), that.getType()) && + Objects.equals(this.getDictionary(), that.getDictionary()) && + Objects.equals(this.getMetadata(), that.getMetadata()) && + Objects.equals(this.children, that.children); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java index c8fc689cd2c9c..f0856198a4c79 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.types.pojo; import static com.google.common.base.Preconditions.checkNotNull; @@ -55,12 +56,15 @@ public FieldType(boolean nullable, ArrowType type, DictionaryEncoding dictionary public boolean isNullable() { return nullable; } + public ArrowType getType() { return type; } + public DictionaryEncoding getDictionary() { return dictionary; } + public Map getMetadata() { return metadata; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java index 82e2ef55c20c6..a87d4490060df 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.types.pojo; @@ -53,7 +54,7 @@ public class Schema { /** * @param fields the list of the fields - * @param name the name of the field to return + * @param name the name of the field to return * @return the corresponding field * @throws IllegalArgumentException if the field was not found */ @@ -146,7 +147,7 @@ public int getSchema(FlatBufferBuilder builder) { int fieldsOffset = org.apache.arrow.flatbuf.Schema.createFieldsVector(builder, fieldOffsets); int[] metadataOffsets = new int[metadata.size()]; Iterator> metadataIterator = metadata.entrySet().iterator(); - for (int i = 0; i < metadataOffsets.length; i ++) { + for (int i = 0; i < metadataOffsets.length; i++) { Entry kv = metadataIterator.next(); int keyOffset = builder.createString(kv.getKey()); int valueOffset = builder.createString(kv.getValue()); @@ -174,7 +175,7 @@ public boolean equals(Object obj) { return false; } return Objects.equals(this.fields, ((Schema) obj).fields) && - Objects.equals(this.metadata, ((Schema) obj).metadata); + Objects.equals(this.metadata, ((Schema) obj).metadata); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java index 69840fefa968b..80d4a4684c512 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.io.IOException; @@ -58,7 +59,7 @@ public long position() throws IOException { @Override public SeekableByteChannel position(final long newPosition) throws IOException { - this.position = (int)newPosition; + this.position = (int) newPosition; return this; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index 68b9fb25f2112..5fe556a552714 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
******/ + package org.apache.arrow.vector.util; import io.netty.buffer.ArrowBuf; @@ -31,15 +32,15 @@ public class ByteFunctionHelpers { /** * Helper function to check for equality of bytes in two ArrowBufs. * - * @param left Left ArrowBuf for comparison + * @param left Left ArrowBuf for comparison * @param lStart start offset in the buffer - * @param lEnd end offset in the buffer - * @param right Right ArrowBuf for comparison + * @param lEnd end offset in the buffer + * @param right Right ArrowBuf for comparison * @param rStart start offset in the buffer - * @param rEnd end offset in the buffer + * @param rEnd end offset in the buffer * @return 1 if the byte ranges are equal, 0 otherwise */ - public static final int equal(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd){ + public static final int equal(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd) { if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { left.checkBytes(lStart, lEnd); right.checkBytes(rStart, rEnd); @@ -48,7 +49,7 @@ public static final int equal(final ArrowBuf left, int lStart, int lEnd, final A } private static final int memEqual(final long laddr, int lStart, int lEnd, final long raddr, int rStart, - final int rEnd) { + final int rEnd) { int n = lEnd - lStart; if (n == rEnd - rStart) { @@ -85,15 +86,15 @@ private static final int memEqual(final long laddr, int lStart, int lEnd, final * * Function will check data before completing in the case that bounds checking is enabled. * - * @param left Left ArrowBuf to compare + * @param left Left ArrowBuf to compare * @param lStart start offset in the buffer - * @param lEnd end offset in the buffer - * @param right Right ArrowBuf to compare + * @param lEnd end offset in the buffer + * @param right Right ArrowBuf to compare * @param rStart start offset in the buffer - * @param rEnd end offset in the buffer + * @param rEnd end offset in the buffer * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise */ - public static final int compare(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd){ + public static final int compare(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd) { if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { left.checkBytes(lStart, lEnd); right.checkBytes(rStart, rEnd); @@ -140,12 +141,12 @@ private static final int memcmp(final long laddr, int lStart, int lEnd, final lo /** * Helper function to compare a set of bytes in ArrowBuf to a ByteArray.
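 * <p>A sketch of the return-value contract these helpers share (illustrative;
 * {@code buf} is assumed to hold the three bytes of "abc"):
 * <pre>{@code
 * byte[] abc = new byte[] {'a', 'b', 'c'};
 * byte[] abd = new byte[] {'a', 'b', 'd'};
 * ByteFunctionHelpers.compare(buf, 0, 3, abc, 0, 3); // 0, the ranges are equal
 * ByteFunctionHelpers.compare(buf, 0, 3, abd, 0, 3); // -1, the left input is smaller
 * }</pre>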
* - * @param left Left ArrowBuf for comparison purposes + * @param left Left ArrowBuf for comparison purposes * @param lStart start offset in the buffer - * @param lEnd end offset in the buffer - * @param right second input to be compared + * @param lEnd end offset in the buffer + * @param right second input to be compared * @param rStart start offset in the byte array - * @param rEnd end offset in the byte array + * @param rEnd end offset in the byte array * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise */ public static final int compare(final ArrowBuf left, int lStart, int lEnd, final byte[] right, int rStart, final int rEnd) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java b/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java index 249834270b3fe..38e3b78c778ea 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java index 8aad41744f673..3dd169b82357b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java @@ -33,660 +33,660 @@ public class DateUtility { - /* We have a hashmap that stores the timezone as the key and an index as the value. - * While storing the timezone in value vectors and holders, we only use this index. As we - * reconstruct the timestamp, we use this index to index through the array timezoneList - * and get the corresponding timezone and pass it to joda-time - */ + /* We have a hashmap that stores the timezone as the key and an index as the value. + * While storing the timezone in value vectors and holders, we only use this index.
As we + * reconstruct the timestamp, we use this index to index through the array timezoneList + * and get the corresponding timezone and pass it to joda-time + */ public static ObjectIntHashMap timezoneMap = new ObjectIntHashMap(); - public static String[] timezoneList = {"Africa/Abidjan", - "Africa/Accra", - "Africa/Addis_Ababa", - "Africa/Algiers", - "Africa/Asmara", - "Africa/Asmera", - "Africa/Bamako", - "Africa/Bangui", - "Africa/Banjul", - "Africa/Bissau", - "Africa/Blantyre", - "Africa/Brazzaville", - "Africa/Bujumbura", - "Africa/Cairo", - "Africa/Casablanca", - "Africa/Ceuta", - "Africa/Conakry", - "Africa/Dakar", - "Africa/Dar_es_Salaam", - "Africa/Djibouti", - "Africa/Douala", - "Africa/El_Aaiun", - "Africa/Freetown", - "Africa/Gaborone", - "Africa/Harare", - "Africa/Johannesburg", - "Africa/Juba", - "Africa/Kampala", - "Africa/Khartoum", - "Africa/Kigali", - "Africa/Kinshasa", - "Africa/Lagos", - "Africa/Libreville", - "Africa/Lome", - "Africa/Luanda", - "Africa/Lubumbashi", - "Africa/Lusaka", - "Africa/Malabo", - "Africa/Maputo", - "Africa/Maseru", - "Africa/Mbabane", - "Africa/Mogadishu", - "Africa/Monrovia", - "Africa/Nairobi", - "Africa/Ndjamena", - "Africa/Niamey", - "Africa/Nouakchott", - "Africa/Ouagadougou", - "Africa/Porto-Novo", - "Africa/Sao_Tome", - "Africa/Timbuktu", - "Africa/Tripoli", - "Africa/Tunis", - "Africa/Windhoek", - "America/Adak", - "America/Anchorage", - "America/Anguilla", - "America/Antigua", - "America/Araguaina", - "America/Argentina/Buenos_Aires", - "America/Argentina/Catamarca", - "America/Argentina/ComodRivadavia", - "America/Argentina/Cordoba", - "America/Argentina/Jujuy", - "America/Argentina/La_Rioja", - "America/Argentina/Mendoza", - "America/Argentina/Rio_Gallegos", - "America/Argentina/Salta", - "America/Argentina/San_Juan", - "America/Argentina/San_Luis", - "America/Argentina/Tucuman", - "America/Argentina/Ushuaia", - "America/Aruba", - "America/Asuncion", - "America/Atikokan", - "America/Atka", - "America/Bahia", - "America/Bahia_Banderas", - "America/Barbados", - "America/Belem", - "America/Belize", - "America/Blanc-Sablon", - "America/Boa_Vista", - "America/Bogota", - "America/Boise", - "America/Buenos_Aires", - "America/Cambridge_Bay", - "America/Campo_Grande", - "America/Cancun", - "America/Caracas", - "America/Catamarca", - "America/Cayenne", - "America/Cayman", - "America/Chicago", - "America/Chihuahua", - "America/Coral_Harbour", - "America/Cordoba", - "America/Costa_Rica", - "America/Cuiaba", - "America/Curacao", - "America/Danmarkshavn", - "America/Dawson", - "America/Dawson_Creek", - "America/Denver", - "America/Detroit", - "America/Dominica", - "America/Edmonton", - "America/Eirunepe", - "America/El_Salvador", - "America/Ensenada", - "America/Fort_Wayne", - "America/Fortaleza", - "America/Glace_Bay", - "America/Godthab", - "America/Goose_Bay", - "America/Grand_Turk", - "America/Grenada", - "America/Guadeloupe", - "America/Guatemala", - "America/Guayaquil", - "America/Guyana", - "America/Halifax", - "America/Havana", - "America/Hermosillo", - "America/Indiana/Indianapolis", - "America/Indiana/Knox", - "America/Indiana/Marengo", - "America/Indiana/Petersburg", - "America/Indiana/Tell_City", - "America/Indiana/Vevay", - "America/Indiana/Vincennes", - "America/Indiana/Winamac", - "America/Indianapolis", - "America/Inuvik", - "America/Iqaluit", - "America/Jamaica", - "America/Jujuy", - "America/Juneau", - "America/Kentucky/Louisville", - "America/Kentucky/Monticello", - "America/Knox_IN", - "America/Kralendijk", - 
"America/La_Paz", - "America/Lima", - "America/Los_Angeles", - "America/Louisville", - "America/Lower_Princes", - "America/Maceio", - "America/Managua", - "America/Manaus", - "America/Marigot", - "America/Martinique", - "America/Matamoros", - "America/Mazatlan", - "America/Mendoza", - "America/Menominee", - "America/Merida", - "America/Metlakatla", - "America/Mexico_City", - "America/Miquelon", - "America/Moncton", - "America/Monterrey", - "America/Montevideo", - "America/Montreal", - "America/Montserrat", - "America/Nassau", - "America/New_York", - "America/Nipigon", - "America/Nome", - "America/Noronha", - "America/North_Dakota/Beulah", - "America/North_Dakota/Center", - "America/North_Dakota/New_Salem", - "America/Ojinaga", - "America/Panama", - "America/Pangnirtung", - "America/Paramaribo", - "America/Phoenix", - "America/Port-au-Prince", - "America/Port_of_Spain", - "America/Porto_Acre", - "America/Porto_Velho", - "America/Puerto_Rico", - "America/Rainy_River", - "America/Rankin_Inlet", - "America/Recife", - "America/Regina", - "America/Resolute", - "America/Rio_Branco", - "America/Rosario", - "America/Santa_Isabel", - "America/Santarem", - "America/Santiago", - "America/Santo_Domingo", - "America/Sao_Paulo", - "America/Scoresbysund", - "America/Shiprock", - "America/Sitka", - "America/St_Barthelemy", - "America/St_Johns", - "America/St_Kitts", - "America/St_Lucia", - "America/St_Thomas", - "America/St_Vincent", - "America/Swift_Current", - "America/Tegucigalpa", - "America/Thule", - "America/Thunder_Bay", - "America/Tijuana", - "America/Toronto", - "America/Tortola", - "America/Vancouver", - "America/Virgin", - "America/Whitehorse", - "America/Winnipeg", - "America/Yakutat", - "America/Yellowknife", - "Antarctica/Casey", - "Antarctica/Davis", - "Antarctica/DumontDUrville", - "Antarctica/Macquarie", - "Antarctica/Mawson", - "Antarctica/McMurdo", - "Antarctica/Palmer", - "Antarctica/Rothera", - "Antarctica/South_Pole", - "Antarctica/Syowa", - "Antarctica/Vostok", - "Arctic/Longyearbyen", - "Asia/Aden", - "Asia/Almaty", - "Asia/Amman", - "Asia/Anadyr", - "Asia/Aqtau", - "Asia/Aqtobe", - "Asia/Ashgabat", - "Asia/Ashkhabad", - "Asia/Baghdad", - "Asia/Bahrain", - "Asia/Baku", - "Asia/Bangkok", - "Asia/Beirut", - "Asia/Bishkek", - "Asia/Brunei", - "Asia/Calcutta", - "Asia/Choibalsan", - "Asia/Chongqing", - "Asia/Chungking", - "Asia/Colombo", - "Asia/Dacca", - "Asia/Damascus", - "Asia/Dhaka", - "Asia/Dili", - "Asia/Dubai", - "Asia/Dushanbe", - "Asia/Gaza", - "Asia/Harbin", - "Asia/Hebron", - "Asia/Ho_Chi_Minh", - "Asia/Hong_Kong", - "Asia/Hovd", - "Asia/Irkutsk", - "Asia/Istanbul", - "Asia/Jakarta", - "Asia/Jayapura", - "Asia/Jerusalem", - "Asia/Kabul", - "Asia/Kamchatka", - "Asia/Karachi", - "Asia/Kashgar", - "Asia/Kathmandu", - "Asia/Katmandu", - "Asia/Kolkata", - "Asia/Krasnoyarsk", - "Asia/Kuala_Lumpur", - "Asia/Kuching", - "Asia/Kuwait", - "Asia/Macao", - "Asia/Macau", - "Asia/Magadan", - "Asia/Makassar", - "Asia/Manila", - "Asia/Muscat", - "Asia/Nicosia", - "Asia/Novokuznetsk", - "Asia/Novosibirsk", - "Asia/Omsk", - "Asia/Oral", - "Asia/Phnom_Penh", - "Asia/Pontianak", - "Asia/Pyongyang", - "Asia/Qatar", - "Asia/Qyzylorda", - "Asia/Rangoon", - "Asia/Riyadh", - "Asia/Saigon", - "Asia/Sakhalin", - "Asia/Samarkand", - "Asia/Seoul", - "Asia/Shanghai", - "Asia/Singapore", - "Asia/Taipei", - "Asia/Tashkent", - "Asia/Tbilisi", - "Asia/Tehran", - "Asia/Tel_Aviv", - "Asia/Thimbu", - "Asia/Thimphu", - "Asia/Tokyo", - "Asia/Ujung_Pandang", - "Asia/Ulaanbaatar", - "Asia/Ulan_Bator", - 
"Asia/Urumqi", - "Asia/Vientiane", - "Asia/Vladivostok", - "Asia/Yakutsk", - "Asia/Yekaterinburg", - "Asia/Yerevan", - "Atlantic/Azores", - "Atlantic/Bermuda", - "Atlantic/Canary", - "Atlantic/Cape_Verde", - "Atlantic/Faeroe", - "Atlantic/Faroe", - "Atlantic/Jan_Mayen", - "Atlantic/Madeira", - "Atlantic/Reykjavik", - "Atlantic/South_Georgia", - "Atlantic/St_Helena", - "Atlantic/Stanley", - "Australia/ACT", - "Australia/Adelaide", - "Australia/Brisbane", - "Australia/Broken_Hill", - "Australia/Canberra", - "Australia/Currie", - "Australia/Darwin", - "Australia/Eucla", - "Australia/Hobart", - "Australia/LHI", - "Australia/Lindeman", - "Australia/Lord_Howe", - "Australia/Melbourne", - "Australia/NSW", - "Australia/North", - "Australia/Perth", - "Australia/Queensland", - "Australia/South", - "Australia/Sydney", - "Australia/Tasmania", - "Australia/Victoria", - "Australia/West", - "Australia/Yancowinna", - "Brazil/Acre", - "Brazil/DeNoronha", - "Brazil/East", - "Brazil/West", - "CET", - "CST6CDT", - "Canada/Atlantic", - "Canada/Central", - "Canada/East-Saskatchewan", - "Canada/Eastern", - "Canada/Mountain", - "Canada/Newfoundland", - "Canada/Pacific", - "Canada/Saskatchewan", - "Canada/Yukon", - "Chile/Continental", - "Chile/EasterIsland", - "Cuba", - "EET", - "EST", - "EST5EDT", - "Egypt", - "Eire", - "Etc/GMT", - "Etc/GMT+0", - "Etc/GMT+1", - "Etc/GMT+10", - "Etc/GMT+11", - "Etc/GMT+12", - "Etc/GMT+2", - "Etc/GMT+3", - "Etc/GMT+4", - "Etc/GMT+5", - "Etc/GMT+6", - "Etc/GMT+7", - "Etc/GMT+8", - "Etc/GMT+9", - "Etc/GMT-0", - "Etc/GMT-1", - "Etc/GMT-10", - "Etc/GMT-11", - "Etc/GMT-12", - "Etc/GMT-13", - "Etc/GMT-14", - "Etc/GMT-2", - "Etc/GMT-3", - "Etc/GMT-4", - "Etc/GMT-5", - "Etc/GMT-6", - "Etc/GMT-7", - "Etc/GMT-8", - "Etc/GMT-9", - "Etc/GMT0", - "Etc/Greenwich", - "Etc/UCT", - "Etc/UTC", - "Etc/Universal", - "Etc/Zulu", - "Europe/Amsterdam", - "Europe/Andorra", - "Europe/Athens", - "Europe/Belfast", - "Europe/Belgrade", - "Europe/Berlin", - "Europe/Bratislava", - "Europe/Brussels", - "Europe/Bucharest", - "Europe/Budapest", - "Europe/Chisinau", - "Europe/Copenhagen", - "Europe/Dublin", - "Europe/Gibraltar", - "Europe/Guernsey", - "Europe/Helsinki", - "Europe/Isle_of_Man", - "Europe/Istanbul", - "Europe/Jersey", - "Europe/Kaliningrad", - "Europe/Kiev", - "Europe/Lisbon", - "Europe/Ljubljana", - "Europe/London", - "Europe/Luxembourg", - "Europe/Madrid", - "Europe/Malta", - "Europe/Mariehamn", - "Europe/Minsk", - "Europe/Monaco", - "Europe/Moscow", - "Europe/Nicosia", - "Europe/Oslo", - "Europe/Paris", - "Europe/Podgorica", - "Europe/Prague", - "Europe/Riga", - "Europe/Rome", - "Europe/Samara", - "Europe/San_Marino", - "Europe/Sarajevo", - "Europe/Simferopol", - "Europe/Skopje", - "Europe/Sofia", - "Europe/Stockholm", - "Europe/Tallinn", - "Europe/Tirane", - "Europe/Tiraspol", - "Europe/Uzhgorod", - "Europe/Vaduz", - "Europe/Vatican", - "Europe/Vienna", - "Europe/Vilnius", - "Europe/Volgograd", - "Europe/Warsaw", - "Europe/Zagreb", - "Europe/Zaporozhye", - "Europe/Zurich", - "GB", - "GB-Eire", - "GMT", - "GMT+0", - "GMT-0", - "GMT0", - "Greenwich", - "HST", - "Hongkong", - "Iceland", - "Indian/Antananarivo", - "Indian/Chagos", - "Indian/Christmas", - "Indian/Cocos", - "Indian/Comoro", - "Indian/Kerguelen", - "Indian/Mahe", - "Indian/Maldives", - "Indian/Mauritius", - "Indian/Mayotte", - "Indian/Reunion", - "Iran", - "Israel", - "Jamaica", - "Japan", - "Kwajalein", - "Libya", - "MET", - "MST", - "MST7MDT", - "Mexico/BajaNorte", - "Mexico/BajaSur", - "Mexico/General", - "NZ", - "NZ-CHAT", - 
"Navajo", - "PRC", - "PST8PDT", - "Pacific/Apia", - "Pacific/Auckland", - "Pacific/Chatham", - "Pacific/Chuuk", - "Pacific/Easter", - "Pacific/Efate", - "Pacific/Enderbury", - "Pacific/Fakaofo", - "Pacific/Fiji", - "Pacific/Funafuti", - "Pacific/Galapagos", - "Pacific/Gambier", - "Pacific/Guadalcanal", - "Pacific/Guam", - "Pacific/Honolulu", - "Pacific/Johnston", - "Pacific/Kiritimati", - "Pacific/Kosrae", - "Pacific/Kwajalein", - "Pacific/Majuro", - "Pacific/Marquesas", - "Pacific/Midway", - "Pacific/Nauru", - "Pacific/Niue", - "Pacific/Norfolk", - "Pacific/Noumea", - "Pacific/Pago_Pago", - "Pacific/Palau", - "Pacific/Pitcairn", - "Pacific/Pohnpei", - "Pacific/Ponape", - "Pacific/Port_Moresby", - "Pacific/Rarotonga", - "Pacific/Saipan", - "Pacific/Samoa", - "Pacific/Tahiti", - "Pacific/Tarawa", - "Pacific/Tongatapu", - "Pacific/Truk", - "Pacific/Wake", - "Pacific/Wallis", - "Pacific/Yap", - "Poland", - "Portugal", - "ROC", - "ROK", - "Singapore", - "Turkey", - "UCT", - "US/Alaska", - "US/Aleutian", - "US/Arizona", - "US/Central", - "US/East-Indiana", - "US/Eastern", - "US/Hawaii", - "US/Indiana-Starke", - "US/Michigan", - "US/Mountain", - "US/Pacific", - "US/Pacific-New", - "US/Samoa", - "UTC", - "Universal", - "W-SU", - "WET", - "Zulu"}; + public static String[] timezoneList = {"Africa/Abidjan", + "Africa/Accra", + "Africa/Addis_Ababa", + "Africa/Algiers", + "Africa/Asmara", + "Africa/Asmera", + "Africa/Bamako", + "Africa/Bangui", + "Africa/Banjul", + "Africa/Bissau", + "Africa/Blantyre", + "Africa/Brazzaville", + "Africa/Bujumbura", + "Africa/Cairo", + "Africa/Casablanca", + "Africa/Ceuta", + "Africa/Conakry", + "Africa/Dakar", + "Africa/Dar_es_Salaam", + "Africa/Djibouti", + "Africa/Douala", + "Africa/El_Aaiun", + "Africa/Freetown", + "Africa/Gaborone", + "Africa/Harare", + "Africa/Johannesburg", + "Africa/Juba", + "Africa/Kampala", + "Africa/Khartoum", + "Africa/Kigali", + "Africa/Kinshasa", + "Africa/Lagos", + "Africa/Libreville", + "Africa/Lome", + "Africa/Luanda", + "Africa/Lubumbashi", + "Africa/Lusaka", + "Africa/Malabo", + "Africa/Maputo", + "Africa/Maseru", + "Africa/Mbabane", + "Africa/Mogadishu", + "Africa/Monrovia", + "Africa/Nairobi", + "Africa/Ndjamena", + "Africa/Niamey", + "Africa/Nouakchott", + "Africa/Ouagadougou", + "Africa/Porto-Novo", + "Africa/Sao_Tome", + "Africa/Timbuktu", + "Africa/Tripoli", + "Africa/Tunis", + "Africa/Windhoek", + "America/Adak", + "America/Anchorage", + "America/Anguilla", + "America/Antigua", + "America/Araguaina", + "America/Argentina/Buenos_Aires", + "America/Argentina/Catamarca", + "America/Argentina/ComodRivadavia", + "America/Argentina/Cordoba", + "America/Argentina/Jujuy", + "America/Argentina/La_Rioja", + "America/Argentina/Mendoza", + "America/Argentina/Rio_Gallegos", + "America/Argentina/Salta", + "America/Argentina/San_Juan", + "America/Argentina/San_Luis", + "America/Argentina/Tucuman", + "America/Argentina/Ushuaia", + "America/Aruba", + "America/Asuncion", + "America/Atikokan", + "America/Atka", + "America/Bahia", + "America/Bahia_Banderas", + "America/Barbados", + "America/Belem", + "America/Belize", + "America/Blanc-Sablon", + "America/Boa_Vista", + "America/Bogota", + "America/Boise", + "America/Buenos_Aires", + "America/Cambridge_Bay", + "America/Campo_Grande", + "America/Cancun", + "America/Caracas", + "America/Catamarca", + "America/Cayenne", + "America/Cayman", + "America/Chicago", + "America/Chihuahua", + "America/Coral_Harbour", + "America/Cordoba", + "America/Costa_Rica", + "America/Cuiaba", + "America/Curacao", + 
"America/Danmarkshavn", + "America/Dawson", + "America/Dawson_Creek", + "America/Denver", + "America/Detroit", + "America/Dominica", + "America/Edmonton", + "America/Eirunepe", + "America/El_Salvador", + "America/Ensenada", + "America/Fort_Wayne", + "America/Fortaleza", + "America/Glace_Bay", + "America/Godthab", + "America/Goose_Bay", + "America/Grand_Turk", + "America/Grenada", + "America/Guadeloupe", + "America/Guatemala", + "America/Guayaquil", + "America/Guyana", + "America/Halifax", + "America/Havana", + "America/Hermosillo", + "America/Indiana/Indianapolis", + "America/Indiana/Knox", + "America/Indiana/Marengo", + "America/Indiana/Petersburg", + "America/Indiana/Tell_City", + "America/Indiana/Vevay", + "America/Indiana/Vincennes", + "America/Indiana/Winamac", + "America/Indianapolis", + "America/Inuvik", + "America/Iqaluit", + "America/Jamaica", + "America/Jujuy", + "America/Juneau", + "America/Kentucky/Louisville", + "America/Kentucky/Monticello", + "America/Knox_IN", + "America/Kralendijk", + "America/La_Paz", + "America/Lima", + "America/Los_Angeles", + "America/Louisville", + "America/Lower_Princes", + "America/Maceio", + "America/Managua", + "America/Manaus", + "America/Marigot", + "America/Martinique", + "America/Matamoros", + "America/Mazatlan", + "America/Mendoza", + "America/Menominee", + "America/Merida", + "America/Metlakatla", + "America/Mexico_City", + "America/Miquelon", + "America/Moncton", + "America/Monterrey", + "America/Montevideo", + "America/Montreal", + "America/Montserrat", + "America/Nassau", + "America/New_York", + "America/Nipigon", + "America/Nome", + "America/Noronha", + "America/North_Dakota/Beulah", + "America/North_Dakota/Center", + "America/North_Dakota/New_Salem", + "America/Ojinaga", + "America/Panama", + "America/Pangnirtung", + "America/Paramaribo", + "America/Phoenix", + "America/Port-au-Prince", + "America/Port_of_Spain", + "America/Porto_Acre", + "America/Porto_Velho", + "America/Puerto_Rico", + "America/Rainy_River", + "America/Rankin_Inlet", + "America/Recife", + "America/Regina", + "America/Resolute", + "America/Rio_Branco", + "America/Rosario", + "America/Santa_Isabel", + "America/Santarem", + "America/Santiago", + "America/Santo_Domingo", + "America/Sao_Paulo", + "America/Scoresbysund", + "America/Shiprock", + "America/Sitka", + "America/St_Barthelemy", + "America/St_Johns", + "America/St_Kitts", + "America/St_Lucia", + "America/St_Thomas", + "America/St_Vincent", + "America/Swift_Current", + "America/Tegucigalpa", + "America/Thule", + "America/Thunder_Bay", + "America/Tijuana", + "America/Toronto", + "America/Tortola", + "America/Vancouver", + "America/Virgin", + "America/Whitehorse", + "America/Winnipeg", + "America/Yakutat", + "America/Yellowknife", + "Antarctica/Casey", + "Antarctica/Davis", + "Antarctica/DumontDUrville", + "Antarctica/Macquarie", + "Antarctica/Mawson", + "Antarctica/McMurdo", + "Antarctica/Palmer", + "Antarctica/Rothera", + "Antarctica/South_Pole", + "Antarctica/Syowa", + "Antarctica/Vostok", + "Arctic/Longyearbyen", + "Asia/Aden", + "Asia/Almaty", + "Asia/Amman", + "Asia/Anadyr", + "Asia/Aqtau", + "Asia/Aqtobe", + "Asia/Ashgabat", + "Asia/Ashkhabad", + "Asia/Baghdad", + "Asia/Bahrain", + "Asia/Baku", + "Asia/Bangkok", + "Asia/Beirut", + "Asia/Bishkek", + "Asia/Brunei", + "Asia/Calcutta", + "Asia/Choibalsan", + "Asia/Chongqing", + "Asia/Chungking", + "Asia/Colombo", + "Asia/Dacca", + "Asia/Damascus", + "Asia/Dhaka", + "Asia/Dili", + "Asia/Dubai", + "Asia/Dushanbe", + "Asia/Gaza", + "Asia/Harbin", + "Asia/Hebron", + 
"Asia/Ho_Chi_Minh", + "Asia/Hong_Kong", + "Asia/Hovd", + "Asia/Irkutsk", + "Asia/Istanbul", + "Asia/Jakarta", + "Asia/Jayapura", + "Asia/Jerusalem", + "Asia/Kabul", + "Asia/Kamchatka", + "Asia/Karachi", + "Asia/Kashgar", + "Asia/Kathmandu", + "Asia/Katmandu", + "Asia/Kolkata", + "Asia/Krasnoyarsk", + "Asia/Kuala_Lumpur", + "Asia/Kuching", + "Asia/Kuwait", + "Asia/Macao", + "Asia/Macau", + "Asia/Magadan", + "Asia/Makassar", + "Asia/Manila", + "Asia/Muscat", + "Asia/Nicosia", + "Asia/Novokuznetsk", + "Asia/Novosibirsk", + "Asia/Omsk", + "Asia/Oral", + "Asia/Phnom_Penh", + "Asia/Pontianak", + "Asia/Pyongyang", + "Asia/Qatar", + "Asia/Qyzylorda", + "Asia/Rangoon", + "Asia/Riyadh", + "Asia/Saigon", + "Asia/Sakhalin", + "Asia/Samarkand", + "Asia/Seoul", + "Asia/Shanghai", + "Asia/Singapore", + "Asia/Taipei", + "Asia/Tashkent", + "Asia/Tbilisi", + "Asia/Tehran", + "Asia/Tel_Aviv", + "Asia/Thimbu", + "Asia/Thimphu", + "Asia/Tokyo", + "Asia/Ujung_Pandang", + "Asia/Ulaanbaatar", + "Asia/Ulan_Bator", + "Asia/Urumqi", + "Asia/Vientiane", + "Asia/Vladivostok", + "Asia/Yakutsk", + "Asia/Yekaterinburg", + "Asia/Yerevan", + "Atlantic/Azores", + "Atlantic/Bermuda", + "Atlantic/Canary", + "Atlantic/Cape_Verde", + "Atlantic/Faeroe", + "Atlantic/Faroe", + "Atlantic/Jan_Mayen", + "Atlantic/Madeira", + "Atlantic/Reykjavik", + "Atlantic/South_Georgia", + "Atlantic/St_Helena", + "Atlantic/Stanley", + "Australia/ACT", + "Australia/Adelaide", + "Australia/Brisbane", + "Australia/Broken_Hill", + "Australia/Canberra", + "Australia/Currie", + "Australia/Darwin", + "Australia/Eucla", + "Australia/Hobart", + "Australia/LHI", + "Australia/Lindeman", + "Australia/Lord_Howe", + "Australia/Melbourne", + "Australia/NSW", + "Australia/North", + "Australia/Perth", + "Australia/Queensland", + "Australia/South", + "Australia/Sydney", + "Australia/Tasmania", + "Australia/Victoria", + "Australia/West", + "Australia/Yancowinna", + "Brazil/Acre", + "Brazil/DeNoronha", + "Brazil/East", + "Brazil/West", + "CET", + "CST6CDT", + "Canada/Atlantic", + "Canada/Central", + "Canada/East-Saskatchewan", + "Canada/Eastern", + "Canada/Mountain", + "Canada/Newfoundland", + "Canada/Pacific", + "Canada/Saskatchewan", + "Canada/Yukon", + "Chile/Continental", + "Chile/EasterIsland", + "Cuba", + "EET", + "EST", + "EST5EDT", + "Egypt", + "Eire", + "Etc/GMT", + "Etc/GMT+0", + "Etc/GMT+1", + "Etc/GMT+10", + "Etc/GMT+11", + "Etc/GMT+12", + "Etc/GMT+2", + "Etc/GMT+3", + "Etc/GMT+4", + "Etc/GMT+5", + "Etc/GMT+6", + "Etc/GMT+7", + "Etc/GMT+8", + "Etc/GMT+9", + "Etc/GMT-0", + "Etc/GMT-1", + "Etc/GMT-10", + "Etc/GMT-11", + "Etc/GMT-12", + "Etc/GMT-13", + "Etc/GMT-14", + "Etc/GMT-2", + "Etc/GMT-3", + "Etc/GMT-4", + "Etc/GMT-5", + "Etc/GMT-6", + "Etc/GMT-7", + "Etc/GMT-8", + "Etc/GMT-9", + "Etc/GMT0", + "Etc/Greenwich", + "Etc/UCT", + "Etc/UTC", + "Etc/Universal", + "Etc/Zulu", + "Europe/Amsterdam", + "Europe/Andorra", + "Europe/Athens", + "Europe/Belfast", + "Europe/Belgrade", + "Europe/Berlin", + "Europe/Bratislava", + "Europe/Brussels", + "Europe/Bucharest", + "Europe/Budapest", + "Europe/Chisinau", + "Europe/Copenhagen", + "Europe/Dublin", + "Europe/Gibraltar", + "Europe/Guernsey", + "Europe/Helsinki", + "Europe/Isle_of_Man", + "Europe/Istanbul", + "Europe/Jersey", + "Europe/Kaliningrad", + "Europe/Kiev", + "Europe/Lisbon", + "Europe/Ljubljana", + "Europe/London", + "Europe/Luxembourg", + "Europe/Madrid", + "Europe/Malta", + "Europe/Mariehamn", + "Europe/Minsk", + "Europe/Monaco", + "Europe/Moscow", + "Europe/Nicosia", + "Europe/Oslo", + "Europe/Paris", + 
"Europe/Podgorica", + "Europe/Prague", + "Europe/Riga", + "Europe/Rome", + "Europe/Samara", + "Europe/San_Marino", + "Europe/Sarajevo", + "Europe/Simferopol", + "Europe/Skopje", + "Europe/Sofia", + "Europe/Stockholm", + "Europe/Tallinn", + "Europe/Tirane", + "Europe/Tiraspol", + "Europe/Uzhgorod", + "Europe/Vaduz", + "Europe/Vatican", + "Europe/Vienna", + "Europe/Vilnius", + "Europe/Volgograd", + "Europe/Warsaw", + "Europe/Zagreb", + "Europe/Zaporozhye", + "Europe/Zurich", + "GB", + "GB-Eire", + "GMT", + "GMT+0", + "GMT-0", + "GMT0", + "Greenwich", + "HST", + "Hongkong", + "Iceland", + "Indian/Antananarivo", + "Indian/Chagos", + "Indian/Christmas", + "Indian/Cocos", + "Indian/Comoro", + "Indian/Kerguelen", + "Indian/Mahe", + "Indian/Maldives", + "Indian/Mauritius", + "Indian/Mayotte", + "Indian/Reunion", + "Iran", + "Israel", + "Jamaica", + "Japan", + "Kwajalein", + "Libya", + "MET", + "MST", + "MST7MDT", + "Mexico/BajaNorte", + "Mexico/BajaSur", + "Mexico/General", + "NZ", + "NZ-CHAT", + "Navajo", + "PRC", + "PST8PDT", + "Pacific/Apia", + "Pacific/Auckland", + "Pacific/Chatham", + "Pacific/Chuuk", + "Pacific/Easter", + "Pacific/Efate", + "Pacific/Enderbury", + "Pacific/Fakaofo", + "Pacific/Fiji", + "Pacific/Funafuti", + "Pacific/Galapagos", + "Pacific/Gambier", + "Pacific/Guadalcanal", + "Pacific/Guam", + "Pacific/Honolulu", + "Pacific/Johnston", + "Pacific/Kiritimati", + "Pacific/Kosrae", + "Pacific/Kwajalein", + "Pacific/Majuro", + "Pacific/Marquesas", + "Pacific/Midway", + "Pacific/Nauru", + "Pacific/Niue", + "Pacific/Norfolk", + "Pacific/Noumea", + "Pacific/Pago_Pago", + "Pacific/Palau", + "Pacific/Pitcairn", + "Pacific/Pohnpei", + "Pacific/Ponape", + "Pacific/Port_Moresby", + "Pacific/Rarotonga", + "Pacific/Saipan", + "Pacific/Samoa", + "Pacific/Tahiti", + "Pacific/Tarawa", + "Pacific/Tongatapu", + "Pacific/Truk", + "Pacific/Wake", + "Pacific/Wallis", + "Pacific/Yap", + "Poland", + "Portugal", + "ROC", + "ROK", + "Singapore", + "Turkey", + "UCT", + "US/Alaska", + "US/Aleutian", + "US/Arizona", + "US/Central", + "US/East-Indiana", + "US/Eastern", + "US/Hawaii", + "US/Indiana-Starke", + "US/Michigan", + "US/Mountain", + "US/Pacific", + "US/Pacific-New", + "US/Samoa", + "UTC", + "Universal", + "W-SU", + "WET", + "Zulu"}; - static { - for (int i = 0; i < timezoneList.length; i++) { - timezoneMap.put(timezoneList[i], i); - } + static { + for (int i = 0; i < timezoneList.length; i++) { + timezoneMap.put(timezoneList[i], i); } + } - public static final DateTimeFormatter formatDate = DateTimeFormat.forPattern("yyyy-MM-dd"); - public static final DateTimeFormatter formatTimeStampMilli = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS"); - public static final DateTimeFormatter formatTimeStampTZ = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); - public static final DateTimeFormatter formatTime = DateTimeFormat.forPattern("HH:mm:ss.SSS"); + public static final DateTimeFormatter formatDate = DateTimeFormat.forPattern("yyyy-MM-dd"); + public static final DateTimeFormatter formatTimeStampMilli = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS"); + public static final DateTimeFormatter formatTimeStampTZ = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); + public static final DateTimeFormatter formatTime = DateTimeFormat.forPattern("HH:mm:ss.SSS"); - public static DateTimeFormatter dateTimeTZFormat = null; - public static DateTimeFormatter timeFormat = null; + public static DateTimeFormatter dateTimeTZFormat = null; + public static DateTimeFormatter timeFormat = null; - 
public static final int yearsToMonths = 12; - public static final int hoursToMillis = 60 * 60 * 1000; - public static final int minutesToMillis = 60 * 1000; - public static final int secondsToMillis = 1000; - public static final int monthToStandardDays = 30; - public static final long monthsToMillis = 2592000000L; // 30 * 24 * 60 * 60 * 1000 - public static final int daysToStandardMillis = 24 * 60 * 60 * 1000; + public static final int yearsToMonths = 12; + public static final int hoursToMillis = 60 * 60 * 1000; + public static final int minutesToMillis = 60 * 1000; + public static final int secondsToMillis = 1000; + public static final int monthToStandardDays = 30; + public static final long monthsToMillis = 2592000000L; // 30 * 24 * 60 * 60 * 1000 + public static final int daysToStandardMillis = 24 * 60 * 60 * 1000; public static int getIndex(String timezone) { - return timezoneMap.get(timezone); - } - - public static String getTimeZone(int index) { - return timezoneList[index]; - } + return timezoneMap.get(timezone); + } - // Function returns the date time formatter used to parse date strings - public static DateTimeFormatter getDateTimeFormatter() { + public static String getTimeZone(int index) { + return timezoneList[index]; + } - if (dateTimeTZFormat == null) { - DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); - DateTimeParser optionalTime = DateTimeFormat.forPattern(" HH:mm:ss").getParser(); - DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); - DateTimeParser optionalZone = DateTimeFormat.forPattern(" ZZZ").getParser(); + // Function returns the date time formatter used to parse date strings + public static DateTimeFormatter getDateTimeFormatter() { - dateTimeTZFormat = new DateTimeFormatterBuilder().append(dateFormatter).appendOptional(optionalTime).appendOptional(optionalSec).appendOptional(optionalZone).toFormatter(); - } + if (dateTimeTZFormat == null) { + DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); + DateTimeParser optionalTime = DateTimeFormat.forPattern(" HH:mm:ss").getParser(); + DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); + DateTimeParser optionalZone = DateTimeFormat.forPattern(" ZZZ").getParser(); - return dateTimeTZFormat; + dateTimeTZFormat = new DateTimeFormatterBuilder().append(dateFormatter).appendOptional(optionalTime).appendOptional(optionalSec).appendOptional(optionalZone).toFormatter(); } - // Function returns time formatter used to parse time strings - public static DateTimeFormatter getTimeFormatter() { - if (timeFormat == null) { - DateTimeFormatter timeFormatter = DateTimeFormat.forPattern("HH:mm:ss"); - DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); - timeFormat = new DateTimeFormatterBuilder().append(timeFormatter).appendOptional(optionalSec).toFormatter(); - } - return timeFormat; - } + return dateTimeTZFormat; + } - public static int monthsFromPeriod(Period period){ - return (period.getYears() * yearsToMonths) + period.getMonths(); + // Function returns time formatter used to parse time strings + public static DateTimeFormatter getTimeFormatter() { + if (timeFormat == null) { + DateTimeFormatter timeFormatter = DateTimeFormat.forPattern("HH:mm:ss"); + DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); + timeFormat = new DateTimeFormatterBuilder().append(timeFormatter).appendOptional(optionalSec).toFormatter(); } + return timeFormat; + } - public static int millisFromPeriod(final Period 
period){ - return (period.getHours() * hoursToMillis) + - (period.getMinutes() * minutesToMillis) + - (period.getSeconds() * secondsToMillis) + - (period.getMillis()); - } + public static int monthsFromPeriod(Period period) { + return (period.getYears() * yearsToMonths) + period.getMonths(); + } - public static long toMillis(LocalDateTime localDateTime) { - return LocalDateTimes.getLocalMillis(localDateTime); - } + public static int millisFromPeriod(final Period period) { + return (period.getHours() * hoursToMillis) + + (period.getMinutes() * minutesToMillis) + + (period.getSeconds() * secondsToMillis) + + (period.getMillis()); + } - public static int toMillisOfDay(final LocalDateTime localDateTime) { - return localDateTime.toDateTime(DateTimeZone.UTC).millisOfDay().get(); - } + public static long toMillis(LocalDateTime localDateTime) { + return LocalDateTimes.getLocalMillis(localDateTime); + } + + public static int toMillisOfDay(final LocalDateTime localDateTime) { + return localDateTime.toDateTime(DateTimeZone.UTC).millisOfDay().get(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 4c439b2cc1066..4b11b368dff1e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import io.netty.buffer.ArrowBuf; @@ -32,39 +33,39 @@ public class DecimalUtility { public final static int MAX_DIGITS = 9; public final static int DIGITS_BASE = 1000000000; public final static int DIGITS_MAX = 999999999; - public final static int INTEGER_SIZE = (Integer.SIZE/8); + public final static int INTEGER_SIZE = (Integer.SIZE / 8); public final static String[] decimalToString = {"", - "0", - "00", - "000", - "0000", - "00000", - "000000", - "0000000", - "00000000", - "000000000"}; + "0", + "00", + "000", + "0000", + "00000", + "000000", + "0000000", + "00000000", + "000000000"}; public final static long[] scale_long_constants = { - 1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000l, - 100000000000l, - 1000000000000l, - 10000000000000l, - 100000000000000l, - 1000000000000000l, - 10000000000000000l, - 100000000000000000l, - 1000000000000000000l}; + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000l, + 100000000000l, + 1000000000000l, + 10000000000000l, + 100000000000000l, + 1000000000000000l, + 10000000000000000l, + 100000000000000000l, + 1000000000000000000l}; public static final int DECIMAL_BYTE_LENGTH = 16; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java index 4108dc4610838..cf0596c8c1fb4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java @@ -40,7 +40,7 @@ public class DictionaryUtility { * have the dictionary type * * NOTE: in the message format, fields have the dictionary type - * in the memory format, they have the index type + * in the memory format, they have the index type */ public static Field toMessageFormat(Field field, DictionaryProvider provider, Set 
dictionaryIdsUsed) { DictionaryEncoding encoding = field.getDictionary(); @@ -51,7 +51,7 @@ public static Field toMessageFormat(Field field, DictionaryProvider provider, Se } List updatedChildren = new ArrayList<>(children.size()); - for (Field child: children) { + for (Field child : children) { updatedChildren.add(toMessageFormat(child, provider, dictionaryIdsUsed)); } @@ -85,7 +85,7 @@ public static Field toMemoryFormat(Field field, BufferAllocator allocator, Map updatedChildren = new ArrayList<>(children.size()); - for (Field child: children) { + for (Field child : children) { updatedChildren.add(toMemoryFormat(child, allocator, dictionaries)); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java index c598069c2c309..480bd76d445b0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.util.ArrayList; @@ -58,7 +59,7 @@ public boolean equals(Object obj) { public final String toString() { try { return mapper.writeValueAsString(this); - } catch(JsonProcessingException e) { + } catch (JsonProcessingException e) { throw new IllegalStateException("Cannot serialize array list to JSON string", e); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java index e8ce5221eebd9..6455389d582b9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.util.LinkedHashMap; @@ -51,14 +52,14 @@ public boolean equals(Object obj) { return false; } for (K key : this.keySet()) { - if (this.get(key) == null ) { + if (this.get(key) == null) { if (other.get(key) == null) { continue; } else { return false; } } - if ( ! this.get(key).equals(other.get(key))) { + if (!this.get(key).equals(other.get(key))) { return false; } } @@ -69,7 +70,7 @@ public boolean equals(Object obj) { public final String toString() { try { return mapper.writeValueAsString(this); - } catch(JsonProcessingException e) { + } catch (JsonProcessingException e) { throw new IllegalStateException("Cannot serialize hash map to JSON string", e); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java index b35aaa401bae4..6d3b390379a56 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.util.AbstractMap; @@ -95,16 +96,16 @@ public V get(Object key) { public V put(K key, V value) { final Entry oldPair = primary.get(key); // if key exists try replacing otherwise, assign a new ordinal identifier - final int ordinal = oldPair == null ? 
primary.size():oldPair.getKey(); + final int ordinal = oldPair == null ? primary.size() : oldPair.getKey(); primary.put(key, new AbstractMap.SimpleImmutableEntry<>(ordinal, value)); secondary.put(ordinal, value); - return oldPair==null ? null:oldPair.getValue(); + return oldPair == null ? null : oldPair.getValue(); } @Override public V remove(Object key) { final Entry oldPair = primary.remove(key); - if (oldPair!=null) { + if (oldPair != null) { final int lastOrdinal = secondary.size(); final V last = secondary.get(lastOrdinal); // normalize mappings so that all numbers until primary.size() is assigned @@ -112,7 +113,7 @@ public V remove(Object key) { secondary.put(oldPair.getKey(), last); primary.put((K) key, new AbstractMap.SimpleImmutableEntry<>(oldPair.getKey(), last)); } - return oldPair==null ? null:oldPair.getValue(); + return oldPair == null ? null : oldPair.getValue(); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java b/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java index bd7396249a72c..b4ff2522daf33 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; @@ -23,7 +24,6 @@ * {@link org.apache.arrow.memory.BufferAllocator#buffer(int) allocator}. * *
* Operators should handle this exception to split the batch and later resume the execution on the next iteration.
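Stepping back to MapWithOrdinal above: put() hands out dense ordinals (0..size-1) and remove() keeps them dense by moving the value holding the highest ordinal into the vacated slot. A minimal sketch of that swap-with-last normalization on a plain list; the class and method names are illustrative, not the Arrow implementation.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SwapWithLastDemo {
  // Remove slot i from a dense list in O(1) by moving the last element into
  // the hole -- the same normalization MapWithOrdinal.remove() applies to its
  // ordinal-to-value mapping so ordinals stay contiguous.
  static <T> void removeDense(List<T> values, int i) {
    int last = values.size() - 1;
    values.set(i, values.get(last)); // former last value now owns ordinal i
    values.remove(last);
  }

  public static void main(String[] args) {
    List<String> v = new ArrayList<>(Arrays.asList("a", "b", "c", "d"));
    removeDense(v, 1);
    System.out.println(v); // [a, d, c] -- still dense, order not preserved
  }
}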
- * */ public class OversizedAllocationException extends RuntimeException { public OversizedAllocationException() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java index c281561430707..ddfea948a8f74 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java index ce82f445ad883..15ce132fc801c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.io.DataInput; @@ -74,13 +75,16 @@ public Text() { /** * Construct from a string. + * * @param string initialize from that string */ public Text(String string) { set(string); } - /** Construct from another text. + /** + * Construct from another text. + * * @param utf8 initialize from that Text */ public Text(Text utf8) { @@ -89,6 +93,7 @@ public Text(Text utf8) { /** * Construct from a byte array. + * * @param utf8 initialize from that byte array */ public Text(byte[] utf8) { @@ -98,6 +103,7 @@ public Text(byte[] utf8) { /** * Get a copy of the bytes that is exactly the length of the data. See {@link #getBytes()} for faster access to the * underlying array. + * * @return a copy of the underlying array */ public byte[] copyBytes() { @@ -109,13 +115,16 @@ public byte[] copyBytes() { /** * Returns the raw bytes; however, only data up to {@link #getLength()} is valid. Please use {@link #copyBytes()} if * you need the returned array to be precisely the length of the data. + * * @return the underlying array */ public byte[] getBytes() { return bytes; } - /** @return the number of bytes in the byte array */ + /** + * @return the number of bytes in the byte array + */ public int getLength() { return length; } @@ -128,12 +137,10 @@ public int getLength() { * @return the Unicode scalar value at position or -1 if the position is invalid or points to a trailing byte */ public int charAt(int position) { - if (position > this.length) - { + if (position > this.length) { return -1; // too long } - if (position < 0) - { + if (position < 0) { return -1; // duh. } @@ -150,7 +157,7 @@ public int find(String what) { * starting position is measured in bytes and the return value is in terms of byte position in the buffer. The backing * buffer is not converted to a string for this operation. * - * @param what the string to search for + * @param what the string to search for * @param start where to start from * @return byte position of the first occurence of the search string in the UTF-8 buffer or -1 if not found */ @@ -196,6 +203,7 @@ public int find(String what, int start) { /** * Set to contain the contents of a string. 
+ * * @param string the string to initialize from */ public void set(String string) { @@ -210,14 +218,18 @@ public void set(String string) { /** * Set to a utf8 byte array + * * @param utf8 the byte array to initialize from */ public void set(byte[] utf8) { set(utf8, 0, utf8.length); } - /** copy a text. - * @param other the text to initialize from */ + /** + * copy a text. + * + * @param other the text to initialize from + */ public void set(Text other) { set(other.getBytes(), 0, other.getLength()); } @@ -225,12 +237,9 @@ public void set(Text other) { /** * Set the Text to range of bytes * - * @param utf8 - * the data to copy from - * @param start - * the first position of the new string - * @param len - * the number of bytes of the new string + * @param utf8 the data to copy from + * @param start the first position of the new string + * @param len the number of bytes of the new string */ public void set(byte[] utf8, int start, int len) { setCapacity(len, false); @@ -241,12 +250,9 @@ public void set(byte[] utf8, int start, int len) { /** * Append a range of bytes to the end of the given text * - * @param utf8 - * the data to copy from - * @param start - * the first position to append from utf8 - * @param len - * the number of bytes to append + * @param utf8 the data to copy from + * @param start the first position to append from utf8 + * @param len the number of bytes to append */ public void append(byte[] utf8, int start, int len) { setCapacity(length + len, true); @@ -270,7 +276,7 @@ public void clear() { * then the capacity and existing content of the buffer are unchanged. If len is larger than the current * capacity, the Text object's capacity is increased to match. * - * @param len the number of bytes we need + * @param len the number of bytes we need * @param keepData should the old data be kept */ private void setCapacity(int len, boolean keepData) { @@ -295,7 +301,8 @@ public String toString() { /** * Read a Text object whose length is already known. This allows creating Text from a stream which uses a different * serialization format. - * @param in the input to initialize from + * + * @param in the input to initialize from * @param len how many bytes to read from in * @throws IOException if something bad happens */ @@ -351,9 +358,11 @@ public int hashCode() { } // / STATIC UTILITIES FROM HERE DOWN + /** * Converts the provided byte array to a String using the UTF-8 encoding. If the input is malformed, replace by a * default value. + * * @param utf8 bytes to decode * @return the decoded string * @throws CharacterCodingException if this is not valid UTF-8 @@ -371,9 +380,10 @@ public static String decode(byte[] utf8, int start, int length) * Converts the provided byte array to a String using the UTF-8 encoding. If replace is true, then * malformed input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a * MalformedInputException. - * @param utf8 the bytes to decode - * @param start where to start from - * @param length length of the bytes to decode + * + * @param utf8 the bytes to decode + * @param start where to start from + * @param length length of the bytes to decode * @param replace whether to replace malformed characters with U+FFFD * @return the decoded string * @throws CharacterCodingException if the input could not be decoded @@ -418,8 +428,7 @@ public static ByteBuffer encode(String string) * input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a * MalformedInputException. 
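The replace flag described in the decode()/encode() javadoc above maps directly onto java.nio's CodingErrorAction: REPLACE substitutes U+FFFD, REPORT throws MalformedInputException. A self-contained sketch of that behavior using only the standard library; the class name and sample bytes are illustrative.

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

public class DecodeDemo {
  static String decode(byte[] utf8, boolean replace) throws CharacterCodingException {
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
    if (replace) {
      // Malformed input becomes U+FFFD instead of throwing.
      decoder.onMalformedInput(CodingErrorAction.REPLACE)
             .onUnmappableCharacter(CodingErrorAction.REPLACE);
    } else {
      decoder.onMalformedInput(CodingErrorAction.REPORT)
             .onUnmappableCharacter(CodingErrorAction.REPORT);
    }
    return decoder.decode(ByteBuffer.wrap(utf8)).toString();
  }

  public static void main(String[] args) throws Exception {
    byte[] bad = {(byte) 0xC2, (byte) 0x41}; // 0xC2 lead byte lacks its trail byte
    System.out.println(decode(bad, true));   // prints the replacement char, then 'A'
    // decode(bad, false) would throw MalformedInputException
  }
}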
* - - * @param string the string to encode + * @param string the string to encode * @param replace whether to replace malformed characters with U+FFFD * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit() * @throws CharacterCodingException if the string could not be encoded @@ -453,10 +462,8 @@ public static ByteBuffer encode(String string, boolean replace) /** * Check if a byte array contains valid utf-8 * - * @param utf8 - * byte array - * @throws MalformedInputException - * if the byte array contains invalid utf-8 + * @param utf8 byte array + * @throws MalformedInputException if the byte array contains invalid utf-8 */ public static void validateUTF8(byte[] utf8) throws MalformedInputException { validateUTF8(utf8, 0, utf8.length); @@ -465,14 +472,10 @@ public static void validateUTF8(byte[] utf8) throws MalformedInputException { /** * Check to see if a byte array is valid utf-8 * - * @param utf8 - * the array of bytes - * @param start - * the offset of the first byte in the array - * @param len - * the length of the byte sequence - * @throws MalformedInputException - * if the byte array contains invalid bytes + * @param utf8 the array of bytes + * @param start the offset of the first byte in the array + * @param len the length of the byte sequence + * @throws MalformedInputException if the byte array contains invalid bytes */ public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException { @@ -484,67 +487,67 @@ public static void validateUTF8(byte[] utf8, int start, int len) int aByte = utf8[count] & 0xFF; switch (state) { - case LEAD_BYTE: - leadByte = aByte; - length = bytesFromUTF8[aByte]; + case LEAD_BYTE: + leadByte = aByte; + length = bytesFromUTF8[aByte]; + + switch (length) { + case 0: // check for ASCII + if (leadByte > 0x7F) { + throw new MalformedInputException(count); + } + break; + case 1: + if (leadByte < 0xC2 || leadByte > 0xDF) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + case 2: + if (leadByte < 0xE0 || leadByte > 0xEF) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + case 3: + if (leadByte < 0xF0 || leadByte > 0xF4) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + default: + // too long! Longest valid UTF-8 is 4 bytes (lead + three) + // or if < 0 we got a trail byte in the lead byte position + throw new MalformedInputException(count); + } // switch (length) + break; - switch (length) { - case 0: // check for ASCII - if (leadByte > 0x7F) { + case TRAIL_BYTE_1: + if (leadByte == 0xF0 && aByte < 0x90) { throw new MalformedInputException(count); } - break; - case 1: - if (leadByte < 0xC2 || leadByte > 0xDF) { + if (leadByte == 0xF4 && aByte > 0x8F) { throw new MalformedInputException(count); } - state = TRAIL_BYTE_1; - break; - case 2: - if (leadByte < 0xE0 || leadByte > 0xEF) { + if (leadByte == 0xE0 && aByte < 0xA0) { throw new MalformedInputException(count); } - state = TRAIL_BYTE_1; - break; - case 3: - if (leadByte < 0xF0 || leadByte > 0xF4) { + if (leadByte == 0xED && aByte > 0x9F) { + throw new MalformedInputException(count); + } + // falls through to regular trail-byte test!! + case TRAIL_BYTE: + if (aByte < 0x80 || aByte > 0xBF) { throw new MalformedInputException(count); } - state = TRAIL_BYTE_1; + if (--length == 0) { + state = LEAD_BYTE; + } else { + state = TRAIL_BYTE; + } break; default: - // too long! 
Longest valid UTF-8 is 4 bytes (lead + three) - // or if < 0 we got a trail byte in the lead byte position - throw new MalformedInputException(count); - } // switch (length) - break; - - case TRAIL_BYTE_1: - if (leadByte == 0xF0 && aByte < 0x90) { - throw new MalformedInputException(count); - } - if (leadByte == 0xF4 && aByte > 0x8F) { - throw new MalformedInputException(count); - } - if (leadByte == 0xE0 && aByte < 0xA0) { - throw new MalformedInputException(count); - } - if (leadByte == 0xED && aByte > 0x9F) { - throw new MalformedInputException(count); - } - // falls through to regular trail-byte test!! - case TRAIL_BYTE: - if (aByte < 0x80 || aByte > 0xBF) { - throw new MalformedInputException(count); - } - if (--length == 0) { - state = LEAD_BYTE; - } else { - state = TRAIL_BYTE; - } - break; - default: - break; + break; } // switch (state) count++; } @@ -556,25 +559,26 @@ public static void validateUTF8(byte[] utf8, int start, int len) * six byte sequences. */ static final int[] bytesFromUTF8 = - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - // trail bytes - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, - 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + // trail bytes + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5}; /** * Returns the next code point at the current position in the buffer. The buffer's position will be incremented. Any * mark set on this buffer will be changed by this method! + * * @param bytes the incoming bytes * @return the corresponding unicode codepoint */ @@ -583,30 +587,29 @@ public static int bytesToCodePoint(ByteBuffer bytes) { byte b = bytes.get(); bytes.reset(); int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; - if (extraBytesToRead < 0) - { + if (extraBytesToRead < 0) { return -1; // trailing byte! 
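The validateUTF8() state machine above encodes the standard UTF-8 lead-byte ranges: 0x00-0x7F stand alone, 0xC2-0xDF open two-byte sequences, 0xE0-0xEF three-byte, 0xF0-0xF4 four-byte, and everything else is rejected. A compact standalone restatement of that classification; names are illustrative.

public class LeadByteDemo {
  // Returns the number of trail bytes implied by a lead byte, or -1 if the
  // byte can never start a valid UTF-8 sequence.
  static int trailBytes(int leadByte) {
    if (leadByte <= 0x7F) return 0;                      // ASCII
    if (leadByte >= 0xC2 && leadByte <= 0xDF) return 1;  // 2-byte sequence
    if (leadByte >= 0xE0 && leadByte <= 0xEF) return 2;  // 3-byte sequence
    if (leadByte >= 0xF0 && leadByte <= 0xF4) return 3;  // 4-byte sequence
    return -1; // 0x80-0xC1 and 0xF5-0xFF never lead a valid sequence
  }

  public static void main(String[] args) {
    System.out.println(trailBytes('A'));  // 0
    System.out.println(trailBytes(0xC3)); // 1
    System.out.println(trailBytes(0xF0)); // 3
    System.out.println(trailBytes(0x80)); // -1: trail byte in lead position
  }
}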
} int ch = 0; switch (extraBytesToRead) { - case 5: - ch += (bytes.get() & 0xFF); - ch <<= 6; /* remember, illegal UTF-8 */ - case 4: - ch += (bytes.get() & 0xFF); - ch <<= 6; /* remember, illegal UTF-8 */ - case 3: - ch += (bytes.get() & 0xFF); - ch <<= 6; - case 2: - ch += (bytes.get() & 0xFF); - ch <<= 6; - case 1: - ch += (bytes.get() & 0xFF); - ch <<= 6; - case 0: - ch += (bytes.get() & 0xFF); + case 5: + ch += (bytes.get() & 0xFF); + ch <<= 6; /* remember, illegal UTF-8 */ + case 4: + ch += (bytes.get() & 0xFF); + ch <<= 6; /* remember, illegal UTF-8 */ + case 3: + ch += (bytes.get() & 0xFF); + ch <<= 6; + case 2: + ch += (bytes.get() & 0xFF); + ch <<= 6; + case 1: + ch += (bytes.get() & 0xFF); + ch <<= 6; + case 0: + ch += (bytes.get() & 0xFF); } ch -= offsetsFromUTF8[extraBytesToRead]; @@ -614,14 +617,13 @@ public static int bytesToCodePoint(ByteBuffer bytes) { } static final int offsetsFromUTF8[] = - { 0x00000000, 0x00003080, - 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; + {0x00000000, 0x00003080, + 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080}; /** * For the given string, returns the number of UTF-8 bytes required to encode the string. * - * @param string - * text to encode + * @param string text to encode * @return number of UTF-8 bytes required to encode */ public static int utf8Length(String string) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java b/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java index 6e68d55226266..314ffdcb6637c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java @@ -15,13 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import org.apache.arrow.vector.ValueVector; public interface TransferPair { public void transfer(); + public void splitAndTransfer(int startIndex, int length); + public ValueVector getTo(); + public void copyValueSafe(int from, int to); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index b70a63fe7dd02..5851bd5fa5d97 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.util; import java.util.Arrays; @@ -68,7 +69,7 @@ public static void compareDictionaries(List encodings1, List Dictionary dict2 = provider2.lookup(id); if (dict1 == null || dict2 == null) { - throw new IllegalArgumentException("The DictionaryProvider did not contain the required dictionary with id: " + id +"\n" + dict1 + "\n" + dict2); + throw new IllegalArgumentException("The DictionaryProvider did not contain the required dictionary with id: " + id + "\n" + dict1 + "\n" + dict2); } try { @@ -132,9 +133,9 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { ArrowType.FloatingPoint fpType = (ArrowType.FloatingPoint) type; switch (fpType.getPrecision()) { case DOUBLE: - return equalEnough((Double)o1, (Double)o2); + return equalEnough((Double) o1, (Double) o2); case SINGLE: - return equalEnough((Float)o1, (Float)o2); + return equalEnough((Float) o1, (Float) o2); case HALF: default: throw new UnsupportedOperationException("unsupported precision: " + fpType); diff --git a/java/vector/src/main/java/org/joda/time/LocalDateTimes.java b/java/vector/src/main/java/org/joda/time/LocalDateTimes.java index e4f999e1d828e..a1c18fe9a5f41 100644 --- a/java/vector/src/main/java/org/joda/time/LocalDateTimes.java +++ b/java/vector/src/main/java/org/joda/time/LocalDateTimes.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.joda.time; /** * Workaround to access package protected fields in JODA - * */ public class LocalDateTimes { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java b/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java index f775f1d2d67af..febd59fba7408 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import org.apache.arrow.memory.BufferManager; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java index 194b78585faaf..495bed389e568 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -75,11 +76,10 @@ public void testSplitAndTransfer() throws Exception { sourceVector.allocateNew(40); /* populate the bitvector -- 010101010101010101010101..... 
*/ - for(int i = 0; i < 40; i++) { - if((i & 1) == 1) { + for (int i = 0; i < 40; i++) { + if ((i & 1) == 1) { sourceMutator.set(i, 1); - } - else { + } else { sourceMutator.set(i, 0); } } @@ -87,18 +87,17 @@ public void testSplitAndTransfer() throws Exception { sourceMutator.setValueCount(40); /* check the vector output */ - for(int i = 0; i < 40; i++) { + for (int i = 0; i < 40; i++) { int result = sourceAccessor.get(i); - if((i & 1) == 1) { + if ((i & 1) == 1) { assertEquals(Integer.toString(1), Integer.toString(result)); - } - else { + } else { assertEquals(Integer.toString(0), Integer.toString(result)); } } final TransferPair transferPair = sourceVector.getTransferPair(allocator); - final BitVector toVector = (BitVector)transferPair.getTo(); + final BitVector toVector = (BitVector) transferPair.getTo(); final BitVector.Accessor toAccessor = toVector.getAccessor(); final BitVector.Mutator toMutator = toVector.getMutator(); @@ -110,13 +109,13 @@ public void testSplitAndTransfer() throws Exception { * (2.1) the length is a multiple of 8 * (2.2) the length is not a multiple of 8 */ - final int[][] transferLengths = { {0, 8}, /* (1) */ - {8, 10}, /* (1) */ - {18, 0}, /* zero length scenario */ - {18, 8}, /* (2.1) */ - {26, 0}, /* zero length scenario */ - {26, 14} /* (2.2) */ - }; + final int[][] transferLengths = {{0, 8}, /* (1) */ + {8, 10}, /* (1) */ + {18, 0}, /* zero length scenario */ + {18, 8}, /* (2.1) */ + {26, 0}, /* zero length scenario */ + {26, 14} /* (2.2) */ + }; for (final int[] transferLength : transferLengths) { final int start = transferLength[0]; @@ -127,10 +126,9 @@ public void testSplitAndTransfer() throws Exception { /* check the toVector output after doing splitAndTransfer */ for (int i = 0; i < length; i++) { int result = toAccessor.get(i); - if((i & 1) == 1) { + if ((i & 1) == 1) { assertEquals(Integer.toString(1), Integer.toString(result)); - } - else { + } else { assertEquals(Integer.toString(0), Integer.toString(result)); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java index 08e3700daeebf..54fc306717088 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
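On the transferLengths cases above: splitAndTransfer over a bit-packed vector must treat byte-aligned and unaligned starts differently, which is exactly what scenarios (1), (2.1), and (2.2) exercise. A hedged sketch of the underlying index math on a plain byte array; this is illustrative, not the BitVector implementation.

public class BitSliceDemo {
  // Read bit `index` from an LSB-first packed buffer.
  static int getBit(byte[] buf, int index) {
    return (buf[index >>> 3] >>> (index & 7)) & 1;
  }

  // Copy `length` bits starting at bit `start` into a fresh packed array.
  static byte[] sliceBits(byte[] buf, int start, int length) {
    byte[] out = new byte[(length + 7) >>> 3];
    if ((start & 7) == 0) {
      // Byte-aligned start: whole source bytes can be copied directly.
      System.arraycopy(buf, start >>> 3, out, 0, out.length);
    } else {
      // Unaligned start: each output bit is stitched from two source bytes.
      for (int i = 0; i < length; i++) {
        out[i >>> 3] |= getBit(buf, start + i) << (i & 7);
      }
    }
    return out;
  }

  public static void main(String[] args) {
    byte[] bits = {(byte) 0xAA, (byte) 0xFF}; // 0xAA, LSB first: 0,1,0,1,0,1,0,1
    System.out.println(getBit(sliceBits(bits, 8, 8), 0)); // aligned path -> 1
    System.out.println(getBit(sliceBits(bits, 3, 8), 0)); // unaligned path -> 1
  }
}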
*/ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -65,7 +66,7 @@ public void testTransferVariableidth() { v1.makeTransferPair(v2).transfer(); assertEquals(0, childAllocator1.getAllocatedMemory()); - int expected = 8*4096 + 4*4096 + 4096; + int expected = 8 * 4096 + 4 * 4096 + 4096; assertEquals(expected, childAllocator2.getAllocatedMemory()); } @@ -90,11 +91,11 @@ public void emptyListTransferShouldNotTriggerSchemaChange() { final Pointer trigger1 = new Pointer<>(); final Pointer trigger2 = new Pointer<>(); final ListVector v1 = new ListVector("v1", allocator, - FieldType.nullable(ArrowType.Null.INSTANCE), - newTriggerCallback(trigger1)); + FieldType.nullable(ArrowType.Null.INSTANCE), + newTriggerCallback(trigger1)); final ListVector v2 = new ListVector("v2", allocator, - FieldType.nullable(ArrowType.Null.INSTANCE), - newTriggerCallback(trigger2)); + FieldType.nullable(ArrowType.Null.INSTANCE), + newTriggerCallback(trigger2)); v1.makeTransferPair(v2).transfer(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java index b98c24d189528..774fbe084f1c2 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -39,6 +40,7 @@ public class TestDecimalVector { intValues[2 * i] = -1 * (1 << i + 1); } } + private int scale = 3; @Test diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index 3bf3b1cedff38..f2db9baac04ca 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.apache.arrow.vector.TestUtils.newNullableVarCharVector; @@ -35,8 +36,8 @@ public class TestDictionaryVector { private BufferAllocator allocator; byte[] zero = "foo".getBytes(StandardCharsets.UTF_8); - byte[] one = "bar".getBytes(StandardCharsets.UTF_8); - byte[] two = "baz".getBytes(StandardCharsets.UTF_8); + byte[] one = "bar".getBytes(StandardCharsets.UTF_8); + byte[] two = "baz".getBytes(StandardCharsets.UTF_8); @Before public void init() { @@ -74,7 +75,7 @@ public void testEncodeStrings() { Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); - try(final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) { + try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) { // verify indices assertEquals(NullableIntVector.class, encoded.getClass()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java index 304db9dcc00bd..5677f2566797a 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import com.google.common.collect.Lists; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index a1762c466ce0b..fdb576ef75cc4 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -15,12 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertFalse; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.ListVector; @@ -38,8 +39,10 @@ import org.junit.Before; import org.junit.Test; +import java.util.ArrayList; import java.util.List; + public class TestListVector { private BufferAllocator allocator; @@ -92,6 +95,26 @@ public void testCopyFrom() throws Exception { Assert.assertFalse("should be null", reader.isSet()); reader.setPosition(2); Assert.assertTrue("shouldn't be null", reader.isSet()); + + /* check the exact contents of vector */ + final ListVector.Accessor accessor = outVector.getAccessor(); + + /* index 0 */ + Object result = accessor.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(new Long(1), (Long) resultSet.get(0)); + assertEquals(new Long(2), (Long) resultSet.get(1)); + assertEquals(new Long(3), (Long) resultSet.get(2)); + + /* index 1 */ + result = accessor.getObject(1); + assertNull(result); + + /* index 2 */ + result = accessor.getObject(2); + resultSet = (ArrayList) result; + assertEquals(0, resultSet.size()); } } @@ -108,11 +131,11 @@ public void testSetLastSetUsage() throws Exception { /* get inner vectors; bitVector and offsetVector */ List innerVectors = listVector.getFieldInnerVectors(); - BitVector bitVector = (BitVector)innerVectors.get(0); - UInt4Vector offsetVector = (UInt4Vector)innerVectors.get(1); + BitVector bitVector = (BitVector) innerVectors.get(0); + UInt4Vector offsetVector = (UInt4Vector) innerVectors.get(1); /* get the underlying data vector -- NullableBigIntVector */ - NullableBigIntVector dataVector = (NullableBigIntVector)listVector.getDataVector(); + NullableBigIntVector dataVector = (NullableBigIntVector) listVector.getDataVector(); /* check current lastSet */ assertEquals(Integer.toString(0), Integer.toString(listVector.getMutator().getLastSet())); @@ -127,7 +150,7 @@ public void testSetLastSetUsage() throws Exception { dataVector.getMutator().setSafe(2, 1, 12); offsetVector.getMutator().setSafe(index + 1, 3); - index += 1; + index += 1; /* write [13, 14] to the list vector at index 1 */ bitVector.getMutator().setSafe(index, 1); @@ -193,41 +216,41 @@ public void testSetLastSetUsage() throws Exception { final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); final ValueVector.Accessor valueAccessor = dataVector.getAccessor(); - index = 0; + index = 0; offset = offsetAccessor.get(index); assertEquals(Integer.toString(0), Integer.toString(offset)); Object actual = valueAccessor.getObject(offset); - assertEquals(new Long(10), (Long)actual); + assertEquals(new Long(10), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(11), (Long)actual); + assertEquals(new Long(11), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(12), (Long)actual); + assertEquals(new Long(12), (Long) actual); index++; offset = offsetAccessor.get(index); assertEquals(Integer.toString(3), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(13), (Long)actual); + assertEquals(new Long(13), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(14), (Long)actual); + assertEquals(new Long(14), (Long) actual); 
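The offset assertions above follow from the list-vector layout: one validity bit per entry, an offsets array with one more slot than there are lists, and a flat child data vector, with list i occupying data positions [offsets[i], offsets[i+1]). A standalone model of that walk, with values mirroring the test above.

public class ListLayoutDemo {
  public static void main(String[] args) {
    long[] data = {10, 11, 12, 13, 14, 15, 16, 17};
    int[] offsets = {0, 3, 5, 8};           // 3 lists: lengths 3, 2, 3
    boolean[] valid = {true, true, true};   // validity bit per list

    for (int i = 0; i + 1 < offsets.length; i++) {
      if (!valid[i]) {
        System.out.println("list " + i + ": null");
        continue;
      }
      StringBuilder sb = new StringBuilder("list " + i + ": [");
      for (int j = offsets[i]; j < offsets[i + 1]; j++) {
        sb.append(data[j]).append(j + 1 < offsets[i + 1] ? ", " : "");
      }
      System.out.println(sb.append("]"));
    }
    // prints [10, 11, 12], [13, 14], [15, 16, 17] -- matching the offsets
    // 0, 3, 5 asserted in the test above.
  }
}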
index++; offset = offsetAccessor.get(index); assertEquals(Integer.toString(5), Integer.toString(offset)); actual = valueAccessor.getObject(offsetAccessor.get(index)); - assertEquals(new Long(15), (Long)actual); + assertEquals(new Long(15), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(16), (Long)actual); + assertEquals(new Long(16), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(17), (Long)actual); + assertEquals(new Long(17), (Long) actual); index++; offset = offsetAccessor.get(index); @@ -291,16 +314,16 @@ public void testSplitAndTransfer() throws Exception { assertEquals(5, listVector.getMutator().getLastSet()); /* get offsetVector */ - UInt4Vector offsetVector = (UInt4Vector)listVector.getOffsetVector(); + UInt4Vector offsetVector = (UInt4Vector) listVector.getOffsetVector(); /* get dataVector */ - NullableBigIntVector dataVector = (NullableBigIntVector)listVector.getDataVector(); + NullableBigIntVector dataVector = (NullableBigIntVector) listVector.getDataVector(); /* check the vector output */ final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); final ValueVector.Accessor valueAccessor = dataVector.getAccessor(); - int index = 0; + int index = 0; int offset = 0; Object actual = null; @@ -310,13 +333,13 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(0), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(10), (Long)actual); + assertEquals(new Long(10), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(11), (Long)actual); + assertEquals(new Long(11), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(12), (Long)actual); + assertEquals(new Long(12), (Long) actual); /* index 1 */ index++; @@ -325,10 +348,10 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(3), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(13), (Long)actual); + assertEquals(new Long(13), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(14), (Long)actual); + assertEquals(new Long(14), (Long) actual); /* index 2 */ index++; @@ -337,16 +360,16 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(5), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(15), (Long)actual); + assertEquals(new Long(15), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(16), (Long)actual); + assertEquals(new Long(16), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(17), (Long)actual); + assertEquals(new Long(17), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(18), (Long)actual); + assertEquals(new Long(18), (Long) actual); /* index 3 */ index++; @@ -355,7 +378,7 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(9), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(19), (Long)actual); + assertEquals(new Long(19), (Long) actual); /* index 4 */ index++; @@ -364,16 +387,16 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(10), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(20), (Long)actual); + 
assertEquals(new Long(20), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(21), (Long)actual); + assertEquals(new Long(21), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(22), (Long)actual); + assertEquals(new Long(22), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(23), (Long)actual); + assertEquals(new Long(23), (Long) actual); /* index 5 */ index++; @@ -386,10 +409,7 @@ public void testSplitAndTransfer() throws Exception { TransferPair transferPair = listVector.makeTransferPair(toVector); - int[][] transferLengths = { {0, 2}, - {3, 1}, - {4, 1} - }; + int[][] transferLengths = {{0, 2}, {3, 1}, {4, 1}}; for (final int[] transferLength : transferLengths) { int start = transferLength[0]; @@ -404,26 +424,26 @@ public void testSplitAndTransfer() throws Exception { transferPair.splitAndTransfer(start, splitLength); /* get offsetVector of toVector */ - UInt4Vector offsetVector1 = (UInt4Vector)toVector.getOffsetVector(); + UInt4Vector offsetVector1 = (UInt4Vector) toVector.getOffsetVector(); UInt4Vector.Accessor offsetAccessor1 = offsetVector1.getAccessor(); /* get dataVector of toVector */ - NullableBigIntVector dataVector1 = (NullableBigIntVector)toVector.getDataVector(); + NullableBigIntVector dataVector1 = (NullableBigIntVector) toVector.getDataVector(); NullableBigIntVector.Accessor valueAccessor1 = dataVector1.getAccessor(); - for(int i = 0; i < splitLength; i++) { + for (int i = 0; i < splitLength; i++) { dataLength1 = offsetAccessor.get(start + i + 1) - offsetAccessor.get(start + i); dataLength2 = offsetAccessor1.get(i + 1) - offsetAccessor1.get(i); assertEquals("Different data lengths at index: " + i + " and start: " + start, - dataLength1, dataLength2); + dataLength1, dataLength2); offset1 = offsetAccessor.get(start + i); offset2 = offsetAccessor1.get(i); - for(int j = 0; j < dataLength1; j++) { + for (int j = 0; j < dataLength1; j++) { assertEquals("Different data at indexes: " + offset1 + " and " + offset2, - valueAccessor.getObject(offset1), valueAccessor1.getObject(offset2)); + valueAccessor.getObject(offset1), valueAccessor1.getObject(offset2)); offset1++; offset2++; @@ -433,4 +453,120 @@ public void testSplitAndTransfer() throws Exception { } } } + + @Test + public void testNestedListVector() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + + UnionListWriter listWriter = listVector.getWriter(); + + /* allocate memory */ + listWriter.allocate(); + + /* the dataVector that backs a listVector will also be a + * listVector for this test. 
+ */ + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().bigInt().writeBigInt(150); + listWriter.list().bigInt().writeBigInt(175); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(10); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(2, listVector.getMutator().getLastSet()); + + listVector.getMutator().setValueCount(2); + + final ListVector.Accessor accessor = listVector.getAccessor(); + assertEquals(2, accessor.getValueCount()); + + /* get listVector value at index 0 -- the value itself is a listvector */ + Object result = accessor.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + assertEquals(new Long(150), list.get(2)); + assertEquals(new Long(175), list.get(3)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = accessor.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of third inner list */ + + list = resultSet.get(0); + assertEquals(new Long(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(accessor.isNull(0)); + assertFalse(accessor.isNull(1)); + + /* check underlying offsetVector */ + UInt4Vector offsetVector = listVector.getOffsetVector(); + final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); + + /* listVector has 2 lists at index 0 and 3 lists at index 1 */ + assertEquals(0, offsetAccessor.get(0)); + assertEquals(2, offsetAccessor.get(1)); + assertEquals(5, offsetAccessor.get(2)); + } + } } diff --git 
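For the nested-list case just verified: each level keeps its own offsets, so the outer offsets (the 0, 2, 5 asserted above) count inner lists while the inner offsets index the flat bigint data. A small standalone model with the same values.

public class NestedOffsetsDemo {
  public static void main(String[] args) {
    long[] data = {50, 100, 200, 75, 125, 150, 175, 10, 15, 20, 25, 30, 35};
    int[] innerOffsets = {0, 3, 7, 8, 10, 13}; // 5 inner lists over the data
    int[] outerOffsets = {0, 2, 5};            // outer 0 -> inner 0..1, outer 1 -> inner 2..4

    for (int i = 0; i + 1 < outerOffsets.length; i++) {
      System.out.print("outer " + i + ":");
      for (int inner = outerOffsets[i]; inner < outerOffsets[i + 1]; inner++) {
        System.out.print(" [");
        for (int j = innerOffsets[inner]; j < innerOffsets[inner + 1]; j++) {
          System.out.print(data[j] + (j + 1 < innerOffsets[inner + 1] ? " " : ""));
        }
        System.out.print("]");
      }
      System.out.println();
    }
    // prints: outer 0: [50 100 200] [75 125 150 175]
    //         outer 1: [10] [15 20] [25 30 35]
  }
}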
a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java index 9baebc5a2992c..ba2ebbf05ad0d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java @@ -90,7 +90,7 @@ public void testBitVectorReallocation() { // common: value count < MAX_VALUE_ALLOCATION try { vector.allocateNew(expectedValueCapacity); - for (int i=0; i<3;i++) { + for (int i = 0; i < 3; i++) { vector.reAlloc(); // expand buffer size } assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java index a5159242d76f9..a75b196fbcc30 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -98,7 +99,7 @@ public void testTransfer() throws Exception { mutator.setSafe(5, newBitHolder(false)); mutator.setValueCount(6); - try(UnionVector destVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector destVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { TransferPair pair = srcVector.makeTransferPair(destVector); // Creating the transfer should transfer the type of the field at least. @@ -111,7 +112,7 @@ public void testTransfer() throws Exception { // now check the values are transferred assertEquals(srcVector.getAccessor().getValueCount(), destVector.getAccessor().getValueCount()); - for(int i=0; i T newVector(Class c, String name, ArrowType type, BufferAllocator allocator) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 0f41c2dd790e1..72214ed2ed6fb 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.apache.arrow.vector.TestUtils.newNullableVarCharVector; @@ -394,7 +395,7 @@ public void testBitVectorRangeSetAllOnes() { } private void validateRange(int length, int start, int count) { - String desc = "[" + start + ", " + (start + count) + ") "; + String desc = "[" + start + ", " + (start + count) + ") "; try (BitVector bitVector = new BitVector("bits", allocator)) { bitVector.reset(); bitVector.allocateNew(length); @@ -489,7 +490,7 @@ public void testFillEmptiesNotOverfill() { @Test public void testCopyFromWithNulls() { try (final NullableVarCharVector vector = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); - final NullableVarCharVector vector2 = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + final NullableVarCharVector vector2 = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); for (int i = 0; i < 4095; i++) { @@ -608,15 +609,15 @@ public void testVectorLoadUnload() { VectorUnloader vectorUnloader = new VectorUnloader(schemaRoot1); try ( - ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); - BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("new vector", 0, Long.MAX_VALUE); - VectorSchemaRoot schemaRoot2 = VectorSchemaRoot.create(schema, finalVectorsAllocator); + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("new vector", 0, Long.MAX_VALUE); + VectorSchemaRoot schemaRoot2 = VectorSchemaRoot.create(schema, finalVectorsAllocator); ) { VectorLoader vectorLoader = new VectorLoader(schemaRoot2); vectorLoader.load(recordBatch); - NullableVarCharVector vector2 = (NullableVarCharVector)schemaRoot2.getVector(fieldName); + NullableVarCharVector vector2 = (NullableVarCharVector) schemaRoot2.getVector(fieldName); NullableVarCharVector.Mutator mutator2 = vector2.getMutator(); /* @@ -736,6 +737,16 @@ public void testFillEmptiesUsage() { } } + @Test + public void testMultipleClose() { + BufferAllocator vectorAllocator = allocator.newChildAllocator("vector_allocator", 0, Long.MAX_VALUE); + NullableIntVector vector = newVector(NullableIntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, vectorAllocator); + vector.close(); + vectorAllocator.close(); + vector.close(); + vectorAllocator.close(); + } + public static void setBytes(int index, byte[] bytes, NullableVarCharVector vector) { final int currentOffset = vector.values.offsetVector.getAccessor().get(index); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index da9cb00361c0b..4ac7536c017db 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -72,31 +73,6 @@ public void testFixedType() { } } - @Test - public void testVariableLengthType() { - try (final VarCharVector vector = new VarCharVector("", allocator)) { - final VarCharVector.Mutator m = vector.getMutator(); - // note: capacity ends up being - 1 due to offsets vector - vector.setInitialCapacity(511); - vector.allocateNew(); - - assertEquals(511, vector.getValueCapacity()); - - try { - m.set(512, "foo".getBytes(StandardCharsets.UTF_8)); - Assert.fail("Expected out of bounds exception"); - } catch (Exception e) { - // ok - } - - vector.reAlloc(); - assertEquals(1023, vector.getValueCapacity()); - - m.set(512, "foo".getBytes(StandardCharsets.UTF_8)); - assertEquals("foo", new String(vector.getAccessor().get(512), StandardCharsets.UTF_8)); - } - } - @Test public void testNullableType() { try (final NullableVarCharVector vector = new NullableVarCharVector("", allocator)) { @@ -114,7 +90,7 @@ public void testNullableType() { } vector.reAlloc(); - assertEquals(1024, vector.getValueCapacity()); + assertEquals(1023, vector.getValueCapacity()); m.set(512, "foo".getBytes(StandardCharsets.UTF_8)); assertEquals("foo", new String(vector.getAccessor().get(512), StandardCharsets.UTF_8)); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java new file mode 100644 index 0000000000000..d53f69489d4da --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestVectorReset { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testFixedTypeReset() { + try (final UInt4Vector vector = new UInt4Vector("", allocator)) { + final UInt4Vector.Mutator m = vector.getMutator(); + vector.allocateNew(); + final int sizeBefore = vector.getAllocationSize(); + vector.reAlloc(); + vector.reset(); + final int sizeAfter = vector.getAllocationSize(); + assertEquals(sizeBefore, sizeAfter); + } + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java index f3694659a8f51..7facf73f511da 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static java.util.Arrays.asList; @@ -176,8 +177,9 @@ public void testUnloadLoadAddPadding() throws IOException { /** * The validity buffer can be empty if: - * - all values are defined - * - all values are null + * - all values are defined + * - all values are null + * * @throws IOException */ @Test @@ -185,7 +187,7 @@ public void testLoadEmptyValidityBuffer() throws IOException { Schema schema = new Schema(asList( new Field("intDefined", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), new Field("intNull", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) - )); + )); int count = 10; ArrowBuf validity = allocator.buffer(10).slice(0, 0); ArrowBuf[] values = new ArrowBuf[2]; @@ -208,8 +210,8 @@ public void testLoadEmptyValidityBuffer() throws IOException { vectorLoader.load(recordBatch); - NullableIntVector intDefinedVector = (NullableIntVector)newRoot.getVector("intDefined"); - NullableIntVector intNullVector = (NullableIntVector)newRoot.getVector("intNull"); + NullableIntVector intDefinedVector = (NullableIntVector) newRoot.getVector("intDefined"); + NullableIntVector intNullVector = (NullableIntVector) newRoot.getVector("intNull"); for (int i = 0; i < count; i++) { assertFalse("#" + i, intDefinedVector.getAccessor().isNull(i)); assertEquals("#" + i, i, intDefinedVector.getAccessor().get(i)); @@ -244,20 +246,20 @@ public void testLoadEmptyValidityBuffer() throws IOException { public void testUnloadLoadDuplicates() throws IOException { int count = 10; Schema schema = new Schema(asList( - new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), - new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) + new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), + new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) )); try ( - BufferAllocator originalVectorsAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + BufferAllocator originalVectorsAllocator 
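Regarding the empty-validity-buffer note above (see testLoadEmptyValidityBuffer): a zero-length validity buffer is only unambiguous together with a null count of zero (all values defined) or equal to the value count (all values null). A hedged sketch of how a reader could interpret that, with illustrative names and a simplified LSB-first bitmap.

public class EmptyValidityDemo {
  // Answer isNull(i) for a batch whose validity buffer may be empty.
  static boolean isNull(byte[] validity, int nullCount, int valueCount, int i) {
    if (validity.length == 0) {
      // No bitmap bytes were shipped: legal only in the all-null or
      // all-defined cases, so the null count alone decides.
      return nullCount == valueCount;
    }
    return ((validity[i >>> 3] >>> (i & 7)) & 1) == 0;
  }

  public static void main(String[] args) {
    byte[] empty = new byte[0];
    System.out.println(isNull(empty, 0, 10, 3));  // false: all defined
    System.out.println(isNull(empty, 10, 10, 3)); // true: all null
  }
}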
= allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); ) { List sources = new ArrayList<>(); - for (Field field: schema.getFields()) { + for (Field field : schema.getFields()) { FieldVector vector = field.createVector(originalVectorsAllocator); vector.allocateNew(); sources.add(vector); NullableIntVector.Mutator mutator = (NullableIntVector.Mutator) vector.getMutator(); - for (int i = 0; i < count; i++) { + for (int i = 0; i < count; i++) { mutator.set(i, i); } mutator.setValueCount(count); @@ -266,8 +268,8 @@ public void testUnloadLoadDuplicates() throws IOException { try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), sources, count)) { VectorUnloader vectorUnloader = new VectorUnloader(root); try (ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); - BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); - VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator);) { + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator);) { // load it VectorLoader vectorLoader = new VectorLoader(newRoot); vectorLoader.load(recordBatch); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index e826fa53b0977..97efb7d5a6d30 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import static org.junit.Assert.assertEquals; @@ -67,7 +68,7 @@ public void testPromoteToUnion() throws Exception { writer.setPosition(1); writer.bit("A").writeBit(1); - writer.decimal("dec", 10,10); + writer.decimal("dec", 10, 10); writer.setPosition(2); writer.integer("A").writeInt(10); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 5a9c80dc124a2..f81cd557a9d8f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex.writer; import static org.junit.Assert.*; @@ -217,7 +218,7 @@ public void listScalarType() { for (int i = 0; i < COUNT; i++) { listWriter.startList(); for (int j = 0; j < i % 7; j++) { - if (j%2 == 0) { + if (j % 2 == 0) { listWriter.writeInt(j); } else { IntHolder holder = new IntHolder(); @@ -259,7 +260,7 @@ public void listScalarTypeNullable() { listReader.setPosition(i); if (i % 2 == 0) { assertTrue("index is set: " + i, listReader.isSet()); - assertEquals("correct length at: " + i, i % 7, ((List)listReader.readObject()).size()); + assertEquals("correct length at: " + i, i % 7, ((List) listReader.readObject()).size()); } else { assertFalse("index is not set: " + i, listReader.isSet()); assertNull("index is not set: " + i, listReader.readObject()); @@ -529,10 +530,10 @@ public void promotableWriterSchema() { private Set getFieldNames(List fields) { Set fieldNames = new HashSet<>(); - for (Field field: fields) { + for (Field field : fields) { fieldNames.add(field.getName()); if (!field.getChildren().isEmpty()) { - for (String name: getFieldNames(field.getChildren())) { + for (String name : getFieldNames(field.getChildren())) { fieldNames.add(field.getName() + "::" + name); } } @@ -698,7 +699,7 @@ private void checkTimestampField(Field field, String name) { private void checkTimestampTZField(Field field, String name, String tz) { checkTimestampField(field, name); - Assert.assertEquals(tz, ((Timestamp)field.getType()).getTimezone()); + Assert.assertEquals(tz, ((Timestamp) field.getType()).getTimezone()); } @Test @@ -824,13 +825,13 @@ public void complexCopierWithList() { TransferPair tp = mapVector.getTransferPair(allocator); tp.splitAndTransfer(0, 1); MapVector toMapVector = (MapVector) tp.getTo(); - JsonStringHashMap toMapValue = (JsonStringHashMap) toMapVector.getAccessor().getObject(0); + JsonStringHashMap toMapValue = (JsonStringHashMap) toMapVector.getAccessor().getObject(0); JsonStringArrayList object = (JsonStringArrayList) toMapValue.get("list"); assertEquals(1, object.get(0)); assertEquals(2, object.get(1)); - JsonStringHashMap innerMap = (JsonStringHashMap) object.get(2); + JsonStringHashMap innerMap = (JsonStringHashMap) object.get(2); assertEquals(1, innerMap.get("a")); - innerMap = (JsonStringHashMap) object.get(3); + innerMap = (JsonStringHashMap) object.get(3); assertEquals(2, innerMap.get("a")); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java index 3f717cbc18b6e..732bd98b7c61c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import java.nio.charset.StandardCharsets; @@ -155,7 +156,7 @@ protected void validateComplexContent(int count, VectorSchemaRoot root) { Assert.assertNull(intVal); } Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); - Assert.assertEquals(i % 3, ((List)root.getVector("list").getAccessor().getObject(i)).size()); + Assert.assertEquals(i % 3, ((List) root.getVector("list").getAccessor().getObject(i)).size()); NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); FieldReader mapReader = root.getVector("map").getReader(); mapReader.setPosition(i); @@ -198,11 +199,11 @@ protected void validateDateTimeContent(int count, VectorSchemaRoot root) { Assert.assertEquals(count, root.getRowCount()); printVectors(root.getFieldVectors()); for (int i = 0; i < count; i++) { - long dateVal = ((NullableDateMilliVector)root.getVector("date")).getAccessor().get(i); + long dateVal = ((NullableDateMilliVector) root.getVector("date")).getAccessor().get(i); LocalDateTime dt = makeDateTimeFromCount(i); LocalDateTime dateExpected = dt.minusMillis(dt.getMillisOfDay()); Assert.assertEquals(DateUtility.toMillis(dateExpected), dateVal); - long timeVal = ((NullableTimeMilliVector)root.getVector("time")).getAccessor().get(i); + long timeVal = ((NullableTimeMilliVector) root.getVector("time")).getAccessor().get(i); Assert.assertEquals(dt.getMillisOfDay(), timeVal); Object timestampMilliVal = root.getVector("timestamp-milli").getAccessor().getObject(i); Assert.assertEquals(dt, timestampMilliVal); @@ -450,20 +451,20 @@ public void validateUnionData(int count, VectorSchemaRoot root) { for (int i = 0; i < count; i++) { unionReader.setPosition(i); switch (i % 4) { - case 0: - Assert.assertEquals(i, unionReader.readInteger().intValue()); - break; - case 1: - Assert.assertEquals(i, unionReader.readLong().longValue()); - break; - case 2: - Assert.assertEquals(i % 3, unionReader.size()); - break; - case 3: - NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); - unionReader.reader("timestamp").read(h); - Assert.assertEquals(i, h.value); - break; + case 0: + Assert.assertEquals(i, unionReader.readInteger().intValue()); + break; + case 1: + Assert.assertEquals(i, unionReader.readLong().longValue()); + break; + case 2: + Assert.assertEquals(i % 3, unionReader.size()); + break; + case 3: + NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); + unionReader.reader("timestamp").read(h); + Assert.assertEquals(i, h.value); + break; } } } @@ -483,28 +484,28 @@ public void writeUnionData(int count, NullableMapVector parent) { MapWriter mapWriter = rootWriter.map("union"); for (int i = 0; i < count; i++) { switch (i % 4) { - case 0: - intWriter.setPosition(i); - intWriter.writeInt(i); - break; - case 1: - bigIntWriter.setPosition(i); - bigIntWriter.writeBigInt(i); - break; - case 2: - listWriter.setPosition(i); - listWriter.startList(); - for (int j = 0; j < i % 3; j++) { - listWriter.varChar().writeVarChar(0, 3, varchar); - } - listWriter.endList(); - break; - case 3: - mapWriter.setPosition(i); - mapWriter.start(); - mapWriter.timeStampMilli("timestamp").writeTimeStampMilli(i); - mapWriter.end(); - break; + case 0: + intWriter.setPosition(i); + intWriter.writeInt(i); + break; + case 1: + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + break; + case 2: + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varChar().writeVarChar(0, 3, varchar); + } + 
listWriter.endList(); + break; + case 3: + mapWriter.setPosition(i); + mapWriter.start(); + mapWriter.timeStampMilli("timestamp").writeTimeStampMilli(i); + mapWriter.end(); + break; } } writer.setValueCount(count); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index 90fb5d252d68d..c483ba7de91c6 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.ByteArrayInputStream; @@ -107,19 +108,19 @@ public void testWriteRead() throws IOException { // read try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); FileInputStream fileInputStream = new FileInputStream(file); - ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator){ - @Override - protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator allocator) throws IOException { - ArrowMessage message = super.readMessage(in, allocator); - if (message != null) { - ArrowRecordBatch batch = (ArrowRecordBatch) message; - List buffersLayout = batch.getBuffersLayout(); - for (ArrowBuffer arrowBuffer : buffersLayout) { - Assert.assertEquals(0, arrowBuffer.getOffset() % 8); - } - } - return message; - } + ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator) { + @Override + protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator allocator) throws IOException { + ArrowMessage message = super.readMessage(in, allocator); + if (message != null) { + ArrowRecordBatch batch = (ArrowRecordBatch) message; + List buffersLayout = batch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + } + } + return message; + } }) { Schema schema = arrowReader.getVectorSchemaRoot().getSchema(); LOGGER.debug("reading schema: " + schema); @@ -134,7 +135,7 @@ protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator alloc // Read from stream. 
try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); ByteArrayInputStream input = new ByteArrayInputStream(stream.toByteArray()); - ArrowStreamReader arrowReader = new ArrowStreamReader(input, readerAllocator){ + ArrowStreamReader arrowReader = new ArrowStreamReader(input, readerAllocator) { @Override protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException { ArrowMessage message = super.readMessage(in, allocator); @@ -203,17 +204,17 @@ public void testWriteReadComplex() throws IOException { public void testWriteReadMultipleRBs() throws IOException { File file = new File("target/mytest_multiple.arrow"); ByteArrayOutputStream stream = new ByteArrayOutputStream(); - int[] counts = { 10, 5 }; + int[] counts = {10, 5}; // write try (BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); MapVector parent = MapVector.empty("parent", originalVectorAllocator); - FileOutputStream fileOutputStream = new FileOutputStream(file)){ + FileOutputStream fileOutputStream = new FileOutputStream(file)) { writeData(counts[0], parent); VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); - try(ArrowFileWriter fileWriter = new ArrowFileWriter(root, null, fileOutputStream.getChannel()); - ArrowStreamWriter streamWriter = new ArrowStreamWriter(root, null, stream)) { + try (ArrowFileWriter fileWriter = new ArrowFileWriter(root, null, fileOutputStream.getChannel()); + ArrowStreamWriter streamWriter = new ArrowStreamWriter(root, null, stream)) { fileWriter.start(); streamWriter.start(); @@ -318,7 +319,7 @@ public void testWriteReadTiny() throws IOException { root.getFieldVectors().get(0).allocateNew(); NullableTinyIntVector.Mutator mutator = (NullableTinyIntVector.Mutator) root.getFieldVectors().get(0).getMutator(); for (int i = 0; i < 16; i++) { - mutator.set(i, i < 8 ? 1 : 0, (byte)(i + 1)); + mutator.set(i, i < 8 ? 
1 : 0, (byte) (i + 1)); } mutator.setValueCount(16); root.setRowCount(16); @@ -367,7 +368,7 @@ private void validateTinyData(VectorSchemaRoot root) { NullableTinyIntVector vector = (NullableTinyIntVector) root.getFieldVectors().get(0); for (int i = 0; i < 16; i++) { if (i < 8) { - Assert.assertEquals((byte)(i + 1), vector.getAccessor().get(i)); + Assert.assertEquals((byte) (i + 1), vector.getAccessor().get(i)); } else { Assert.assertTrue(vector.getAccessor().isNull(i)); } @@ -384,7 +385,7 @@ public void testWriteReadMetadata() throws IOException { childFields.add(new Field("float-child", new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null, metadata(2)), null)); childFields.add(new Field("int-child", new FieldType(false, new ArrowType.Int(32, true), null, metadata(3)), null)); childFields.add(new Field("list-child", new FieldType(true, ArrowType.List.INSTANCE, null, metadata(4)), - ImmutableList.of(new Field("l1", FieldType.nullable(new ArrowType.Int(16 ,true)), null)))); + ImmutableList.of(new Field("l1", FieldType.nullable(new ArrowType.Int(16, true)), null)))); Field field = new Field("meta", new FieldType(true, ArrowType.Struct.INSTANCE, null, metadata(0)), childFields); Map metadata = new HashMap<>(); metadata.put("s1", "v1"); @@ -425,7 +426,7 @@ public void testWriteReadMetadata() throws IOException { Assert.assertEquals(originalSchema.getCustomMetadata(), schema.getCustomMetadata()); Field top = schema.getFields().get(0); Assert.assertEquals(metadata(0), top.getMetadata()); - for (int i = 0; i < 4; i ++) { + for (int i = 0; i < 4; i++) { Assert.assertEquals(metadata(i + 1), top.getChildren().get(i).getMetadata()); } } @@ -441,7 +442,7 @@ public void testWriteReadMetadata() throws IOException { Assert.assertEquals(originalSchema.getCustomMetadata(), schema.getCustomMetadata()); Field top = schema.getFields().get(0); Assert.assertEquals(metadata(0), top.getMetadata()); - for (int i = 0; i < 4; i ++) { + for (int i = 0; i < 4; i++) { Assert.assertEquals(metadata(i + 1), top.getChildren().get(i).getMetadata()); } } @@ -475,7 +476,7 @@ public void testWriteReadDictionary() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } @@ -534,7 +535,7 @@ public void testWriteReadNestedDictionary() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java index 3014e64b4eea4..4612465323130 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import static java.util.Arrays.asList; @@ -40,7 +41,7 @@ public class TestArrowFooter { public void test() { Schema schema = new Schema(asList( new Field("a", FieldType.nullable(new ArrowType.Int(8, true)), Collections.emptyList()) - )); + )); ArrowFooter footer = new ArrowFooter(schema, Collections.emptyList(), Collections.emptyList()); ArrowFooter newFooter = roundTrip(footer); assertEquals(footer, newFooter); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java index 55629d5107c86..65332aa2c7de2 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import static java.nio.channels.Channels.newChannel; @@ -77,15 +78,15 @@ public void test() throws IOException { FieldVector vector = TestUtils.newVector(FieldVector.class, "testField", type, allocator); vector.initializeChildrenFromFields(schema.getFields().get(0).getChildren()); - byte[] validity = new byte[] { (byte) 255, 0}; + byte[] validity = new byte[] {(byte) 255, 0}; // second half is "undefined" - byte[] values = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + byte[] values = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; ByteArrayOutputStream out = new ByteArrayOutputStream(); try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), asList(vector), 16); ArrowFileWriter writer = new ArrowFileWriter(root, null, newChannel(out))) { ArrowBuf validityb = buf(validity); - ArrowBuf valuesb = buf(values); + ArrowBuf valuesb = buf(values); writer.writeRecordBatch(new ArrowRecordBatch(16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb))); } @@ -113,7 +114,7 @@ public void test() throws IOException { // Read just the header. This demonstrates being able to read without need to // deserialize the buffer. ByteBuffer headerBuffer = ByteBuffer.allocate(recordBatches.get(0).getMetadataLength()); - headerBuffer.put(byteArray, (int)recordBatches.get(0).getOffset(), headerBuffer.capacity()); + headerBuffer.put(byteArray, (int) recordBatches.get(0).getOffset(), headerBuffer.capacity()); headerBuffer.position(4); Message messageFB = Message.getRootAsMessage(headerBuffer); RecordBatch recordBatchFB = (RecordBatch) messageFB.header(new RecordBatch()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java index 7e9afd381c181..e2efabef0095b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import static java.util.Arrays.asList; @@ -71,7 +72,7 @@ public void testReadWrite() throws IOException { root.getFieldVectors().get(0).allocateNew(); NullableTinyIntVector.Mutator mutator = (NullableTinyIntVector.Mutator) root.getFieldVectors().get(0).getMutator(); for (int i = 0; i < 16; i++) { - mutator.set(i, i < 8 ? 
1 : 0, (byte)(i + 1)); + mutator.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); } mutator.setValueCount(16); root.setRowCount(16); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java index 20d4482da7c98..a19c3795fd5bb 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import static org.junit.Assert.assertEquals; @@ -65,7 +66,7 @@ public void run() { // Send a changing batch id first mutator.set(0, j); for (int i = 1; i < 16; i++) { - mutator.set(i, i < 8 ? 1 : 0, (byte)(i + 1)); + mutator.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); } mutator.setValueCount(16); root.setRowCount(16); @@ -80,7 +81,9 @@ public void run() { } } - public long bytesWritten() { return writer.bytesWritten(); } + public long bytesWritten() { + return writer.bytesWritten(); + } } private final class ReaderThread extends Thread { @@ -104,6 +107,7 @@ protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) th } return message; } + @Override public boolean loadNextBatch() throws IOException { if (!super.loadNextBatch()) { @@ -113,10 +117,10 @@ public boolean loadNextBatch() throws IOException { VectorSchemaRoot root = getVectorSchemaRoot(); Assert.assertEquals(16, root.getRowCount()); NullableTinyIntVector vector = (NullableTinyIntVector) root.getFieldVectors().get(0); - Assert.assertEquals((byte)(batchesRead - 1), vector.getAccessor().get(0)); + Assert.assertEquals((byte) (batchesRead - 1), vector.getAccessor().get(0)); for (int i = 1; i < 16; i++) { if (i < 8) { - Assert.assertEquals((byte)(i + 1), vector.getAccessor().get(i)); + Assert.assertEquals((byte) (i + 1), vector.getAccessor().get(i)); } else { Assert.assertTrue(vector.getAccessor().isNull(i)); } @@ -143,8 +147,13 @@ public void run() { } } - public int getBatchesRead() { return batchesRead; } - public long bytesRead() { return reader.bytesRead(); } + public int getBatchesRead() { + return batchesRead; + } + + public long bytesRead() { + return reader.bytesRead(); + } } // Starts up a producer and consumer thread to read/write batches. diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java index 311cada194eaf..24b2138386da1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file.json; import java.io.File; @@ -53,7 +54,7 @@ public void testWriteReadComplexJSON() throws IOException { // read try ( BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); - ) { + ) { JsonFileReader reader = new JsonFileReader(file, readerAllocator); Schema schema = reader.start(); LOGGER.debug("reading schema: " + schema); @@ -109,7 +110,7 @@ public void testWriteReadUnionJSON() throws IOException { try ( BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); - ) { + ) { JsonFileReader reader = new JsonFileReader(file, readerAllocator); Schema schema = reader.start(); LOGGER.debug("reading schema: " + schema); @@ -174,7 +175,7 @@ public void testWriteReadDictionaryJSON() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } @@ -215,7 +216,7 @@ public void testWriteReadNestedDictionaryJSON() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } @@ -240,7 +241,7 @@ public void testWriteReadNestedDictionaryJSON() throws IOException { public void testSetStructLength() throws IOException { File file = new File("../../integration/data/struct_example.json"); try ( - BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); ) { JsonFileReader reader = new JsonFileReader(file, readerAllocator); Schema schema = reader.start(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index 62c21f7da0db6..f98aeac8c8196 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.pojo; import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; @@ -92,15 +93,15 @@ public void nestedSchema() { childrenBuilder.add(new Field("child3", FieldType.nullable(new Struct()), ImmutableList.of( new Field("child3.1", FieldType.nullable(Utf8.INSTANCE), null), new Field("child3.2", FieldType.nullable(new FloatingPoint(DOUBLE)), ImmutableList.of()) - ))); + ))); childrenBuilder.add(new Field("child4", FieldType.nullable(new List()), ImmutableList.of( new Field("child4.1", FieldType.nullable(Utf8.INSTANCE), null) - ))); - childrenBuilder.add(new Field("child5", FieldType.nullable(new Union(UnionMode.Sparse, new int[] { MinorType.TIMESTAMPMILLI.ordinal(), MinorType.FLOAT8.ordinal() } )), ImmutableList.of( + ))); + childrenBuilder.add(new Field("child5", FieldType.nullable(new Union(UnionMode.Sparse, new int[] {MinorType.TIMESTAMPMILLI.ordinal(), MinorType.FLOAT8.ordinal()})), ImmutableList.of( new Field("child5.1", FieldType.nullable(new Timestamp(TimeUnit.MILLISECOND, null)), null), new Field("child5.2", FieldType.nullable(new FloatingPoint(DOUBLE)), ImmutableList.of()), new Field("child5.3", true, new Timestamp(TimeUnit.MILLISECOND, "UTC"), null) - ))); + ))); Schema initialSchema = new Schema(childrenBuilder.build()); run(initialSchema); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java b/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java index 9678423c0fbbe..f968768f5e67d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.stream; import static java.util.Arrays.asList; @@ -107,13 +108,13 @@ public void testdeSerializeRecordBatchLongMetaData() throws IOException { @Test public void testSerializeRecordBatch() throws IOException { - byte[] validity = new byte[] { (byte)255, 0}; + byte[] validity = new byte[] {(byte) 255, 0}; // second half is "undefined" - byte[] values = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + byte[] values = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; BufferAllocator alloc = new RootAllocator(Long.MAX_VALUE); ArrowBuf validityb = buf(alloc, validity); - ArrowBuf valuesb = buf(alloc, values); + ArrowBuf valuesb = buf(alloc, values); ArrowRecordBatch batch = new ArrowRecordBatch( 16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb)); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java index 84cc10787f7b0..43b0907720f83 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.types.pojo; import static java.util.Arrays.asList; @@ -66,7 +67,7 @@ public void testComplex() throws IOException { field("g", new Timestamp(TimeUnit.MILLISECOND, "UTC")), field("h", new Timestamp(TimeUnit.MICROSECOND, null)), field("i", new Interval(IntervalUnit.DAY_TIME)) - )); + )); roundTrip(schema); assertEquals( "Schema, e: List, f: FloatingPoint(SINGLE), g: Timestamp(MILLISECOND, UTC), h: Timestamp(MICROSECOND, null), i: Interval(DAY_TIME)>", @@ -95,7 +96,7 @@ public void testAll() throws IOException { field("q", new Timestamp(TimeUnit.MILLISECOND, "UTC")), field("r", new Timestamp(TimeUnit.MICROSECOND, null)), field("s", new Interval(IntervalUnit.DAY_TIME)) - )); + )); roundTrip(schema); } @@ -103,7 +104,7 @@ public void testAll() throws IOException { public void testUnion() throws IOException { Schema schema = new Schema(asList( field("d", new Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("da", new Null())) - )); + )); roundTrip(schema); contains(schema, "Sparse"); } @@ -113,7 +114,7 @@ public void testDate() throws IOException { Schema schema = new Schema(asList( field("a", new Date(DateUnit.DAY)), field("b", new Date(DateUnit.MILLISECOND)) - )); + )); roundTrip(schema); assertEquals( "Schema", @@ -123,15 +124,15 @@ public void testDate() throws IOException { @Test public void testTime() throws IOException { Schema schema = new Schema(asList( - field("a", new Time(TimeUnit.SECOND, 32)), - field("b", new Time(TimeUnit.MILLISECOND, 32)), - field("c", new Time(TimeUnit.MICROSECOND, 64)), - field("d", new Time(TimeUnit.NANOSECOND, 64)) + field("a", new Time(TimeUnit.SECOND, 32)), + field("b", new Time(TimeUnit.MILLISECOND, 32)), + field("c", new Time(TimeUnit.MICROSECOND, 64)), + field("d", new Time(TimeUnit.NANOSECOND, 64)) )); roundTrip(schema); assertEquals( - "Schema", - schema.toString()); + "Schema", + schema.toString()); } @Test @@ -145,7 +146,7 @@ public void testTS() throws IOException { field("f", new Timestamp(TimeUnit.MILLISECOND, null)), field("g", new Timestamp(TimeUnit.MICROSECOND, null)), field("h", new Timestamp(TimeUnit.NANOSECOND, null)) - )); + )); roundTrip(schema); assertEquals( "Schema", @@ -157,7 +158,7 @@ public void testInterval() throws IOException { Schema schema = new Schema(asList( field("a", new Interval(IntervalUnit.YEAR_MONTH)), field("b", new Interval(IntervalUnit.DAY_TIME)) - )); + )); roundTrip(schema); contains(schema, "YEAR_MONTH", "DAY_TIME"); } @@ -168,7 +169,7 @@ public void testFP() throws IOException { field("a", new FloatingPoint(FloatingPointPrecision.HALF)), field("b", new FloatingPoint(FloatingPointPrecision.SINGLE)), field("c", new FloatingPoint(FloatingPointPrecision.DOUBLE)) - )); + )); roundTrip(schema); contains(schema, "HALF", "SINGLE", "DOUBLE"); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java index 7cf638e57d849..95b08099c204d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ + package org.apache.arrow.vector.util; import static org.apache.arrow.vector.util.Validator.equalEnough; @@ -26,32 +27,32 @@ public class TestValidator { - @Test - public void testFloatComp() { - assertTrue(equalEnough(912.4140000000002F, 912.414F)); - assertTrue(equalEnough(912.4140000000002D, 912.414D)); - assertTrue(equalEnough(912.414F, 912.4140000000002F)); - assertTrue(equalEnough(912.414D, 912.4140000000002D)); - assertFalse(equalEnough(912.414D, 912.4140001D)); - assertFalse(equalEnough(null, 912.414D)); - assertTrue(equalEnough((Float)null, null)); - assertTrue(equalEnough((Double)null, null)); - assertFalse(equalEnough(912.414D, null)); - assertFalse(equalEnough(Double.MAX_VALUE, Double.MIN_VALUE)); - assertFalse(equalEnough(Double.MIN_VALUE, Double.MAX_VALUE)); - assertTrue(equalEnough(Double.MAX_VALUE, Double.MAX_VALUE)); - assertTrue(equalEnough(Double.MIN_VALUE, Double.MIN_VALUE)); - assertTrue(equalEnough(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY)); - assertFalse(equalEnough(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); - assertTrue(equalEnough(Double.NaN, Double.NaN)); - assertFalse(equalEnough(1.0, Double.NaN)); - assertFalse(equalEnough(Float.MAX_VALUE, Float.MIN_VALUE)); - assertFalse(equalEnough(Float.MIN_VALUE, Float.MAX_VALUE)); - assertTrue(equalEnough(Float.MAX_VALUE, Float.MAX_VALUE)); - assertTrue(equalEnough(Float.MIN_VALUE, Float.MIN_VALUE)); - assertTrue(equalEnough(Float.NEGATIVE_INFINITY, Float.NEGATIVE_INFINITY)); - assertFalse(equalEnough(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)); - assertTrue(equalEnough(Float.NaN, Float.NaN)); - assertFalse(equalEnough(1.0F, Float.NaN)); - } + @Test + public void testFloatComp() { + assertTrue(equalEnough(912.4140000000002F, 912.414F)); + assertTrue(equalEnough(912.4140000000002D, 912.414D)); + assertTrue(equalEnough(912.414F, 912.4140000000002F)); + assertTrue(equalEnough(912.414D, 912.4140000000002D)); + assertFalse(equalEnough(912.414D, 912.4140001D)); + assertFalse(equalEnough(null, 912.414D)); + assertTrue(equalEnough((Float) null, null)); + assertTrue(equalEnough((Double) null, null)); + assertFalse(equalEnough(912.414D, null)); + assertFalse(equalEnough(Double.MAX_VALUE, Double.MIN_VALUE)); + assertFalse(equalEnough(Double.MIN_VALUE, Double.MAX_VALUE)); + assertTrue(equalEnough(Double.MAX_VALUE, Double.MAX_VALUE)); + assertTrue(equalEnough(Double.MIN_VALUE, Double.MIN_VALUE)); + assertTrue(equalEnough(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY)); + assertFalse(equalEnough(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); + assertTrue(equalEnough(Double.NaN, Double.NaN)); + assertFalse(equalEnough(1.0, Double.NaN)); + assertFalse(equalEnough(Float.MAX_VALUE, Float.MIN_VALUE)); + assertFalse(equalEnough(Float.MIN_VALUE, Float.MAX_VALUE)); + assertTrue(equalEnough(Float.MAX_VALUE, Float.MAX_VALUE)); + assertTrue(equalEnough(Float.MIN_VALUE, Float.MIN_VALUE)); + assertTrue(equalEnough(Float.NEGATIVE_INFINITY, Float.NEGATIVE_INFINITY)); + assertFalse(equalEnough(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)); + assertTrue(equalEnough(Float.NaN, Float.NaN)); + assertFalse(equalEnough(1.0F, Float.NaN)); + } } diff --git a/python/.flake8.cython b/python/.flake8.cython new file mode 100644 index 0000000000000..53e41323051f9 --- /dev/null +++ b/python/.flake8.cython @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[flake8] +filename = *.pyx,*.pxd,*.pxi +ignore = E211,E901,E225,E226,E227
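+# Note: these codes are ignored because otherwise-valid Cython syntax trips +# them. Assuming the standard pycodestyle meanings: E211 is whitespace before +# a bracket or paren, E225/E226/E227 are missing whitespace around operators +# (which casts such as <double*> trigger), and E901 is a syntax error +# (reported because flake8 parses Cython sources as plain Python).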
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index bfae157ed6b9c..af95073f5da35 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -92,6 +92,10 @@ else() # Cython generates some bitshift expressions that MSVC does not like in # __Pyx_PyFloat_DivideObjC set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4293") + + # Converting to/from C++ bool is pretty wonky in Cython. The C4800 warning + # seems harmless, and is probably not worth the effort of working around it + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4800") endif() if ("${COMPILER_FAMILY}" STREQUAL "clang") diff --git a/python/README.md b/python/README.md index 29d213babd93b..3c48d5d30b595 100644 --- a/python/README.md +++ b/python/README.md @@ -38,7 +38,21 @@ On Linux, you can also install binary wheels from PyPI with pip: pip install pyarrow ``` -### Development details +## Development + +### Coding Style + +We follow a PEP8-like coding style similar to that of the [pandas project][3]. + +The code must pass `flake8` (available from pip or conda) or it will fail the +build. Check for style errors before submitting your pull request with: + +``` +flake8 pyarrow +flake8 --config=.flake8.cython pyarrow +``` + +### Building from Source See the [Development][2] page in the documentation. @@ -50,4 +64,5 @@ python setup.py build_sphinx -s doc/source ``` [1]: https://github.com/apache/parquet-cpp -[2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst \ No newline at end of file +[2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst +[3]: https://github.com/pandas-dev/pandas \ No newline at end of file diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index fd1cb728d9828..1aaf89ce9a1f0 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -91,13 +91,14 @@ Scalar Value Types .. _api.array: -Array Types and Constructors ---------------------------- +.. currentmodule:: pyarrow.lib + +Array Types +----------- .. autosummary:: :toctree: generated/ - array Array BooleanArray DictionaryArray @@ -126,6 +127,8 @@ Array Types and Constructors .. _api.table: +.. currentmodule:: pyarrow + Tables and Record Batches ------------------------- @@ -214,6 +217,8 @@ Memory Pools .. _api.type_classes: +.. currentmodule:: pyarrow.lib + Type Classes ------------ diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst index d0a1c544dd091..53544ba7a6ac3 100644 --- a/python/doc/source/development.rst +++ b/python/doc/source/development.rst @@ -159,12 +159,16 @@ Now build and install the Arrow C++ libraries: cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DARROW_PYTHON=on \ + -DARROW_PLASMA=on \ -DARROW_BUILD_TESTS=OFF \ .. make -j4 make install popd +If you don't want to build and install the Plasma in-memory object store, +you can omit the ``-DARROW_PLASMA=on`` flag. + Now, optionally build and install the Apache Parquet libraries in your toolchain: @@ -190,9 +194,10 @@ Now, build pyarrow: cd arrow/python python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ - --with-parquet --inplace + --with-parquet --with-plasma --inplace -If you did not build parquet-cpp, you can omit ``--with-parquet``. +If you did not build parquet-cpp, you can omit ``--with-parquet``, and if +you did not build with Plasma, you can omit ``--with-plasma``. You should be able to run the unit tests with: @@ -224,9 +229,10 @@ You can build a wheel by running: .. code-block:: shell python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ - --with-parquet --bundle-arrow-cpp bdist_wheel + --with-parquet --with-plasma --bundle-arrow-cpp bdist_wheel -Again, if you did not build parquet-cpp, you should omit ``--with-parquet``. +Again, if you did not build parquet-cpp, you should omit ``--with-parquet``, and +if you did not build with Plasma, you should omit ``--with-plasma``. Developing on Windows ===================== diff --git a/python/doc/source/index.rst b/python/doc/source/index.rst index a12853c448209..c2ae769b23e83 100644 --- a/python/doc/source/index.rst +++ b/python/doc/source/index.rst @@ -40,6 +40,7 @@ structures. data ipc filesystems + plasma pandas parquet api diff --git a/python/doc/source/plasma.rst b/python/doc/source/plasma.rst new file mode 100644 index 0000000000000..832d9960cb539 --- /dev/null +++ b/python/doc/source/plasma.rst @@ -0,0 +1,337 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow +.. _plasma: + +The Plasma In-Memory Object Store +================================= + +.. contents:: Contents +   :depth: 3 + + +The Plasma API +-------------- + +Starting the Plasma store +^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can start the Plasma store by issuing a terminal command similar to the +following: + +.. code-block:: bash + + plasma_store -m 1000000000 -s /tmp/plasma + +The ``-m`` flag specifies the size of the store in bytes, and the ``-s`` flag +specifies the socket that the store will listen on. Thus, the above command +allows the Plasma store to use up to 1GB of memory, and sets the socket to +``/tmp/plasma``. + +Leave the current terminal window open for as long as the Plasma store should +keep running. Messages, such as those concerning disconnecting clients, may +occasionally be printed to the screen. To stop running the Plasma store, you +can press ``Ctrl-C`` in the terminal. + +Creating a Plasma client +^^^^^^^^^^^^^^^^^^^^^^^^ + +To start a Plasma client from Python, call ``plasma.connect`` using the same +socket name: + +.. 
code-block:: python + + import pyarrow.plasma as plasma + client = plasma.connect("/tmp/plasma", "", 0) + +If running the above Python code produces the following error, then either the +given socket name is incorrect or the ``plasma_store`` process is not +currently running. Check to see if the Plasma store is still running. + +.. code-block:: shell + + >>> client = plasma.connect("/tmp/plasma", "", 0) + Connection to socket failed for pathname /tmp/plasma + Could not connect to socket /tmp/plasma + + +Object IDs +^^^^^^^^^^ + +Each object in the Plasma store should be associated with a unique ID. The +Object ID then serves as a key that any client can use to retrieve that object +from the Plasma store. You can form an ``ObjectID`` object from a byte string of +length 20. + +.. code-block:: shell + + # Create an ObjectID. + >>> id = plasma.ObjectID(20 * b"a") + + # The character "a" is encoded as 61 in hex. + >>> id + ObjectID(6161616161616161616161616161616161616161) + +The random generation of Object IDs is often good enough to ensure unique IDs. +You can easily create a helper function that randomly generates object IDs as +follows: + +.. code-block:: python + + import numpy as np + + def random_object_id(): + return plasma.ObjectID(np.random.bytes(20)) + + +Creating an Object +^^^^^^^^^^^^^^^^^^ + +Objects are created in Plasma in two stages. First, they are **created**, which +allocates a buffer for the object. At this point, the client can write to the +buffer and construct the object within the allocated buffer. + +To create an object for Plasma, you need to create an object ID, as well as +give the object's maximum size in bytes. + +.. code-block:: python + + # Create an object. + object_id = plasma.ObjectID(20 * b"a") + object_size = 1000 + buffer = memoryview(client.create(object_id, object_size)) + + # Write to the buffer. + for i in range(1000): + buffer[i] = i % 128 + +When the client is done, the client **seals** the buffer, making the object +immutable, and making it available to other Plasma clients. + +.. code-block:: python + + # Seal the object. This makes the object immutable and available to other clients. + client.seal(object_id) + + +Getting an Object +^^^^^^^^^^^^^^^^^ + +After an object has been sealed, any client that knows the object ID can get +the object. + +.. code-block:: python + + # Create a different client. Note that this second client could be + # created in the same or in a separate, concurrent Python session. + client2 = plasma.connect("/tmp/plasma", "", 0) + + # Get the object in the second client. This blocks until the object has been sealed. + object_id2 = plasma.ObjectID(20 * b"a") + [buffer2] = client2.get([object_id2]) + +If the object has not been sealed yet, then the call to ``client.get`` will block +until the object has been sealed by the client constructing the object. Using +the ``timeout_ms`` argument to ``get``, you can specify a timeout for this (in +milliseconds). After the timeout expires, ``get`` returns and yields control +back to the interpreter.
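+ +For example, a minimal sketch of a bounded wait (the 100 millisecond value is +arbitrary, and what ``get`` returns for an object that is still unsealed when +the timeout expires depends on the pyarrow version, so check the result before +using it): + +.. code-block:: python + + # Wait at most 100 ms for the object instead of blocking indefinitely. + buffers = client2.get([object_id2], timeout_ms=100)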
+ +.. code-block:: shell + + >>> buffer + <memory at 0x...> + >>> buffer[1] + 1 + >>> buffer2 + <plasma.PlasmaBuffer object at 0x...> + >>> view2 = memoryview(buffer2) + >>> view2[1] + 1 + >>> view2[129] + 1 + >>> bytes(buffer[1:4]) + b'\x01\x02\x03' + >>> bytes(view2[1:4]) + b'\x01\x02\x03' + + +Using Arrow and Pandas with Plasma +---------------------------------- + +Storing Arrow Objects in Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To store an Arrow object in Plasma, we must first **create** the object and then +**seal** it. However, Arrow objects such as ``Tensors`` may be more complicated +to write than simple binary data. + +To create the object in Plasma, you still need an ``ObjectID`` and a size to +pass in. To find out the size of your Arrow object, you can use pyarrow +APIs such as ``pyarrow.get_tensor_size``. + +.. code-block:: python + + import numpy as np + import pyarrow as pa + + # Create a pyarrow.Tensor object from a numpy random 2-dimensional array + data = np.random.randn(10, 4) + tensor = pa.Tensor.from_numpy(data) + + # Create the object in Plasma + object_id = plasma.ObjectID(np.random.bytes(20)) + data_size = pa.get_tensor_size(tensor) + buf = client.create(object_id, data_size) + +To write the Arrow ``Tensor`` object into the buffer, wrap the Plasma-allocated +buffer in a ``pyarrow.FixedSizeBufferOutputStream`` object. A +``pyarrow.FixedSizeBufferOutputStream`` is a sink suitable for Arrow's +``pyarrow.write_tensor``: + +.. code-block:: python + + # Write the tensor into the Plasma-allocated buffer + stream = pa.FixedSizeBufferOutputStream(buf) + pa.write_tensor(tensor, stream) # Writes tensor's 552 bytes to Plasma stream + +To finish storing the Arrow object in Plasma, call ``seal``: + +.. code-block:: python + + # Seal the Plasma object + client.seal(object_id) + +Getting Arrow Objects from Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To read the object, first retrieve it as a ``PlasmaBuffer`` using its object ID. + +.. code-block:: python + + # Get the arrow object by ObjectID. + [buf2] = client.get([object_id]) + +To convert the ``PlasmaBuffer`` back into an Arrow ``Tensor``, first create a +pyarrow ``BufferReader`` object from it. You can then pass the ``BufferReader`` +into ``pyarrow.read_tensor`` to reconstruct the Arrow ``Tensor`` object: + +.. code-block:: python + + # Reconstruct the Arrow tensor object. + reader = pa.BufferReader(buf2) + tensor2 = pa.read_tensor(reader) + +Finally, call the tensor's ``to_numpy`` method to convert the Arrow object +back into numpy data: + +.. code-block:: python + + # Convert back to numpy + array = tensor2.to_numpy() + +Storing Pandas DataFrames in Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Storing a Pandas ``DataFrame`` still follows the **create** then **seal** +process of storing an object in the Plasma store; however, one cannot directly +write the ``DataFrame`` to Plasma with Pandas alone, because Plasma needs to +know the size of the ``DataFrame`` up front in order to allocate a buffer for it. + +See :ref:`pandas` for more information on using Arrow with Pandas. + +You can create the pyarrow equivalent of a Pandas ``DataFrame`` by using +``pyarrow.RecordBatch.from_pandas`` to convert it to a ``RecordBatch``. + +.. code-block:: python + + import pyarrow as pa + import pandas as pd + + # Create a Pandas DataFrame + d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']), + 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + df = pd.DataFrame(d) + + # Convert the Pandas DataFrame into a PyArrow RecordBatch + record_batch = pa.RecordBatch.from_pandas(df) + +Creating the Plasma object requires an ``ObjectID`` and the size of the +data. Now that we have converted the Pandas ``DataFrame`` into a PyArrow +``RecordBatch``, use the ``MockOutputStream`` to determine the +size of the Plasma object. + +.. code-block:: python + + # Create the Plasma object from the PyArrow RecordBatch. Most of the work here + # is done to determine the size of the buffer to request from the object store.
+ object_id = plasma.ObjectID(np.random.bytes(20)) + mock_sink = pa.MockOutputStream() + stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) + stream_writer.write_batch(record_batch) + stream_writer.close() + data_size = mock_sink.size() + buf = client.create(object_id, data_size) + +The DataFrame can now be written to the buffer as follows. + +.. code-block:: python + + # Write the PyArrow RecordBatch to Plasma + stream = pa.FixedSizeBufferOutputStream(buf) + stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema) + stream_writer.write_batch(record_batch) + stream_writer.close() + +Finally, seal the finished object for use by all clients: + +.. code-block:: python + + # Seal the Plasma object + client.seal(object_id) + +Getting Pandas DataFrames from Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since we store the Pandas DataFrame as a PyArrow ``RecordBatch`` object, +to get the object back from the Plasma store, we follow similar steps +to those specified in `Getting Arrow Objects from Plasma`_. + +We first have to convert the ``PlasmaBuffer`` returned from ``client.get`` +into an Arrow ``BufferReader`` object. + +.. code-block:: python + + # Fetch the Plasma object + [data] = client.get([object_id]) # Get PlasmaBuffer from ObjectID + buffer = pa.BufferReader(data) + +From the ``BufferReader``, we can create a ``RecordBatchStreamReader`` +in Arrow to reconstruct the stored PyArrow ``RecordBatch`` object. + +.. code-block:: python + + # Convert object back into an Arrow RecordBatch + reader = pa.RecordBatchStreamReader(buffer) + record_batch = reader.read_next_batch() + +The last step is to convert the PyArrow ``RecordBatch`` object back into +the original Pandas ``DataFrame`` structure. + +.. code-block:: python + + # Convert back into Pandas + result = record_batch.to_pandas()
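+ +Putting the pieces together, the whole round trip condenses into the sketch +below (assuming the store from `Starting the Plasma store`_ is running, and +``df`` is the ``DataFrame`` defined above): + +.. code-block:: python + + # Store: measure the serialized size, then create, write, and seal. + client = plasma.connect("/tmp/plasma", "", 0) + record_batch = pa.RecordBatch.from_pandas(df) + mock_sink = pa.MockOutputStream() + writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) + writer.write_batch(record_batch) + writer.close() + object_id = plasma.ObjectID(np.random.bytes(20)) + buf = client.create(object_id, mock_sink.size()) + writer = pa.RecordBatchStreamWriter(pa.FixedSizeBufferOutputStream(buf), + record_batch.schema) + writer.write_batch(record_batch) + writer.close() + client.seal(object_id) + + # Retrieve: fetch the buffer and rebuild the DataFrame. + [data] = client.get([object_id]) + reader = pa.RecordBatchStreamReader(pa.BufferReader(data)) + result = reader.read_next_batch().to_pandas()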
diff --git a/python/examples/plasma/sorting/multimerge.pyx b/python/examples/plasma/sorting/multimerge.pyx new file mode 100644 index 0000000000000..6dd5aaef95cb9 --- /dev/null +++ b/python/examples/plasma/sorting/multimerge.pyx @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from libc.stdint cimport uintptr_t +from libcpp.vector cimport vector +from libcpp.pair cimport pair + +cimport numpy as np +import numpy as np + + +cdef extern from "<queue>" namespace "std" nogil: + cdef cppclass priority_queue[T]: + priority_queue() except + + priority_queue(priority_queue&) except + + bint empty() + void pop() + void push(T&) + size_t size() + T& top() + + +def multimerge2d(*arrays): + """Merge a list of sorted 2d arrays into a sorted 2d array. + + This assumes C style ordering for both input and output arrays. For + each input array we have array[i,0] <= array[i+1,0] and for the output + array the same will hold. + + Ideally this code would be simpler and also support both C style + and Fortran style ordering. + """ + cdef int num_arrays = len(arrays) + assert num_arrays > 0 + + cdef int num_cols = arrays[0].shape[1] + + for i in range(num_arrays): + assert arrays[i].ndim == 2 + assert arrays[i].dtype == np.float64 + assert arrays[i].shape[1] == num_cols + assert not np.isfortran(arrays[i]) + + cdef vector[double*] data + + # The indices vector keeps track of the index of the next row to process in + # each array. + cdef vector[int] indices = num_arrays * [0] + + # The sizes vector stores the total number of elements that each array has. + cdef vector[int] sizes + + cdef priority_queue[pair[double, int]] queue + cdef pair[double, int] top + cdef int num_rows = sum([array.shape[0] for array in arrays]) + cdef np.ndarray[np.float64_t, ndim=2] result = np.zeros( + (num_rows, num_cols), dtype=np.float64) + cdef double* result_ptr = <double*> np.PyArray_DATA(result) + for i in range(num_arrays): + if arrays[i].size > 0: + sizes.push_back(arrays[i].size) + data.push_back(<double*> np.PyArray_DATA(arrays[i])) + queue.push(pair[double, int](-data[i][0], i)) + + cdef int curr_idx = 0 + cdef int j + cdef int col = 0 + + for j in range(num_rows): + top = queue.top() + for col in range(num_cols): + result_ptr[curr_idx + col] = ( + data[top.second][indices[top.second] + col]) + + indices[top.second] += num_cols + curr_idx += num_cols + + queue.pop() + if indices[top.second] < sizes[top.second]: + queue.push( + pair[double, int](-data[top.second][indices[top.second]], + top.second)) + + return result
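+ + +# A hypothetical usage sketch (assuming the extension has been built with the +# accompanying setup.py and imported as `multimerge`): +# +# import numpy as np +# import multimerge +# a = np.array([[1.0], [4.0]]) +# b = np.array([[2.0], [3.0]]) +# multimerge.multimerge2d(a, b) # -> [[1.], [2.], [3.], [4.]]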
diff --git a/python/examples/plasma/sorting/setup.py b/python/examples/plasma/sorting/setup.py new file mode 100644 index 0000000000000..a578085a8e4cc --- /dev/null +++ b/python/examples/plasma/sorting/setup.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +from distutils.core import setup +from Cython.Build import cythonize + +setup( + name="multimerge", + extra_compile_args=["-O3", "-mtune=native", "-march=native"], + ext_modules=cythonize("multimerge.pyx"), + include_dirs=[np.get_include()], +) diff --git a/python/examples/plasma/sorting/sort_df.py b/python/examples/plasma/sorting/sort_df.py new file mode 100644 index 0000000000000..03cfd13c6d76f --- /dev/null +++ b/python/examples/plasma/sorting/sort_df.py @@ -0,0 +1,204 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from multiprocessing import Pool +import numpy as np +import os +import pandas as pd +import pyarrow as pa +import pyarrow.plasma as plasma +import subprocess +import time + +import multimerge + +# To run this example, you will first need to run "python setup.py install" in +# this directory to build the Cython module. +# +# You will only see speedups if you run this code on more data; this is just a +# small example that can run on a laptop. +# +# The values we used to get a speedup (on an m4.10xlarge instance on EC2) were +# object_store_size = 84 * 10 ** 9 +# num_cores = 20 +# num_rows = 10 ** 9 +# num_cols = 1 + +client = None +object_store_size = 2 * 10 ** 9 # 2 GB +num_cores = 8 +num_rows = 200000 +num_cols = 2 +column_names = [str(i) for i in range(num_cols)] +column_to_sort = column_names[0] + + +# Connect each worker process to the object store +def connect(): + global client + client = plasma.connect('/tmp/store', '', 0) + np.random.seed(int(time.time() * 10e7) % 10000000) + + +def put_df(df): + record_batch = pa.RecordBatch.from_pandas(df) + + # Get size of record batch and schema + mock_sink = pa.MockOutputStream() + stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) + stream_writer.write_batch(record_batch) + data_size = mock_sink.size() + + # Generate an ID and allocate a buffer in the object store for the + # serialized DataFrame + object_id = plasma.ObjectID(np.random.bytes(20)) + buf = client.create(object_id, data_size) + + # Write the serialized DataFrame to the object store + sink = pa.FixedSizeBufferOutputStream(buf) + stream_writer = pa.RecordBatchStreamWriter(sink, record_batch.schema) + stream_writer.write_batch(record_batch) + + # Seal the object + client.seal(object_id) + + return object_id + + +def get_dfs(object_ids): + """Retrieve dataframes from the object store given their object IDs.""" + buffers = client.get(object_ids) + return [pa.RecordBatchStreamReader(buf).read_next_batch().to_pandas() + for buf in buffers] + + +def local_sort(object_id): + """Sort a partition of a dataframe.""" + # Get the dataframe from the object store. + [df] = get_dfs([object_id]) + # Sort the dataframe. + sorted_df = df.sort_values(by=column_to_sort) + # Get evenly spaced values from the dataframe. + indices = np.linspace(0, len(df) - 1, num=num_cores, dtype=np.int64) + # Put the sorted dataframe in the object store and return the corresponding + # object ID as well as the sampled values. + return put_df(sorted_df), sorted_df.as_matrix().take(indices) + + +def local_partitions(object_id_and_pivots): + """Take a sorted partition of a dataframe and split it into more pieces.""" + object_id, pivots = object_id_and_pivots + [df] = get_dfs([object_id]) + split_at = df[column_to_sort].searchsorted(pivots) + split_at = [0] + list(split_at) + [len(df)] + # Partition the sorted dataframe and put each partition into the object + # store.
+ return [put_df(df[i:j]) for i, j in zip(split_at[:-1], split_at[1:])] + + +def merge(object_ids): + """Merge a number of sorted dataframes into a single sorted dataframe.""" + dfs = get_dfs(object_ids) + + # In order to use our multimerge code, we have to convert the arrays from + # Fortran (column-major) order to C (row-major) order. + arrays = [np.ascontiguousarray(df.as_matrix()) for df in dfs] + for a in arrays: + assert a.dtype == np.float64 + assert not np.isfortran(a) + + # Filter out empty arrays. + arrays = [a for a in arrays if a.shape[0] > 0] + + if len(arrays) == 0: + return None + + resulting_array = multimerge.multimerge2d(*arrays) + merged_df2 = pd.DataFrame(resulting_array, columns=column_names) + + return put_df(merged_df2) + + +if __name__ == '__main__': + # Start the plasma store. + p = subprocess.Popen(['plasma_store', + '-s', '/tmp/store', + '-m', str(object_store_size)]) + + # Connect to the plasma store. + connect() + + # Create the worker pool; each worker connects via the initializer. + pool = Pool(initializer=connect, initargs=(), processes=num_cores) + + # Create a DataFrame from a numpy array. + df = pd.DataFrame(np.random.randn(num_rows, num_cols), + columns=column_names) + + partition_ids = [put_df(partition) for partition + in np.split(df, num_cores)] + + # Begin timing the parallel sort example. + parallel_sort_start = time.time() + + # Sort each partition and subsample them. The subsampled values will be + # used to create buckets. + sorted_df_ids, pivot_groups = list(zip(*pool.map(local_sort, + partition_ids))) + + # Choose the pivots. + all_pivots = np.concatenate(pivot_groups) + indices = np.linspace(0, len(all_pivots) - 1, num=num_cores, + dtype=np.int64) + pivots = np.take(np.sort(all_pivots), indices) + + # Break all of the sorted partitions into even smaller partitions. Group + # the object IDs from each bucket together. + results = list(zip(*pool.map(local_partitions, + zip(sorted_df_ids, + len(sorted_df_ids) * [pivots])))) + + # Merge each of the buckets and store the results in the object store. + object_ids = pool.map(merge, results) + + resulting_ids = [object_id for object_id in object_ids + if object_id is not None] + + # Stop timing the parallel sort example. + parallel_sort_end = time.time() + + print('Parallel sort took {} seconds.' + .format(parallel_sort_end - parallel_sort_start)) + + serial_sort_start = time.time() + + original_sorted_df = df.sort_values(by=column_to_sort) + + serial_sort_end = time.time() + + # Check that we sorted the DataFrame properly. + + sorted_dfs = get_dfs(resulting_ids) + sorted_df = pd.concat(sorted_dfs) + + print('Serial sort took {} seconds.' + .format(serial_sort_end - serial_sort_start)) + + assert np.allclose(sorted_df.values, original_sorted_df.values) + + # Kill the object store. + p.kill() diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 5a21e36e4d7d5..074bd0056a948 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -58,7 +58,7 @@ for PYTHON in ${PYTHON_VERSIONS}; do ARROW_BUILD_DIR=/arrow/cpp/build-PY${PYTHON} mkdir -p "${ARROW_BUILD_DIR}" pushd "${ARROW_BUILD_DIR}" - PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=ON -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON .. 
+ PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=off -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON .. make -j5 install popd @@ -81,9 +81,7 @@ for PYTHON in ${PYTHON_VERSIONS}; do source /venv-test-${PYTHON}/bin/activate pip install repaired_wheels/*.whl - # ARROW-1264; for some reason the test case added causes a segfault inside - # the Docker container when writing and error message to stderr - py.test --parquet /venv-test-${PYTHON}/lib/*/site-packages/pyarrow -v -s --disable-plasma + py.test --parquet /venv-test-${PYTHON}/lib/*/site-packages/pyarrow -v deactivate mv repaired_wheels/*.whl /io/dist diff --git a/python/pyarrow/_config.pyx b/python/pyarrow/_config.pyx index a2d2d719e68d0..bc9f36d8e50cb 100644 --- a/python/pyarrow/_config.pyx +++ b/python/pyarrow/_config.pyx @@ -19,6 +19,10 @@ # distutils: language = c++ # cython: embedsignature = True +import numpy as np +import multiprocessing +import os + cdef extern from 'arrow/python/init.h': int arrow_init_numpy() except -1 @@ -27,15 +31,13 @@ cdef extern from 'arrow/python/config.h' namespace 'arrow::py': arrow_init_numpy() -import numpy as np set_numpy_nan(np.nan) -import multiprocessing -import os cdef int CPU_COUNT = int( os.environ.get('OMP_NUM_THREADS', max(multiprocessing.cpu_count() // 2, 1))) + def cpu_count(): """ Returns @@ -49,6 +51,7 @@ def cpu_count(): """ return CPU_COUNT + def set_cpu_count(count): global CPU_COUNT CPU_COUNT = max(int(count), 1) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index b1cd5eb2c2be0..ced654915e57e 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -21,21 +21,23 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, CTable, CMemoryPool, CKeyValueMetadata, - RandomAccessFile, OutputStream) + RandomAccessFile, OutputStream, + TimeUnit) cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: - cdef cppclass Node: - pass + cdef cppclass Node: + pass + + cdef cppclass GroupNode(Node): + pass - cdef cppclass GroupNode(Node): - pass + cdef cppclass PrimitiveNode(Node): + pass - cdef cppclass PrimitiveNode(Node): - pass + cdef cppclass ColumnPath: + c_string ToDotString() - cdef cppclass ColumnPath: - c_string ToDotString() cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: enum ParquetType" parquet::Type::type": @@ -59,8 +61,10 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetLogicalType_DATE" parquet::LogicalType::DATE" ParquetLogicalType_TIME_MILLIS" parquet::LogicalType::TIME_MILLIS" ParquetLogicalType_TIME_MICROS" parquet::LogicalType::TIME_MICROS" - ParquetLogicalType_TIMESTAMP_MILLIS" parquet::LogicalType::TIMESTAMP_MILLIS" - ParquetLogicalType_TIMESTAMP_MICROS" parquet::LogicalType::TIMESTAMP_MICROS" + ParquetLogicalType_TIMESTAMP_MILLIS \ + " parquet::LogicalType::TIMESTAMP_MILLIS" + ParquetLogicalType_TIMESTAMP_MICROS \ + " parquet::LogicalType::TIMESTAMP_MICROS" ParquetLogicalType_UINT_8" parquet::LogicalType::UINT_8" ParquetLogicalType_UINT_16" parquet::LogicalType::UINT_16" ParquetLogicalType_UINT_32" parquet::LogicalType::UINT_32" @@ -83,8 +87,10 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetEncoding_PLAIN_DICTIONARY" parquet::Encoding::PLAIN_DICTIONARY" 
ParquetEncoding_RLE" parquet::Encoding::RLE" ParquetEncoding_BIT_PACKED" parquet::Encoding::BIT_PACKED" - ParquetEncoding_DELTA_BINARY_PACKED" parquet::Encoding::DELTA_BINARY_PACKED" - ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY" parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY" + ParquetEncoding_DELTA_BINARY_PACKED \ + " parquet::Encoding::DELTA_BINARY_PACKED" + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY \ + " parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY" ParquetEncoding_DELTA_BYTE_ARRAY" parquet::Encoding::DELTA_BYTE_ARRAY" ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY" @@ -231,13 +237,15 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: - CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, - const shared_ptr[const CKeyValueMetadata]& key_value_metadata, - shared_ptr[CSchema]* out) + CStatus FromParquetSchema( + const SchemaDescriptor* parquet_schema, + const shared_ptr[const CKeyValueMetadata]& key_value_metadata, + shared_ptr[CSchema]* out) - CStatus ToParquetSchema(const CSchema* arrow_schema, - const shared_ptr[const CKeyValueMetadata]& key_value_metadata, - shared_ptr[SchemaDescriptor]* out) + CStatus ToParquetSchema( + const CSchema* arrow_schema, + const shared_ptr[const CKeyValueMetadata]& key_value_metadata, + shared_ptr[SchemaDescriptor]* out) cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: @@ -259,5 +267,6 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: Builder() Builder* disable_deprecated_int96_timestamps() Builder* enable_deprecated_int96_timestamps() + Builder* coerce_timestamps(TimeUnit unit) shared_ptr[ArrowWriterProperties] build() c_bool support_deprecated_int96_timestamps() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index c940122da5dcf..f3b7875045904 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -279,8 +279,8 @@ cdef class ColumnSchema: max_repetition_level: {3} physical_type: {4} logical_type: {5}""".format(self.name, self.path, self.max_definition_level, - self.max_repetition_level, physical_type, - logical_type) + self.max_repetition_level, physical_type, + logical_type) property name: @@ -514,7 +514,7 @@ cdef class ParquetReader: with nogil: check_status(self.reader.get() - .ReadSchemaField(field_index, &carray)); + .ReadSchemaField(field_index, &carray)) array.init(carray) return array @@ -547,19 +547,27 @@ cdef class ParquetWriter: cdef readonly: object use_dictionary object use_deprecated_int96_timestamps + object coerce_timestamps object compression object version int row_group_size def __cinit__(self, where, Schema schema, use_dictionary=None, compression=None, version=None, - MemoryPool memory_pool=None, use_deprecated_int96_timestamps=False): + MemoryPool memory_pool=None, + use_deprecated_int96_timestamps=False, + coerce_timestamps=None): cdef: shared_ptr[FileOutputStream] filestream shared_ptr[WriterProperties] properties + c_string c_where + CMemoryPool* pool if isinstance(where, six.string_types): - check_status(FileOutputStream.Open(tobytes(where), &filestream)) + c_where = tobytes(where) + with nogil: + check_status(FileOutputStream.Open(c_where, + &filestream)) self.sink = filestream else: get_writer(where, &self.sink) @@ -568,6 +576,7 @@ cdef class ParquetWriter: self.compression = compression self.version = version self.use_deprecated_int96_timestamps = use_deprecated_int96_timestamps + self.coerce_timestamps 
= coerce_timestamps cdef WriterProperties.Builder properties_builder self._set_version(&properties_builder) @@ -577,13 +586,15 @@ cdef class ParquetWriter: cdef ArrowWriterProperties.Builder arrow_properties_builder self._set_int96_support(&arrow_properties_builder) + self._set_coerce_timestamps(&arrow_properties_builder) arrow_properties = arrow_properties_builder.build() - check_status( - FileWriter.Open(deref(schema.schema), - maybe_unbox_memory_pool(memory_pool), - self.sink, properties, arrow_properties, - &self.writer)) + pool = maybe_unbox_memory_pool(memory_pool) + with nogil: + check_status( + FileWriter.Open(deref(schema.schema), pool, + self.sink, properties, arrow_properties, + &self.writer)) cdef void _set_int96_support(self, ArrowWriterProperties.Builder* props): if self.use_deprecated_int96_timestamps: @@ -591,6 +602,16 @@ cdef class ParquetWriter: else: props.disable_deprecated_int96_timestamps() + cdef int _set_coerce_timestamps( + self, ArrowWriterProperties.Builder* props) except -1: + if self.coerce_timestamps == 'ms': + props.coerce_timestamps(TimeUnit_MILLI) + elif self.coerce_timestamps == 'us': + props.coerce_timestamps(TimeUnit_MICRO) + elif self.coerce_timestamps is not None: + raise ValueError('Invalid value for coerce_timestamps: {0}' + .format(self.coerce_timestamps)) + cdef void _set_version(self, WriterProperties.Builder* props): if self.version is not None: if self.version == "1.0": diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 67418aa5eac67..20e778d068ff8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -16,30 +16,6 @@ # under the License. -cdef maybe_coerce_datetime64(values, dtype, DataType type, - timestamps_to_ms=False): - - from pyarrow.compat import DatetimeTZDtype - - if values.dtype.type != np.datetime64: - return values, type - - coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]' - - if coerce_ms: - values = values.astype('datetime64[ms]') - - if isinstance(dtype, DatetimeTZDtype): - tz = dtype.tz - unit = 'ms' if coerce_ms else dtype.unit - type = timestamp(unit, tz) - elif type is None: - # Trust the NumPy dtype - type = from_numpy_dtype(values.dtype) - - return values, type - - def array(object sequence, DataType type=None, MemoryPool memory_pool=None, size=None): """ @@ -66,25 +42,30 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None, array : pyarrow.Array """ cdef: - shared_ptr[CArray] sp_array - CMemoryPool* pool + shared_ptr[CArray] sp_array + CMemoryPool* pool + int64_t c_size pool = maybe_unbox_memory_pool(memory_pool) if type is None: - check_status(ConvertPySequence(sequence, pool, &sp_array)) + with nogil: + check_status(ConvertPySequence(sequence, pool, &sp_array)) else: if size is None: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type + with nogil: + check_status( + ConvertPySequence( + sequence, pool, &sp_array, type.sp_type + ) ) - ) else: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type, size + c_size = size + with nogil: + check_status( + ConvertPySequence( + sequence, pool, &sp_array, type.sp_type, c_size + ) ) - ) return pyarrow_wrap_array(sp_array) @@ -115,7 +96,8 @@ cdef class Array: self.type = pyarrow_wrap_data_type(self.sp_array.get().type()) def _debug_print(self): - check_status(DebugPrint(deref(self.ap), 0)) + with nogil: + check_status(DebugPrint(deref(self.ap), 0)) @staticmethod def from_pandas(obj, mask=None, DataType type=None, @@ -189,7 +171,8 @@ cdef class Array: if 
isinstance(values, Categorical): return DictionaryArray.from_arrays( values.codes, values.categories.values, - mask=mask, memory_pool=memory_pool) + mask=mask, ordered=values.ordered, + memory_pool=memory_pool) elif values.dtype == object: # Object dtype undergoes a different conversion path as more type # inference may be needed @@ -204,11 +187,13 @@ cdef class Array: else: out = chunked_out.get().chunk(0) else: - values, type = maybe_coerce_datetime64( + values, type = pdcompat.maybe_coerce_datetime64( values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) if type is None: - check_status(NumPyDtypeToArrow(values.dtype, &c_type)) + dtype = values.dtype + with nogil: + check_status(NumPyDtypeToArrow(dtype, &c_type)) else: c_type = type.sp_type @@ -289,10 +274,15 @@ cdef class Array: return pyarrow_wrap_array(result) - def to_pandas(self): + def to_pandas(self, c_bool strings_to_categorical=False): """ Convert to an array object suitable for use in pandas + Parameters + ---------- + strings_to_categorical : boolean, default False + Encode string (UTF8) and binary types to pandas.Categorical + See also -------- Column.to_pandas @@ -301,9 +291,12 @@ cdef class Array: """ cdef: PyObject* out + PandasOptions options + options = PandasOptions(strings_to_categorical=strings_to_categorical) with nogil: - check_status(ConvertArrayToPandas(self.sp_array, self, &out)) + check_status(ConvertArrayToPandas(options, self.sp_array, + self, &out)) return wrap_array_output(out) def to_pylist(self): @@ -342,7 +335,9 @@ strides: {2}""".format(self.type, self.shape, self.strides) @staticmethod def from_numpy(obj): cdef shared_ptr[CTensor] ctensor - check_status(NdarrayToTensor(c_default_memory_pool(), obj, &ctensor)) + with nogil: + check_status(NdarrayToTensor(c_default_memory_pool(), obj, + &ctensor)) return pyarrow_wrap_tensor(ctensor) def to_numpy(self): @@ -352,7 +347,8 @@ strides: {2}""".format(self.type, self.shape, self.strides) cdef: PyObject* out - check_status(TensorToNdarray(deref(self.tp), self, &out)) + with nogil: + check_status(TensorToNdarray(deref(self.tp), self, &out)) return PyObject_to_object(out) def equals(self, Tensor other): @@ -400,7 +396,6 @@ strides: {2}""".format(self.type, self.shape, self.strides) return py_strides - cdef wrap_array_output(PyObject* output): cdef object obj = PyObject_to_object(output) @@ -564,7 +559,7 @@ cdef class DictionaryArray(Array): return self._indices @staticmethod - def from_arrays(indices, dictionary, mask=None, + def from_arrays(indices, dictionary, mask=None, ordered=False, MemoryPool memory_pool=None): """ Construct Arrow DictionaryArray from array of indices (must be @@ -576,6 +571,8 @@ cdef class DictionaryArray(Array): dictionary : ndarray or pandas.Series mask : ndarray or pandas.Series, boolean type True values indicate that indices are actually null + ordered : boolean, default False + Set to True if the category values are ordered Returns ------- @@ -609,8 +606,10 @@ cdef class DictionaryArray(Array): if not isinstance(arrow_indices, IntegerArray): raise ValueError('Indices must be integer type') + cdef c_bool c_ordered = ordered + c_type.reset(new CDictionaryType(arrow_indices.type.sp_type, - arrow_dictionary.sp_array)) + arrow_dictionary.sp_array, c_ordered)) c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array)) result = DictionaryArray() diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 7be35dfc2c81f..2252e85e6ef77 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -132,7 +132,6 
@@ def frombytes(o): def encode_file_path(path): import os - # Windows requires utf-16le encoding for unicode file names if isinstance(path, unicode_type): # POSIX systems can handle utf-8. UTF8 is converted to utf16-le in # libarrow @@ -140,6 +139,8 @@ def encode_file_path(path): else: encoded_path = path + # Windows file system requires utf-16le for file names; Arrow C++ libraries + # will convert utf8 to utf16 return encoded_path diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 2e7cf6c9bd1b8..6a1fa30ba63e7 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -#---------------------------------------------------------------------- +# --------------------------------------------------------------------- # Implement legacy Feather file format @@ -44,7 +44,8 @@ cdef class FeatherWriter: if self.num_rows < 0: self.num_rows = 0 self.writer.get().SetNumRows(self.num_rows) - check_status(self.writer.get().Finalize()) + with nogil: + check_status(self.writer.get().Finalize()) def write_array(self, object name, object col, object mask=None): cdef Array arr diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8d7e27915eede..eed9640861fac 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -132,10 +132,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType): CDictionaryType(const shared_ptr[CDataType]& index_type, - const shared_ptr[CArray]& dictionary) + const shared_ptr[CArray]& dictionary, + c_bool ordered) shared_ptr[CDataType] index_type() shared_ptr[CArray] dictionary() + c_bool ordered() shared_ptr[CDataType] ctimestamp" arrow::timestamp"(TimeUnit unit) shared_ptr[CDataType] ctimestamp" arrow::timestamp"( @@ -176,7 +178,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStringType" arrow::StringType"(CDataType): pass - cdef cppclass CFixedSizeBinaryType" arrow::FixedSizeBinaryType"(CFixedWidthType): + cdef cppclass CFixedSizeBinaryType \ + " arrow::FixedSizeBinaryType"(CFixedWidthType): CFixedSizeBinaryType(int byte_width) int byte_width() int bit_width() @@ -207,7 +210,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CKeyValueMetadata]& metadata) shared_ptr[CField] RemoveMetadata() - cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) @@ -307,9 +309,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructArray" arrow::StructArray"(CArray): CStructArray(shared_ptr[CDataType] type, int64_t length, - vector[shared_ptr[CArray]] children, - shared_ptr[CBuffer] null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0) + vector[shared_ptr[CArray]] children, + shared_ptr[CBuffer] null_bitmap=nullptr, + int64_t null_count=0, + int64_t offset=0) shared_ptr[CArray] field(int pos) const vector[shared_ptr[CArray]] fields() @@ -460,7 +463,6 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: cdef extern from "arrow/io/file.h" namespace "arrow::io" nogil: - cdef cppclass FileOutputStream(OutputStream): @staticmethod CStatus Open(const c_string& path, shared_ptr[FileOutputStream]* file) @@ -477,12 +479,12 @@ cdef extern from "arrow/io/file.h" namespace "arrow::io" nogil: int file_descriptor() - cdef cppclass CMemoryMappedFile" 
arrow::io::MemoryMappedFile"\ - (ReadWriteFileInterface): + cdef cppclass CMemoryMappedFile \ + " arrow::io::MemoryMappedFile"(ReadWriteFileInterface): @staticmethod CStatus Create(const c_string& path, int64_t size, - shared_ptr[CMemoryMappedFile]* file) + shared_ptr[CMemoryMappedFile]* file) @staticmethod CStatus Open(const c_string& path, FileMode mode, @@ -507,7 +509,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: HdfsDriver driver cdef cppclass HdfsPathInfo: - ObjectType kind; + ObjectType kind c_string name c_string owner c_string group @@ -561,21 +563,22 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil: - cdef cppclass CBufferReader" arrow::io::BufferReader"\ - (RandomAccessFile): + cdef cppclass CBufferReader \ + " arrow::io::BufferReader"(RandomAccessFile): CBufferReader(const shared_ptr[CBuffer]& buffer) CBufferReader(const uint8_t* data, int64_t nbytes) - cdef cppclass CBufferOutputStream" arrow::io::BufferOutputStream"\ - (OutputStream): + cdef cppclass CBufferOutputStream \ + " arrow::io::BufferOutputStream"(OutputStream): CBufferOutputStream(const shared_ptr[ResizableBuffer]& buffer) - cdef cppclass CMockOutputStream" arrow::io::MockOutputStream"\ - (OutputStream): + cdef cppclass CMockOutputStream \ + " arrow::io::MockOutputStream"(OutputStream): CMockOutputStream() int64_t GetExtentBytesWritten() - cdef cppclass CFixedSizeBufferWriter" arrow::io::FixedSizeBufferWriter"(WriteableFile): + cdef cppclass CFixedSizeBufferWriter \ + " arrow::io::FixedSizeBufferWriter"(WriteableFile): CFixedSizeBufferWriter(const shared_ptr[CBuffer]& buffer) @@ -607,48 +610,45 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: c_string FormatMessageType(MessageType type) - cdef cppclass CMessageReader \ - " arrow::ipc::MessageReader": + cdef cppclass CMessageReader" arrow::ipc::MessageReader": CStatus ReadNextMessage(unique_ptr[CMessage]* out) cdef cppclass CInputStreamMessageReader \ - " arrow::ipc::InputStreamMessageReader": + " arrow::ipc::InputStreamMessageReader": CInputStreamMessageReader(const shared_ptr[InputStream]& stream) - cdef cppclass CRecordBatchWriter \ - " arrow::ipc::RecordBatchWriter": + cdef cppclass CRecordBatchWriter" arrow::ipc::RecordBatchWriter": CStatus Close() CStatus WriteRecordBatch(const CRecordBatch& batch) - cdef cppclass CRecordBatchReader \ - " arrow::ipc::RecordBatchReader": + cdef cppclass CRecordBatchReader" arrow::ipc::RecordBatchReader": shared_ptr[CSchema] schema() CStatus ReadNextRecordBatch(shared_ptr[CRecordBatch]* batch) cdef cppclass CRecordBatchStreamReader \ - " arrow::ipc::RecordBatchStreamReader"(CRecordBatchReader): + " arrow::ipc::RecordBatchStreamReader"(CRecordBatchReader): @staticmethod CStatus Open(const shared_ptr[InputStream]& stream, shared_ptr[CRecordBatchStreamReader]* out) @staticmethod CStatus Open2" Open"(unique_ptr[CMessageReader] message_reader, - shared_ptr[CRecordBatchStreamReader]* out) + shared_ptr[CRecordBatchStreamReader]* out) cdef cppclass CRecordBatchStreamWriter \ - " arrow::ipc::RecordBatchStreamWriter"(CRecordBatchWriter): + " arrow::ipc::RecordBatchStreamWriter"(CRecordBatchWriter): @staticmethod CStatus Open(OutputStream* sink, const shared_ptr[CSchema]& schema, shared_ptr[CRecordBatchStreamWriter]* out) cdef cppclass CRecordBatchFileWriter \ - " arrow::ipc::RecordBatchFileWriter"(CRecordBatchWriter): + " arrow::ipc::RecordBatchFileWriter"(CRecordBatchWriter): @staticmethod CStatus Open(OutputStream* 
sink, const shared_ptr[CSchema]& schema, shared_ptr[CRecordBatchFileWriter]* out) cdef cppclass CRecordBatchFileReader \ - " arrow::ipc::RecordBatchFileReader": + " arrow::ipc::RecordBatchFileReader": @staticmethod CStatus Open(const shared_ptr[RandomAccessFile]& file, shared_ptr[CRecordBatchFileReader]* out) @@ -722,7 +722,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus ConvertPySequence(object obj, CMemoryPool* pool, shared_ptr[CArray]* out, const shared_ptr[CDataType]& type, - int64_t size) + int64_t size) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) @@ -735,19 +735,23 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CChunkedArray]* out) CStatus NdarrayToTensor(CMemoryPool* pool, object ao, - shared_ptr[CTensor]* out); + shared_ptr[CTensor]* out) CStatus TensorToNdarray(const CTensor& tensor, object base, PyObject** out) - CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, + CStatus ConvertArrayToPandas(PandasOptions options, + const shared_ptr[CArray]& arr, object py_ref, PyObject** out) - CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, + CStatus ConvertColumnToPandas(PandasOptions options, + const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) - CStatus ConvertTableToPandas(const shared_ptr[CTable]& table, - int nthreads, PyObject** out) + CStatus ConvertTableToPandas(PandasOptions options, + const shared_ptr[CTable]& table, + int nthreads, CMemoryPool* pool, + PyObject** out) void c_set_default_memory_pool \ " arrow::py::set_default_memory_pool"(CMemoryPool* pool)\ @@ -767,6 +771,9 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyBytesReader(CBufferReader): PyBytesReader(object fo) + cdef struct PandasOptions: + c_bool strings_to_categorical + cdef extern from 'arrow/python/init.h': int arrow_init_numpy() except -1 diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi index 8ac4e8c2319c1..e6285e465d2be 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/io-hdfs.pxi @@ -29,7 +29,8 @@ except ImportError: def have_libhdfs(): try: - check_status(HaveLibHdfs()) + with nogil: + check_status(HaveLibHdfs()) return True except: return False @@ -37,7 +38,8 @@ def have_libhdfs(): def have_libhdfs3(): try: - check_status(HaveLibHdfs3()) + with nogil: + check_status(HaveLibHdfs3()) return True except: return False @@ -73,10 +75,12 @@ cdef class HadoopFileSystem: conf.kerb_ticket = tobytes(kerb_ticket) if driver == 'libhdfs': - check_status(HaveLibHdfs()) + with nogil: + check_status(HaveLibHdfs()) conf.driver = HdfsDriver_LIBHDFS else: - check_status(HaveLibHdfs3()) + with nogil: + check_status(HaveLibHdfs3()) conf.driver = HdfsDriver_LIBHDFS3 with nogil: @@ -231,7 +235,6 @@ cdef class HadoopFileSystem: check_status(self.client.get() .GetPathInfo(c_path, info)) - def ls(self, path, bint full_info): cdef: c_string c_path = tobytes(path) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 211c2a3e6e9cf..eda8de730281d 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -255,13 +255,18 @@ cdef class NativeFile: if not hasattr(stream_or_path, 'read'): stream = open(stream_or_path, 'wb') - cleanup = lambda: stream.close() + + def cleanup(): + stream.close() else: stream = stream_or_path - cleanup = lambda: None + + def cleanup(): + pass done = False exc_info = None + def bg_write(): try: while not done or write_queue.qsize() > 0: @@ -326,6 +331,7 @@ cdef class NativeFile: done = False exc_info = 
None + def bg_write(): try: while not done or write_queue.qsize() > 0: @@ -441,7 +447,8 @@ cdef class MemoryMappedFile(NativeFile): else: raise ValueError('Invalid file mode: {0}'.format(mode)) - check_status(CMemoryMappedFile.Open(c_path, c_mode, &handle)) + with nogil: + check_status(CMemoryMappedFile.Open(c_path, c_mode, &handle)) self.wr_file = handle self.rd_file = handle @@ -636,7 +643,8 @@ cdef class BufferOutputStream(NativeFile): self.is_open = True def get_result(self): - check_status(self.wr_file.get().Close()) + with nogil: + check_status(self.wr_file.get().Close()) self.is_open = False return pyarrow_wrap_buffer( self.buffer) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 31ee578920eae..ceed4b0e85248 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -163,8 +163,7 @@ cdef class _RecordBatchWriter: self.closed = True def __dealloc__(self): - if not self.closed: - self.close() + pass def _open(self, sink, Schema schema): cdef: @@ -182,11 +181,24 @@ cdef class _RecordBatchWriter: self.closed = False def write_batch(self, RecordBatch batch): + """ + Write RecordBatch to stream + + Parameters + ---------- + batch : RecordBatch + """ with nogil: check_status(self.writer.get() .WriteRecordBatch(deref(batch.batch))) def close(self): + """ + Close stream and write end-of-stream 0 marker + """ + if self.closed: + return + with nogil: check_status(self.writer.get().Close()) self.closed = True @@ -279,7 +291,7 @@ cdef class _RecordBatchFileWriter(_RecordBatchWriter): with nogil: check_status( CRecordBatchFileWriter.Open(self.sink.get(), schema.sp_schema, - &writer)) + &writer)) # Cast to base class, because has same interface self.writer = writer diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 3e1419bdfc072..48a58f7b82660 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -17,6 +17,7 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow cimport CStatus from cpython cimport PyObject from libcpp cimport nullptr @@ -24,9 +25,6 @@ cdef extern from "Python.h": int PySlice_Check(object) -from pyarrow.includes.libarrow cimport CStatus - - cdef int check_status(const CStatus& status) nogil except -1 diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 4df2fcd64f60f..789801b9f06a9 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -19,31 +19,27 @@ # distutils: language = c++ # cython: embedsignature = True -from cython.operator cimport dereference as deref -from pyarrow.includes.libarrow cimport * -from pyarrow.includes.common cimport PyObject_to_object -cimport pyarrow.includes.libarrow as libarrow -cimport cpython as cp - - import datetime import decimal as _pydecimal +import multiprocessing import numpy as np +import os import six from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical +from cython.operator cimport dereference as deref +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.common cimport PyObject_to_object +cimport pyarrow.includes.libarrow as libarrow +cimport cpython as cp + cdef _pandas(): import pandas as pd return pd - arrow_init_numpy() - -import numpy as np set_numpy_nan(np.nan) -import multiprocessing -import os cdef int CPU_COUNT = int( os.environ.get('OMP_NUM_THREADS', max(multiprocessing.cpu_count() // 2, 1))) @@ -62,6 +58,7 @@ def cpu_count(): """ return CPU_COUNT + def set_cpu_count(count): global CPU_COUNT CPU_COUNT = max(int(count), 1) @@ -122,7 +119,5 @@ include "ipc.pxi" # 
Feather format include "feather.pxi" -#---------------------------------------------------------------------- # Public API - include "public-api.pxi" diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index cd7ad47782646..434b1c9eab90e 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -17,6 +17,7 @@ import re import json +import numpy as np import pandas as pd import six @@ -241,7 +242,34 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): return names, arrays, metadata -def table_to_blockmanager(table, nthreads=1): +def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False): + if timestamps_to_ms: + import warnings + warnings.warn('timestamps_to_ms=True is deprecated', FutureWarning) + + from pyarrow.compat import DatetimeTZDtype + + if values.dtype.type != np.datetime64: + return values, type_ + + coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]' + + if coerce_ms: + values = values.astype('datetime64[ms]') + type_ = pa.timestamp('ms') + + if isinstance(dtype, DatetimeTZDtype): + tz = dtype.tz + unit = 'ms' if coerce_ms else dtype.unit + type_ = pa.timestamp(unit, tz) + elif type_ is None: + # Trust the NumPy dtype + type_ = pa.from_numpy_dtype(values.dtype) + + return values, type_ + + +def table_to_blockmanager(options, table, memory_pool, nthreads=1): import pandas.core.internals as _int from pyarrow.compat import DatetimeTZDtype import pyarrow.lib as lib @@ -277,7 +305,7 @@ def table_to_blockmanager(table, nthreads=1): block_table.schema.get_field_index(name) ) - result = lib.table_to_blocks(block_table, nthreads) + result = lib.table_to_blocks(options, block_table, nthreads, memory_pool) blocks = [] for item in result: @@ -286,7 +314,7 @@ def table_to_blockmanager(table, nthreads=1): if 'dictionary' in item: cat = pd.Categorical(block_arr, categories=item['dictionary'], - ordered=False, fastpath=True) + ordered=item['ordered'], fastpath=True) block = _int.make_block(cat, placement=placement, klass=_int.CategoricalBlock, fastpath=True) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 6d39a2354f653..89dbf83ee3523 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -757,7 +757,8 @@ def read_pandas(source, columns=None, nthreads=1, metadata=None): def write_table(table, where, row_group_size=None, version='1.0', use_dictionary=True, compression='snappy', - use_deprecated_int96_timestamps=False, **kwargs): + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, **kwargs): """ Write a Table to Parquet format @@ -773,6 +774,11 @@ def write_table(table, where, row_group_size=None, version='1.0', use_dictionary : bool or list Specify if we should use dictionary encoding in general or only for some columns. + use_deprecated_int96_timestamps : boolean, default False + Write nanosecond resolution timestamps to INT96 Parquet format + coerce_timestamps : string, default None + Cast timestamps to a particular resolution. + Valid values: {None, 'ms', 'us'} compression : str or dict Specify the compression codec, either on a general basis or per-column. 
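+ + Examples + -------- + A minimal sketch (the output path is hypothetical); coerce_timestamps + casts the table's nanosecond timestamps down to milliseconds on write: + + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.Table.from_pandas(df) # df with datetime64[ns] columns + >>> pq.write_table(table, 'example.parquet', version='2.0', + ... coerce_timestamps='ms') 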
""" @@ -781,7 +787,8 @@ def write_table(table, where, row_group_size=None, version='1.0', use_dictionary=use_dictionary, compression=compression, version=version, - use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, + coerce_timestamps=coerce_timestamps) writer = None try: @@ -801,7 +808,8 @@ def write_table(table, where, row_group_size=None, version='1.0', def write_metadata(schema, where, version='1.0', - use_deprecated_int96_timestamps=False): + use_deprecated_int96_timestamps=False, + coerce_timestamps=None): """ Write metadata-only Parquet file from schema @@ -811,10 +819,16 @@ def write_metadata(schema, where, version='1.0', where: string or pyarrow.io.NativeFile version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 + use_deprecated_int96_timestamps : boolean, default False + Write nanosecond resolution timestamps to INT96 Parquet format + coerce_timestamps : string, default None + Cast timestamps a particular resolution. + Valid values: {None, 'ms', 'us'} """ options = dict( version=version, - use_deprecated_int96_timestamps=use_deprecated_int96_timestamps + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, + coerce_timestamps=coerce_timestamps ) writer = ParquetWriter(where, schema, **options) writer.close() diff --git a/python/pyarrow/plasma.pyx b/python/pyarrow/plasma.pyx index dd62d473b001a..befa283d85b54 100644 --- a/python/pyarrow/plasma.pyx +++ b/python/pyarrow/plasma.pyx @@ -60,8 +60,8 @@ cdef extern from "plasma/common.h": PLASMA_QUERY_LOCAL"plasma::PLASMA_QUERY_LOCAL", PLASMA_QUERY_ANYWHERE"plasma::PLASMA_QUERY_ANYWHERE" - cdef int ObjectStatusLocal"plasma::ObjectStatusLocal"; - cdef int ObjectStatusRemote"plasma::ObjectStatusRemote"; + cdef int ObjectStatusLocal"plasma::ObjectStatusLocal" + cdef int ObjectStatusRemote"plasma::ObjectStatusRemote" cdef extern from "plasma/client.h" nogil: @@ -102,7 +102,7 @@ cdef extern from "plasma/client.h" nogil: CStatus Wait(int64_t num_object_requests, CObjectRequest* object_requests, int num_ready_objects, int64_t timeout_ms, - int* num_objects_ready); + int* num_objects_ready) CStatus Transfer(const char* addr, int port, const CUniqueID& object_id) @@ -312,9 +312,10 @@ cdef class PlasmaClient: result = [] for i in range(object_buffers.size()): if object_buffers[i].data_size != -1: - result.append(self._make_plasma_buffer( - object_ids[i], object_buffers[i].data, - object_buffers[i].data_size)) + result.append( + self._make_plasma_buffer(object_ids[i], + object_buffers[i].data, + object_buffers[i].data_size)) else: result.append(None) return result @@ -345,9 +346,10 @@ cdef class PlasmaClient: self._get_object_buffers(object_ids, timeout_ms, &object_buffers) result = [] for i in range(object_buffers.size()): - result.append(self._make_plasma_buffer( - object_ids[i], object_buffers[i].metadata, - object_buffers[i].metadata_size)) + result.append( + self._make_plasma_buffer(object_ids[i], + object_buffers[i].metadata, + object_buffers[i].metadata_size)) return result def seal(self, ObjectID object_id): @@ -502,7 +504,7 @@ cdef class PlasmaClient: object_requests.data(), num_returns, timeout, &num_objects_ready)) - cdef int num_to_return = min(num_objects_ready, num_returns); + cdef int num_to_return = min(num_objects_ready, num_returns) ready_ids = [] waiting_ids = set(object_ids) cdef int num_returned = 0 @@ -510,7 +512,7 @@ cdef class PlasmaClient: if num_returned == num_to_return: break if 
(object_requests[i].status == ObjectStatusLocal or - object_requests[i].status == ObjectStatusRemote): + object_requests[i].status == ObjectStatusRemote): ready_ids.append( ObjectID(object_requests[i].object_id.binary())) waiting_ids.discard( diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 28e07ffc37dc3..7e08f632e872e 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -47,7 +47,8 @@ cdef public api bint pyarrow_is_data_type(object type_): return isinstance(type_, DataType) -cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type(object data_type): +cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type( + object data_type): cdef DataType type_ if pyarrow_is_data_type(data_type): type_ = (data_type) @@ -57,7 +58,7 @@ cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type(object data_type) cdef public api object pyarrow_wrap_data_type( - const shared_ptr[CDataType]& type): + const shared_ptr[CDataType]& type): cdef: DataType out @@ -149,7 +150,7 @@ cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): cdef public api object pyarrow_wrap_chunked_array( - const shared_ptr[CChunkedArray]& sp_array): + const shared_ptr[CChunkedArray]& sp_array): if sp_array.get() == NULL: raise ValueError('ChunkedArray was NULL') @@ -177,7 +178,7 @@ cdef public api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): cdef public api object pyarrow_wrap_tensor( - const shared_ptr[CTensor]& sp_tensor): + const shared_ptr[CTensor]& sp_tensor): if sp_tensor.get() == NULL: raise ValueError('Tensor was NULL') @@ -238,7 +239,7 @@ cdef public api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): cdef public api object pyarrow_wrap_batch( - const shared_ptr[CRecordBatch]& cbatch): + const shared_ptr[CRecordBatch]& cbatch): cdef RecordBatch batch = RecordBatch() batch.init(cbatch) return batch diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 1f72070cb7e12..16d2bad0d2d8d 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -154,11 +154,11 @@ cdef class Time32Value(ArrayValue): CTime32Type* dtype = ap.type().get() if dtype.unit() == TimeUnit_SECOND: - return (datetime.datetime(1970, 1, 1) + - datetime.timedelta(seconds=ap.Value(self.index))).time() + delta = datetime.timedelta(seconds=ap.Value(self.index)) + return (datetime.datetime(1970, 1, 1) + delta).time() else: - return (datetime.datetime(1970, 1, 1) + - datetime.timedelta(milliseconds=ap.Value(self.index))).time() + delta = datetime.timedelta(milliseconds=ap.Value(self.index)) + return (datetime.datetime(1970, 1, 1) + delta).time() cdef class Time64Value(ArrayValue): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 6277761b7d6ec..976f4297d5228 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -134,6 +134,16 @@ cdef class Column: self.sp_column = column self.column = column.get() + def __repr__(self): + from pyarrow.compat import StringIO + result = StringIO() + result.write(object.__repr__(self)) + data = self.data + for i in range(len(data)): + result.write('\nchunk {0}: {1}'.format(i, repr(data.chunk(i)))) + + return result.getvalue() + @staticmethod def from_array(object field_or_name, Array arr): cdef Field boxed_field @@ -147,7 +157,7 @@ cdef class Column: sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array)) return pyarrow_wrap_column(sp_column) - def to_pandas(self): + def to_pandas(self, strings_to_categorical=False): """ Convert the 
arrow::Column to a pandas.Series @@ -157,9 +167,14 @@ cdef class Column: """ cdef: PyObject* out + PandasOptions options - check_status(libarrow.ConvertColumnToPandas(self.sp_column, - self, &out)) + options = PandasOptions(strings_to_categorical=strings_to_categorical) + + with nogil: + check_status(libarrow.ConvertColumnToPandas(options, + self.sp_column, + self, &out)) return pd.Series(wrap_array_output(out), name=self.name) @@ -495,7 +510,6 @@ cdef class RecordBatch: entries.append((name, column)) return OrderedDict(entries) - def to_pandas(self, nthreads=None): """ Convert the arrow::RecordBatch to a pandas DataFrame @@ -570,22 +584,24 @@ cdef class RecordBatch: return pyarrow_wrap_batch(batch) -def table_to_blocks(Table table, int nthreads): +def table_to_blocks(PandasOptions options, Table table, int nthreads, + MemoryPool memory_pool): cdef: PyObject* result_obj shared_ptr[CTable] c_table = table.sp_table + CMemoryPool* pool + pool = maybe_unbox_memory_pool(memory_pool) with nogil: check_status( libarrow.ConvertTableToPandas( - c_table, nthreads, &result_obj + options, c_table, nthreads, pool, &result_obj ) ) return PyObject_to_object(result_obj) - cdef class Table: """ A collection of top-level named, equal length Arrow arrays. @@ -663,13 +679,8 @@ cdef class Table: return result @classmethod - def from_pandas( - cls, - df, - bint timestamps_to_ms=False, - Schema schema=None, - bint preserve_index=True - ): + def from_pandas(cls, df, bint timestamps_to_ms=False, + Schema schema=None, bint preserve_index=True): """ Convert pandas.DataFrame to an Arrow Table @@ -786,7 +797,8 @@ cdef class Table: return pyarrow_wrap_table(c_table) - def to_pandas(self, nthreads=None): + def to_pandas(self, nthreads=None, strings_to_categorical=False, + memory_pool=None): """ Convert the arrow::Table to a pandas DataFrame @@ -796,16 +808,23 @@ cdef class Table: For the default, we divide the CPU count by 2 because most modern computers have hyperthreading turned on, so doubling the CPU count beyond the number of physical cores does not help + strings_to_categorical : boolean, default False + Encode string (UTF8) and binary types to pandas.Categorical + memory_pool: MemoryPool, optional + Specific memory pool to use to allocate casted columns Returns ------- pandas.DataFrame """ + cdef: + PandasOptions options + options = PandasOptions(strings_to_categorical=strings_to_categorical) self._check_nullptr() if nthreads is None: nthreads = cpu_count() - - mgr = pdcompat.table_to_blockmanager(self, nthreads) + mgr = pdcompat.table_to_blockmanager(options, self, memory_pool, + nthreads) return pd.DataFrame(mgr) def to_pydict(self): @@ -897,7 +916,8 @@ cdef class Table: """ Number of rows in this table. - Due to the definition of a table, all columns have the same number of rows. + Due to the definition of a table, all columns have the same number of + rows. Returns ------- diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index ec261595585c2..d18ed9506bbbb 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -16,6 +16,8 @@ # specific language governing permissions and limitations # under the License. 
+import pytest + from pyarrow.compat import unittest, u # noqa import pyarrow as pa @@ -140,6 +142,17 @@ def test_bytes(self): assert arr.type == pa.binary() assert arr.to_pylist() == [b'foo', u1, None] + def test_utf8_to_unicode(self): + # ARROW-1225 + data = [b'foo', None, b'bar'] + arr = pa.array(data, type=pa.string()) + assert arr[0].as_py() == u'foo' + + # test a non-utf8 unicode string + val = (u'mañana').encode('utf-16-le') + with pytest.raises(pa.ArrowException): + pa.array([val], type=pa.string()) + def test_fixed_size_bytes(self): data = [b'foof', None, b'barb', b'2346'] arr = pa.array(data, type=pa.binary(4)) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index d488658563306..8969777b526c0 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -18,7 +18,7 @@ from collections import OrderedDict -import datetime +from datetime import datetime, date, time import unittest import decimal import json @@ -327,7 +327,7 @@ def test_timestamps_notimezone_no_nulls(self): '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - }) + }) field = pa.field('datetime64', pa.timestamp('ms')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -342,7 +342,7 @@ def test_timestamps_notimezone_no_nulls(self): '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - }) + }) field = pa.field('datetime64', pa.timestamp('ns')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -351,6 +351,17 @@ def test_timestamps_notimezone_no_nulls(self): expected_schema=schema, ) + def test_timestamps_to_ms_explicit_schema(self): + # ARROW-1328 + df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]}) + pa_type = pa.from_numpy_dtype(df['datetime'].dtype) + + arr = pa.Array.from_pandas(df['datetime'], type=pa_type, + timestamps_to_ms=True) + + tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'), + arr.to_pandas()) + def test_timestamps_notimezone_nulls(self): df = pd.DataFrame({ 'datetime64': np.array([ @@ -358,7 +369,7 @@ def test_timestamps_notimezone_nulls(self): None, '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - }) + }) field = pa.field('datetime64', pa.timestamp('ms')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -373,7 +384,7 @@ def test_timestamps_notimezone_nulls(self): None, '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - }) + }) field = pa.field('datetime64', pa.timestamp('ns')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -389,7 +400,7 @@ def test_timestamps_with_timezone(self): '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - }) + }) df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') .to_frame()) self._check_pandas_roundtrip(df, timestamps_to_ms=True) @@ -402,17 +413,17 @@ def test_timestamps_with_timezone(self): '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - }) + }) df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') .to_frame()) self._check_pandas_roundtrip(df, timestamps_to_ms=False) def test_date_infer(self): df = pd.DataFrame({ - 'date': [datetime.date(2000, 1, 1), + 'date': [date(2000, 1, 1), None, - datetime.date(1970, 1, 1), - datetime.date(2040, 2, 26)]}) + date(1970, 1, 1), + date(2040, 2, 26)]}) table = pa.Table.from_pandas(df, preserve_index=False) field = pa.field('date', pa.date32()) schema = pa.schema([field]) @@ -424,10 +435,10 @@ def 
test_date_infer(self): def test_date_objects_typed(self): arr = np.array([ - datetime.date(2017, 4, 3), + date(2017, 4, 3), None, - datetime.date(2017, 4, 4), - datetime.date(2017, 4, 5)], dtype=object) + date(2017, 4, 4), + date(2017, 4, 5)], dtype=object) arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32') arr_i8 = arr_i4.astype('int64') * 86400000 @@ -451,7 +462,7 @@ def test_date_objects_typed(self): table_pandas = table.to_pandas() ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04', - '2017-04-05'], + '2017-04-05'], dtype='datetime64[D]') .astype('datetime64[ns]')) ex_values[1] = pd.NaT.value @@ -470,7 +481,7 @@ def test_dates_from_integers(self): a1 = pa.Array.from_pandas(arr, type=t1) a2 = pa.Array.from_pandas(arr2, type=t2) - expected = datetime.date(2017, 4, 3) + expected = date(2017, 4, 3) assert a1[0].as_py() == expected assert a2[0].as_py() == expected @@ -480,10 +491,10 @@ def test_timedelta(self): # TODO(jreback): Pandas only support ns resolution # Arrow supports ??? for resolution df = pd.DataFrame({ - 'timedelta': np.arange(start=0, stop=3*86400000, + 'timedelta': np.arange(start=0, stop=3 * 86400000, step=86400000, dtype='timedelta64[ms]') - }) + }) pa.Table.from_pandas(df) def test_column_of_arrays(self): @@ -523,6 +534,21 @@ def test_column_of_lists(self): field = schema.field_by_name(column) self._check_array_roundtrip(df[column], type=field.type) + def test_nested_lists_all_none(self): + data = np.array([[None, None], None], dtype=object) + + arr = pa.Array.from_pandas(data) + expected = pa.array(list(data)) + assert arr.equals(expected) + assert arr.type == pa.list_(pa.null()) + + data2 = np.array([None, None, [None, None], + np.array([None, None], dtype=object)], + dtype=object) + arr = pa.Array.from_pandas(data2) + expected = pa.array([None, None, [None, None], [None, None]]) + assert arr.equals(expected) + def test_threaded_conversion(self): df = _alltypes_example() self._check_pandas_roundtrip(df, nthreads=2, @@ -536,6 +562,9 @@ def test_category(self): df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats), 'cat_ints': pd.Categorical(v2 * repeats), 'cat_binary': pd.Categorical(v3 * repeats), + 'cat_strings_ordered': pd.Categorical( + v1 * repeats, categories=['bar', 'qux', 'foo'], + ordered=True), 'ints': v2 * repeats, 'ints2': v2 * repeats, 'strings': v1 * repeats, @@ -666,8 +695,8 @@ def test_decimal_128_to_pandas(self): tm.assert_frame_equal(df, expected) def test_pytime_from_pandas(self): - pytimes = [datetime.time(1, 2, 3, 1356), - datetime.time(4, 5, 6, 1356)] + pytimes = [time(1, 2, 3, 1356), + time(4, 5, 6, 1356)] # microseconds t1 = pa.time64('us') @@ -703,9 +732,9 @@ def test_pytime_from_pandas(self): assert a4[0].as_py() == pytimes[0].replace(microsecond=0) def test_arrow_time_to_pandas(self): - pytimes = [datetime.time(1, 2, 3, 1356), - datetime.time(4, 5, 6, 1356), - datetime.time(0, 0, 0)] + pytimes = [time(1, 2, 3, 1356), + time(4, 5, 6, 1356), + time(0, 0, 0)] expected = np.array(pytimes[:2] + [None]) expected_ms = np.array([x.replace(microsecond=1000) @@ -891,6 +920,17 @@ def test_decimal_metadata(self): assert data_column['numpy_type'] == 'object' assert data_column['metadata'] == {'precision': 26, 'scale': 11} + def test_table_str_to_categorical(self): + values = [None, 'a', 'b', np.nan] + df = pd.DataFrame({'strings': values}) + field = pa.field('strings', pa.string()) + schema = pa.schema([field]) + table = pa.Table.from_pandas(df, schema=schema) + + result = table.to_pandas(strings_to_categorical=True) + 
expected = pd.DataFrame({'strings': pd.Categorical(values)}) + tm.assert_frame_equal(result, expected, check_dtype=True) + def _pytime_from_micros(val): microseconds = val % 1000000 @@ -899,7 +939,7 @@ def _pytime_from_micros(val): val //= 60 minutes = val % 60 hours = val // 60 - return datetime.time(hours, minutes, seconds, microseconds) + return time(hours, minutes, seconds, microseconds) def _pytime_to_micros(pytime): diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index c81a0485ce1ee..d503ea22464d5 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -277,6 +277,8 @@ def test_mock_output_stream(): stream_writer1.write_batch(record_batch) stream_writer2.write_batch(record_batch) + stream_writer1.close() + stream_writer2.close() assert f1.size() == len(f2.get_result()) diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 3ad369c31f4f2..120a9825a7b56 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -40,22 +40,20 @@ def _get_sink(self): def _get_source(self): return self.sink.getvalue() - def write_batches(self): + def write_batches(self, num_batches=5): nrows = 5 df = pd.DataFrame({ 'one': np.random.randn(nrows), 'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']}) - batch = pa.RecordBatch.from_pandas(df) writer = self._get_writer(self.sink, batch.schema) - num_batches = 5 frames = [] batches = [] for i in range(num_batches): unique_df = df.copy() - unique_df['one'] = np.random.randn(nrows) + unique_df['one'] = np.random.randn(len(df)) batch = pa.RecordBatch.from_pandas(unique_df) writer.write_batch(batch) @@ -122,6 +120,22 @@ def test_empty_stream(self): with pytest.raises(pa.ArrowInvalid): pa.open_stream(buf) + def test_categorical_roundtrip(self): + df = pd.DataFrame({ + 'one': np.random.randn(5), + 'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'], + categories=['foo', 'bar'], + ordered=True) + }) + batch = pa.RecordBatch.from_pandas(df) + writer = self._get_writer(self.sink, batch.schema) + writer.write_batch(pa.RecordBatch.from_pandas(df)) + writer.close() + + table = (pa.open_stream(pa.BufferReader(self._get_source())) + .read_all()) + assert_frame_equal(table.to_pandas(), df) + def test_simple_roundtrip(self): _, batches = self.write_batches() file_contents = pa.BufferReader(self._get_source()) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index ab3b26cd4e0f1..8a20f4c469200 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -100,10 +100,11 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata - _write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = pq.read_pandas(filename.strpath) assert b'pandas' in table_read.schema.metadata @@ -120,10 +121,11 @@ def test_pandas_parquet_custom_metadata(tmpdir): df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata - _write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, 
filename.strpath, version="2.0", + coerce_timestamps='ms') md = pq.read_metadata(filename.strpath).metadata assert b'pandas' in md @@ -139,13 +141,12 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir): df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas( - df, timestamps_to_ms=True, preserve_index=False - ) + arrow_table = pa.Table.from_pandas(df, preserve_index=False) js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8')) assert not js['index_columns'] - _write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = pq.read_pandas(filename.strpath) js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8')) @@ -340,10 +341,11 @@ def test_pandas_parquet_configuration_options(tmpdir): def make_sample_file(df): import pyarrow.parquet as pq - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() - _write_table(a_table, buf, compression='SNAPPY', version='2.0') + _write_table(a_table, buf, compression='SNAPPY', version='2.0', + coerce_timestamps='ms') buf.seek(0) return pq.ParquetFile(buf) @@ -418,22 +420,47 @@ def test_column_of_arrays(tmpdir): df, schema = dataframe_with_arrays() filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, - schema=schema) - _write_table(arrow_table, filename.strpath, version="2.0") + arrow_table = pa.Table.from_pandas(df, schema=schema) + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) +@parquet +def test_coerce_timestamps(tmpdir): + # ARROW-622 + df, schema = dataframe_with_arrays() + + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = pa.Table.from_pandas(df, schema=schema) + + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='us') + table_read = _read_table(filename.strpath) + df_read = table_read.to_pandas() + + df_expected = df.copy() + for i, x in enumerate(df_expected['datetime64']): + if isinstance(x, np.ndarray): + df_expected['datetime64'][i] = x.astype('M8[us]') + + tm.assert_frame_equal(df_expected, df_read) + + with pytest.raises(ValueError): + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='unknown') + + @parquet def test_column_of_lists(tmpdir): df, schema = dataframe_with_lists() filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, - schema=schema) - _write_table(arrow_table, filename.strpath, version="2.0") + arrow_table = pa.Table.from_pandas(df, schema=schema) + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -469,12 +496,14 @@ def test_date_time_types(): t7 = pa.timestamp('ns') start = pd.Timestamp('2001-01-01').value - data7 = np.array([start, start + 1, start + 2], dtype='int64') + data7 = np.array([start, start + 1000, start + 2000], + dtype='int64') a7 = pa.Array.from_pandas(data7, type=t7) t7_us = pa.timestamp('us') start = pd.Timestamp('2001-01-01').value - data7_us = np.array([start, start + 1, start + 2], dtype='int64') // 1000 + data7_us = np.array([start, start + 1000, start 
+ 2000], + dtype='int64') // 1000 a7_us = pa.Array.from_pandas(data7_us, type=t7_us) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], @@ -547,7 +576,7 @@ def _check_roundtrip(table, expected=None, **params): def test_multithreaded_read(): df = alltypes_sample(size=10000) - table = pa.Table.from_pandas(df, timestamps_to_ms=True) + table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(table, buf, compression='SNAPPY', version='2.0') @@ -585,7 +614,7 @@ def test_pass_separate_metadata(): # ARROW-471 df = alltypes_sample(size=10000) - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(a_table, buf, compression='snappy', version='2.0') @@ -608,7 +637,7 @@ def test_read_single_row_group(): N, K = 10000, 4 df = alltypes_sample(size=N) - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(a_table, buf, row_group_size=N / K, @@ -631,7 +660,7 @@ def test_read_single_row_group_with_column_subset(): N, K = 10000, 4 df = alltypes_sample(size=N) - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(a_table, buf, row_group_size=N / K, @@ -1098,3 +1127,14 @@ def test_write_error_deletes_incomplete_file(tmpdir): pass assert not os.path.exists(filename) + + +@parquet +def test_read_non_existent_file(tmpdir): + import pyarrow.parquet as pq + + path = 'non-existent-file.parquet' + try: + pq.read_table(path) + except Exception as e: + assert path in e.args[0] diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index fefde55bc2f95..592db4f90dac1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -97,6 +97,11 @@ cdef class DictionaryType(DataType): DataType.init(self, type) self.dict_type = type.get() + property ordered: + + def __get__(self): + return self.dict_type.ordered() + cdef class ListType(DataType): @@ -414,7 +419,7 @@ cdef DataType primitive_type(Type type): _type_cache[type] = out return out -#------------------------------------------------------------ +# ----------------------------------------------------------- # Type factory functions cdef int convert_metadata(dict metadata, @@ -798,7 +803,8 @@ cpdef ListType list_(value_type): return out -cpdef DictionaryType dictionary(DataType index_type, Array dictionary): +cpdef DictionaryType dictionary(DataType index_type, Array dictionary, + bint ordered=False): """ Dictionary (categorical, or simply encoded) type @@ -814,7 +820,8 @@ cpdef DictionaryType dictionary(DataType index_type, Array dictionary): cdef DictionaryType out = DictionaryType() cdef shared_ptr[CDataType] dict_type dict_type.reset(new CDictionaryType(index_type.sp_type, - dictionary.sp_array)) + dictionary.sp_array, + ordered == 1)) out.init(dict_type) return out diff --git a/site/_posts/2017-08-08-plasma-in-memory-object-store.md b/site/_posts/2017-08-08-plasma-in-memory-object-store.md new file mode 100644 index 0000000000000..48cfb6613cf73 --- /dev/null +++ b/site/_posts/2017-08-08-plasma-in-memory-object-store.md @@ -0,0 +1,150 @@ +--- +layout: post +title: "Plasma In-Memory Object Store" +date: "2017-08-08 00:00:00 -0400" +author: Philipp Moritz and Robert Nishihara +categories: [application] +--- + + +*[Philipp Moritz][1] and [Robert Nishihara][2] are graduate students at UC + Berkeley.* + +## Plasma: A High-Performance Shared-Memory Object Store + +### Motivating Plasma + +This blog post presents Plasma, 
an in-memory object store that is being
+developed as part of Apache Arrow. **Plasma holds immutable objects in shared
+memory so that they can be accessed efficiently by many clients across process
+boundaries.** In light of the trend toward larger and larger multicore machines,
+Plasma enables critical performance optimizations in the big data regime.
+
+Plasma was initially developed as part of [Ray][3], and has recently been moved
+to Apache Arrow in the hopes that it will be broadly useful.
+
+One of the goals of Apache Arrow is to serve as a common data layer enabling
+zero-copy data exchange between multiple frameworks. A key component of this
+vision is the use of off-heap memory management (via Plasma) for storing and
+sharing Arrow-serialized objects between applications.
+
+**Expensive serialization and deserialization as well as data copying are
+common performance bottlenecks in distributed computing.** For example, a
+Python-based execution framework that wishes to distribute computation across
+multiple Python “worker” processes and then aggregate the results in a single
+“driver” process may choose to serialize data using the built-in `pickle`
+library. Assuming one Python process per core, each worker process would have to
+copy and deserialize the data, resulting in excessive memory usage. The driver
+process would then have to deserialize results from each of the workers,
+resulting in a bottleneck.
+
+Using Plasma plus Arrow, the data being operated on would be placed in the
+Plasma store once, and all of the workers would read the data without copying or
+deserializing it (the workers would map the relevant region of memory into their
+own address spaces). The workers would then put the results of their computation
+back into the Plasma store, which the driver could then read and aggregate
+without copying or deserializing the data.
+
+### The Plasma API
+
+Below we illustrate a subset of the API. The C++ API is documented more fully
+[here][6], and the Python API is documented [here][7].
+
+**Object IDs:** Each object is associated with a string of bytes.
+
+**Creating an object:** Objects are stored in Plasma in two stages. First, the
+object store *creates* the object by allocating a buffer for it. At this point,
+the client can write to the buffer and construct the object within the allocated
+buffer. When the client is done, the client *seals* the buffer, making the
+object immutable and available to other Plasma clients.
+
+```python
+# Create an object.
+object_id = pyarrow.plasma.ObjectID(20 * b'a')
+object_size = 1000
+buffer = memoryview(client.create(object_id, object_size))
+
+# Write to the buffer.
+for i in range(1000):
+    buffer[i] = 0
+
+# Seal the object, making it immutable and available to other clients.
+client.seal(object_id)
+```
+
+**Getting an object:** After an object has been sealed, any client who knows the
+object ID can get the object.
+
+```python
+# Get the object from the store. This blocks until the object has been sealed.
+object_id = pyarrow.plasma.ObjectID(20 * b'a')
+[buff] = client.get([object_id])
+buffer = memoryview(buff)
+```
+
+If the object has not been sealed yet, the call to `client.get` will block
+until the object has been sealed.
+
+### A sorting application
+
+To illustrate the benefits of Plasma, we demonstrate an **11x speedup** (on a
+machine with 20 physical cores) for sorting a large pandas DataFrame (one
+billion entries). The baseline is the built-in pandas sort function, which sorts
+the DataFrame in 477 seconds. To leverage multiple cores, we implement the
+following standard distributed sorting scheme.
+
+* We assume that the data is partitioned across K pandas DataFrames and that
+  each one already lives in the Plasma store.
+* We subsample the data, sort the subsampled data, and use the result to define
+  L non-overlapping buckets.
+* For each of the K data partitions and each of the L buckets, we find the
+  subset of the data partition that falls in the bucket, and we sort that
+  subset.
+* For each of the L buckets, we gather all of the K sorted subsets that fall in
+  that bucket.
+* For each of the L buckets, we merge the corresponding K sorted subsets.
+* We turn each bucket into a pandas DataFrame and place it in the Plasma store.
+
+Using this scheme, we can sort the DataFrame (the data starts and ends in the
+Plasma store) in 44 seconds, giving an 11x speedup over the baseline. A rough
+Python sketch of this scheme appears at the end of this post.
+
+### Design
+
+The Plasma store runs as a separate process. It is written in C++ and is
+designed as a single-threaded event loop based on the [Redis][4] event loop library.
+The Plasma client library can be linked into applications. Clients communicate
+with the Plasma store via messages serialized using [Google Flatbuffers][5].
+
+### Call for contributions
+
+Plasma is a work in progress, and the API is currently unstable. Today Plasma is
+primarily used in [Ray][3] as an in-memory cache for Arrow-serialized objects.
+We are looking for a broader set of use cases to help refine Plasma’s API. In
+addition, we are looking for contributions in a variety of areas, including
+improving performance and building other language bindings. Please let us know
+if you are interested in getting involved with the project.
+
+[1]: https://people.eecs.berkeley.edu/~pcmoritz/
+[2]: http://www.robertnishihara.com
+[3]: https://github.com/ray-project/ray
+[4]: https://redis.io/
+[5]: https://google.github.io/flatbuffers/
+[6]: https://github.com/apache/arrow/blob/master/cpp/apidoc/tutorials/plasma.md
+[7]: https://github.com/apache/arrow/blob/master/python/doc/source/plasma.rst