From aa1d753a74b5517c0b20db8e5540786520b9956f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 1 Aug 2017 11:57:43 -0400 Subject: [PATCH 01/38] ARROW-573: [C++/Python] Implement IPC metadata handling for ordered dictionaries, pandas conversions This was an oversight in the IPC implementation and pandas conversion path, and has been fixed. Author: Wes McKinney Closes #922 from wesm/ARROW-573 and squashes the following commits: 458820e5 [Wes McKinney] Suppress C4800 in MSVC 46361f3f [Wes McKinney] Implement IPC metadata handling for ordered dictionaries, faithful conversion to/from pandas.Categorical --- cpp/src/arrow/array-test.cc | 11 +++++++---- cpp/src/arrow/compare.cc | 3 ++- cpp/src/arrow/ipc/metadata.cc | 5 +++-- cpp/src/arrow/ipc/test-common.h | 2 +- cpp/src/arrow/type.cc | 7 ++++--- cpp/src/arrow/type.h | 5 +++-- python/CMakeLists.txt | 4 ++++ python/pyarrow/array.pxi | 11 ++++++++--- python/pyarrow/includes/libarrow.pxd | 4 +++- python/pyarrow/pandas_compat.py | 3 ++- python/pyarrow/tests/test_convert_pandas.py | 3 +++ python/pyarrow/tests/test_ipc.py | 22 +++++++++++++++++---- python/pyarrow/types.pxi | 11 +++++++++-- 13 files changed, 67 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 0efb51ccece0c..57d2c8b8493a5 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1881,15 +1881,18 @@ TEST(TestDictionary, Basics) { std::shared_ptr<DictionaryType> type1 = std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict)); - DictionaryType type2(int16(), dict); + + auto type2 = + std::dynamic_pointer_cast<DictionaryType>(::arrow::dictionary(int16(), dict, true)); ASSERT_TRUE(int16()->Equals(type1->index_type())); ASSERT_TRUE(type1->dictionary()->Equals(dict)); - ASSERT_TRUE(int16()->Equals(type2.index_type())); - ASSERT_TRUE(type2.dictionary()->Equals(dict)); + ASSERT_TRUE(int16()->Equals(type2->index_type())); + ASSERT_TRUE(type2->dictionary()->Equals(dict)); - ASSERT_EQ("dictionary<values=string, indices=int16>", type1->ToString()); + ASSERT_EQ("dictionary<values=string, indices=int16, ordered=0>", type1->ToString()); + ASSERT_EQ("dictionary<values=string, indices=int16, ordered=1>", type2->ToString()); } TEST(TestDictionary, Equals) { diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index dda5fdd95d0c3..3a4a4009c6b16 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -769,7 +769,8 @@ class TypeEqualsVisitor { Status Visit(const DictionaryType& left) { const auto& right = static_cast<const DictionaryType&>(right_); result_ = left.index_type()->Equals(right.index_type()) && - left.dictionary()->Equals(right.dictionary()); + left.dictionary()->Equals(right.dictionary()) && + (left.ordered() == right.ordered()); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index 20fd280db6de6..d764e203e7552 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -492,7 +492,8 @@ static DictionaryOffset GetDictionaryEncoding(FBB& fbb, const DictionaryType& ty auto index_type_offset = flatbuf::CreateInt(fbb, fw_index_type.bit_width(), true); // TODO(wesm): ordered dictionaries - return flatbuf::CreateDictionaryEncoding(fbb, dictionary_id, index_type_offset); + return flatbuf::CreateDictionaryEncoding(fbb, dictionary_id, index_type_offset, + type.ordered()); } static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr<Field>& field, @@ -551,7 +552,7 @@ static Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr<DataType> index_type; RETURN_NOT_OK(IntFromFlatbuffer(encoding->indexType(), &index_type)); - type = std::make_shared<DictionaryType>(index_type, dictionary); + type = 
::arrow::dictionary(index_type, dictionary, encoding->isOrdered()); } *out = std::make_shared<Field>(field->name()->str(), type, field->nullable()); return Status::OK(); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index cb827372d21c4..76cc8430636f8 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -462,7 +462,7 @@ Status MakeDictionary(std::shared_ptr<RecordBatch>* out) { ArrayFromVector<StringType, std::string>(dict2_values, &dict2); auto f0_type = arrow::dictionary(arrow::int32(), dict1); - auto f1_type = arrow::dictionary(arrow::int8(), dict1); + auto f1_type = arrow::dictionary(arrow::int8(), dict1, true); auto f2_type = arrow::dictionary(arrow::int32(), dict2); std::shared_ptr<Array> indices0, indices1, indices2; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index b8489d44cdb00..edf4d33b23f39 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -236,7 +236,7 @@ std::shared_ptr<Array> DictionaryType::dictionary() const { return dictionary_; std::string DictionaryType::ToString() const { std::stringstream ss; ss << "dictionary<values=" << dictionary_->type()->ToString() - << ", indices=" << index_type_->ToString() << ">"; + << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">"; return ss.str(); } @@ -428,8 +428,9 @@ std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& chil } std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type, - const std::shared_ptr<Array>& dict_values) { - return std::make_shared<DictionaryType>(index_type, dict_values); + const std::shared_ptr<Array>& dict_values, + bool ordered) { + return std::make_shared<DictionaryType>(index_type, dict_values, ordered); } std::shared_ptr<Field> field(const std::string& name, diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 45d97fdb32bbc..b28fe9229b2ae 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -785,8 +785,9 @@ std::shared_ptr<DataType> ARROW_EXPORT union_(const std::vector<std::shared_ptr<Field>>& child_fields, const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE); -std::shared_ptr<DataType> ARROW_EXPORT dictionary( - const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values); +std::shared_ptr<DataType> ARROW_EXPORT +dictionary(const std::shared_ptr<DataType>& index_type, + const std::shared_ptr<Array>& values, bool ordered = false); std::shared_ptr<Field> ARROW_EXPORT field( const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true, diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index bfae157ed6b9c..af95073f5da35 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -92,6 +92,10 @@ else() # Cython generates some bitshift expressions that MSVC does not like in # __Pyx_PyFloat_DivideObjC set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4293") + + # Converting to/from C++ bool is pretty wonky in Cython. 
The C4800 warning + # seems harmless, and probably not worth the effort of working around it + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4800") endif() if ("${COMPILER_FAMILY}" STREQUAL "clang") diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 67418aa5eac67..f320cbedc8d3a 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -189,7 +189,8 @@ cdef class Array: if isinstance(values, Categorical): return DictionaryArray.from_arrays( values.codes, values.categories.values, - mask=mask, memory_pool=memory_pool) + mask=mask, ordered=values.ordered, + memory_pool=memory_pool) elif values.dtype == object: # Object dtype undergoes a different conversion path as more type # inference may be needed @@ -564,7 +565,7 @@ cdef class DictionaryArray(Array): return self._indices @staticmethod - def from_arrays(indices, dictionary, mask=None, + def from_arrays(indices, dictionary, mask=None, ordered=False, MemoryPool memory_pool=None): """ Construct Arrow DictionaryArray from array of indices (must be @@ -576,6 +577,8 @@ cdef class DictionaryArray(Array): dictionary : ndarray or pandas.Series mask : ndarray or pandas.Series, boolean type True values indicate that indices are actually null + ordered : boolean, default False + Set to True if the category values are ordered Returns ------- @@ -609,8 +612,10 @@ cdef class DictionaryArray(Array): if not isinstance(arrow_indices, IntegerArray): raise ValueError('Indices must be integer type') + cdef c_bool c_ordered = ordered + c_type.reset(new CDictionaryType(arrow_indices.type.sp_type, - arrow_dictionary.sp_array)) + arrow_dictionary.sp_array, c_ordered)) c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array)) result = DictionaryArray() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8d7e27915eede..a25d7a2f5b7a5 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -132,10 +132,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType): CDictionaryType(const shared_ptr[CDataType]& index_type, - const shared_ptr[CArray]& dictionary) + const shared_ptr[CArray]& dictionary, + c_bool ordered) shared_ptr[CDataType] index_type() shared_ptr[CArray] dictionary() + c_bool ordered() shared_ptr[CDataType] ctimestamp" arrow::timestamp"(TimeUnit unit) shared_ptr[CDataType] ctimestamp" arrow::timestamp"( diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index cd7ad47782646..62547a42f7359 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -284,9 +284,10 @@ def table_to_blockmanager(table, nthreads=1): block_arr = item['block'] placement = item['placement'] if 'dictionary' in item: + ordered = block_table.schema[placement[0]].type.ordered cat = pd.Categorical(block_arr, categories=item['dictionary'], - ordered=False, fastpath=True) + ordered=ordered, fastpath=True) block = _int.make_block(cat, placement=placement, klass=_int.CategoricalBlock, fastpath=True) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index d488658563306..f6ea1636a3d62 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -536,6 +536,9 @@ def test_category(self): df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats), 'cat_ints': pd.Categorical(v2 * repeats), 'cat_binary': pd.Categorical(v3 * repeats), + 
'cat_strings_ordered': pd.Categorical( + v1 * repeats, categories=['bar', 'qux', 'foo'], + ordered=True), 'ints': v2 * repeats, 'ints2': v2 * repeats, 'strings': v1 * repeats, diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 3ad369c31f4f2..120a9825a7b56 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -40,22 +40,20 @@ def _get_sink(self): def _get_source(self): return self.sink.getvalue() - def write_batches(self): + def write_batches(self, num_batches=5): nrows = 5 df = pd.DataFrame({ 'one': np.random.randn(nrows), 'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']}) - batch = pa.RecordBatch.from_pandas(df) writer = self._get_writer(self.sink, batch.schema) - num_batches = 5 frames = [] batches = [] for i in range(num_batches): unique_df = df.copy() - unique_df['one'] = np.random.randn(nrows) + unique_df['one'] = np.random.randn(len(df)) batch = pa.RecordBatch.from_pandas(unique_df) writer.write_batch(batch) @@ -122,6 +120,22 @@ def test_empty_stream(self): with pytest.raises(pa.ArrowInvalid): pa.open_stream(buf) + def test_categorical_roundtrip(self): + df = pd.DataFrame({ + 'one': np.random.randn(5), + 'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'], + categories=['foo', 'bar'], + ordered=True) + }) + batch = pa.RecordBatch.from_pandas(df) + writer = self._get_writer(self.sink, batch.schema) + writer.write_batch(pa.RecordBatch.from_pandas(df)) + writer.close() + + table = (pa.open_stream(pa.BufferReader(self._get_source())) + .read_all()) + assert_frame_equal(table.to_pandas(), df) + def test_simple_roundtrip(self): _, batches = self.write_batches() file_contents = pa.BufferReader(self._get_source()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index fefde55bc2f95..ad2f336061580 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -97,6 +97,11 @@ cdef class DictionaryType(DataType): DataType.init(self, type) self.dict_type = type.get() + property ordered: + + def __get__(self): + return self.dict_type.ordered() + cdef class ListType(DataType): @@ -798,7 +803,8 @@ cpdef ListType list_(value_type): return out -cpdef DictionaryType dictionary(DataType index_type, Array dictionary): +cpdef DictionaryType dictionary(DataType index_type, Array dictionary, + bint ordered=False): """ Dictionary (categorical, or simply encoded) type @@ -814,7 +820,8 @@ cpdef DictionaryType dictionary(DataType index_type, Array dictionary): cdef DictionaryType out = DictionaryType() cdef shared_ptr[CDataType] dict_type dict_type.reset(new CDictionaryType(index_type.sp_type, - dictionary.sp_array)) + dictionary.sp_array, + ordered == 1)) out.init(dict_type) return out From e5ed31fc5e903ea0a2102623413ab85577cba123 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 1 Aug 2017 22:50:21 -0400 Subject: [PATCH 02/38] ARROW-1093: [Python] Run flake8 in Travis CI. Add note about development to README Author: Wes McKinney Closes #924 from wesm/ARROW-1093 and squashes the following commits: 75969c37 [Wes McKinney] Add spaces before continuation backslash fa24ec92 [Wes McKinney] Specify file suffixes completely c0af17c3 [Wes McKinney] Add flake8 file for Cython, fix Cython style errors 7ffa6135 [Wes McKinney] Add Cython flake8 file f10e8d1f [Wes McKinney] Run flake8 in Travis CI. 
Add note to README --- ci/travis_script_python.sh | 8 +++- python/.flake8.cython | 20 ++++++++++ python/README.md | 19 ++++++++- python/pyarrow/_config.pyx | 9 +++-- python/pyarrow/_parquet.pxd | 45 ++++++++++++--------- python/pyarrow/_parquet.pyx | 9 +++-- python/pyarrow/array.pxi | 9 ++--- python/pyarrow/feather.pxi | 2 +- python/pyarrow/includes/libarrow.pxd | 60 ++++++++++++++-------------- python/pyarrow/io-hdfs.pxi | 1 - python/pyarrow/io.pxi | 10 ++++- python/pyarrow/ipc.pxi | 2 +- python/pyarrow/lib.pxd | 4 +- python/pyarrow/lib.pyx | 23 +++++------ python/pyarrow/plasma.pyx | 24 ++++++----- python/pyarrow/public-api.pxi | 11 ++--- python/pyarrow/scalar.pxi | 8 ++-- python/pyarrow/table.pxi | 5 +-- python/pyarrow/types.pxi | 2 +- 19 files changed, 160 insertions(+), 111 deletions(-) create mode 100644 python/.flake8.cython diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 9135aaf38e4e7..4a50d2faaf551 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -92,7 +92,13 @@ python_version_tests() { conda install -y -q nomkl # Expensive dependencies install from Continuum package repo - conda install -y -q pip numpy pandas cython + conda install -y -q pip numpy pandas cython flake8 + + # Fail fast on style checks + flake8 pyarrow + + # Check Cython files with some checks turned off + flake8 --config=.flake8.cython pyarrow # Build C++ libraries rebuild_arrow_libraries diff --git a/python/.flake8.cython b/python/.flake8.cython new file mode 100644 index 0000000000000..53e41323051f9 --- /dev/null +++ b/python/.flake8.cython @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[flake8] +filename = *.pyx,*.pxd,*.pxi +ignore = E211,E901,E225,E226,E227 diff --git a/python/README.md b/python/README.md index 29d213babd93b..3c48d5d30b595 100644 --- a/python/README.md +++ b/python/README.md @@ -38,7 +38,21 @@ On Linux, you can also install binary wheels from PyPI with pip: pip install pyarrow ``` -### Development details +## Development + +### Coding Style + +We follow a similar PEP8-like coding style to the [pandas project][3]. + +The code must pass `flake8` (available from pip or conda) or it will fail the +build. Check for style errors before submitting your pull request with: + +``` +flake8 pyarrow +flake8 --config=.flake8.cython pyarrow +``` + +### Building from Source See the [Development][2] page in the documentation. 
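+As a quick smoke test after building (a sketch, not part of this patch; it
+assumes the build described on the Development page completed and that the
+package is importable from the current environment):
+
+```
+python -c "import pyarrow; print(pyarrow.__version__)"
+```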
@@ -50,4 +64,5 @@ python setup.py build_sphinx -s doc/source ``` [1]: https://github.com/apache/parquet-cpp -[2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst \ No newline at end of file +[2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst +[3]: https://github.com/pandas-dev/pandas \ No newline at end of file diff --git a/python/pyarrow/_config.pyx b/python/pyarrow/_config.pyx index a2d2d719e68d0..bc9f36d8e50cb 100644 --- a/python/pyarrow/_config.pyx +++ b/python/pyarrow/_config.pyx @@ -19,6 +19,10 @@ # distutils: language = c++ # cython: embedsignature = True +import numpy as np +import multiprocessing +import os + cdef extern from 'arrow/python/init.h': int arrow_init_numpy() except -1 @@ -27,15 +31,13 @@ cdef extern from 'arrow/python/config.h' namespace 'arrow::py': arrow_init_numpy() -import numpy as np set_numpy_nan(np.nan) -import multiprocessing -import os cdef int CPU_COUNT = int( os.environ.get('OMP_NUM_THREADS', max(multiprocessing.cpu_count() // 2, 1))) + def cpu_count(): """ Returns @@ -49,6 +51,7 @@ def cpu_count(): """ return CPU_COUNT + def set_cpu_count(count): global CPU_COUNT CPU_COUNT = max(int(count), 1) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index b1cd5eb2c2be0..7299e19b81906 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -25,17 +25,18 @@ from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: - cdef cppclass Node: - pass + cdef cppclass Node: + pass - cdef cppclass GroupNode(Node): - pass + cdef cppclass GroupNode(Node): + pass - cdef cppclass PrimitiveNode(Node): - pass + cdef cppclass PrimitiveNode(Node): + pass + + cdef cppclass ColumnPath: + c_string ToDotString() - cdef cppclass ColumnPath: - c_string ToDotString() cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: enum ParquetType" parquet::Type::type": @@ -59,8 +60,10 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetLogicalType_DATE" parquet::LogicalType::DATE" ParquetLogicalType_TIME_MILLIS" parquet::LogicalType::TIME_MILLIS" ParquetLogicalType_TIME_MICROS" parquet::LogicalType::TIME_MICROS" - ParquetLogicalType_TIMESTAMP_MILLIS" parquet::LogicalType::TIMESTAMP_MILLIS" - ParquetLogicalType_TIMESTAMP_MICROS" parquet::LogicalType::TIMESTAMP_MICROS" + ParquetLogicalType_TIMESTAMP_MILLIS \ + " parquet::LogicalType::TIMESTAMP_MILLIS" + ParquetLogicalType_TIMESTAMP_MICROS \ + " parquet::LogicalType::TIMESTAMP_MICROS" ParquetLogicalType_UINT_8" parquet::LogicalType::UINT_8" ParquetLogicalType_UINT_16" parquet::LogicalType::UINT_16" ParquetLogicalType_UINT_32" parquet::LogicalType::UINT_32" @@ -83,8 +86,10 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetEncoding_PLAIN_DICTIONARY" parquet::Encoding::PLAIN_DICTIONARY" ParquetEncoding_RLE" parquet::Encoding::RLE" ParquetEncoding_BIT_PACKED" parquet::Encoding::BIT_PACKED" - ParquetEncoding_DELTA_BINARY_PACKED" parquet::Encoding::DELTA_BINARY_PACKED" - ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY" parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY" + ParquetEncoding_DELTA_BINARY_PACKED \ + " parquet::Encoding::DELTA_BINARY_PACKED" + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY \ + " parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY" ParquetEncoding_DELTA_BYTE_ARRAY" parquet::Encoding::DELTA_BYTE_ARRAY" ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY" @@ -231,13 +236,15 @@ cdef extern from 
"parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: - CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, - const shared_ptr[const CKeyValueMetadata]& key_value_metadata, - shared_ptr[CSchema]* out) - - CStatus ToParquetSchema(const CSchema* arrow_schema, - const shared_ptr[const CKeyValueMetadata]& key_value_metadata, - shared_ptr[SchemaDescriptor]* out) + CStatus FromParquetSchema( + const SchemaDescriptor* parquet_schema, + const shared_ptr[const CKeyValueMetadata]& key_value_metadata, + shared_ptr[CSchema]* out) + + CStatus ToParquetSchema( + const CSchema* arrow_schema, + const shared_ptr[const CKeyValueMetadata]& key_value_metadata, + shared_ptr[SchemaDescriptor]* out) cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index c940122da5dcf..919e82c109451 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -279,8 +279,8 @@ cdef class ColumnSchema: max_repetition_level: {3} physical_type: {4} logical_type: {5}""".format(self.name, self.path, self.max_definition_level, - self.max_repetition_level, physical_type, - logical_type) + self.max_repetition_level, physical_type, + logical_type) property name: @@ -514,7 +514,7 @@ cdef class ParquetReader: with nogil: check_status(self.reader.get() - .ReadSchemaField(field_index, &carray)); + .ReadSchemaField(field_index, &carray)) array.init(carray) return array @@ -553,7 +553,8 @@ cdef class ParquetWriter: def __cinit__(self, where, Schema schema, use_dictionary=None, compression=None, version=None, - MemoryPool memory_pool=None, use_deprecated_int96_timestamps=False): + MemoryPool memory_pool=None, + use_deprecated_int96_timestamps=False): cdef: shared_ptr[FileOutputStream] filestream shared_ptr[WriterProperties] properties diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f320cbedc8d3a..cbd036c08431f 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -66,8 +66,8 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None, array : pyarrow.Array """ cdef: - shared_ptr[CArray] sp_array - CMemoryPool* pool + shared_ptr[CArray] sp_array + CMemoryPool* pool pool = maybe_unbox_memory_pool(memory_pool) if type is None: @@ -78,13 +78,13 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None, ConvertPySequence( sequence, pool, &sp_array, type.sp_type ) - ) + ) else: check_status( ConvertPySequence( sequence, pool, &sp_array, type.sp_type, size ) - ) + ) return pyarrow_wrap_array(sp_array) @@ -401,7 +401,6 @@ strides: {2}""".format(self.type, self.shape, self.strides) return py_strides - cdef wrap_array_output(PyObject* output): cdef object obj = PyObject_to_object(output) diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 2e7cf6c9bd1b8..6faf2f9c69c7b 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-#---------------------------------------------------------------------- +# --------------------------------------------------------------------- # Implement legacy Feather file format diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a25d7a2f5b7a5..3ea487385de76 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -178,7 +178,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStringType" arrow::StringType"(CDataType): pass - cdef cppclass CFixedSizeBinaryType" arrow::FixedSizeBinaryType"(CFixedWidthType): + cdef cppclass CFixedSizeBinaryType \ + " arrow::FixedSizeBinaryType"(CFixedWidthType): CFixedSizeBinaryType(int byte_width) int byte_width() int bit_width() @@ -209,7 +210,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CKeyValueMetadata]& metadata) shared_ptr[CField] RemoveMetadata() - cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) @@ -309,9 +309,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructArray" arrow::StructArray"(CArray): CStructArray(shared_ptr[CDataType] type, int64_t length, - vector[shared_ptr[CArray]] children, - shared_ptr[CBuffer] null_bitmap = nullptr, int64_t null_count = 0, - int64_t offset = 0) + vector[shared_ptr[CArray]] children, + shared_ptr[CBuffer] null_bitmap=nullptr, + int64_t null_count=0, + int64_t offset=0) shared_ptr[CArray] field(int pos) const vector[shared_ptr[CArray]] fields() @@ -462,7 +463,6 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: cdef extern from "arrow/io/file.h" namespace "arrow::io" nogil: - cdef cppclass FileOutputStream(OutputStream): @staticmethod CStatus Open(const c_string& path, shared_ptr[FileOutputStream]* file) @@ -479,12 +479,12 @@ cdef extern from "arrow/io/file.h" namespace "arrow::io" nogil: int file_descriptor() - cdef cppclass CMemoryMappedFile" arrow::io::MemoryMappedFile"\ - (ReadWriteFileInterface): + cdef cppclass CMemoryMappedFile \ + " arrow::io::MemoryMappedFile"(ReadWriteFileInterface): @staticmethod CStatus Create(const c_string& path, int64_t size, - shared_ptr[CMemoryMappedFile]* file) + shared_ptr[CMemoryMappedFile]* file) @staticmethod CStatus Open(const c_string& path, FileMode mode, @@ -509,7 +509,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: HdfsDriver driver cdef cppclass HdfsPathInfo: - ObjectType kind; + ObjectType kind c_string name c_string owner c_string group @@ -563,21 +563,22 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil: - cdef cppclass CBufferReader" arrow::io::BufferReader"\ - (RandomAccessFile): + cdef cppclass CBufferReader \ + " arrow::io::BufferReader"(RandomAccessFile): CBufferReader(const shared_ptr[CBuffer]& buffer) CBufferReader(const uint8_t* data, int64_t nbytes) - cdef cppclass CBufferOutputStream" arrow::io::BufferOutputStream"\ - (OutputStream): + cdef cppclass CBufferOutputStream \ + " arrow::io::BufferOutputStream"(OutputStream): CBufferOutputStream(const shared_ptr[ResizableBuffer]& buffer) - cdef cppclass CMockOutputStream" arrow::io::MockOutputStream"\ - (OutputStream): + cdef cppclass CMockOutputStream \ + " arrow::io::MockOutputStream"(OutputStream): CMockOutputStream() int64_t GetExtentBytesWritten() - cdef cppclass CFixedSizeBufferWriter" arrow::io::FixedSizeBufferWriter"(WriteableFile): + cdef cppclass 
CFixedSizeBufferWriter \ + " arrow::io::FixedSizeBufferWriter"(WriteableFile): CFixedSizeBufferWriter(const shared_ptr[CBuffer]& buffer) @@ -609,48 +610,45 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: c_string FormatMessageType(MessageType type) - cdef cppclass CMessageReader \ - " arrow::ipc::MessageReader": + cdef cppclass CMessageReader" arrow::ipc::MessageReader": CStatus ReadNextMessage(unique_ptr[CMessage]* out) cdef cppclass CInputStreamMessageReader \ - " arrow::ipc::InputStreamMessageReader": + " arrow::ipc::InputStreamMessageReader": CInputStreamMessageReader(const shared_ptr[InputStream]& stream) - cdef cppclass CRecordBatchWriter \ - " arrow::ipc::RecordBatchWriter": + cdef cppclass CRecordBatchWriter" arrow::ipc::RecordBatchWriter": CStatus Close() CStatus WriteRecordBatch(const CRecordBatch& batch) - cdef cppclass CRecordBatchReader \ - " arrow::ipc::RecordBatchReader": + cdef cppclass CRecordBatchReader" arrow::ipc::RecordBatchReader": shared_ptr[CSchema] schema() CStatus ReadNextRecordBatch(shared_ptr[CRecordBatch]* batch) cdef cppclass CRecordBatchStreamReader \ - " arrow::ipc::RecordBatchStreamReader"(CRecordBatchReader): + " arrow::ipc::RecordBatchStreamReader"(CRecordBatchReader): @staticmethod CStatus Open(const shared_ptr[InputStream]& stream, shared_ptr[CRecordBatchStreamReader]* out) @staticmethod CStatus Open2" Open"(unique_ptr[CMessageReader] message_reader, - shared_ptr[CRecordBatchStreamReader]* out) + shared_ptr[CRecordBatchStreamReader]* out) cdef cppclass CRecordBatchStreamWriter \ - " arrow::ipc::RecordBatchStreamWriter"(CRecordBatchWriter): + " arrow::ipc::RecordBatchStreamWriter"(CRecordBatchWriter): @staticmethod CStatus Open(OutputStream* sink, const shared_ptr[CSchema]& schema, shared_ptr[CRecordBatchStreamWriter]* out) cdef cppclass CRecordBatchFileWriter \ - " arrow::ipc::RecordBatchFileWriter"(CRecordBatchWriter): + " arrow::ipc::RecordBatchFileWriter"(CRecordBatchWriter): @staticmethod CStatus Open(OutputStream* sink, const shared_ptr[CSchema]& schema, shared_ptr[CRecordBatchFileWriter]* out) cdef cppclass CRecordBatchFileReader \ - " arrow::ipc::RecordBatchFileReader": + " arrow::ipc::RecordBatchFileReader": @staticmethod CStatus Open(const shared_ptr[RandomAccessFile]& file, shared_ptr[CRecordBatchFileReader]* out) @@ -724,7 +722,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus ConvertPySequence(object obj, CMemoryPool* pool, shared_ptr[CArray]* out, const shared_ptr[CDataType]& type, - int64_t size) + int64_t size) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) @@ -737,7 +735,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CChunkedArray]* out) CStatus NdarrayToTensor(CMemoryPool* pool, object ao, - shared_ptr[CTensor]* out); + shared_ptr[CTensor]* out) CStatus TensorToNdarray(const CTensor& tensor, object base, PyObject** out) diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi index 8ac4e8c2319c1..27e9948b3a4ea 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/io-hdfs.pxi @@ -231,7 +231,6 @@ cdef class HadoopFileSystem: check_status(self.client.get() .GetPathInfo(c_path, info)) - def ls(self, path, bint full_info): cdef: c_string c_path = tobytes(path) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 211c2a3e6e9cf..cccb1736be6fa 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -255,13 +255,18 @@ cdef class NativeFile: if not hasattr(stream_or_path, 'read'): stream = 
open(stream_or_path, 'wb') - cleanup = lambda: stream.close() + + def cleanup(): + stream.close() else: stream = stream_or_path - cleanup = lambda: None + + def cleanup(): + pass done = False exc_info = None + def bg_write(): try: while not done or write_queue.qsize() > 0: @@ -326,6 +331,7 @@ cdef class NativeFile: done = False exc_info = None + def bg_write(): try: while not done or write_queue.qsize() > 0: diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 31ee578920eae..d6f62aa95c5fb 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -279,7 +279,7 @@ cdef class _RecordBatchFileWriter(_RecordBatchWriter): with nogil: check_status( CRecordBatchFileWriter.Open(self.sink.get(), schema.sp_schema, - &writer)) + &writer)) # Cast to base class, because has same interface self.writer = writer diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 3e1419bdfc072..48a58f7b82660 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -17,6 +17,7 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow cimport CStatus from cpython cimport PyObject from libcpp cimport nullptr @@ -24,9 +25,6 @@ cdef extern from "Python.h": int PySlice_Check(object) -from pyarrow.includes.libarrow cimport CStatus - - cdef int check_status(const CStatus& status) nogil except -1 diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 4df2fcd64f60f..789801b9f06a9 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -19,31 +19,27 @@ # distutils: language = c++ # cython: embedsignature = True -from cython.operator cimport dereference as deref -from pyarrow.includes.libarrow cimport * -from pyarrow.includes.common cimport PyObject_to_object -cimport pyarrow.includes.libarrow as libarrow -cimport cpython as cp - - import datetime import decimal as _pydecimal +import multiprocessing import numpy as np +import os import six from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical +from cython.operator cimport dereference as deref +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.common cimport PyObject_to_object +cimport pyarrow.includes.libarrow as libarrow +cimport cpython as cp + cdef _pandas(): import pandas as pd return pd - arrow_init_numpy() - -import numpy as np set_numpy_nan(np.nan) -import multiprocessing -import os cdef int CPU_COUNT = int( os.environ.get('OMP_NUM_THREADS', max(multiprocessing.cpu_count() // 2, 1))) @@ -62,6 +58,7 @@ def cpu_count(): """ return CPU_COUNT + def set_cpu_count(count): global CPU_COUNT CPU_COUNT = max(int(count), 1) @@ -122,7 +119,5 @@ include "ipc.pxi" # Feather format include "feather.pxi" -#---------------------------------------------------------------------- # Public API - include "public-api.pxi" diff --git a/python/pyarrow/plasma.pyx b/python/pyarrow/plasma.pyx index dd62d473b001a..befa283d85b54 100644 --- a/python/pyarrow/plasma.pyx +++ b/python/pyarrow/plasma.pyx @@ -60,8 +60,8 @@ cdef extern from "plasma/common.h": PLASMA_QUERY_LOCAL"plasma::PLASMA_QUERY_LOCAL", PLASMA_QUERY_ANYWHERE"plasma::PLASMA_QUERY_ANYWHERE" - cdef int ObjectStatusLocal"plasma::ObjectStatusLocal"; - cdef int ObjectStatusRemote"plasma::ObjectStatusRemote"; + cdef int ObjectStatusLocal"plasma::ObjectStatusLocal" + cdef int ObjectStatusRemote"plasma::ObjectStatusRemote" cdef extern from "plasma/client.h" nogil: @@ -102,7 +102,7 @@ cdef extern from "plasma/client.h" nogil: CStatus Wait(int64_t num_object_requests, CObjectRequest* 
object_requests, int num_ready_objects, int64_t timeout_ms, - int* num_objects_ready); + int* num_objects_ready) CStatus Transfer(const char* addr, int port, const CUniqueID& object_id) @@ -312,9 +312,10 @@ cdef class PlasmaClient: result = [] for i in range(object_buffers.size()): if object_buffers[i].data_size != -1: - result.append(self._make_plasma_buffer( - object_ids[i], object_buffers[i].data, - object_buffers[i].data_size)) + result.append( + self._make_plasma_buffer(object_ids[i], + object_buffers[i].data, + object_buffers[i].data_size)) else: result.append(None) return result @@ -345,9 +346,10 @@ cdef class PlasmaClient: self._get_object_buffers(object_ids, timeout_ms, &object_buffers) result = [] for i in range(object_buffers.size()): - result.append(self._make_plasma_buffer( - object_ids[i], object_buffers[i].metadata, - object_buffers[i].metadata_size)) + result.append( + self._make_plasma_buffer(object_ids[i], + object_buffers[i].metadata, + object_buffers[i].metadata_size)) return result def seal(self, ObjectID object_id): @@ -502,7 +504,7 @@ cdef class PlasmaClient: object_requests.data(), num_returns, timeout, &num_objects_ready)) - cdef int num_to_return = min(num_objects_ready, num_returns); + cdef int num_to_return = min(num_objects_ready, num_returns) ready_ids = [] waiting_ids = set(object_ids) cdef int num_returned = 0 @@ -510,7 +512,7 @@ cdef class PlasmaClient: if num_returned == num_to_return: break if (object_requests[i].status == ObjectStatusLocal or - object_requests[i].status == ObjectStatusRemote): + object_requests[i].status == ObjectStatusRemote): ready_ids.append( ObjectID(object_requests[i].object_id.binary())) waiting_ids.discard( diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 28e07ffc37dc3..7e08f632e872e 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -47,7 +47,8 @@ cdef public api bint pyarrow_is_data_type(object type_): return isinstance(type_, DataType) -cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type(object data_type): +cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type( + object data_type): cdef DataType type_ if pyarrow_is_data_type(data_type): type_ = <DataType>(data_type) @@ -57,7 +58,7 @@ cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type(object data_type) cdef public api object pyarrow_wrap_data_type( - const shared_ptr[CDataType]& type): + const shared_ptr[CDataType]& type): cdef: DataType out @@ -149,7 +150,7 @@ cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): cdef public api object pyarrow_wrap_chunked_array( - const shared_ptr[CChunkedArray]& sp_array): + const shared_ptr[CChunkedArray]& sp_array): if sp_array.get() == NULL: raise ValueError('ChunkedArray was NULL') @@ -177,7 +178,7 @@ cdef public api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): cdef public api object pyarrow_wrap_tensor( - const shared_ptr[CTensor]& sp_tensor): + const shared_ptr[CTensor]& sp_tensor): if sp_tensor.get() == NULL: raise ValueError('Tensor was NULL') @@ -238,7 +239,7 @@ cdef public api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): cdef public api object pyarrow_wrap_batch( - const shared_ptr[CRecordBatch]& cbatch): + const shared_ptr[CRecordBatch]& cbatch): cdef RecordBatch batch = RecordBatch() batch.init(cbatch) return batch diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 1f72070cb7e12..16d2bad0d2d8d 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ 
-154,11 +154,11 @@ cdef class Time32Value(ArrayValue): CTime32Type* dtype = ap.type().get() if dtype.unit() == TimeUnit_SECOND: - return (datetime.datetime(1970, 1, 1) + - datetime.timedelta(seconds=ap.Value(self.index))).time() + delta = datetime.timedelta(seconds=ap.Value(self.index)) + return (datetime.datetime(1970, 1, 1) + delta).time() else: - return (datetime.datetime(1970, 1, 1) + - datetime.timedelta(milliseconds=ap.Value(self.index))).time() + delta = datetime.timedelta(milliseconds=ap.Value(self.index)) + return (datetime.datetime(1970, 1, 1) + delta).time() cdef class Time64Value(ArrayValue): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 6277761b7d6ec..997b28579f847 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -495,7 +495,6 @@ cdef class RecordBatch: entries.append((name, column)) return OrderedDict(entries) - def to_pandas(self, nthreads=None): """ Convert the arrow::RecordBatch to a pandas DataFrame @@ -585,7 +584,6 @@ def table_to_blocks(Table table, int nthreads): return PyObject_to_object(result_obj) - cdef class Table: """ A collection of top-level named, equal length Arrow arrays. @@ -897,7 +895,8 @@ cdef class Table: """ Number of rows in this table. - Due to the definition of a table, all columns have the same number of rows. + Due to the definition of a table, all columns have the same number of + rows. Returns ------- diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ad2f336061580..592db4f90dac1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -419,7 +419,7 @@ cdef DataType primitive_type(Type type): _type_cache[type] = out return out -#------------------------------------------------------------ +# ----------------------------------------------------------- # Type factory functions cdef int convert_metadata(dict metadata, From 7e7861c55108dd257ed45512ec15323e9e7fb583 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 1 Aug 2017 22:54:57 -0400 Subject: [PATCH 03/38] ARROW-1257: Plasma documentation Thanks a lot to @crystalzyan who did all the heavy lifting for this PR! Author: Philipp Moritz Author: Crystal Yan Author: Robert Nishihara Closes #881 from pcmoritz/plasma-docs and squashes the following commits: c4ab47e0 [Robert Nishihara] Remove unsupported shell keyword from plasma.md. 4b987e83 [Robert Nishihara] Fix typo. 21bdc014 [Robert Nishihara] Small changes to python plasma documentation. 4163ccfa [Robert Nishihara] Some changes to plasma.md and add syntax highlighting. 791e5b0b [Philipp Moritz] API changes 80aaf89d [Philipp Moritz] cleanup c8847204 [Philipp Moritz] more fixes ba8b0dfa [Philipp Moritz] fix docs 193e00b0 [Philipp Moritz] unify installation instructions 84141b6c [Philipp Moritz] update C++ documentation 9a8437c9 [Philipp Moritz] edit the C++ tutorial (work in progress) caac4791 [Crystal Yan] Plasma C++ tutorial documentation - minor formatting fixes 5a8433e9 [Crystal Yan] Plasma C++ tutorial documentation - created a tutorial on C++ Plasma for Starting the Object Store, Creating Clients, Creating Objects, Getting Objects, Transferring to Remote Stores, Querying Status, Releasing Objects, and Shutting Down Clients and Stores. Basically all of the PlasmaClient API. Warning- I could not get C++ running on my machine to verify that any of the code runs properly/works. 
Please verify all code and tutorial content bc078ff8 [Philipp Moritz] complete installation instructions on macOS 3f3f373b [Philipp Moritz] fix plasma documentation f51f41e0 [Philipp Moritz] remove old test.py 2be9eab6 [Crystal Yan] Plasma documentation- Added using Pandas with Plasma sections. a49e1222 [Crystal Yan] Plasma documentation- Added parts on using Arrow with Plasma 25abf830 [Crystal Yan] Plasma documentation- tweaked contents headings hierarchy, added a bit to 'Getting an Object' subsection in Plasma API. 5cf63e92 [Crystal Yan] Plasma documentation- Copied and edited Plasma API section, added a contents header at top, minor tweaks to Linux Installation section. Still need to do Installation on Mac OS and storing Arrow/Panda in Plasma c02955bf [Crystal Yan] Plasma documentation- initial writeup of installation for linux. Installation for mac incomplete --- cpp/apidoc/index.md | 1 + cpp/apidoc/tutorials/plasma.md | 442 ++++++++++++++++++++++++++++++ python/doc/source/development.rst | 14 +- python/doc/source/index.rst | 1 + python/doc/source/plasma.rst | 337 +++++++++++++++++++++++ 5 files changed, 791 insertions(+), 4 deletions(-) create mode 100644 cpp/apidoc/tutorials/plasma.md create mode 100644 python/doc/source/plasma.rst diff --git a/cpp/apidoc/index.md b/cpp/apidoc/index.md index 8389d16b4aa1b..ab9bbaa405abc 100644 --- a/cpp/apidoc/index.md +++ b/cpp/apidoc/index.md @@ -39,6 +39,7 @@ Table of Contents * How to access [HDFS](HDFS.md) * Tutorials * [Convert a vector of row-wise data into an Arrow table](tutorials/row_wise_conversion.md) + * [Using the Plasma In-Memory Object Store](tutorials/plasma.md) Getting Started --------------- diff --git a/cpp/apidoc/tutorials/plasma.md b/cpp/apidoc/tutorials/plasma.md new file mode 100644 index 0000000000000..9911546ed5c69 --- /dev/null +++ b/cpp/apidoc/tutorials/plasma.md @@ -0,0 +1,442 @@ + + +Using the Plasma In-Memory Object Store from C++ +================================================ + +Apache Arrow offers the ability to share your data structures among multiple +processes simultaneously through Plasma, an in-memory object store. + +Note that **the Plasma API is not stable**. + +Plasma clients are processes that run on the same machine as the object store. +They communicate with the object store over Unix domain sockets, and they read +and write data in the object store through shared memory. + +Plasma objects are immutable once they have been created. + +The following goes over the basics so you can begin using Plasma in your big +data applications. + +Starting the Plasma store +------------------------- + +To start running the Plasma object store so that clients may +connect and access the data, run the following command: + +``` +plasma_store -m 1000000000 -s /tmp/plasma +``` + +The `-m` flag specifies the size of the object store in bytes. The `-s` flag +specifies the path of the Unix domain socket that the store will listen at. + +Therefore, the above command initializes a Plasma store up to 1 GB of memory +and sets the socket to `/tmp/plasma.` + +The Plasma store will remain available as long as the `plasma_store` process is +running in a terminal window. Messages, such as alerts for disconnecting +clients, may occasionally be output. To stop running the Plasma store, you +can press `Ctrl-C` in the terminal window. 
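+If you want to confirm from another shell that the store came up, one quick
+check is the following (an aside, not part of the original tutorial; it assumes
+a Unix-like system with `pgrep` available):
+
+```
+pgrep -f plasma_store
+```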
+ +Alternatively, you can run the Plasma store in the background and ignore all +message output with the following terminal command: + +``` +plasma_store -m 1000000000 -s /tmp/plasma 1> /dev/null 2> /dev/null & +``` + +The Plasma store will instead run silently in the background. To stop running +the Plasma store in this case, issue the command below: + +``` +killall plasma_store +``` + +Creating a Plasma client +------------------------ + +Now that the Plasma object store is up and running, it is time to make a client +process connect to it. To use the Plasma object store as a client, your +application should initialize a `plasma::PlasmaClient` object and tell it to +connect to the socket specified when starting up the Plasma object store. + +```cpp +#include <plasma/client.h> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + // Disconnect the Plasma client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +Save this program in a file `test.cc` and compile it with + +``` +g++ test.cc `pkg-config --cflags --libs plasma` --std=c++11 +``` + +Note that multiple clients can be created within the same process. + +Note that a `PlasmaClient` object is **not thread safe**. + +If the Plasma store is still running, you can now execute the `a.out` executable +and the store will print something like + +``` +Disconnecting client on fd 5 +``` + +which shows that the client was successfully disconnected. + +Object IDs +---------- + +The Plasma object store uses twenty-byte identifiers for accessing objects +stored in shared memory. Each object in the Plasma store should be associated +with a unique ID. The Object ID is then a key that can be used by **any** client +to fetch that object from the Plasma store. + +Random generation of Object IDs is often good enough to ensure unique IDs: + +```cpp +// Randomly generate an Object ID. +ObjectID object_id = ObjectID::from_random(); +``` + +Now, any connected client that knows the object's Object ID can access the +same object from the Plasma object store. For easy transportation of Object IDs, +you can convert/serialize an Object ID into a binary string and back as +follows: + +```cpp +// From ObjectID to binary string +std::string id_string = object_id.binary(); + +// From binary string to ObjectID +ObjectID id_object = ObjectID::from_binary(id_string); +``` + +You can also get a human readable representation of ObjectIDs in the same +format that git uses for commit hashes by running `ObjectID::hex`. + +Here is a test program you can run: + +```cpp +#include <plasma/client.h> +#include <iostream> +#include <string> + +using namespace plasma; + +int main(int argc, char** argv) { + ObjectID object_id1 = ObjectID::from_random(); + std::cout << "object_id1 is " << object_id1.hex() << std::endl; + + std::string id_string = object_id1.binary(); + ObjectID object_id2 = ObjectID::from_binary(id_string); + std::cout << "object_id2 is " << object_id2.hex() << std::endl; +} +``` + +Creating an Object +------------------ + +Now that you have learned about Object IDs that are used to refer to objects, +let's look at how objects can be stored in Plasma. + +Storing objects is a two-stage process. First a buffer is allocated with a call +to `Create`. Then it can be constructed in place by the client. Then it is made +immutable and shared with other clients via a call to `Seal`. + +The `Create` call blocks while the Plasma store allocates a buffer of the +appropriate size. 
The client will then map the buffer into its own address +space. At this point the object can be constructed in place using a pointer that +was written by the `Create` call. + +```cpp +int64_t data_size = 100; +// The address of the buffer allocated by the Plasma store will be written at +// this address. +uint8_t* data; +// Create a Plasma object by specifying its ID and size. +ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data)); +``` + +You can also specify metadata for the object; the third argument is the +metadata (as raw bytes) and the fourth argument is the size of the metadata. + +```cpp +// Create a Plasma object with metadata. +int64_t data_size = 100; +std::string metadata = "{'author': 'john'}"; +uint8_t* data; +client.Create(object_id, data_size, (uint8_t*) metadata.data(), metadata.size(), &data); +``` + +Now that we've obtained a pointer to our object's data, we can +write our data to it: + +```cpp +// Write some data for the Plasma object. +for (int64_t i = 0; i < data_size; i++) { + data[i] = static_cast<uint8_t>(i % 4); +} +``` + +When the client is done, the client **seals** the buffer, making the object +immutable, and making it available to other Plasma clients: + +```cpp +// Seal the object. This makes it available for all clients. +client.Seal(object_id); +``` + +Here is an example that combines all these features: + +```cpp +#include <plasma/client.h> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + // Create an object with a fixed ObjectID. + ObjectID object_id = ObjectID::from_binary("00000000000000000000"); + int64_t data_size = 1000; + uint8_t *data; + std::string metadata = "{'author': 'john'}"; + ARROW_CHECK_OK(client.Create(object_id, data_size, (uint8_t*) metadata.data(), metadata.size(), &data)); + // Write some data into the object. + for (int64_t i = 0; i < data_size; i++) { + data[i] = static_cast<uint8_t>(i % 4); + } + // Seal the object. + ARROW_CHECK_OK(client.Seal(object_id)); + // Disconnect the client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +This example can be compiled with + +``` +g++ create.cc `pkg-config --cflags --libs plasma` --std=c++11 -o create +``` + +To verify that an object exists in the Plasma object store, you can +call `PlasmaClient::Contains()` to check if an object has +been created and sealed for a given Object ID. Note that this function +will still return false if the object has been created, but not yet +sealed: + +```cpp +// Check if an object has been created and sealed. +bool has_object; +client.Contains(object_id, &has_object); +if (has_object) { + // Object has been created and sealed, proceed +} +``` + +Getting an Object +----------------- + +After an object has been sealed, any client who knows the Object ID can get +the object. To store the retrieved object contents, you should create an +`ObjectBuffer`, then call `PlasmaClient::Get()` as follows: + +```cpp +// Get from the Plasma store by Object ID. +ObjectBuffer object_buffer; +client.Get(&object_id, 1, -1, &object_buffer); +``` + +`PlasmaClient::Get()` isn't limited to fetching a single object +from the Plasma store at once. You can specify an array of Object IDs and +`ObjectBuffers` to fetch at once, so long as you also specify the +number of objects being fetched: + +```cpp +// Get two objects at once from the Plasma store. This function +// call will block until both objects have been fetched. 
+ObjectBuffer multiple_buffers[2]; +ObjectID multiple_ids[2] = {object_id1, object_id2}; +client.Get(multiple_ids, 2, -1, multiple_buffers); +``` + +Since `PlasmaClient::Get()` is a blocking function call, it may be +necessary to limit the amount of time the function is allowed to take +when trying to fetch from the Plasma store. You can pass in a timeout +in milliseconds when calling `PlasmaClient::Get()`. To use `PlasmaClient::Get()` +without a timeout, just pass in -1 like in the previous example calls: + +```cpp +// Make the function call give up fetching the object if it takes +// more than 100 milliseconds. +int64_t timeout = 100; +client.Get(&object_id, 1, timeout, &object_buffer); +``` + +Finally, to use the object, you can read the `data` and +`metadata` attributes of the `ObjectBuffer`. The `data` can be indexed +like any array: + +```cpp +// Access object data. +uint8_t* data = object_buffer.data; +int64_t data_size = object_buffer.data_size; + +// Access object metadata. +uint8_t* metadata = object_buffer.metadata; +int64_t metadata_size = object_buffer.metadata_size; + +// Index into data array. +uint8_t first_data_byte = data[0]; +``` + +Here is a longer example that shows these capabilities: + +```cpp +#include <plasma/client.h> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + ObjectID object_id = ObjectID::from_binary("00000000000000000000"); + ObjectBuffer object_buffer; + ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer)); + + // Retrieve object data. + uint8_t* data = object_buffer.data; + int64_t data_size = object_buffer.data_size; + + // Check that the data agrees with what was written in the other process. + for (int64_t i = 0; i < data_size; i++) { + ARROW_CHECK(data[i] == static_cast<uint8_t>(i % 4)); + } + + // Disconnect the client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +If you compile it with + +``` +g++ get.cc `pkg-config --cflags --libs plasma` --std=c++11 -o get +``` + +and run it with `./get`, all the assertions will pass if you first run the `create` +example from above on the same Plasma store. + + +Object Lifetime Management +-------------------------- + +The Plasma store internally does reference counting to make sure objects that +are mapped into the address space of one of the clients with `PlasmaClient::Get` +are accessible. To unmap objects from a client, call `PlasmaClient::Release`. +All objects that are mapped into a client's address space will automatically +be released when the client is disconnected from the store (this happens even +if the client process crashes or otherwise fails to call `Disconnect`). + +If a new object is created and there is not enough space in the Plasma store, +the store will evict the least recently used object (an object is in use if at +least one client has gotten it but not released it). + +Object notifications +-------------------- + +Additionally, you can arrange to have Plasma notify you when objects are +sealed in the object store. This can be especially handy when your +program is collaborating with other Plasma clients, and needs to know +when they make objects available. + +First, you can subscribe your current Plasma client to such notifications +by getting a file descriptor: + +```cpp +// Start receiving notifications into file_descriptor. 
+int fd; +ARROW_CHECK_OK(client.Subscribe(&fd)); +``` + +Once you have the file descriptor, you can have your current Plasma client +wait to receive the next object notification. Object notifications +include information such as Object ID, data size, and metadata size of +the next newly available object: + +```cpp +// Receive notification of the next newly available object. +// Notification information is stored in object_id, data_size, and metadata_size +ObjectID object_id; +int64_t data_size; +int64_t metadata_size; +ARROW_CHECK_OK(client.GetNotification(fd, &object_id, &data_size, &metadata_size)); + +// Get the newly available object. +ObjectBuffer object_buffer; +ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer)); +``` + +Here is a full program that shows this capability: + +```cpp +#include <plasma/client.h> +#include <iostream> + +using namespace plasma; + +int main(int argc, char** argv) { + // Start up and connect a Plasma client. + PlasmaClient client; + ARROW_CHECK_OK(client.Connect("/tmp/plasma", "", PLASMA_DEFAULT_RELEASE_DELAY)); + + int fd; + ARROW_CHECK_OK(client.Subscribe(&fd)); + + ObjectID object_id; + int64_t data_size; + int64_t metadata_size; + while (true) { + ARROW_CHECK_OK(client.GetNotification(fd, &object_id, &data_size, &metadata_size)); + + std::cout << "Received object notification for object_id = " + << object_id.hex() << ", with data_size = " << data_size + << ", and metadata_size = " << metadata_size << std::endl; + } + + // Disconnect the client. + ARROW_CHECK_OK(client.Disconnect()); +} +``` + +If you compile it with + +``` +g++ subscribe.cc `pkg-config --cflags --libs plasma` --std=c++11 -o subscribe +``` + +and invoke `./create` and `./subscribe` while the Plasma store is running, +you can observe the new object arriving. diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst index d0a1c544dd091..53544ba7a6ac3 100644 --- a/python/doc/source/development.rst +++ b/python/doc/source/development.rst @@ -159,12 +159,16 @@ Now build and install the Arrow C++ libraries: cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DARROW_PYTHON=on \ + -DARROW_PLASMA=on \ -DARROW_BUILD_TESTS=OFF \ .. make -j4 make install popd +If you don't want to build and install the Plasma in-memory object store, +you can omit the ``-DARROW_PLASMA=on`` flag. + Now, optionally build and install the Apache Parquet libraries in your toolchain: @@ -190,9 +194,10 @@ Now, build pyarrow: cd arrow/python python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ - --with-parquet --inplace + --with-parquet --with-plasma --inplace -If you did not build parquet-cpp, you can omit ``--with-parquet``. +If you did not build parquet-cpp, you can omit ``--with-parquet`` and if +you did not build with plasma, you can omit ``--with-plasma``. You should be able to run the unit tests with: @@ -224,9 +229,10 @@ You can build a wheel by running: .. code-block:: shell python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ - --with-parquet --bundle-arrow-cpp bdist_wheel + --with-parquet --with-plasma --bundle-arrow-cpp bdist_wheel -Again, if you did not build parquet-cpp, you should omit ``--with-parquet``. +Again, if you did not build parquet-cpp, you should omit ``--with-parquet`` and +if you did not build with plasma, you should omit ``--with-plasma``. 
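+As a quick sanity check that the Plasma extension was actually built, try
+importing it (a minimal sketch, not part of the build instructions above; it
+assumes the ``--with-plasma`` build succeeded):
+
+.. code-block:: shell
+
+   python -c "import pyarrow.plasma"
+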
 Developing on Windows
 =====================
diff --git a/python/doc/source/index.rst b/python/doc/source/index.rst
index a12853c448209..c2ae769b23e83 100644
--- a/python/doc/source/index.rst
+++ b/python/doc/source/index.rst
@@ -40,6 +40,7 @@ structures.
    data
    ipc
    filesystems
+   plasma
    pandas
    parquet
    api
diff --git a/python/doc/source/plasma.rst b/python/doc/source/plasma.rst
new file mode 100644
index 0000000000000..98dd62f97e951
--- /dev/null
+++ b/python/doc/source/plasma.rst
@@ -0,0 +1,337 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. _plasma:
+
+The Plasma In-Memory Object Store
+=================================
+
+.. contents:: Contents
+   :depth: 3
+
+
+The Plasma API
+--------------
+
+Starting the Plasma store
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can start the Plasma store by issuing a terminal command similar to the
+following:
+
+.. code-block:: bash
+
+   plasma_store -m 1000000000 -s /tmp/plasma
+
+The ``-m`` flag specifies the size of the store in bytes, and the ``-s`` flag
+specifies the socket that the store will listen on. Thus, the above command
+allows the Plasma store to use up to 1GB of memory, and sets the socket to
+``/tmp/plasma``.
+
+Leave the current terminal window open for as long as the Plasma store should
+keep running. Messages concerning events such as clients disconnecting may
+occasionally be printed to the screen. To stop running the Plasma store, you
+can press ``Ctrl-C`` in the terminal.
+
+Creating a Plasma client
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+To start a Plasma client from Python, call ``plasma.connect`` using the same
+socket name:
+
+.. code-block:: python
+
+   import pyarrow.plasma as plasma
+   client = plasma.connect("/tmp/plasma", "", 0)
+
+If the following error occurs when running the above Python code, it means
+that either the given socket path is incorrect or the ``plasma_store`` process
+is not currently running. Check to see if the Plasma store is still running.
+
+.. code-block:: shell
+
+   >>> client = plasma.connect("/tmp/plasma", "", 0)
+   Connection to socket failed for pathname /tmp/plasma
+   Could not connect to socket /tmp/plasma
+
+
+Object IDs
+^^^^^^^^^^
+
+Each object in the Plasma store should be associated with a unique ID. The
+Object ID then serves as a key that any client can use to retrieve that object
+from the Plasma store. You can form an ``ObjectID`` object from a byte string of
+length 20.
+
+.. code-block:: shell
+
+   # Create an ObjectID.
+   >>> id = plasma.ObjectID(20 * b"a")
+
+   # The character "a" is encoded as 61 in hex.
+   >>> id
+   ObjectID(6161616161616161616161616161616161616161)
+
+Randomly generated Object IDs are usually good enough to ensure uniqueness.
+You can easily create a helper function that randomly generates object IDs as
+follows:
+
+.. code-block:: python
+
+   import numpy as np
+
+   def random_object_id():
+       return plasma.ObjectID(np.random.bytes(20))
+
+
+Creating an Object
+^^^^^^^^^^^^^^^^^^
+
+Objects are created in Plasma in two stages. First, they are **created**, which
+allocates a buffer for the object. At this point, the client can write to the
+buffer and construct the object within the allocated buffer.
+
+To create an object for Plasma, you need to create an object ID, as well as
+give the object's maximum size in bytes.
+
+.. code-block:: python
+
+   # Create an object.
+   object_id = plasma.ObjectID(20 * b"a")
+   object_size = 1000
+   buffer = memoryview(client.create(object_id, object_size))
+
+   # Write to the buffer.
+   for i in range(1000):
+       buffer[i] = i % 128
+
+When the client is done writing, it **seals** the buffer, making the object
+immutable and making it available to other Plasma clients.
+
+.. code-block:: python
+
+   # Seal the object. This makes the object immutable and available to other clients.
+   client.seal(object_id)
+
+
+Getting an Object
+^^^^^^^^^^^^^^^^^
+
+After an object has been sealed, any client that knows the object ID can get
+the object.
+
+.. code-block:: python
+
+   # Create a different client. Note that this second client could be
+   # created in the same or in a separate, concurrent Python session.
+   client2 = plasma.connect("/tmp/plasma", "", 0)
+
+   # Get the object in the second client. This blocks until the object has been sealed.
+   object_id2 = plasma.ObjectID(20 * b"a")
+   [buffer2] = client2.get([object_id2])
+
+If the object has not been sealed yet, then the call to ``client.get`` will block
+until the object has been sealed by the client constructing the object. Using
+the ``timeout_ms`` argument to ``get``, you can specify a timeout for this (in
+milliseconds). After the timeout expires, control is returned to the
+interpreter.
+
+.. code-block:: shell
+
+   >>> buffer
+   <memory at 0x...>
+   >>> buffer[1]
+   1
+   >>> buffer2
+   <plasma.PlasmaBuffer object at 0x...>
+   >>> view2 = memoryview(buffer2)
+   >>> view2[1]
+   1
+   >>> view2[129]
+   1
+   >>> bytes(buffer[1:4])
+   b'\x01\x02\x03'
+   >>> bytes(view2[1:4])
+   b'\x01\x02\x03'
+
+
+Using Arrow and Pandas with Plasma
+----------------------------------
+
+Storing Arrow Objects in Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To store an Arrow object in Plasma, we must first **create** the object and then
+**seal** it. However, Arrow objects such as ``Tensors`` may be more complicated
+to write than simple binary data.
+
+To create the object in Plasma, you still need an ``ObjectID`` and a size to
+pass in. To find out the size of your Arrow object, you can use pyarrow
+APIs such as ``pyarrow.get_tensor_size``.
+
+.. code-block:: python
+
+   import numpy as np
+   import pyarrow as pa
+
+   # Create a pyarrow.Tensor object from a numpy random 2-dimensional array
+   data = np.random.randn(10, 4)
+   tensor = pa.Tensor.from_numpy(data)
+
+   # Create the object in Plasma
+   object_id = plasma.ObjectID(np.random.bytes(20))
+   data_size = pa.get_tensor_size(tensor)
+   buf = client.create(object_id, data_size)
+
+To write the Arrow ``Tensor`` object into the buffer, wrap the buffer returned
+by Plasma in a ``pyarrow.FixedSizeBufferOutputStream`` object, a sink
+suitable for Arrow's ``pyarrow.write_tensor``:
+
+.. code-block:: python
+
+   # Write the tensor into the Plasma-allocated buffer
+   stream = pa.FixedSizeBufferOutputStream(buf)
+   pa.write_tensor(tensor, stream)  # Writes tensor's 552 bytes to Plasma stream
+
+To finish storing the Arrow object in Plasma, call ``seal``:
+
+.. code-block:: python
+
+   # Seal the Plasma object
+   client.seal(object_id)
+
+Getting Arrow Objects from Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To read the object, first retrieve it as a ``PlasmaBuffer`` using its object ID.
+
+.. code-block:: python
+
+   # Get the Arrow object by ObjectID.
+   [buf2] = client.get([object_id])
+
+To convert the ``PlasmaBuffer`` back into an Arrow ``Tensor``, first create a
+pyarrow ``BufferReader`` object from it. You can then pass the ``BufferReader``
+into ``pyarrow.read_tensor`` to reconstruct the Arrow ``Tensor`` object:
+
+.. code-block:: python
+
+   # Reconstruct the Arrow tensor object.
+   reader = pa.BufferReader(buf2)
+   tensor2 = pa.read_tensor(reader)
+
+Finally, you can call the reconstructed ``Tensor``'s ``to_numpy`` method to
+convert it back into numpy data:
+
+.. code-block:: python
+
+   # Convert back to numpy
+   array = tensor2.to_numpy()
+
+Storing Pandas DataFrames in Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Storing a Pandas ``DataFrame`` still follows the **create** then **seal**
+process of storing an object in the Plasma store; however, one cannot directly
+write the ``DataFrame`` to Plasma with Pandas alone. Plasma also needs to know
+the size of the ``DataFrame`` so that it can allocate a buffer of the right
+size.
+
+See :ref:`pandas` for more information on using Arrow with Pandas.
+
+You can create the pyarrow equivalent of a Pandas ``DataFrame`` by using
+``pyarrow.RecordBatch.from_pandas`` to convert it to a ``RecordBatch``.
+
+.. code-block:: python
+
+   import pyarrow as pa
+   import pandas as pd
+
+   # Create a Pandas DataFrame
+   d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
+        'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
+   df = pd.DataFrame(d)
+
+   # Convert the Pandas DataFrame into a PyArrow RecordBatch
+   record_batch = pa.RecordBatch.from_pandas(df)
+
+Creating the Plasma object requires an ``ObjectID`` and the size of the
+data. Now that we have converted the Pandas ``DataFrame`` into a PyArrow
+``RecordBatch``, use a ``MockOutputStream`` to determine the
+size of the Plasma object.
+
+.. code-block:: python
+
+   # Create the Plasma object from the PyArrow RecordBatch. Most of the work here
+   # is done to determine the size of buffer to request from the object store.
+   object_id = plasma.ObjectID(np.random.bytes(20))
+   mock_sink = pa.MockOutputStream()
+   stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
+   stream_writer.write_batch(record_batch)
+   stream_writer.close()
+   data_size = mock_sink.size()
+   buf = client.create(object_id, data_size)
+
+The DataFrame can now be written to the buffer as follows.
+
+.. code-block:: python
+
+   # Write the PyArrow RecordBatch to Plasma
+   stream = pa.FixedSizeBufferOutputStream(buf)
+   stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema)
+   stream_writer.write_batch(record_batch)
+   stream_writer.close()
+
+Finally, seal the finished object for use by all clients:
+
+.. code-block:: python
+
+   # Seal the Plasma object
+   client.seal(object_id)
+
+Getting Pandas DataFrames from Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Since we store the Pandas DataFrame as a PyArrow ``RecordBatch`` object,
+to get the object back from the Plasma store, we follow similar steps
+to those specified in `Getting Arrow Objects from Plasma`_.
+
+We first have to convert the ``PlasmaBuffer`` returned from ``client.get``
+into an Arrow ``BufferReader`` object.
+
+.. code-block:: python
+
+   # Fetch the Plasma object
+   [data] = client.get([object_id])  # Get PlasmaBuffer from ObjectID
+   buffer = pa.BufferReader(data)
+
+From the ``BufferReader``, we can create a ``RecordBatchStreamReader``
+in Arrow to reconstruct the stored PyArrow ``RecordBatch`` object.
+
+.. code-block:: python
+
+   # Convert object back into an Arrow RecordBatch
+   reader = pa.RecordBatchStreamReader(buffer)
+   record_batch = reader.read_next_batch()
+
+The last step is to convert the PyArrow ``RecordBatch`` object back into
+the original Pandas ``DataFrame`` structure.
+
+.. code-block:: python
+
+   # Convert back into Pandas
+   result = record_batch.to_pandas()
From e50b6ae5c8044ac425db8609fb837fe7c95bd393 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Tue, 1 Aug 2017 23:00:37 -0400
Subject: [PATCH 04/38] ARROW-1308: [C++] Link utility executables to Arrow shared library if ARROW_BUILD_STATIC=off

Author: Wes McKinney

Closes #931 from wesm/ARROW-1308 and squashes the following commits:

88391fe0 [Wes McKinney] Link utility executables to Arrow shared library if ARROW_BUILD_STATIC=off
---
 cpp/src/arrow/ipc/CMakeLists.txt | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt
index 9cc61bced0619..76e52a0f78b9a 100644
--- a/cpp/src/arrow/ipc/CMakeLists.txt
+++ b/cpp/src/arrow/ipc/CMakeLists.txt
@@ -90,18 +90,22 @@ install(FILES
   writer.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/ipc")
 
-if(MSVC)
-  set(UTIL_LINK_LIBS
-    arrow_static
-    ${BOOST_FILESYSTEM_LIBRARY}
-    ${BOOST_SYSTEM_LIBRARY})
+if (ARROW_BUILD_STATIC)
+  set(ARROW_UTIL_LIB arrow_static)
 else()
+  set(ARROW_UTIL_LIB arrow_shared)
+endif()
+
+set(UTIL_LINK_LIBS
+  ${ARROW_UTIL_LIB}
+  ${BOOST_FILESYSTEM_LIBRARY}
+  ${BOOST_SYSTEM_LIBRARY})
+
+if(NOT MSVC)
   set(UTIL_LINK_LIBS
-    arrow_static
+    ${UTIL_LINK_LIBS}
     pthread
-    ${BOOST_FILESYSTEM_LIBRARY}
-    ${BOOST_SYSTEM_LIBRARY}
-    dl)
+    ${CMAKE_DL_LIBS})
 endif()
 
 if (ARROW_BUILD_UTILITIES)
From b95bed050c6d5d0943fd7866cfdade7fd9cc4904 Mon Sep 17 00:00:00 2001
From: Kouhei Sutou
Date: Wed, 2 Aug 2017 11:44:40 -0400
Subject: [PATCH 05/38] ARROW-1303: [C++] Support downloading Boost

CentOS 6 ships an old Boost. If we support downloading Boost, we can
build RPMs for CentOS 6 easily.
Author: Kouhei Sutou Closes #927 from kou/cpp-boost-download and squashes the following commits: 5bf6818 [Kouhei Sutou] [C++] Always use static link for vendored Boost 832673d [Kouhei Sutou] [C++] Add ARROW_BOOST_VENDORED option 290e6e1 [Kouhei Sutou] [C++] Support downloading Boost --- cpp/CMakeLists.txt | 4 + cpp/cmake_modules/ThirdpartyToolchain.cmake | 118 ++++++++++++++------ 2 files changed, 88 insertions(+), 34 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 07b8e15b504e4..eeff9124ad983 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -114,6 +114,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Rely on boost shared libraries where relevant" ON) + option(ARROW_BOOST_VENDORED + "Use vendored Boost instead of existing Boost" + OFF) + option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1271b8a4ab3f4..ae48e8d2fb979 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -124,50 +124,100 @@ set(Boost_ADDITIONAL_VERSIONS "1.62.0" "1.61" "1.61.0" "1.62" "1.60.0" "1.60") - -if (ARROW_BOOST_USE_SHARED) - # Find shared Boost libraries. - set(Boost_USE_STATIC_LIBS OFF) - - if(MSVC) - # disable autolinking in boost - add_definitions(-DBOOST_ALL_NO_LIB) - - # force all boost libraries to dynamic link - add_definitions(-DBOOST_ALL_DYN_LINK) - endif() - +list(GET Boost_ADDITIONAL_VERSIONS 0 BOOST_LATEST_VERSION) +string(REPLACE "." "_" BOOST_LATEST_VERSION_IN_PATH ${BOOST_LATEST_VERSION}) +set(BOOST_LATEST_URL + "https://dl.bintray.com/boostorg/release/${BOOST_LATEST_VERSION}/source/boost_${BOOST_LATEST_VERSION_IN_PATH}.tar.gz") + +if (ARROW_BOOST_VENDORED) + set(BOOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/boost_ep-prefix/src/boost_ep") + set(BOOST_LIB_DIR "${BOOST_PREFIX}/stage/lib") + set(BOOST_BUILD_LINK "static") + set(BOOST_STATIC_SYSTEM_LIBRARY + "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(BOOST_STATIC_FILESYSTEM_LIBRARY + "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}") + set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}") if (ARROW_BOOST_HEADER_ONLY) - find_package(Boost) + set(BOOST_BUILD_PRODUCTS) + set(BOOST_CONFIGURE_COMMAND "") + set(BOOST_BUILD_COMMAND "") else() - find_package(Boost COMPONENTS system filesystem REQUIRED) + set(BOOST_BUILD_PRODUCTS + ${BOOST_SYSTEM_LIBRARY} + ${BOOST_FILESYSTEM_LIBRARY}) + set(BOOST_CONFIGURE_COMMAND + "./bootstrap.sh" + "--prefix=${BOOST_PREFIX}" + "--with-libraries=filesystem,system") if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) - set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_BUILD_VARIANT "debug") else() - set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) - set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_BUILD_VARIANT "release") endif() - set(BOOST_SYSTEM_LIBRARY boost_system_shared) - set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + set(BOOST_BUILD_COMMAND + "./b2" + "link=${BOOST_BUILD_LINK}" + "variant=${BOOST_BUILD_VARIANT}" + "cxxflags=-fPIC") endif() + ExternalProject_Add(boost_ep + URL ${BOOST_LATEST_URL} + BUILD_BYPRODUCTS ${BOOST_BUILD_PRODUCTS} + BUILD_IN_SOURCE 1 + 
CONFIGURE_COMMAND ${BOOST_CONFIGURE_COMMAND} + BUILD_COMMAND ${BOOST_BUILD_COMMAND} + INSTALL_COMMAND "" + ${EP_LOG_OPTIONS}) + set(Boost_INCLUDE_DIR "${BOOST_PREFIX}") + set(Boost_INCLUDE_DIRS "${BOOST_INCLUDE_DIR}") + add_dependencies(arrow_dependencies boost_ep) else() - # Find static boost headers and libs - # TODO Differentiate here between release and debug builds - set(Boost_USE_STATIC_LIBS ON) - if (ARROW_BOOST_HEADER_ONLY) - find_package(Boost) + if (ARROW_BOOST_USE_SHARED) + # Find shared Boost libraries. + set(Boost_USE_STATIC_LIBS OFF) + + if(MSVC) + # disable autolinking in boost + add_definitions(-DBOOST_ALL_NO_LIB) + + # force all boost libraries to dynamic link + add_definitions(-DBOOST_ALL_DYN_LINK) + endif() + + if (ARROW_BOOST_HEADER_ONLY) + find_package(Boost REQUIRED) + else() + find_package(Boost COMPONENTS system filesystem REQUIRED) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) + set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + else() + set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + endif() + set(BOOST_SYSTEM_LIBRARY boost_system_shared) + set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + endif() else() - find_package(Boost COMPONENTS system filesystem REQUIRED) - if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) - set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + # Find static boost headers and libs + # TODO Differentiate here between release and debug builds + set(Boost_USE_STATIC_LIBS ON) + if (ARROW_BOOST_HEADER_ONLY) + find_package(Boost REQUIRED) else() - set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) - set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + find_package(Boost COMPONENTS system filesystem REQUIRED) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + else() + set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + endif() + set(BOOST_SYSTEM_LIBRARY boost_system_static) + set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) endif() - set(BOOST_SYSTEM_LIBRARY boost_system_static) - set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) endif() endif() From 5917e07c730ed8ac70fdc5b4872795de539ae9ba Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 2 Aug 2017 11:47:00 -0400 Subject: [PATCH 06/38] ARROW-1305: [GLib] Add GArrowIntArrayBuilder Author: Kouhei Sutou Closes #928 from kou/glib-add-int-array-builder and squashes the following commits: 3419edf [Kouhei Sutou] [GLib] Make buildable with Clang c5112d5 [Kouhei Sutou] [GLib] Add GArrowIntArrayBuilder --- c_glib/arrow-glib/array-builder.cpp | 486 ++++++++++++++------------ c_glib/arrow-glib/array-builder.h | 51 +++ c_glib/arrow-glib/array-builder.hpp | 3 +- c_glib/test/helper/buildable.rb | 4 + c_glib/test/test-int-array-builder.rb | 59 ++++ 5 files changed, 383 insertions(+), 220 deletions(-) create mode 100644 c_glib/test/test-int-array-builder.rb diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 17b2ec8b55069..23bc842756e65 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -25,6 +25,31 
@@ #include #include +template +gboolean +garrow_array_builder_append(GArrowArrayBuilder *builder, + VALUE value, + GError **error, + const gchar *context) +{ + auto arrow_builder = + static_cast(garrow_array_builder_get_raw(builder)); + auto status = arrow_builder->Append(value); + return garrow_error_check(error, status, context); +} + +template +gboolean +garrow_array_builder_append_null(GArrowArrayBuilder *builder, + GError **error, + const gchar *context) +{ + auto arrow_builder = + static_cast(garrow_array_builder_get_raw(builder)); + auto status = arrow_builder->AppendNull(); + return garrow_error_check(error, status, context); +} + G_BEGIN_DECLS /** @@ -41,6 +66,11 @@ G_BEGIN_DECLS * #GArrowBooleanArrayBuilder is the class to create a new * #GArrowBooleanArray. * + * #GArrowIntArrayBuilder is the class to create a new integer + * array. Integer size is automatically chosen. It's recommend that + * you use this builder instead of specific integer size builder such + * as #GArrowInt8ArrayBuilder. + * * #GArrowInt8ArrayBuilder is the class to create a new * #GArrowInt8Array. * @@ -244,12 +274,11 @@ garrow_boolean_array_builder_append(GArrowBooleanArrayBuilder *builder, gboolean value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(static_cast(value)); - return garrow_error_check(error, status, "[boolean-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + static_cast(value), + error, + "[boolean-array-builder][append]"); } /** @@ -263,14 +292,83 @@ gboolean garrow_boolean_array_builder_append_null(GArrowBooleanArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[boolean-array-builder][append-null]"); +} - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[boolean-array-builder][append-null]"); + +G_DEFINE_TYPE(GArrowIntArrayBuilder, + garrow_int_array_builder, + GARROW_TYPE_ARRAY_BUILDER) + +static void +garrow_int_array_builder_init(GArrowIntArrayBuilder *builder) +{ +} + +static void +garrow_int_array_builder_class_init(GArrowIntArrayBuilderClass *klass) +{ +} + +/** + * garrow_int_array_builder_new: + * + * Returns: A newly created #GArrowIntArrayBuilder. + * + * Since: 0.6.0 + */ +GArrowIntArrayBuilder * +garrow_int_array_builder_new(void) +{ + auto memory_pool = arrow::default_memory_pool(); + auto arrow_builder = new arrow::AdaptiveIntBuilder(memory_pool); + auto builder = garrow_array_builder_new_raw(arrow_builder, + GARROW_TYPE_INT_ARRAY_BUILDER); + return GARROW_INT_ARRAY_BUILDER(builder); +} + +/** + * garrow_int_array_builder_append: + * @builder: A #GArrowIntArrayBuilder. + * @value: A int value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.6.0 + */ +gboolean +garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int-array-builder][append]"); +} + +/** + * garrow_int_array_builder_append_null: + * @builder: A #GArrowIntArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.6.0 + */ +gboolean +garrow_int_array_builder_append_null(GArrowIntArrayBuilder *builder, + GError **error) +{ + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[int-array-builder][append-null]"); } @@ -315,12 +413,11 @@ garrow_int8_array_builder_append(GArrowInt8ArrayBuilder *builder, gint8 value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int8-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int8-array-builder][append]"); } /** @@ -334,12 +431,10 @@ gboolean garrow_int8_array_builder_append_null(GArrowInt8ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int8-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[int8-array-builder][append-null]"); } @@ -384,12 +479,11 @@ garrow_uint8_array_builder_append(GArrowUInt8ArrayBuilder *builder, guint8 value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint8-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[uint8-array-builder][append]"); } /** @@ -403,12 +497,10 @@ gboolean garrow_uint8_array_builder_append_null(GArrowUInt8ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[uint8-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint8-array-builder][append-null]"); } @@ -453,12 +545,11 @@ garrow_int16_array_builder_append(GArrowInt16ArrayBuilder *builder, gint16 value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int16-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int16-array-builder][append]"); } /** @@ -472,12 +563,10 @@ gboolean garrow_int16_array_builder_append_null(GArrowInt16ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int16-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[int16-array-builder][append-null]"); } @@ -522,12 +611,11 @@ garrow_uint16_array_builder_append(GArrowUInt16ArrayBuilder *builder, guint16 value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint16-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + 
value, + error, + "[uint16-array-builder][append]"); } /** @@ -541,14 +629,10 @@ gboolean garrow_uint16_array_builder_append_null(GArrowUInt16ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[uint16-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint16-array-builder][append-null]"); } @@ -593,12 +677,11 @@ garrow_int32_array_builder_append(GArrowInt32ArrayBuilder *builder, gint32 value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int32-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int32-array-builder][append]"); } /** @@ -612,12 +695,10 @@ gboolean garrow_int32_array_builder_append_null(GArrowInt32ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int32-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[int32-array-builder][append-null]"); } @@ -662,12 +743,11 @@ garrow_uint32_array_builder_append(GArrowUInt32ArrayBuilder *builder, guint32 value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint32-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[uint32-array-builder][append]"); } /** @@ -681,14 +761,10 @@ gboolean garrow_uint32_array_builder_append_null(GArrowUInt32ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[uint32-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint32-array-builder][append-null]"); } @@ -733,12 +809,11 @@ garrow_int64_array_builder_append(GArrowInt64ArrayBuilder *builder, gint64 value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[int64-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[int64-array-builder][append]"); } /** @@ -752,12 +827,10 @@ gboolean garrow_int64_array_builder_append_null(GArrowInt64ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[int64-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[int64-array-builder][append-null]"); } @@ -802,12 +875,11 @@ garrow_uint64_array_builder_append(GArrowUInt64ArrayBuilder *builder, guint64 value, 
GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[uint64-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[uint64-array-builder][append]"); } /** @@ -821,17 +893,10 @@ gboolean garrow_uint64_array_builder_append_null(GArrowUInt64ArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - if (status.ok()) { - return TRUE; - } else { - garrow_error_check(error, status, "[uint64-array-builder][append-null]"); - return FALSE; - } + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[uint64-array-builder][append-null]"); } @@ -876,12 +941,11 @@ garrow_float_array_builder_append(GArrowFloatArrayBuilder *builder, gfloat value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[float-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[float-array-builder][append]"); } /** @@ -895,12 +959,10 @@ gboolean garrow_float_array_builder_append_null(GArrowFloatArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[float-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[float-array-builder][append-null]"); } @@ -945,12 +1007,11 @@ garrow_double_array_builder_append(GArrowDoubleArrayBuilder *builder, gdouble value, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->Append(value); - return garrow_error_check(error, status, "[double-array-builder][append]"); + return garrow_array_builder_append + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[double-array-builder][append]"); } /** @@ -964,14 +1025,10 @@ gboolean garrow_double_array_builder_append_null(GArrowDoubleArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[double-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[double-array-builder][append-null]"); } @@ -1037,14 +1094,10 @@ gboolean garrow_binary_array_builder_append_null(GArrowBinaryArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[binary-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[binary-array-builder][append-null]"); } @@ -1240,12 +1293,10 @@ gboolean garrow_list_array_builder_append_null(GArrowListArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - 
garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, status, "[list-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[list-array-builder][append-null]"); } /** @@ -1390,14 +1441,10 @@ gboolean garrow_struct_array_builder_append_null(GArrowStructArrayBuilder *builder, GError **error) { - auto arrow_builder = - static_cast( - garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); - - auto status = arrow_builder->AppendNull(); - return garrow_error_check(error, - status, - "[struct-array-builder][append-null]"); + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[struct-array-builder][append-null]"); } /** @@ -1450,59 +1497,60 @@ garrow_struct_array_builder_get_field_builders(GArrowStructArrayBuilder *builder G_END_DECLS GArrowArrayBuilder * -garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder) -{ - GType type; - - switch (arrow_builder->type()->id()) { - case arrow::Type::type::BOOL: - type = GARROW_TYPE_BOOLEAN_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT8: - type = GARROW_TYPE_UINT8_ARRAY_BUILDER; - break; - case arrow::Type::type::INT8: - type = GARROW_TYPE_INT8_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT16: - type = GARROW_TYPE_UINT16_ARRAY_BUILDER; - break; - case arrow::Type::type::INT16: - type = GARROW_TYPE_INT16_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT32: - type = GARROW_TYPE_UINT32_ARRAY_BUILDER; - break; - case arrow::Type::type::INT32: - type = GARROW_TYPE_INT32_ARRAY_BUILDER; - break; - case arrow::Type::type::UINT64: - type = GARROW_TYPE_UINT64_ARRAY_BUILDER; - break; - case arrow::Type::type::INT64: - type = GARROW_TYPE_INT64_ARRAY_BUILDER; - break; - case arrow::Type::type::FLOAT: - type = GARROW_TYPE_FLOAT_ARRAY_BUILDER; - break; - case arrow::Type::type::DOUBLE: - type = GARROW_TYPE_DOUBLE_ARRAY_BUILDER; - break; - case arrow::Type::type::BINARY: - type = GARROW_TYPE_BINARY_ARRAY_BUILDER; - break; - case arrow::Type::type::STRING: - type = GARROW_TYPE_STRING_ARRAY_BUILDER; - break; - case arrow::Type::type::LIST: - type = GARROW_TYPE_LIST_ARRAY_BUILDER; - break; - case arrow::Type::type::STRUCT: - type = GARROW_TYPE_STRUCT_ARRAY_BUILDER; - break; - default: - type = GARROW_TYPE_ARRAY_BUILDER; - break; +garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder, + GType type) +{ + if (type == G_TYPE_INVALID) { + switch (arrow_builder->type()->id()) { + case arrow::Type::type::BOOL: + type = GARROW_TYPE_BOOLEAN_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT8: + type = GARROW_TYPE_UINT8_ARRAY_BUILDER; + break; + case arrow::Type::type::INT8: + type = GARROW_TYPE_INT8_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT16: + type = GARROW_TYPE_UINT16_ARRAY_BUILDER; + break; + case arrow::Type::type::INT16: + type = GARROW_TYPE_INT16_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT32: + type = GARROW_TYPE_UINT32_ARRAY_BUILDER; + break; + case arrow::Type::type::INT32: + type = GARROW_TYPE_INT32_ARRAY_BUILDER; + break; + case arrow::Type::type::UINT64: + type = GARROW_TYPE_UINT64_ARRAY_BUILDER; + break; + case arrow::Type::type::INT64: + type = GARROW_TYPE_INT64_ARRAY_BUILDER; + break; + case arrow::Type::type::FLOAT: + type = GARROW_TYPE_FLOAT_ARRAY_BUILDER; + break; + case arrow::Type::type::DOUBLE: + type = GARROW_TYPE_DOUBLE_ARRAY_BUILDER; + break; + case arrow::Type::type::BINARY: + type = 
GARROW_TYPE_BINARY_ARRAY_BUILDER; + break; + case arrow::Type::type::STRING: + type = GARROW_TYPE_STRING_ARRAY_BUILDER; + break; + case arrow::Type::type::LIST: + type = GARROW_TYPE_LIST_ARRAY_BUILDER; + break; + case arrow::Type::type::STRUCT: + type = GARROW_TYPE_STRUCT_ARRAY_BUILDER; + break; + default: + type = GARROW_TYPE_ARRAY_BUILDER; + break; + } } auto builder = diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index f5a8ac73d630a..97cea63078b12 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -119,6 +119,57 @@ gboolean garrow_boolean_array_builder_append_null(GArrowBooleanArrayBuilder *bui GError **error); +#define GARROW_TYPE_INT_ARRAY_BUILDER \ + (garrow_int_array_builder_get_type()) +#define GARROW_INT_ARRAY_BUILDER(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_INT_ARRAY_BUILDER, \ + GArrowIntArrayBuilder)) +#define GARROW_INT_ARRAY_BUILDER_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_INT_ARRAY_BUILDER, \ + GArrowIntArrayBuilderClass)) +#define GARROW_IS_INT_ARRAY_BUILDER(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_INT_ARRAY_BUILDER)) +#define GARROW_IS_INT_ARRAY_BUILDER_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_INT_ARRAY_BUILDER)) +#define GARROW_INT_ARRAY_BUILDER_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_INT_ARRAY_BUILDER, \ + GArrowIntArrayBuilderClass)) + +typedef struct _GArrowIntArrayBuilder GArrowIntArrayBuilder; +typedef struct _GArrowIntArrayBuilderClass GArrowIntArrayBuilderClass; + +/** + * GArrowIntArrayBuilder: + * + * It wraps `arrow::AdaptiveIntBuilder`. + */ +struct _GArrowIntArrayBuilder +{ + /*< private >*/ + GArrowArrayBuilder parent_instance; +}; + +struct _GArrowIntArrayBuilderClass +{ + GArrowArrayBuilderClass parent_class; +}; + +GType garrow_int_array_builder_get_type(void) G_GNUC_CONST; + +GArrowIntArrayBuilder *garrow_int_array_builder_new(void); + +gboolean garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error); +gboolean garrow_int_array_builder_append_null(GArrowIntArrayBuilder *builder, + GError **error); + + #define GARROW_TYPE_INT8_ARRAY_BUILDER \ (garrow_int8_array_builder_get_type()) #define GARROW_INT8_ARRAY_BUILDER(obj) \ diff --git a/c_glib/arrow-glib/array-builder.hpp b/c_glib/arrow-glib/array-builder.hpp index e65ad005c12fd..bcdc58fd8844b 100644 --- a/c_glib/arrow-glib/array-builder.hpp +++ b/c_glib/arrow-glib/array-builder.hpp @@ -22,5 +22,6 @@ #include #include -GArrowArrayBuilder *garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder); +GArrowArrayBuilder *garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder, + GType type=G_TYPE_INVALID); arrow::ArrayBuilder *garrow_array_builder_get_raw(GArrowArrayBuilder *builder); diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index f1bac47d6c7b9..3181c098c002b 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -21,6 +21,10 @@ def build_boolean_array(values) build_array(Arrow::BooleanArrayBuilder, values) end + def build_int_array(values) + build_array(Arrow::IntArrayBuilder, values) + end + def build_int8_array(values) build_array(Arrow::Int8ArrayBuilder, values) end diff --git a/c_glib/test/test-int-array-builder.rb b/c_glib/test/test-int-array-builder.rb new file mode 100644 index 0000000000000..e1a6c3b216597 --- /dev/null +++ b/c_glib/test/test-int-array-builder.rb @@ -0,0 +1,59 @@ +# 
Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestIntArrayBuilder < Test::Unit::TestCase + include Helper::Buildable + + def test_int8 + values = [-1, 2] + assert_equal(build_int_array([*values, nil]), + Arrow::Int8Array.new(3, + Arrow::Buffer.new(values.pack("c*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end + + def test_int16 + border_value = (2 ** (8 - 1)) + values = [-1, border_value] + assert_equal(build_int_array([*values, nil]), + Arrow::Int16Array.new(3, + Arrow::Buffer.new(values.pack("s*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end + + def test_int32 + border_value = (2 ** (16 - 1)) + values = [-1, border_value] + assert_equal(build_int_array([*values, nil]), + Arrow::Int32Array.new(3, + Arrow::Buffer.new(values.pack("l*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end + + def test_int64 + border_value = (2 ** (32 - 1)) + values = [-1, border_value] + assert_equal(build_int_array([*values, nil]), + Arrow::Int64Array.new(3, + Arrow::Buffer.new(values.pack("q*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) + end +end From ee928d2233da89ebd1f567ffda4833f4f07e795c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 2 Aug 2017 11:48:55 -0400 Subject: [PATCH 07/38] ARROW-1211: [C++] Enable builder classes to automatically use the default memory pool I added the `ARROW_NO_DEFAULT_MEMORY_POOL` define option to disable this in third party use. I also flipped the order of arguments to the builder constructors to be a bit more natural. I don't feel strongly about this, but it does make the code a bit nicer: ```c++ FixedSizeBinaryBuilder builder(type); FixedSizeBinaryBuilder builder(type, pool); ``` versus ```c++ FixedSizeBinaryBuilder builder(type); FixedSizeBinaryBuilder builder(pool, type); ``` Author: Wes McKinney Closes #923 from wesm/ARROW-1211 and squashes the following commits: e8a129d [Wes McKinney] Use CONFIGURE_OPTIONS as advised 7835e67 [Wes McKinney] Build c_glib in Travis CI without deprecated Arrow APIs d86a6a3 [Wes McKinney] Remove copy-paste error, unneeded using statement a712445 [Wes McKinney] Benchmark fixes 1488bb4 [Wes McKinney] Some MSVC fixes, cannot get DictionaryBuilder default ctor working there 33cd7d5 [Wes McKinney] Use macro for memory pool argument to avoid code duplication. Add macro expansion to Doxyfile 085ca86 [Wes McKinney] Add option builder constructors to use default memory pool without passing explicitly. Add ARROW_NO_DEFAULT_MEMORY_POOL define. Flip builder constructor order, deprecate old constructors. 
README documentation --- c_glib/configure.ac | 1 + ci/msvc-build.bat | 1 - ci/travis_before_script_c_glib.sh | 4 + cpp/README.md | 14 ++++ cpp/apidoc/Doxyfile | 2 +- cpp/src/arrow/array-decimal-test.cc | 2 +- cpp/src/arrow/array-test.cc | 48 +++++------ cpp/src/arrow/builder-benchmark.cc | 10 +-- cpp/src/arrow/builder.cc | 81 +++++++++++++------ cpp/src/arrow/builder.h | 79 ++++++++++++++---- cpp/src/arrow/ipc/ipc-json-test.cc | 2 +- cpp/src/arrow/ipc/ipc-read-write-benchmark.cc | 3 +- cpp/src/arrow/ipc/json-internal.cc | 4 +- cpp/src/arrow/ipc/test-common.h | 6 +- cpp/src/arrow/pretty_print-test.cc | 2 +- cpp/src/arrow/python/pandas_to_arrow.cc | 8 +- cpp/src/arrow/python/python-test.cc | 2 +- cpp/src/arrow/test-util.h | 9 +-- 18 files changed, 189 insertions(+), 89 deletions(-) diff --git a/c_glib/configure.ac b/c_glib/configure.ac index d4e828ba55c1b..375f76efcdd51 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -79,6 +79,7 @@ else ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" ARROW_CFLAGS="-I${ARROW_INCLUDE_DIR}" + ARROW_LIBS="-L${ARROW_LIB_DIR} -larrow" AC_SUBST(ARROW_LIB_DIR) diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat index 04fe2ab62cbd4..6ebd22fc3e354 100644 --- a/ci/msvc-build.bat +++ b/ci/msvc-build.bat @@ -104,7 +104,6 @@ cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DPARQUET_BOOST_USE_SHARED=OFF ^ - -DPARQUET_ZLIB_VENDORED=off ^ -DPARQUET_BUILD_TESTS=off .. || exit /B cmake --build . --target INSTALL --config %CONFIGURATION% || exit /B popd diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index bf2d385d79d4d..7ab8e2eaa03ee 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -74,6 +74,10 @@ CONFIGURE_OPTIONS="--prefix=$ARROW_C_GLIB_INSTALL" if [ $TRAVIS_OS_NAME != "osx" ]; then CONFIGURE_OPTIONS="$CONFIGURE_OPTIONS --enable-gtk-doc" fi + +CONFIGURE_OPTIONS="$CONFIGURE_OPTIONS CFLAGS=-DARROW_NO_DEPRECATED_API" +CONFIGURE_OPTIONS="$CONFIGURE_OPTIONS CXXFLAGS=-DARROW_NO_DEPRECATED_API" + ./configure $CONFIGURE_OPTIONS make -j4 diff --git a/cpp/README.md b/cpp/README.md index 2f98b085115f5..0228faf7349c5 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -126,6 +126,14 @@ This project follows [Google's C++ Style Guide][3] with minor exceptions. We do not encourage anonymous namespaces and we relax the line length restriction to 90 characters. +### Memory Pools + +We provide a default memory pool with `arrow::default_memory_pool()`. As a +matter of convenience, some of the array builder classes have constructors +which use the default pool without explicitly passing it. You can disable these +constructors in your application (so that you are accounting properly for all +memory allocations) by defining `ARROW_NO_DEFAULT_MEMORY_POOL`. + ### Error Handling and Exceptions For error handling, we use `arrow::Status` values instead of throwing C++ @@ -149,6 +157,12 @@ constructors, the circumstances where they would are somewhat esoteric, and it is likely that an application would have encountered other more serious problems prior to having `std::bad_alloc` thrown in a constructor. +### Deprecations and API Changes + +We use the compiler definition `ARROW_NO_DEPRECATED_API` to disable APIs that +have been deprecated. It is a good practice to compile third party applications +with this flag to proactively catch and account for API changes. 
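+
+As a minimal sketch of the two constructor styles described under Memory
+Pools above (hypothetical user code; `Int64Builder` and
+`arrow::default_memory_pool()` are the library APIs discussed in this
+document):
+
+```c++
+#include "arrow/builder.h"
+#include "arrow/memory_pool.h"
+
+int main() {
+  // Implicitly uses arrow::default_memory_pool(); this constructor is
+  // unavailable when ARROW_NO_DEFAULT_MEMORY_POOL is defined.
+  arrow::Int64Builder implicit_pool_builder;
+
+  // Passes a pool explicitly, so the application accounts for every
+  // allocation made through this builder.
+  arrow::MemoryPool* pool = arrow::default_memory_pool();
+  arrow::Int64Builder explicit_pool_builder(pool);
+
+  return 0;
+}
+```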
+ ## Continuous Integration Pull requests are run through travis-ci for continuous integration. You can avoid diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index f32ad5425da35..94156d55801f1 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -2084,7 +2084,7 @@ PREDEFINED = __attribute__(x)= \ # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = ARROW_MEMORY_POOL_ARG # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc index 436ce9cf7c312..e94ba48d60840 100644 --- a/cpp/src/arrow/array-decimal-test.cc +++ b/cpp/src/arrow/array-decimal-test.cc @@ -37,7 +37,7 @@ class DecimalTestBase { auto type = std::make_shared(precision, 4); int byte_width = type->byte_width(); auto pool = default_memory_pool(); - auto builder = std::make_shared(pool, type); + auto builder = std::make_shared(type, pool); size_t null_count = 0; size_t size = draw.size(); diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 57d2c8b8493a5..38aceb2d185bb 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -171,7 +171,7 @@ TEST_F(TestArray, TestIsNull) { TEST_F(TestArray, BuildLargeInMemoryArray) { const int64_t length = static_cast(std::numeric_limits::max()) + 1; - BooleanBuilder builder(default_memory_pool()); + BooleanBuilder builder; ASSERT_OK(builder.Reserve(length)); ASSERT_OK(builder.Advance(length)); @@ -754,9 +754,9 @@ TEST_F(TestStringArray, TestEmptyStringComparison) { } TEST_F(TestStringArray, CompareNullByteSlots) { - StringBuilder builder(default_memory_pool()); - StringBuilder builder2(default_memory_pool()); - StringBuilder builder3(default_memory_pool()); + StringBuilder builder; + StringBuilder builder2; + StringBuilder builder3; ASSERT_OK(builder.Append("foo")); ASSERT_OK(builder2.Append("foo")); @@ -795,7 +795,7 @@ TEST_F(TestStringArray, CompareNullByteSlots) { } TEST_F(TestStringArray, TestSliceGetString) { - StringBuilder builder(default_memory_pool()); + StringBuilder builder; ASSERT_OK(builder.Append("a")); ASSERT_OK(builder.Append("b")); @@ -958,7 +958,7 @@ TEST_F(TestBinaryArray, TestGetValue) { } TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { - BinaryBuilder builder(default_memory_pool(), arrow::binary()); + BinaryBuilder builder; string empty_string(""); for (int i = 0; i < 5; ++i) { @@ -1045,7 +1045,7 @@ void CheckSliceEquality() { using Traits = TypeTraits; using BuilderType = typename Traits::BuilderType; - BuilderType builder(default_memory_pool()); + BuilderType builder; vector strings = {"foo", "", "bar", "baz", "qux", ""}; vector is_null = {0, 1, 0, 1, 0, 0}; @@ -1102,7 +1102,7 @@ class TestFWBinaryArray : public ::testing::Test { void InitBuilder(int byte_width) { auto type = fixed_size_binary(byte_width); - builder_.reset(new FixedSizeBinaryBuilder(default_memory_pool(), type)); + builder_.reset(new FixedSizeBinaryBuilder(type, default_memory_pool())); } protected: @@ -1184,8 +1184,8 @@ TEST_F(TestFWBinaryArray, EqualsRangeEquals) { // Check that we don't compare data in null slots auto type = fixed_size_binary(4); - FixedSizeBinaryBuilder builder1(default_memory_pool(), type); - FixedSizeBinaryBuilder builder2(default_memory_pool(), type); + FixedSizeBinaryBuilder builder1(type); + FixedSizeBinaryBuilder builder2(type); 
ASSERT_OK(builder1.Append("foo1")); ASSERT_OK(builder1.AppendNull()); @@ -1209,7 +1209,7 @@ TEST_F(TestFWBinaryArray, EqualsRangeEquals) { TEST_F(TestFWBinaryArray, ZeroSize) { auto type = fixed_size_binary(0); - FixedSizeBinaryBuilder builder(default_memory_pool(), type); + FixedSizeBinaryBuilder builder(type); ASSERT_OK(builder.Append(nullptr)); ASSERT_OK(builder.Append(nullptr)); @@ -1233,7 +1233,7 @@ TEST_F(TestFWBinaryArray, ZeroSize) { TEST_F(TestFWBinaryArray, Slice) { auto type = fixed_size_binary(4); - FixedSizeBinaryBuilder builder(default_memory_pool(), type); + FixedSizeBinaryBuilder builder(type); vector strings = {"foo1", "foo2", "foo3", "foo4", "foo5"}; vector is_null = {0, 1, 0, 0, 0}; @@ -1519,14 +1519,14 @@ TYPED_TEST(TestDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - NumericBuilder dict_builder(default_memory_pool()); + NumericBuilder dict_builder; ASSERT_OK(dict_builder.Append(static_cast(1))); ASSERT_OK(dict_builder.Append(static_cast(2))); std::shared_ptr dict_array; ASSERT_OK(dict_builder.Finish(&dict_array)); auto dtype = std::make_shared(int8(), dict_array); - Int8Builder int_builder(default_memory_pool()); + Int8Builder int_builder; ASSERT_OK(int_builder.Append(0)); ASSERT_OK(int_builder.Append(1)); ASSERT_OK(int_builder.Append(0)); @@ -1538,8 +1538,8 @@ TYPED_TEST(TestDictionaryBuilder, Basic) { } TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { - NumericBuilder builder(default_memory_pool()); - // DictionaryBuilder builder(default_memory_pool()); + NumericBuilder builder; + // DictionaryBuilder builder; ASSERT_OK(builder.Append(static_cast(1))); ASSERT_OK(builder.Append(static_cast(2))); ASSERT_OK(builder.Append(static_cast(1))); @@ -1552,14 +1552,14 @@ TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { ASSERT_OK(dictionary_builder.Finish(&result)); // Build expected data - NumericBuilder dict_builder(default_memory_pool()); + NumericBuilder dict_builder; ASSERT_OK(dict_builder.Append(static_cast(1))); ASSERT_OK(dict_builder.Append(static_cast(2))); std::shared_ptr dict_array; ASSERT_OK(dict_builder.Finish(&dict_array)); auto dtype = std::make_shared(int8(), dict_array); - Int8Builder int_builder(default_memory_pool()); + Int8Builder int_builder; ASSERT_OK(int_builder.Append(0)); ASSERT_OK(int_builder.Append(1)); ASSERT_OK(int_builder.Append(0)); @@ -1577,8 +1577,8 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) { // Build the dictionary Array DictionaryBuilder builder(default_memory_pool()); // Build expected data - NumericBuilder dict_builder(default_memory_pool()); - Int16Builder int_builder(default_memory_pool()); + NumericBuilder dict_builder; + Int16Builder int_builder; // Fill with 1024 different values for (int64_t i = 0; i < 1024; i++) { @@ -1619,14 +1619,14 @@ TEST(TestStringDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - StringBuilder str_builder(default_memory_pool()); + StringBuilder str_builder; ASSERT_OK(str_builder.Append("test")); ASSERT_OK(str_builder.Append("test2")); std::shared_ptr str_array; ASSERT_OK(str_builder.Finish(&str_array)); auto dtype = std::make_shared(int8(), str_array); - Int8Builder int_builder(default_memory_pool()); + Int8Builder int_builder; ASSERT_OK(int_builder.Append(0)); ASSERT_OK(int_builder.Append(1)); ASSERT_OK(int_builder.Append(0)); @@ -1641,8 +1641,8 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) { // Build the dictionary Array StringDictionaryBuilder builder(default_memory_pool()); // Build expected data - 
StringBuilder str_builder(default_memory_pool()); - Int16Builder int_builder(default_memory_pool()); + StringBuilder str_builder; + Int16Builder int_builder; // Fill with 1024 different values for (int64_t i = 0; i < 1024; i++) { diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index 13d7b20591dad..7ac7fe3bed533 100644 --- a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -30,7 +30,7 @@ static void BM_BuildPrimitiveArrayNoNulls( // 2 MiB block std::vector data(256 * 1024, 100); while (state.KeepRunning()) { - Int64Builder builder(default_memory_pool()); + Int64Builder builder; for (int i = 0; i < kFinalSize; i++) { // Build up an array of 512 MiB in size ABORT_NOT_OK(builder.Append(data.data(), data.size(), nullptr)); @@ -66,7 +66,7 @@ static void BM_BuildAdaptiveIntNoNulls( data.push_back(i); } while (state.KeepRunning()) { - AdaptiveIntBuilder builder(default_memory_pool()); + AdaptiveIntBuilder builder; for (int64_t i = 0; i < size; i += chunk_size) { // Build up an array of 512 MiB in size ABORT_NOT_OK(builder.Append(data.data() + i, chunk_size, nullptr)); @@ -85,7 +85,7 @@ static void BM_BuildAdaptiveIntNoNullsScalarAppend( data.push_back(i); } while (state.KeepRunning()) { - AdaptiveIntBuilder builder(default_memory_pool()); + AdaptiveIntBuilder builder; for (int64_t i = 0; i < size; i++) { ABORT_NOT_OK(builder.Append(data[i])); } @@ -104,7 +104,7 @@ static void BM_BuildAdaptiveUIntNoNulls( data.push_back(i); } while (state.KeepRunning()) { - AdaptiveUIntBuilder builder(default_memory_pool()); + AdaptiveUIntBuilder builder; for (int64_t i = 0; i < size; i += chunk_size) { // Build up an array of 512 MiB in size ABORT_NOT_OK(builder.Append(data.data() + i, chunk_size, nullptr)); @@ -161,7 +161,7 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const std::string value = "1234567890"; while (state.KeepRunning()) { - BinaryBuilder builder(default_memory_pool()); + BinaryBuilder builder; for (int64_t i = 0; i < iterations; i++) { ABORT_NOT_OK(builder.Append(value)); } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 391204f566954..e3eda2401a02b 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -262,7 +262,7 @@ template class PrimitiveBuilder; template class PrimitiveBuilder; AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool) - : ArrayBuilder(pool, int64()), data_(nullptr), raw_data_(nullptr), int_size_(1) {} + : ArrayBuilder(int64(), pool), data_(nullptr), raw_data_(nullptr), int_size_(1) {} Status AdaptiveIntBuilderBase::Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); @@ -612,13 +612,18 @@ Status AdaptiveUIntBuilder::ExpandIntSize(uint8_t new_int_size) { } BooleanBuilder::BooleanBuilder(MemoryPool* pool) - : ArrayBuilder(pool, boolean()), data_(nullptr), raw_data_(nullptr) {} + : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} -BooleanBuilder::BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type) +BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) : BooleanBuilder(pool) { DCHECK_EQ(Type::BOOL, type->id()); } +#ifndef ARROW_NO_DEPRECATED_API +BooleanBuilder::BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type) + : BooleanBuilder(type, pool) {} +#endif + Status BooleanBuilder::Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); data_ = std::make_shared(pool_); @@ -693,18 +698,25 @@ Status BooleanBuilder::Append(const uint8_t* values, 
int64_t length, // DictionaryBuilder template -DictionaryBuilder::DictionaryBuilder(MemoryPool* pool, - const std::shared_ptr& type) - : ArrayBuilder(pool, type), +DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : ArrayBuilder(type, pool), hash_table_(new PoolBuffer(pool)), hash_slots_(nullptr), - dict_builder_(pool, type), + dict_builder_(type, pool), values_builder_(pool) { if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); } } +#ifndef ARROW_NO_DEPRECATED_API +template +DictionaryBuilder::DictionaryBuilder(MemoryPool* pool, + const std::shared_ptr& type) + : DictionaryBuilder(type, pool) {} +#endif + template Status DictionaryBuilder::Init(int64_t elements) { RETURN_NOT_OK(ArrayBuilder::Init(elements)); @@ -931,11 +943,17 @@ template class DictionaryBuilder; // ---------------------------------------------------------------------- // DecimalBuilder -DecimalBuilder::DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type) - : FixedSizeBinaryBuilder(pool, type), + +DecimalBuilder::DecimalBuilder(const std::shared_ptr& type, MemoryPool* pool) + : FixedSizeBinaryBuilder(type, pool), sign_bitmap_(nullptr), sign_bitmap_data_(nullptr) {} +#ifndef ARROW_NO_DEPRECATED_API +DecimalBuilder::DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type) + : DecimalBuilder(type, pool) {} +#endif + template ARROW_EXPORT Status DecimalBuilder::Append(const decimal::Decimal& val) { DCHECK_EQ(sign_bitmap_, nullptr) << "sign_bitmap_ is not null"; @@ -1014,9 +1032,9 @@ Status DecimalBuilder::Finish(std::shared_ptr* out) { ListBuilder::ListBuilder(MemoryPool* pool, std::unique_ptr value_builder, const std::shared_ptr& type) - : ArrayBuilder(pool, - type ? type : std::static_pointer_cast( - std::make_shared(value_builder->type()))), + : ArrayBuilder(type ? 
type : std::static_pointer_cast( + std::make_shared(value_builder->type())), + pool), offsets_builder_(pool), value_builder_(std::move(value_builder)) {} @@ -1090,10 +1108,15 @@ ArrayBuilder* ListBuilder::value_builder() const { // ---------------------------------------------------------------------- // String and binary +BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} + +#ifndef ARROW_NO_DEPRECATED_API BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr& type) - : ArrayBuilder(pool, type), offsets_builder_(pool), value_data_builder_(pool) {} + : BinaryBuilder(type, pool) {} +#endif -BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(pool, binary()) {} +BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} Status BinaryBuilder::Init(int64_t elements) { DCHECK_LT(elements, std::numeric_limits::max()); @@ -1173,7 +1196,7 @@ const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { return value_data_builder_.data() + offset; } -StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {} +StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} Status StringBuilder::Finish(std::shared_ptr* out) { std::shared_ptr data; @@ -1186,12 +1209,18 @@ Status StringBuilder::Finish(std::shared_ptr* out) { // ---------------------------------------------------------------------- // Fixed width binary -FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(MemoryPool* pool, - const std::shared_ptr& type) - : ArrayBuilder(pool, type), +FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : ArrayBuilder(type, pool), byte_width_(static_cast(*type).byte_width()), byte_builder_(pool) {} +#ifndef ARROW_NO_DEPRECATED_API +FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(MemoryPool* pool, + const std::shared_ptr& type) + : FixedSizeBinaryBuilder(type, pool) {} +#endif + Status FixedSizeBinaryBuilder::Append(const uint8_t* value) { RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(true); @@ -1236,12 +1265,18 @@ Status FixedSizeBinaryBuilder::Finish(std::shared_ptr* out) { // ---------------------------------------------------------------------- // Struct -StructBuilder::StructBuilder(MemoryPool* pool, const std::shared_ptr& type, +StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, std::vector>&& field_builders) - : ArrayBuilder(pool, type) { + : ArrayBuilder(type, pool) { field_builders_ = std::move(field_builders); } +#ifndef ARROW_NO_DEPRECATED_API +StructBuilder::StructBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::vector>&& field_builders) + : StructBuilder(type, pool, std::move(field_builders)) {} +#endif + Status StructBuilder::Finish(std::shared_ptr* out) { std::vector> fields(field_builders_.size()); for (size_t i = 0; i < field_builders_.size(); ++i) { @@ -1261,7 +1296,7 @@ Status StructBuilder::Finish(std::shared_ptr* out) { #define BUILDER_CASE(ENUM, BuilderType) \ case Type::ENUM: \ - out->reset(new BuilderType(pool, type)); \ + out->reset(new BuilderType(type, pool)); \ return Status::OK(); // Initially looked at doing this with vtables, but shared pointers makes it @@ -1309,7 +1344,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); values_builder.emplace_back(std::move(builder)); } - out->reset(new StructBuilder(pool, type, 
std::move(values_builder))); + out->reset(new StructBuilder(type, pool, std::move(values_builder))); return Status::OK(); } @@ -1320,7 +1355,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, #define DICTIONARY_BUILDER_CASE(ENUM, BuilderType) \ case Type::ENUM: \ - out->reset(new BuilderType(pool, type)); \ + out->reset(new BuilderType(type, pool)); \ return Status::OK(); Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 009fd7ae47d19..e441179ae7864 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -53,6 +53,12 @@ struct Decimal; static constexpr int64_t kMinBuilderCapacity = 1 << 5; +#ifdef ARROW_NO_DEFAULT_MEMORY_POOL +#define ARROW_MEMORY_POOL_ARG pool +#else +#define ARROW_MEMORY_POOL_ARG pool = default_memory_pool() +#endif + /// Base class for all data array builders. // /// This class provides a facilities for incrementally building the null bitmap @@ -60,9 +66,9 @@ static constexpr int64_t kMinBuilderCapacity = 1 << 5; /// the null count. class ARROW_EXPORT ArrayBuilder { public: - explicit ArrayBuilder(MemoryPool* pool, const std::shared_ptr& type) - : pool_(pool), - type_(type), + explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) + : type_(type), + pool_(pool), null_bitmap_(nullptr), null_count_(0), null_bitmap_data_(nullptr), @@ -117,9 +123,8 @@ class ARROW_EXPORT ArrayBuilder { std::shared_ptr type() const { return type_; } protected: - MemoryPool* pool_; - std::shared_ptr type_; + MemoryPool* pool_; // When null_bitmap are first appended to the builder, the null bitmap is allocated std::shared_ptr null_bitmap_; @@ -162,8 +167,13 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: using value_type = typename Type::c_type; + explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), data_(nullptr), raw_data_(nullptr) {} + +#ifndef ARROW_NO_DEPRECATED_API explicit PrimitiveBuilder(MemoryPool* pool, const std::shared_ptr& type) - : ArrayBuilder(pool, type), data_(nullptr), raw_data_(nullptr) {} + : PrimitiveBuilder(type, pool) {} +#endif using ArrayBuilder::Advance; @@ -210,8 +220,9 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { template explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : PrimitiveBuilder(pool, TypeTraits::type_singleton()) {} + typename std::enable_if::is_parameter_free, MemoryPool*>::type + ARROW_MEMORY_POOL_ARG) + : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} using PrimitiveBuilder::Append; using PrimitiveBuilder::Init; @@ -341,7 +352,7 @@ inline uint8_t ExpandedUIntSize(uint64_t val, uint8_t current_int_size) { class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { public: - explicit AdaptiveUIntBuilder(MemoryPool* pool); + explicit AdaptiveUIntBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); using ArrayBuilder::Advance; @@ -400,7 +411,7 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { public: - explicit AdaptiveIntBuilder(MemoryPool* pool); + explicit AdaptiveIntBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); using ArrayBuilder::Advance; @@ -459,8 +470,14 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: - explicit 
BooleanBuilder(MemoryPool* pool); + explicit BooleanBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); + + explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 explicit BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif using ArrayBuilder::Advance; @@ -574,8 +591,14 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { /// \brief Builder class for variable-length binary data class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { public: - explicit BinaryBuilder(MemoryPool* pool); - explicit BinaryBuilder(MemoryPool* pool, const std::shared_ptr& type); + explicit BinaryBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 + BinaryBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif + + BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); Status Append(const uint8_t* value, int32_t length); @@ -617,7 +640,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { class ARROW_EXPORT StringBuilder : public BinaryBuilder { public: using BinaryBuilder::BinaryBuilder; - explicit StringBuilder(MemoryPool* pool); + explicit StringBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG); using BinaryBuilder::Append; @@ -631,7 +654,13 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { public: +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 FixedSizeBinaryBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif + + FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* ARROW_MEMORY_POOL_ARG); Status Append(const uint8_t* value); Status Append(const uint8_t* data, int64_t length, @@ -653,7 +682,13 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { public: + explicit DecimalBuilder(const std::shared_ptr& type, + MemoryPool* ARROW_MEMORY_POOL_ARG); + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 explicit DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif template ARROW_EXPORT Status Append(const decimal::Decimal& val); @@ -679,8 +714,14 @@ class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { /// called to maintain data-structure consistency. 
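/// A minimal usage sketch with the new (type, pool) argument order; the type,
/// field name, and child builder below are illustrative only, not part of this
/// patch:
///
///   auto type = ::arrow::struct_({field("f0", int32())});
///   std::vector<std::unique_ptr<ArrayBuilder>> children;
///   children.emplace_back(new Int32Builder(default_memory_pool()));
///   StructBuilder builder(type, default_memory_pool(), std::move(children));
///   std::shared_ptr<Array> out;
///   // Check the returned Status in real code; the tests use ASSERT_OK.
///   Status st = builder.Finish(&out);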
class ARROW_EXPORT StructBuilder : public ArrayBuilder { public: +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 StructBuilder(MemoryPool* pool, const std::shared_ptr& type, std::vector>&& field_builders); +#endif + + StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector>&& field_builders); Status Finish(std::shared_ptr* out) override; @@ -759,12 +800,20 @@ template class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { public: using Scalar = typename internal::DictionaryScalar::type; + + ~DictionaryBuilder() {} + +#ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.6.0 explicit DictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type); +#endif + + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); template explicit DictionaryBuilder( typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder(pool, TypeTraits::type_singleton()) {} + : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} /// \brief Append a scalar value Status Append(const Scalar& value); diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 35264fa02c5ba..1d5a6997ae920 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -225,7 +225,7 @@ void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, static const int kBufferSize = 10; static uint8_t buffer[kBufferSize]; static uint32_t seed = 0; - StringBuilder string_builder(default_memory_pool()); + StringBuilder string_builder; for (int i = 0; i < num_rows; ++i) { if (!is_valid[i]) { ASSERT_OK(string_builder.AppendNull()); diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc index a88120a248d2d..a6da6377c0531 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc @@ -42,8 +42,7 @@ std::shared_ptr MakeRecordBatch(int64_t total_size, int64_t num_fie std::vector values; test::randint(length, 0, 100, &values); - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool, type); + typename TypeTraits::BuilderType builder(type, default_memory_pool()); for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { ABORT_NOT_OK(builder.Append(values[i])); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 175d75b7d1e97..49fb6ac7ce30f 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -977,7 +977,7 @@ class ArrayReader { std::is_base_of::value || std::is_base_of::value, Status>::type Visit(const T& type) { - typename TypeTraits::BuilderType builder(pool_, type_); + typename TypeTraits::BuilderType builder(type_, pool_); const auto& json_data = obj_->FindMember("DATA"); RETURN_NOT_ARRAY("DATA", json_data, *obj_); @@ -1046,7 +1046,7 @@ class ArrayReader { template typename std::enable_if::value, Status>::type Visit(const T& type) { - FixedSizeBinaryBuilder builder(pool_, type_); + FixedSizeBinaryBuilder builder(type_, pool_); const auto& json_data = obj_->FindMember("DATA"); RETURN_NOT_ARRAY("DATA", json_data, *obj_); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 76cc8430636f8..ed33e6e95b13a 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -99,7 +99,7 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool std::shared_ptr* out) { std::shared_ptr data; 
RETURN_NOT_OK(test::MakeRandomInt32PoolBuffer(length, pool, &data)); - Int32Builder builder(pool, int32()); + Int32Builder builder(int32(), pool); if (include_nulls) { std::shared_ptr valid_bytes; RETURN_NOT_OK(test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes)); @@ -653,8 +653,8 @@ Status MakeFWBinary(std::shared_ptr* out) { std::shared_ptr a1, a2; - FixedSizeBinaryBuilder b1(default_memory_pool(), f0->type()); - FixedSizeBinaryBuilder b2(default_memory_pool(), f1->type()); + FixedSizeBinaryBuilder b1(f0->type()); + FixedSizeBinaryBuilder b2(f1->type()); std::vector values1 = {"foo1", "foo2", "foo3", "foo4"}; AppendValues(is_valid, values1, &b1); diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 049f5a58a6841..a687a8fc0e703 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -89,7 +89,7 @@ TEST_F(TestPrettyPrint, FixedSizeBinaryType) { std::shared_ptr array; auto type = fixed_size_binary(3); - FixedSizeBinaryBuilder builder(default_memory_pool(), type); + FixedSizeBinaryBuilder builder(type); ASSERT_OK(builder.Append(values[0])); ASSERT_OK(builder.Append(values[1])); diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc index 2fbed1b8fdf08..590be223d3f07 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -17,6 +17,8 @@ // Functions for pandas conversion via NumPy +#define ARROW_NO_DEFAULT_MEMORY_POOL + #include "arrow/python/numpy_interop.h" #include "arrow/python/pandas_to_arrow.h" @@ -586,7 +588,7 @@ Status PandasConverter::ConvertDecimals() { type_ = std::make_shared(precision, scale); const int bit_width = std::dynamic_pointer_cast(type_)->bit_width(); - DecimalBuilder builder(pool_, type_); + DecimalBuilder builder(type_, pool_); RETURN_NOT_OK(builder.Resize(length_)); for (int64_t i = 0; i < length_; ++i) { @@ -619,7 +621,7 @@ Status PandasConverter::ConvertTimes() { PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); // datetime.time stores microsecond resolution - Time64Builder builder(pool_, ::arrow::time64(TimeUnit::MICRO)); + Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_); RETURN_NOT_OK(builder.Resize(length_)); PyObject* obj; @@ -751,7 +753,7 @@ Status PandasConverter::ConvertObjectFixedWidthBytes( // The output type at this point is inconclusive because there may be bytes // and unicode mixed in the object array - FixedSizeBinaryBuilder builder(pool_, type); + FixedSizeBinaryBuilder builder(type, pool_); RETURN_NOT_OK(builder.Resize(length_)); int64_t offset = 0; diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index b50699d1ae9d4..433ce9b37a80a 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -71,7 +71,7 @@ TEST(DecimalTest, TestPythonDecimalToString) { } TEST(PandasConversionTest, TestObjectBlockWriteFails) { - StringBuilder builder(default_memory_pool()); + StringBuilder builder; const char value[] = {'\xf1', '\0'}; for (int i = 0; i < 1000; ++i) { diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 1a3376cee6053..711d2b04025c1 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -221,8 +221,7 @@ template void ArrayFromVector(const std::shared_ptr& type, const std::vector& is_valid, const std::vector& values, std::shared_ptr* out) { - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool, type); + typename 
TypeTraits::BuilderType builder(type, default_memory_pool()); for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { ASSERT_OK(builder.Append(values[i])); @@ -236,8 +235,7 @@ void ArrayFromVector(const std::shared_ptr& type, template void ArrayFromVector(const std::vector& is_valid, const std::vector& values, std::shared_ptr* out) { - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool); + typename TypeTraits::BuilderType builder; for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { ASSERT_OK(builder.Append(values[i])); @@ -250,8 +248,7 @@ void ArrayFromVector(const std::vector& is_valid, const std::vector void ArrayFromVector(const std::vector& values, std::shared_ptr* out) { - MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool); + typename TypeTraits::BuilderType builder; for (size_t i = 0; i < values.size(); ++i) { ASSERT_OK(builder.Append(values[i])); } From 93b51a039e8901671e2fee4289e492eaadaa4def Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 2 Aug 2017 18:27:18 -0400 Subject: [PATCH 08/38] ARROW-1315: [GLib] Add missing status check for arrow::ArrayBuilder::Finish() Author: Kouhei Sutou Closes #933 from kou/glib-add-missing-status-check-for-array-builder-finish and squashes the following commits: ba98ff54 [Kouhei Sutou] [GLib] Update Go examples to follow Finish() API change b951b8f6 [Kouhei Sutou] [GLib] Add missing status check for arrow::ArrayBuilder::Finish() --- c_glib/arrow-glib/array-builder.cpp | 14 +++++-- c_glib/arrow-glib/array-builder.h | 3 +- c_glib/example/build.c | 8 +++- c_glib/example/go/write-batch.go | 60 ++++++++++++++++++++++++----- c_glib/example/go/write-stream.go | 60 ++++++++++++++++++++++++----- 5 files changed, 119 insertions(+), 26 deletions(-) diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 23bc842756e65..1ef4e77e8fc50 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -220,16 +220,22 @@ garrow_array_builder_new(const std::shared_ptr &type, /** * garrow_array_builder_finish: * @builder: A #GArrowArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): The built #GArrowArray. + * Returns: (transfer full): The built #GArrowArray on success, + * %NULL on error. 
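 *
 * A usage sketch mirroring the build.c example updated below in this patch
 * (the builder variable is assumed to already exist):
 * |[<!-- language="C" -->
 * GError *error = NULL;
 * GArrowArray *array =
 *   garrow_array_builder_finish(GARROW_ARRAY_BUILDER(builder), &error);
 * if (!array) {
 *   g_print("failed to finish: %s\n", error->message);
 *   g_error_free(error);
 * }
 * ]|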
*/ GArrowArray * -garrow_array_builder_finish(GArrowArrayBuilder *builder) +garrow_array_builder_finish(GArrowArrayBuilder *builder, GError **error) { auto arrow_builder = garrow_array_builder_get_raw(builder); std::shared_ptr arrow_array; - arrow_builder->Finish(&arrow_array); - return garrow_array_new_raw(&arrow_array); + auto status = arrow_builder->Finish(&arrow_array); + if (garrow_error_check(error, status, "[array-builder][finish]")) { + return garrow_array_new_raw(&arrow_array); + } else { + return NULL; + } } diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index 97cea63078b12..613a5bad494d1 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -65,7 +65,8 @@ struct _GArrowArrayBuilderClass GType garrow_array_builder_get_type (void) G_GNUC_CONST; -GArrowArray *garrow_array_builder_finish (GArrowArrayBuilder *builder); +GArrowArray *garrow_array_builder_finish (GArrowArrayBuilder *builder, + GError **error); #define GARROW_TYPE_BOOLEAN_ARRAY_BUILDER \ diff --git a/c_glib/example/build.c b/c_glib/example/build.c index 2722458acd5c4..8c6cf74d74815 100644 --- a/c_glib/example/build.c +++ b/c_glib/example/build.c @@ -47,7 +47,13 @@ main(int argc, char **argv) g_object_unref(builder); return EXIT_FAILURE; } - array = garrow_array_builder_finish(GARROW_ARRAY_BUILDER(builder)); + array = garrow_array_builder_finish(GARROW_ARRAY_BUILDER(builder), &error); + if (!array) { + g_print("failed to finish: %s\n", error->message); + g_error_free(error); + g_object_unref(builder); + return EXIT_FAILURE; + } g_object_unref(builder); } diff --git a/c_glib/example/go/write-batch.go b/c_glib/example/go/write-batch.go index cda09a9b4e8f7..9dbc3c00acc50 100644 --- a/c_glib/example/go/write-batch.go +++ b/c_glib/example/go/write-batch.go @@ -29,7 +29,11 @@ func BuildUInt8Array() *arrow.Array { for _, value := range []uint8{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt16Array() *arrow.Array { @@ -37,7 +41,11 @@ func BuildUInt16Array() *arrow.Array { for _, value := range []uint16{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt32Array() *arrow.Array { @@ -45,7 +53,11 @@ func BuildUInt32Array() *arrow.Array { for _, value := range []uint32{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt64Array() *arrow.Array { @@ -53,7 +65,11 @@ func BuildUInt64Array() *arrow.Array { for _, value := range []uint64{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt8Array() *arrow.Array { @@ -61,7 +77,11 @@ func BuildInt8Array() *arrow.Array { for _, value := range []int8{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt16Array() *arrow.Array { @@ -69,7 +89,11 @@ func BuildInt16Array() *arrow.Array { for _, value := range []int16{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := 
builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt32Array() *arrow.Array { @@ -77,7 +101,11 @@ func BuildInt32Array() *arrow.Array { for _, value := range []int32{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt64Array() *arrow.Array { @@ -85,7 +113,11 @@ func BuildInt64Array() *arrow.Array { for _, value := range []int64{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildFloatArray() *arrow.Array { @@ -93,7 +125,11 @@ func BuildFloatArray() *arrow.Array { for _, value := range []float32{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildDoubleArray() *arrow.Array { @@ -101,7 +137,11 @@ func BuildDoubleArray() *arrow.Array { for _, value := range []float64{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func main() { diff --git a/c_glib/example/go/write-stream.go b/c_glib/example/go/write-stream.go index 20cb03ef2e324..244741e8cfeb0 100644 --- a/c_glib/example/go/write-stream.go +++ b/c_glib/example/go/write-stream.go @@ -29,7 +29,11 @@ func BuildUInt8Array() *arrow.Array { for _, value := range []uint8{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt16Array() *arrow.Array { @@ -37,7 +41,11 @@ func BuildUInt16Array() *arrow.Array { for _, value := range []uint16{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt32Array() *arrow.Array { @@ -45,7 +53,11 @@ func BuildUInt32Array() *arrow.Array { for _, value := range []uint32{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildUInt64Array() *arrow.Array { @@ -53,7 +65,11 @@ func BuildUInt64Array() *arrow.Array { for _, value := range []uint64{1, 2, 4, 8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt8Array() *arrow.Array { @@ -61,7 +77,11 @@ func BuildInt8Array() *arrow.Array { for _, value := range []int8{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt16Array() *arrow.Array { @@ -69,7 +89,11 @@ func BuildInt16Array() *arrow.Array { for _, value := range []int16{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt32Array() *arrow.Array { @@ -77,7 +101,11 @@ func BuildInt32Array() *arrow.Array { for _, value := range []int32{1, -2, 4, -8} { 
builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildInt64Array() *arrow.Array { @@ -85,7 +113,11 @@ func BuildInt64Array() *arrow.Array { for _, value := range []int64{1, -2, 4, -8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildFloatArray() *arrow.Array { @@ -93,7 +125,11 @@ func BuildFloatArray() *arrow.Array { for _, value := range []float32{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func BuildDoubleArray() *arrow.Array { @@ -101,7 +137,11 @@ func BuildDoubleArray() *arrow.Array { for _, value := range []float64{1.1, -2.2, 4.4, -8.8} { builder.Append(value) } - return builder.Finish() + array, err := builder.Finish() + if err != nil { + log.Fatalf("Failed to build array: %v", err) + } + return array } func main() { From 21a0191579e1065212d2a435a51d9721dd33c176 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 2 Aug 2017 18:28:18 -0400 Subject: [PATCH 09/38] ARROW-1323: [GLib] Add garrow_boolean_array_get_values() Author: Kouhei Sutou Closes #934 from kou/glib-add-boolean-get-values and squashes the following commits: d6502ba2 [Kouhei Sutou] [GLib] Add garrow_boolean_array_get_values() --- c_glib/arrow-glib/array.cpp | 24 ++++++++++++++++++++++++ c_glib/arrow-glib/array.h | 2 ++ c_glib/test/test-boolean-array.rb | 11 +++++++++++ 3 files changed, 37 insertions(+) diff --git a/c_glib/arrow-glib/array.cpp b/c_glib/arrow-glib/array.cpp index 30e51fb309f97..a3c45a890321c 100644 --- a/c_glib/arrow-glib/array.cpp +++ b/c_glib/arrow-glib/array.cpp @@ -557,6 +557,30 @@ garrow_boolean_array_get_value(GArrowBooleanArray *array, return static_cast(arrow_array.get())->Value(i); } +/** + * garrow_boolean_array_get_values: + * @array: A #GArrowBooleanArray. + * @length: (out): The number of values. + * + * Returns: (array length=length): The raw boolean values. + * + * It should be freed with g_free() when no longer needed. 
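 *
 * A usage sketch (illustrative only; assumes an existing #GArrowBooleanArray
 * named array):
 * |[<!-- language="C" -->
 * gint64 length;
 * gboolean *values = garrow_boolean_array_get_values(array, &length);
 * for (gint64 i = 0; i < length; ++i) {
 *   g_print("%d\n", values[i]);
 * }
 * g_free(values);
 * ]|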
+ */ +gboolean * +garrow_boolean_array_get_values(GArrowBooleanArray *array, + gint64 *length) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_boolean_array = + std::static_pointer_cast(arrow_array); + *length = arrow_boolean_array->length(); + auto values = static_cast(g_new(gboolean, *length)); + for (gint64 i = 0; i < *length; ++i) { + values[i] = arrow_boolean_array->Value(i); + } + return values; +} + G_DEFINE_TYPE(GArrowInt8Array, \ garrow_int8_array, \ diff --git a/c_glib/arrow-glib/array.h b/c_glib/arrow-glib/array.h index 1b2ba9fc1cc39..10b2279be4bbf 100644 --- a/c_glib/arrow-glib/array.h +++ b/c_glib/arrow-glib/array.h @@ -221,6 +221,8 @@ GArrowBooleanArray *garrow_boolean_array_new(gint64 length, gboolean garrow_boolean_array_get_value (GArrowBooleanArray *array, gint64 i); +gboolean *garrow_boolean_array_get_values(GArrowBooleanArray *array, + gint64 *length); #define GARROW_TYPE_INT8_ARRAY \ diff --git a/c_glib/test/test-boolean-array.rb b/c_glib/test/test-boolean-array.rb index 43b83655638e3..622e4e90c482b 100644 --- a/c_glib/test/test-boolean-array.rb +++ b/c_glib/test/test-boolean-array.rb @@ -17,6 +17,7 @@ class TestBooleanArray < Test::Unit::TestCase include Helper::Buildable + include Helper::Omittable def test_new assert_equal(build_boolean_array([true, false, nil]), @@ -41,4 +42,14 @@ def test_value array = builder.finish assert_equal(true, array.get_value(0)) end + + def test_values + require_gi(3, 1, 9) + builder = Arrow::BooleanArrayBuilder.new + builder.append(true) + builder.append(false) + builder.append(true) + array = builder.finish + assert_equal([true, false, true], array.values) + end end From 84b7a0d176270911a9d911b3fd2522ead67ee9ee Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 2 Aug 2017 19:32:57 -0400 Subject: [PATCH 10/38] ARROW-1312: [C++] Make ARROW_JEMALLOC OFF by default until ARROW-1282 is resolved See ARROW-1282, ARROW-1311 for related discussions. Author: Wes McKinney Closes #935 from wesm/ARROW-1312 and squashes the following commits: 2a06763c [Wes McKinney] Make ARROW_JEMALLOC OFF by default until ARROW-1282 is resolved --- ci/travis_before_script_cpp.sh | 6 ------ cpp/CMakeLists.txt | 2 +- cpp/apidoc/Windows.md | 8 ++++++-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index d456d308c53e3..7418b7614ae34 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -70,12 +70,6 @@ if [ $only_library_mode == "yes" ]; then $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_TESTS=OFF \ -DARROW_BUILD_UTILITIES=OFF" -else - # Deactivate jemalloc on Linux builds. We check the jemalloc+Linux build - # also in the manylinux1 image. 
- CMAKE_LINUX_FLAGS="\ -$CMAKE_LINUX_FLAGS \ --DARROW_JEMALLOC=ON" fi # Use Ninja for faster builds when using toolchain diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eeff9124ad983..7d73d1ffff089 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -100,7 +100,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(ARROW_JEMALLOC "Build the Arrow jemalloc-based allocator" - ON) + OFF) option(ARROW_JEMALLOC_USE_SHARED "Rely on jemalloc shared libraries where relevant" diff --git a/cpp/apidoc/Windows.md b/cpp/apidoc/Windows.md index 30b7b8f3ce210..774482ea1c4f3 100644 --- a/cpp/apidoc/Windows.md +++ b/cpp/apidoc/Windows.md @@ -187,14 +187,18 @@ Command line to build Arrow in Debug might look as following: cd cpp mkdir build cd build -cmake -G "Visual Studio 14 2015 Win64" -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=OFF -DCMAKE_BUILD_TYPE=Debug -DBOOST_ROOT=C:/local/boost_1_63_0 -DBOOST_LIBRARYDIR=C:/local/boost_1_63_0/lib64-msvc-14.0 .. +cmake -G "Visual Studio 14 2015 Win64" ^ + -DARROW_BOOST_USE_SHARED=OFF ^ + -DCMAKE_BUILD_TYPE=Debug ^ + -DBOOST_ROOT=C:/local/boost_1_63_0 ^ + -DBOOST_LIBRARYDIR=C:/local/boost_1_63_0/lib64-msvc-14.0 ^ + .. cmake --build . --config Debug ``` To get the latest build instructions, you can reference [msvc-build.bat][5], which is used by automated Appveyor builds. - [1]: https://conda.io/miniconda.html [2]: https://conda-forge.github.io/ [3]: http://cmder.net/ From 1874a8b7ccbd5a531eb2a8af479ef154694efd9d Mon Sep 17 00:00:00 2001 From: siddharth Date: Thu, 3 Aug 2017 15:06:20 -0400 Subject: [PATCH 11/38] ARROW-1310: [JAVA] revert changes made in ARROW-886 @elahrvivaz , @StevenMPhillips Reverting the changes made for ARROW-886 -- https://github.com/apache/arrow/pull/591 (1) Don't explicitly reallocate the offsetVector in realloc() function of Variable Length Vectors. If we call setSafe() on variable length vector, it will internally invoke setSafe() on the corresponding offsetVector and the latter function can decide whether to reallocate the offsetVector or not. (2) Doing (1) will break the unit test added as part of PR 591 so we need to remove that as well. Author: siddharth Closes #937 from siddharthteotia/ARROW-1310 and squashes the following commits: c5a2707c [siddharth] ARROW-1310: revert changes made in ARROW-886 --- .../templates/VariableLengthVectors.java | 1 - .../arrow/vector/TestVectorReAlloc.java | 27 +------------------ 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/java/vector/src/main/codegen/templates/VariableLengthVectors.java b/java/vector/src/main/codegen/templates/VariableLengthVectors.java index 2ad7d20de2651..70c7209947460 100644 --- a/java/vector/src/main/codegen/templates/VariableLengthVectors.java +++ b/java/vector/src/main/codegen/templates/VariableLengthVectors.java @@ -352,7 +352,6 @@ public void reset() { } public void reAlloc() { - offsetVector.reAlloc(); final long newAllocationSize = allocationSizeInBytes*2L; if (newAllocationSize > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Unable to expand the buffer. 
Max allowed buffer size is reached.");
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
index da9cb00361c0b..a5d5527539322 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
@@ -72,31 +72,6 @@ public void testFixedType() {
     }
   }

-  @Test
-  public void testVariableLengthType() {
-    try (final VarCharVector vector = new VarCharVector("", allocator)) {
-      final VarCharVector.Mutator m = vector.getMutator();
-      // note: capacity ends up being - 1 due to offsets vector
-      vector.setInitialCapacity(511);
-      vector.allocateNew();
-
-      assertEquals(511, vector.getValueCapacity());
-
-      try {
-        m.set(512, "foo".getBytes(StandardCharsets.UTF_8));
-        Assert.fail("Expected out of bounds exception");
-      } catch (Exception e) {
-        // ok
-      }
-
-      vector.reAlloc();
-      assertEquals(1023, vector.getValueCapacity());
-
-      m.set(512, "foo".getBytes(StandardCharsets.UTF_8));
-      assertEquals("foo", new String(vector.getAccessor().get(512), StandardCharsets.UTF_8));
-    }
-  }
-
   @Test
   public void testNullableType() {
     try (final NullableVarCharVector vector = new NullableVarCharVector("", allocator)) {
@@ -114,7 +89,7 @@ public void testNullableType() {
       }

       vector.reAlloc();
-      assertEquals(1024, vector.getValueCapacity());
+      assertEquals(1023, vector.getValueCapacity());

       m.set(512, "foo".getBytes(StandardCharsets.UTF_8));
       assertEquals("foo", new String(vector.getAccessor().get(512), StandardCharsets.UTF_8));

From 37323242ec378134f3dc6991c5d9fab44017d4cd Mon Sep 17 00:00:00 2001
From: siddharth
Date: Thu, 3 Aug 2017 15:42:37 -0400
Subject: ARROW-1224: [Format] Clarify language around buffer padding and align…
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added some clarifications on memory layout w.r.t. padding and alignment.

Author: siddharth

Closes #932 from siddharthteotia/ARROW-1224 and squashes the following commits:

dd5f2214 [siddharth] ARROW-1224: [Format] Clarify language around buffer padding and alignment in IPC
---
 format/Layout.md | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/format/Layout.md b/format/Layout.md
index b62b1565a754b..334251103d732 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -62,7 +62,11 @@ Base requirements
   linearly in the nesting level
 * Capable of representing fully-materialized and decoded / decompressed
   [Parquet][5] data
-* All contiguous memory buffers are aligned at 64-byte boundaries and padded to a multiple of 64 bytes.
+* It is required to have all the contiguous memory buffers in an IPC payload
+  aligned at 8-byte boundaries. In other words, each buffer must start at
+  an aligned 8-byte offset.
+* The general recommendation is to align the buffers at 64-byte boundary, but
+  this is not absolutely necessary.
 * Any relative type can have null slots
 * Arrays are immutable once created. Implementations can provide APIs to mutate
   an array, but applying mutations will require a new array data structure to
@@ -108,21 +112,23 @@ via byte swapping.

 ## Alignment and Padding

-As noted above, all buffers are intended to be aligned in memory at 64 byte
-boundaries and padded to a length that is a multiple of 64 bytes. The alignment
-requirement follows best practices for optimized memory access:
+As noted above, all buffers must be aligned in memory at 8-byte boundaries and padded
+to a length that is a multiple of 8 bytes. The alignment requirement follows best
+practices for optimized memory access:

 * Elements in numeric arrays will be guaranteed to be retrieved via aligned access.
 * On some architectures alignment can help limit partially used cache lines.
 * 64 byte alignment is recommended by the [Intel performance guide][2] for
-data-structures over 64 bytes (which will be a common case for Arrow Arrays).
+  data-structures over 64 bytes (which will be a common case for Arrow Arrays).

-Requiring padding to a multiple of 64 bytes allows for using [SIMD][4] instructions
+Recommending padding to a multiple of 64 bytes allows for using [SIMD][4] instructions
 consistently in loops without additional conditional checks.
-This should allow for simpler and more efficient code.
+This should allow for simpler, efficient and CPU cache-friendly code.
 The specific padding length was chosen because it matches the largest known
-SIMD instruction registers available as of April 2016 (Intel AVX-512).
-Guaranteed padding can also allow certain compilers
+SIMD instruction registers available as of April 2016 (Intel AVX-512). In other
+words, we can load the entire 64-byte buffer into a 512-bit wide SIMD register
+and get data-level parallelism on all the columnar values packed into the 64-byte
+buffer. Guaranteed padding can also allow certain compilers
 to generate more optimized code directly (e.g. One can safely use Intel's
 `-qopt-assume-safe-padding`).

From f775af7ef00504924bad3a1d68b99dae75aeff6c Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Thu, 3 Aug 2017 17:26:07 -0400
Subject: [PATCH 13/38] ARROW-1312: [Python] Follow-up: do not use jemalloc in
 manylinux1 builds

Missed this usage in the original patch

Author: Wes McKinney

Closes #938 from wesm/ARROW-1312-followup and squashes the following commits:

008f2105 [Wes McKinney] Do not use jemalloc in manylinux1 builds
---
 python/manylinux1/build_arrow.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh
index 5a21e36e4d7d5..ab7416c3275b7 100755
--- a/python/manylinux1/build_arrow.sh
+++ b/python/manylinux1/build_arrow.sh
@@ -58,7 +58,7 @@ for PYTHON in ${PYTHON_VERSIONS}; do
     ARROW_BUILD_DIR=/arrow/cpp/build-PY${PYTHON}
     mkdir -p "${ARROW_BUILD_DIR}"
     pushd "${ARROW_BUILD_DIR}"
-    PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=ON -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON ..
+    PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=off -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON ..
     make -j5 install
     popd

From a388ddf8865dc7502779d3adbecaed0b42983ca6 Mon Sep 17 00:00:00 2001
From: Philipp Moritz
Date: Fri, 4 Aug 2017 11:27:15 -0400
Subject: [PATCH 14/38] ARROW-1330: [Plasma] Turn on plasma tests on manylinux1

With ARROW-1312 merged, these tests should pass now.
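Concretely (see the diff below), the manylinux1 test step drops the
--disable-plasma flag, so the invocation becomes simply:

    py.test --parquet /venv-test-${PYTHON}/lib/*/site-packages/pyarrow -v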
Author: Philipp Moritz Closes #939 from pcmoritz/reactivate-plasma-tests and squashes the following commits: 6ec77495 [Philipp Moritz] reactivate plasma tests in manylinux1 --- python/manylinux1/build_arrow.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index ab7416c3275b7..074bd0056a948 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -81,9 +81,7 @@ for PYTHON in ${PYTHON_VERSIONS}; do source /venv-test-${PYTHON}/bin/activate pip install repaired_wheels/*.whl - # ARROW-1264; for some reason the test case added causes a segfault inside - # the Docker container when writing and error message to stderr - py.test --parquet /venv-test-${PYTHON}/lib/*/site-packages/pyarrow -v -s --disable-plasma + py.test --parquet /venv-test-${PYTHON}/lib/*/site-packages/pyarrow -v deactivate mv repaired_wheels/*.whl /io/dist From aa5d417bbd32fcdf96462181791f6c044ab3215d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Aug 2017 11:36:08 -0400 Subject: [PATCH 15/38] ARROW-1326: [Python] Fix Sphinx Build in Travis CI, treat Sphinx warnings as errors Author: Wes McKinney Author: Wes McKinney Closes #936 from wesm/ARROW-1326 and squashes the following commits: 8af31d72 [Wes McKinney] Fix bash syntax 1d4dc699 [Wes McKinney] Only build Sphinx documentation on Linux because of case sensitivity issue b88adc95 [Wes McKinney] Travis OS X is case insensitive file system 7eb12b02 [Wes McKinney] Nudge build b33ced76 [Wes McKinney] Use sphinx-build directly, error on Sphinx warnings 11418fbb [Wes McKinney] Build Python docs inside docs directory to avoid import errors --- ci/travis_script_python.sh | 13 ++++++------- python/doc/source/api.rst | 11 ++++++++--- python/doc/source/plasma.rst | 2 +- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 4a50d2faaf551..66cd17d5ff212 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -120,14 +120,13 @@ python_version_tests() { PYARROW_PATH=$CONDA_PREFIX/lib/python$PYTHON_VERSION/site-packages/pyarrow python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet - pushd $ARROW_PYTHON_DIR - # Build documentation once - if [[ "$PYTHON_VERSION" == "3.6" ]] - then - conda install -y -q --file=doc/requirements.txt - python setup.py build_sphinx -s doc/source + if [ "$PYTHON_VERSION" == "3.6" ] && [ $TRAVIS_OS_NAME == "linux" ]; then + # Build documentation once + pushd $ARROW_PYTHON_DIR/doc + conda install -y -q --file=requirements.txt + sphinx-build -b html -d _build/doctrees -W source _build/html + popd fi - popd } # run tests for python 2.7 and 3.6 diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index fd1cb728d9828..1aaf89ce9a1f0 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -91,13 +91,14 @@ Scalar Value Types .. _api.array: -Array Types and Constructors ----------------------------- +.. currentmodule:: pyarrow.lib + +Array Types +----------- .. autosummary:: :toctree: generated/ - array Array BooleanArray DictionaryArray @@ -126,6 +127,8 @@ Array Types and Constructors .. _api.table: +.. currentmodule:: pyarrow + Tables and Record Batches ------------------------- @@ -214,6 +217,8 @@ Memory Pools .. _api.type_classes: +.. 
currentmodule:: pyarrow.lib + Type Classes ------------ diff --git a/python/doc/source/plasma.rst b/python/doc/source/plasma.rst index 98dd62f97e951..832d9960cb539 100644 --- a/python/doc/source/plasma.rst +++ b/python/doc/source/plasma.rst @@ -16,7 +16,7 @@ .. under the License. .. currentmodule:: pyarrow -.. _io: +.. _plasma: The Plasma In-Memory Object Store ================================= From 717bed0d0a6a038cca6d44b46f60833aed6b09fc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Aug 2017 14:25:53 -0400 Subject: [PATCH 16/38] ARROW-1328: [Python] Set correct Arrow type when coercing to milliseconds and passing explicit type cc @fjetter I'm planning to deprecate the `timestamps_to_ms` argument but it needs a patch in parquet-cpp first (so that nanoseconds/microseconds can be casted on the fly to milliseconds), so let's handle this fix first Author: Wes McKinney Closes #941 from wesm/ARROW-1328 and squashes the following commits: 4632dc36 [Wes McKinney] Set correct Arrow type when coercing to milliseconds and passing explicit type (which may not be milliseconds) --- python/pyarrow/array.pxi | 26 +------------- python/pyarrow/pandas_compat.py | 24 +++++++++++++ python/pyarrow/table.pxi | 19 ++++++---- python/pyarrow/tests/test_convert_pandas.py | 39 +++++++++++++-------- 4 files changed, 62 insertions(+), 46 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index cbd036c08431f..4e0c21c79e1c0 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -16,30 +16,6 @@ # under the License. -cdef maybe_coerce_datetime64(values, dtype, DataType type, - timestamps_to_ms=False): - - from pyarrow.compat import DatetimeTZDtype - - if values.dtype.type != np.datetime64: - return values, type - - coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]' - - if coerce_ms: - values = values.astype('datetime64[ms]') - - if isinstance(dtype, DatetimeTZDtype): - tz = dtype.tz - unit = 'ms' if coerce_ms else dtype.unit - type = timestamp(unit, tz) - elif type is None: - # Trust the NumPy dtype - type = from_numpy_dtype(values.dtype) - - return values, type - - def array(object sequence, DataType type=None, MemoryPool memory_pool=None, size=None): """ @@ -205,7 +181,7 @@ cdef class Array: else: out = chunked_out.get().chunk(0) else: - values, type = maybe_coerce_datetime64( + values, type = pdcompat.maybe_coerce_datetime64( values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) if type is None: diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 62547a42f7359..2881588208eb1 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -17,6 +17,7 @@ import re import json +import numpy as np import pandas as pd import six @@ -241,6 +242,29 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): return names, arrays, metadata +def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False): + from pyarrow.compat import DatetimeTZDtype + + if values.dtype.type != np.datetime64: + return values, type_ + + coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]' + + if coerce_ms: + values = values.astype('datetime64[ms]') + type_ = pa.timestamp('ms') + + if isinstance(dtype, DatetimeTZDtype): + tz = dtype.tz + unit = 'ms' if coerce_ms else dtype.unit + type_ = pa.timestamp(unit, tz) + elif type_ is None: + # Trust the NumPy dtype + type_ = pa.from_numpy_dtype(values.dtype) + + return values, type_ + + def table_to_blockmanager(table, nthreads=1): import 
pandas.core.internals as _int from pyarrow.compat import DatetimeTZDtype diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 997b28579f847..e33c9ba2f20e9 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -134,6 +134,16 @@ cdef class Column: self.sp_column = column self.column = column.get() + def __repr__(self): + from pyarrow.compat import StringIO + result = StringIO() + result.write(object.__repr__(self)) + data = self.data + for i in range(len(data)): + result.write('\nchunk {0}: {1}'.format(i, repr(data.chunk(0)))) + + return result.getvalue() + @staticmethod def from_array(object field_or_name, Array arr): cdef Field boxed_field @@ -661,13 +671,8 @@ cdef class Table: return result @classmethod - def from_pandas( - cls, - df, - bint timestamps_to_ms=False, - Schema schema=None, - bint preserve_index=True - ): + def from_pandas(cls, df, bint timestamps_to_ms=False, + Schema schema=None, bint preserve_index=True): """ Convert pandas.DataFrame to an Arrow Table diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index f6ea1636a3d62..2a51d3283203f 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -18,7 +18,7 @@ from collections import OrderedDict -import datetime +from datetime import datetime, date, time import unittest import decimal import json @@ -351,6 +351,17 @@ def test_timestamps_notimezone_no_nulls(self): expected_schema=schema, ) + def test_timestamps_to_ms_explicit_schema(self): + # ARROW-1328 + df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]}) + pa_type = pa.from_numpy_dtype(df['datetime'].dtype) + + arr = pa.Array.from_pandas(df['datetime'], type=pa_type, + timestamps_to_ms=True) + + tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'), + arr.to_pandas()) + def test_timestamps_notimezone_nulls(self): df = pd.DataFrame({ 'datetime64': np.array([ @@ -409,10 +420,10 @@ def test_timestamps_with_timezone(self): def test_date_infer(self): df = pd.DataFrame({ - 'date': [datetime.date(2000, 1, 1), + 'date': [date(2000, 1, 1), None, - datetime.date(1970, 1, 1), - datetime.date(2040, 2, 26)]}) + date(1970, 1, 1), + date(2040, 2, 26)]}) table = pa.Table.from_pandas(df, preserve_index=False) field = pa.field('date', pa.date32()) schema = pa.schema([field]) @@ -424,10 +435,10 @@ def test_date_infer(self): def test_date_objects_typed(self): arr = np.array([ - datetime.date(2017, 4, 3), + date(2017, 4, 3), None, - datetime.date(2017, 4, 4), - datetime.date(2017, 4, 5)], dtype=object) + date(2017, 4, 4), + date(2017, 4, 5)], dtype=object) arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32') arr_i8 = arr_i4.astype('int64') * 86400000 @@ -470,7 +481,7 @@ def test_dates_from_integers(self): a1 = pa.Array.from_pandas(arr, type=t1) a2 = pa.Array.from_pandas(arr2, type=t2) - expected = datetime.date(2017, 4, 3) + expected = date(2017, 4, 3) assert a1[0].as_py() == expected assert a2[0].as_py() == expected @@ -669,8 +680,8 @@ def test_decimal_128_to_pandas(self): tm.assert_frame_equal(df, expected) def test_pytime_from_pandas(self): - pytimes = [datetime.time(1, 2, 3, 1356), - datetime.time(4, 5, 6, 1356)] + pytimes = [time(1, 2, 3, 1356), + time(4, 5, 6, 1356)] # microseconds t1 = pa.time64('us') @@ -706,9 +717,9 @@ def test_pytime_from_pandas(self): assert a4[0].as_py() == pytimes[0].replace(microsecond=0) def test_arrow_time_to_pandas(self): - pytimes = [datetime.time(1, 2, 3, 1356), - datetime.time(4, 5, 6, 1356), - 
datetime.time(0, 0, 0)]
+        pytimes = [time(1, 2, 3, 1356),
+                   time(4, 5, 6, 1356),
+                   time(0, 0, 0)]

         expected = np.array(pytimes[:2] + [None])
         expected_ms = np.array([x.replace(microsecond=1000)
@@ -902,7 +913,7 @@ def _pytime_from_micros(val):
     val //= 60
     minutes = val % 60
     hours = val // 60
-    return datetime.time(hours, minutes, seconds, microseconds)
+    return time(hours, minutes, seconds, microseconds)


 def _pytime_to_micros(pytime):

From 3bc7d4604c68930cbaac0262e0c9281adf31f8c0 Mon Sep 17 00:00:00 2001
From: Li Jin
Date: Fri, 4 Aug 2017 17:34:46 -0400
Subject: ARROW-1296: [Java] Fix allocationSizeInBytes in FixedValueVectors.res…
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix allocationSizeInBytes in FixedValueVectors.reset

Author: Li Jin

Closes #915 from icexelloss/vector-reset-ARROW-1296 and squashes the following commits:

f9c8607f [Li Jin] Add VectorReset test; Add package private method getAllocationSize to FixedValueVectors
d0117a83 [Li Jin] ARROW-1296: [Java] Fix allocationSizeInBytes in FixedValueVectors.reset()
---
 .../codegen/templates/FixedValueVectors.java |  6 +-
 .../apache/arrow/vector/TestVectorReset.java | 55 +++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java

diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java
index 5d92cd232efb3..61164ab6c9b3d 100644
--- a/java/vector/src/main/codegen/templates/FixedValueVectors.java
+++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java
@@ -111,6 +111,10 @@ public Mutator getMutator(){
     return mutator;
   }

+  int getAllocationSize() {
+    return allocationSizeInBytes;
+  }
+
   @Override
   public void setInitialCapacity(final int valueCount) {
     final long size = 1L * valueCount * ${type.width};
@@ -162,7 +166,7 @@ public void allocateNew(final int valueCount) {

   @Override
   public void reset() {
-    allocationSizeInBytes = INITIAL_VALUE_ALLOCATION;
+    allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width};
     allocationMonitor = 0;
     zeroVector();
     super.reset();
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java
new file mode 100644
index 0000000000000..d53f69489d4da
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestVectorReset { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testFixedTypeReset() { + try (final UInt4Vector vector = new UInt4Vector("", allocator)) { + final UInt4Vector.Mutator m = vector.getMutator(); + vector.allocateNew(); + final int sizeBefore = vector.getAllocationSize(); + vector.reAlloc(); + vector.reset(); + final int sizeAfter = vector.getAllocationSize(); + assertEquals(sizeBefore, sizeAfter); + } + } +} From 25439e7fb2d1047d8af5d77b36b6ffbfe4721beb Mon Sep 17 00:00:00 2001 From: siddharth Date: Fri, 4 Aug 2017 17:40:40 -0400 Subject: [PATCH 18/38] ARROW-1300: [JAVA] Fix Tests for ListVector @StevenMPhillips Fixed the following: (1) TestListVector.java doesn't include tests for nested lists where the underlying dataVector for a listVector is also a listVector. (2) The copy test in TestListVector.java only checks the bit vector contents and doesn't verify the actual contents of list vector Author: siddharth Closes #925 from siddharthteotia/ARROW-1300 and squashes the following commits: 584c79ef [siddharth] ARROW-1300: Fix tests for ListVector ff842533 [siddharth] ARROW-1300: Fix Tests for ListVector 9978199d [siddharth] ARROW-1300: Fix tests for ListVector 777e0ded [siddharth] ARROW-1300: Fix Tests for ListVector --- .../apache/arrow/vector/TestListVector.java | 151 +++++++++++++++++- 1 file changed, 143 insertions(+), 8 deletions(-) diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index a1762c466ce0b..eb30fdd46bf60 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -38,8 +38,10 @@ import org.junit.Before; import org.junit.Test; +import java.util.ArrayList; import java.util.List; + public class TestListVector { private BufferAllocator allocator; @@ -92,6 +94,26 @@ public void testCopyFrom() throws Exception { Assert.assertFalse("should be null", reader.isSet()); reader.setPosition(2); Assert.assertTrue("shouldn't be null", reader.isSet()); + + /* check the exact contents of vector */ + final ListVector.Accessor accessor = outVector.getAccessor(); + + /* index 0 */ + Object result = accessor.getObject(0); + ArrayList resultSet = (ArrayList)result; + assertEquals(3, resultSet.size()); + assertEquals(new Long(1), (Long)resultSet.get(0)); + assertEquals(new Long(2), (Long)resultSet.get(1)); + assertEquals(new Long(3), (Long)resultSet.get(2)); + + /* index 1 */ + result = accessor.getObject(1); + assertNull(result); + + /* index 2 */ + result = accessor.getObject(2); + resultSet = (ArrayList)result; + assertEquals(0, resultSet.size()); } } @@ -127,7 +149,7 @@ public void testSetLastSetUsage() throws Exception { dataVector.getMutator().setSafe(2, 1, 12); offsetVector.getMutator().setSafe(index + 1, 3); - index += 1; + index += 1; /* write [13, 14] to the list vector at index 1 */ bitVector.getMutator().setSafe(index, 1); @@ -193,7 +215,7 @@ public void testSetLastSetUsage() throws Exception { final 
UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); final ValueVector.Accessor valueAccessor = dataVector.getAccessor(); - index = 0; + index = 0; offset = offsetAccessor.get(index); assertEquals(Integer.toString(0), Integer.toString(offset)); @@ -300,7 +322,7 @@ public void testSplitAndTransfer() throws Exception { final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); final ValueVector.Accessor valueAccessor = dataVector.getAccessor(); - int index = 0; + int index = 0; int offset = 0; Object actual = null; @@ -386,10 +408,7 @@ public void testSplitAndTransfer() throws Exception { TransferPair transferPair = listVector.makeTransferPair(toVector); - int[][] transferLengths = { {0, 2}, - {3, 1}, - {4, 1} - }; + int[][] transferLengths = {{0, 2}, {3, 1}, {4, 1}}; for (final int[] transferLength : transferLengths) { int start = transferLength[0]; @@ -423,7 +442,7 @@ public void testSplitAndTransfer() throws Exception { for(int j = 0; j < dataLength1; j++) { assertEquals("Different data at indexes: " + offset1 + " and " + offset2, - valueAccessor.getObject(offset1), valueAccessor1.getObject(offset2)); + valueAccessor.getObject(offset1), valueAccessor1.getObject(offset2)); offset1++; offset2++; @@ -433,4 +452,120 @@ public void testSplitAndTransfer() throws Exception { } } } + + @Test + public void testNestedListVector() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + + UnionListWriter listWriter = listVector.getWriter(); + + /* allocate memory */ + listWriter.allocate(); + + /* the dataVector that backs a listVector will also be a + * listVector for this test. + */ + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().bigInt().writeBigInt(150); + listWriter.list().bigInt().writeBigInt(175); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(10); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(2, listVector.getMutator().getLastSet()); + + listVector.getMutator().setValueCount(2); + + final ListVector.Accessor accessor = listVector.getAccessor(); + assertEquals(2, accessor.getValueCount()); + + /* get listVector value at index 0 -- the value itself is a listvector */ + Object result = accessor.getObject(0); + ArrayList> resultSet = (ArrayList>)result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), 
list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + assertEquals(new Long(150), list.get(2)); + assertEquals(new Long(175), list.get(3)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = accessor.getObject(1); + resultSet = (ArrayList>)result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of third inner list */ + + list = resultSet.get(0); + assertEquals(new Long(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(accessor.isNull(0)); + assertFalse(accessor.isNull(1)); + + /* check underlying offsetVector */ + UInt4Vector offsetVector = listVector.getOffsetVector(); + final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); + + /* listVector has 2 lists at index 0 and 3 lists at index 1 */ + assertEquals(0, offsetAccessor.get(0)); + assertEquals(2, offsetAccessor.get(1)); + assertEquals(5, offsetAccessor.get(2)); + } + } } From 3200e914d78773bf4a59c3c0a1e1e7164d77fa64 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Aug 2017 10:44:50 -0400 Subject: [PATCH 19/38] ARROW-1327: [Python] Always release GIL before calling check_status in Cython This should prevent deadlock in some multithreaded or subinterpreter contexts. 
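The fix follows one pattern throughout: finish any work that touches Python objects (tobytes, attribute reads) while still holding the GIL, then drop it only around the blocking C++ call. A minimal sketch mirroring the _parquet.pyx change below:

    c_where = tobytes(where)      # Python-object work stays under the GIL
    with nogil:
        check_status(FileOutputStream.Open(c_where, &filestream))

C++ entry points that may call back into Python (ConvertPySequence, NdarrayToTensor, TensorToNdarray) now take PyAcquireGIL themselves, so they stay safe to call from a nogil block.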
We can be more mindful of this in the future Author: Wes McKinney Closes #945 from wesm/ARROW-1327 and squashes the following commits: d690c5b3 [Wes McKinney] Fix some GIL acquisitions 870e5222 [Wes McKinney] Always release GIL before calling check_status in Cython --- cpp/src/arrow/python/builtin_convert.cc | 7 ++++- cpp/src/arrow/python/numpy_convert.cc | 4 +++ python/pyarrow/_parquet.pyx | 18 ++++++++---- python/pyarrow/array.pxi | 37 ++++++++++++++++--------- python/pyarrow/feather.pxi | 3 +- python/pyarrow/io-hdfs.pxi | 12 +++++--- python/pyarrow/io.pxi | 6 ++-- python/pyarrow/table.pxi | 5 ++-- 8 files changed, 63 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 6eaa37fb8ca93..d3bf76dbed2cc 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -660,6 +660,7 @@ Status AppendPySequence(PyObject* obj, int64_t size, } Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out) { + PyAcquireGIL lock; std::shared_ptr type; int64_t size; RETURN_NOT_OK(InferArrowTypeAndSize(obj, &size, &type)); @@ -668,6 +669,7 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, const std::shared_ptr& type, int64_t size) { + PyAcquireGIL lock; // Handle NA / NullType case if (type->id() == Type::NA) { out->reset(new NullArray(size)); @@ -684,7 +686,10 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out, const std::shared_ptr& type) { int64_t size; - RETURN_NOT_OK(InferArrowSize(obj, &size)); + { + PyAcquireGIL lock; + RETURN_NOT_OK(InferArrowSize(obj, &size)); + } return ConvertPySequence(obj, pool, out, type, size); } diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 95d63b8fecb5b..61192f313d29d 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -202,6 +202,8 @@ Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { #undef TO_ARROW_TYPE_CASE Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr* out) { + PyAcquireGIL lock; + if (!PyArray_Check(ao)) { return Status::TypeError("Did not pass ndarray object"); } @@ -234,6 +236,8 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr* } Status TensorToNdarray(const Tensor& tensor, PyObject* base, PyObject** out) { + PyAcquireGIL lock; + int type_num; RETURN_NOT_OK(GetNumPyType(*tensor.type(), &type_num)); PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num); diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 919e82c109451..65ca468772710 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -558,9 +558,14 @@ cdef class ParquetWriter: cdef: shared_ptr[FileOutputStream] filestream shared_ptr[WriterProperties] properties + c_string c_where + CMemoryPool* pool if isinstance(where, six.string_types): - check_status(FileOutputStream.Open(tobytes(where), &filestream)) + c_where = tobytes(where) + with nogil: + check_status(FileOutputStream.Open(c_where, + &filestream)) self.sink = filestream else: get_writer(where, &self.sink) @@ -580,11 +585,12 @@ cdef class ParquetWriter: self._set_int96_support(&arrow_properties_builder) arrow_properties = arrow_properties_builder.build() - check_status( - 
FileWriter.Open(deref(schema.schema), - maybe_unbox_memory_pool(memory_pool), - self.sink, properties, arrow_properties, - &self.writer)) + pool = maybe_unbox_memory_pool(memory_pool) + with nogil: + check_status( + FileWriter.Open(deref(schema.schema), pool, + self.sink, properties, arrow_properties, + &self.writer)) cdef void _set_int96_support(self, ArrowWriterProperties.Builder* props): if self.use_deprecated_int96_timestamps: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 4e0c21c79e1c0..c0c7ac6da09ed 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -44,23 +44,28 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None, cdef: shared_ptr[CArray] sp_array CMemoryPool* pool + int64_t c_size pool = maybe_unbox_memory_pool(memory_pool) if type is None: - check_status(ConvertPySequence(sequence, pool, &sp_array)) + with nogil: + check_status(ConvertPySequence(sequence, pool, &sp_array)) else: if size is None: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type + with nogil: + check_status( + ConvertPySequence( + sequence, pool, &sp_array, type.sp_type + ) ) - ) else: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type, size + c_size = size + with nogil: + check_status( + ConvertPySequence( + sequence, pool, &sp_array, type.sp_type, c_size + ) ) - ) return pyarrow_wrap_array(sp_array) @@ -91,7 +96,8 @@ cdef class Array: self.type = pyarrow_wrap_data_type(self.sp_array.get().type()) def _debug_print(self): - check_status(DebugPrint(deref(self.ap), 0)) + with nogil: + check_status(DebugPrint(deref(self.ap), 0)) @staticmethod def from_pandas(obj, mask=None, DataType type=None, @@ -185,7 +191,9 @@ cdef class Array: values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) if type is None: - check_status(NumPyDtypeToArrow(values.dtype, &c_type)) + dtype = values.dtype + with nogil: + check_status(NumPyDtypeToArrow(dtype, &c_type)) else: c_type = type.sp_type @@ -319,7 +327,9 @@ strides: {2}""".format(self.type, self.shape, self.strides) @staticmethod def from_numpy(obj): cdef shared_ptr[CTensor] ctensor - check_status(NdarrayToTensor(c_default_memory_pool(), obj, &ctensor)) + with nogil: + check_status(NdarrayToTensor(c_default_memory_pool(), obj, + &ctensor)) return pyarrow_wrap_tensor(ctensor) def to_numpy(self): @@ -329,7 +339,8 @@ strides: {2}""".format(self.type, self.shape, self.strides) cdef: PyObject* out - check_status(TensorToNdarray(deref(self.tp), self, &out)) + with nogil: + check_status(TensorToNdarray(deref(self.tp), self, &out)) return PyObject_to_object(out) def equals(self, Tensor other): diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 6faf2f9c69c7b..6a1fa30ba63e7 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -44,7 +44,8 @@ cdef class FeatherWriter: if self.num_rows < 0: self.num_rows = 0 self.writer.get().SetNumRows(self.num_rows) - check_status(self.writer.get().Finalize()) + with nogil: + check_status(self.writer.get().Finalize()) def write_array(self, object name, object col, object mask=None): cdef Array arr diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi index 27e9948b3a4ea..e6285e465d2be 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/io-hdfs.pxi @@ -29,7 +29,8 @@ except ImportError: def have_libhdfs(): try: - check_status(HaveLibHdfs()) + with nogil: + check_status(HaveLibHdfs()) return True except: return False @@ -37,7 +38,8 @@ def have_libhdfs(): def 
have_libhdfs3(): try: - check_status(HaveLibHdfs3()) + with nogil: + check_status(HaveLibHdfs3()) return True except: return False @@ -73,10 +75,12 @@ cdef class HadoopFileSystem: conf.kerb_ticket = tobytes(kerb_ticket) if driver == 'libhdfs': - check_status(HaveLibHdfs()) + with nogil: + check_status(HaveLibHdfs()) conf.driver = HdfsDriver_LIBHDFS else: - check_status(HaveLibHdfs3()) + with nogil: + check_status(HaveLibHdfs3()) conf.driver = HdfsDriver_LIBHDFS3 with nogil: diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index cccb1736be6fa..eda8de730281d 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -447,7 +447,8 @@ cdef class MemoryMappedFile(NativeFile): else: raise ValueError('Invalid file mode: {0}'.format(mode)) - check_status(CMemoryMappedFile.Open(c_path, c_mode, &handle)) + with nogil: + check_status(CMemoryMappedFile.Open(c_path, c_mode, &handle)) self.wr_file = handle self.rd_file = handle @@ -642,7 +643,8 @@ cdef class BufferOutputStream(NativeFile): self.is_open = True def get_result(self): - check_status(self.wr_file.get().Close()) + with nogil: + check_status(self.wr_file.get().Close()) self.is_open = False return pyarrow_wrap_buffer( self.buffer) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index e33c9ba2f20e9..b9b08998b3372 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -168,8 +168,9 @@ cdef class Column: cdef: PyObject* out - check_status(libarrow.ConvertColumnToPandas(self.sp_column, - self, &out)) + with nogil: + check_status(libarrow.ConvertColumnToPandas(self.sp_column, + self, &out)) return pd.Series(wrap_array_output(out), name=self.name) From 619472ec0a6256fc6ead491fb12881b97f3acec3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Aug 2017 10:45:40 -0400 Subject: [PATCH 20/38] ARROW-1225: [Python] Decode bytes to utf8 unicode if possible when passing explicit utf8 type to pyarrow.array Author: Wes McKinney Closes #946 from wesm/ARROW-1225 and squashes the following commits: aa737b11 [Wes McKinney] Clearer error message 2f439285 [Wes McKinney] Decode bytes to utf8 unicode if possible when passing explicit utf8 type in pyarrow.array --- cpp/src/arrow/python/builtin_convert.cc | 22 ++++++++++++++------ python/pyarrow/tests/test_convert_builtin.py | 13 ++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index d3bf76dbed2cc..218fe2925fd86 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -530,14 +530,24 @@ class UTF8Converter : public TypedConverterVisitor const char* bytes; Py_ssize_t length; - if (item.obj() == Py_None) { + PyObject* obj = item.obj(); + if (obj == Py_None) { return typed_builder_->AppendNull(); - } else if (!PyUnicode_Check(item.obj())) { - return Status::Invalid("Non-unicode value encountered"); + } else if (PyBytes_Check(obj)) { + tmp.reset(PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), + PyBytes_GET_SIZE(obj))); + RETURN_IF_PYERROR(); + bytes_obj = obj; + } else if (!PyUnicode_Check(obj)) { + PyObjectStringify stringified(obj); + std::stringstream ss; + ss << "Non bytes/unicode value encountered: " << stringified.bytes; + return Status::Invalid(ss.str()); + } else { + tmp.reset(PyUnicode_AsUTF8String(obj)); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); } - tmp.reset(PyUnicode_AsUTF8String(item.obj())); - RETURN_IF_PYERROR(); - bytes_obj = tmp.obj(); // No error checking length = 
PyBytes_GET_SIZE(bytes_obj); diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index ec261595585c2..d18ed9506bbbb 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -16,6 +16,8 @@ # specific language governing permissions and limitations # under the License. +import pytest + from pyarrow.compat import unittest, u # noqa import pyarrow as pa @@ -140,6 +142,17 @@ def test_bytes(self): assert arr.type == pa.binary() assert arr.to_pylist() == [b'foo', u1, None] + def test_utf8_to_unicode(self): + # ARROW-1225 + data = [b'foo', None, b'bar'] + arr = pa.array(data, type=pa.string()) + assert arr[0].as_py() == u'foo' + + # test a non-utf8 unicode string + val = (u'maรฑana').encode('utf-16-le') + with pytest.raises(pa.ArrowException): + pa.array([val], type=pa.string()) + def test_fixed_size_bytes(self): data = [b'foof', None, b'barb', b'2346'] arr = pa.array(data, type=pa.binary(4)) From c0acb86540ec01b62fb4d787ac50438b0ecf39e3 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Mon, 7 Aug 2017 10:49:03 -0400 Subject: [PATCH 21/38] ARROW-1333: [Plasma] Example code for using Plasma to sort a DataFrame This provides the example code used to generate the speedup mentioned in #940. It is also an end-to-end example using the Plasma object store. Author: Robert Nishihara Author: Philipp Moritz Closes #942 from robertnishihara/sortdf and squashes the following commits: a94ce907 [Philipp Moritz] add licenses 221b3362 [Robert Nishihara] Small cleanup. 0cba9904 [Robert Nishihara] Small naming changes. 94e85ee8 [Philipp Moritz] move files 07a53b36 [Philipp Moritz] add experimental setup b39caacc [Philipp Moritz] remove original multimerge function and document multimerge2d 4df2accf [Philipp Moritz] batch calls to object store 0f7c44c6 [Philipp Moritz] small cleanups 8d997786 [Robert Nishihara] Fix indentation in cython. f516ab7b [Robert Nishihara] Small fixes. 1f74047d [Robert Nishihara] Speed up Cython 6ef8a776 [Robert Nishihara] remove some printing e830587a [Robert Nishihara] Fix bug. a92996a2 [Robert Nishihara] Update plasma executable path. fc6707f4 [Robert Nishihara] Fix sorting example (fortran versus C arrays). 5d3c74aa [Robert Nishihara] Almost working sorting dataframe. --- python/examples/plasma/sorting/multimerge.pyx | 102 +++++++++ python/examples/plasma/sorting/setup.py | 27 +++ python/examples/plasma/sorting/sort_df.py | 204 ++++++++++++++++++ 3 files changed, 333 insertions(+) create mode 100644 python/examples/plasma/sorting/multimerge.pyx create mode 100644 python/examples/plasma/sorting/setup.py create mode 100644 python/examples/plasma/sorting/sort_df.py diff --git a/python/examples/plasma/sorting/multimerge.pyx b/python/examples/plasma/sorting/multimerge.pyx new file mode 100644 index 0000000000000..6dd5aaef95cb9 --- /dev/null +++ b/python/examples/plasma/sorting/multimerge.pyx @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from libc.stdint cimport uintptr_t +from libcpp.vector cimport vector +from libcpp.pair cimport pair + +cimport numpy as np +import numpy as np + + +cdef extern from "" namespace "std" nogil: + cdef cppclass priority_queue[T]: + priority_queue() except + + priority_queue(priority_queue&) except + + bint empty() + void pop() + void push(T&) + size_t size() + T& top() + + +def multimerge2d(*arrays): + """Merge a list of sorted 2d arrays into a sorted 2d array. + + This assumes C style ordering for both input and output arrays. For + each input array we have array[i,0] <= array[i+1,0] and for the output + array the same will hold. + + Ideally this code would be simpler and also support both C style + and Fortran style ordering. + """ + cdef int num_arrays = len(arrays) + assert num_arrays > 0 + + cdef int num_cols = arrays[0].shape[1] + + for i in range(num_arrays): + assert arrays[i].ndim == 2 + assert arrays[i].dtype == np.float64 + assert arrays[i].shape[1] == num_cols + assert not np.isfortran(arrays[i]) + + cdef vector[double*] data + + # The indices vector keeps track of the index of the next row to process in + # each array. + cdef vector[int] indices = num_arrays * [0] + + # The sizes vector stores the total number of elements that each array has. + cdef vector[int] sizes + + cdef priority_queue[pair[double, int]] queue + cdef pair[double, int] top + cdef int num_rows = sum([array.shape[0] for array in arrays]) + cdef np.ndarray[np.float64_t, ndim=2] result = np.zeros( + (num_rows, num_cols), dtype=np.float64) + cdef double* result_ptr = np.PyArray_DATA(result) + for i in range(num_arrays): + if arrays[i].size > 0: + sizes.push_back(arrays[i].size) + data.push_back( np.PyArray_DATA(arrays[i])) + queue.push(pair[double, int](-data[i][0], i)) + + cdef int curr_idx = 0 + cdef int j + cdef int col = 0 + + for j in range(num_rows): + top = queue.top() + for col in range(num_cols): + result_ptr[curr_idx + col] = ( + data[top.second][indices[top.second] + col]) + + indices[top.second] += num_cols + curr_idx += num_cols + + queue.pop() + if indices[top.second] < sizes[top.second]: + queue.push( + pair[double, int](-data[top.second][indices[top.second]], + top.second)) + + return result diff --git a/python/examples/plasma/sorting/setup.py b/python/examples/plasma/sorting/setup.py new file mode 100644 index 0000000000000..a578085a8e4cc --- /dev/null +++ b/python/examples/plasma/sorting/setup.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +from distutils.core import setup +from Cython.Build import cythonize + +setup( + name="multimerge", + extra_compile_args=["-O3", "-mtune=native", "-march=native"], + ext_modules=cythonize("multimerge.pyx"), + include_dirs=[np.get_include()], +) diff --git a/python/examples/plasma/sorting/sort_df.py b/python/examples/plasma/sorting/sort_df.py new file mode 100644 index 0000000000000..03cfd13c6d76f --- /dev/null +++ b/python/examples/plasma/sorting/sort_df.py @@ -0,0 +1,204 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from multiprocessing import Pool +import numpy as np +import os +import pandas as pd +import pyarrow as pa +import pyarrow.plasma as plasma +import subprocess +import time + +import multimerge + +# To run this example, you will first need to run "python setup.py install" in +# this directory to build the Cython module. +# +# You will only see speedups if you run this code on more data, this is just a +# small example that can run on a laptop. 
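+#
+# The script starts its own plasma_store subprocess, connects one plasma
+# client per worker process, and compares a parallel sample sort against
+# pandas' serial sort_values on the same DataFrame.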
+# +# The values we used to get a speedup (on a m4.10xlarge instance on EC2) were +# object_store_size = 84 * 10 ** 9 +# num_cores = 20 +# num_rows = 10 ** 9 +# num_cols = 1 + +client = None +object_store_size = 2 * 10 ** 9 # 2 GB +num_cores = 8 +num_rows = 200000 +num_cols = 2 +column_names = [str(i) for i in range(num_cols)] +column_to_sort = column_names[0] + + +# Connect to clients +def connect(): + global client + client = plasma.connect('/tmp/store', '', 0) + np.random.seed(int(time.time() * 10e7) % 10000000) + + +def put_df(df): + record_batch = pa.RecordBatch.from_pandas(df) + + # Get size of record batch and schema + mock_sink = pa.MockOutputStream() + stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) + stream_writer.write_batch(record_batch) + data_size = mock_sink.size() + + # Generate an ID and allocate a buffer in the object store for the + # serialized DataFrame + object_id = plasma.ObjectID(np.random.bytes(20)) + buf = client.create(object_id, data_size) + + # Write the serialized DataFrame to the object store + sink = pa.FixedSizeBufferOutputStream(buf) + stream_writer = pa.RecordBatchStreamWriter(sink, record_batch.schema) + stream_writer.write_batch(record_batch) + + # Seal the object + client.seal(object_id) + + return object_id + + +def get_dfs(object_ids): + """Retrieve dataframes from the object store given their object IDs.""" + buffers = client.get(object_ids) + return [pa.RecordBatchStreamReader(buf).read_next_batch().to_pandas() + for buf in buffers] + + +def local_sort(object_id): + """Sort a partition of a dataframe.""" + # Get the dataframe from the object store. + [df] = get_dfs([object_id]) + # Sort the dataframe. + sorted_df = df.sort_values(by=column_to_sort) + # Get evenly spaced values from the dataframe. + indices = np.linspace(0, len(df) - 1, num=num_cores, dtype=np.int64) + # Put the sorted dataframe in the object store and return the corresponding + # object ID as well as the sampled values. + return put_df(sorted_df), sorted_df.as_matrix().take(indices) + + +def local_partitions(object_id_and_pivots): + """Take a sorted partition of a dataframe and split it into more pieces.""" + object_id, pivots = object_id_and_pivots + [df] = get_dfs([object_id]) + split_at = df[column_to_sort].searchsorted(pivots) + split_at = [0] + list(split_at) + [len(df)] + # Partition the sorted dataframe and put each partition into the object + # store. + return [put_df(df[i:j]) for i, j in zip(split_at[:-1], split_at[1:])] + + +def merge(object_ids): + """Merge a number of sorted dataframes into a single sorted dataframe.""" + dfs = get_dfs(object_ids) + + # In order to use our multimerge code, we have to convert the arrays from + # the Fortran format to the C format. + arrays = [np.ascontiguousarray(df.as_matrix()) for df in dfs] + for a in arrays: + assert a.dtype == np.float64 + assert not np.isfortran(a) + + # Filter out empty arrays. + arrays = [a for a in arrays if a.shape[0] > 0] + + if len(arrays) == 0: + return None + + resulting_array = multimerge.multimerge2d(*arrays) + merged_df2 = pd.DataFrame(resulting_array, columns=column_names) + + return put_df(merged_df2) + + +if __name__ == '__main__': + # Start the plasma store. + p = subprocess.Popen(['plasma_store', + '-s', '/tmp/store', + '-m', str(object_store_size)]) + + # Connect to the plasma store. + connect() + + # Connect the processes in the pool. + pool = Pool(initializer=connect, initargs=(), processes=num_cores) + + # Create a DataFrame from a numpy array. 
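+    # It is then split evenly across the worker processes, and each
+    # partition is written to the object store before timing starts.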
+ df = pd.DataFrame(np.random.randn(num_rows, num_cols), + columns=column_names) + + partition_ids = [put_df(partition) for partition + in np.split(df, num_cores)] + + # Begin timing the parallel sort example. + parallel_sort_start = time.time() + + # Sort each partition and subsample them. The subsampled values will be + # used to create buckets. + sorted_df_ids, pivot_groups = list(zip(*pool.map(local_sort, + partition_ids))) + + # Choose the pivots. + all_pivots = np.concatenate(pivot_groups) + indices = np.linspace(0, len(all_pivots) - 1, num=num_cores, + dtype=np.int64) + pivots = np.take(np.sort(all_pivots), indices) + + # Break all of the sorted partitions into even smaller partitions. Group + # the object IDs from each bucket together. + results = list(zip(*pool.map(local_partitions, + zip(sorted_df_ids, + len(sorted_df_ids) * [pivots])))) + + # Merge each of the buckets and store the results in the object store. + object_ids = pool.map(merge, results) + + resulting_ids = [object_id for object_id in object_ids + if object_id is not None] + + # Stop timing the paralle sort example. + parallel_sort_end = time.time() + + print('Parallel sort took {} seconds.' + .format(parallel_sort_end - parallel_sort_start)) + + serial_sort_start = time.time() + + original_sorted_df = df.sort_values(by=column_to_sort) + + serial_sort_end = time.time() + + # Check that we sorted the DataFrame properly. + + sorted_dfs = get_dfs(resulting_ids) + sorted_df = pd.concat(sorted_dfs) + + print('Serial sort took {} seconds.' + .format(serial_sort_end - serial_sort_start)) + + assert np.allclose(sorted_df.values, original_sorted_df.values) + + # Kill the object store. + p.kill() From f9d983391695a22ca400da9135ddfbeb041859ca Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 7 Aug 2017 15:04:29 -0400 Subject: [PATCH 22/38] ARROW-1283: [JAVA] Allow VectorSchemaRoot to close more than once This change allows the VectorSchemaRoot/FieldVectors to close more than once, even if the allocator has already been closed. Before, an empty ArrowBuf was created during closing which required the allocator to not be closed, however this empty buffer is not needed once the FieldVector has been closed. 
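Concretely, the following sequence (the shape of the regression test added below) now succeeds instead of tripping assertOpen() on the already-closed allocator; a sketch:

    vector.close();
    vectorAllocator.close();
    vector.close();            // second close is now a safe no-op
    vectorAllocator.close();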
Author: Bryan Cutler Closes #898 from BryanCutler/java-vectorSchemaRoot-close-twice-ARROW-1283 and squashes the following commits: 2921d848 [Bryan Cutler] removed resolved comment 3b3718b8 [Bryan Cutler] Merge remote-tracking branch 'upstream/master' into java-vectorSchemaRoot-close-twice-ARROW-1283 e992fc79 [Bryan Cutler] BaseDataValueVector.close will now just clear, which releases previous and assigns an empty buffer 8ecfce2a [Bryan Cutler] Merge remote-tracking branch 'upstream/master' into java-vectorSchemaRoot-close-twice-ARROW-1283 ca38d3d8 [Bryan Cutler] use clear to release data, ensure that an empty buffer is never allocated again after closing 10ff7c36 [Bryan Cutler] Added regression test --- .../org/apache/arrow/memory/BaseAllocator.java | 3 --- .../apache/arrow/vector/BaseDataValueVector.java | 16 +--------------- .../org/apache/arrow/vector/TestValueVector.java | 10 ++++++++++ 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java index be0ba77f5b2fa..b38cf679e2a12 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -171,7 +171,6 @@ public String getName() { @Override public ArrowBuf getEmpty() { - assertOpen(); return empty; } @@ -236,8 +235,6 @@ public ArrowBuf buffer(final int initialRequestSize) { } private ArrowBuf createEmpty() { - assertOpen(); - return new ArrowBuf(new AtomicInteger(), null, AllocationManager.EMPTY, null, null, 0, 0, true); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java index 0fea719da8815..88e02495bfc99 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java @@ -59,8 +59,6 @@ public static List unload(List vectors) { return result; } - // TODO: Nullable vectors extend BaseDataValueVector but do not use the data field - // We should fix the inheritance tree protected ArrowBuf data; public BaseDataValueVector(String name, BufferAllocator allocator) { @@ -70,23 +68,11 @@ public BaseDataValueVector(String name, BufferAllocator allocator) { @Override public void clear() { - if (data != null) { - data.release(); - } + data.release(); data = allocator.getEmpty(); super.clear(); } - @Override - public void close() { - clear(); - if (data != null) { - data.release(); - data = null; - } - super.close(); - } - @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { return getTransferPair(ref, allocator); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 0f41c2dd790e1..159d534a31072 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -736,6 +736,16 @@ public void testFillEmptiesUsage() { } } + @Test + public void testMultipleClose() { + BufferAllocator vectorAllocator = allocator.newChildAllocator("vector_allocator", 0, Long.MAX_VALUE); + NullableIntVector vector = newVector(NullableIntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, vectorAllocator); + vector.close(); + vectorAllocator.close(); + vector.close(); 
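+      // the second round of closes must be no-ops, not assertion failures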
+ vectorAllocator.close(); + } + public static void setBytes(int index, byte[] bytes, NullableVarCharVector vector) { final int currentOffset = vector.values.offsetVector.getAccessor().get(index); From 7a4026a2b43b164ac28c96e1e796c02ca9efe496 Mon Sep 17 00:00:00 2001 From: Li Jin Date: Mon, 7 Aug 2017 15:05:48 -0400 Subject: [PATCH 23/38] ARROW-1304: [Java] Fix Indentation, WhitespaceAround and EmptyLineSeparator checkstyle warnings in Java Author: Li Jin Closes #930 from icexelloss/checkstyle-fix-ARROW-1304 and squashes the following commits: 0ba9e03f [Li Jin] ARROW-1304: [Java] Reformat java code with google_checks.xml to improve checkstyle --- .../main/java/io/netty/buffer/ArrowBuf.java | 4 +- .../org/apache/arrow/memory/Accountant.java | 4 +- .../arrow/memory/AllocationManager.java | 1 + .../apache/arrow/memory/BaseAllocator.java | 3 +- .../apache/arrow/memory/BufferAllocator.java | 6 +- .../arrow/memory/util/HistoricalLog.java | 7 +- .../apache/arrow/memory/TestAccountant.java | 1 + .../arrow/memory/TestBaseAllocator.java | 91 +- .../apache/arrow/memory/TestEndianess.java | 2 + .../org/apache/arrow/tools/EchoServer.java | 4 +- .../org/apache/arrow/tools/FileRoundtrip.java | 1 + .../org/apache/arrow/tools/Integration.java | 4 +- .../apache/arrow/tools/EchoServerTest.java | 20 +- .../apache/arrow/vector/AddOrGetResult.java | 1 + .../apache/arrow/vector/AllocationHelper.java | 18 +- .../arrow/vector/BaseDataValueVector.java | 6 +- .../apache/arrow/vector/BaseValueVector.java | 17 +- .../org/apache/arrow/vector/BitVector.java | 45 +- .../org/apache/arrow/vector/BufferBacked.java | 1 + .../org/apache/arrow/vector/FieldVector.java | 7 +- .../apache/arrow/vector/FixedWidthVector.java | 11 +- .../apache/arrow/vector/NullableVector.java | 1 + .../NullableVectorDefinitionSetter.java | 1 + .../arrow/vector/SchemaChangeCallBack.java | 1 + .../org/apache/arrow/vector/ValueVector.java | 29 +- .../arrow/vector/VariableWidthVector.java | 8 +- .../org/apache/arrow/vector/VectorLoader.java | 7 +- .../apache/arrow/vector/VectorSchemaRoot.java | 5 +- .../apache/arrow/vector/VectorTrimmer.java | 1 + .../apache/arrow/vector/VectorUnloader.java | 1 + .../org/apache/arrow/vector/ZeroVector.java | 34 +- .../complex/AbstractContainerVector.java | 6 +- .../vector/complex/AbstractMapVector.java | 54 +- .../complex/BaseRepeatedValueVector.java | 19 +- .../vector/complex/EmptyValuePopulator.java | 7 +- .../vector/complex/FixedSizeListVector.java | 7 +- .../arrow/vector/complex/ListVector.java | 15 +- .../arrow/vector/complex/MapVector.java | 21 +- .../vector/complex/NullableMapVector.java | 17 +- .../arrow/vector/complex/Positionable.java | 2 + .../vector/complex/PromotableVector.java | 1 + .../complex/RepeatedFixedWidthVectorLike.java | 5 +- .../vector/complex/RepeatedValueVector.java | 7 +- .../RepeatedVariableWidthVectorLike.java | 8 +- .../arrow/vector/complex/StateTool.java | 1 + .../vector/complex/VectorWithOrdinal.java | 1 + .../complex/impl/AbstractBaseReader.java | 9 +- .../complex/impl/AbstractBaseWriter.java | 1 + .../complex/impl/ComplexWriterImpl.java | 145 +- .../complex/impl/MapOrListWriterImpl.java | 1 + .../complex/impl/NullableMapReaderImpl.java | 11 +- .../impl/NullableMapWriterFactory.java | 31 +- .../vector/complex/impl/PromotableWriter.java | 7 +- .../complex/impl/SingleListReaderImpl.java | 3 +- .../complex/impl/SingleMapReaderImpl.java | 25 +- .../impl/UnionFixedSizeListReader.java | 1 + .../vector/complex/impl/UnionListReader.java | 1 + .../vector/complex/reader/FieldReader.java | 2 +- 
.../vector/complex/writer/FieldWriter.java | 2 + .../arrow/vector/dictionary/Dictionary.java | 9 +- .../vector/dictionary/DictionaryEncoder.java | 11 +- .../vector/dictionary/DictionaryProvider.java | 3 +- .../apache/arrow/vector/file/ArrowBlock.java | 19 +- .../arrow/vector/file/ArrowFileReader.java | 3 +- .../arrow/vector/file/ArrowFileWriter.java | 3 +- .../apache/arrow/vector/file/ArrowFooter.java | 30 +- .../apache/arrow/vector/file/ArrowMagic.java | 1 + .../apache/arrow/vector/file/ArrowReader.java | 22 +- .../apache/arrow/vector/file/ArrowWriter.java | 17 +- .../file/InvalidArrowFileException.java | 1 + .../apache/arrow/vector/file/ReadChannel.java | 17 +- .../vector/file/SeekableReadChannel.java | 23 +- .../arrow/vector/file/WriteChannel.java | 11 +- .../vector/file/json/JsonFileReader.java | 175 +-- .../vector/file/json/JsonFileWriter.java | 32 +- .../arrow/vector/holders/ComplexHolder.java | 1 + .../vector/holders/RepeatedListHolder.java | 3 +- .../vector/holders/RepeatedMapHolder.java | 3 +- .../arrow/vector/holders/UnionHolder.java | 1 + .../arrow/vector/holders/ValueHolder.java | 1 + .../arrow/vector/schema/ArrowBuffer.java | 19 +- .../vector/schema/ArrowDictionaryBatch.java | 80 +- .../arrow/vector/schema/ArrowFieldNode.java | 3 +- .../arrow/vector/schema/ArrowMessage.java | 14 +- .../arrow/vector/schema/ArrowRecordBatch.java | 17 +- .../arrow/vector/schema/ArrowVectorType.java | 6 +- .../arrow/vector/schema/FBSerializable.java | 1 + .../arrow/vector/schema/FBSerializables.java | 1 + .../arrow/vector/schema/TypeLayout.java | 87 +- .../arrow/vector/schema/VectorLayout.java | 35 +- .../vector/stream/ArrowStreamReader.java | 57 +- .../vector/stream/ArrowStreamWriter.java | 32 +- .../vector/stream/MessageSerializer.java | 101 +- .../apache/arrow/vector/types/DateUnit.java | 2 + .../vector/types/FloatingPointPrecision.java | 2 + .../arrow/vector/types/IntervalUnit.java | 2 + .../apache/arrow/vector/types/TimeUnit.java | 2 + .../org/apache/arrow/vector/types/Types.java | 85 +- .../apache/arrow/vector/types/UnionMode.java | 2 + .../vector/types/pojo/DictionaryEncoding.java | 1 + .../apache/arrow/vector/types/pojo/Field.java | 13 +- .../arrow/vector/types/pojo/FieldType.java | 4 + .../arrow/vector/types/pojo/Schema.java | 7 +- .../ByteArrayReadableSeekableByteChannel.java | 3 +- .../vector/util/ByteFunctionHelpers.java | 31 +- .../apache/arrow/vector/util/CallBack.java | 1 + .../apache/arrow/vector/util/DateUtility.java | 1270 ++++++++--------- .../arrow/vector/util/DecimalUtility.java | 59 +- .../arrow/vector/util/DictionaryUtility.java | 6 +- .../vector/util/JsonStringArrayList.java | 3 +- .../arrow/vector/util/JsonStringHashMap.java | 7 +- .../arrow/vector/util/MapWithOrdinal.java | 9 +- .../util/OversizedAllocationException.java | 2 +- .../util/SchemaChangeRuntimeException.java | 1 + .../org/apache/arrow/vector/util/Text.java | 258 ++-- .../arrow/vector/util/TransferPair.java | 4 + .../apache/arrow/vector/util/Validator.java | 7 +- .../java/org/joda/time/LocalDateTimes.java | 2 +- .../arrow/vector/DirtyRootAllocator.java | 1 + .../apache/arrow/vector/TestBitVector.java | 36 +- .../vector/TestBufferOwnershipTransfer.java | 11 +- .../arrow/vector/TestDecimalVector.java | 2 + .../arrow/vector/TestDictionaryVector.java | 7 +- .../arrow/vector/TestFixedSizeListVector.java | 1 + .../apache/arrow/vector/TestListVector.java | 83 +- ...TestOversizedAllocationForValueVector.java | 2 +- .../apache/arrow/vector/TestUnionVector.java | 37 +- .../org/apache/arrow/vector/TestUtils.java | 3 +- 
.../apache/arrow/vector/TestValueVector.java | 13 +- .../arrow/vector/TestVectorReAlloc.java | 1 + .../arrow/vector/TestVectorUnloadLoad.java | 26 +- .../complex/impl/TestPromotableWriter.java | 3 +- .../complex/writer/TestComplexWriter.java | 17 +- .../arrow/vector/file/BaseFileTest.java | 79 +- .../arrow/vector/file/TestArrowFile.java | 51 +- .../arrow/vector/file/TestArrowFooter.java | 3 +- .../vector/file/TestArrowReaderWriter.java | 9 +- .../arrow/vector/file/TestArrowStream.java | 3 +- .../vector/file/TestArrowStreamPipe.java | 21 +- .../arrow/vector/file/json/TestJSONFile.java | 11 +- .../apache/arrow/vector/pojo/TestConvert.java | 9 +- .../vector/stream/MessageSerializerTest.java | 7 +- .../arrow/vector/types/pojo/TestSchema.java | 27 +- .../arrow/vector/util/TestValidator.java | 57 +- 144 files changed, 2102 insertions(+), 1792 deletions(-) diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index 09886a6ffe316..6d17430c66966 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -406,12 +406,12 @@ public ByteBuffer internalNioBuffer(int index, int length) { @Override public ByteBuffer[] nioBuffers() { - return new ByteBuffer[]{nioBuffer()}; + return new ByteBuffer[] {nioBuffer()}; } @Override public ByteBuffer[] nioBuffers(int index, int length) { - return new ByteBuffer[]{nioBuffer(index, length)}; + return new ByteBuffer[] {nioBuffer(index, length)}; } @Override diff --git a/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java index 89329b2766357..5bd6b9fe37956 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java @@ -249,9 +249,9 @@ public long getPeakMemoryAllocation() { return peakAllocation.get(); } - public long getHeadroom(){ + public long getHeadroom() { long localHeadroom = allocationLimit.get() - locallyHeldMemory.get(); - if(parent == null){ + if (parent == null) { return localHeadroom; } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index 70ca1dc32a1b3..c528937bfdcaa 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -328,6 +328,7 @@ private void inc() { * Decrement the ledger's reference count. 
If the ledger is decremented to zero, this ledger * should release its * ownership back to the AllocationManager + * * @param decrement amout to decrease the reference count by * @return the new reference count */ diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java index b38cf679e2a12..2749b6fe030f4 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -274,8 +274,7 @@ public ArrowBuf buffer(final int initialRequestSize, BufferManager manager) { throw new OutOfMemoryException(e); } throw e; - } - finally { + } finally { if (!success) { releaseBytes(actualRequestSize); } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java index 8a40441863889..b23a6e4bd8507 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java @@ -117,9 +117,8 @@ public interface BufferAllocator extends AutoCloseable { * Create an allocation reservation. A reservation is a way of building up * a request for a buffer whose size is not known in advance. See * - * @see AllocationReservation - * * @return the newly created reservation + * @see AllocationReservation */ public AllocationReservation newReservation(); @@ -128,6 +127,7 @@ public interface BufferAllocator extends AutoCloseable { * special because we don't * worry about them leaking or managing reference counts on them since they don't actually * point to any memory. + * * @return the empty buffer */ public ArrowBuf getEmpty(); @@ -136,6 +136,7 @@ public interface BufferAllocator extends AutoCloseable { * Return the name of this allocator. This is a human readable name that can help debugging. * Typically provides * coordinates about where this allocator was created + * * @return the name of the allocator */ public String getName(); @@ -145,6 +146,7 @@ public interface BufferAllocator extends AutoCloseable { * that an allocator is * over its limit, all consumers of that allocator should aggressively try to addrss the * overlimit situation. + * * @return whether or not this allocator (or one if its parents) is over its limits */ public boolean isOverLimit(); diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java index 0452dc9adf256..10a64cd984ea0 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java @@ -97,7 +97,7 @@ public synchronized void recordEvent(final String noteFormat, Object... args) { * includes the identifying string provided at construction time, and all the recorded * events with their stack traces. 
* - * @param sb {@link StringBuilder} to write to + * @param sb {@link StringBuilder} to write to * @param includeStackTrace whether to include the stacktrace of each event in the history */ public void buildHistory(final StringBuilder sb, boolean includeStackTrace) { @@ -106,8 +106,9 @@ public void buildHistory(final StringBuilder sb, boolean includeStackTrace) { /** * build the history and write it to sb - * @param sb output - * @param indent starting indent (usually "") + * + * @param sb output + * @param indent starting indent (usually "") * @param includeStackTrace whether to include the stacktrace of each event. */ public synchronized void buildHistory( diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java b/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java index 2624a4a047e7e..100be069fe6d4 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.memory; import static org.junit.Assert.assertEquals; diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java index 59b7be87e17be..76f2c501cf4c7 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.memory; import static org.junit.Assert.assertEquals; @@ -59,13 +60,13 @@ public void checkBuffers() { @Test public void test_privateMax() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final ArrowBuf arrowBuf1 = rootAllocator.buffer(MAX_ALLOCATION / 2); assertNotNull("allocation failed", arrowBuf1); - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("noLimits", 0, MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("noLimits", 0, MAX_ALLOCATION)) { final ArrowBuf arrowBuf2 = childAllocator.buffer(MAX_ALLOCATION / 2); assertNotNull("allocation failed", arrowBuf2); arrowBuf2.release(); @@ -75,11 +76,11 @@ public void test_privateMax() throws Exception { } } - @Test(expected=IllegalStateException.class) + @Test(expected = IllegalStateException.class) public void testRootAllocator_closeWithOutstanding() throws Exception { try { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final ArrowBuf arrowBuf = rootAllocator.buffer(512); assertNotNull("allocation failed", arrowBuf); } @@ -100,8 +101,8 @@ public void testRootAllocator_closeWithOutstanding() throws Exception { @Test public void testRootAllocator_getEmpty() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final ArrowBuf arrowBuf = rootAllocator.buffer(0); assertNotNull("allocation failed", arrowBuf); assertEquals("capacity was non-zero", 0, arrowBuf.capacity()); @@ -112,8 +113,8 @@ public 
void testRootAllocator_getEmpty() throws Exception { @Ignore // TODO(DRILL-2740) @Test(expected = IllegalStateException.class) public void testAllocator_unreleasedEmpty() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { @SuppressWarnings("unused") final ArrowBuf arrowBuf = rootAllocator.buffer(0); } @@ -121,8 +122,8 @@ public void testAllocator_unreleasedEmpty() throws Exception { @Test public void testAllocator_transferOwnership() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { final BufferAllocator childAllocator1 = rootAllocator.newChildAllocator("changeOwnership1", 0, MAX_ALLOCATION); final BufferAllocator childAllocator2 = @@ -197,7 +198,7 @@ public void testRootAllocator_createChildAndUse() throws Exception { } } - @Test(expected=IllegalStateException.class) + @Test(expected = IllegalStateException.class) public void testRootAllocator_createChildDontClose() throws Exception { try { try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { @@ -232,22 +233,22 @@ private static void allocateAndFree(final BufferAllocator allocator) { final int nBufs = 8; final ArrowBuf[] arrowBufs = new ArrowBuf[nBufs]; - for(int i = 0; i < arrowBufs.length; ++i) { + for (int i = 0; i < arrowBufs.length; ++i) { ArrowBuf arrowBufi = allocator.buffer(MAX_ALLOCATION / nBufs); assertNotNull("allocation failed", arrowBufi); arrowBufs[i] = arrowBufi; } - for(ArrowBuf arrowBufi : arrowBufs) { + for (ArrowBuf arrowBufi : arrowBufs) { arrowBufi.release(); } } @Test public void testAllocator_manyAllocations() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("manyAllocations", 0, MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("manyAllocations", 0, MAX_ALLOCATION)) { allocateAndFree(childAllocator); } } @@ -255,10 +256,10 @@ public void testAllocator_manyAllocations() throws Exception { @Test public void testAllocator_overAllocate() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("overAllocate", 0, MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("overAllocate", 0, MAX_ALLOCATION)) { allocateAndFree(childAllocator); try { @@ -273,10 +274,10 @@ public void testAllocator_overAllocate() throws Exception { @Test public void testAllocator_overAllocateParent() throws Exception { - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { - try(final BufferAllocator childAllocator = - rootAllocator.newChildAllocator("overAllocateParent", 0, MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("overAllocateParent", 0, MAX_ALLOCATION)) { final ArrowBuf arrowBuf1 = rootAllocator.buffer(MAX_ALLOCATION / 2); assertNotNull("allocation failed", arrowBuf1); final ArrowBuf arrowBuf2 = 
childAllocator.buffer(MAX_ALLOCATION / 2); @@ -326,7 +327,7 @@ public void testAllocator_createSlices() throws Exception { try (final BufferAllocator childAllocator = rootAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { try (final BufferAllocator childAllocator2 = - childAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { + childAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { final ArrowBuf arrowBuf1 = childAllocator2.buffer(MAX_ALLOCATION / 8); @SuppressWarnings("unused") final ArrowBuf arrowBuf2 = arrowBuf1.slice(MAX_ALLOCATION / 16, MAX_ALLOCATION / 16); @@ -345,8 +346,8 @@ public void testAllocator_createSlices() throws Exception { @Test public void testAllocator_sliceRanges() throws Exception { // final AllocatorOwner allocatorOwner = new NamedOwner("sliceRanges"); - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { // Populate a buffer with byte values corresponding to their indices. final ArrowBuf arrowBuf = rootAllocator.buffer(256); assertEquals(256, arrowBuf.capacity()); @@ -362,7 +363,7 @@ public void testAllocator_sliceRanges() throws Exception { // assertEquals(256, slice3.capacity()); // assertEquals(256, slice3.writableBytes()); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { arrowBuf.writeByte(i); } assertEquals(0, arrowBuf.readerIndex()); @@ -373,18 +374,18 @@ public void testAllocator_sliceRanges() throws Exception { final ArrowBuf slice1 = (ArrowBuf) arrowBuf.slice(); assertEquals(0, slice1.readerIndex()); assertEquals(256, slice1.readableBytes()); - for(int i = 0; i < 10; ++i) { + for (int i = 0; i < 10; ++i) { assertEquals(i, slice1.readByte()); } assertEquals(256 - 10, slice1.readableBytes()); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { assertEquals((byte) i, slice1.getByte(i)); } final ArrowBuf slice2 = arrowBuf.slice(25, 25); assertEquals(0, slice2.readerIndex()); assertEquals(25, slice2.readableBytes()); - for(int i = 25; i < 50; ++i) { + for (int i = 25; i < 50; ++i) { assertEquals(i, slice2.readByte()); } @@ -404,32 +405,32 @@ public void testAllocator_sliceRanges() throws Exception { @Test public void testAllocator_slicesOfSlices() throws Exception { // final AllocatorOwner allocatorOwner = new NamedOwner("slicesOfSlices"); - try(final RootAllocator rootAllocator = - new RootAllocator(MAX_ALLOCATION)) { + try (final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { // Populate a buffer with byte values corresponding to their indices. final ArrowBuf arrowBuf = rootAllocator.buffer(256); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { arrowBuf.writeByte(i); } // Slice it up. 
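      // each slice views the parent's memory at an offset: slice.getByte(j)
      // reads the parent byte at j + sliceStart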
final ArrowBuf slice0 = arrowBuf.slice(0, arrowBuf.capacity()); - for(int i = 0; i < 256; ++i) { + for (int i = 0; i < 256; ++i) { assertEquals((byte) i, arrowBuf.getByte(i)); } final ArrowBuf slice10 = slice0.slice(10, arrowBuf.capacity() - 10); - for(int i = 10; i < 256; ++i) { + for (int i = 10; i < 256; ++i) { assertEquals((byte) i, slice10.getByte(i - 10)); } final ArrowBuf slice20 = slice10.slice(10, arrowBuf.capacity() - 20); - for(int i = 20; i < 256; ++i) { + for (int i = 20; i < 256; ++i) { assertEquals((byte) i, slice20.getByte(i - 20)); } - final ArrowBuf slice30 = slice20.slice(10, arrowBuf.capacity() - 30); - for(int i = 30; i < 256; ++i) { + final ArrowBuf slice30 = slice20.slice(10, arrowBuf.capacity() - 30); + for (int i = 30; i < 256; ++i) { assertEquals((byte) i, slice30.getByte(i - 30)); } @@ -556,8 +557,8 @@ public void testAllocator_transferShared() throws Exception { public void testAllocator_unclaimedReservation() throws Exception { try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { try (final BufferAllocator childAllocator1 = - rootAllocator.newChildAllocator("unclaimedReservation", 0, MAX_ALLOCATION)) { - try(final AllocationReservation reservation = childAllocator1.newReservation()) { + rootAllocator.newChildAllocator("unclaimedReservation", 0, MAX_ALLOCATION)) { + try (final AllocationReservation reservation = childAllocator1.newReservation()) { assertTrue(reservation.add(64)); } rootAllocator.verify(); diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java b/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java index 25357dc7b07ef..accd15eb71705 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.memory; import static org.junit.Assert.assertEquals; + import io.netty.buffer.ByteBuf; import org.apache.arrow.memory.BufferAllocator; diff --git a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java index 24079b62da919..c53f0ea86935e 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java @@ -73,7 +73,9 @@ public void run() throws IOException { LOGGER.info("Closed connection with client"); } } catch (java.net.SocketException ex) { - if (!closed) throw ex; + if (!closed) { + throw ex; + } } finally { serverSocket.close(); LOGGER.info("Server closed."); diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java index 135d4921ed128..7d71b0b8f9d3f 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java @@ -44,6 +44,7 @@ public class FileRoundtrip { private final Options options; private final PrintStream out; private final PrintStream err; + FileRoundtrip(PrintStream out, PrintStream err) { this.out = out; this.err = err; diff --git a/java/tools/src/main/java/org/apache/arrow/tools/Integration.java b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java index 7d4c86f81670f..d2b35e65a8172 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/Integration.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java @@ -94,7 +94,7 @@ private File validateFile(String type, String fileName, boolean shouldExist) { } static void extractDictionaryEncodings(List fields, List encodings) { - for (Field field: fields) { + for (Field field : fields) { DictionaryEncoding encoding = field.getDictionary(); if (encoding != null) { encodings.add(encoding); @@ -214,7 +214,7 @@ public void execute(File arrowFile, File jsonFile) throws IOException { boolean hasMoreArrow = iterator.hasNext(); if (hasMoreJSON || hasMoreArrow) { throw new IllegalArgumentException("Unexpected RecordBatches. 
Total: " + totalBatches - + " J:" + hasMoreJSON + " " + + " J:" + hasMoreJSON + " " + "A:" + hasMoreArrow); } } diff --git a/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java b/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java index d7f6388721ebb..467965aff95d9 100644 --- a/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java +++ b/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java @@ -158,16 +158,16 @@ public void basicTest() throws InterruptedException, IOException { public void testFlatDictionary() throws IOException { DictionaryEncoding writeEncoding = new DictionaryEncoding(1L, false, null); try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - NullableIntVector writeVector = - new NullableIntVector( - "varchar", - new FieldType(true, MinorType.INT.getType(), writeEncoding, null), - allocator); - NullableVarCharVector writeDictionaryVector = - new NullableVarCharVector( - "dict", - FieldType.nullable(VARCHAR.getType()), - allocator)) { + NullableIntVector writeVector = + new NullableIntVector( + "varchar", + new FieldType(true, MinorType.INT.getType(), writeEncoding, null), + allocator); + NullableVarCharVector writeDictionaryVector = + new NullableVarCharVector( + "dict", + FieldType.nullable(VARCHAR.getType()), + allocator)) { writeVector.allocateNewSafe(); NullableIntVector.Mutator mutator = writeVector.getMutator(); mutator.set(0, 0); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java index 388eb9c447977..a5b5c9d1d6ad2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import com.google.common.base.Preconditions; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java index 15c3a0227c656..2a0f39d0cb59b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import org.apache.arrow.vector.complex.RepeatedFixedWidthVectorLike; @@ -27,32 +28,33 @@ public static void allocate(ValueVector v, int valueCount, int bytesPerValue) { allocate(v, valueCount, bytesPerValue, 5); } - public static void allocatePrecomputedChildCount(ValueVector v, int valueCount, int bytesPerValue, int childValCount){ - if(v instanceof FixedWidthVector) { + public static void allocatePrecomputedChildCount(ValueVector v, int valueCount, int bytesPerValue, int childValCount) { + if (v instanceof FixedWidthVector) { ((FixedWidthVector) v).allocateNew(valueCount); } else if (v instanceof VariableWidthVector) { ((VariableWidthVector) v).allocateNew(valueCount * bytesPerValue, valueCount); - } else if(v instanceof RepeatedFixedWidthVectorLike) { + } else if (v instanceof RepeatedFixedWidthVectorLike) { ((RepeatedFixedWidthVectorLike) v).allocateNew(valueCount, childValCount); - } else if(v instanceof RepeatedVariableWidthVectorLike) { + } else if (v instanceof RepeatedVariableWidthVectorLike) { ((RepeatedVariableWidthVectorLike) v).allocateNew(childValCount * bytesPerValue, valueCount, childValCount); } else { v.allocateNew(); } } - public static void allocate(ValueVector v, int valueCount, int bytesPerValue, int repeatedPerTop){ + public static void allocate(ValueVector v, int valueCount, int bytesPerValue, int repeatedPerTop) { allocatePrecomputedChildCount(v, valueCount, bytesPerValue, repeatedPerTop * valueCount); } /** * Allocates the exact amount if v is fixed width, otherwise falls back to dynamic allocation - * @param v value vector we are trying to allocate - * @param valueCount size we are trying to allocate + * + * @param v value vector we are trying to allocate + * @param valueCount size we are trying to allocate * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory */ public static void allocateNew(ValueVector v, int valueCount) { - if (v instanceof FixedWidthVector) { + if (v instanceof FixedWidthVector) { ((FixedWidthVector) v).allocateNew(valueCount); } else { v.allocateNew(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java index 88e02495bfc99..01340f66c4095 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.util.ArrayList; @@ -84,7 +85,7 @@ public ArrowBuf[] getBuffers(boolean clear) { if (getBufferSize() == 0) { out = new ArrowBuf[0]; } else { - out = new ArrowBuf[]{data}; + out = new ArrowBuf[] {data}; data.readerIndex(0); if (clear) { data.retain(1); @@ -123,5 +124,6 @@ public ArrowBuf unLoad() { * This method has a similar effect of allocateNew() without actually clearing and reallocating * the value vector. 
The purpose is to move the value vector to a "mutate" state */ - public void reset() {} + public void reset() { + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 2a61403c0dcbe..05d2aa933cbef 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.util.Iterator; @@ -66,7 +67,8 @@ public TransferPair getTransferPair(BufferAllocator allocator) { } public abstract static class BaseAccessor implements ValueVector.Accessor { - protected BaseAccessor() { } + protected BaseAccessor() { + } @Override public boolean isNull(int index) { @@ -79,7 +81,7 @@ public int getNullCount() { int nullCount = 0; for (int i = 0; i < getValueCount(); i++) { if (isNull(i)) { - nullCount ++; + nullCount++; } } return nullCount; @@ -87,14 +89,17 @@ public int getNullCount() { } public abstract static class BaseMutator implements ValueVector.Mutator { - protected BaseMutator() { } + protected BaseMutator() { + } @Override - public void generateTestData(int values) {} + public void generateTestData(int values) { + } //TODO: consider making mutator stateless(if possible) on another issue. @Override - public void reset() {} + public void reset() { + } } @Override @@ -103,7 +108,7 @@ public Iterator iterator() { } public static boolean checkBufRefs(final ValueVector vv) { - for(final ArrowBuf buffer : vv.getBuffers(false)) { + for (final ArrowBuf buffer : vv.getBuffers(false)) { if (buffer.refCnt() <= 0) { throw new IllegalStateException("zero refcount"); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index f34ef2c2a2244..e80ca829c667e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import org.apache.arrow.memory.BufferAllocator; @@ -115,7 +116,7 @@ int getSizeFromCount(int valueCount) { @Override public int getValueCapacity() { - return (int)Math.min((long)Integer.MAX_VALUE, data.capacity() * 8L); + return (int) Math.min((long) Integer.MAX_VALUE, data.capacity() * 8L); } private int getByteIndex(int index) { @@ -165,8 +166,7 @@ public void reset() { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param valueCount - * The number of values which can be contained within this vector. + * @param valueCount The number of values which can be contained within this vector. 
*/ @Override public void allocateNew(int valueCount) { @@ -195,7 +195,7 @@ public void reAlloc() { throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); } - final int curSize = (int)newAllocationSize; + final int curSize = (int) newAllocationSize; final ArrowBuf newBuf = allocator.buffer(curSize); newBuf.setZero(0, newBuf.capacity()); newBuf.setBytes(0, data, 0, data.capacity()); @@ -287,7 +287,7 @@ public void splitAndTransferTo(int startIndex, int length, BitVector target) { target.data.setByte(byteSize - 1, ((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset)); } else { target.data.setByte(byteSize - 1, - (((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset) + (this.data.getByte(firstByte + byteSize) << (8 - offset)))); + (((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset) + (this.data.getByte(firstByte + byteSize) << (8 - offset)))); } } } @@ -342,15 +342,14 @@ public class Accessor extends BaseAccessor { /** * Get the byte holding the desired bit, then mask all other bits. Iff the result is 0, the bit was not set. * - * @param index - * position of the bit in the vector + * @param index position of the bit in the vector * @return 1 if set, otherwise 0 */ public final int get(int index) { int byteIndex = index >> 3; byte b = data.getByte(byteIndex); int bitIndex = index & 7; - return Long.bitCount(b & (1L << bitIndex)); + return Long.bitCount(b & (1L << bitIndex)); } @Override @@ -379,6 +378,7 @@ public final void get(int index, NullableBitHolder holder) { /** * Get the number nulls, this correspond to the number of bits set to 0 in the vector + * * @return the number of bits set to 0 */ @Override @@ -414,10 +414,8 @@ private Mutator() { /** * Set the bit at the given index to the specified value. 
* - * @param index - * position of the bit to set - * @param value - * value to set (either 1 or 0) + * @param index position of the bit to set + * @param value value to set (either 1 or 0) */ public final void set(int index, int value) { int byteIndex = byteIndex(index); @@ -448,8 +446,9 @@ public final void setToOne(int index) { /** * set count bits to 1 in data starting at firstBitIndex + * * @param firstBitIndex the index of the first bit to set - * @param count the number of bits to set + * @param count the number of bits to set */ public void setRangeToOne(int firstBitIndex, int count) { int starByteIndex = byteIndex(firstBitIndex); @@ -473,7 +472,7 @@ public void setRangeToOne(int firstBitIndex, int count) { final byte bitMask = (byte) (0xFFL << startByteBitIndex); currentByte |= bitMask; data.setByte(starByteIndex, currentByte); - ++ starByteIndex; + ++starByteIndex; } // fill in one full byte at a time @@ -518,28 +517,28 @@ final void set(int index, NullableBitHolder holder) { } public void setSafe(int index, int value) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } set(index, value); } public void setSafeToOne(int index) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } setToOne(index); } public void setSafe(int index, BitHolder holder) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } set(index, holder.value); } public void setSafe(int index, NullableBitHolder holder) { - while(index >= getValueCapacity()) { + while (index >= getValueCapacity()) { reAlloc(); } set(index, holder.value); @@ -550,7 +549,7 @@ public final void setValueCount(int valueCount) { int currentValueCapacity = getValueCapacity(); BitVector.this.valueCount = valueCount; int idx = getSizeFromCount(valueCount); - while(valueCount > getValueCapacity()) { + while (valueCount > getValueCapacity()) { reAlloc(); } if (valueCount > 0 && currentValueCapacity > valueCount * 2) { @@ -564,7 +563,7 @@ public final void setValueCount(int valueCount) { @Override public final void generateTestData(int values) { boolean even = true; - for(int i = 0; i < values; i++, even = !even) { + for (int i = 0; i < values; i++, even = !even) { if (even) { set(i, 1); } @@ -576,10 +575,10 @@ public void generateTestDataAlt(int size) { setValueCount(size); boolean even = true; final int valueCount = getAccessor().getValueCount(); - for(int i = 0; i < valueCount; i++, even = !even) { - if(even){ + for (int i = 0; i < valueCount; i++, even = !even) { + if (even) { set(i, (byte) 1); - }else{ + } else { set(i, (byte) 0); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java index 3c8b3210d77ff..a0dbf2bdcf101 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import org.apache.arrow.vector.schema.ArrowFieldNode; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java index 6c2c8302a7b8b..af7a7912c72cc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.util.List; @@ -33,12 +34,14 @@ public interface FieldVector extends ValueVector { /** * Initializes the child vectors * to be later loaded with loadBuffers + * * @param children the schema */ void initializeChildrenFromFields(List children); /** * the returned list is the same size as the list passed to initializeChildrenFromFields + * * @return the children according to schema (empty for primitive types) */ List getChildrenFromFields(); @@ -46,13 +49,15 @@ public interface FieldVector extends ValueVector { /** * loads data in the vectors * (ownBuffers must be the same size as getFieldVectors()) - * @param fieldNode the fieldNode + * + * @param fieldNode the fieldNode * @param ownBuffers the buffers for this Field (own buffers only, children not included) */ void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers); /** * (same size as getFieldVectors() since it is their content) + * * @return the buffers containing the data for this vector (ready for reading) */ List getFieldBuffers(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java index 59057000bbca9..ec410fc089c9c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java @@ -15,21 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; -public interface FixedWidthVector extends ValueVector{ +public interface FixedWidthVector extends ValueVector { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param valueCount Number of values in the vector. + * @param valueCount Number of values in the vector. */ void allocateNew(int valueCount); -/** - * Zero out the underlying buffer backing this vector. - */ + /** + * Zero out the underlying buffer backing this vector. + */ void zeroVector(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java index b49e9167c2589..b2455e9e42b4b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; public interface NullableVector extends ValueVector { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java b/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java index b819c5d39e99c..1e0746aabaa61 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; public interface NullableVectorDefinitionSetter { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java b/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java index 6fdcda20480f8..54c0c591e2b92 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java @@ -42,6 +42,7 @@ public void doWork() { /** * Returns the value of schema-changed state, resetting the * schema-changed state to {@code false}. + * * @return the previous schema-changed state */ public boolean getSchemaChangedAndReset() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 3812c0b2fc319..0c95bcfcbd6b5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.io.Closeable; @@ -40,11 +41,11 @@ * There are a few "rules" around vectors: * *
 * <ul>
- *   <li>values need to be written in order (e.g. index 0, 1, 2, 5)</li>
- *   <li>null vectors start with all values as null before writing anything</li>
- *   <li>for variable width types, the offset vector should be all zeros before writing</li>
- *   <li>you must call setValueCount before a vector can be read</li>
- *   <li>you should never write to a vector once it has been read.</li>
+ *   <li>values need to be written in order (e.g. index 0, 1, 2, 5)</li>
+ *   <li>null vectors start with all values as null before writing anything</li>
+ *   <li>for variable width types, the offset vector should be all zeros before writing</li>
+ *   <li>you must call setValueCount before a vector can be read</li>
+ *   <li>you should never write to a vector once it has been read.</li>
 * </ul>
* * Please note that the current implementation doesn't enforce those rules, hence we may find few places that @@ -58,12 +59,14 @@ public interface ValueVector extends Closeable, Iterable { /** * Allocate new buffers. ValueVector implements logic to determine how much to allocate. + * * @throws OutOfMemoryException Thrown if no memory can be allocated. */ void allocateNew() throws OutOfMemoryException; /** * Allocates new buffers. ValueVector implements logic to determine how much to allocate. + * * @return Returns true if allocation was successful. */ boolean allocateNewSafe(); @@ -78,12 +81,14 @@ public interface ValueVector extends Closeable, Iterable { /** * Set the initial record capacity + * * @param numRecords the initial record capacity. */ void setInitialCapacity(int numRecords); /** * Returns the maximum number of values that can be stored in this vector instance. + * * @return the maximum number of values that can be stored in this vector instance. */ int getValueCapacity(); @@ -101,6 +106,7 @@ public interface ValueVector extends Closeable, Iterable { /** * Get information about how this field is materialized. + * * @return the field corresponding to this vector */ Field getField(); @@ -109,6 +115,7 @@ public interface ValueVector extends Closeable, Iterable { /** * to transfer quota responsibility + * * @param allocator the target allocator * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new target vector of * the same type. @@ -121,6 +128,7 @@ public interface ValueVector extends Closeable, Iterable { /** * makes a new transfer pair used to transfer underlying buffers + * * @param target the target for the transfer * @return a new {@link org.apache.arrow.vector.util.TransferPair transfer pair} that is used to transfer underlying * buffers into the target vector. @@ -167,9 +175,9 @@ public interface ValueVector extends Closeable, Iterable { * Return the underlying buffers associated with this vector. Note that this doesn't impact the reference counts for * this buffer so it only should be used for in-context access. Also note that this buffer changes regularly thus * external classes shouldn't hold a reference to it (unless they change it). - * @param clear Whether to clear vector before returning; the buffers will still be refcounted; - * but the returned array will be the only reference to them * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted; + * but the returned array will be the only reference to them * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this vector instance. */ ArrowBuf[] getBuffers(boolean clear); @@ -181,8 +189,7 @@ interface Accessor { /** * Get the Java Object representation of the element at the specified position. Useful for testing. * - * @param index - * Index of the value to get + * @param index Index of the value to get * @return the friendly java type */ Object getObject(int index); @@ -211,7 +218,7 @@ interface Mutator { /** * Sets the number of values that is stored in this vector to the given value count. * - * @param valueCount value count to set. + * @param valueCount value count to set. */ void setValueCount(int valueCount); @@ -221,8 +228,8 @@ interface Mutator { void reset(); /** - * @deprecated this has nothing to do with value vector abstraction and should be removed. * @param values the number of values to generate + * @deprecated this has nothing to do with value vector abstraction and should be removed. 
*/ @Deprecated void generateTestData(int values); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java index ed164b548b5bd..04c00b7c8349c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java @@ -15,20 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; -public interface VariableWidthVector extends ValueVector{ +public interface VariableWidthVector extends ValueVector { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param totalBytes Desired size of the underlying data buffer. - * @param valueCount Number of values in the vector. + * @param totalBytes Desired size of the underlying data buffer. + * @param valueCount Number of values in the vector. */ void allocateNew(int totalBytes, int valueCount); /** * Provide the maximum amount of variable width bytes that can be stored in this vector. + * * @return the byte capacity of this vector */ int getByteCapacity(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index e640c7cb78418..58fc80bbba17c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static com.google.common.base.Preconditions.checkArgument; @@ -41,6 +42,7 @@ public class VectorLoader { /** * will create children in root based on schema + * * @param root the root to add vectors to based on schema */ public VectorLoader(VectorSchemaRoot root) { @@ -50,12 +52,13 @@ public VectorLoader(VectorSchemaRoot root) { /** * Loads the record batch in the vectors * will not close the record batch + * * @param recordBatch the batch to load */ public void load(ArrowRecordBatch recordBatch) { Iterator buffers = recordBatch.getBuffers().iterator(); Iterator nodes = recordBatch.getNodes().iterator(); - for (FieldVector fieldVector: root.getFieldVectors()) { + for (FieldVector fieldVector : root.getFieldVectors()) { loadBuffers(fieldVector, fieldVector.getField(), buffers, nodes); } root.setRowCount(recordBatch.getLength()); @@ -77,7 +80,7 @@ private void loadBuffers(FieldVector vector, Field field, Iterator buf vector.loadFieldBuffers(fieldNode, ownBuffers); } catch (RuntimeException e) { throw new IllegalArgumentException("Could not load buffers for field " + - field + ". error message: " + e.getMessage(), e); + field + ". error message: " + e.getMessage(), e); } List children = field.getChildren(); if (children.size() > 0) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java index 73deb0b3a426e..0c8868cad55b5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import java.util.ArrayList; @@ -67,7 +68,7 @@ public static VectorSchemaRoot create(Schema schema, BufferAllocator allocator) } if (fieldVectors.size() != schema.getFields().size()) { throw new IllegalArgumentException("The root vector did not create the right number of children. found " + - fieldVectors.size() + " expected " + schema.getFields().size()); + fieldVectors.size() + " expected " + schema.getFields().size()); } return new VectorSchemaRoot(schema, fieldVectors, 0); } @@ -102,7 +103,7 @@ public void close() { ex = chain(ex, e); } } - if (ex!= null) { + if (ex != null) { throw ex; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java index 055857e956084..ada471e63e710 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import io.netty.buffer.ByteBuf; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java index 996524521cb68..f8385a7262a21 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import java.util.ArrayList; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index 01e22f2574346..cce73897718a3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import java.util.Collections; @@ -42,10 +43,12 @@ public class ZeroVector implements FieldVector { private final TransferPair defaultPair = new TransferPair() { @Override - public void transfer() { } + public void transfer() { + } @Override - public void splitAndTransfer(int startIndex, int length) { } + public void splitAndTransfer(int startIndex, int length) { + } @Override public ValueVector getTo() { @@ -53,7 +56,8 @@ public ValueVector getTo() { } @Override - public void copyValueSafe(int from, int to) { } + public void copyValueSafe(int from, int to) { + } }; private final Accessor defaultAccessor = new Accessor() { @@ -80,22 +84,28 @@ public int getNullCount() { private final Mutator defaultMutator = new Mutator() { @Override - public void setValueCount(int valueCount) { } + public void setValueCount(int valueCount) { + } @Override - public void reset() { } + public void reset() { + } @Override - public void generateTestData(int values) { } + public void generateTestData(int values) { + } }; - public ZeroVector() { } + public ZeroVector() { + } @Override - public void close() { } + public void close() { + } @Override - public void clear() { } + public void clear() { + } @Override public Field getField() { @@ -144,7 +154,8 @@ public boolean allocateNewSafe() { } @Override - public void reAlloc() {} + public void reAlloc() { + } @Override public BufferAllocator getAllocator() { @@ -152,7 +163,8 @@ public BufferAllocator getAllocator() { } @Override - public void setInitialCapacity(int numRecords) { } + public void setInitialCapacity(int numRecords) { + } @Override public int getValueCapacity() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java index 2aeeca25f0e9e..db0ff86df47a9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.memory.BufferAllocator; @@ -58,6 +59,7 @@ public BufferAllocator getAllocator() { /** * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given field name if exists or null. + * * @param name the name of the child to return * @return the corresponding FieldVector */ @@ -68,9 +70,9 @@ public FieldVector getChild(String name) { /** * Clears out all underlying child vectors. */ - @Override + @Override public void close() { - for (ValueVector vector:(Iterable)this) { + for (ValueVector vector : (Iterable) this) { vector.close(); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java index 4b6d82cc8b291..26b0f90581ffc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex; import java.util.ArrayList; @@ -42,7 +43,7 @@ public abstract class AbstractMapVector extends AbstractContainerVector { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class); // Maintains a map with key as field name and value is the vector itself - private final MapWithOrdinal vectors = new MapWithOrdinal<>(); + private final MapWithOrdinal vectors = new MapWithOrdinal<>(); protected AbstractMapVector(String name, BufferAllocator allocator, CallBack callBack) { super(name, allocator, callBack); @@ -50,7 +51,7 @@ protected AbstractMapVector(String name, BufferAllocator allocator, CallBack cal @Override public void close() { - for(final ValueVector valueVector : vectors.values()) { + for (final ValueVector valueVector : vectors.values()) { valueVector.close(); } vectors.clear(); @@ -83,7 +84,7 @@ public boolean allocateNewSafe() { @Override public void reAlloc() { - for (final ValueVector v: vectors.values()) { + for (final ValueVector v : vectors.values()) { v.reAlloc(); } } @@ -94,27 +95,26 @@ public void reAlloc() { * * Execution takes place in the following order: *
 * <ul>
- *   <li>
- *     if field is new, create and insert a new vector of desired type.
- *   </li>
- *   <li>
- *     if field exists and existing vector is of desired vector type, return the vector.
- *   </li>
- *   <li>
- *     if field exists and null filled, clear the existing vector; create and insert a new vector of desired type.
- *   </li>
- *   <li>
- *     otherwise, throw an {@link java.lang.IllegalStateException}
- *   </li>
+ *   <li>
+ *     if field is new, create and insert a new vector of desired type.
+ *   </li>
+ *   <li>
+ *     if field exists and existing vector is of desired vector type, return the vector.
+ *   </li>
+ *   <li>
+ *     if field exists and null filled, clear the existing vector; create and insert a new vector of desired type.
+ *   </li>
+ *   <li>
+ *     otherwise, throw an {@link java.lang.IllegalStateException}
+ *   </li>
 * </ul>
* * @param childName the name of the field * @param fieldType the type for the vector - * @param clazz class of expected vector type - * @param class type of expected vector type - * @throws java.lang.IllegalStateException raised if there is a hard schema change - * + * @param clazz class of expected vector type + * @param class type of expected vector type * @return resultant {@link org.apache.arrow.vector.ValueVector} + * @throws java.lang.IllegalStateException raised if there is a hard schema change */ @Override public T addOrGet(String childName, FieldType fieldType, Class clazz) { @@ -151,6 +151,7 @@ private boolean nullFilled(ValueVector vector) { /** * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given ordinal identifier. + * * @param id the ordinal of the child to return * @return the corresponding child */ @@ -161,7 +162,8 @@ public ValueVector getChildByOrdinal(int id) { /** * Returns a {@link org.apache.arrow.vector.ValueVector} instance of subtype of T corresponding to the given * field name if exists or null. - * @param name the name of the child to return + * + * @param name the name of the child to return * @param clazz the expected type of the child * @return the child corresponding to this name */ @@ -191,7 +193,8 @@ protected ValueVector add(String childName, FieldType fieldType) { * Inserts the vector with the given name if it does not exist else replaces it with the new value. * * Note that this method does not enforce any vector type check nor throws a schema change exception. - * @param name the name of the child to add + * + * @param name the name of the child to add * @param vector the vector to add as a child */ protected void putChild(String name, FieldVector vector) { @@ -200,8 +203,9 @@ protected void putChild(String name, FieldVector vector) { /** * Inserts the input vector into the map if it does not exist, replaces if it exists already - * @param name field name - * @param vector vector to be inserted + * + * @param name field name + * @param vector vector to be inserted */ protected void putVector(String name, FieldVector vector) { final ValueVector old = vectors.put( @@ -210,7 +214,7 @@ protected void putVector(String name, FieldVector vector) { ); if (old != null && old != vector) { logger.debug("Field [{}] mutated from [{}] to [{}]", name, old.getClass().getSimpleName(), - vector.getClass().getSimpleName()); + vector.getClass().getSimpleName()); } } @@ -298,7 +302,7 @@ public ArrowBuf[] getBuffers(boolean clear) { @Override public int getBufferSize() { - int actualBufSize = 0 ; + int actualBufSize = 0; for (final ValueVector v : vectors.values()) { for (final ArrowBuf buf : v.getBuffers(false)) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 3bfa8e7f7ce67..8e2877f892a64 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex; import java.util.Collections; @@ -144,7 +145,7 @@ public void clear() { public ArrowBuf[] getBuffers(boolean clear) { final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), vector.getBuffers(false), ArrowBuf.class); if (clear) { - for (ArrowBuf buffer:buffers) { + for (ArrowBuf buffer : buffers) { buffer.retain(); } clear(); @@ -156,7 +157,7 @@ public ArrowBuf[] getBuffers(boolean clear) { * @return 1 if inner vector is explicitly set via #addOrGetVector else 0 */ public int size() { - return vector == DEFAULT_DATA_VECTOR ? 0:1; + return vector == DEFAULT_DATA_VECTOR ? 0 : 1; } public AddOrGetResult addOrGetVector(FieldType fieldType) { @@ -166,8 +167,8 @@ public AddOrGetResult addOrGetVector(FieldType fieldT // returned vector must have the same field created = true; if (callBack != null && - // not a schema change if changing from ZeroVector to ZeroVector - (fieldType.getType().getTypeID() != ArrowTypeID.Null)) { + // not a schema change if changing from ZeroVector to ZeroVector + (fieldType.getType().getTypeID() != ArrowTypeID.Null)) { callBack.doWork(); } } @@ -178,7 +179,7 @@ public AddOrGetResult addOrGetVector(FieldType fieldT throw new SchemaChangeRuntimeException(msg); } - return new AddOrGetResult<>((T)vector, created); + return new AddOrGetResult<>((T) vector, created); } protected void replaceDataVector(FieldVector v) { @@ -200,7 +201,7 @@ public int getInnerValueCount() { @Override public int getInnerValueCountAt(int index) { - return offsets.getAccessor().get(index+1) - offsets.getAccessor().get(index); + return offsets.getAccessor().get(index + 1) - offsets.getAccessor().get(index); } @Override @@ -222,15 +223,15 @@ public int startNewValue(int index) { offsets.reAlloc(); } int offset = offsets.getAccessor().get(index); - offsets.getMutator().setSafe(index+1, offset); - setValueCount(index+1); + offsets.getMutator().setSafe(index + 1, offset); + setValueCount(index + 1); return offset; } @Override public void setValueCount(int valueCount) { // TODO: populate offset end points - offsets.getMutator().setValueCount(valueCount == 0 ? 0 : valueCount+1); + offsets.getMutator().setValueCount(valueCount == 0 ? 0 : valueCount + 1); final int childValueCount = valueCount == 0 ? 0 : offsets.getAccessor().get(valueCount); vector.getMutator().setValueCount(childValueCount); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java index df699755770a5..a76fbbe11a1fb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.UInt4Vector; @@ -34,8 +35,8 @@ public EmptyValuePopulator(UInt4Vector offsets) { /** * Marks all values since the last set as empty. The last set value is obtained from underlying offsets vector. * - * @param lastIndex the last index (inclusive) in the offsets vector until which empty population takes place - * @throws java.lang.IndexOutOfBoundsException if lastIndex is negative or greater than offsets capacity. 
+ * @param lastIndex the last index (inclusive) in the offsets vector until which empty population takes place + * @throws java.lang.IndexOutOfBoundsException if lastIndex is negative or greater than offsets capacity. */ public void populate(int lastIndex) { if (lastIndex < 0) { @@ -48,7 +49,7 @@ public void populate(int lastIndex) { for (int i = lastSet; i < lastIndex; i++) { mutator.setSafe(i + 1, previousEnd); } - mutator.setValueCount(lastIndex+1); + mutator.setValueCount(lastIndex + 1); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index b6d938f3fd863..3f0f1b05b6733 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex; import static java.util.Collections.singletonList; @@ -240,7 +241,7 @@ public void clear() { public ArrowBuf[] getBuffers(boolean clear) { final ArrowBuf[] buffers = ObjectArrays.concat(bits.getBuffers(false), vector.getBuffers(false), ArrowBuf.class); if (clear) { - for (ArrowBuf buffer: buffers) { + for (ArrowBuf buffer : buffers) { buffer.retain(); } clear(); @@ -267,7 +268,7 @@ public AddOrGetResult addOrGetVector(FieldType type) // returned vector must have the same field if (!Objects.equals(vector.getField().getType(), type.getType())) { final String msg = String.format("Inner vector type mismatch. Requested type: [%s], actual type: [%s]", - type.getType(), vector.getField().getType()); + type.getType(), vector.getField().getType()); throw new SchemaChangeRuntimeException(msg); } @@ -301,7 +302,7 @@ public Object getObject(int index) { } final List vals = new JsonStringArrayList<>(listSize); final ValueVector.Accessor valuesAccessor = vector.getAccessor(); - for(int i = 0; i < listSize; i++) { + for (int i = 0; i < listSize; i++) { vals.add(valuesAccessor.getObject(index * listSize + i)); } return vals; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index fdeac3971657d..7b6b97a8ed997 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ + package org.apache.arrow.vector.complex; import static com.google.common.base.Preconditions.checkNotNull; @@ -199,7 +200,7 @@ public TransferImpl(ListVector to) { to.addOrGetVector(vector.getField().getFieldType()); } dataTransferPair = getDataVector().makeTransferPair(to.getDataVector()); - pairs = new TransferPair[] { bitsTransferPair, offsetsTransferPair, dataTransferPair }; + pairs = new TransferPair[] {bitsTransferPair, offsetsTransferPair, dataTransferPair}; } @Override @@ -316,9 +317,9 @@ public void clear() { @Override public ArrowBuf[] getBuffers(boolean clear) { final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), ObjectArrays.concat(bits.getBuffers(false), - vector.getBuffers(false), ArrowBuf.class), ArrowBuf.class); + vector.getBuffers(false), ArrowBuf.class), ArrowBuf.class); if (clear) { - for (ArrowBuf buffer:buffers) { + for (ArrowBuf buffer : buffers) { buffer.retain(); } clear(); @@ -351,7 +352,7 @@ public Object getObject(int index) { final int start = offsetsAccessor.get(index); final int end = offsetsAccessor.get(index + 1); final ValueVector.Accessor valuesAccessor = getDataVector().getAccessor(); - for(int i = start; i < end; i++) { + for (int i = start; i < end; i++) { vals.add(valuesAccessor.getObject(i)); } return vals; @@ -388,7 +389,7 @@ public int startNewValue(int index) { * End the current value * * @param index index of the value to end - * @param size number of elements in the list that was written + * @param size number of elements in the list that was written */ public void endValue(int index, int size) { offsets.getMutator().set(index + 1, offsets.getAccessor().get(index + 1) + size); @@ -414,7 +415,9 @@ public void setLastSet(int value) { lastSet = value; } - public int getLastSet() { return lastSet; } + public int getLastSet() { + return lastSet; + } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index bdd30f88f2cc2..d8d0964ab3792 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex; import static com.google.common.base.Preconditions.checkNotNull; @@ -79,7 +80,7 @@ public FieldReader getReader() { transient private MapTransferPair ephPair; public void copyFromSafe(int fromIndex, int thisIndex, MapVector from) { - if(ephPair == null || ephPair.from != from) { + if (ephPair == null || ephPair.from != from) { ephPair = (MapTransferPair) from.makeTransferPair(this); } ephPair.copyValueSafe(fromIndex, thisIndex); @@ -107,7 +108,7 @@ public int getBufferSize() { return 0; } long buffer = 0; - for (final ValueVector v : (Iterable)this) { + for (final ValueVector v : (Iterable) this) { buffer += v.getBufferSize(); } @@ -148,7 +149,7 @@ public TransferPair getTransferPair(String ref, BufferAllocator allocator) { return new MapTransferPair(this, new MapVector(ref, allocator, fieldType, callBack), false); } - protected static class MapTransferPair implements TransferPair{ + protected static class MapTransferPair implements TransferPair { private final TransferPair[] pairs; private final MapVector from; private final MapVector to; @@ -165,7 +166,7 @@ protected MapTransferPair(MapVector from, MapVector to, boolean allocate) { int i = 0; FieldVector vector; - for (String child:from.getChildFieldNames()) { + for (String child : from.getChildFieldNames()) { int preSize = to.size(); vector = from.getChild(child); if (vector == null) { @@ -252,7 +253,7 @@ public class Accessor extends BaseValueVector.BaseAccessor { @Override public Object getObject(int index) { Map vv = new JsonStringHashMap<>(); - for (String child:getChildFieldNames()) { + for (String child : getChildFieldNames()) { ValueVector v = getChild(child); if (v != null && index < v.getAccessor().getValueCount()) { Object value = v.getAccessor().getObject(index); @@ -290,10 +291,12 @@ public void setValueCount(int valueCount) { } @Override - public void reset() { } + public void reset() { + } @Override - public void generateTestData(int values) { } + public void generateTestData(int values) { + } } @Override @@ -329,11 +332,11 @@ public void close() { valueCount = 0; super.close(); - } + } public void initializeChildrenFromFields(List children) { for (Field field : children) { - FieldVector vector = (FieldVector)this.add(field.getName(), field.getFieldType()); + FieldVector vector = (FieldVector) this.add(field.getName(), field.getFieldType()); vector.initializeChildrenFromFields(field.getChildren()); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index ee95fdef59401..e70a915561f8b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex; import static com.google.common.base.Preconditions.checkNotNull; @@ -185,7 +186,7 @@ public void clear() { @Override - public int getBufferSize(){ + public int getBufferSize() { return super.getBufferSize() + bits.getBufferSize(); } @@ -229,7 +230,7 @@ public void reAlloc() { super.reAlloc(); } - public final class Accessor extends MapVector.Accessor { + public final class Accessor extends MapVector.Accessor { final BitVector.Accessor bAccessor = bits.getAccessor(); @Override @@ -257,7 +258,7 @@ public boolean isNull(int index) { return isSet(index) == 0; } - public int isSet(int index){ + public int isSet(int index) { return bAccessor.get(index); } @@ -265,15 +266,15 @@ public int isSet(int index){ public final class Mutator extends MapVector.Mutator implements NullableVectorDefinitionSetter { - private Mutator(){ + private Mutator() { } @Override - public void setIndexDefined(int index){ + public void setIndexDefined(int index) { bits.getMutator().setSafe(index, 1); } - public void setNull(int index){ + public void setNull(int index) { bits.getMutator().setSafe(index, 0); } @@ -285,13 +286,13 @@ public void setValueCount(int valueCount) { } @Override - public void generateTestData(int valueCount){ + public void generateTestData(int valueCount) { super.generateTestData(valueCount); bits.getMutator().generateTestDataAlt(valueCount); } @Override - public void reset(){ + public void reset() { bits.getMutator().setValueCount(0); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java index e1a4f36296987..f0a5174b2dce3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; public interface Positionable { public int getPosition(); + public void setPosition(int index); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java index 8b528b4ccab9b..4b19b9ffa0f9d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/PromotableVector.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.AddOrGetResult; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java index 23850bc9034df..866883f532665 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; /** @@ -25,8 +26,8 @@ public interface RepeatedFixedWidthVectorLike { /** * Allocate a new memory space for this vector. 
Must be called prior to using the ValueVector. * - * @param valueCount Number of separate repeating groupings. - * @param innerValueCount Number of supported values in the vector. + * @param valueCount Number of separate repeating groupings. + * @param innerValueCount Number of supported values in the vector. */ void allocateNew(int valueCount, int innerValueCount); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java index de58eda0b11a2..91147c663f248 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.UInt4Vector; @@ -26,7 +27,6 @@ * A repeated vector contains values that may either be flat or nested. A value consists of zero or more cells(inner values). * Current design maintains data and offsets vectors. Each cell is stored in the data vector. Repeated vector * uses the offset vector to determine the sequence of cells pertaining to an individual value. - * */ public interface RepeatedValueVector extends ValueVector { @@ -51,6 +51,7 @@ public interface RepeatedValueVector extends ValueVector { interface RepeatedAccessor extends ValueVector.Accessor { /** * The result includes empty, null valued cells. + * * @return total number of cells that vector contains. */ int getInnerValueCount(); @@ -63,7 +64,7 @@ interface RepeatedAccessor extends ValueVector.Accessor { int getInnerValueCountAt(int index); /** - * @param index value index + * @param index value index * @return true if the value at the given index is empty, false otherwise. */ boolean isEmpty(int index); @@ -74,7 +75,7 @@ interface RepeatedMutator extends ValueVector.Mutator { /** * Starts a new value that is a container of cells. * - * @param index index of new value to start + * @param index index of new value to start * @return index into the child vector */ int startNewValue(int index); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java index 29f9d75c74671..1e4f54ea37209 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java @@ -15,20 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; public interface RepeatedVariableWidthVectorLike { /** * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. * - * @param totalBytes Desired size of the underlying data buffer. - * @param parentValueCount Number of separate repeating groupings. - * @param childValueCount Number of supported values in the vector. + * @param totalBytes Desired size of the underlying data buffer. + * @param parentValueCount Number of separate repeating groupings. + * @param childValueCount Number of supported values in the vector. 
*/ void allocateNew(int totalBytes, int parentValueCount, int childValueCount); /** * Provide the maximum amount of variable width bytes that can be stored in this vector. + * * @return the byte capacity */ int getByteCapacity(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java index 05a79d24295e4..627998045c93c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import java.util.Arrays; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java index d04fc1c022c05..1633b3ad09892 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex; import org.apache.arrow.vector.ValueVector; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java index 7c73c27ecff41..1eeced4598a55 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import java.util.Iterator; @@ -25,7 +26,7 @@ import org.apache.arrow.vector.holders.UnionHolder; -abstract class AbstractBaseReader implements FieldReader{ +abstract class AbstractBaseReader implements FieldReader { static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseReader.class); @@ -40,11 +41,11 @@ public int getPosition() { return index; } - public void setPosition(int index){ + public void setPosition(int index) { this.index = index; } - protected int idx(){ + protected int idx() { return index; } @@ -86,6 +87,6 @@ public void copyAsValue(UnionWriter writer) { @Override public void copyAsValue(ListWriter writer) { - ComplexCopier.copy(this, (FieldWriter)writer); + ComplexCopier.copy(this, (FieldWriter) writer); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java index 13a0a6bd9e28f..2f224fe3a5b7a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
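The RepeatedValueVector contract above hinges on the offsets vector: value i owns the inner cells in the half-open range [offsets[i], offsets[i + 1]). A minimal sketch of that mapping, using plain int arrays in place of the real UInt4Vector offsets (names here are illustrative, not part of the patch):

// Illustrative only: how an offsets vector maps a top-level value index to
// its inner cells, per the RepeatedValueVector javadoc above.
public class OffsetsSketch {
  public static void main(String[] args) {
    int[] offsets = {0, 2, 5, 5};  // three values: 2 cells, 3 cells, 0 cells (empty)

    int index = 1;
    int start = offsets[index];                  // first inner cell of value 1 -> 2
    int end = offsets[index + 1];                // one past its last cell      -> 5
    int innerValueCountAt = end - start;         // cf. getInnerValueCountAt(1) -> 3
    boolean isEmpty = offsets[3] == offsets[2];  // cf. isEmpty(2)              -> true

    System.out.println(start + ".." + end + " count=" + innerValueCountAt + " empty=" + isEmpty);
  }
}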
*/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.writer.FieldWriter; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java index 6851d6d45d562..a6960238b9165 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.ListVector; @@ -38,13 +39,15 @@ public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWri private final boolean unionEnabled; private final NullableMapWriterFactory nullableMapWriterFactory; - private enum Mode { INIT, MAP, LIST }; + private enum Mode {INIT, MAP, LIST} + + ; - public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled, boolean caseSensitive){ + public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled, boolean caseSensitive) { this.name = name; this.container = container; this.unionEnabled = unionEnabled; - nullableMapWriterFactory = caseSensitive? NullableMapWriterFactory.getNullableCaseSensitiveMapWriterFactoryInstance() : + nullableMapWriterFactory = caseSensitive ? NullableMapWriterFactory.getNullableCaseSensitiveMapWriterFactoryInstance() : NullableMapWriterFactory.getNullableMapWriterFactoryInstance(); } @@ -52,7 +55,7 @@ public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled) this(name, container, unionEnabled, false); } - public ComplexWriterImpl(String name, MapVector container){ + public ComplexWriterImpl(String name, MapVector container) { this(name, container, false); } @@ -66,12 +69,12 @@ public int getValueCapacity() { return container.getValueCapacity(); } - private void check(Mode... modes){ + private void check(Mode... 
modes) { StateTool.check(mode, modes); } @Override - public void reset(){ + public void reset() { setPosition(0); } @@ -85,58 +88,58 @@ public void close() throws Exception { } @Override - public void clear(){ - switch(mode){ - case MAP: - mapRoot.clear(); - break; - case LIST: - listRoot.clear(); - break; + public void clear() { + switch (mode) { + case MAP: + mapRoot.clear(); + break; + case LIST: + listRoot.clear(); + break; } } @Override - public void setValueCount(int count){ - switch(mode){ - case MAP: - mapRoot.setValueCount(count); - break; - case LIST: - listRoot.setValueCount(count); - break; + public void setValueCount(int count) { + switch (mode) { + case MAP: + mapRoot.setValueCount(count); + break; + case LIST: + listRoot.setValueCount(count); + break; } } @Override - public void setPosition(int index){ + public void setPosition(int index) { super.setPosition(index); - switch(mode){ - case MAP: - mapRoot.setPosition(index); - break; - case LIST: - listRoot.setPosition(index); - break; + switch (mode) { + case MAP: + mapRoot.setPosition(index); + break; + case LIST: + listRoot.setPosition(index); + break; } } - public MapWriter directMap(){ + public MapWriter directMap() { Preconditions.checkArgument(name == null); - switch(mode){ + switch (mode) { - case INIT: - mapRoot = nullableMapWriterFactory.build((NullableMapVector) container); - mapRoot.setPosition(idx()); - mode = Mode.MAP; - break; + case INIT: + mapRoot = nullableMapWriterFactory.build((NullableMapVector) container); + mapRoot.setPosition(idx()); + mode = Mode.MAP; + break; - case MAP: - break; + case MAP: + break; - default: + default: check(Mode.INIT, Mode.MAP); } @@ -145,20 +148,20 @@ public MapWriter directMap(){ @Override public MapWriter rootAsMap() { - switch(mode){ + switch (mode) { - case INIT: - // TODO allow dictionaries in complex types - NullableMapVector map = container.addOrGetMap(name); - mapRoot = nullableMapWriterFactory.build(map); - mapRoot.setPosition(idx()); - mode = Mode.MAP; - break; + case INIT: + // TODO allow dictionaries in complex types + NullableMapVector map = container.addOrGetMap(name); + mapRoot = nullableMapWriterFactory.build(map); + mapRoot.setPosition(idx()); + mode = Mode.MAP; + break; - case MAP: - break; + case MAP: + break; - default: + default: check(Mode.INIT, Mode.MAP); } @@ -167,33 +170,33 @@ public MapWriter rootAsMap() { @Override public void allocate() { - if(mapRoot != null) { + if (mapRoot != null) { mapRoot.allocate(); - } else if(listRoot != null) { + } else if (listRoot != null) { listRoot.allocate(); } } @Override public ListWriter rootAsList() { - switch(mode){ - - case INIT: - int vectorCount = container.size(); - // TODO allow dictionaries in complex types - ListVector listVector = container.addOrGetList(name); - if (container.size() > vectorCount) { - listVector.allocateNew(); - } - listRoot = new UnionListWriter(listVector, nullableMapWriterFactory); - listRoot.setPosition(idx()); - mode = Mode.LIST; - break; - - case LIST: - break; - - default: + switch (mode) { + + case INIT: + int vectorCount = container.size(); + // TODO allow dictionaries in complex types + ListVector listVector = container.addOrGetList(name); + if (container.size() > vectorCount) { + listVector.allocateNew(); + } + listRoot = new UnionListWriter(listVector, nullableMapWriterFactory); + listRoot.setPosition(idx()); + mode = Mode.LIST; + break; + + case LIST: + break; + + default: check(Mode.INIT, Mode.MAP); } diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java index f8a9d4232aadc..0d860b6a04115 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.writer.BaseWriter; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java index 067716e8ea290..614c266acf147 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.MapVector; @@ -28,8 +29,8 @@ public class NullableMapReaderImpl extends SingleMapReaderImpl { private NullableMapVector nullableMapVector; public NullableMapReaderImpl(MapVector vector) { - super((NullableMapVector)vector); - this.nullableMapVector = (NullableMapVector)vector; + super((NullableMapVector) vector); + this.nullableMapVector = (NullableMapVector) vector; } @Override @@ -38,19 +39,19 @@ public Field getField() { } @Override - public void copyAsValue(MapWriter writer){ + public void copyAsValue(MapWriter writer) { NullableMapWriter impl = (NullableMapWriter) writer; impl.container.copyFromSafe(idx(), impl.idx(), nullableMapVector); } @Override - public void copyAsField(String name, MapWriter writer){ + public void copyAsField(String name, MapWriter writer) { NullableMapWriter impl = (NullableMapWriter) writer.map(name); impl.container.copyFromSafe(idx(), impl.idx(), nullableMapVector); } @Override - public boolean isSet(){ + public boolean isSet() { return !nullableMapVector.getAccessor().isNull(idx()); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java index d932cfb3e1287..d2dcb2374d0e7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapWriterFactory.java @@ -15,28 +15,29 @@ * See the License for the specific language governing permissions and * limitations under the License. 
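ComplexWriterImpl, reformatted a few hunks above, is a small state machine: the first rootAsMap() or rootAsList() call moves Mode from INIT to MAP or LIST, and the check(...) call rejects mixing the two afterwards. A minimal usage sketch, assuming the 0.x-era Java API shown in this patch (the vector and field names are illustrative):

// Sketch only, assuming the 0.x-era API in this patch.
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.complex.MapVector;
import org.apache.arrow.vector.complex.impl.ComplexWriterImpl;
import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter;
import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;

public class ComplexWriterSketch {
  public static void main(String[] args) throws Exception {
    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         MapVector parent = new MapVector("parent", allocator, null)) {
      ComplexWriter writer = new ComplexWriterImpl("root", parent);
      MapWriter root = writer.rootAsMap();  // INIT -> MAP
      root.setPosition(0);
      root.bigInt("a").writeBigInt(1L);
      writer.setValueCount(1);
      // calling writer.rootAsList() at this point would fail the mode check
    }
  }
}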
*/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.complex.NullableMapVector; public class NullableMapWriterFactory { - private final boolean caseSensitive; - private static final NullableMapWriterFactory nullableMapWriterFactory = new NullableMapWriterFactory(false); - private static final NullableMapWriterFactory nullableCaseSensitiveWriterFactory = new NullableMapWriterFactory(true); + private final boolean caseSensitive; + private static final NullableMapWriterFactory nullableMapWriterFactory = new NullableMapWriterFactory(false); + private static final NullableMapWriterFactory nullableCaseSensitiveWriterFactory = new NullableMapWriterFactory(true); - public NullableMapWriterFactory(boolean caseSensitive) { - this.caseSensitive = caseSensitive; - } + public NullableMapWriterFactory(boolean caseSensitive) { + this.caseSensitive = caseSensitive; + } - public NullableMapWriter build(NullableMapVector container) { - return this.caseSensitive? new NullableCaseSensitiveMapWriter(container) : new NullableMapWriter(container); - } + public NullableMapWriter build(NullableMapVector container) { + return this.caseSensitive ? new NullableCaseSensitiveMapWriter(container) : new NullableMapWriter(container); + } - public static NullableMapWriterFactory getNullableMapWriterFactoryInstance() { - return nullableMapWriterFactory; - } + public static NullableMapWriterFactory getNullableMapWriterFactoryInstance() { + return nullableMapWriterFactory; + } - public static NullableMapWriterFactory getNullableCaseSensitiveMapWriterFactoryInstance() { - return nullableCaseSensitiveWriterFactory; - } + public static NullableMapWriterFactory getNullableCaseSensitiveMapWriterFactoryInstance() { + return nullableCaseSensitiveWriterFactory; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index d16718e75a701..9722196ed7cd2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.FieldVector; @@ -120,7 +121,7 @@ public void setPosition(int index) { protected FieldWriter getWriter(MinorType type) { if (state == State.UNION) { - ((UnionWriter)writer).getWriter(type); + ((UnionWriter) writer).getWriter(type); } else if (state == State.UNTYPED) { if (type == null) { // ??? 
@@ -132,7 +133,7 @@ protected FieldWriter getWriter(MinorType type) { writer.setPosition(position); } else if (type != this.type) { promoteToUnion(); - ((UnionWriter)writer).getWriter(type); + ((UnionWriter) writer).getWriter(type); } return writer; } @@ -157,7 +158,7 @@ private FieldWriter promoteToUnion() { } else if (listVector != null) { unionVector = listVector.promoteToUnion(); } - unionVector.addVector((FieldVector)tp.getTo()); + unionVector.addVector((FieldVector) tp.getTo()); writer = new UnionWriter(unionVector, nullableMapWriterFactory); writer.setPosition(idx()); for (int i = 0; i <= idx(); i++) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java index b8f58658eae15..f2b46ab98db7f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java @@ -17,6 +17,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex.impl; @@ -27,7 +28,7 @@ import org.apache.arrow.vector.types.Types.MinorType; @SuppressWarnings("unused") -public class SingleListReaderImpl extends AbstractFieldReader{ +public class SingleListReaderImpl extends AbstractFieldReader { private final String name; private final AbstractContainerVector container; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java index 48019093e387f..3ebd0cd7dd959 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
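The PromotableWriter hunks above are what make schema-on-write work: the first concrete write fixes the underlying vector's minor type, and a later write of a different type goes through promoteToUnion(), which swaps in a union vector and replays the values already written. A hedged fragment, where v and container stand for an existing value vector and its parent map vector (both assumed, as in the constructor above):

// Sketch only; 'v' and 'container' are assumed inputs.
PromotableWriter writer = new PromotableWriter(v, container);
writer.setPosition(0);
writer.writeBigInt(1L);    // first write types the vector as BIGINT
writer.setPosition(1);
writer.writeFloat8(2.0);   // different minor type -> promoteToUnion()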
******************************************************************************/ + package org.apache.arrow.vector.complex.impl; @@ -31,7 +32,7 @@ import com.google.common.collect.Maps; @SuppressWarnings("unused") -public class SingleMapReaderImpl extends AbstractFieldReader{ +public class SingleMapReaderImpl extends AbstractFieldReader { private final MapVector vector; private final Map<String, FieldReader> fields = Maps.newHashMap(); @@ -40,8 +41,8 @@ public SingleMapReaderImpl(MapVector vector) { this.vector = vector; } - private void setChildrenPosition(int index){ - for(FieldReader r : fields.values()){ + private void setChildrenPosition(int index) { + for (FieldReader r : fields.values()) { r.setPosition(index); } } @@ -52,13 +53,13 @@ public Field getField() { } @Override - public FieldReader reader(String name){ + public FieldReader reader(String name) { FieldReader reader = fields.get(name); - if(reader == null){ + if (reader == null) { ValueVector child = vector.getChild(name); - if(child == null){ + if (child == null) { reader = NullReader.INSTANCE; - }else{ + } else { reader = child.getReader(); } fields.put(name, reader); @@ -68,9 +69,9 @@ public FieldReader reader(String name){ } @Override - public void setPosition(int index){ + public void setPosition(int index) { super.setPosition(index); - for(FieldReader r : fields.values()){ + for (FieldReader r : fields.values()) { r.setPosition(index); } } @@ -91,18 +92,18 @@ public boolean isSet() { } @Override - public java.util.Iterator<String> iterator(){ + public java.util.Iterator<String> iterator() { return vector.fieldNameIterator(); } @Override - public void copyAsValue(MapWriter writer){ + public void copyAsValue(MapWriter writer) { SingleMapWriter impl = (SingleMapWriter) writer; impl.container.copyFromSafe(idx(), impl.idx(), vector); } @Override - public void copyAsField(String name, MapWriter writer){ + public void copyAsField(String name, MapWriter writer) { SingleMapWriter impl = (SingleMapWriter) writer.map(name); impl.container.copyFromSafe(idx(), impl.idx(), vector); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java index 515d4ab8ce907..f3e9b8773f25e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.ValueVector; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java index 2bd0ca87cd074..b98c36d2bf721 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
******************************************************************************/ + package org.apache.arrow.vector.complex.impl; import org.apache.arrow.vector.UInt4Vector; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java index c4eb3dc739a49..df142c7819d7c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.reader; import org.apache.arrow.vector.complex.reader.BaseReader.ListReader; @@ -24,6 +25,5 @@ import org.apache.arrow.vector.complex.reader.BaseReader.ScalarReader; - public interface FieldReader extends MapReader, ListReader, ScalarReader, RepeatedMapReader, RepeatedListReader { } \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java index ecffe0bec0e84..a2a1f5d000a8a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.writer; import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; @@ -23,5 +24,6 @@ public interface FieldWriter extends MapWriter, ListWriter, ScalarWriter { void allocate(); + void clear(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java index 0c1cadfdafdbf..c2f692035946f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/Dictionary.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.dictionary; import java.util.Objects; @@ -53,8 +54,12 @@ public String toString() { @Override public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } Dictionary that = (Dictionary) o; return Objects.equals(encoding, that.encoding) && Objects.equals(dictionary, that.dictionary); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 0f49ce61f1cdf..7e20794cbbed2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ + package org.apache.arrow.vector.dictionary; import java.lang.reflect.InvocationTargetException; @@ -39,7 +40,7 @@ public class DictionaryEncoder { /** * Dictionary encodes a vector with a provided dictionary. The dictionary must contain all values in the vector. * - * @param vector vector to encode + * @param vector vector to encode * @param dictionary dictionary used for encoding * @return dictionary encoded vector */ @@ -55,7 +56,7 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { Field valueField = vector.getField(); FieldType indexFieldType = new FieldType(valueField.isNullable(), dictionary.getEncoding().getIndexType(), - dictionary.getEncoding(), valueField.getMetadata()); + dictionary.getEncoding(), valueField.getMetadata()); Field indexField = new Field(valueField.getName(), indexFieldType, null); // vector to hold our indices (dictionary encoded values) @@ -65,11 +66,11 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { // use reflection to pull out the set method // TODO implement a common interface for int vectors Method setter = null; - for (Class c: ImmutableList.of(int.class, long.class)) { + for (Class c : ImmutableList.of(int.class, long.class)) { try { setter = mutator.getClass().getMethod("set", int.class, c); break; - } catch(NoSuchMethodException e) { + } catch (NoSuchMethodException e) { // ignore } } @@ -108,7 +109,7 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { /** * Decodes a dictionary encoded array using the provided dictionary. * - * @param indices dictionary encoded values, must be int type + * @param indices dictionary encoded values, must be int type * @param dictionary dictionary used to decode the values * @return vector with values restored from dictionary */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java index 87516c9a8fc5b..a170cea21d273 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.dictionary; import java.util.HashMap; @@ -31,7 +32,7 @@ public static class MapDictionaryProvider implements DictionaryProvider { public MapDictionaryProvider(Dictionary... dictionaries) { this.map = new HashMap<>(); - for (Dictionary dictionary: dictionaries) { + for (Dictionary dictionary : dictionaries) { put(dictionary); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java index 90fb02b059707..e1b4d6a8b215e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
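The DictionaryEncoder methods documented above round-trip values through a dictionary. A hedged usage sketch, where vector is a values vector and dictionaryVector is a FieldVector containing every distinct value (both assumed inputs, not part of the patch; a null index type falls back to the encoding's default int type):

// Sketch only; 'vector' and 'dictionaryVector' are assumed inputs.
Dictionary dictionary =
    new Dictionary(dictionaryVector, new DictionaryEncoding(1L /*id*/, false /*ordered*/, null /*indexType*/));

ValueVector indices = DictionaryEncoder.encode(vector, dictionary);  // int-typed indices
ValueVector decoded = DictionaryEncoder.decode(indices, dictionary); // original values restored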
*/ + package org.apache.arrow.vector.file; import org.apache.arrow.flatbuf.Block; @@ -64,19 +65,25 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } ArrowBlock other = (ArrowBlock) obj; - if (bodyLength != other.bodyLength) + if (bodyLength != other.bodyLength) { return false; - if (metadataLength != other.metadataLength) + } + if (metadataLength != other.metadataLength) { return false; - if (offset != other.offset) + } + if (offset != other.offset) { return false; + } return true; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java index f4d6ada932494..d711b9c6c1e26 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -117,7 +118,7 @@ private ArrowDictionaryBatch readDictionaryBatch(SeekableReadChannel in, ArrowBlock block, BufferAllocator allocator) throws IOException { LOGGER.debug(String.format("DictionaryRecordBatch at %d, metadata: %d, body: %d", - block.getOffset(), block.getMetadataLength(), block.getBodyLength())); + block.getOffset(), block.getMetadataLength(), block.getBodyLength())); in.setPosition(block.getOffset()); ArrowDictionaryBatch batch = MessageSerializer.deserializeDictionaryBatch(in, block, allocator); if (batch == null) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java index 23d210a3ee73b..06519bc49fd1c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -47,7 +48,7 @@ protected void endInternal(WriteChannel out, List records) throws IOException { long footerStart = out.getCurrentPosition(); out.write(new ArrowFooter(schema, dictionaries, records), false); - int footerLength = (int)(out.getCurrentPosition() - footerStart); + int footerLength = (int) (out.getCurrentPosition() - footerStart); if (footerLength <= 0) { throw new InvalidArrowFileException("invalid footer"); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java index 1c0008a9184a0..1e95321fdec5b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import static org.apache.arrow.vector.schema.FBSerializables.writeAllStructsToVector; @@ -48,7 +49,7 @@ public ArrowFooter(Footer footer) { Schema.convertSchema(footer.schema()), dictionaries(footer), recordBatches(footer) - ); + ); } private static List recordBatches(Footer footer) { @@ -112,28 +113,37 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } ArrowFooter other = (ArrowFooter) obj; if (dictionaries == null) { - if (other.dictionaries != null) + if (other.dictionaries != null) { return false; - } else if (!dictionaries.equals(other.dictionaries)) + } + } else if (!dictionaries.equals(other.dictionaries)) { return false; + } if (recordBatches == null) { - if (other.recordBatches != null) + if (other.recordBatches != null) { return false; - } else if (!recordBatches.equals(other.recordBatches)) + } + } else if (!recordBatches.equals(other.recordBatches)) { return false; + } if (schema == null) { - if (other.schema != null) + if (other.schema != null) { return false; - } else if (!schema.equals(other.schema)) + } + } else if (!schema.equals(other.schema)) { return false; + } return true; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java index 99ea96b3856d5..0d2da375295fe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java index f6b104145527c..646d6feeef086 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -94,12 +95,21 @@ public boolean loadNextBatch() throws IOException { ArrowMessageVisitor<Boolean> visitor = new ArrowMessageVisitor<Boolean>() { @Override public Boolean visit(ArrowDictionaryBatch message) { - try { load(message); } finally { message.close(); } + try { + load(message); + } finally { + message.close(); + } return true; } + @Override public Boolean visit(ArrowRecordBatch message) { - try { loader.load(message); } finally { message.close(); } + try { + loader.load(message); + } finally { + message.close(); + } return false; } }; @@ -119,13 +129,15 @@ public Boolean visit(ArrowRecordBatch message) { return readBatch; } - public long bytesRead() { return in.bytesRead(); } + public long bytesRead() { + return in.bytesRead(); + } @Override public void close() throws IOException { if (initialized) { root.close(); - for (Dictionary dictionary: dictionaries.values()) { + for (Dictionary dictionary : dictionaries.values()) { dictionary.getVector().close(); } } @@ -153,7 +165,7 @@ private void initialize() throws IOException { Map<Long, Dictionary> dictionaries = new HashMap<>(); // Convert fields with dictionaries to have the index type - for (Field field: originalSchema.getFields()) { + for (Field field : originalSchema.getFields()) { Field updated = DictionaryUtility.toMemoryFormat(field, allocator, dictionaries); fields.add(updated); vectors.add(updated.createVector(allocator)); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java index 3b37071382ff6..b35aba5426e4a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
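The visitor above is why a single loadNextBatch() call can consume several messages: each ArrowDictionaryBatch is loaded and reading continues (visit returns true), while an ArrowRecordBatch ends the call (visit returns false). A typical consumption loop, sketched against the file reader; channel (a SeekableReadChannel) and allocator are assumed:

// Sketch only; 'channel' and 'allocator' are assumed inputs.
try (ArrowFileReader reader = new ArrowFileReader(channel, allocator)) {
  VectorSchemaRoot root = reader.getVectorSchemaRoot();
  while (reader.loadNextBatch()) {
    // 'root' now holds the current record batch's vectors; any dictionary
    // batches encountered on the way have already been replayed.
    System.out.println("rows: " + root.getRowCount());
  }
}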
*/ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -60,9 +61,9 @@ public abstract class ArrowWriter implements AutoCloseable { /** * Note: fields are not closed when the writer is closed * - * @param root the vectors to write to the output + * @param root the vectors to write to the output * @param provider where to find the dictionaries - * @param out the output where to write + * @param out the output where to write */ protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { this.unloader = new VectorUnloader(root); @@ -72,13 +73,13 @@ protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, Writab Set dictionaryIdsUsed = new HashSet<>(); // Convert fields with dictionaries to have dictionary type - for (Field field: root.getSchema().getFields()) { + for (Field field : root.getSchema().getFields()) { fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed)); } // Create a record batch for each dictionary this.dictionaries = new ArrayList<>(dictionaryIdsUsed.size()); - for (long id: dictionaryIdsUsed) { + for (long id : dictionaryIdsUsed) { Dictionary dictionary = provider.lookup(id); FieldVector vector = dictionary.getVector(); int count = vector.getAccessor().getValueCount(); @@ -105,7 +106,7 @@ public void writeBatch() throws IOException { protected void writeRecordBatch(ArrowRecordBatch batch) throws IOException { ArrowBlock block = MessageSerializer.serialize(out, batch); LOGGER.debug(String.format("RecordBatch at %d, metadata: %d, body: %d", - block.getOffset(), block.getMetadataLength(), block.getBodyLength())); + block.getOffset(), block.getMetadataLength(), block.getBodyLength())); recordBlocks.add(block); } @@ -114,7 +115,9 @@ public void end() throws IOException { ensureEnded(); } - public long bytesWritten() { return out.getCurrentPosition(); } + public long bytesWritten() { + return out.getCurrentPosition(); + } private void ensureStarted() throws IOException { if (!started) { @@ -128,7 +131,7 @@ private void ensureStarted() throws IOException { try { ArrowBlock block = MessageSerializer.serialize(out, batch); LOGGER.debug(String.format("DictionaryRecordBatch at %d, metadata: %d, body: %d", - block.getOffset(), block.getMetadataLength(), block.getBodyLength())); + block.getOffset(), block.getMetadataLength(), block.getBodyLength())); dictionaryBlocks.add(block); } finally { batch.close(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java b/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java index 3ec75dcb12a2b..607207f41b06c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
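Mirroring the reader, the constructor above rewrites dictionary-encoded fields to their message-format type and ensureStarted() emits the schema plus one DictionaryBatch per dictionary id before any record batches. A sketched write path; root, provider, and out (a WritableByteChannel) are assumed:

// Sketch only; 'root', 'provider', and 'out' are assumed inputs.
try (ArrowFileWriter writer = new ArrowFileWriter(root, provider, out)) {
  writer.start();       // schema + dictionary batches, via ensureStarted()
  writer.writeBatch();  // serializes root's current contents as a record batch
  writer.end();         // footer with the recorded block offsets
}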
*/ + package org.apache.arrow.vector.file; public class InvalidArrowFileException extends RuntimeException { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java index 87450e38f6852..b0eb8f3d84d9a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -37,11 +38,14 @@ public ReadChannel(ReadableByteChannel in) { this.in = in; } - public long bytesRead() { return bytesRead; } + public long bytesRead() { + return bytesRead; + } /** * Reads bytes into buffer until it is full (buffer.remaining() == 0). Returns the * number of bytes read which can be less than full if there are no more. + * * @param buffer The buffer to read to * @return the number of bytes read * @throws IOException if not enough bytes left to read */ @@ -51,9 +55,13 @@ public int readFully(ByteBuffer buffer) throws IOException { int totalRead = 0; while (buffer.remaining() != 0) { int read = in.read(buffer); - if (read < 0) { return totalRead; + if (read < 0) { + return totalRead; + } totalRead += read; - if (read == 0) break; + if (read == 0) { + break; + } } this.bytesRead += totalRead; return totalRead; @@ -61,8 +69,9 @@ public int readFully(ByteBuffer buffer) throws IOException { /** * Reads up to len into buffer. Returns bytes read. + * * @param buffer the buffer to read to - * @param l the amount of bytes to read + * @param l the amount of bytes to read * @return the number of bytes read * @throws IOException if not enough bytes left to read */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java index 914c3cb4b33a9..46bea1314da63 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -22,18 +23,18 @@ public class SeekableReadChannel extends ReadChannel { - private final SeekableByteChannel in; + private final SeekableByteChannel in; - public SeekableReadChannel(SeekableByteChannel in) { - super(in); - this.in = in; - } + public SeekableReadChannel(SeekableByteChannel in) { + super(in); + this.in = in; + } - public void setPosition(long position) throws IOException { - in.position(position); - } + public void setPosition(long position) throws IOException { + in.position(position); + } - public long size() throws IOException { - return in.size(); - } + public long size() throws IOException { + return in.size(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java index 42104d181a2d0..89c9d1f9b7a44 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
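As the readFully javadoc above notes, the call may return fewer bytes than requested at end of stream, so callers should compare the return value with the expected length rather than assume the buffer was filled. A defensive-use fragment; readChannel and len are assumed, and java.nio.ByteBuffer plus java.io.IOException are the only imports needed:

// Sketch only; 'readChannel' (a ReadChannel) and 'len' are assumed.
ByteBuffer buffer = ByteBuffer.allocate(len);
int bytesRead = readChannel.readFully(buffer);
if (bytesRead != len) {
  throw new IOException("Unexpected EOF: wanted " + len + " bytes, got " + bytesRead);
}
buffer.flip(); // ready for reading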
*/ + package org.apache.arrow.vector.file; import java.io.IOException; @@ -62,7 +63,7 @@ public long writeZeros(int zeroCount) throws IOException { public long align() throws IOException { if (currentPosition % 8 != 0) { // align on 8 byte boundaries - return writeZeros(8 - (int)(currentPosition % 8)); + return writeZeros(8 - (int) (currentPosition % 8)); } return 0; } @@ -77,10 +78,10 @@ public long write(ByteBuffer buffer) throws IOException { public static byte[] intToBytes(int value) { byte[] outBuffer = new byte[4]; - outBuffer[3] = (byte)(value >>> 24); - outBuffer[2] = (byte)(value >>> 16); - outBuffer[1] = (byte)(value >>> 8); - outBuffer[0] = (byte)(value >>> 0); + outBuffer[3] = (byte) (value >>> 24); + outBuffer[2] = (byte) (value >>> 16); + outBuffer[1] = (byte) (value >>> 8); + outBuffer[0] = (byte) (value >>> 0); return outBuffer; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index 364d273fadae4..484a82fdaab67 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.file.json; import static com.fasterxml.jackson.core.JsonToken.END_ARRAY; @@ -114,7 +115,7 @@ public Schema start() throws JsonParseException, IOException { dictionaries = new HashMap<>(); // Convert fields with dictionaries to have the index type - for (Field field: originalSchema.getFields()) { + for (Field field : originalSchema.getFields()) { fields.add(DictionaryUtility.toMemoryFormat(field, allocator, dictionaries)); } this.schema = new Schema(fields, originalSchema.getCustomMetadata()); @@ -233,7 +234,7 @@ private void readVector(Field field, FieldVector vector) throws JsonParseExcepti BufferBacked innerVector = fieldInnerVectors.get(v); nextFieldIs(vectorType.getName()); readToken(START_ARRAY); - ValueVector valueVector = (ValueVector)innerVector; + ValueVector valueVector = (ValueVector) innerVector; valueVector.allocateNew(); Mutator mutator = valueVector.getMutator(); @@ -262,7 +263,7 @@ private void readVector(Field field, FieldVector vector) throws JsonParseExcepti readToken(END_ARRAY); } if (vector instanceof NullableMapVector) { - ((NullableMapVector)vector).valueCount = count; + ((NullableMapVector) vector).valueCount = count; } } readToken(END_OBJECT); @@ -278,96 +279,96 @@ private byte[] decodeHexSafe(String hexString) throws IOException { private void setValueFromParser(ValueVector valueVector, int i) throws IOException { switch (valueVector.getMinorType()) { - case BIT: - ((BitVector)valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 
1 : 0); - break; - case TINYINT: - ((TinyIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case SMALLINT: - ((SmallIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case INT: - ((IntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case BIGINT: - ((BigIntVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case UINT1: - ((UInt1Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT2: - ((UInt2Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT4: - ((UInt4Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT8: - ((UInt8Vector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case FLOAT4: - ((Float4Vector)valueVector).getMutator().set(i, parser.readValueAs(Float.class)); - break; - case FLOAT8: - ((Float8Vector)valueVector).getMutator().set(i, parser.readValueAs(Double.class)); - break; - case VARBINARY: - ((VarBinaryVector)valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); - break; - case VARCHAR: - ((VarCharVector)valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); - break; - case DATEDAY: - ((DateDayVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case DATEMILLI: - ((DateMilliVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESEC: - ((TimeSecVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case TIMEMILLI: - ((TimeMilliVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case TIMEMICRO: - ((TimeMicroVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMENANO: - ((TimeNanoVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPSEC: - ((TimeStampSecVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMILLI: - ((TimeStampMilliVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMICRO: - ((TimeStampMicroVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPNANO: - ((TimeStampNanoVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPSECTZ: - ((TimeStampSecTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMILLITZ: - ((TimeStampMilliTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMICROTZ: - ((TimeStampMicroTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPNANOTZ: - ((TimeStampNanoTZVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - default: - throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType()); + case BIT: + ((BitVector) valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 
1 : 0); + break; + case TINYINT: + ((TinyIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case SMALLINT: + ((SmallIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case INT: + ((IntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case BIGINT: + ((BigIntVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case UINT1: + ((UInt1Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT2: + ((UInt2Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT4: + ((UInt4Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT8: + ((UInt8Vector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case FLOAT4: + ((Float4Vector) valueVector).getMutator().set(i, parser.readValueAs(Float.class)); + break; + case FLOAT8: + ((Float8Vector) valueVector).getMutator().set(i, parser.readValueAs(Double.class)); + break; + case VARBINARY: + ((VarBinaryVector) valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); + break; + case VARCHAR: + ((VarCharVector) valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); + break; + case DATEDAY: + ((DateDayVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case DATEMILLI: + ((DateMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESEC: + ((TimeSecVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case TIMEMILLI: + ((TimeMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case TIMEMICRO: + ((TimeMicroVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMENANO: + ((TimeNanoVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPSEC: + ((TimeStampSecVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMILLI: + ((TimeStampMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMICRO: + ((TimeStampMicroVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPNANO: + ((TimeStampNanoVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPSECTZ: + ((TimeStampSecTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMILLITZ: + ((TimeStampMilliTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPMICROTZ: + ((TimeStampMicroTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case TIMESTAMPNANOTZ: + ((TimeStampNanoTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + default: + throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType()); } } @Override public void close() throws IOException { parser.close(); - for (Dictionary dictionary: dictionaries.values()) { + for (Dictionary dictionary : dictionaries.values()) { dictionary.getVector().close(); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java index 
befa92251f0f9..a2229cef23150 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.file.json; import java.io.File; @@ -60,12 +61,15 @@ public class JsonFileWriter implements AutoCloseable { public static final class JSONWriteConfig { private final boolean pretty; + private JSONWriteConfig(boolean pretty) { this.pretty = pretty; } + private JSONWriteConfig() { this.pretty = false; } + public JSONWriteConfig pretty(boolean pretty) { return new JSONWriteConfig(pretty); } @@ -98,7 +102,7 @@ public void start(Schema schema, DictionaryProvider provider) throws IOException this.schema = schema; // Store original Schema to ensure batches written match // Convert fields with dictionaries to have dictionary type - for (Field field: schema.getFields()) { + for (Field field : schema.getFields()) { fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed)); } Schema updatedSchema = new Schema(fields, schema.getCustomMetadata()); @@ -117,7 +121,7 @@ public void start(Schema schema, DictionaryProvider provider) throws IOException private void writeDictionaryBatches(JsonGenerator generator, Set dictionaryIdsUsed, DictionaryProvider provider) throws IOException { generator.writeArrayFieldStart("dictionaries"); - for (Long id: dictionaryIdsUsed) { + for (Long id : dictionaryIdsUsed) { generator.writeStartObject(); generator.writeObjectField("id", id); @@ -170,7 +174,7 @@ private void writeVector(Field field, FieldVector vector) throws IOException { ArrowVectorType vectorType = vectorTypes.get(v); BufferBacked innerVector = fieldInnerVectors.get(v); generator.writeArrayFieldStart(vectorType.getName()); - ValueVector valueVector = (ValueVector)innerVector; + ValueVector valueVector = (ValueVector) innerVector; for (int i = 0; i < valueVector.getAccessor().getValueCount(); i++) { writeValueToGenerator(valueVector, i); } @@ -197,37 +201,37 @@ private void writeVector(Field field, FieldVector vector) throws IOException { private void writeValueToGenerator(ValueVector valueVector, int i) throws IOException { switch (valueVector.getMinorType()) { case DATEDAY: - generator.writeNumber(((DateDayVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((DateDayVector) valueVector).getAccessor().get(i)); break; case DATEMILLI: - generator.writeNumber(((DateMilliVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((DateMilliVector) valueVector).getAccessor().get(i)); break; case TIMESEC: - generator.writeNumber(((TimeSecVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeSecVector) valueVector).getAccessor().get(i)); break; case TIMEMILLI: - generator.writeNumber(((TimeMilliVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeMilliVector) valueVector).getAccessor().get(i)); break; case TIMEMICRO: - generator.writeNumber(((TimeMicroVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeMicroVector) valueVector).getAccessor().get(i)); break; case TIMENANO: - generator.writeNumber(((TimeNanoVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeNanoVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPSEC: - 
generator.writeNumber(((TimeStampSecVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampSecVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPMILLI: - generator.writeNumber(((TimeStampMilliVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampMilliVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPMICRO: - generator.writeNumber(((TimeStampMicroVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampMicroVector) valueVector).getAccessor().get(i)); break; case TIMESTAMPNANO: - generator.writeNumber(((TimeStampNanoVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((TimeStampNanoVector) valueVector).getAccessor().get(i)); break; case BIT: - generator.writeNumber(((BitVector)valueVector).getAccessor().get(i)); + generator.writeNumber(((BitVector) valueVector).getAccessor().get(i)); break; case VARBINARY: String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i)); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java index 0f9310da55b79..9f923ee343cdc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; import org.apache.arrow.vector.complex.reader.FieldReader; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java index 83506cdc17549..d8b2317f8ff4f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java @@ -15,9 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; -public final class RepeatedListHolder implements ValueHolder{ +public final class RepeatedListHolder implements ValueHolder { public int start; public int end; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java index 85d782b381835..c400b4dc49a80 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java @@ -15,9 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.holders; -public final class RepeatedMapHolder implements ValueHolder{ +public final class RepeatedMapHolder implements ValueHolder { public int start; public int end; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java index b1b695e58a954..46cf4c8e8c712 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; import org.apache.arrow.vector.complex.reader.FieldReader; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java index 16777c806ec2d..4d012635e548a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.holders; /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java index 4e2e200d67645..d8c9e3001d0a5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import org.apache.arrow.flatbuf.Buffer; @@ -58,19 +59,25 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } ArrowBuffer other = (ArrowBuffer) obj; - if (offset != other.offset) + if (offset != other.offset) { return false; - if (page != other.page) + } + if (page != other.page) { return false; - if (size != other.size) + } + if (size != other.size) { return false; + } return true; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java index 901877b7058cd..635fa3fb42307 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.schema; import com.google.flatbuffers.FlatBufferBuilder; @@ -22,39 +23,48 @@ public class ArrowDictionaryBatch implements ArrowMessage { - private final long dictionaryId; - private final ArrowRecordBatch dictionary; - - public ArrowDictionaryBatch(long dictionaryId, ArrowRecordBatch dictionary) { - this.dictionaryId = dictionaryId; - this.dictionary = dictionary; - } - - public long getDictionaryId() { return dictionaryId; } - public ArrowRecordBatch getDictionary() { return dictionary; } - - @Override - public int writeTo(FlatBufferBuilder builder) { - int dataOffset = dictionary.writeTo(builder); - DictionaryBatch.startDictionaryBatch(builder); - DictionaryBatch.addId(builder, dictionaryId); - DictionaryBatch.addData(builder, dataOffset); - return DictionaryBatch.endDictionaryBatch(builder); - } - - @Override - public int computeBodyLength() { return dictionary.computeBodyLength(); } - - @Override - public T accepts(ArrowMessageVisitor visitor) { return visitor.visit(this); } - - @Override - public String toString() { - return "ArrowDictionaryBatch [dictionaryId=" + dictionaryId + ", dictionary=" + dictionary + "]"; - } - - @Override - public void close() { - dictionary.close(); - } + private final long dictionaryId; + private final ArrowRecordBatch dictionary; + + public ArrowDictionaryBatch(long dictionaryId, ArrowRecordBatch dictionary) { + this.dictionaryId = dictionaryId; + this.dictionary = dictionary; + } + + public long getDictionaryId() { + return dictionaryId; + } + + public ArrowRecordBatch getDictionary() { + return dictionary; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + int dataOffset = dictionary.writeTo(builder); + DictionaryBatch.startDictionaryBatch(builder); + DictionaryBatch.addId(builder, dictionaryId); + DictionaryBatch.addData(builder, dataOffset); + return DictionaryBatch.endDictionaryBatch(builder); + } + + @Override + public int computeBodyLength() { + return dictionary.computeBodyLength(); + } + + @Override + public T accepts(ArrowMessageVisitor visitor) { + return visitor.visit(this); + } + + @Override + public String toString() { + return "ArrowDictionaryBatch [dictionaryId=" + dictionaryId + ", dictionary=" + dictionary + "]"; + } + + @Override + public void close() { + dictionary.close(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java index 72ce982f2e7ee..3ed384ed7e280 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/
+
 package org.apache.arrow.vector.schema;
 
 import org.apache.arrow.flatbuf.FieldNode;
@@ -34,7 +35,7 @@ public ArrowFieldNode(int length, int nullCount) {
 
   @Override
   public int writeTo(FlatBufferBuilder builder) {
-    return FieldNode.createFieldNode(builder, (long)length, (long)nullCount);
+    return FieldNode.createFieldNode(builder, (long) length, (long) nullCount);
   }
 
   public int getNullCount() {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java
index d307428889b0f..f59b4b6c1721e 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java
@@ -15,16 +15,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.arrow.vector.schema;
 
 public interface ArrowMessage extends FBSerializable, AutoCloseable {
 
-  public int computeBodyLength();
+  public int computeBodyLength();
+
+  public <T> T accepts(ArrowMessageVisitor<T> visitor);
 
-  public <T> T accepts(ArrowMessageVisitor<T> visitor);
+  public static interface ArrowMessageVisitor<T> {
+    public T visit(ArrowDictionaryBatch message);
 
-  public static interface ArrowMessageVisitor<T> {
-    public T visit(ArrowDictionaryBatch message);
-    public T visit(ArrowRecordBatch message);
-  }
+    public T visit(ArrowRecordBatch message);
+  }
 }
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java
index 6a716fa138a7a..d2f3782469597 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
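The ArrowMessage interface above is a classic visitor: each concrete message implements accepts(...) by calling the visit overload for its own type, so readers can branch on message kind without instanceof checks. A condensed sketch of the double dispatch (simplified stand-in names, not the Arrow classes):

interface Message {
  <T> T accepts(Visitor<T> visitor);
}

interface Visitor<T> {
  T visit(RecordMsg message);

  T visit(DictionaryMsg message);
}

class RecordMsg implements Message {
  // Overload resolution picks visit(RecordMsg) because 'this' has that static type.
  public <T> T accepts(Visitor<T> visitor) {
    return visitor.visit(this);
  }
}

class DictionaryMsg implements Message {
  public <T> T accepts(Visitor<T> visitor) {
    return visitor.visit(this);
  }
}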
*/ + package org.apache.arrow.vector.schema; import static org.apache.arrow.vector.schema.FBSerializables.writeAllStructsToVector; @@ -36,10 +37,14 @@ public class ArrowRecordBatch implements ArrowMessage { private static final Logger LOGGER = LoggerFactory.getLogger(ArrowRecordBatch.class); - /** number of records */ + /** + * number of records + */ private final int length; - /** Nodes correspond to the pre-ordered flattened logical schema */ + /** + * Nodes correspond to the pre-ordered flattened logical schema + */ private final List nodes; private final List buffers; @@ -53,8 +58,8 @@ public ArrowRecordBatch(int length, List nodes, List b } /** - * @param length how many rows in this batch - * @param nodes field level info + * @param length how many rows in this batch + * @param nodes field level info * @param buffers will be retained until this recordBatch is closed */ public ArrowRecordBatch(int length, List nodes, List buffers, boolean alignBuffers) { @@ -119,7 +124,9 @@ public int writeTo(FlatBufferBuilder builder) { } @Override - public T accepts(ArrowMessageVisitor visitor) { return visitor.visit(this); } + public T accepts(ArrowMessageVisitor visitor) { + return visitor.visit(this); + } /** * releases the buffers diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java index 68da7052f2b8b..9d2fdfaafe4aa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import java.util.Map; @@ -34,10 +35,11 @@ public class ArrowVectorType { public static final ArrowVectorType TYPE = new ArrowVectorType(VectorType.TYPE); private static final Map typeByName; + static { - ArrowVectorType[] types = { DATA, OFFSET, VALIDITY, TYPE }; + ArrowVectorType[] types = {DATA, OFFSET, VALIDITY, TYPE}; Builder builder = ImmutableMap.builder(); - for (ArrowVectorType type: types) { + for (ArrowVectorType type : types) { builder.put(type.getName(), type); } typeByName = builder.build(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java index d23ed91948e5d..91d60ea995b89 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import com.google.flatbuffers.FlatBufferBuilder; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java index 31c17ad6df02b..ae5aa555e745e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.schema; import java.util.ArrayList; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 24840ec988ac3..29407bf1ab4e1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.schema; import static java.util.Arrays.asList; @@ -61,11 +62,13 @@ public class TypeLayout { public static TypeLayout getTypeLayout(final ArrowType arrowType) { TypeLayout layout = arrowType.accept(new ArrowTypeVisitor() { - @Override public TypeLayout visit(Int type) { + @Override + public TypeLayout visit(Int type) { return newFixedWidthTypeLayout(dataVector(type.getBitWidth())); } - @Override public TypeLayout visit(Union type) { + @Override + public TypeLayout visit(Union type) { List vectors; switch (type.getMode()) { case Dense: @@ -74,12 +77,12 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { validityVector(), typeVector(), offsetVector() // offset to find the vector - ); + ); break; case Sparse: vectors = asList( typeVector() // type of the value at the index or 0 if null - ); + ); break; default: throw new UnsupportedOperationException("Unsupported Union Mode: " + type.getMode()); @@ -87,64 +90,73 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { return new TypeLayout(vectors); } - @Override public TypeLayout visit(Struct type) { + @Override + public TypeLayout visit(Struct type) { List vectors = asList( validityVector() - ); + ); return new TypeLayout(vectors); } - @Override public TypeLayout visit(Timestamp type) { + @Override + public TypeLayout visit(Timestamp type) { return newFixedWidthTypeLayout(dataVector(64)); } - @Override public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + @Override + public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { List vectors = asList( validityVector(), offsetVector() - ); + ); return new TypeLayout(vectors); } - @Override public TypeLayout visit(FixedSizeList type) { + @Override + public TypeLayout visit(FixedSizeList type) { List vectors = asList( validityVector() - ); + ); return new TypeLayout(vectors); } - @Override public TypeLayout visit(FloatingPoint type) { + @Override + public TypeLayout visit(FloatingPoint type) { int bitWidth; switch (type.getPrecision()) { - case HALF: - bitWidth = 16; - break; - case SINGLE: - bitWidth = 32; - break; - case DOUBLE: - bitWidth = 64; - break; - default: - throw new UnsupportedOperationException("Unsupported Precision: " + type.getPrecision()); + case HALF: + bitWidth = 16; + break; + case SINGLE: + bitWidth = 32; + break; + case DOUBLE: + bitWidth = 64; + break; + default: + throw new UnsupportedOperationException("Unsupported Precision: " + type.getPrecision()); } return newFixedWidthTypeLayout(dataVector(bitWidth)); } - @Override public TypeLayout visit(Decimal type) { + @Override + public TypeLayout visit(Decimal type) { // TODO: check size return newFixedWidthTypeLayout(dataVector(64)); // actually depends on the type fields } - @Override public TypeLayout visit(Bool type) { + @Override + public TypeLayout visit(Bool type) { return newFixedWidthTypeLayout(booleanVector()); } - @Override public TypeLayout 
visit(Binary type) {
+      @Override
+      public TypeLayout visit(Binary type) {
         return newVariableWidthTypeLayout();
       }
 
-      @Override public TypeLayout visit(Utf8 type) {
+      @Override
+      public TypeLayout visit(Utf8 type) {
         return newVariableWidthTypeLayout();
       }
 
@@ -178,12 +190,12 @@ public TypeLayout visit(Time type) {
       @Override public TypeLayout visit(Interval type) { // TODO: check size
         switch (type.getUnit()) {
-        case DAY_TIME:
-          return newFixedWidthTypeLayout(dataVector(64));
-        case YEAR_MONTH:
-          return newFixedWidthTypeLayout(dataVector(64));
-        default:
-          throw new UnsupportedOperationException("Unknown unit " + type.getUnit());
+          case DAY_TIME:
+            return newFixedWidthTypeLayout(dataVector(64));
+          case YEAR_MONTH:
+            return newFixedWidthTypeLayout(dataVector(64));
+          default:
+            throw new UnsupportedOperationException("Unknown unit " + type.getUnit());
         }
       }
 
@@ -228,12 +240,15 @@ public int hashCode() {
 
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (obj == null)
+    }
+    if (obj == null) {
       return false;
-    if (getClass() != obj.getClass())
+    }
+    if (getClass() != obj.getClass()) {
       return false;
+    }
     TypeLayout other = (TypeLayout) obj;
     return vectors.equals(other.vectors);
   }
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java
index 2073795b2a199..0871baf38edaa 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.arrow.vector.schema;
 
 import static org.apache.arrow.vector.schema.ArrowVectorType.DATA;
@@ -48,16 +49,16 @@ public static VectorLayout offsetVector() {
 
   public static VectorLayout dataVector(int typeBitWidth) {
     switch (typeBitWidth) {
-    case 8:
-      return VALUES_8;
-    case 16:
-      return VALUES_16;
-    case 32:
-      return VALUES_32;
-    case 64:
-      return VALUES_64;
-    default:
-      throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported");
+      case 8:
+        return VALUES_8;
+      case 16:
+        return VALUES_16;
+      case 32:
+        return VALUES_32;
+      case 64:
+        return VALUES_64;
+      default:
+        throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported");
     }
   }
 
@@ -81,7 +82,7 @@ public static VectorLayout byteVector() {
   private VectorLayout(@JsonProperty("type") ArrowVectorType type, @JsonProperty("typeBitWidth") int typeBitWidth) {
     super();
     this.type = Preconditions.checkNotNull(type);
-    this.typeBitWidth = (short)typeBitWidth;
+    this.typeBitWidth = (short) typeBitWidth;
     if (typeBitWidth <= 0) {
       throw new IllegalArgumentException("bitWidth invalid: " + typeBitWidth);
     }
@@ -111,18 +112,21 @@ public int hashCode() {
 
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (obj == null)
+    }
+    if (obj == null) {
       return false;
-    if (getClass() != obj.getClass())
+    }
+    if (getClass() != obj.getClass()) {
       return false;
+    }
     VectorLayout other = (VectorLayout) obj;
     return type.equals(other.type) && (typeBitWidth == other.typeBitWidth);
   }
 
   @Override
-  public int writeTo(FlatBufferBuilder builder) {;
+  public int writeTo(FlatBufferBuilder builder) {
     return org.apache.arrow.flatbuf.VectorLayout.createVectorLayout(builder, typeBitWidth, type.getType());
   }
 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java
index 641978a516ae4..5b6300076b6c2 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.arrow.vector.stream;
 
 import java.io.IOException;
@@ -33,31 +34,33 @@
  */
 public class ArrowStreamReader extends ArrowReader {
 
-  /**
-   * Constructs a streaming read, reading bytes from 'in'. Non-blocking.
-   * @param in the stream to read from
-   * @param allocator to allocate new buffers
-   */
-  public ArrowStreamReader(ReadableByteChannel in, BufferAllocator allocator) {
-    super(new ReadChannel(in), allocator);
-  }
-
-  public ArrowStreamReader(InputStream in, BufferAllocator allocator) {
-    this(Channels.newChannel(in), allocator);
-  }
-
-  /**
-   * Reads the schema message from the beginning of the stream.
-   * @param in to allocate new buffers
-   * @return the deserialized arrow schema
-   */
-  @Override
-  protected Schema readSchema(ReadChannel in) throws IOException {
-    return MessageSerializer.deserializeSchema(in);
-  }
-
-  @Override
-  protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException {
-    return MessageSerializer.deserializeMessageBatch(in, allocator);
-  }
+  /**
+   * Constructs a streaming read, reading bytes from 'in'. Non-blocking.
+   *
+   * @param in the stream to read from
+   * @param allocator to allocate new buffers
+   */
+  public ArrowStreamReader(ReadableByteChannel in, BufferAllocator allocator) {
+    super(new ReadChannel(in), allocator);
+  }
+
+  public ArrowStreamReader(InputStream in, BufferAllocator allocator) {
+    this(Channels.newChannel(in), allocator);
+  }
+
+  /**
+   * Reads the schema message from the beginning of the stream.
+   *
+   * @param in the channel to read the schema from
+   * @return the deserialized arrow schema
+   */
+  @Override
+  protected Schema readSchema(ReadChannel in) throws IOException {
+    return MessageSerializer.deserializeSchema(in);
+  }
+
+  @Override
+  protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException {
+    return MessageSerializer.deserializeMessageBatch(in, allocator);
+  }
 }
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java
index ea29cd99804c8..b854cd2bb6e74 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
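Worth noting about the reader above and the writer that follows: the stream protocol frames every message with a 4-byte little-endian length, and the writer terminates the stream by writing a zero length (the writeIntLittleEndian(0) in endInternal below). A dependency-free sketch of reading that framing (the readPrefix helper is hypothetical, not the Arrow ReadChannel API):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class FramingSketch {
  // Reads the 4-byte little-endian length prefix; -1 signals a truncated stream.
  static int readPrefix(InputStream in) throws IOException {
    byte[] prefix = new byte[4];
    if (in.read(prefix) != 4) {
      return -1;
    }
    return ByteBuffer.wrap(prefix).order(ByteOrder.LITTLE_ENDIAN).getInt();
  }

  public static void main(String[] args) throws IOException {
    // One 2-byte message body, then the 0-length end-of-stream marker.
    byte[] stream = {2, 0, 0, 0, 0x11, 0x22, 0, 0, 0, 0};
    InputStream in = new ByteArrayInputStream(stream);
    int length = readPrefix(in);        // 2
    in.skip(length);                    // a real reader would parse the flatbuffer body
    System.out.println(length);
    System.out.println(readPrefix(in)); // 0 -> end of stream
  }
}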
+
 package org.apache.arrow.vector.stream;
 
 import org.apache.arrow.memory.BufferAllocator;
@@ -35,22 +36,23 @@
 
 public class ArrowStreamWriter extends ArrowWriter {
 
-  public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, OutputStream out) {
-    this(root, provider, Channels.newChannel(out));
-  }
+  public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, OutputStream out) {
+    this(root, provider, Channels.newChannel(out));
+  }
 
-  public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) {
-    super(root, provider, out);
-  }
+  public ArrowStreamWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) {
+    super(root, provider, out);
+  }
 
-  @Override
-  protected void startInternal(WriteChannel out) throws IOException {}
+  @Override
+  protected void startInternal(WriteChannel out) throws IOException {
+  }
 
-  @Override
-  protected void endInternal(WriteChannel out,
-                             Schema schema,
-                             List<ArrowBlock> dictionaries,
-                             List<ArrowBlock> records) throws IOException {
-    out.writeIntLittleEndian(0);
-  }
+  @Override
+  protected void endInternal(WriteChannel out,
+                             Schema schema,
+                             List<ArrowBlock> dictionaries,
+                             List<ArrowBlock> records) throws IOException {
+    out.writeIntLittleEndian(0);
+  }
 }
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java
index 05ad92ded1d52..a70d029389427 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.arrow.vector.stream;
 
 import java.io.IOException;
@@ -48,29 +49,30 @@
  * Utility class for serializing Messages. Messages are all serialized in a similar way.
  * 1. 4 byte little endian message header prefix
  * 2. FB serialized Message: This includes the body length, which is the serialized
- *    body and the type of the message.
+ * body and the type of the message.
  * 3. Serialized message.
  *
  * For schema messages, the serialization is simply the FB serialized Schema.
  *
  * For RecordBatch messages the serialization is:
- *   1. 4 byte little endian batch metadata header
- *   2. FB serialized RowBatch
- *   3. Padding to align to 8 byte boundary.
- *   4. serialized RowBatch buffers.
+ * 1. 4 byte little endian batch metadata header
+ * 2. FB serialized RowBatch
+ * 3. Padding to align to 8 byte boundary.
+ * 4. serialized RowBatch buffers.
  */
 public class MessageSerializer {
 
   public static int bytesToInt(byte[] bytes) {
     return ((bytes[3] & 255) << 24) +
-        ((bytes[2] & 255) << 16) +
-        ((bytes[1] & 255) << 8) +
-        ((bytes[0] & 255) << 0);
+            ((bytes[2] & 255) << 16) +
+            ((bytes[1] & 255) << 8) +
+            ((bytes[0] & 255) << 0);
   }
 
   /**
    * Serialize a schema object.
-   * @param out where to write the schema
+   *
+   * @param out where to write the schema
    * @param schema the object to serialize to out
    * @return the resulting size of the serialized schema
    * @throws IOException if something went wrong
@@ -86,6 +88,7 @@ public static long serialize(WriteChannel out, Schema schema) throws IOException
 
   /**
    * Deserializes a schema object. Format is from serialize().
+   *
    * @param in the channel to deserialize from
    * @return the deserialized object
    * @throws IOException if something went wrong
@@ -106,13 +109,14 @@ public static Schema deserializeSchema(ReadChannel in) throws IOException {
 
   /**
    * Serializes an ArrowRecordBatch. Returns the offset and length of the written batch.
-   * @param out where to write the batch
+   *
+   * @param out where to write the batch
    * @param batch the object to serialize to out
    * @return the serialized block metadata
    * @throws IOException if something went wrong
    */
   public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch)
-      throws IOException {
+          throws IOException {
     long start = out.getCurrentPosition();
     int bodyLength = batch.computeBodyLength();
@@ -125,9 +129,9 @@ public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch)
     int metadataLength = serializedMessage.remaining();
 
     // calculate alignment bytes so that metadata length points to the correct location after alignment
-    int padding = (int)((start + metadataLength + 4) % 8);
+    int padding = (int) ((start + metadataLength + 4) % 8);
     if (padding != 0) {
-      metadataLength += (8 - padding);
+      metadataLength += (8 - padding);
     }
 
     out.writeIntLittleEndian(metadataLength);
@@ -152,7 +156,7 @@ public static long writeBatchBuffers(WriteChannel out, ArrowRecordBatch batch) t
       ArrowBuffer layout = buffersLayout.get(i);
       long startPosition = bufferStart + layout.getOffset();
       if (startPosition != out.getCurrentPosition()) {
-        out.writeZeros((int)(startPosition - out.getCurrentPosition()));
+        out.writeZeros((int) (startPosition - out.getCurrentPosition()));
       }
       out.write(buffer);
       if (out.getCurrentPosition() != startPosition + layout.getSize()) {
@@ -165,9 +169,10 @@ public static long writeBatchBuffers(WriteChannel out, ArrowRecordBatch batch) t
 
   /**
    * Deserializes a RecordBatch
-   * @param in the channel to deserialize from
+   *
+   * @param in the channel to deserialize from
    * @param message the object to deserialize to
-   * @param alloc to allocate buffers
+   * @param alloc to allocate buffers
    * @return the deserialized object
    * @throws IOException if something went wrong
    */
@@ -188,14 +193,15 @@ public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, Message me
 
   /**
    * Deserializes a RecordBatch knowing the size of the entire message up front. This
    * minimizes the number of reads to the underlying stream.
-   * @param in the channel to deserialize from
+   *
+   * @param in the channel to deserialize from
    * @param block the object to deserialize to
    * @param alloc to allocate buffers
    * @return the deserialized object
    * @throws IOException if something went wrong
    */
   public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, ArrowBlock block,
-      BufferAllocator alloc) throws IOException {
+                                                         BufferAllocator alloc) throws IOException {
     // Metadata length contains integer prefix plus byte padding
     long totalLen = block.getMetadataLength() + block.getBodyLength();
 
@@ -223,37 +229,38 @@ public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, ArrowBlock
 
   // Deserializes a record batch given the Flatbuffer metadata and in-memory body
   public static ArrowRecordBatch deserializeRecordBatch(RecordBatch recordBatchFB,
-      ArrowBuf body) throws IOException {
+                                                        ArrowBuf body) throws IOException {
     // Now read the body
     int nodesLength = recordBatchFB.nodesLength();
     List<ArrowFieldNode> nodes = new ArrayList<>();
     for (int i = 0; i < nodesLength; ++i) {
       FieldNode node = recordBatchFB.nodes(i);
-      if ((int)node.length() != node.length() ||
-          (int)node.nullCount() != node.nullCount()) {
+      if ((int) node.length() != node.length() ||
+          (int) node.nullCount() != node.nullCount()) {
         throw new IOException("Cannot currently deserialize record batches with " +
-            "node length larger than Int.MAX_VALUE");
+                "node length larger than Int.MAX_VALUE");
       }
-      nodes.add(new ArrowFieldNode((int)node.length(), (int)node.nullCount()));
+      nodes.add(new ArrowFieldNode((int) node.length(), (int) node.nullCount()));
     }
     List<ArrowBuf> buffers = new ArrayList<>();
     for (int i = 0; i < recordBatchFB.buffersLength(); ++i) {
       Buffer bufferFB = recordBatchFB.buffers(i);
-      ArrowBuf vectorBuffer = body.slice((int)bufferFB.offset(), (int)bufferFB.length());
+      ArrowBuf vectorBuffer = body.slice((int) bufferFB.offset(), (int) bufferFB.length());
       buffers.add(vectorBuffer);
     }
-    if ((int)recordBatchFB.length() != recordBatchFB.length()) {
+    if ((int) recordBatchFB.length() != recordBatchFB.length()) {
       throw new IOException("Cannot currently deserialize record batches over 2GB");
     }
     ArrowRecordBatch arrowRecordBatch =
-        new ArrowRecordBatch((int)recordBatchFB.length(), nodes, buffers);
+        new ArrowRecordBatch((int) recordBatchFB.length(), nodes, buffers);
     body.release();
     return arrowRecordBatch;
   }
 
   /**
    * Serializes a dictionary ArrowRecordBatch. Returns the offset and length of the written batch.
-   * @param out where to serialize
+   *
+   * @param out where to serialize
    * @param batch the batch to serialize
    * @return the metadata of the serialized block
    * @throws IOException if something went wrong
@@ -290,15 +297,16 @@ public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch)
 
   /**
    * Deserializes a DictionaryBatch
-   * @param in where to read from
+   *
+   * @param in where to read from
    * @param message the message metadata to deserialize
-   * @param alloc the allocator for new buffers
+   * @param alloc the allocator for new buffers
    * @return the corresponding dictionary batch
    * @throws IOException if something went wrong
    */
   public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in,
-      Message message,
-      BufferAllocator alloc) throws IOException {
+                                                                Message message,
+                                                                BufferAllocator alloc) throws IOException {
     DictionaryBatch dictionaryBatchFB = (DictionaryBatch) message.header(new DictionaryBatch());
 
     int bodyLength = (int) message.bodyLength();
@@ -315,7 +323,8 @@ public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in,
   /**
    * Deserializes a DictionaryBatch knowing the size of the entire message up front. This
    * minimizes the number of reads to the underlying stream.
-   * @param in where to read from
+   *
+   * @param in where to read from
    * @param block block metadata for deserializing
    * @param alloc to allocate new buffers
    * @return the corresponding dictionary
@@ -345,7 +354,7 @@ public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in,
 
     // Now read the body
     final ArrowBuf body = buffer.slice(block.getMetadataLength(),
-        (int) totalLen - block.getMetadataLength());
+            (int) totalLen - block.getMetadataLength());
     ArrowRecordBatch recordBatch = deserializeRecordBatch(dictionaryBatchFB.data(), body);
     return new ArrowDictionaryBatch(dictionaryBatchFB.id(), recordBatch);
   }
@@ -359,22 +368,26 @@ public static ArrowMessage deserializeMessageBatch(ReadChannel in, BufferAllocat
     }
 
     switch (message.headerType()) {
-      case MessageHeader.RecordBatch: return deserializeRecordBatch(in, message, alloc);
-      case MessageHeader.DictionaryBatch: return deserializeDictionaryBatch(in, message, alloc);
-      default: throw new IOException("Unexpected message header type " + message.headerType());
+      case MessageHeader.RecordBatch:
+        return deserializeRecordBatch(in, message, alloc);
+      case MessageHeader.DictionaryBatch:
+        return deserializeDictionaryBatch(in, message, alloc);
+      default:
+        throw new IOException("Unexpected message header type " + message.headerType());
     }
   }
 
   /**
    * Serializes a message header.
-   * @param builder to write the flatbuf to
-   * @param headerType headerType field
+   *
+   * @param builder to write the flatbuf to
+   * @param headerType headerType field
    * @param headerOffset header offset field
-   * @param bodyLength body length field
+   * @param bodyLength body length field
    * @return the corresponding ByteBuffer
    */
   public static ByteBuffer serializeMessage(FlatBufferBuilder builder, byte headerType,
-      int headerOffset, int bodyLength) {
+                                            int headerOffset, int bodyLength) {
     Message.startMessage(builder);
     Message.addHeaderType(builder, headerType);
     Message.addHeader(builder, headerOffset);
@@ -387,9 +400,13 @@ public static ByteBuffer serializeMessage(FlatBufferBuilder builder, byte header
 
   private static Message deserializeMessage(ReadChannel in) throws IOException {
     // Read the message size. There is an i32 little endian prefix.
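    // (Illustrative aside, kept as comments so it reads in place: the prefix is
    // little-endian, so a 300-byte message arrives as the bytes 0x2C 0x01 0x00 0x00,
    // and bytesToInt() above reassembles them as
    // (0x00 << 24) + (0x00 << 16) + (0x01 << 8) + 0x2C == 300.)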
ByteBuffer buffer = ByteBuffer.allocate(4); - if (in.readFully(buffer) != 4) return null; + if (in.readFully(buffer) != 4) { + return null; + } int messageLength = bytesToInt(buffer.array()); - if (messageLength == 0) return null; + if (messageLength == 0) { + return null; + } buffer = ByteBuffer.allocate(messageLength); if (in.readFully(buffer) != messageLength) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java b/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java index e5beebffde9e4..003d3cdbceb3b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/DateUnit.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types; public enum DateUnit { @@ -22,6 +23,7 @@ public enum DateUnit { MILLISECOND(org.apache.arrow.flatbuf.DateUnit.MILLISECOND); private static final DateUnit[] valuesByFlatbufId = new DateUnit[DateUnit.values().length]; + static { for (DateUnit v : DateUnit.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java b/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java index 3206969fb7ead..ec253287b261d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/FloatingPointPrecision.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types; import org.apache.arrow.flatbuf.Precision; @@ -25,6 +26,7 @@ public enum FloatingPointPrecision { DOUBLE(Precision.DOUBLE); private static final FloatingPointPrecision[] valuesByFlatbufId = new FloatingPointPrecision[FloatingPointPrecision.values().length]; + static { for (FloatingPointPrecision v : FloatingPointPrecision.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java b/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java index b3ddf1fe497de..a8157f19e1a69 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/IntervalUnit.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
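Stepping back to the serialize() path shown earlier: the metadata length is grown so the record batch body starts on an 8-byte boundary, matching the "Padding to align to 8 byte boundary" step in the class comment. A self-contained sketch of that rule (no Arrow dependency; mirrors the arithmetic in the diff):

public class AlignmentSketch {
  // Grow metadataLength so that start + 4 (length prefix) + metadataLength
  // lands on an 8-byte boundary, as MessageSerializer.serialize() does.
  static int alignedMetadataLength(long start, int metadataLength) {
    int padding = (int) ((start + metadataLength + 4) % 8);
    return padding == 0 ? metadataLength : metadataLength + (8 - padding);
  }

  public static void main(String[] args) {
    System.out.println(alignedMetadataLength(0, 10)); // 12: 0 + 4 + 12 = 16
    System.out.println(alignedMetadataLength(8, 4));  // 4: already aligned
  }
}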
******************************************************************************/ + package org.apache.arrow.vector.types; public enum IntervalUnit { @@ -22,6 +23,7 @@ public enum IntervalUnit { DAY_TIME(org.apache.arrow.flatbuf.IntervalUnit.DAY_TIME); private static final IntervalUnit[] valuesByFlatbufId = new IntervalUnit[IntervalUnit.values().length]; + static { for (IntervalUnit v : IntervalUnit.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java b/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java index cea9866965854..1da9321fcc4ee 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/TimeUnit.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types; public enum TimeUnit { @@ -24,6 +25,7 @@ public enum TimeUnit { NANOSECOND(org.apache.arrow.flatbuf.TimeUnit.NANOSECOND); private static final TimeUnit[] valuesByFlatbufId = new TimeUnit[TimeUnit.values().length]; + static { for (TimeUnit v : TimeUnit.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 6591a4b16da16..c57dd6dafe9e6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.types; import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; @@ -531,73 +532,83 @@ public final ArrowType getType() { public static MinorType getMinorTypeForArrowType(ArrowType arrowType) { return arrowType.accept(new ArrowTypeVisitor() { - @Override public MinorType visit(Null type) { + @Override + public MinorType visit(Null type) { return MinorType.NULL; } - @Override public MinorType visit(Struct type) { + @Override + public MinorType visit(Struct type) { return MinorType.MAP; } - @Override public MinorType visit(List type) { + @Override + public MinorType visit(List type) { return MinorType.LIST; } - @Override public MinorType visit(FixedSizeList type) { + @Override + public MinorType visit(FixedSizeList type) { return MinorType.FIXED_SIZE_LIST; } - @Override public MinorType visit(Union type) { + @Override + public MinorType visit(Union type) { return MinorType.UNION; } @Override public MinorType visit(Int type) { switch (type.getBitWidth()) { - case 8: - return type.getIsSigned() ? MinorType.TINYINT : MinorType.UINT1; - case 16: - return type.getIsSigned() ? MinorType.SMALLINT : MinorType.UINT2; - case 32: - return type.getIsSigned() ? MinorType.INT : MinorType.UINT4; - case 64: - return type.getIsSigned() ? MinorType.BIGINT : MinorType.UINT8; - default: - throw new IllegalArgumentException("only 8, 16, 32, 64 supported: " + type); + case 8: + return type.getIsSigned() ? MinorType.TINYINT : MinorType.UINT1; + case 16: + return type.getIsSigned() ? MinorType.SMALLINT : MinorType.UINT2; + case 32: + return type.getIsSigned() ? MinorType.INT : MinorType.UINT4; + case 64: + return type.getIsSigned() ? 
MinorType.BIGINT : MinorType.UINT8; + default: + throw new IllegalArgumentException("only 8, 16, 32, 64 supported: " + type); } } @Override public MinorType visit(FloatingPoint type) { switch (type.getPrecision()) { - case HALF: - throw new UnsupportedOperationException("NYI: " + type); - case SINGLE: - return MinorType.FLOAT4; - case DOUBLE: - return MinorType.FLOAT8; - default: - throw new IllegalArgumentException("unknown precision: " + type); + case HALF: + throw new UnsupportedOperationException("NYI: " + type); + case SINGLE: + return MinorType.FLOAT4; + case DOUBLE: + return MinorType.FLOAT8; + default: + throw new IllegalArgumentException("unknown precision: " + type); } } - @Override public MinorType visit(Utf8 type) { + @Override + public MinorType visit(Utf8 type) { return MinorType.VARCHAR; } - @Override public MinorType visit(Binary type) { + @Override + public MinorType visit(Binary type) { return MinorType.VARBINARY; } - @Override public MinorType visit(Bool type) { + @Override + public MinorType visit(Bool type) { return MinorType.BIT; } - @Override public MinorType visit(Decimal type) { + @Override + public MinorType visit(Decimal type) { return MinorType.DECIMAL; } - @Override public MinorType visit(Date type) { + @Override + public MinorType visit(Date type) { switch (type.getUnit()) { case DAY: return MinorType.DATEDAY; @@ -608,7 +619,8 @@ public MinorType visit(FloatingPoint type) { } } - @Override public MinorType visit(Time type) { + @Override + public MinorType visit(Time type) { switch (type.getUnit()) { case SECOND: return MinorType.TIMESEC; @@ -623,7 +635,8 @@ public MinorType visit(FloatingPoint type) { } } - @Override public MinorType visit(Timestamp type) { + @Override + public MinorType visit(Timestamp type) { String tz = type.getTimezone(); switch (type.getUnit()) { case SECOND: @@ -642,12 +655,12 @@ public MinorType visit(FloatingPoint type) { @Override public MinorType visit(Interval type) { switch (type.getUnit()) { - case DAY_TIME: - return MinorType.INTERVALDAY; - case YEAR_MONTH: - return MinorType.INTERVALYEAR; - default: - throw new IllegalArgumentException("unknown unit: " + type); + case DAY_TIME: + return MinorType.INTERVALDAY; + case YEAR_MONTH: + return MinorType.INTERVALYEAR; + default: + throw new IllegalArgumentException("unknown unit: " + type); } } }); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java b/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java index 8e957bc0b6e34..231e85fb2c726 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/UnionMode.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
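The enum translation diffs in this patch (DateUnit, FloatingPointPrecision, IntervalUnit, TimeUnit, and UnionMode below) all share one idiom: a static array indexed by the Flatbuffers id, filled once in a static initializer, so id-to-enum lookup is a plain array access. A condensed sketch (illustrative names and ids, not the Arrow enums):

public enum Mode {
  SPARSE((short) 0),
  DENSE((short) 1);

  private static final Mode[] valuesByFlatbufId = new Mode[Mode.values().length];

  static {
    // Enum constants exist before this block runs, so values() is safe here.
    for (Mode v : Mode.values()) {
      valuesByFlatbufId[v.flatbufId] = v;
    }
  }

  private final short flatbufId;

  Mode(short flatbufId) {
    this.flatbufId = flatbufId;
  }

  public static Mode fromFlatbufId(short id) {
    return valuesByFlatbufId[id];
  }
}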
******************************************************************************/ + package org.apache.arrow.vector.types; public enum UnionMode { @@ -22,6 +23,7 @@ public enum UnionMode { Dense(org.apache.arrow.flatbuf.UnionMode.Dense); private static final UnionMode[] valuesByFlatbufId = new UnionMode[UnionMode.values().length]; + static { for (UnionMode v : UnionMode.values()) { valuesByFlatbufId[v.flatbufID] = v; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java index 8a0224d5564b0..2657532fa9be6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/DictionaryEncoding.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ + package org.apache.arrow.vector.types.pojo; import java.util.Objects; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index a8f2ae5fbab8c..48e71a976c0e8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.types.pojo; @@ -166,7 +167,7 @@ public int getField(FlatBufferBuilder builder) { int layoutOffset = org.apache.arrow.flatbuf.Field.createLayoutVector(builder, buffersData); int[] metadataOffsets = new int[getMetadata().size()]; Iterator> metadataIterator = getMetadata().entrySet().iterator(); - for (int i = 0; i < metadataOffsets.length; i ++) { + for (int i = 0; i < metadataOffsets.length; i++) { Entry kv = metadataIterator.next(); int keyOffset = builder.createString(kv.getKey()); int valueOffset = builder.createString(kv.getValue()); @@ -239,11 +240,11 @@ public boolean equals(Object obj) { } Field that = (Field) obj; return Objects.equals(this.name, that.name) && - Objects.equals(this.isNullable(), that.isNullable()) && - Objects.equals(this.getType(), that.getType()) && - Objects.equals(this.getDictionary(), that.getDictionary()) && - Objects.equals(this.getMetadata(), that.getMetadata()) && - Objects.equals(this.children, that.children); + Objects.equals(this.isNullable(), that.isNullable()) && + Objects.equals(this.getType(), that.getType()) && + Objects.equals(this.getDictionary(), that.getDictionary()) && + Objects.equals(this.getMetadata(), that.getMetadata()) && + Objects.equals(this.children, that.children); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java index c8fc689cd2c9c..f0856198a4c79 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/FieldType.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.types.pojo; import static com.google.common.base.Preconditions.checkNotNull; @@ -55,12 +56,15 @@ public FieldType(boolean nullable, ArrowType type, DictionaryEncoding dictionary public boolean isNullable() { return nullable; } + public ArrowType getType() { return type; } + public DictionaryEncoding getDictionary() { return dictionary; } + public Map getMetadata() { return metadata; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java index 82e2ef55c20c6..a87d4490060df 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.types.pojo; @@ -53,7 +54,7 @@ public class Schema { /** * @param fields the list of the fields - * @param name the name of the field to return + * @param name the name of the field to return * @return the corresponding field * @throws IllegalArgumentException if the field was not found */ @@ -146,7 +147,7 @@ public int getSchema(FlatBufferBuilder builder) { int fieldsOffset = org.apache.arrow.flatbuf.Schema.createFieldsVector(builder, fieldOffsets); int[] metadataOffsets = new int[metadata.size()]; Iterator> metadataIterator = metadata.entrySet().iterator(); - for (int i = 0; i < metadataOffsets.length; i ++) { + for (int i = 0; i < metadataOffsets.length; i++) { Entry kv = metadataIterator.next(); int keyOffset = builder.createString(kv.getKey()); int valueOffset = builder.createString(kv.getValue()); @@ -174,7 +175,7 @@ public boolean equals(Object obj) { return false; } return Objects.equals(this.fields, ((Schema) obj).fields) && - Objects.equals(this.metadata, ((Schema) obj).metadata); + Objects.equals(this.metadata, ((Schema) obj).metadata); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java index 69840fefa968b..80d4a4684c512 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteArrayReadableSeekableByteChannel.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.io.IOException; @@ -58,7 +59,7 @@ public long position() throws IOException { @Override public SeekableByteChannel position(final long newPosition) throws IOException { - this.position = (int)newPosition; + this.position = (int) newPosition; return this; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index 68b9fb25f2112..5fe556a552714 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/
+
 package org.apache.arrow.vector.util;
 
 import io.netty.buffer.ArrowBuf;
@@ -31,15 +32,15 @@ public class ByteFunctionHelpers {
 
   /**
    * Helper function to check for equality of bytes in two ArrowBufs
    *
-   * @param left Left ArrowBuf for comparison
+   * @param left   Left ArrowBuf for comparison
    * @param lStart start offset in the buffer
-   * @param lEnd end offset in the buffer
-   * @param right Right ArrowBuf for comparison
+   * @param lEnd   end offset in the buffer
+   * @param right  Right ArrowBuf for comparison
    * @param rStart start offset in the buffer
-   * @param rEnd end offset in the buffer
+   * @param rEnd   end offset in the buffer
    * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise
    */
-  public static final int equal(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd){
+  public static final int equal(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd) {
     if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
       left.checkBytes(lStart, lEnd);
       right.checkBytes(rStart, rEnd);
@@ -48,7 +49,7 @@ public static final int equal(final ArrowBuf left, int lStart, int lEnd, final A
   }
 
   private static final int memEqual(final long laddr, int lStart, int lEnd, final long raddr, int rStart,
-      final int rEnd) {
+                                    final int rEnd) {
 
     int n = lEnd - lStart;
     if (n == rEnd - rStart) {
@@ -85,15 +86,15 @@ private static final int memEqual(final long laddr, int lStart, int lEnd, final
    *
    * Function will check data bounds before completing in the case that bounds checking is enabled
    *
-   * @param left Left ArrowBuf to compare
+   * @param left   Left ArrowBuf to compare
    * @param lStart start offset in the buffer
-   * @param lEnd end offset in the buffer
-   * @param right Right ArrowBuf to compare
+   * @param lEnd   end offset in the buffer
+   * @param right  Right ArrowBuf to compare
    * @param rStart start offset in the buffer
-   * @param rEnd end offset in the buffer
+   * @param rEnd   end offset in the buffer
    * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise
    */
-  public static final int compare(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd){
+  public static final int compare(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd) {
     if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
       left.checkBytes(lStart, lEnd);
       right.checkBytes(rStart, rEnd);
@@ -140,12 +141,12 @@ private static final int memcmp(final long laddr, int lStart, int lEnd, final lo
 
   /**
    * Helper function to compare a set of bytes in ArrowBuf to a ByteArray.
* - * @param left Left ArrowBuf for comparison purposes + * @param left Left ArrowBuf for comparison purposes * @param lStart start offset in the buffer - * @param lEnd end offset in the buffer - * @param right second input to be compared + * @param lEnd end offset in the buffer + * @param right second input to be compared * @param rStart start offset in the byte array - * @param rEnd end offset in the byte array + * @param rEnd end offset in the byte array * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise */ public static final int compare(final ArrowBuf left, int lStart, int lEnd, final byte[] right, int rStart, final int rEnd) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java b/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java index 249834270b3fe..38e3b78c778ea 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java index 8aad41744f673..3dd169b82357b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java @@ -33,660 +33,660 @@ public class DateUtility { - /* We have a hashmap that stores the timezone as the key and an index as the value - * While storing the timezone in value vectors, holders we only use this index. As we - * reconstruct the timestamp, we use this index to index through the array timezoneList - * and get the corresponding timezone and pass it to joda-time - */ + /* We have a hashmap that stores the timezone as the key and an index as the value + * While storing the timezone in value vectors, holders we only use this index. 
As we + * reconstruct the timestamp, we use this index to index through the array timezoneList + * and get the corresponding timezone and pass it to joda-time + */ public static ObjectIntHashMap timezoneMap = new ObjectIntHashMap(); - public static String[] timezoneList = {"Africa/Abidjan", - "Africa/Accra", - "Africa/Addis_Ababa", - "Africa/Algiers", - "Africa/Asmara", - "Africa/Asmera", - "Africa/Bamako", - "Africa/Bangui", - "Africa/Banjul", - "Africa/Bissau", - "Africa/Blantyre", - "Africa/Brazzaville", - "Africa/Bujumbura", - "Africa/Cairo", - "Africa/Casablanca", - "Africa/Ceuta", - "Africa/Conakry", - "Africa/Dakar", - "Africa/Dar_es_Salaam", - "Africa/Djibouti", - "Africa/Douala", - "Africa/El_Aaiun", - "Africa/Freetown", - "Africa/Gaborone", - "Africa/Harare", - "Africa/Johannesburg", - "Africa/Juba", - "Africa/Kampala", - "Africa/Khartoum", - "Africa/Kigali", - "Africa/Kinshasa", - "Africa/Lagos", - "Africa/Libreville", - "Africa/Lome", - "Africa/Luanda", - "Africa/Lubumbashi", - "Africa/Lusaka", - "Africa/Malabo", - "Africa/Maputo", - "Africa/Maseru", - "Africa/Mbabane", - "Africa/Mogadishu", - "Africa/Monrovia", - "Africa/Nairobi", - "Africa/Ndjamena", - "Africa/Niamey", - "Africa/Nouakchott", - "Africa/Ouagadougou", - "Africa/Porto-Novo", - "Africa/Sao_Tome", - "Africa/Timbuktu", - "Africa/Tripoli", - "Africa/Tunis", - "Africa/Windhoek", - "America/Adak", - "America/Anchorage", - "America/Anguilla", - "America/Antigua", - "America/Araguaina", - "America/Argentina/Buenos_Aires", - "America/Argentina/Catamarca", - "America/Argentina/ComodRivadavia", - "America/Argentina/Cordoba", - "America/Argentina/Jujuy", - "America/Argentina/La_Rioja", - "America/Argentina/Mendoza", - "America/Argentina/Rio_Gallegos", - "America/Argentina/Salta", - "America/Argentina/San_Juan", - "America/Argentina/San_Luis", - "America/Argentina/Tucuman", - "America/Argentina/Ushuaia", - "America/Aruba", - "America/Asuncion", - "America/Atikokan", - "America/Atka", - "America/Bahia", - "America/Bahia_Banderas", - "America/Barbados", - "America/Belem", - "America/Belize", - "America/Blanc-Sablon", - "America/Boa_Vista", - "America/Bogota", - "America/Boise", - "America/Buenos_Aires", - "America/Cambridge_Bay", - "America/Campo_Grande", - "America/Cancun", - "America/Caracas", - "America/Catamarca", - "America/Cayenne", - "America/Cayman", - "America/Chicago", - "America/Chihuahua", - "America/Coral_Harbour", - "America/Cordoba", - "America/Costa_Rica", - "America/Cuiaba", - "America/Curacao", - "America/Danmarkshavn", - "America/Dawson", - "America/Dawson_Creek", - "America/Denver", - "America/Detroit", - "America/Dominica", - "America/Edmonton", - "America/Eirunepe", - "America/El_Salvador", - "America/Ensenada", - "America/Fort_Wayne", - "America/Fortaleza", - "America/Glace_Bay", - "America/Godthab", - "America/Goose_Bay", - "America/Grand_Turk", - "America/Grenada", - "America/Guadeloupe", - "America/Guatemala", - "America/Guayaquil", - "America/Guyana", - "America/Halifax", - "America/Havana", - "America/Hermosillo", - "America/Indiana/Indianapolis", - "America/Indiana/Knox", - "America/Indiana/Marengo", - "America/Indiana/Petersburg", - "America/Indiana/Tell_City", - "America/Indiana/Vevay", - "America/Indiana/Vincennes", - "America/Indiana/Winamac", - "America/Indianapolis", - "America/Inuvik", - "America/Iqaluit", - "America/Jamaica", - "America/Jujuy", - "America/Juneau", - "America/Kentucky/Louisville", - "America/Kentucky/Monticello", - "America/Knox_IN", - "America/Kralendijk", - 
"America/La_Paz", - "America/Lima", - "America/Los_Angeles", - "America/Louisville", - "America/Lower_Princes", - "America/Maceio", - "America/Managua", - "America/Manaus", - "America/Marigot", - "America/Martinique", - "America/Matamoros", - "America/Mazatlan", - "America/Mendoza", - "America/Menominee", - "America/Merida", - "America/Metlakatla", - "America/Mexico_City", - "America/Miquelon", - "America/Moncton", - "America/Monterrey", - "America/Montevideo", - "America/Montreal", - "America/Montserrat", - "America/Nassau", - "America/New_York", - "America/Nipigon", - "America/Nome", - "America/Noronha", - "America/North_Dakota/Beulah", - "America/North_Dakota/Center", - "America/North_Dakota/New_Salem", - "America/Ojinaga", - "America/Panama", - "America/Pangnirtung", - "America/Paramaribo", - "America/Phoenix", - "America/Port-au-Prince", - "America/Port_of_Spain", - "America/Porto_Acre", - "America/Porto_Velho", - "America/Puerto_Rico", - "America/Rainy_River", - "America/Rankin_Inlet", - "America/Recife", - "America/Regina", - "America/Resolute", - "America/Rio_Branco", - "America/Rosario", - "America/Santa_Isabel", - "America/Santarem", - "America/Santiago", - "America/Santo_Domingo", - "America/Sao_Paulo", - "America/Scoresbysund", - "America/Shiprock", - "America/Sitka", - "America/St_Barthelemy", - "America/St_Johns", - "America/St_Kitts", - "America/St_Lucia", - "America/St_Thomas", - "America/St_Vincent", - "America/Swift_Current", - "America/Tegucigalpa", - "America/Thule", - "America/Thunder_Bay", - "America/Tijuana", - "America/Toronto", - "America/Tortola", - "America/Vancouver", - "America/Virgin", - "America/Whitehorse", - "America/Winnipeg", - "America/Yakutat", - "America/Yellowknife", - "Antarctica/Casey", - "Antarctica/Davis", - "Antarctica/DumontDUrville", - "Antarctica/Macquarie", - "Antarctica/Mawson", - "Antarctica/McMurdo", - "Antarctica/Palmer", - "Antarctica/Rothera", - "Antarctica/South_Pole", - "Antarctica/Syowa", - "Antarctica/Vostok", - "Arctic/Longyearbyen", - "Asia/Aden", - "Asia/Almaty", - "Asia/Amman", - "Asia/Anadyr", - "Asia/Aqtau", - "Asia/Aqtobe", - "Asia/Ashgabat", - "Asia/Ashkhabad", - "Asia/Baghdad", - "Asia/Bahrain", - "Asia/Baku", - "Asia/Bangkok", - "Asia/Beirut", - "Asia/Bishkek", - "Asia/Brunei", - "Asia/Calcutta", - "Asia/Choibalsan", - "Asia/Chongqing", - "Asia/Chungking", - "Asia/Colombo", - "Asia/Dacca", - "Asia/Damascus", - "Asia/Dhaka", - "Asia/Dili", - "Asia/Dubai", - "Asia/Dushanbe", - "Asia/Gaza", - "Asia/Harbin", - "Asia/Hebron", - "Asia/Ho_Chi_Minh", - "Asia/Hong_Kong", - "Asia/Hovd", - "Asia/Irkutsk", - "Asia/Istanbul", - "Asia/Jakarta", - "Asia/Jayapura", - "Asia/Jerusalem", - "Asia/Kabul", - "Asia/Kamchatka", - "Asia/Karachi", - "Asia/Kashgar", - "Asia/Kathmandu", - "Asia/Katmandu", - "Asia/Kolkata", - "Asia/Krasnoyarsk", - "Asia/Kuala_Lumpur", - "Asia/Kuching", - "Asia/Kuwait", - "Asia/Macao", - "Asia/Macau", - "Asia/Magadan", - "Asia/Makassar", - "Asia/Manila", - "Asia/Muscat", - "Asia/Nicosia", - "Asia/Novokuznetsk", - "Asia/Novosibirsk", - "Asia/Omsk", - "Asia/Oral", - "Asia/Phnom_Penh", - "Asia/Pontianak", - "Asia/Pyongyang", - "Asia/Qatar", - "Asia/Qyzylorda", - "Asia/Rangoon", - "Asia/Riyadh", - "Asia/Saigon", - "Asia/Sakhalin", - "Asia/Samarkand", - "Asia/Seoul", - "Asia/Shanghai", - "Asia/Singapore", - "Asia/Taipei", - "Asia/Tashkent", - "Asia/Tbilisi", - "Asia/Tehran", - "Asia/Tel_Aviv", - "Asia/Thimbu", - "Asia/Thimphu", - "Asia/Tokyo", - "Asia/Ujung_Pandang", - "Asia/Ulaanbaatar", - "Asia/Ulan_Bator", - 
"Asia/Urumqi", - "Asia/Vientiane", - "Asia/Vladivostok", - "Asia/Yakutsk", - "Asia/Yekaterinburg", - "Asia/Yerevan", - "Atlantic/Azores", - "Atlantic/Bermuda", - "Atlantic/Canary", - "Atlantic/Cape_Verde", - "Atlantic/Faeroe", - "Atlantic/Faroe", - "Atlantic/Jan_Mayen", - "Atlantic/Madeira", - "Atlantic/Reykjavik", - "Atlantic/South_Georgia", - "Atlantic/St_Helena", - "Atlantic/Stanley", - "Australia/ACT", - "Australia/Adelaide", - "Australia/Brisbane", - "Australia/Broken_Hill", - "Australia/Canberra", - "Australia/Currie", - "Australia/Darwin", - "Australia/Eucla", - "Australia/Hobart", - "Australia/LHI", - "Australia/Lindeman", - "Australia/Lord_Howe", - "Australia/Melbourne", - "Australia/NSW", - "Australia/North", - "Australia/Perth", - "Australia/Queensland", - "Australia/South", - "Australia/Sydney", - "Australia/Tasmania", - "Australia/Victoria", - "Australia/West", - "Australia/Yancowinna", - "Brazil/Acre", - "Brazil/DeNoronha", - "Brazil/East", - "Brazil/West", - "CET", - "CST6CDT", - "Canada/Atlantic", - "Canada/Central", - "Canada/East-Saskatchewan", - "Canada/Eastern", - "Canada/Mountain", - "Canada/Newfoundland", - "Canada/Pacific", - "Canada/Saskatchewan", - "Canada/Yukon", - "Chile/Continental", - "Chile/EasterIsland", - "Cuba", - "EET", - "EST", - "EST5EDT", - "Egypt", - "Eire", - "Etc/GMT", - "Etc/GMT+0", - "Etc/GMT+1", - "Etc/GMT+10", - "Etc/GMT+11", - "Etc/GMT+12", - "Etc/GMT+2", - "Etc/GMT+3", - "Etc/GMT+4", - "Etc/GMT+5", - "Etc/GMT+6", - "Etc/GMT+7", - "Etc/GMT+8", - "Etc/GMT+9", - "Etc/GMT-0", - "Etc/GMT-1", - "Etc/GMT-10", - "Etc/GMT-11", - "Etc/GMT-12", - "Etc/GMT-13", - "Etc/GMT-14", - "Etc/GMT-2", - "Etc/GMT-3", - "Etc/GMT-4", - "Etc/GMT-5", - "Etc/GMT-6", - "Etc/GMT-7", - "Etc/GMT-8", - "Etc/GMT-9", - "Etc/GMT0", - "Etc/Greenwich", - "Etc/UCT", - "Etc/UTC", - "Etc/Universal", - "Etc/Zulu", - "Europe/Amsterdam", - "Europe/Andorra", - "Europe/Athens", - "Europe/Belfast", - "Europe/Belgrade", - "Europe/Berlin", - "Europe/Bratislava", - "Europe/Brussels", - "Europe/Bucharest", - "Europe/Budapest", - "Europe/Chisinau", - "Europe/Copenhagen", - "Europe/Dublin", - "Europe/Gibraltar", - "Europe/Guernsey", - "Europe/Helsinki", - "Europe/Isle_of_Man", - "Europe/Istanbul", - "Europe/Jersey", - "Europe/Kaliningrad", - "Europe/Kiev", - "Europe/Lisbon", - "Europe/Ljubljana", - "Europe/London", - "Europe/Luxembourg", - "Europe/Madrid", - "Europe/Malta", - "Europe/Mariehamn", - "Europe/Minsk", - "Europe/Monaco", - "Europe/Moscow", - "Europe/Nicosia", - "Europe/Oslo", - "Europe/Paris", - "Europe/Podgorica", - "Europe/Prague", - "Europe/Riga", - "Europe/Rome", - "Europe/Samara", - "Europe/San_Marino", - "Europe/Sarajevo", - "Europe/Simferopol", - "Europe/Skopje", - "Europe/Sofia", - "Europe/Stockholm", - "Europe/Tallinn", - "Europe/Tirane", - "Europe/Tiraspol", - "Europe/Uzhgorod", - "Europe/Vaduz", - "Europe/Vatican", - "Europe/Vienna", - "Europe/Vilnius", - "Europe/Volgograd", - "Europe/Warsaw", - "Europe/Zagreb", - "Europe/Zaporozhye", - "Europe/Zurich", - "GB", - "GB-Eire", - "GMT", - "GMT+0", - "GMT-0", - "GMT0", - "Greenwich", - "HST", - "Hongkong", - "Iceland", - "Indian/Antananarivo", - "Indian/Chagos", - "Indian/Christmas", - "Indian/Cocos", - "Indian/Comoro", - "Indian/Kerguelen", - "Indian/Mahe", - "Indian/Maldives", - "Indian/Mauritius", - "Indian/Mayotte", - "Indian/Reunion", - "Iran", - "Israel", - "Jamaica", - "Japan", - "Kwajalein", - "Libya", - "MET", - "MST", - "MST7MDT", - "Mexico/BajaNorte", - "Mexico/BajaSur", - "Mexico/General", - "NZ", - "NZ-CHAT", - 
"Navajo", - "PRC", - "PST8PDT", - "Pacific/Apia", - "Pacific/Auckland", - "Pacific/Chatham", - "Pacific/Chuuk", - "Pacific/Easter", - "Pacific/Efate", - "Pacific/Enderbury", - "Pacific/Fakaofo", - "Pacific/Fiji", - "Pacific/Funafuti", - "Pacific/Galapagos", - "Pacific/Gambier", - "Pacific/Guadalcanal", - "Pacific/Guam", - "Pacific/Honolulu", - "Pacific/Johnston", - "Pacific/Kiritimati", - "Pacific/Kosrae", - "Pacific/Kwajalein", - "Pacific/Majuro", - "Pacific/Marquesas", - "Pacific/Midway", - "Pacific/Nauru", - "Pacific/Niue", - "Pacific/Norfolk", - "Pacific/Noumea", - "Pacific/Pago_Pago", - "Pacific/Palau", - "Pacific/Pitcairn", - "Pacific/Pohnpei", - "Pacific/Ponape", - "Pacific/Port_Moresby", - "Pacific/Rarotonga", - "Pacific/Saipan", - "Pacific/Samoa", - "Pacific/Tahiti", - "Pacific/Tarawa", - "Pacific/Tongatapu", - "Pacific/Truk", - "Pacific/Wake", - "Pacific/Wallis", - "Pacific/Yap", - "Poland", - "Portugal", - "ROC", - "ROK", - "Singapore", - "Turkey", - "UCT", - "US/Alaska", - "US/Aleutian", - "US/Arizona", - "US/Central", - "US/East-Indiana", - "US/Eastern", - "US/Hawaii", - "US/Indiana-Starke", - "US/Michigan", - "US/Mountain", - "US/Pacific", - "US/Pacific-New", - "US/Samoa", - "UTC", - "Universal", - "W-SU", - "WET", - "Zulu"}; + public static String[] timezoneList = {"Africa/Abidjan", + "Africa/Accra", + "Africa/Addis_Ababa", + "Africa/Algiers", + "Africa/Asmara", + "Africa/Asmera", + "Africa/Bamako", + "Africa/Bangui", + "Africa/Banjul", + "Africa/Bissau", + "Africa/Blantyre", + "Africa/Brazzaville", + "Africa/Bujumbura", + "Africa/Cairo", + "Africa/Casablanca", + "Africa/Ceuta", + "Africa/Conakry", + "Africa/Dakar", + "Africa/Dar_es_Salaam", + "Africa/Djibouti", + "Africa/Douala", + "Africa/El_Aaiun", + "Africa/Freetown", + "Africa/Gaborone", + "Africa/Harare", + "Africa/Johannesburg", + "Africa/Juba", + "Africa/Kampala", + "Africa/Khartoum", + "Africa/Kigali", + "Africa/Kinshasa", + "Africa/Lagos", + "Africa/Libreville", + "Africa/Lome", + "Africa/Luanda", + "Africa/Lubumbashi", + "Africa/Lusaka", + "Africa/Malabo", + "Africa/Maputo", + "Africa/Maseru", + "Africa/Mbabane", + "Africa/Mogadishu", + "Africa/Monrovia", + "Africa/Nairobi", + "Africa/Ndjamena", + "Africa/Niamey", + "Africa/Nouakchott", + "Africa/Ouagadougou", + "Africa/Porto-Novo", + "Africa/Sao_Tome", + "Africa/Timbuktu", + "Africa/Tripoli", + "Africa/Tunis", + "Africa/Windhoek", + "America/Adak", + "America/Anchorage", + "America/Anguilla", + "America/Antigua", + "America/Araguaina", + "America/Argentina/Buenos_Aires", + "America/Argentina/Catamarca", + "America/Argentina/ComodRivadavia", + "America/Argentina/Cordoba", + "America/Argentina/Jujuy", + "America/Argentina/La_Rioja", + "America/Argentina/Mendoza", + "America/Argentina/Rio_Gallegos", + "America/Argentina/Salta", + "America/Argentina/San_Juan", + "America/Argentina/San_Luis", + "America/Argentina/Tucuman", + "America/Argentina/Ushuaia", + "America/Aruba", + "America/Asuncion", + "America/Atikokan", + "America/Atka", + "America/Bahia", + "America/Bahia_Banderas", + "America/Barbados", + "America/Belem", + "America/Belize", + "America/Blanc-Sablon", + "America/Boa_Vista", + "America/Bogota", + "America/Boise", + "America/Buenos_Aires", + "America/Cambridge_Bay", + "America/Campo_Grande", + "America/Cancun", + "America/Caracas", + "America/Catamarca", + "America/Cayenne", + "America/Cayman", + "America/Chicago", + "America/Chihuahua", + "America/Coral_Harbour", + "America/Cordoba", + "America/Costa_Rica", + "America/Cuiaba", + "America/Curacao", + 
"America/Danmarkshavn", + "America/Dawson", + "America/Dawson_Creek", + "America/Denver", + "America/Detroit", + "America/Dominica", + "America/Edmonton", + "America/Eirunepe", + "America/El_Salvador", + "America/Ensenada", + "America/Fort_Wayne", + "America/Fortaleza", + "America/Glace_Bay", + "America/Godthab", + "America/Goose_Bay", + "America/Grand_Turk", + "America/Grenada", + "America/Guadeloupe", + "America/Guatemala", + "America/Guayaquil", + "America/Guyana", + "America/Halifax", + "America/Havana", + "America/Hermosillo", + "America/Indiana/Indianapolis", + "America/Indiana/Knox", + "America/Indiana/Marengo", + "America/Indiana/Petersburg", + "America/Indiana/Tell_City", + "America/Indiana/Vevay", + "America/Indiana/Vincennes", + "America/Indiana/Winamac", + "America/Indianapolis", + "America/Inuvik", + "America/Iqaluit", + "America/Jamaica", + "America/Jujuy", + "America/Juneau", + "America/Kentucky/Louisville", + "America/Kentucky/Monticello", + "America/Knox_IN", + "America/Kralendijk", + "America/La_Paz", + "America/Lima", + "America/Los_Angeles", + "America/Louisville", + "America/Lower_Princes", + "America/Maceio", + "America/Managua", + "America/Manaus", + "America/Marigot", + "America/Martinique", + "America/Matamoros", + "America/Mazatlan", + "America/Mendoza", + "America/Menominee", + "America/Merida", + "America/Metlakatla", + "America/Mexico_City", + "America/Miquelon", + "America/Moncton", + "America/Monterrey", + "America/Montevideo", + "America/Montreal", + "America/Montserrat", + "America/Nassau", + "America/New_York", + "America/Nipigon", + "America/Nome", + "America/Noronha", + "America/North_Dakota/Beulah", + "America/North_Dakota/Center", + "America/North_Dakota/New_Salem", + "America/Ojinaga", + "America/Panama", + "America/Pangnirtung", + "America/Paramaribo", + "America/Phoenix", + "America/Port-au-Prince", + "America/Port_of_Spain", + "America/Porto_Acre", + "America/Porto_Velho", + "America/Puerto_Rico", + "America/Rainy_River", + "America/Rankin_Inlet", + "America/Recife", + "America/Regina", + "America/Resolute", + "America/Rio_Branco", + "America/Rosario", + "America/Santa_Isabel", + "America/Santarem", + "America/Santiago", + "America/Santo_Domingo", + "America/Sao_Paulo", + "America/Scoresbysund", + "America/Shiprock", + "America/Sitka", + "America/St_Barthelemy", + "America/St_Johns", + "America/St_Kitts", + "America/St_Lucia", + "America/St_Thomas", + "America/St_Vincent", + "America/Swift_Current", + "America/Tegucigalpa", + "America/Thule", + "America/Thunder_Bay", + "America/Tijuana", + "America/Toronto", + "America/Tortola", + "America/Vancouver", + "America/Virgin", + "America/Whitehorse", + "America/Winnipeg", + "America/Yakutat", + "America/Yellowknife", + "Antarctica/Casey", + "Antarctica/Davis", + "Antarctica/DumontDUrville", + "Antarctica/Macquarie", + "Antarctica/Mawson", + "Antarctica/McMurdo", + "Antarctica/Palmer", + "Antarctica/Rothera", + "Antarctica/South_Pole", + "Antarctica/Syowa", + "Antarctica/Vostok", + "Arctic/Longyearbyen", + "Asia/Aden", + "Asia/Almaty", + "Asia/Amman", + "Asia/Anadyr", + "Asia/Aqtau", + "Asia/Aqtobe", + "Asia/Ashgabat", + "Asia/Ashkhabad", + "Asia/Baghdad", + "Asia/Bahrain", + "Asia/Baku", + "Asia/Bangkok", + "Asia/Beirut", + "Asia/Bishkek", + "Asia/Brunei", + "Asia/Calcutta", + "Asia/Choibalsan", + "Asia/Chongqing", + "Asia/Chungking", + "Asia/Colombo", + "Asia/Dacca", + "Asia/Damascus", + "Asia/Dhaka", + "Asia/Dili", + "Asia/Dubai", + "Asia/Dushanbe", + "Asia/Gaza", + "Asia/Harbin", + "Asia/Hebron", + 
"Asia/Ho_Chi_Minh", + "Asia/Hong_Kong", + "Asia/Hovd", + "Asia/Irkutsk", + "Asia/Istanbul", + "Asia/Jakarta", + "Asia/Jayapura", + "Asia/Jerusalem", + "Asia/Kabul", + "Asia/Kamchatka", + "Asia/Karachi", + "Asia/Kashgar", + "Asia/Kathmandu", + "Asia/Katmandu", + "Asia/Kolkata", + "Asia/Krasnoyarsk", + "Asia/Kuala_Lumpur", + "Asia/Kuching", + "Asia/Kuwait", + "Asia/Macao", + "Asia/Macau", + "Asia/Magadan", + "Asia/Makassar", + "Asia/Manila", + "Asia/Muscat", + "Asia/Nicosia", + "Asia/Novokuznetsk", + "Asia/Novosibirsk", + "Asia/Omsk", + "Asia/Oral", + "Asia/Phnom_Penh", + "Asia/Pontianak", + "Asia/Pyongyang", + "Asia/Qatar", + "Asia/Qyzylorda", + "Asia/Rangoon", + "Asia/Riyadh", + "Asia/Saigon", + "Asia/Sakhalin", + "Asia/Samarkand", + "Asia/Seoul", + "Asia/Shanghai", + "Asia/Singapore", + "Asia/Taipei", + "Asia/Tashkent", + "Asia/Tbilisi", + "Asia/Tehran", + "Asia/Tel_Aviv", + "Asia/Thimbu", + "Asia/Thimphu", + "Asia/Tokyo", + "Asia/Ujung_Pandang", + "Asia/Ulaanbaatar", + "Asia/Ulan_Bator", + "Asia/Urumqi", + "Asia/Vientiane", + "Asia/Vladivostok", + "Asia/Yakutsk", + "Asia/Yekaterinburg", + "Asia/Yerevan", + "Atlantic/Azores", + "Atlantic/Bermuda", + "Atlantic/Canary", + "Atlantic/Cape_Verde", + "Atlantic/Faeroe", + "Atlantic/Faroe", + "Atlantic/Jan_Mayen", + "Atlantic/Madeira", + "Atlantic/Reykjavik", + "Atlantic/South_Georgia", + "Atlantic/St_Helena", + "Atlantic/Stanley", + "Australia/ACT", + "Australia/Adelaide", + "Australia/Brisbane", + "Australia/Broken_Hill", + "Australia/Canberra", + "Australia/Currie", + "Australia/Darwin", + "Australia/Eucla", + "Australia/Hobart", + "Australia/LHI", + "Australia/Lindeman", + "Australia/Lord_Howe", + "Australia/Melbourne", + "Australia/NSW", + "Australia/North", + "Australia/Perth", + "Australia/Queensland", + "Australia/South", + "Australia/Sydney", + "Australia/Tasmania", + "Australia/Victoria", + "Australia/West", + "Australia/Yancowinna", + "Brazil/Acre", + "Brazil/DeNoronha", + "Brazil/East", + "Brazil/West", + "CET", + "CST6CDT", + "Canada/Atlantic", + "Canada/Central", + "Canada/East-Saskatchewan", + "Canada/Eastern", + "Canada/Mountain", + "Canada/Newfoundland", + "Canada/Pacific", + "Canada/Saskatchewan", + "Canada/Yukon", + "Chile/Continental", + "Chile/EasterIsland", + "Cuba", + "EET", + "EST", + "EST5EDT", + "Egypt", + "Eire", + "Etc/GMT", + "Etc/GMT+0", + "Etc/GMT+1", + "Etc/GMT+10", + "Etc/GMT+11", + "Etc/GMT+12", + "Etc/GMT+2", + "Etc/GMT+3", + "Etc/GMT+4", + "Etc/GMT+5", + "Etc/GMT+6", + "Etc/GMT+7", + "Etc/GMT+8", + "Etc/GMT+9", + "Etc/GMT-0", + "Etc/GMT-1", + "Etc/GMT-10", + "Etc/GMT-11", + "Etc/GMT-12", + "Etc/GMT-13", + "Etc/GMT-14", + "Etc/GMT-2", + "Etc/GMT-3", + "Etc/GMT-4", + "Etc/GMT-5", + "Etc/GMT-6", + "Etc/GMT-7", + "Etc/GMT-8", + "Etc/GMT-9", + "Etc/GMT0", + "Etc/Greenwich", + "Etc/UCT", + "Etc/UTC", + "Etc/Universal", + "Etc/Zulu", + "Europe/Amsterdam", + "Europe/Andorra", + "Europe/Athens", + "Europe/Belfast", + "Europe/Belgrade", + "Europe/Berlin", + "Europe/Bratislava", + "Europe/Brussels", + "Europe/Bucharest", + "Europe/Budapest", + "Europe/Chisinau", + "Europe/Copenhagen", + "Europe/Dublin", + "Europe/Gibraltar", + "Europe/Guernsey", + "Europe/Helsinki", + "Europe/Isle_of_Man", + "Europe/Istanbul", + "Europe/Jersey", + "Europe/Kaliningrad", + "Europe/Kiev", + "Europe/Lisbon", + "Europe/Ljubljana", + "Europe/London", + "Europe/Luxembourg", + "Europe/Madrid", + "Europe/Malta", + "Europe/Mariehamn", + "Europe/Minsk", + "Europe/Monaco", + "Europe/Moscow", + "Europe/Nicosia", + "Europe/Oslo", + "Europe/Paris", + 
"Europe/Podgorica", + "Europe/Prague", + "Europe/Riga", + "Europe/Rome", + "Europe/Samara", + "Europe/San_Marino", + "Europe/Sarajevo", + "Europe/Simferopol", + "Europe/Skopje", + "Europe/Sofia", + "Europe/Stockholm", + "Europe/Tallinn", + "Europe/Tirane", + "Europe/Tiraspol", + "Europe/Uzhgorod", + "Europe/Vaduz", + "Europe/Vatican", + "Europe/Vienna", + "Europe/Vilnius", + "Europe/Volgograd", + "Europe/Warsaw", + "Europe/Zagreb", + "Europe/Zaporozhye", + "Europe/Zurich", + "GB", + "GB-Eire", + "GMT", + "GMT+0", + "GMT-0", + "GMT0", + "Greenwich", + "HST", + "Hongkong", + "Iceland", + "Indian/Antananarivo", + "Indian/Chagos", + "Indian/Christmas", + "Indian/Cocos", + "Indian/Comoro", + "Indian/Kerguelen", + "Indian/Mahe", + "Indian/Maldives", + "Indian/Mauritius", + "Indian/Mayotte", + "Indian/Reunion", + "Iran", + "Israel", + "Jamaica", + "Japan", + "Kwajalein", + "Libya", + "MET", + "MST", + "MST7MDT", + "Mexico/BajaNorte", + "Mexico/BajaSur", + "Mexico/General", + "NZ", + "NZ-CHAT", + "Navajo", + "PRC", + "PST8PDT", + "Pacific/Apia", + "Pacific/Auckland", + "Pacific/Chatham", + "Pacific/Chuuk", + "Pacific/Easter", + "Pacific/Efate", + "Pacific/Enderbury", + "Pacific/Fakaofo", + "Pacific/Fiji", + "Pacific/Funafuti", + "Pacific/Galapagos", + "Pacific/Gambier", + "Pacific/Guadalcanal", + "Pacific/Guam", + "Pacific/Honolulu", + "Pacific/Johnston", + "Pacific/Kiritimati", + "Pacific/Kosrae", + "Pacific/Kwajalein", + "Pacific/Majuro", + "Pacific/Marquesas", + "Pacific/Midway", + "Pacific/Nauru", + "Pacific/Niue", + "Pacific/Norfolk", + "Pacific/Noumea", + "Pacific/Pago_Pago", + "Pacific/Palau", + "Pacific/Pitcairn", + "Pacific/Pohnpei", + "Pacific/Ponape", + "Pacific/Port_Moresby", + "Pacific/Rarotonga", + "Pacific/Saipan", + "Pacific/Samoa", + "Pacific/Tahiti", + "Pacific/Tarawa", + "Pacific/Tongatapu", + "Pacific/Truk", + "Pacific/Wake", + "Pacific/Wallis", + "Pacific/Yap", + "Poland", + "Portugal", + "ROC", + "ROK", + "Singapore", + "Turkey", + "UCT", + "US/Alaska", + "US/Aleutian", + "US/Arizona", + "US/Central", + "US/East-Indiana", + "US/Eastern", + "US/Hawaii", + "US/Indiana-Starke", + "US/Michigan", + "US/Mountain", + "US/Pacific", + "US/Pacific-New", + "US/Samoa", + "UTC", + "Universal", + "W-SU", + "WET", + "Zulu"}; - static { - for (int i = 0; i < timezoneList.length; i++) { - timezoneMap.put(timezoneList[i], i); - } + static { + for (int i = 0; i < timezoneList.length; i++) { + timezoneMap.put(timezoneList[i], i); } + } - public static final DateTimeFormatter formatDate = DateTimeFormat.forPattern("yyyy-MM-dd"); - public static final DateTimeFormatter formatTimeStampMilli = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS"); - public static final DateTimeFormatter formatTimeStampTZ = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); - public static final DateTimeFormatter formatTime = DateTimeFormat.forPattern("HH:mm:ss.SSS"); + public static final DateTimeFormatter formatDate = DateTimeFormat.forPattern("yyyy-MM-dd"); + public static final DateTimeFormatter formatTimeStampMilli = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS"); + public static final DateTimeFormatter formatTimeStampTZ = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); + public static final DateTimeFormatter formatTime = DateTimeFormat.forPattern("HH:mm:ss.SSS"); - public static DateTimeFormatter dateTimeTZFormat = null; - public static DateTimeFormatter timeFormat = null; + public static DateTimeFormatter dateTimeTZFormat = null; + public static DateTimeFormatter timeFormat = null; - 
public static final int yearsToMonths = 12; - public static final int hoursToMillis = 60 * 60 * 1000; - public static final int minutesToMillis = 60 * 1000; - public static final int secondsToMillis = 1000; - public static final int monthToStandardDays = 30; - public static final long monthsToMillis = 2592000000L; // 30 * 24 * 60 * 60 * 1000 - public static final int daysToStandardMillis = 24 * 60 * 60 * 1000; + public static final int yearsToMonths = 12; + public static final int hoursToMillis = 60 * 60 * 1000; + public static final int minutesToMillis = 60 * 1000; + public static final int secondsToMillis = 1000; + public static final int monthToStandardDays = 30; + public static final long monthsToMillis = 2592000000L; // 30 * 24 * 60 * 60 * 1000 + public static final int daysToStandardMillis = 24 * 60 * 60 * 1000; public static int getIndex(String timezone) { - return timezoneMap.get(timezone); - } - - public static String getTimeZone(int index) { - return timezoneList[index]; - } + return timezoneMap.get(timezone); + } - // Function returns the date time formatter used to parse date strings - public static DateTimeFormatter getDateTimeFormatter() { + public static String getTimeZone(int index) { + return timezoneList[index]; + } - if (dateTimeTZFormat == null) { - DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); - DateTimeParser optionalTime = DateTimeFormat.forPattern(" HH:mm:ss").getParser(); - DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); - DateTimeParser optionalZone = DateTimeFormat.forPattern(" ZZZ").getParser(); + // Function returns the date time formatter used to parse date strings + public static DateTimeFormatter getDateTimeFormatter() { - dateTimeTZFormat = new DateTimeFormatterBuilder().append(dateFormatter).appendOptional(optionalTime).appendOptional(optionalSec).appendOptional(optionalZone).toFormatter(); - } + if (dateTimeTZFormat == null) { + DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); + DateTimeParser optionalTime = DateTimeFormat.forPattern(" HH:mm:ss").getParser(); + DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); + DateTimeParser optionalZone = DateTimeFormat.forPattern(" ZZZ").getParser(); - return dateTimeTZFormat; + dateTimeTZFormat = new DateTimeFormatterBuilder().append(dateFormatter).appendOptional(optionalTime).appendOptional(optionalSec).appendOptional(optionalZone).toFormatter(); } - // Function returns time formatter used to parse time strings - public static DateTimeFormatter getTimeFormatter() { - if (timeFormat == null) { - DateTimeFormatter timeFormatter = DateTimeFormat.forPattern("HH:mm:ss"); - DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); - timeFormat = new DateTimeFormatterBuilder().append(timeFormatter).appendOptional(optionalSec).toFormatter(); - } - return timeFormat; - } + return dateTimeTZFormat; + } - public static int monthsFromPeriod(Period period){ - return (period.getYears() * yearsToMonths) + period.getMonths(); + // Function returns time formatter used to parse time strings + public static DateTimeFormatter getTimeFormatter() { + if (timeFormat == null) { + DateTimeFormatter timeFormatter = DateTimeFormat.forPattern("HH:mm:ss"); + DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); + timeFormat = new DateTimeFormatterBuilder().append(timeFormatter).appendOptional(optionalSec).toFormatter(); } + return timeFormat; + } - public static int millisFromPeriod(final Period 
period){ - return (period.getHours() * hoursToMillis) + - (period.getMinutes() * minutesToMillis) + - (period.getSeconds() * secondsToMillis) + - (period.getMillis()); - } + public static int monthsFromPeriod(Period period) { + return (period.getYears() * yearsToMonths) + period.getMonths(); + } - public static long toMillis(LocalDateTime localDateTime) { - return LocalDateTimes.getLocalMillis(localDateTime); - } + public static int millisFromPeriod(final Period period) { + return (period.getHours() * hoursToMillis) + + (period.getMinutes() * minutesToMillis) + + (period.getSeconds() * secondsToMillis) + + (period.getMillis()); + } - public static int toMillisOfDay(final LocalDateTime localDateTime) { - return localDateTime.toDateTime(DateTimeZone.UTC).millisOfDay().get(); - } + public static long toMillis(LocalDateTime localDateTime) { + return LocalDateTimes.getLocalMillis(localDateTime); + } + + public static int toMillisOfDay(final LocalDateTime localDateTime) { + return localDateTime.toDateTime(DateTimeZone.UTC).millisOfDay().get(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 4c439b2cc1066..4b11b368dff1e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import io.netty.buffer.ArrowBuf; @@ -32,39 +33,39 @@ public class DecimalUtility { public final static int MAX_DIGITS = 9; public final static int DIGITS_BASE = 1000000000; public final static int DIGITS_MAX = 999999999; - public final static int INTEGER_SIZE = (Integer.SIZE/8); + public final static int INTEGER_SIZE = (Integer.SIZE / 8); public final static String[] decimalToString = {"", - "0", - "00", - "000", - "0000", - "00000", - "000000", - "0000000", - "00000000", - "000000000"}; + "0", + "00", + "000", + "0000", + "00000", + "000000", + "0000000", + "00000000", + "000000000"}; public final static long[] scale_long_constants = { - 1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000l, - 100000000000l, - 1000000000000l, - 10000000000000l, - 100000000000000l, - 1000000000000000l, - 10000000000000000l, - 100000000000000000l, - 1000000000000000000l}; + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000l, + 100000000000l, + 1000000000000l, + 10000000000000l, + 100000000000000l, + 1000000000000000l, + 10000000000000000l, + 100000000000000000l, + 1000000000000000000l}; public static final int DECIMAL_BYTE_LENGTH = 16; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java index 4108dc4610838..cf0596c8c1fb4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java @@ -40,7 +40,7 @@ public class DictionaryUtility { * have the dictionary type * * NOTE: in the message format, fields have the dictionary type - * in the memory format, they have the index type + * in the memory format, they have the index type */ public static Field toMessageFormat(Field field, DictionaryProvider provider, Set 
<Long> dictionaryIdsUsed) { DictionaryEncoding encoding = field.getDictionary(); @@ -51,7 +51,7 @@ public static Field toMessageFormat(Field field, DictionaryProvider provider, Se } List<Field> updatedChildren = new ArrayList<>(children.size()); - for (Field child: children) { + for (Field child : children) { updatedChildren.add(toMessageFormat(child, provider, dictionaryIdsUsed)); } @@ -85,7 +85,7 @@ public static Field toMemoryFormat(Field field, BufferAllocator allocator, Map List<Field> updatedChildren = new ArrayList<>(children.size()); - for (Field child: children) { + for (Field child : children) { updatedChildren.add(toMemoryFormat(child, allocator, dictionaries)); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java index c598069c2c309..480bd76d445b0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.util.ArrayList; @@ -58,7 +59,7 @@ public boolean equals(Object obj) { public final String toString() { try { return mapper.writeValueAsString(this); - } catch(JsonProcessingException e) { + } catch (JsonProcessingException e) { throw new IllegalStateException("Cannot serialize array list to JSON string", e); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java index e8ce5221eebd9..6455389d582b9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.util.LinkedHashMap; @@ -51,14 +52,14 @@ public boolean equals(Object obj) { return false; } for (K key : this.keySet()) { - if (this.get(key) == null ) { + if (this.get(key) == null) { if (other.get(key) == null) { continue; } else { return false; } } - if ( ! this.get(key).equals(other.get(key))) { + if (!this.get(key).equals(other.get(key))) { return false; } } @@ -69,7 +70,7 @@ public boolean equals(Object obj) { public final String toString() { try { return mapper.writeValueAsString(this); - } catch(JsonProcessingException e) { + } catch (JsonProcessingException e) { throw new IllegalStateException("Cannot serialize hash map to JSON string", e); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java index b35aaa401bae4..6d3b390379a56 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.util.AbstractMap; @@ -95,16 +96,16 @@ public V get(Object key) { public V put(K key, V value) { final Entry<Integer, V> oldPair = primary.get(key); // if key exists try replacing otherwise, assign a new ordinal identifier - final int ordinal = oldPair == null ? 
primary.size():oldPair.getKey(); + final int ordinal = oldPair == null ? primary.size() : oldPair.getKey(); primary.put(key, new AbstractMap.SimpleImmutableEntry<>(ordinal, value)); secondary.put(ordinal, value); - return oldPair==null ? null:oldPair.getValue(); + return oldPair == null ? null : oldPair.getValue(); } @Override public V remove(Object key) { final Entry<Integer, V> oldPair = primary.remove(key); - if (oldPair!=null) { + if (oldPair != null) { final int lastOrdinal = secondary.size(); final V last = secondary.get(lastOrdinal); // normalize mappings so that all numbers until primary.size() is assigned @@ -112,7 +113,7 @@ secondary.put(oldPair.getKey(), last); primary.put((K) key, new AbstractMap.SimpleImmutableEntry<>(oldPair.getKey(), last)); } - return oldPair==null ? null:oldPair.getValue(); + return oldPair == null ? null : oldPair.getValue(); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java b/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java index bd7396249a72c..b4ff2522daf33 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; @@ -23,7 +24,6 @@ * {@link org.apache.arrow.memory.BufferAllocator#buffer(int) allocator}. * *
Operators should handle this exception to split the batch and later resume the execution on the next iteration.
- * */ public class OversizedAllocationException extends RuntimeException { public OversizedAllocationException() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java index c281561430707..ddfea948a8f74 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java index ce82f445ad883..15ce132fc801c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import java.io.DataInput; @@ -74,13 +75,16 @@ public Text() { /** * Construct from a string. + * * @param string initialize from that string */ public Text(String string) { set(string); } - /** Construct from another text. + /** + * Construct from another text. + * * @param utf8 initialize from that Text */ public Text(Text utf8) { @@ -89,6 +93,7 @@ public Text(Text utf8) { /** * Construct from a byte array. + * * @param utf8 initialize from that byte array */ public Text(byte[] utf8) { @@ -98,6 +103,7 @@ public Text(byte[] utf8) { /** * Get a copy of the bytes that is exactly the length of the data. See {@link #getBytes()} for faster access to the * underlying array. + * * @return a copy of the underlying array */ public byte[] copyBytes() { @@ -109,13 +115,16 @@ public byte[] copyBytes() { /** * Returns the raw bytes; however, only data up to {@link #getLength()} is valid. Please use {@link #copyBytes()} if * you need the returned array to be precisely the length of the data. + * * @return the underlying array */ public byte[] getBytes() { return bytes; } - /** @return the number of bytes in the byte array */ + /** + * @return the number of bytes in the byte array + */ public int getLength() { return length; } @@ -128,12 +137,10 @@ public int getLength() { * @return the Unicode scalar value at position or -1 if the position is invalid or points to a trailing byte */ public int charAt(int position) { - if (position > this.length) - { + if (position > this.length) { return -1; // too long } - if (position < 0) - { + if (position < 0) { return -1; // duh. } @@ -150,7 +157,7 @@ public int find(String what) { * starting position is measured in bytes and the return value is in terms of byte position in the buffer. The backing * buffer is not converted to a string for this operation. * - * @param what the string to search for + * @param what the string to search for * @param start where to start from * @return byte position of the first occurence of the search string in the UTF-8 buffer or -1 if not found */ @@ -196,6 +203,7 @@ public int find(String what, int start) { /** * Set to contain the contents of a string. 
+ * * @param string the string to initialize from */ public void set(String string) { @@ -210,14 +218,18 @@ public void set(String string) { /** * Set to a utf8 byte array + * * @param utf8 the byte array to initialize from */ public void set(byte[] utf8) { set(utf8, 0, utf8.length); } - /** copy a text. - * @param other the text to initialize from */ + /** + * copy a text. + * + * @param other the text to initialize from + */ public void set(Text other) { set(other.getBytes(), 0, other.getLength()); } @@ -225,12 +237,9 @@ public void set(Text other) { /** * Set the Text to range of bytes * - * @param utf8 - * the data to copy from - * @param start - * the first position of the new string - * @param len - * the number of bytes of the new string + * @param utf8 the data to copy from + * @param start the first position of the new string + * @param len the number of bytes of the new string */ public void set(byte[] utf8, int start, int len) { setCapacity(len, false); @@ -241,12 +250,9 @@ public void set(byte[] utf8, int start, int len) { /** * Append a range of bytes to the end of the given text * - * @param utf8 - * the data to copy from - * @param start - * the first position to append from utf8 - * @param len - * the number of bytes to append + * @param utf8 the data to copy from + * @param start the first position to append from utf8 + * @param len the number of bytes to append */ public void append(byte[] utf8, int start, int len) { setCapacity(length + len, true); @@ -270,7 +276,7 @@ public void clear() { * then the capacity and existing content of the buffer are unchanged. If len is larger than the current * capacity, the Text object's capacity is increased to match. * - * @param len the number of bytes we need + * @param len the number of bytes we need * @param keepData should the old data be kept */ private void setCapacity(int len, boolean keepData) { @@ -295,7 +301,8 @@ public String toString() { /** * Read a Text object whose length is already known. This allows creating Text from a stream which uses a different * serialization format. - * @param in the input to initialize from + * + * @param in the input to initialize from * @param len how many bytes to read from in * @throws IOException if something bad happens */ @@ -351,9 +358,11 @@ public int hashCode() { } // / STATIC UTILITIES FROM HERE DOWN + /** * Converts the provided byte array to a String using the UTF-8 encoding. If the input is malformed, replace by a * default value. + * * @param utf8 bytes to decode * @return the decoded string * @throws CharacterCodingException if this is not valid UTF-8 @@ -371,9 +380,10 @@ public static String decode(byte[] utf8, int start, int length) * Converts the provided byte array to a String using the UTF-8 encoding. If replace is true, then * malformed input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a * MalformedInputException. - * @param utf8 the bytes to decode - * @param start where to start from - * @param length length of the bytes to decode + * + * @param utf8 the bytes to decode + * @param start where to start from + * @param length length of the bytes to decode * @param replace whether to replace malformed characters with U+FFFD * @return the decoded string * @throws CharacterCodingException if the input could not be decoded @@ -418,8 +428,7 @@ public static ByteBuffer encode(String string) * input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a * MalformedInputException. 
* - - * @param string the string to encode + * @param string the string to encode * @param replace whether to replace malformed characters with U+FFFD * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit() * @throws CharacterCodingException if the string could not be encoded @@ -453,10 +462,8 @@ public static ByteBuffer encode(String string, boolean replace) /** * Check if a byte array contains valid utf-8 * - * @param utf8 - * byte array - * @throws MalformedInputException - * if the byte array contains invalid utf-8 + * @param utf8 byte array + * @throws MalformedInputException if the byte array contains invalid utf-8 */ public static void validateUTF8(byte[] utf8) throws MalformedInputException { validateUTF8(utf8, 0, utf8.length); @@ -465,14 +472,10 @@ public static void validateUTF8(byte[] utf8) throws MalformedInputException { /** * Check to see if a byte array is valid utf-8 * - * @param utf8 - * the array of bytes - * @param start - * the offset of the first byte in the array - * @param len - * the length of the byte sequence - * @throws MalformedInputException - * if the byte array contains invalid bytes + * @param utf8 the array of bytes + * @param start the offset of the first byte in the array + * @param len the length of the byte sequence + * @throws MalformedInputException if the byte array contains invalid bytes */ public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException { @@ -484,67 +487,67 @@ public static void validateUTF8(byte[] utf8, int start, int len) int aByte = utf8[count] & 0xFF; switch (state) { - case LEAD_BYTE: - leadByte = aByte; - length = bytesFromUTF8[aByte]; + case LEAD_BYTE: + leadByte = aByte; + length = bytesFromUTF8[aByte]; + + switch (length) { + case 0: // check for ASCII + if (leadByte > 0x7F) { + throw new MalformedInputException(count); + } + break; + case 1: + if (leadByte < 0xC2 || leadByte > 0xDF) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + case 2: + if (leadByte < 0xE0 || leadByte > 0xEF) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + case 3: + if (leadByte < 0xF0 || leadByte > 0xF4) { + throw new MalformedInputException(count); + } + state = TRAIL_BYTE_1; + break; + default: + // too long! Longest valid UTF-8 is 4 bytes (lead + three) + // or if < 0 we got a trail byte in the lead byte position + throw new MalformedInputException(count); + } // switch (length) + break; - switch (length) { - case 0: // check for ASCII - if (leadByte > 0x7F) { + case TRAIL_BYTE_1: + if (leadByte == 0xF0 && aByte < 0x90) { throw new MalformedInputException(count); } - break; - case 1: - if (leadByte < 0xC2 || leadByte > 0xDF) { + if (leadByte == 0xF4 && aByte > 0x8F) { throw new MalformedInputException(count); } - state = TRAIL_BYTE_1; - break; - case 2: - if (leadByte < 0xE0 || leadByte > 0xEF) { + if (leadByte == 0xE0 && aByte < 0xA0) { throw new MalformedInputException(count); } - state = TRAIL_BYTE_1; - break; - case 3: - if (leadByte < 0xF0 || leadByte > 0xF4) { + if (leadByte == 0xED && aByte > 0x9F) { + throw new MalformedInputException(count); + } + // falls through to regular trail-byte test!! + case TRAIL_BYTE: + if (aByte < 0x80 || aByte > 0xBF) { throw new MalformedInputException(count); } - state = TRAIL_BYTE_1; + if (--length == 0) { + state = LEAD_BYTE; + } else { + state = TRAIL_BYTE; + } break; default: - // too long! 
Longest valid UTF-8 is 4 bytes (lead + three) - // or if < 0 we got a trail byte in the lead byte position - throw new MalformedInputException(count); - } // switch (length) - break; - - case TRAIL_BYTE_1: - if (leadByte == 0xF0 && aByte < 0x90) { - throw new MalformedInputException(count); - } - if (leadByte == 0xF4 && aByte > 0x8F) { - throw new MalformedInputException(count); - } - if (leadByte == 0xE0 && aByte < 0xA0) { - throw new MalformedInputException(count); - } - if (leadByte == 0xED && aByte > 0x9F) { - throw new MalformedInputException(count); - } - // falls through to regular trail-byte test!! - case TRAIL_BYTE: - if (aByte < 0x80 || aByte > 0xBF) { - throw new MalformedInputException(count); - } - if (--length == 0) { - state = LEAD_BYTE; - } else { - state = TRAIL_BYTE; - } - break; - default: - break; + break; } // switch (state) count++; } @@ -556,25 +559,26 @@ public static void validateUTF8(byte[] utf8, int start, int len) * six byte sequences. */ static final int[] bytesFromUTF8 = - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - // trail bytes - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, - 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + // trail bytes + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5}; /** * Returns the next code point at the current position in the buffer. The buffer's position will be incremented. Any * mark set on this buffer will be changed by this method! + * * @param bytes the incoming bytes * @return the corresponding unicode codepoint */ @@ -583,30 +587,29 @@ public static int bytesToCodePoint(ByteBuffer bytes) { byte b = bytes.get(); bytes.reset(); int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; - if (extraBytesToRead < 0) - { + if (extraBytesToRead < 0) { return -1; // trailing byte! 
} int ch = 0; switch (extraBytesToRead) { - case 5: - ch += (bytes.get() & 0xFF); - ch <<= 6; /* remember, illegal UTF-8 */ - case 4: - ch += (bytes.get() & 0xFF); - ch <<= 6; /* remember, illegal UTF-8 */ - case 3: - ch += (bytes.get() & 0xFF); - ch <<= 6; - case 2: - ch += (bytes.get() & 0xFF); - ch <<= 6; - case 1: - ch += (bytes.get() & 0xFF); - ch <<= 6; - case 0: - ch += (bytes.get() & 0xFF); + case 5: + ch += (bytes.get() & 0xFF); + ch <<= 6; /* remember, illegal UTF-8 */ + case 4: + ch += (bytes.get() & 0xFF); + ch <<= 6; /* remember, illegal UTF-8 */ + case 3: + ch += (bytes.get() & 0xFF); + ch <<= 6; + case 2: + ch += (bytes.get() & 0xFF); + ch <<= 6; + case 1: + ch += (bytes.get() & 0xFF); + ch <<= 6; + case 0: + ch += (bytes.get() & 0xFF); } ch -= offsetsFromUTF8[extraBytesToRead]; @@ -614,14 +617,13 @@ public static int bytesToCodePoint(ByteBuffer bytes) { } static final int offsetsFromUTF8[] = - { 0x00000000, 0x00003080, - 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; + {0x00000000, 0x00003080, + 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080}; /** * For the given string, returns the number of UTF-8 bytes required to encode the string. * - * @param string - * text to encode + * @param string text to encode * @return number of UTF-8 bytes required to encode */ public static int utf8Length(String string) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java b/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java index 6e68d55226266..314ffdcb6637c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java @@ -15,13 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.util; import org.apache.arrow.vector.ValueVector; public interface TransferPair { public void transfer(); + public void splitAndTransfer(int startIndex, int length); + public ValueVector getTo(); + public void copyValueSafe(int from, int to); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index b70a63fe7dd02..5851bd5fa5d97 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.util; import java.util.Arrays; @@ -68,7 +69,7 @@ public static void compareDictionaries(List<DictionaryEncoding> encodings1, List Dictionary dict2 = provider2.lookup(id); if (dict1 == null || dict2 == null) { - throw new IllegalArgumentException("The DictionaryProvider did not contain the required dictionary with id: " + id +"\n" + dict1 + "\n" + dict2); + throw new IllegalArgumentException("The DictionaryProvider did not contain the required dictionary with id: " + id + "\n" + dict1 + "\n" + dict2); } try { @@ -132,9 +133,9 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { ArrowType.FloatingPoint fpType = (ArrowType.FloatingPoint) type; switch (fpType.getPrecision()) { case DOUBLE: - return equalEnough((Double)o1, (Double)o2); + return equalEnough((Double) o1, (Double) o2); case SINGLE: - return equalEnough((Float)o1, (Float)o2); + return equalEnough((Float) o1, (Float) o2); case HALF: default: throw new UnsupportedOperationException("unsupported precision: " + fpType); diff --git a/java/vector/src/main/java/org/joda/time/LocalDateTimes.java b/java/vector/src/main/java/org/joda/time/LocalDateTimes.java index e4f999e1d828e..a1c18fe9a5f41 100644 --- a/java/vector/src/main/java/org/joda/time/LocalDateTimes.java +++ b/java/vector/src/main/java/org/joda/time/LocalDateTimes.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.joda.time; /** * Workaround to access package protected fields in JODA - * */ public class LocalDateTimes { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java b/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java index f775f1d2d67af..febd59fba7408 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import org.apache.arrow.memory.BufferManager; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java index 194b78585faaf..495bed389e568 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -75,11 +76,10 @@ public void testSplitAndTransfer() throws Exception { sourceVector.allocateNew(40); /* populate the bitvector -- 010101010101010101010101..... 
*/ - for(int i = 0; i < 40; i++) { - if((i & 1) == 1) { + for (int i = 0; i < 40; i++) { + if ((i & 1) == 1) { sourceMutator.set(i, 1); - } - else { + } else { sourceMutator.set(i, 0); } } @@ -87,18 +87,17 @@ public void testSplitAndTransfer() throws Exception { sourceMutator.setValueCount(40); /* check the vector output */ - for(int i = 0; i < 40; i++) { + for (int i = 0; i < 40; i++) { int result = sourceAccessor.get(i); - if((i & 1) == 1) { + if ((i & 1) == 1) { assertEquals(Integer.toString(1), Integer.toString(result)); - } - else { + } else { assertEquals(Integer.toString(0), Integer.toString(result)); } } final TransferPair transferPair = sourceVector.getTransferPair(allocator); - final BitVector toVector = (BitVector)transferPair.getTo(); + final BitVector toVector = (BitVector) transferPair.getTo(); final BitVector.Accessor toAccessor = toVector.getAccessor(); final BitVector.Mutator toMutator = toVector.getMutator(); @@ -110,13 +109,13 @@ public void testSplitAndTransfer() throws Exception { * (2.1) the length is a multiple of 8 * (2.2) the length is not a multiple of 8 */ - final int[][] transferLengths = { {0, 8}, /* (1) */ - {8, 10}, /* (1) */ - {18, 0}, /* zero length scenario */ - {18, 8}, /* (2.1) */ - {26, 0}, /* zero length scenario */ - {26, 14} /* (2.2) */ - }; + final int[][] transferLengths = {{0, 8}, /* (1) */ + {8, 10}, /* (1) */ + {18, 0}, /* zero length scenario */ + {18, 8}, /* (2.1) */ + {26, 0}, /* zero length scenario */ + {26, 14} /* (2.2) */ + }; for (final int[] transferLength : transferLengths) { final int start = transferLength[0]; @@ -127,10 +126,9 @@ public void testSplitAndTransfer() throws Exception { /* check the toVector output after doing splitAndTransfer */ for (int i = 0; i < length; i++) { int result = toAccessor.get(i); - if((i & 1) == 1) { + if ((i & 1) == 1) { assertEquals(Integer.toString(1), Integer.toString(result)); - } - else { + } else { assertEquals(Integer.toString(0), Integer.toString(result)); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java index 08e3700daeebf..54fc306717088 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -65,7 +66,7 @@ public void testTransferVariableidth() { v1.makeTransferPair(v2).transfer(); assertEquals(0, childAllocator1.getAllocatedMemory()); - int expected = 8*4096 + 4*4096 + 4096; + int expected = 8 * 4096 + 4 * 4096 + 4096; assertEquals(expected, childAllocator2.getAllocatedMemory()); } @@ -90,11 +91,11 @@ public void emptyListTransferShouldNotTriggerSchemaChange() { final Pointer trigger1 = new Pointer<>(); final Pointer trigger2 = new Pointer<>(); final ListVector v1 = new ListVector("v1", allocator, - FieldType.nullable(ArrowType.Null.INSTANCE), - newTriggerCallback(trigger1)); + FieldType.nullable(ArrowType.Null.INSTANCE), + newTriggerCallback(trigger1)); final ListVector v2 = new ListVector("v2", allocator, - FieldType.nullable(ArrowType.Null.INSTANCE), - newTriggerCallback(trigger2)); + FieldType.nullable(ArrowType.Null.INSTANCE), + newTriggerCallback(trigger2)); v1.makeTransferPair(v2).transfer(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java index b98c24d189528..774fbe084f1c2 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -39,6 +40,7 @@ public class TestDecimalVector { intValues[2 * i] = -1 * (1 << i + 1); } } + private int scale = 3; @Test diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index 3bf3b1cedff38..f2db9baac04ca 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.apache.arrow.vector.TestUtils.newNullableVarCharVector; @@ -35,8 +36,8 @@ public class TestDictionaryVector { private BufferAllocator allocator; byte[] zero = "foo".getBytes(StandardCharsets.UTF_8); - byte[] one = "bar".getBytes(StandardCharsets.UTF_8); - byte[] two = "baz".getBytes(StandardCharsets.UTF_8); + byte[] one = "bar".getBytes(StandardCharsets.UTF_8); + byte[] two = "baz".getBytes(StandardCharsets.UTF_8); @Before public void init() { @@ -74,7 +75,7 @@ public void testEncodeStrings() { Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); - try(final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) { + try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) { // verify indices assertEquals(NullableIntVector.class, encoded.getClass()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java index 304db9dcc00bd..5677f2566797a 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import com.google.common.collect.Lists; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index eb30fdd46bf60..fdb576ef75cc4 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -15,12 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertFalse; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.ListVector; @@ -100,11 +101,11 @@ public void testCopyFrom() throws Exception { /* index 0 */ Object result = accessor.getObject(0); - ArrayList resultSet = (ArrayList)result; + ArrayList resultSet = (ArrayList) result; assertEquals(3, resultSet.size()); - assertEquals(new Long(1), (Long)resultSet.get(0)); - assertEquals(new Long(2), (Long)resultSet.get(1)); - assertEquals(new Long(3), (Long)resultSet.get(2)); + assertEquals(new Long(1), (Long) resultSet.get(0)); + assertEquals(new Long(2), (Long) resultSet.get(1)); + assertEquals(new Long(3), (Long) resultSet.get(2)); /* index 1 */ result = accessor.getObject(1); @@ -112,7 +113,7 @@ public void testCopyFrom() throws Exception { /* index 2 */ result = accessor.getObject(2); - resultSet = (ArrayList)result; + resultSet = (ArrayList) result; assertEquals(0, resultSet.size()); } } @@ -130,11 +131,11 @@ public void testSetLastSetUsage() throws Exception { /* get inner vectors; bitVector and offsetVector */ List innerVectors = listVector.getFieldInnerVectors(); - BitVector bitVector = (BitVector)innerVectors.get(0); - UInt4Vector offsetVector = (UInt4Vector)innerVectors.get(1); + BitVector bitVector = (BitVector) innerVectors.get(0); + UInt4Vector offsetVector = (UInt4Vector) innerVectors.get(1); /* get the underlying data vector -- NullableBigIntVector */ - NullableBigIntVector dataVector = (NullableBigIntVector)listVector.getDataVector(); + NullableBigIntVector dataVector = (NullableBigIntVector) listVector.getDataVector(); /* check current lastSet */ assertEquals(Integer.toString(0), Integer.toString(listVector.getMutator().getLastSet())); @@ -220,36 +221,36 @@ public void testSetLastSetUsage() throws Exception { assertEquals(Integer.toString(0), Integer.toString(offset)); Object actual = valueAccessor.getObject(offset); - assertEquals(new Long(10), (Long)actual); + assertEquals(new Long(10), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(11), (Long)actual); + assertEquals(new Long(11), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(12), (Long)actual); + assertEquals(new Long(12), (Long) actual); index++; offset = offsetAccessor.get(index); assertEquals(Integer.toString(3), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(13), (Long)actual); + assertEquals(new Long(13), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(14), (Long)actual); + assertEquals(new Long(14), (Long) actual); index++; offset = offsetAccessor.get(index); assertEquals(Integer.toString(5), Integer.toString(offset)); actual = valueAccessor.getObject(offsetAccessor.get(index)); - assertEquals(new Long(15), (Long)actual); + assertEquals(new Long(15), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(16), (Long)actual); + assertEquals(new Long(16), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(17), (Long)actual); + assertEquals(new Long(17), (Long) actual); index++; offset = offsetAccessor.get(index); @@ -313,10 +314,10 @@ public void testSplitAndTransfer() throws Exception { 
assertEquals(5, listVector.getMutator().getLastSet()); /* get offsetVector */ - UInt4Vector offsetVector = (UInt4Vector)listVector.getOffsetVector(); + UInt4Vector offsetVector = (UInt4Vector) listVector.getOffsetVector(); /* get dataVector */ - NullableBigIntVector dataVector = (NullableBigIntVector)listVector.getDataVector(); + NullableBigIntVector dataVector = (NullableBigIntVector) listVector.getDataVector(); /* check the vector output */ final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); @@ -332,13 +333,13 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(0), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(10), (Long)actual); + assertEquals(new Long(10), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(11), (Long)actual); + assertEquals(new Long(11), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(12), (Long)actual); + assertEquals(new Long(12), (Long) actual); /* index 1 */ index++; @@ -347,10 +348,10 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(3), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(13), (Long)actual); + assertEquals(new Long(13), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(14), (Long)actual); + assertEquals(new Long(14), (Long) actual); /* index 2 */ index++; @@ -359,16 +360,16 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(5), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(15), (Long)actual); + assertEquals(new Long(15), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(16), (Long)actual); + assertEquals(new Long(16), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(17), (Long)actual); + assertEquals(new Long(17), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(18), (Long)actual); + assertEquals(new Long(18), (Long) actual); /* index 3 */ index++; @@ -377,7 +378,7 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(9), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(19), (Long)actual); + assertEquals(new Long(19), (Long) actual); /* index 4 */ index++; @@ -386,16 +387,16 @@ public void testSplitAndTransfer() throws Exception { assertEquals(Integer.toString(10), Integer.toString(offset)); actual = valueAccessor.getObject(offset); - assertEquals(new Long(20), (Long)actual); + assertEquals(new Long(20), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(21), (Long)actual); + assertEquals(new Long(21), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(22), (Long)actual); + assertEquals(new Long(22), (Long) actual); offset++; actual = valueAccessor.getObject(offset); - assertEquals(new Long(23), (Long)actual); + assertEquals(new Long(23), (Long) actual); /* index 5 */ index++; @@ -423,26 +424,26 @@ public void testSplitAndTransfer() throws Exception { transferPair.splitAndTransfer(start, splitLength); /* get offsetVector of toVector */ - UInt4Vector offsetVector1 = (UInt4Vector)toVector.getOffsetVector(); + UInt4Vector offsetVector1 = (UInt4Vector) toVector.getOffsetVector(); 
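/* Note (annotation, not part of the original patch): after splitAndTransfer(start, splitLength), element i of toVector corresponds to element (start + i) of the source vector; the loop below verifies both the per-element lengths (computed as offset differences) and the copied values. */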
UInt4Vector.Accessor offsetAccessor1 = offsetVector1.getAccessor(); /* get dataVector of toVector */ - NullableBigIntVector dataVector1 = (NullableBigIntVector)toVector.getDataVector(); + NullableBigIntVector dataVector1 = (NullableBigIntVector) toVector.getDataVector(); NullableBigIntVector.Accessor valueAccessor1 = dataVector1.getAccessor(); - for(int i = 0; i < splitLength; i++) { + for (int i = 0; i < splitLength; i++) { dataLength1 = offsetAccessor.get(start + i + 1) - offsetAccessor.get(start + i); dataLength2 = offsetAccessor1.get(i + 1) - offsetAccessor1.get(i); assertEquals("Different data lengths at index: " + i + " and start: " + start, - dataLength1, dataLength2); + dataLength1, dataLength2); offset1 = offsetAccessor.get(start + i); offset2 = offsetAccessor1.get(i); - for(int j = 0; j < dataLength1; j++) { + for (int j = 0; j < dataLength1; j++) { assertEquals("Different data at indexes: " + offset1 + " and " + offset2, - valueAccessor.getObject(offset1), valueAccessor1.getObject(offset2)); + valueAccessor.getObject(offset1), valueAccessor1.getObject(offset2)); offset1++; offset2++; @@ -515,7 +516,7 @@ public void testNestedListVector() throws Exception { /* get listVector value at index 0 -- the value itself is a listvector */ Object result = accessor.getObject(0); - ArrayList> resultSet = (ArrayList>)result; + ArrayList> resultSet = (ArrayList>) result; ArrayList list; assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ @@ -535,7 +536,7 @@ public void testNestedListVector() throws Exception { /* get listVector value at index 1 -- the value itself is a listvector */ result = accessor.getObject(1); - resultSet = (ArrayList>)result; + resultSet = (ArrayList>) result; assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ assertEquals(1, resultSet.get(0).size()); /* size of first inner list */ diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java index 9baebc5a2992c..ba2ebbf05ad0d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java @@ -90,7 +90,7 @@ public void testBitVectorReallocation() { // common: value count < MAX_VALUE_ALLOCATION try { vector.allocateNew(expectedValueCapacity); - for (int i=0; i<3;i++) { + for (int i = 0; i < 3; i++) { vector.reAlloc(); // expand buffer size } assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java index a5159242d76f9..a75b196fbcc30 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; @@ -98,7 +99,7 @@ public void testTransfer() throws Exception { mutator.setSafe(5, newBitHolder(false)); mutator.setValueCount(6); - try(UnionVector destVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector destVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { TransferPair pair = srcVector.makeTransferPair(destVector); // Creating the transfer should transfer the type of the field at least. @@ -111,7 +112,7 @@ public void testTransfer() throws Exception { // now check the values are transferred assertEquals(srcVector.getAccessor().getValueCount(), destVector.getAccessor().getValueCount()); - for(int i=0; i T newVector(Class c, String name, ArrowType type, BufferAllocator allocator) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 159d534a31072..72214ed2ed6fb 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static org.apache.arrow.vector.TestUtils.newNullableVarCharVector; @@ -394,7 +395,7 @@ public void testBitVectorRangeSetAllOnes() { } private void validateRange(int length, int start, int count) { - String desc = "[" + start + ", " + (start + count) + ") "; + String desc = "[" + start + ", " + (start + count) + ") "; try (BitVector bitVector = new BitVector("bits", allocator)) { bitVector.reset(); bitVector.allocateNew(length); @@ -489,7 +490,7 @@ public void testFillEmptiesNotOverfill() { @Test public void testCopyFromWithNulls() { try (final NullableVarCharVector vector = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); - final NullableVarCharVector vector2 = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + final NullableVarCharVector vector2 = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); for (int i = 0; i < 4095; i++) { @@ -608,15 +609,15 @@ public void testVectorLoadUnload() { VectorUnloader vectorUnloader = new VectorUnloader(schemaRoot1); try ( - ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); - BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("new vector", 0, Long.MAX_VALUE); - VectorSchemaRoot schemaRoot2 = VectorSchemaRoot.create(schema, finalVectorsAllocator); + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("new vector", 0, Long.MAX_VALUE); + VectorSchemaRoot schemaRoot2 = VectorSchemaRoot.create(schema, finalVectorsAllocator); ) { VectorLoader vectorLoader = new VectorLoader(schemaRoot2); vectorLoader.load(recordBatch); - NullableVarCharVector vector2 = (NullableVarCharVector)schemaRoot2.getVector(fieldName); + NullableVarCharVector vector2 = (NullableVarCharVector) schemaRoot2.getVector(fieldName); NullableVarCharVector.Mutator mutator2 = vector2.getMutator(); /* diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index a5d5527539322..4ac7536c017db 100644 --- 
a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java index f3694659a8f51..7facf73f511da 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector; import static java.util.Arrays.asList; @@ -176,8 +177,9 @@ public void testUnloadLoadAddPadding() throws IOException { /** * The validity buffer can be empty if: - * - all values are defined - * - all values are null + * - all values are defined + * - all values are null + * * @throws IOException */ @Test @@ -185,7 +187,7 @@ public void testLoadEmptyValidityBuffer() throws IOException { Schema schema = new Schema(asList( new Field("intDefined", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), new Field("intNull", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) - )); + )); int count = 10; ArrowBuf validity = allocator.buffer(10).slice(0, 0); ArrowBuf[] values = new ArrowBuf[2]; @@ -208,8 +210,8 @@ public void testLoadEmptyValidityBuffer() throws IOException { vectorLoader.load(recordBatch); - NullableIntVector intDefinedVector = (NullableIntVector)newRoot.getVector("intDefined"); - NullableIntVector intNullVector = (NullableIntVector)newRoot.getVector("intNull"); + NullableIntVector intDefinedVector = (NullableIntVector) newRoot.getVector("intDefined"); + NullableIntVector intNullVector = (NullableIntVector) newRoot.getVector("intNull"); for (int i = 0; i < count; i++) { assertFalse("#" + i, intDefinedVector.getAccessor().isNull(i)); assertEquals("#" + i, i, intDefinedVector.getAccessor().get(i)); @@ -244,20 +246,20 @@ public void testLoadEmptyValidityBuffer() throws IOException { public void testUnloadLoadDuplicates() throws IOException { int count = 10; Schema schema = new Schema(asList( - new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), - new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) + new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), + new Field("duplicate", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) )); try ( - BufferAllocator originalVectorsAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + BufferAllocator originalVectorsAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); ) { List sources = new ArrayList<>(); - for (Field field: schema.getFields()) { + for (Field field : schema.getFields()) { FieldVector vector = field.createVector(originalVectorsAllocator); vector.allocateNew(); sources.add(vector); NullableIntVector.Mutator mutator = (NullableIntVector.Mutator) vector.getMutator(); - for (int i = 0; i < count; i++) { + for (int i = 0; i < count; i++) { mutator.set(i, i); } mutator.setValueCount(count); @@ -266,8 +268,8 
@@ public void testUnloadLoadDuplicates() throws IOException { try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), sources, count)) { VectorUnloader vectorUnloader = new VectorUnloader(root); try (ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); - BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); - VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator);) { + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator);) { // load it VectorLoader vectorLoader = new VectorLoader(newRoot); vectorLoader.load(recordBatch); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index e826fa53b0977..97efb7d5a6d30 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.complex.impl; import static org.junit.Assert.assertEquals; @@ -67,7 +68,7 @@ public void testPromoteToUnion() throws Exception { writer.setPosition(1); writer.bit("A").writeBit(1); - writer.decimal("dec", 10,10); + writer.decimal("dec", 10, 10); writer.setPosition(2); writer.integer("A").writeInt(10); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 5a9c80dc124a2..f81cd557a9d8f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.complex.writer; import static org.junit.Assert.*; @@ -217,7 +218,7 @@ public void listScalarType() { for (int i = 0; i < COUNT; i++) { listWriter.startList(); for (int j = 0; j < i % 7; j++) { - if (j%2 == 0) { + if (j % 2 == 0) { listWriter.writeInt(j); } else { IntHolder holder = new IntHolder(); @@ -259,7 +260,7 @@ public void listScalarTypeNullable() { listReader.setPosition(i); if (i % 2 == 0) { assertTrue("index is set: " + i, listReader.isSet()); - assertEquals("correct length at: " + i, i % 7, ((List)listReader.readObject()).size()); + assertEquals("correct length at: " + i, i % 7, ((List) listReader.readObject()).size()); } else { assertFalse("index is not set: " + i, listReader.isSet()); assertNull("index is not set: " + i, listReader.readObject()); @@ -529,10 +530,10 @@ public void promotableWriterSchema() { private Set getFieldNames(List fields) { Set fieldNames = new HashSet<>(); - for (Field field: fields) { + for (Field field : fields) { fieldNames.add(field.getName()); if (!field.getChildren().isEmpty()) { - for (String name: getFieldNames(field.getChildren())) { + for (String name : getFieldNames(field.getChildren())) { fieldNames.add(field.getName() + "::" + name); } } @@ -698,7 +699,7 @@ private void checkTimestampField(Field field, String name) { private void checkTimestampTZField(Field field, String name, String tz) { checkTimestampField(field, name); - Assert.assertEquals(tz, ((Timestamp)field.getType()).getTimezone()); + Assert.assertEquals(tz, ((Timestamp) field.getType()).getTimezone()); } @Test @@ -824,13 +825,13 @@ public void complexCopierWithList() { TransferPair tp = mapVector.getTransferPair(allocator); tp.splitAndTransfer(0, 1); MapVector toMapVector = (MapVector) tp.getTo(); - JsonStringHashMap toMapValue = (JsonStringHashMap) toMapVector.getAccessor().getObject(0); + JsonStringHashMap toMapValue = (JsonStringHashMap) toMapVector.getAccessor().getObject(0); JsonStringArrayList object = (JsonStringArrayList) toMapValue.get("list"); assertEquals(1, object.get(0)); assertEquals(2, object.get(1)); - JsonStringHashMap innerMap = (JsonStringHashMap) object.get(2); + JsonStringHashMap innerMap = (JsonStringHashMap) object.get(2); assertEquals(1, innerMap.get("a")); - innerMap = (JsonStringHashMap) object.get(3); + innerMap = (JsonStringHashMap) object.get(3); assertEquals(2, innerMap.get("a")); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java index 3f717cbc18b6e..732bd98b7c61c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import java.nio.charset.StandardCharsets; @@ -155,7 +156,7 @@ protected void validateComplexContent(int count, VectorSchemaRoot root) { Assert.assertNull(intVal); } Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); - Assert.assertEquals(i % 3, ((List)root.getVector("list").getAccessor().getObject(i)).size()); + Assert.assertEquals(i % 3, ((List) root.getVector("list").getAccessor().getObject(i)).size()); NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); FieldReader mapReader = root.getVector("map").getReader(); mapReader.setPosition(i); @@ -198,11 +199,11 @@ protected void validateDateTimeContent(int count, VectorSchemaRoot root) { Assert.assertEquals(count, root.getRowCount()); printVectors(root.getFieldVectors()); for (int i = 0; i < count; i++) { - long dateVal = ((NullableDateMilliVector)root.getVector("date")).getAccessor().get(i); + long dateVal = ((NullableDateMilliVector) root.getVector("date")).getAccessor().get(i); LocalDateTime dt = makeDateTimeFromCount(i); LocalDateTime dateExpected = dt.minusMillis(dt.getMillisOfDay()); Assert.assertEquals(DateUtility.toMillis(dateExpected), dateVal); - long timeVal = ((NullableTimeMilliVector)root.getVector("time")).getAccessor().get(i); + long timeVal = ((NullableTimeMilliVector) root.getVector("time")).getAccessor().get(i); Assert.assertEquals(dt.getMillisOfDay(), timeVal); Object timestampMilliVal = root.getVector("timestamp-milli").getAccessor().getObject(i); Assert.assertEquals(dt, timestampMilliVal); @@ -450,20 +451,20 @@ public void validateUnionData(int count, VectorSchemaRoot root) { for (int i = 0; i < count; i++) { unionReader.setPosition(i); switch (i % 4) { - case 0: - Assert.assertEquals(i, unionReader.readInteger().intValue()); - break; - case 1: - Assert.assertEquals(i, unionReader.readLong().longValue()); - break; - case 2: - Assert.assertEquals(i % 3, unionReader.size()); - break; - case 3: - NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); - unionReader.reader("timestamp").read(h); - Assert.assertEquals(i, h.value); - break; + case 0: + Assert.assertEquals(i, unionReader.readInteger().intValue()); + break; + case 1: + Assert.assertEquals(i, unionReader.readLong().longValue()); + break; + case 2: + Assert.assertEquals(i % 3, unionReader.size()); + break; + case 3: + NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); + unionReader.reader("timestamp").read(h); + Assert.assertEquals(i, h.value); + break; } } } @@ -483,28 +484,28 @@ public void writeUnionData(int count, NullableMapVector parent) { MapWriter mapWriter = rootWriter.map("union"); for (int i = 0; i < count; i++) { switch (i % 4) { - case 0: - intWriter.setPosition(i); - intWriter.writeInt(i); - break; - case 1: - bigIntWriter.setPosition(i); - bigIntWriter.writeBigInt(i); - break; - case 2: - listWriter.setPosition(i); - listWriter.startList(); - for (int j = 0; j < i % 3; j++) { - listWriter.varChar().writeVarChar(0, 3, varchar); - } - listWriter.endList(); - break; - case 3: - mapWriter.setPosition(i); - mapWriter.start(); - mapWriter.timeStampMilli("timestamp").writeTimeStampMilli(i); - mapWriter.end(); - break; + case 0: + intWriter.setPosition(i); + intWriter.writeInt(i); + break; + case 1: + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + break; + case 2: + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varChar().writeVarChar(0, 3, varchar); + } + 
listWriter.endList(); + break; + case 3: + mapWriter.setPosition(i); + mapWriter.start(); + mapWriter.timeStampMilli("timestamp").writeTimeStampMilli(i); + mapWriter.end(); + break; } } writer.setValueCount(count); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index 90fb5d252d68d..c483ba7de91c6 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import java.io.ByteArrayInputStream; @@ -107,19 +108,19 @@ public void testWriteRead() throws IOException { // read try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); FileInputStream fileInputStream = new FileInputStream(file); - ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator){ - @Override - protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator allocator) throws IOException { - ArrowMessage message = super.readMessage(in, allocator); - if (message != null) { - ArrowRecordBatch batch = (ArrowRecordBatch) message; - List buffersLayout = batch.getBuffersLayout(); - for (ArrowBuffer arrowBuffer : buffersLayout) { - Assert.assertEquals(0, arrowBuffer.getOffset() % 8); - } - } - return message; - } + ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator) { + @Override + protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator allocator) throws IOException { + ArrowMessage message = super.readMessage(in, allocator); + if (message != null) { + ArrowRecordBatch batch = (ArrowRecordBatch) message; + List buffersLayout = batch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + } + } + return message; + } }) { Schema schema = arrowReader.getVectorSchemaRoot().getSchema(); LOGGER.debug("reading schema: " + schema); @@ -134,7 +135,7 @@ protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator alloc // Read from stream. 
try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); ByteArrayInputStream input = new ByteArrayInputStream(stream.toByteArray()); - ArrowStreamReader arrowReader = new ArrowStreamReader(input, readerAllocator){ + ArrowStreamReader arrowReader = new ArrowStreamReader(input, readerAllocator) { @Override protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException { ArrowMessage message = super.readMessage(in, allocator); @@ -203,17 +204,17 @@ public void testWriteReadComplex() throws IOException { public void testWriteReadMultipleRBs() throws IOException { File file = new File("target/mytest_multiple.arrow"); ByteArrayOutputStream stream = new ByteArrayOutputStream(); - int[] counts = { 10, 5 }; + int[] counts = {10, 5}; // write try (BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); MapVector parent = MapVector.empty("parent", originalVectorAllocator); - FileOutputStream fileOutputStream = new FileOutputStream(file)){ + FileOutputStream fileOutputStream = new FileOutputStream(file)) { writeData(counts[0], parent); VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); - try(ArrowFileWriter fileWriter = new ArrowFileWriter(root, null, fileOutputStream.getChannel()); - ArrowStreamWriter streamWriter = new ArrowStreamWriter(root, null, stream)) { + try (ArrowFileWriter fileWriter = new ArrowFileWriter(root, null, fileOutputStream.getChannel()); + ArrowStreamWriter streamWriter = new ArrowStreamWriter(root, null, stream)) { fileWriter.start(); streamWriter.start(); @@ -318,7 +319,7 @@ public void testWriteReadTiny() throws IOException { root.getFieldVectors().get(0).allocateNew(); NullableTinyIntVector.Mutator mutator = (NullableTinyIntVector.Mutator) root.getFieldVectors().get(0).getMutator(); for (int i = 0; i < 16; i++) { - mutator.set(i, i < 8 ? 1 : 0, (byte)(i + 1)); + mutator.set(i, i < 8 ? 
1 : 0, (byte) (i + 1)); } mutator.setValueCount(16); root.setRowCount(16); @@ -367,7 +368,7 @@ private void validateTinyData(VectorSchemaRoot root) { NullableTinyIntVector vector = (NullableTinyIntVector) root.getFieldVectors().get(0); for (int i = 0; i < 16; i++) { if (i < 8) { - Assert.assertEquals((byte)(i + 1), vector.getAccessor().get(i)); + Assert.assertEquals((byte) (i + 1), vector.getAccessor().get(i)); } else { Assert.assertTrue(vector.getAccessor().isNull(i)); } @@ -384,7 +385,7 @@ public void testWriteReadMetadata() throws IOException { childFields.add(new Field("float-child", new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null, metadata(2)), null)); childFields.add(new Field("int-child", new FieldType(false, new ArrowType.Int(32, true), null, metadata(3)), null)); childFields.add(new Field("list-child", new FieldType(true, ArrowType.List.INSTANCE, null, metadata(4)), - ImmutableList.of(new Field("l1", FieldType.nullable(new ArrowType.Int(16 ,true)), null)))); + ImmutableList.of(new Field("l1", FieldType.nullable(new ArrowType.Int(16, true)), null)))); Field field = new Field("meta", new FieldType(true, ArrowType.Struct.INSTANCE, null, metadata(0)), childFields); Map metadata = new HashMap<>(); metadata.put("s1", "v1"); @@ -425,7 +426,7 @@ public void testWriteReadMetadata() throws IOException { Assert.assertEquals(originalSchema.getCustomMetadata(), schema.getCustomMetadata()); Field top = schema.getFields().get(0); Assert.assertEquals(metadata(0), top.getMetadata()); - for (int i = 0; i < 4; i ++) { + for (int i = 0; i < 4; i++) { Assert.assertEquals(metadata(i + 1), top.getChildren().get(i).getMetadata()); } } @@ -441,7 +442,7 @@ public void testWriteReadMetadata() throws IOException { Assert.assertEquals(originalSchema.getCustomMetadata(), schema.getCustomMetadata()); Field top = schema.getFields().get(0); Assert.assertEquals(metadata(0), top.getMetadata()); - for (int i = 0; i < 4; i ++) { + for (int i = 0; i < 4; i++) { Assert.assertEquals(metadata(i + 1), top.getChildren().get(i).getMetadata()); } } @@ -475,7 +476,7 @@ public void testWriteReadDictionary() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } @@ -534,7 +535,7 @@ public void testWriteReadNestedDictionary() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java index 3014e64b4eea4..4612465323130 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file; import static java.util.Arrays.asList; @@ -40,7 +41,7 @@ public class TestArrowFooter { public void test() { Schema schema = new Schema(asList( new Field("a", FieldType.nullable(new ArrowType.Int(8, true)), Collections.emptyList()) - )); + )); ArrowFooter footer = new ArrowFooter(schema, Collections.emptyList(), Collections.emptyList()); ArrowFooter newFooter = roundTrip(footer); assertEquals(footer, newFooter); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java index 55629d5107c86..65332aa2c7de2 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import static java.nio.channels.Channels.newChannel; @@ -77,15 +78,15 @@ public void test() throws IOException { FieldVector vector = TestUtils.newVector(FieldVector.class, "testField", type, allocator); vector.initializeChildrenFromFields(schema.getFields().get(0).getChildren()); - byte[] validity = new byte[] { (byte) 255, 0}; + byte[] validity = new byte[] {(byte) 255, 0}; // second half is "undefined" - byte[] values = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + byte[] values = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; ByteArrayOutputStream out = new ByteArrayOutputStream(); try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), asList(vector), 16); ArrowFileWriter writer = new ArrowFileWriter(root, null, newChannel(out))) { ArrowBuf validityb = buf(validity); - ArrowBuf valuesb = buf(values); + ArrowBuf valuesb = buf(values); writer.writeRecordBatch(new ArrowRecordBatch(16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb))); } @@ -113,7 +114,7 @@ public void test() throws IOException { // Read just the header. This demonstrates being able to read without need to // deserialize the buffer. ByteBuffer headerBuffer = ByteBuffer.allocate(recordBatches.get(0).getMetadataLength()); - headerBuffer.put(byteArray, (int)recordBatches.get(0).getOffset(), headerBuffer.capacity()); + headerBuffer.put(byteArray, (int) recordBatches.get(0).getOffset(), headerBuffer.capacity()); headerBuffer.position(4); Message messageFB = Message.getRootAsMessage(headerBuffer); RecordBatch recordBatchFB = (RecordBatch) messageFB.header(new RecordBatch()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java index 7e9afd381c181..e2efabef0095b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import static java.util.Arrays.asList; @@ -71,7 +72,7 @@ public void testReadWrite() throws IOException { root.getFieldVectors().get(0).allocateNew(); NullableTinyIntVector.Mutator mutator = (NullableTinyIntVector.Mutator) root.getFieldVectors().get(0).getMutator(); for (int i = 0; i < 16; i++) { - mutator.set(i, i < 8 ? 
1 : 0, (byte)(i + 1)); + mutator.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); } mutator.setValueCount(16); root.setRowCount(16); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java index 20d4482da7c98..a19c3795fd5bb 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.file; import static org.junit.Assert.assertEquals; @@ -65,7 +66,7 @@ public void run() { // Send a changing batch id first mutator.set(0, j); for (int i = 1; i < 16; i++) { - mutator.set(i, i < 8 ? 1 : 0, (byte)(i + 1)); + mutator.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); } mutator.setValueCount(16); root.setRowCount(16); @@ -80,7 +81,9 @@ public void run() { } } - public long bytesWritten() { return writer.bytesWritten(); } + public long bytesWritten() { + return writer.bytesWritten(); + } } private final class ReaderThread extends Thread { @@ -104,6 +107,7 @@ protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) th } return message; } + @Override public boolean loadNextBatch() throws IOException { if (!super.loadNextBatch()) { @@ -113,10 +117,10 @@ public boolean loadNextBatch() throws IOException { VectorSchemaRoot root = getVectorSchemaRoot(); Assert.assertEquals(16, root.getRowCount()); NullableTinyIntVector vector = (NullableTinyIntVector) root.getFieldVectors().get(0); - Assert.assertEquals((byte)(batchesRead - 1), vector.getAccessor().get(0)); + Assert.assertEquals((byte) (batchesRead - 1), vector.getAccessor().get(0)); for (int i = 1; i < 16; i++) { if (i < 8) { - Assert.assertEquals((byte)(i + 1), vector.getAccessor().get(i)); + Assert.assertEquals((byte) (i + 1), vector.getAccessor().get(i)); } else { Assert.assertTrue(vector.getAccessor().isNull(i)); } @@ -143,8 +147,13 @@ public void run() { } } - public int getBatchesRead() { return batchesRead; } - public long bytesRead() { return reader.bytesRead(); } + public int getBatchesRead() { + return batchesRead; + } + + public long bytesRead() { + return reader.bytesRead(); + } } // Starts up a producer and consumer thread to read/write batches. diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java index 311cada194eaf..24b2138386da1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.file.json; import java.io.File; @@ -53,7 +54,7 @@ public void testWriteReadComplexJSON() throws IOException { // read try ( BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); - ) { + ) { JsonFileReader reader = new JsonFileReader(file, readerAllocator); Schema schema = reader.start(); LOGGER.debug("reading schema: " + schema); @@ -109,7 +110,7 @@ public void testWriteReadUnionJSON() throws IOException { try ( BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); - ) { + ) { JsonFileReader reader = new JsonFileReader(file, readerAllocator); Schema schema = reader.start(); LOGGER.debug("reading schema: " + schema); @@ -174,7 +175,7 @@ public void testWriteReadDictionaryJSON() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } @@ -215,7 +216,7 @@ public void testWriteReadNestedDictionaryJSON() throws IOException { } // Need to close dictionary vectors - for (long id: provider.getDictionaryIds()) { + for (long id : provider.getDictionaryIds()) { provider.lookup(id).getVector().close(); } } @@ -240,7 +241,7 @@ public void testWriteReadNestedDictionaryJSON() throws IOException { public void testSetStructLength() throws IOException { File file = new File("../../integration/data/struct_example.json"); try ( - BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); ) { JsonFileReader reader = new JsonFileReader(file, readerAllocator); Schema schema = reader.start(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index 62c21f7da0db6..f98aeac8c8196 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.pojo; import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; @@ -92,15 +93,15 @@ public void nestedSchema() { childrenBuilder.add(new Field("child3", FieldType.nullable(new Struct()), ImmutableList.of( new Field("child3.1", FieldType.nullable(Utf8.INSTANCE), null), new Field("child3.2", FieldType.nullable(new FloatingPoint(DOUBLE)), ImmutableList.of()) - ))); + ))); childrenBuilder.add(new Field("child4", FieldType.nullable(new List()), ImmutableList.of( new Field("child4.1", FieldType.nullable(Utf8.INSTANCE), null) - ))); - childrenBuilder.add(new Field("child5", FieldType.nullable(new Union(UnionMode.Sparse, new int[] { MinorType.TIMESTAMPMILLI.ordinal(), MinorType.FLOAT8.ordinal() } )), ImmutableList.of( + ))); + childrenBuilder.add(new Field("child5", FieldType.nullable(new Union(UnionMode.Sparse, new int[] {MinorType.TIMESTAMPMILLI.ordinal(), MinorType.FLOAT8.ordinal()})), ImmutableList.of( new Field("child5.1", FieldType.nullable(new Timestamp(TimeUnit.MILLISECOND, null)), null), new Field("child5.2", FieldType.nullable(new FloatingPoint(DOUBLE)), ImmutableList.of()), new Field("child5.3", true, new Timestamp(TimeUnit.MILLISECOND, "UTC"), null) - ))); + ))); Schema initialSchema = new Schema(childrenBuilder.build()); run(initialSchema); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java b/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java index 9678423c0fbbe..f968768f5e67d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.arrow.vector.stream; import static java.util.Arrays.asList; @@ -107,13 +108,13 @@ public void testdeSerializeRecordBatchLongMetaData() throws IOException { @Test public void testSerializeRecordBatch() throws IOException { - byte[] validity = new byte[] { (byte)255, 0}; + byte[] validity = new byte[] {(byte) 255, 0}; // second half is "undefined" - byte[] values = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + byte[] values = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; BufferAllocator alloc = new RootAllocator(Long.MAX_VALUE); ArrowBuf validityb = buf(alloc, validity); - ArrowBuf valuesb = buf(alloc, values); + ArrowBuf valuesb = buf(alloc, values); ArrowRecordBatch batch = new ArrowRecordBatch( 16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb)); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java index 84cc10787f7b0..43b0907720f83 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.arrow.vector.types.pojo; import static java.util.Arrays.asList; @@ -66,7 +67,7 @@ public void testComplex() throws IOException { field("g", new Timestamp(TimeUnit.MILLISECOND, "UTC")), field("h", new Timestamp(TimeUnit.MICROSECOND, null)), field("i", new Interval(IntervalUnit.DAY_TIME)) - )); + )); roundTrip(schema); assertEquals( "Schema, e: List, f: FloatingPoint(SINGLE), g: Timestamp(MILLISECOND, UTC), h: Timestamp(MICROSECOND, null), i: Interval(DAY_TIME)>", @@ -95,7 +96,7 @@ public void testAll() throws IOException { field("q", new Timestamp(TimeUnit.MILLISECOND, "UTC")), field("r", new Timestamp(TimeUnit.MICROSECOND, null)), field("s", new Interval(IntervalUnit.DAY_TIME)) - )); + )); roundTrip(schema); } @@ -103,7 +104,7 @@ public void testAll() throws IOException { public void testUnion() throws IOException { Schema schema = new Schema(asList( field("d", new Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("da", new Null())) - )); + )); roundTrip(schema); contains(schema, "Sparse"); } @@ -113,7 +114,7 @@ public void testDate() throws IOException { Schema schema = new Schema(asList( field("a", new Date(DateUnit.DAY)), field("b", new Date(DateUnit.MILLISECOND)) - )); + )); roundTrip(schema); assertEquals( "Schema", @@ -123,15 +124,15 @@ public void testDate() throws IOException { @Test public void testTime() throws IOException { Schema schema = new Schema(asList( - field("a", new Time(TimeUnit.SECOND, 32)), - field("b", new Time(TimeUnit.MILLISECOND, 32)), - field("c", new Time(TimeUnit.MICROSECOND, 64)), - field("d", new Time(TimeUnit.NANOSECOND, 64)) + field("a", new Time(TimeUnit.SECOND, 32)), + field("b", new Time(TimeUnit.MILLISECOND, 32)), + field("c", new Time(TimeUnit.MICROSECOND, 64)), + field("d", new Time(TimeUnit.NANOSECOND, 64)) )); roundTrip(schema); assertEquals( - "Schema", - schema.toString()); + "Schema", + schema.toString()); } @Test @@ -145,7 +146,7 @@ public void testTS() throws IOException { field("f", new Timestamp(TimeUnit.MILLISECOND, null)), field("g", new Timestamp(TimeUnit.MICROSECOND, null)), field("h", new Timestamp(TimeUnit.NANOSECOND, null)) - )); + )); roundTrip(schema); assertEquals( "Schema", @@ -157,7 +158,7 @@ public void testInterval() throws IOException { Schema schema = new Schema(asList( field("a", new Interval(IntervalUnit.YEAR_MONTH)), field("b", new Interval(IntervalUnit.DAY_TIME)) - )); + )); roundTrip(schema); contains(schema, "YEAR_MONTH", "DAY_TIME"); } @@ -168,7 +169,7 @@ public void testFP() throws IOException { field("a", new FloatingPoint(FloatingPointPrecision.HALF)), field("b", new FloatingPoint(FloatingPointPrecision.SINGLE)), field("c", new FloatingPoint(FloatingPointPrecision.DOUBLE)) - )); + )); roundTrip(schema); contains(schema, "HALF", "SINGLE", "DOUBLE"); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java index 7cf638e57d849..95b08099c204d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestValidator.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ + package org.apache.arrow.vector.util; import static org.apache.arrow.vector.util.Validator.equalEnough; @@ -26,32 +27,32 @@ public class TestValidator { - @Test - public void testFloatComp() { - assertTrue(equalEnough(912.4140000000002F, 912.414F)); - assertTrue(equalEnough(912.4140000000002D, 912.414D)); - assertTrue(equalEnough(912.414F, 912.4140000000002F)); - assertTrue(equalEnough(912.414D, 912.4140000000002D)); - assertFalse(equalEnough(912.414D, 912.4140001D)); - assertFalse(equalEnough(null, 912.414D)); - assertTrue(equalEnough((Float)null, null)); - assertTrue(equalEnough((Double)null, null)); - assertFalse(equalEnough(912.414D, null)); - assertFalse(equalEnough(Double.MAX_VALUE, Double.MIN_VALUE)); - assertFalse(equalEnough(Double.MIN_VALUE, Double.MAX_VALUE)); - assertTrue(equalEnough(Double.MAX_VALUE, Double.MAX_VALUE)); - assertTrue(equalEnough(Double.MIN_VALUE, Double.MIN_VALUE)); - assertTrue(equalEnough(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY)); - assertFalse(equalEnough(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); - assertTrue(equalEnough(Double.NaN, Double.NaN)); - assertFalse(equalEnough(1.0, Double.NaN)); - assertFalse(equalEnough(Float.MAX_VALUE, Float.MIN_VALUE)); - assertFalse(equalEnough(Float.MIN_VALUE, Float.MAX_VALUE)); - assertTrue(equalEnough(Float.MAX_VALUE, Float.MAX_VALUE)); - assertTrue(equalEnough(Float.MIN_VALUE, Float.MIN_VALUE)); - assertTrue(equalEnough(Float.NEGATIVE_INFINITY, Float.NEGATIVE_INFINITY)); - assertFalse(equalEnough(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)); - assertTrue(equalEnough(Float.NaN, Float.NaN)); - assertFalse(equalEnough(1.0F, Float.NaN)); - } + @Test + public void testFloatComp() { + assertTrue(equalEnough(912.4140000000002F, 912.414F)); + assertTrue(equalEnough(912.4140000000002D, 912.414D)); + assertTrue(equalEnough(912.414F, 912.4140000000002F)); + assertTrue(equalEnough(912.414D, 912.4140000000002D)); + assertFalse(equalEnough(912.414D, 912.4140001D)); + assertFalse(equalEnough(null, 912.414D)); + assertTrue(equalEnough((Float) null, null)); + assertTrue(equalEnough((Double) null, null)); + assertFalse(equalEnough(912.414D, null)); + assertFalse(equalEnough(Double.MAX_VALUE, Double.MIN_VALUE)); + assertFalse(equalEnough(Double.MIN_VALUE, Double.MAX_VALUE)); + assertTrue(equalEnough(Double.MAX_VALUE, Double.MAX_VALUE)); + assertTrue(equalEnough(Double.MIN_VALUE, Double.MIN_VALUE)); + assertTrue(equalEnough(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY)); + assertFalse(equalEnough(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); + assertTrue(equalEnough(Double.NaN, Double.NaN)); + assertFalse(equalEnough(1.0, Double.NaN)); + assertFalse(equalEnough(Float.MAX_VALUE, Float.MIN_VALUE)); + assertFalse(equalEnough(Float.MIN_VALUE, Float.MAX_VALUE)); + assertTrue(equalEnough(Float.MAX_VALUE, Float.MAX_VALUE)); + assertTrue(equalEnough(Float.MIN_VALUE, Float.MIN_VALUE)); + assertTrue(equalEnough(Float.NEGATIVE_INFINITY, Float.NEGATIVE_INFINITY)); + assertFalse(equalEnough(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)); + assertTrue(equalEnough(Float.NaN, Float.NaN)); + assertFalse(equalEnough(1.0F, Float.NaN)); + } } From 0b91cadfe1cdf5964f275c7d8324332e9e49b0d9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Aug 2017 19:10:04 -0400 Subject: [PATCH 24/38] ARROW-622: [Python] Add coerce_timestamps option to parquet.write_table, deprecate timestamps_to_ms argument Requires PARQUET-1078 https://github.com/apache/parquet-cpp/pull/380 cc @xhochy @fjetter @cpcloud, could you have a 
look. This needs to go into 0.6.0 Author: Wes McKinney Closes #944 from wesm/ARROW-622 and squashes the following commits: 3a21dfe5 [Wes McKinney] Add test to exhaust more paths of coerce_timestamps, error handling 45bbf5b2 [Wes McKinney] Add coerce_timestamps to write_metadata 172a9e1a [Wes McKinney] Implement coerce_timestamps option --- python/pyarrow/_parquet.pxd | 4 +- python/pyarrow/_parquet.pyx | 16 +++++- python/pyarrow/pandas_compat.py | 4 ++ python/pyarrow/parquet.py | 22 +++++++-- python/pyarrow/tests/test_parquet.py | 73 +++++++++++++++++++--------- 5 files changed, 91 insertions(+), 28 deletions(-) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 7299e19b81906..ced654915e57e 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -21,7 +21,8 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, CTable, CMemoryPool, CKeyValueMetadata, - RandomAccessFile, OutputStream) + RandomAccessFile, OutputStream, + TimeUnit) cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: @@ -266,5 +267,6 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: Builder() Builder* disable_deprecated_int96_timestamps() Builder* enable_deprecated_int96_timestamps() + Builder* coerce_timestamps(TimeUnit unit) shared_ptr[ArrowWriterProperties] build() c_bool support_deprecated_int96_timestamps() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 65ca468772710..f3b7875045904 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -547,6 +547,7 @@ cdef class ParquetWriter: cdef readonly: object use_dictionary object use_deprecated_int96_timestamps + object coerce_timestamps object compression object version int row_group_size @@ -554,7 +555,8 @@ cdef class ParquetWriter: def __cinit__(self, where, Schema schema, use_dictionary=None, compression=None, version=None, MemoryPool memory_pool=None, - use_deprecated_int96_timestamps=False): + use_deprecated_int96_timestamps=False, + coerce_timestamps=None): cdef: shared_ptr[FileOutputStream] filestream shared_ptr[WriterProperties] properties @@ -574,6 +576,7 @@ cdef class ParquetWriter: self.compression = compression self.version = version self.use_deprecated_int96_timestamps = use_deprecated_int96_timestamps + self.coerce_timestamps = coerce_timestamps cdef WriterProperties.Builder properties_builder self._set_version(&properties_builder) @@ -583,6 +586,7 @@ cdef class ParquetWriter: cdef ArrowWriterProperties.Builder arrow_properties_builder self._set_int96_support(&arrow_properties_builder) + self._set_coerce_timestamps(&arrow_properties_builder) arrow_properties = arrow_properties_builder.build() pool = maybe_unbox_memory_pool(memory_pool) @@ -598,6 +602,16 @@ cdef class ParquetWriter: else: props.disable_deprecated_int96_timestamps() + cdef int _set_coerce_timestamps( + self, ArrowWriterProperties.Builder* props) except -1: + if self.coerce_timestamps == 'ms': + props.coerce_timestamps(TimeUnit_MILLI) + elif self.coerce_timestamps == 'us': + props.coerce_timestamps(TimeUnit_MICRO) + elif self.coerce_timestamps is not None: + raise ValueError('Invalid value for coerce_timestamps: {0}' + .format(self.coerce_timestamps)) + cdef void _set_version(self, WriterProperties.Builder* props): if self.version is not None: if self.version == "1.0": diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 2881588208eb1..ddd562238e38a 100644 --- 
a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -243,6 +243,10 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False): + if timestamps_to_ms: + import warnings + warnings.warn('timestamps_to_ms=True is deprecated', FutureWarning) + from pyarrow.compat import DatetimeTZDtype if values.dtype.type != np.datetime64: diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 6d39a2354f653..89dbf83ee3523 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -757,7 +757,8 @@ def read_pandas(source, columns=None, nthreads=1, metadata=None): def write_table(table, where, row_group_size=None, version='1.0', use_dictionary=True, compression='snappy', - use_deprecated_int96_timestamps=False, **kwargs): + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, **kwargs): """ Write a Table to Parquet format @@ -773,6 +774,11 @@ def write_table(table, where, row_group_size=None, version='1.0', use_dictionary : bool or list Specify if we should use dictionary encoding in general or only for some columns. + use_deprecated_int96_timestamps : boolean, default False + Write nanosecond resolution timestamps to INT96 Parquet format + coerce_timestamps : string, default None + Cast timestamps to a particular resolution. + Valid values: {None, 'ms', 'us'} compression : str or dict Specify the compression codec, either on a general basis or per-column. """ @@ -781,7 +787,8 @@ def write_table(table, where, row_group_size=None, version='1.0', use_dictionary=use_dictionary, compression=compression, version=version, - use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, + coerce_timestamps=coerce_timestamps) writer = None try: @@ -801,7 +808,8 @@ def write_table(table, where, row_group_size=None, version='1.0', def write_metadata(schema, where, version='1.0', - use_deprecated_int96_timestamps=False): + use_deprecated_int96_timestamps=False, + coerce_timestamps=None): """ Write metadata-only Parquet file from schema @@ -811,10 +819,16 @@ def write_metadata(schema, where, version='1.0', where: string or pyarrow.io.NativeFile version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 + use_deprecated_int96_timestamps : boolean, default False + Write nanosecond resolution timestamps to INT96 Parquet format + coerce_timestamps : string, default None + Cast timestamps to a particular resolution. 
+ Valid values: {None, 'ms', 'us'} """ options = dict( version=version, - use_deprecated_int96_timestamps=use_deprecated_int96_timestamps + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, + coerce_timestamps=coerce_timestamps ) writer = ParquetWriter(where, schema, **options) writer.close() diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index ab3b26cd4e0f1..9a570b9d5dab4 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -100,10 +100,11 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata - _write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = pq.read_pandas(filename.strpath) assert b'pandas' in table_read.schema.metadata @@ -120,10 +121,11 @@ def test_pandas_parquet_custom_metadata(tmpdir): df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata - _write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') md = pq.read_metadata(filename.strpath).metadata assert b'pandas' in md @@ -139,13 +141,12 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir): df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas( - df, timestamps_to_ms=True, preserve_index=False - ) + arrow_table = pa.Table.from_pandas(df, preserve_index=False) js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8')) assert not js['index_columns'] - _write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = pq.read_pandas(filename.strpath) js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8')) @@ -340,10 +341,11 @@ def test_pandas_parquet_configuration_options(tmpdir): def make_sample_file(df): import pyarrow.parquet as pq - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() - _write_table(a_table, buf, compression='SNAPPY', version='2.0') + _write_table(a_table, buf, compression='SNAPPY', version='2.0', + coerce_timestamps='ms') buf.seek(0) return pq.ParquetFile(buf) @@ -418,22 +420,47 @@ def test_column_of_arrays(tmpdir): df, schema = dataframe_with_arrays() filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, - schema=schema) - _write_table(arrow_table, filename.strpath, version="2.0") + arrow_table = pa.Table.from_pandas(df, schema=schema) + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) +@parquet +def test_coerce_timestamps(tmpdir): + # ARROW-622 + df, schema = dataframe_with_arrays() + + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = pa.Table.from_pandas(df, schema=schema) + + _write_table(arrow_table, filename.strpath, 
version="2.0", + coerce_timestamps='us') + table_read = _read_table(filename.strpath) + df_read = table_read.to_pandas() + + df_expected = df.copy() + for i, x in enumerate(df_expected['datetime64']): + if isinstance(x, np.ndarray): + df_expected['datetime64'][i] = x.astype('M8[us]') + + tm.assert_frame_equal(df_expected, df_read) + + with pytest.raises(ValueError): + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='unknown') + + @parquet def test_column_of_lists(tmpdir): df, schema = dataframe_with_lists() filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, - schema=schema) - _write_table(arrow_table, filename.strpath, version="2.0") + arrow_table = pa.Table.from_pandas(df, schema=schema) + _write_table(arrow_table, filename.strpath, version="2.0", + coerce_timestamps='ms') table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -469,12 +496,14 @@ def test_date_time_types(): t7 = pa.timestamp('ns') start = pd.Timestamp('2001-01-01').value - data7 = np.array([start, start + 1, start + 2], dtype='int64') + data7 = np.array([start, start + 1000, start + 2000], + dtype='int64') a7 = pa.Array.from_pandas(data7, type=t7) t7_us = pa.timestamp('us') start = pd.Timestamp('2001-01-01').value - data7_us = np.array([start, start + 1, start + 2], dtype='int64') // 1000 + data7_us = np.array([start, start + 1000, start + 2000], + dtype='int64') // 1000 a7_us = pa.Array.from_pandas(data7_us, type=t7_us) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], @@ -547,7 +576,7 @@ def _check_roundtrip(table, expected=None, **params): def test_multithreaded_read(): df = alltypes_sample(size=10000) - table = pa.Table.from_pandas(df, timestamps_to_ms=True) + table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(table, buf, compression='SNAPPY', version='2.0') @@ -585,7 +614,7 @@ def test_pass_separate_metadata(): # ARROW-471 df = alltypes_sample(size=10000) - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(a_table, buf, compression='snappy', version='2.0') @@ -608,7 +637,7 @@ def test_read_single_row_group(): N, K = 10000, 4 df = alltypes_sample(size=N) - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(a_table, buf, row_group_size=N / K, @@ -631,7 +660,7 @@ def test_read_single_row_group_with_column_subset(): N, K = 10000, 4 df = alltypes_sample(size=N) - a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) + a_table = pa.Table.from_pandas(df) buf = io.BytesIO() _write_table(a_table, buf, row_group_size=N / K, From 2015198f1b6fcd8a0219d81bccc5a2f34fa66d34 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Mon, 7 Aug 2017 19:13:21 -0400 Subject: [PATCH 25/38] =?UTF-8?q?ARROW-1263:=20[C++]=20Get=20CPU=20info=20?= =?UTF-8?q?on=20Windows;=20Resolve=20patching=20whitespac=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …es issue on cmake build. Author: Max Risuhin Closes #943 from MaxRis/ARROW-1263 and squashes the following commits: 14ed7c05 [Max Risuhin] ARROW-1263: [C++] Get CPU info on Windows; Resolve patching whitespaces issue on cmake build. 
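For context on the Windows code path in the diff below, here is a minimal, self-contained sketch of the `__cpuid`/`__cpuidex` pattern the patch relies on (MSVC-only, x86/x64; the output formatting is illustrative and not Arrow code):

```cpp
// Sketch of CPUID-based feature detection with the MSVC intrinsics.
#include <intrin.h>

#include <array>
#include <bitset>
#include <iostream>

int main() {
  std::array<int, 4> regs;  // EAX, EBX, ECX, EDX
  __cpuid(regs.data(), 0);  // leaf 0 reports the highest valid leaf id
  if (regs[0] < 1) return 1;  // feature leaf 1 unavailable on this CPU
  __cpuidex(regs.data(), 1, 0);  // leaf 1: feature bits land in ECX
  std::bitset<32> ecx(static_cast<unsigned>(regs[2]));
  std::cout << "SSSE3: " << ecx[9] << " SSE4.1: " << ecx[19]
            << " SSE4.2: " << ecx[20] << " POPCNT: " << ecx[23] << std::endl;
  return 0;
}
```

The same ECX bit positions (9, 19, 20, 23) are the ones tested in `RetrieveCPUInfo` in the diff.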
--- cpp/cmake_modules/ThirdpartyToolchain.cmake | 4 +- cpp/src/arrow/util/cpu-info.cc | 45 +++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ae48e8d2fb979..a888e92392db6 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -698,7 +698,7 @@ if (ARROW_WITH_LZ4) if (MSVC) set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/visual/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/liblz4_static.lib") set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln) - set(LZ4_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose ${CMAKE_SOURCE_DIR}/build-support/lz4_msbuild_wholeprogramoptimization_param.patch) + set(LZ4_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/lz4_msbuild_wholeprogramoptimization_param.patch) else() set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a") set(LZ4_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-lz4-lib.sh) @@ -742,7 +742,7 @@ if (ARROW_WITH_ZSTD) if (MSVC) set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/libzstd_static.lib") set(ZSTD_BUILD_COMMAND BUILD_COMMAND msbuild ${ZSTD_BUILD_DIR}/build/VS2010/zstd.sln /t:Build /v:minimal /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /p:OutDir=${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/ /p:SolutionDir=${ZSTD_BUILD_DIR}/build/VS2010/ ) - set(ZSTD_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose ${CMAKE_SOURCE_DIR}/build-support/zstd_msbuild_wholeprogramoptimization_param.patch) + set(ZSTD_PATCH_COMMAND PATCH_COMMAND git --git-dir=. 
apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/zstd_msbuild_wholeprogramoptimization_param.patch) else() set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/lib/libzstd.a") set(ZSTD_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-zstd-lib.sh) diff --git a/cpp/src/arrow/util/cpu-info.cc b/cpp/src/arrow/util/cpu-info.cc index b0667cb33ada4..d0a9a14fc60c0 100644 --- a/cpp/src/arrow/util/cpu-info.cc +++ b/cpp/src/arrow/util/cpu-info.cc @@ -31,7 +31,11 @@ #endif #ifdef _WIN32 +#include #include +#include +#include + #endif #include @@ -132,6 +136,46 @@ bool RetrieveCacheSize(int64_t* cache_sizes) { } return true; } + +bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name) { + if (!hardware_flags || !model_name) { + return false; + } + const int register_ECX_id = 1; + int highest_valid_id = 0; + int highest_extended_valid_id = 0; + std::bitset<32> features_ECX; + std::array cpu_info; + + // Get highest valid id + __cpuid(cpu_info.data(), 0); + highest_valid_id = cpu_info[0]; + + if (highest_valid_id <= register_ECX_id) return false; + + __cpuidex(cpu_info.data(), register_ECX_id, 0); + features_ECX = cpu_info[2]; + + // Get highest extended id + __cpuid(cpu_info.data(), 0x80000000); + highest_extended_valid_id = cpu_info[0]; + + // Retrieve CPU model name + if (highest_extended_valid_id >= 0x80000004) { + model_name->clear(); + for (int i = 0x80000002; i <= 0x80000004; ++i) { + __cpuidex(cpu_info.data(), i, 0); + *model_name += + std::string(reinterpret_cast(cpu_info.data()), sizeof(cpu_info)); + } + } + + if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; + if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; + if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; + if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; + return true; +} #endif void CpuInfo::Init() { @@ -203,6 +247,7 @@ void CpuInfo::Init() { if (!RetrieveCacheSize(cache_sizes_)) { SetDefaultCacheSize(); } + RetrieveCPUInfo(&hardware_flags_, &model_name_); #else SetDefaultCacheSize(); #endif From 02ab74841d1a2f3f15aeab20fc4acbc5d737047d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Aug 2017 22:06:28 -0400 Subject: [PATCH 26/38] ARROW-1336: [C++] Add arrow::schema factory function, simplify some awkward constructors Author: Wes McKinney Closes #948 from wesm/ARROW-1336 and squashes the following commits: a901b375 [Wes McKinney] Add arrow::schema factory function, simplify awkward constructors with initializer list --- cpp/src/arrow/ipc/feather-test.cc | 2 +- cpp/src/arrow/ipc/ipc-json-test.cc | 4 +-- cpp/src/arrow/ipc/ipc-read-write-test.cc | 4 +-- cpp/src/arrow/ipc/json-internal.cc | 3 +- cpp/src/arrow/ipc/metadata.cc | 2 +- cpp/src/arrow/ipc/test-common.h | 36 ++++++++++++------------ cpp/src/arrow/python/builtin_convert.cc | 4 +-- cpp/src/arrow/table-test.cc | 27 +++++++++--------- cpp/src/arrow/type-test.cc | 25 +++++++--------- cpp/src/arrow/type.cc | 14 +++++++++ cpp/src/arrow/type.h | 33 ++++++++++++++++++++++ 11 files changed, 97 insertions(+), 57 deletions(-) diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index b76b518788b91..e74a60dd48925 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -354,7 +354,7 @@ TEST_F(TestTableWriter, TimeTypes) { auto f1 = field("f1", time32(TimeUnit::MILLI)); auto f2 = field("f2", timestamp(TimeUnit::NANO)); auto f3 = field("f3", timestamp(TimeUnit::SECOND, "US/Los_Angeles")); - std::shared_ptr schema(new Schema({f0, f1, f2, f3})); + auto schema = 
::arrow::schema({f0, f1, f2, f3}); std::vector values_vec = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr values; diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 1d5a6997ae920..ddb2e37288e46 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -247,8 +247,8 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) { auto v2_type = int32(); auto v3_type = utf8(); - std::shared_ptr schema( - new Schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)})); + auto schema = + ::arrow::schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)}); std::unique_ptr writer; ASSERT_OK(JsonWriter::Open(schema, &writer)); diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index a6246c96f2d9a..045296163ea11 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -303,7 +303,7 @@ TEST_P(TestIpcRoundTrip, ZeroLengthArrays) { TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { auto CheckArray = [this](const std::shared_ptr& array) { auto f0 = field("f0", array->type()); - auto schema = std::shared_ptr(new Schema({f0})); + auto schema = ::arrow::schema({f0}); RecordBatch batch(schema, array->length(), {array}); auto sliced_batch = batch.Slice(0, 5); @@ -421,7 +421,7 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { auto f0 = field("f0", type); - *schema = std::shared_ptr(new Schema({f0})); + *schema = ::arrow::schema({f0}); std::vector> arrays = {array}; *batch = std::make_shared(*schema, batch_length, arrays); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 49fb6ac7ce30f..bc2b0d18e72c7 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -129,8 +129,7 @@ class SchemaWriter { writer_->Key("data"); // Make a dummy record batch. 
A bit tedious as we have to make a schema - auto schema = std::shared_ptr( - new Schema({arrow::field("dictionary", dictionary->type())})); + auto schema = ::arrow::schema({arrow::field("dictionary", dictionary->type())}); RecordBatch batch(schema, dictionary->length(), {dictionary}); RETURN_NOT_OK(WriteRecordBatch(batch, writer_)); writer_->EndObject(); diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index d764e203e7552..faf01a568483a 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -1035,7 +1035,7 @@ Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_mem } } - *out = std::make_shared(fields, metadata); + *out = ::arrow::schema(std::move(fields), metadata); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index ed33e6e95b13a..c1e79d43cc61d 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -179,7 +179,7 @@ Status MakeBooleanBatchSized(const int length, std::shared_ptr* out // Make the schema auto f0 = field("f0", boolean()); auto f1 = field("f1", boolean()); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::shared_ptr a0, a1; RETURN_NOT_OK(MakeRandomBooleanArray(length, true, &a0)); @@ -196,7 +196,7 @@ Status MakeIntBatchSized(int length, std::shared_ptr* out) { // Make the schema auto f0 = field("f0", int32()); auto f1 = field("f1", int32()); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); // Example data std::shared_ptr a0, a1; @@ -237,7 +237,7 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { auto binary_type = binary(); auto f0 = field("f0", string_type); auto f1 = field("f1", binary_type); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::shared_ptr a0, a1; MemoryPool* pool = default_memory_pool(); @@ -259,7 +259,7 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { Status MakeNullRecordBatch(std::shared_ptr* out) { const int64_t length = 500; auto f0 = field("f0", null()); - std::shared_ptr schema(new Schema({f0})); + auto schema = ::arrow::schema({f0}); std::shared_ptr a0 = std::make_shared(length); out->reset(new RecordBatch(schema, length, {a0})); return Status::OK(); @@ -270,7 +270,7 @@ Status MakeListRecordBatch(std::shared_ptr* out) { auto f0 = field("f0", kListInt32); auto f1 = field("f1", kListListInt32); auto f2 = field("f2", int32()); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Example data @@ -293,7 +293,7 @@ Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { auto f0 = field("f0", kListInt32); auto f1 = field("f1", kListListInt32); auto f2 = field("f2", int32()); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Example data MemoryPool* pool = default_memory_pool(); @@ -313,7 +313,7 @@ Status MakeNonNullRecordBatch(std::shared_ptr* out) { auto f0 = field("f0", kListInt32); auto f1 = field("f1", kListListInt32); auto f2 = field("f2", int32()); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Example data MemoryPool* pool = default_memory_pool(); @@ -345,7 +345,7 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { } auto f0 = field("f0", type); - std::shared_ptr schema(new Schema({f0})); + auto schema = ::arrow::schema({f0}); std::vector> arrays = {array}; out->reset(new 
RecordBatch(schema, batch_length, arrays)); return Status::OK(); @@ -364,7 +364,7 @@ Status MakeStruct(std::shared_ptr* out) { {list_schema->field(0), list_schema->field(1), list_schema->field(2)})); auto f0 = field("non_null_struct", type); auto f1 = field("null_struct", type); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); // construct individual nullable/non-nullable struct arrays std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); @@ -397,7 +397,7 @@ Status MakeUnion(std::shared_ptr* out) { auto f1 = field("sparse", sparse_type); auto f2 = field("dense", dense_type); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); // Create data std::vector> sparse_children(2); @@ -520,9 +520,9 @@ Status MakeDictionary(std::shared_ptr* out) { auto a4 = std::make_shared(f4_type, indices4); // construct batch - std::shared_ptr schema(new Schema( + auto schema = ::arrow::schema( {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type), - field("list of encoded string", f3_type), field("encoded list", f4_type)})); + field("list of encoded string", f3_type), field("encoded list", f4_type)}); std::vector> arrays = {a0, a1, a2, a3, a4}; @@ -560,8 +560,8 @@ Status MakeDictionaryFlat(std::shared_ptr* out) { auto a2 = std::make_shared(f2_type, indices2); // construct batch - std::shared_ptr schema(new Schema( - {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type)})); + auto schema = ::arrow::schema( + {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type)}); std::vector> arrays = {a0, a1, a2}; out->reset(new RecordBatch(schema, length, arrays)); @@ -572,7 +572,7 @@ Status MakeDates(std::shared_ptr* out) { std::vector is_valid = {true, true, true, false, true, true, true}; auto f0 = field("f0", date32()); auto f1 = field("f1", date64()); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::vector date32_values = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr date32_array; @@ -594,7 +594,7 @@ Status MakeTimestamps(std::shared_ptr* out) { auto f0 = field("f0", timestamp(TimeUnit::MILLI)); auto f1 = field("f1", timestamp(TimeUnit::NANO, "America/New_York")); auto f2 = field("f2", timestamp(TimeUnit::SECOND)); - std::shared_ptr schema(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); std::vector ts_values = {1489269000000, 1489270000000, 1489271000000, 1489272000000, 1489272000000, 1489273000000}; @@ -615,7 +615,7 @@ Status MakeTimes(std::shared_ptr* out) { auto f1 = field("f1", time64(TimeUnit::NANO)); auto f2 = field("f2", time32(TimeUnit::SECOND)); auto f3 = field("f3", time64(TimeUnit::NANO)); - std::shared_ptr schema(new Schema({f0, f1, f2, f3})); + auto schema = ::arrow::schema({f0, f1, f2, f3}); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; @@ -649,7 +649,7 @@ Status MakeFWBinary(std::shared_ptr* out) { std::vector is_valid = {true, true, true, false}; auto f0 = field("f0", fixed_size_binary(4)); auto f1 = field("f1", fixed_size_binary(0)); - std::shared_ptr schema(new Schema({f0, f1})); + auto schema = ::arrow::schema({f0, f1}); std::shared_ptr a1, a2; diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 218fe2925fd86..b693b3e6b4d76 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -534,8 +534,8 @@ class 
UTF8Converter : public TypedConverterVisitor if (obj == Py_None) { return typed_builder_->AppendNull(); } else if (PyBytes_Check(obj)) { - tmp.reset(PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), - PyBytes_GET_SIZE(obj))); + tmp.reset( + PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj))); RETURN_IF_PYERROR(); bytes_obj = obj; } else if (!PyUnicode_Check(obj)) { diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index 8dba8c052e922..1ba7a2f95798c 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -214,7 +214,7 @@ class TestTable : public TestBase { }; TEST_F(TestTable, EmptySchema) { - auto empty_schema = shared_ptr(new Schema({})); + auto empty_schema = ::arrow::schema({}); table_.reset(new Table(empty_schema, columns_)); ASSERT_OK(table_->ValidateColumns()); ASSERT_EQ(0, table_->num_rows()); @@ -373,18 +373,17 @@ TEST_F(TestTable, RemoveColumn) { std::shared_ptr result; ASSERT_OK(table.RemoveColumn(0, &result)); - auto ex_schema = - std::shared_ptr(new Schema({schema_->field(1), schema_->field(2)})); + auto ex_schema = ::arrow::schema({schema_->field(1), schema_->field(2)}); std::vector> ex_columns = {table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.RemoveColumn(1, &result)); - ex_schema = std::shared_ptr(new Schema({schema_->field(0), schema_->field(2)})); + ex_schema = ::arrow::schema({schema_->field(0), schema_->field(2)}); ex_columns = {table.column(0), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.RemoveColumn(2, &result)); - ex_schema = std::shared_ptr(new Schema({schema_->field(0), schema_->field(1)})); + ex_schema = ::arrow::schema({schema_->field(0), schema_->field(1)}); ex_columns = {table.column(0), table.column(1)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); } @@ -410,27 +409,27 @@ TEST_F(TestTable, AddColumn) { // Add column 0 in different places ASSERT_OK(table.AddColumn(0, columns_[0], &result)); - auto ex_schema = std::shared_ptr(new Schema( - {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)})); + auto ex_schema = ::arrow::schema( + {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); std::vector> ex_columns = {table.column(0), table.column(0), table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(1, columns_[0], &result)); - ex_schema = std::shared_ptr(new Schema( - {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)})); + ex_schema = ::arrow::schema( + {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); ex_columns = {table.column(0), table.column(0), table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(2, columns_[0], &result)); - ex_schema = std::shared_ptr(new Schema( - {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)})); + ex_schema = ::arrow::schema( + {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)}); ex_columns = {table.column(0), table.column(1), table.column(0), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(3, columns_[0], &result)); - ex_schema = std::shared_ptr(new Schema( - {schema_->field(0), schema_->field(1), schema_->field(2), schema_->field(0)})); + ex_schema = ::arrow::schema( + {schema_->field(0), 
schema_->field(1), schema_->field(2), schema_->field(0)}); ex_columns = {table.column(0), table.column(1), table.column(2), table.column(0)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); } @@ -470,7 +469,7 @@ TEST_F(TestRecordBatch, Validate) { auto f1 = field("f1", uint8()); auto f2 = field("f2", int16()); - auto schema = std::shared_ptr(new Schema({f0, f1, f2})); + auto schema = ::arrow::schema({f0, f1, f2}); auto a0 = MakePrimitive(length); auto a1 = MakePrimitive(length); diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 6b86b4d2f1024..4ac5c85d480ed 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -97,15 +97,14 @@ TEST_F(TestSchema, Basics) { auto f2 = field("f2", utf8()); - vector> fields = {f0, f1, f2}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2}); ASSERT_EQ(3, schema->num_fields()); ASSERT_TRUE(f0->Equals(schema->field(0))); ASSERT_TRUE(f1->Equals(schema->field(1))); ASSERT_TRUE(f2->Equals(schema->field(2))); - auto schema2 = std::make_shared(fields); + auto schema2 = ::arrow::schema({f0, f1, f2}); vector> fields3 = {f0, f1_optional, f2}; auto schema3 = std::make_shared(fields3); @@ -119,8 +118,7 @@ TEST_F(TestSchema, ToString) { auto f2 = field("f2", utf8()); auto f3 = field("f3", list(int16())); - vector> fields = {f0, f1, f2, f3}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2, f3}); std::string result = schema->ToString(); std::string expected = R"(f0: int32 @@ -137,8 +135,7 @@ TEST_F(TestSchema, GetFieldByName) { auto f2 = field("f2", utf8()); auto f3 = field("f3", list(int16())); - vector> fields = {f0, f1, f2, f3}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2, f3}); std::shared_ptr result; @@ -158,13 +155,12 @@ TEST_F(TestSchema, GetFieldIndex) { auto f2 = field("f2", utf8()); auto f3 = field("f3", list(int16())); - vector> fields = {f0, f1, f2, f3}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2, f3}); - ASSERT_EQ(0, schema->GetFieldIndex(fields[0]->name())); - ASSERT_EQ(1, schema->GetFieldIndex(fields[1]->name())); - ASSERT_EQ(2, schema->GetFieldIndex(fields[2]->name())); - ASSERT_EQ(3, schema->GetFieldIndex(fields[3]->name())); + ASSERT_EQ(0, schema->GetFieldIndex(f0->name())); + ASSERT_EQ(1, schema->GetFieldIndex(f1->name())); + ASSERT_EQ(2, schema->GetFieldIndex(f2->name())); + ASSERT_EQ(3, schema->GetFieldIndex(f3->name())); ASSERT_EQ(-1, schema->GetFieldIndex("not-found")); } @@ -172,10 +168,9 @@ TEST_F(TestSchema, TestMetadataConstruction) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); - vector> fields = {f0, f1, f2}; auto metadata = std::shared_ptr( new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); - auto schema = std::make_shared(fields, metadata); + auto schema = ::arrow::schema({f0, f1, f2}, metadata); ASSERT_TRUE(metadata->Equals(*schema->metadata())); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index edf4d33b23f39..4443e8d8a455b 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -252,6 +252,10 @@ Schema::Schema(const std::vector>& fields, const std::shared_ptr& metadata) : fields_(fields), metadata_(metadata) {} +Schema::Schema(std::vector>&& fields, + const std::shared_ptr& metadata) + : fields_(std::move(fields)), metadata_(metadata) {} + bool Schema::Equals(const Schema& other) const { if (this == &other) { return true; @@ -343,6 +347,16 @@ 
std::string Schema::ToString() const { return buffer.str(); } +std::shared_ptr schema(const std::vector>& fields, + const std::shared_ptr& metadata) { + return std::make_shared(fields, metadata); +} + +std::shared_ptr schema(std::vector>&& fields, + const std::shared_ptr& metadata) { + return std::make_shared(std::move(fields), metadata); +} + // ---------------------------------------------------------------------- // Visitors and factory functions diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index b28fe9229b2ae..4917ebb481368 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -708,6 +708,10 @@ class ARROW_EXPORT Schema { public: explicit Schema(const std::vector>& fields, const std::shared_ptr& metadata = nullptr); + + explicit Schema(std::vector>&& fields, + const std::shared_ptr& metadata = nullptr); + virtual ~Schema() = default; /// Returns true if all of the schema fields are equal @@ -772,27 +776,56 @@ std::shared_ptr ARROW_EXPORT timestamp(TimeUnit::type unit); std::shared_ptr ARROW_EXPORT timestamp(TimeUnit::type unit, const std::string& timezone); +/// \brief Create an instance of 32-bit time type /// Unit can be either SECOND or MILLI std::shared_ptr ARROW_EXPORT time32(TimeUnit::type unit); +/// \brief Create an instance of 64-bit time type /// Unit can be either MICRO or NANO std::shared_ptr ARROW_EXPORT time64(TimeUnit::type unit); +/// \brief Create an instance of Struct type std::shared_ptr ARROW_EXPORT struct_(const std::vector>& fields); +/// \brief Create an instance of Union type std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); +/// \brief Create an instance of Dictionary type std::shared_ptr ARROW_EXPORT dictionary(const std::shared_ptr& index_type, const std::shared_ptr& values, bool ordered = false); +/// \brief Create a Field instance +/// +/// \param name the field name +/// \param type the field value type +/// \param nullable whether the values are nullable, default true +/// \param metadata any custom key-value metadata, default nullptr std::shared_ptr ARROW_EXPORT field( const std::string& name, const std::shared_ptr& type, bool nullable = true, const std::shared_ptr& metadata = nullptr); +/// \brief Create a Schema instance +/// +/// \param fields the schema's fields +/// \param metadata any custom key-value metadata, default nullptr +/// \return schema shared_ptr to Schema +std::shared_ptr ARROW_EXPORT +schema(const std::vector>& fields, + const std::shared_ptr& metadata = nullptr); + +/// \brief Create a Schema instance +/// +/// \param fields the schema's fields (rvalue reference) +/// \param metadata any custom key-value metadata, default nullptr +/// \return schema shared_ptr to Schema +std::shared_ptr ARROW_EXPORT +schema(std::vector>&& fields, + const std::shared_ptr& metadata = nullptr); + // ---------------------------------------------------------------------- // From 66ab6b2616260977ab9a29bdd59872fb98133d13 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Aug 2017 22:08:12 -0400 Subject: [PATCH 27/38] ARROW-1309: [Python] Handle nested lists with all None values in Array.from_pandas Author: Wes McKinney Closes #947 from wesm/ARROW-1309 and squashes the following commits: dc464922 [Wes McKinney] Expand test case to include ndarray 86039ec2 [Wes McKinney] Bugfix, add multiple nulls at start of array 08687cac [Wes McKinney] NullBuilder, scaffolding --- cpp/src/arrow/builder.cc | 62 +++++++++++++-------- cpp/src/arrow/builder.h | 13 
+++++ cpp/src/arrow/python/builtin_convert.cc | 42 ++++++++------ cpp/src/arrow/python/pandas_to_arrow.cc | 50 ++++++++++++++--- cpp/src/arrow/type_fwd.h | 1 + cpp/src/arrow/type_traits.h | 1 + python/pyarrow/tests/test_convert_pandas.py | 15 +++++ 7 files changed, 136 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index e3eda2401a02b..889c64db9fdb8 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -177,6 +177,17 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { length_ = new_length; } +// ---------------------------------------------------------------------- +// Null builder + +Status NullBuilder::Finish(std::shared_ptr* out) { + *out = std::make_shared(length_); + length_ = null_count_ = 0; + return Status::OK(); +} + +// ---------------------------------------------------------------------- + template Status PrimitiveBuilder::Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); @@ -1306,26 +1317,30 @@ Status StructBuilder::Finish(std::shared_ptr* out) { Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out) { switch (type->id()) { - BUILDER_CASE(UINT8, UInt8Builder); - BUILDER_CASE(INT8, Int8Builder); - BUILDER_CASE(UINT16, UInt16Builder); - BUILDER_CASE(INT16, Int16Builder); - BUILDER_CASE(UINT32, UInt32Builder); - BUILDER_CASE(INT32, Int32Builder); - BUILDER_CASE(UINT64, UInt64Builder); - BUILDER_CASE(INT64, Int64Builder); - BUILDER_CASE(DATE32, Date32Builder); - BUILDER_CASE(DATE64, Date64Builder); - BUILDER_CASE(TIME32, Time32Builder); - BUILDER_CASE(TIME64, Time64Builder); - BUILDER_CASE(TIMESTAMP, TimestampBuilder); - BUILDER_CASE(BOOL, BooleanBuilder); - BUILDER_CASE(FLOAT, FloatBuilder); - BUILDER_CASE(DOUBLE, DoubleBuilder); - BUILDER_CASE(STRING, StringBuilder); - BUILDER_CASE(BINARY, BinaryBuilder); - BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); - BUILDER_CASE(DECIMAL, DecimalBuilder); + case Type::NA: { + out->reset(new NullBuilder(pool)); + return Status::OK(); + } + BUILDER_CASE(UINT8, UInt8Builder); + BUILDER_CASE(INT8, Int8Builder); + BUILDER_CASE(UINT16, UInt16Builder); + BUILDER_CASE(INT16, Int16Builder); + BUILDER_CASE(UINT32, UInt32Builder); + BUILDER_CASE(INT32, Int32Builder); + BUILDER_CASE(UINT64, UInt64Builder); + BUILDER_CASE(INT64, Int64Builder); + BUILDER_CASE(DATE32, Date32Builder); + BUILDER_CASE(DATE64, Date64Builder); + BUILDER_CASE(TIME32, Time32Builder); + BUILDER_CASE(TIME64, Time64Builder); + BUILDER_CASE(TIMESTAMP, TimestampBuilder); + BUILDER_CASE(BOOL, BooleanBuilder); + BUILDER_CASE(FLOAT, FloatBuilder); + BUILDER_CASE(DOUBLE, DoubleBuilder); + BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); + BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); + BUILDER_CASE(DECIMAL, DecimalBuilder); case Type::LIST: { std::unique_ptr value_builder; std::shared_ptr value_type = @@ -1348,8 +1363,11 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, return Status::OK(); } - default: - return Status::NotImplemented(type->ToString()); + default: { + std::stringstream ss; + ss << "MakeBuilder: cannot construct builder for type " << type->ToString(); + return Status::NotImplemented(ss.str()); + } } } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index e441179ae7864..b15005f62bc7e 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -162,6 +162,19 @@ class ARROW_EXPORT ArrayBuilder { DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; +class ARROW_EXPORT 
NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_ARG) : ArrayBuilder(null(), pool) {} + + Status AppendNull() { + ++null_count_; + ++length_; + return Status::OK(); + } + + Status Finish(std::shared_ptr* out) override; +}; + template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index b693b3e6b4d76..ccaf280b0a383 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -155,7 +155,7 @@ static constexpr int MAX_NESTING_LEVELS = 32; // SeqVisitor is used to infer the type. class SeqVisitor { public: - SeqVisitor() : max_nesting_level_(0) { + SeqVisitor() : max_nesting_level_(0), max_observed_level_(0) { memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); } @@ -217,24 +217,13 @@ class SeqVisitor { if (num_nesting_levels() > 1) { return Status::Invalid("Mixed nesting levels not supported"); // If the nesting goes deeper than the deepest scalar - } else if (max_observed_level() < max_nesting_level_) { + } else if (max_observed_level_ < max_nesting_level_) { return Status::Invalid("Mixed nesting levels not supported"); } } return Status::OK(); } - // Returns the deepest level which has scalar elements. - int max_observed_level() const { - int result = 0; - for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { - if (nesting_histogram_[i] > 0) { - result = i; - } - } - return result; - } - // Returns the number of nesting levels which have scalar elements. int num_nesting_levels() const { int result = 0; @@ -252,6 +241,8 @@ class SeqVisitor { // Track observed // Deepest nesting level (regardless of scalars) int max_nesting_level_; + int max_observed_level_; + // Number of scalar elements at each nesting level. // (TODO: We really only need to know if a scalar is present, not the count). 
int nesting_histogram_[MAX_NESTING_LEVELS]; @@ -263,13 +254,15 @@ class SeqVisitor { } else if (PyDict_Check(item_ref.obj())) { return Status::NotImplemented("No type inference for dicts"); } else { - // We permit nulls at any level of nesting - if (item_ref.obj() == Py_None) { - // TODO - } else { + // We permit nulls at any level of nesting, but they aren't treated like + // other scalar values as far as the checking for mixed nesting structure + if (item_ref.obj() != Py_None) { ++nesting_histogram_[level]; - return scalars_.Visit(item_ref.obj()); } + if (level > max_observed_level_) { + max_observed_level_ = level; + } + return scalars_.Visit(item_ref.obj()); } return Status::OK(); } @@ -392,6 +385,17 @@ class TypedConverterVisitor : public TypedConverter { virtual Status AppendItem(const OwnedRef& item) = 0; }; +class NullConverter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + if (item.obj() == Py_None) { + return typed_builder_->AppendNull(); + } else { + return Status::Invalid("NullConverter: passed non-None value"); + } + } +}; + class BoolConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { @@ -616,6 +620,8 @@ class DecimalConverter // Dynamic constructor for sequence converters std::shared_ptr GetConverter(const std::shared_ptr& type) { switch (type->id()) { + case Type::NA: + return std::make_shared(); case Type::BOOL: return std::make_shared(); case Type::INT64: diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc index 590be223d3f07..060fcb2453800 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -944,10 +944,6 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr return Status::NotImplemented("mask not supported in object conversions yet"); } - if (is_strided()) { - return Status::NotImplemented("strided arrays not implemented for lists"); - } - BuilderT* value_builder = static_cast(builder->value_builder()); auto foreach_item = [&](PyObject* object) { @@ -991,6 +987,47 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr return LoopPySequence(list, foreach_item); } +template <> +inline Status PandasConverter::ConvertTypedLists( + const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { + PyAcquireGIL lock; + + // TODO: mask not supported here + if (mask_ != nullptr) { + return Status::NotImplemented("mask not supported in object conversions yet"); + } + + auto value_builder = static_cast(builder->value_builder()); + + auto foreach_item = [&](PyObject* object) { + if (PandasObjectIsNull(object)) { + return builder->AppendNull(); + } else if (PyArray_Check(object)) { + auto numpy_array = reinterpret_cast(object); + RETURN_NOT_OK(builder->Append(true)); + + // TODO(uwe): Support more complex numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); + + for (int64_t i = 0; i < static_cast(PyArray_SIZE(numpy_array)); ++i) { + RETURN_NOT_OK(value_builder->AppendNull()); + } + return Status::OK(); + } else if (PyList_Check(object)) { + RETURN_NOT_OK(builder->Append(true)); + const Py_ssize_t size = PySequence_Size(object); + for (Py_ssize_t i = 0; i < size; ++i) { + RETURN_NOT_OK(value_builder->AppendNull()); + } + return Status::OK(); + } else { + return Status::TypeError("Unsupported Python type for list items"); + } + }; + + return LoopPySequence(list, foreach_item); +} + template <> inline Status 
PandasConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { @@ -1003,10 +1040,6 @@ inline Status PandasConverter::ConvertTypedLists( return Status::NotImplemented("mask not supported in object conversions yet"); } - if (is_strided()) { - return Status::NotImplemented("strided arrays not implemented for lists"); - } - auto value_builder = static_cast(builder->value_builder()); auto foreach_item = [&](PyObject* object) { @@ -1053,6 +1086,7 @@ inline Status PandasConverter::ConvertTypedLists( Status PandasConverter::ConvertLists(const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { switch (type->id()) { + LIST_CASE(NA, NPY_OBJECT, NullType) LIST_CASE(UINT8, NPY_UINT8, UInt8Type) LIST_CASE(INT8, NPY_INT8, Int8Type) LIST_CASE(UINT16, NPY_UINT16, UInt16Type) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 99c09bd6b7dca..0d06b6f6cb86e 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -42,6 +42,7 @@ class DictionaryArray; class NullType; class NullArray; +class NullBuilder; class BooleanType; class BooleanArray; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 973b0e15c5434..f05eb56718f5f 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -31,6 +31,7 @@ struct TypeTraits {}; template <> struct TypeTraits { using ArrayType = NullArray; + using BuilderType = NullBuilder; constexpr static bool is_parameter_free = false; }; diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 2a51d3283203f..93058fb0a47b4 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -534,6 +534,21 @@ def test_column_of_lists(self): field = schema.field_by_name(column) self._check_array_roundtrip(df[column], type=field.type) + def test_nested_lists_all_none(self): + data = np.array([[None, None], None], dtype=object) + + arr = pa.Array.from_pandas(data) + expected = pa.array(list(data)) + assert arr.equals(expected) + assert arr.type == pa.list_(pa.null()) + + data2 = np.array([None, None, [None, None], + np.array([None, None], dtype=object)], + dtype=object) + arr = pa.Array.from_pandas(data2) + expected = pa.array([None, None, [None, None], [None, None]]) + assert arr.equals(expected) + def test_threaded_conversion(self): df = _alltypes_example() self._check_pandas_roundtrip(df, nthreads=2, From 03dcce44671f355b3d259b913fcabace609a9cd2 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Mon, 7 Aug 2017 22:50:38 -0400 Subject: [PATCH 28/38] ARROW-1173: [Plasma] Add blog post describing Plasma object store Author: Robert Nishihara Closes #940 from robertnishihara/plasmablogpost and squashes the following commits: d7230930 [Robert Nishihara] Update blog post date. 48c9c7b9 [Robert Nishihara] Change speedup after improving baseline. 2ae1d66e [Robert Nishihara] Add blog post describing Plasma object store. 
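For a quick feel for the client API the post below describes, here is a minimal end-to-end sketch. The store socket path, the `plasma_store` invocation, and the exact `plasma.connect` signature (store socket, manager socket, release delay) are assumptions about this era's still-unstable API, not part of the patch:

```python
# Sketch only: round-trip one object through a running Plasma store.
# Assumes the store was started separately, e.g.:
#   plasma_store -m 1000000000 -s /tmp/plasma
import pyarrow.plasma as plasma

client = plasma.connect('/tmp/plasma', '', 0)

object_id = plasma.ObjectID(20 * b'x')
payload = b'hello, plasma'

# Create, fill, and seal: after seal() the object is immutable and
# visible to every client connected to the same store.
buf = memoryview(client.create(object_id, len(payload)))
buf[:len(payload)] = payload
client.seal(object_id)

# Any client that knows the ID can now map the object without copying.
[result] = client.get([object_id])
print(memoryview(result).tobytes())  # b'hello, plasma'
```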
--- ...017-08-08-plasma-in-memory-object-store.md | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 site/_posts/2017-08-08-plasma-in-memory-object-store.md diff --git a/site/_posts/2017-08-08-plasma-in-memory-object-store.md b/site/_posts/2017-08-08-plasma-in-memory-object-store.md new file mode 100644 index 0000000000000..48cfb6613cf73 --- /dev/null +++ b/site/_posts/2017-08-08-plasma-in-memory-object-store.md @@ -0,0 +1,150 @@ +--- +layout: post +title: "Plasma In-Memory Object Store" +date: "2017-08-08 00:00:00 -0400" +author: Philipp Moritz and Robert Nishihara +categories: [application] +--- + + +*[Philipp Moritz][1] and [Robert Nishihara][2] are graduate students at UC + Berkeley.* + +## Plasma: A High-Performance Shared-Memory Object Store + +### Motivating Plasma + +This blog post presents Plasma, an in-memory object store that is being +developed as part of Apache Arrow. **Plasma holds immutable objects in shared +memory so that they can be accessed efficiently by many clients across process +boundaries.** In light of the trend toward larger and larger multicore machines, +Plasma enables critical performance optimizations in the big data regime. + +Plasma was initially developed as part of [Ray][3], and has recently been moved +to Apache Arrow in the hopes that it will be broadly useful. + +One of the goals of Apache Arrow is to serve as a common data layer enabling +zero-copy data exchange between multiple frameworks. A key component of this +vision is the use of off-heap memory management (via Plasma) for storing and +sharing Arrow-serialized objects between applications. + +**Expensive serialization and deserialization as well as data copying are a +common performance bottleneck in distributed computing.** For example, a +Python-based execution framework that wishes to distribute computation across +multiple Python “worker” processes and then aggregate the results in a single +“driver” process may choose to serialize data using the built-in `pickle` +library. Assuming one Python process per core, each worker process would have to +copy and deserialize the data, resulting in excessive memory usage. The driver +process would then have to deserialize results from each of the workers, +resulting in a bottleneck. + +Using Plasma plus Arrow, the data being operated on would be placed in the +Plasma store once, and all of the workers would read the data without copying or +deserializing it (the workers would map the relevant region of memory into their +own address spaces). The workers would then put the results of their computation +back into the Plasma store, which the driver could then read and aggregate +without copying or deserializing the data. + +### The Plasma API: + +Below we illustrate a subset of the API. The C++ API is documented more fully +[here][6], and the Python API is documented [here][7]. + +**Object IDs:** Each object is associated with a string of bytes. + +**Creating an object:** Objects are stored in Plasma in two stages. First, the +object store *creates* the object by allocating a buffer for it. At this point, +the client can write to the buffer and construct the object within the allocated +buffer. When the client is done, the client *seals* the buffer making the object +immutable and making it available to other Plasma clients. + +```python +# Create an object. +object_id = pyarrow.plasma.ObjectID(20 * b'a') +object_size = 1000 +buffer = memoryview(client.create(object_id, object_size)) + +# Write to the buffer. 
+for i in range(1000): + buffer[i] = 0 + +# Seal the object making it immutable and available to other clients. +client.seal(object_id) +``` + +**Getting an object:** After an object has been sealed, any client who knows the +object ID can get the object. + +```python +# Get the object from the store. This blocks until the object has been sealed. +object_id = pyarrow.plasma.ObjectID(20 * b'a') +[buff] = client.get([object_id]) +buffer = memoryview(buff) +``` + +If the object has not been sealed yet, then the call to `client.get` will block +until the object has been sealed. + +### A sorting application + +To illustrate the benefits of Plasma, we demonstrate an **11x speedup** (on a +machine with 20 physical cores) for sorting a large pandas DataFrame (one +billion entries). The baseline is the built-in pandas sort function, which sorts +the DataFrame in 477 seconds. To leverage multiple cores, we implement the +following standard distributed sorting scheme. + +* We assume that the data is partitioned across K pandas DataFrames and that + each one already lives in the Plasma store. +* We subsample the data, sort the subsampled data, and use the result to define + L non-overlapping buckets. +* For each of the K data partitions and each of the L buckets, we find the + subset of the data partition that falls in the bucket, and we sort that + subset. +* For each of the L buckets, we gather all of the K sorted subsets that fall in + that bucket. +* For each of the L buckets, we merge the corresponding K sorted subsets. +* We turn each bucket into a pandas DataFrame and place it in the Plasma store. + +Using this scheme, we can sort the DataFrame (the data starts and ends in the +Plasma store), in 44 seconds, giving an 11x speedup over the baseline. + +### Design + +The Plasma store runs as a separate process. It is written in C++ and is +designed as a single-threaded event loop based on the [Redis][4] event loop library. +The plasma client library can be linked into applications. Clients communicate +with the Plasma store via messages serialized using [Google Flatbuffers][5]. + +### Call for contributions + +Plasma is a work in progress, and the API is currently unstable. Today Plasma is +primarily used in [Ray][3] as an in-memory cache for Arrow serialized objects. +We are looking for a broader set of use cases to help refine Plasma’s API. In +addition, we are looking for contributions in a variety of areas including +improving performance and building other language bindings. Please let us know +if you are interested in getting involved with the project. + +[1]: https://people.eecs.berkeley.edu/~pcmoritz/ +[2]: http://www.robertnishihara.com +[3]: https://github.com/ray-project/ray +[4]: https://redis.io/ +[5]: https://google.github.io/flatbuffers/ +[6]: https://github.com/apache/arrow/blob/master/cpp/apidoc/tutorials/plasma.md +[7]: https://github.com/apache/arrow/blob/master/python/doc/source/plasma.rst From 939957f33ed0dd02013917b366ff85eb857c3947 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Aug 2017 23:38:42 -0400 Subject: [PATCH 29/38] ARROW-1335: [C++] Add offset to PrimitiveArray::raw_values to make consistent with other raw_values This is an API change, but fixes an existing inconsistency that was the source of several bugs that had gone unnoticed because they were only being tested with code having 0 offset (i.e. unsliced). 
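To make the new contract concrete, a short hedged sketch (the `full` array here is hypothetical; this is not code from the patch):

```cpp
// Sketch only. After this change, PrimitiveArray::raw_values() accounts for
// the slice offset, matching the other raw_values() accessors, so callers no
// longer add offset() themselves.
auto sliced = std::static_pointer_cast<arrow::PrimitiveArray>(full->Slice(2, 3));
const uint8_t* data = sliced->raw_values();
// `data` now points at the first value of the slice, not of `full`.
```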
We'll need a corresponding patch in parquet-cpp Author: Wes McKinney Closes #949 from wesm/ARROW-1335 and squashes the following commits: 10431ebf [Wes McKinney] Use raw_values in more places 3c96eb4a [Wes McKinney] Add offset to PrimitiveArray::raw_values to make consistent with other raw_values functions --- cpp/src/arrow/array.cc | 5 ++ cpp/src/arrow/array.h | 10 ++-- cpp/src/arrow/compare.cc | 16 +++--- cpp/src/arrow/python/arrow_to_pandas.cc | 74 ++++++++++++------------- 4 files changed, 53 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index ab0be7a0964c6..637eb2417fcfd 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -159,6 +159,11 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int64_t le std::make_shared(type, length, std::move(buffers), null_count, offset)); } +const uint8_t* PrimitiveArray::raw_values() const { + return raw_values_ + + offset() * static_cast(*type()).bit_width() / 8; +} + template NumericArray::NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) { diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index a853f2bb5f93d..777fbe0b006b3 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -292,8 +292,8 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { /// Does not account for any slice offset std::shared_ptr values() const { return data_->buffers[1]; } - /// Does not account for any slice offset - const uint8_t* raw_values() const { return raw_values_; } + /// \brief Return pointer to start of raw data + const uint8_t* raw_values() const; protected: PrimitiveArray() {} @@ -521,7 +521,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { int32_t byte_width() const { return byte_width_; } - const uint8_t* raw_values() const { return raw_values_; } + const uint8_t* raw_values() const { return raw_values_ + byte_width_ * data_->offset; } std::shared_ptr Slice(int64_t offset, int64_t length) const override; @@ -567,7 +567,9 @@ class ARROW_EXPORT DecimalArray : public FlatArray { int32_t byte_width() const { return static_cast(*type()).byte_width(); } - const uint8_t* raw_values() const { return raw_values_; } + + /// \brief Return pointer to value data, accounting for any offset + const uint8_t* raw_values() const { return raw_values_ + byte_width() * data_->offset; } private: void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 3a4a4009c6b16..c01f190351044 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -231,11 +231,11 @@ class RangeEqualsVisitor { const uint8_t* right_data = nullptr; if (left.values()) { - left_data = left.raw_values() + left.offset() * width; + left_data = left.raw_values(); } if (right.values()) { - right_data = right.raw_values() + right.offset() * width; + right_data = right.raw_values(); } for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; @@ -265,11 +265,11 @@ class RangeEqualsVisitor { const uint8_t* right_data = nullptr; if (left.values()) { - left_data = left.raw_values() + left.offset() * width; + left_data = left.raw_values(); } if (right.values()) { - right_data = right.raw_values() + right.offset() * width; + right_data = right.raw_values(); } for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; @@ -352,10 +352,10 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r const uint8_t* right_data = nullptr; if (left.values()) { - left_data = 
left.values()->data() + left.offset() * byte_width; + left_data = left.raw_values(); } if (right.values()) { - right_data = right.values()->data() + right.offset() * byte_width; + right_data = right.raw_values(); } if (left.null_count() > 0) { @@ -399,10 +399,10 @@ static bool IsEqualDecimal(const DecimalArray& left, const DecimalArray& right) const uint8_t* right_data = nullptr; if (left.values()) { - left_data = left.values()->data(); + left_data = left.raw_values(); } if (right.values()) { - right_data = right.values()->data(); + right_data = right.raw_values(); } const int32_t byte_width = left.byte_width(); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 86f82fdbd8de5..8c769ee5eeaf8 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -266,13 +266,12 @@ class PandasBlock { template inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); // Upcast to double, set NaN as appropriate - for (int i = 0; i < arr->length(); ++i) { - *out_values++ = prim_arr->IsNull(i) ? NAN : static_cast(in_values[i]); + for (int i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? NAN : static_cast(in_values[i]); } } } @@ -280,21 +279,19 @@ inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values template inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); - memcpy(out_values, in_values, sizeof(T) * arr->length()); - out_values += arr->length(); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); + memcpy(out_values, in_values, sizeof(T) * arr.length()); + out_values += arr.length(); } } template inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); + for (int64_t i = 0; i < arr.length(); ++i) { *out_values = in_values[i]; } } @@ -520,19 +517,18 @@ inline Status ConvertListsLike(const std::shared_ptr& col, template inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - const uint8_t* valid_bits = arr->null_bitmap_data(); + const uint8_t* valid_bits = arr.null_bitmap_data(); - if (arr->null_count() > 0) { - for (int64_t i = 0; i < arr->length(); ++i) { + if (arr.null_count() > 0) { + for (int64_t i = 0; i < arr.length(); ++i) { *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? 
na_value : in_values[i]; } } else { - memcpy(out_values, in_values, sizeof(T) * arr->length()); - out_values += arr->length(); + memcpy(out_values, in_values, sizeof(T) * arr.length()); + out_values += arr.length(); } } } @@ -541,12 +537,11 @@ template inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType na_value, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? na_value : static_cast(in_values[i]); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? na_value : static_cast(in_values[i]); } } } @@ -554,13 +549,12 @@ inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType na_valu template inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? kPandasTimestampNull - : (static_cast(in_values[i]) * SHIFT); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? kPandasTimestampNull + : (static_cast(in_values[i]) * SHIFT); } } } @@ -1004,6 +998,7 @@ class CategoricalBlock : public PandasBlock { for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); const auto& dict_arr = static_cast(*arr); + const auto& indices = static_cast(*dict_arr.indices()); auto in_values = reinterpret_cast(indices.raw_values()); @@ -1386,8 +1381,8 @@ class ArrowDeserializer { Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr arr) { typedef typename internal::arrow_traits::T T; - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& prim_arr = static_cast(*arr); + auto in_values = reinterpret_cast(prim_arr.raw_values()); // Zero-Copy. We can pass the data pointer directly to NumPy. void* data = const_cast(in_values); @@ -1461,12 +1456,11 @@ class ArrowDeserializer { constexpr int64_t kShift = traits::npy_shift; for (int c = 0; c < data_.num_chunks(); c++) { - const std::shared_ptr arr = data_.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->raw_values()); + const auto& arr = static_cast(*data_.chunk(c)); + auto in_values = reinterpret_cast(arr.raw_values()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / kShift; + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? 
na_value : in_values[i] / kShift; } } return Status::OK(); From 5281a8264e0af807043b4fd70a7213ee4b176742 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 8 Aug 2017 09:29:09 -0400 Subject: [PATCH 30/38] ARROW-1334: [C++] Add alternate Table constructor that takes vector of Array Author: Wes McKinney Closes #950 from wesm/ARROW-1334 and squashes the following commits: f0655408 [Wes McKinney] Fix compiler warning d9559682 [Wes McKinney] Add Table constructor that takes vector of Array instead of Column --- cpp/src/arrow/python/python-test.cc | 4 +--- cpp/src/arrow/table-test.cc | 3 +++ cpp/src/arrow/table.cc | 32 +++++++++++++++++++++++------ cpp/src/arrow/table.h | 18 +++++++++------- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 433ce9b37a80a..dd956463fec76 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -85,9 +85,7 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) { auto f2 = field("f2", utf8()); auto f3 = field("f3", utf8()); std::vector> fields = {f1, f2, f3}; - std::vector> cols = {std::make_shared(f1, arr), - std::make_shared(f2, arr), - std::make_shared(f3, arr)}; + std::vector> cols = {arr, arr, arr}; auto schema = std::make_shared(fields); auto table = std::make_shared
(schema, cols); diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index 1ba7a2f95798c..4b67492b7ed4a 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -230,6 +230,9 @@ TEST_F(TestTable, Ctors) { ASSERT_EQ(length, table_->num_rows()); ASSERT_EQ(3, table_->num_columns()); + auto array_ctor = std::make_shared
(schema_, arrays_); + ASSERT_TRUE(table_->Equals(*array_ctor)); + table_.reset(new Table(schema_, columns_, length)); ASSERT_OK(table_->ValidateColumns()); ASSERT_EQ(length, table_->num_rows()); diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 665ce2d84dea4..1f0c6d785448d 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -273,18 +273,38 @@ Status RecordBatch::Validate() const { // Table methods Table::Table(const std::shared_ptr& schema, - const std::vector>& columns) + const std::vector>& columns, int64_t num_rows) : schema_(schema), columns_(columns) { - if (columns.size() == 0) { - num_rows_ = 0; + if (num_rows < 0) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } } else { - num_rows_ = columns[0]->length(); + num_rows_ = num_rows; } } Table::Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows) - : schema_(schema), columns_(columns), num_rows_(num_rows) {} + const std::vector>& columns, int64_t num_rows) + : schema_(schema) { + if (num_rows < 0) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } + } else { + num_rows_ = num_rows; + } + + columns_.resize(columns.size()); + for (size_t i = 0; i < columns.size(); ++i) { + columns_[i] = std::make_shared(schema->field(static_cast(i)), + columns[i]); + } +} std::shared_ptr
Table::ReplaceSchemaMetadata( const std::shared_ptr& metadata) const { diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 6afd618da043b..31ca97a37078c 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -192,16 +192,20 @@ class ARROW_EXPORT RecordBatch { // Immutable container of fixed-length columns conforming to a particular schema class ARROW_EXPORT Table { public: - // If columns is zero-length, the table's number of rows is zero + /// \brief Construct Table from schema and columns + /// If columns is zero-length, the table's number of rows is zero + /// \param schema + /// \param columns + /// \param num_rows number of rows in table, -1 (default) to infer from columns Table(const std::shared_ptr& schema, - const std::vector>& columns); + const std::vector>& columns, int64_t num_rows = -1); - // num_rows is a parameter to allow for tables of a particular size not - // having any materialized columns. Each column should therefore have the - // same length as num_rows -- you can validate this using - // Table::ValidateColumns + /// \brief Construct Table from schema and arrays + /// \param schema + /// \param arrays + /// \param num_rows number of rows in table, -1 (default) to infer from columns Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows); + const std::vector>& arrays, int64_t num_rows = -1); // Construct table from RecordBatch, but only if all of the batch schemas are // equal. Returns Status::Invalid if there is some problem From 20cee707cbfdaa5dc4f2b7dea09619f34a1f9f71 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 8 Aug 2017 09:30:16 -0400 Subject: [PATCH 31/38] ARROW-1338: [Python] Do not close RecordBatchWriter on dealloc in case sink is no longer valid Also add missing close() statements to test_mock_output_stream to fix invalid writes causing core dump on OS X. Author: Wes McKinney Closes #952 from wesm/ARROW-1338 and squashes the following commits: 88e8cefe [Wes McKinney] Do not close RecordBatchWriter on dealloc in case sink is no longer valid. 
Add missing close() statements to test_mock_output_stream --- python/pyarrow/ipc.pxi | 16 ++++++++++++++-- python/pyarrow/tests/test_io.py | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index d6f62aa95c5fb..ceed4b0e85248 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -163,8 +163,7 @@ cdef class _RecordBatchWriter: self.closed = True def __dealloc__(self): - if not self.closed: - self.close() + pass def _open(self, sink, Schema schema): cdef: @@ -182,11 +181,24 @@ cdef class _RecordBatchWriter: self.closed = False def write_batch(self, RecordBatch batch): + """ + Write RecordBatch to stream + + Parameters + ---------- + batch : RecordBatch + """ with nogil: check_status(self.writer.get() .WriteRecordBatch(deref(batch.batch))) def close(self): + """ + Close stream and write end-of-stream 0 marker + """ + if self.closed: + return + with nogil: check_status(self.writer.get().Close()) self.closed = True diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index c81a0485ce1ee..d503ea22464d5 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -277,6 +277,8 @@ def test_mock_output_stream(): stream_writer1.write_batch(record_batch) stream_writer2.write_batch(record_batch) + stream_writer1.close() + stream_writer2.close() assert f1.size() == len(f2.get_result()) From 2615b47032d58284e0606b21cb216aa4b303a72c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 8 Aug 2017 11:56:32 -0400 Subject: [PATCH 32/38] ARROW-1306: [C++] Use UTF8 filenames in local file error messages Encoded utf16-le bytes were being written to error messages (which are output to UTF-8 consoles), resulting in unintelligible displays. This also improves the error message when opening the file fails per ARROW-1121. 
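The essence of the fix is to carry both encodings of a path: convert UTF-8 to UTF-16 once for the Windows wide-char APIs, and keep the UTF-8 original for error messages. A standalone sketch of that idea, using the same C++11 `<codecvt>` facility as the patch (the struct and function names here are illustrative, not the patch's actual API):

```cpp
// Sketch only: carry a path in both encodings so Windows syscalls get
// UTF-16 while error messages stay readable on UTF-8 consoles.
#include <codecvt>
#include <locale>
#include <stdexcept>
#include <string>

struct FilenamePair {
  std::string utf8;    // for error messages
  std::wstring utf16;  // for _wsopen_s and friends on Windows
};

// Returns false if the input is not valid UTF-8.
bool MakeFilenamePair(const std::string& utf8_path, FilenamePair* out) {
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  try {
    out->utf16 = converter.from_bytes(utf8_path);
  } catch (const std::range_error&) {
    return false;
  }
  out->utf8 = utf8_path;
  return true;
}
```

The `PlatformFilename` struct introduced below applies this pattern, with a no-op variant on POSIX systems, where the OS accepts UTF-8 directly.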
Author: Wes McKinney Closes #951 from wesm/ARROW-1306 and squashes the following commits: fd0d93f7 [Wes McKinney] Restore utf8_data method a4aae504 [Wes McKinney] MSVC fixes b847b66c [Wes McKinney] Change PlatformFilename to be allocated with OSFile d445fcad [Wes McKinney] Add Python unit test for ARROW-1306 0dc220c2 [Wes McKinney] MSVC fixes 9d80e491 [Wes McKinney] Add PlatformFilename abstraction, write error messages with UTF8 filenames --- cpp/src/arrow/io/file.cc | 128 ++++++++++++++------------- cpp/src/arrow/io/file.h | 23 ++++- cpp/src/arrow/io/io-file-test.cc | 9 +- python/pyarrow/compat.py | 3 +- python/pyarrow/tests/test_parquet.py | 11 +++ 5 files changed, 106 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 82e3ba8109c23..57d30f73baa29 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -118,33 +118,64 @@ namespace io { // ---------------------------------------------------------------------- // Cross-platform file compatibility layer + #if defined(_MSC_VER) + constexpr const char* kRangeExceptionError = "Range exception during wide-char string conversion"; + +struct PlatformFilename { + static Status Init(const std::string& utf8_path, PlatformFilename* out) { + std::wstring_convert> utf16_converter; + + if (!utf8_path.empty()) { + try { + out->utf16_path = utf16_converter.from_bytes(utf8_path); + } catch (const std::range_error&) { + return Status::Invalid(kRangeExceptionError); + } + } else { + out->utf16_path = std::wstring(); + } + out->utf8_path = utf8_path; + return Status::OK(); + } + + const char* data() const { return reinterpret_cast(utf16_path.c_str()); } + + const char* utf8_data() const { return utf8_path.c_str(); } + + size_t length() const { return utf16_path.size(); } + + std::string utf8_path; + std::wstring utf16_path; +}; + +#else + +struct PlatformFilename { + static Status Init(const std::string& utf8_path, PlatformFilename* out) { + out->utf8_path = utf8_path; + return Status::OK(); + } + + const char* data() const { return utf8_path.c_str(); } + + const char* utf8_data() const { return data(); } + + size_t length() const { return utf8_path.size(); } + + std::string utf8_path; }; #endif -static inline Status CheckOpenResult(int ret, int errno_actual, const char* filename, - size_t filename_length) { +static inline Status CheckOpenResult(int ret, int errno_actual, + const PlatformFilename& filename) { if (ret == -1) { // TODO: errno codes to strings std::stringstream ss; - ss << "Failed to open file: "; -#if defined(_MSC_VER) - // using wchar_t - - this requires c++11 std::wstring_convert, wchar_t> converter; - std::wstring wide_string(reinterpret_cast(filename), - filename_length / sizeof(wchar_t)); - try { - std::string byte_string = converter.to_bytes(wide_string); - ss << byte_string; - } catch (const std::range_error&) { - ss << kRangeExceptionError; - } -#else - ss << filename; -#endif + ss << "Failed to open local file: " << filename.utf8_data(); return Status::IOError(ss.str()); } return Status::OK(); } @@ -161,54 +192,27 @@ static inline int64_t lseek64_compat(int fd, int64_t pos, int whence) { #endif } -#if defined(_MSC_VER) -static inline Status ConvertToUtf16(const std::string& input, std::wstring* result) { - if (result == nullptr) { - return Status::Invalid("Pointer to result is not valid"); - } - - if (input.empty()) { - *result = std::wstring(); - return Status::OK(); - } - - std::wstring_convert> utf16_converter; - try { - *result = 
utf16_converter.from_bytes(input); - } catch (const std::range_error&) { - return Status::Invalid(kRangeExceptionError); - } - return Status::OK(); -} -#endif - -static inline Status FileOpenReadable(const std::string& filename, int* fd) { +static inline Status FileOpenReadable(const PlatformFilename& filename, int* fd) { int ret; errno_t errno_actual = 0; #if defined(_MSC_VER) - std::wstring wide_filename; - RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename)); - - errno_actual = - _wsopen_s(fd, wide_filename.c_str(), _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD); + errno_actual = _wsopen_s(fd, reinterpret_cast(filename.data()), + _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD); ret = *fd; #else - ret = *fd = open(filename.c_str(), O_RDONLY | O_BINARY); + ret = *fd = open(filename.data(), O_RDONLY | O_BINARY); errno_actual = errno; #endif - return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size()); + return CheckOpenResult(ret, errno_actual, filename); } -static inline Status FileOpenWriteable(const std::string& filename, bool write_only, +static inline Status FileOpenWriteable(const PlatformFilename& filename, bool write_only, bool truncate, int* fd) { int ret; errno_t errno_actual = 0; #if defined(_MSC_VER) - std::wstring wide_filename; - RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename)); - int oflag = _O_CREAT | _O_BINARY; int pmode = _S_IWRITE; if (!write_only) { @@ -225,7 +229,8 @@ static inline Status FileOpenWriteable(const std::string& filename, bool write_o oflag |= _O_RDWR; } - errno_actual = _wsopen_s(fd, wide_filename.c_str(), oflag, _SH_DENYNO, pmode); + errno_actual = _wsopen_s(fd, reinterpret_cast(filename.data()), oflag, + _SH_DENYNO, pmode); ret = *fd; #else @@ -241,9 +246,9 @@ static inline Status FileOpenWriteable(const std::string& filename, bool write_o oflag |= O_RDWR; } - ret = *fd = open(filename.c_str(), oflag, ARROW_WRITE_SHMODE); + ret = *fd = open(filename.data(), oflag, ARROW_WRITE_SHMODE); #endif - return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size()); + return CheckOpenResult(ret, errno_actual, filename); } static inline Status FileTell(int fd, int64_t* pos) { @@ -352,8 +357,9 @@ class OSFile { ~OSFile() {} Status OpenWriteable(const std::string& path, bool append, bool write_only) { - RETURN_NOT_OK(FileOpenWriteable(path, write_only, !append, &fd_)); - path_ = path; + RETURN_NOT_OK(PlatformFilename::Init(path, &path_)); + + RETURN_NOT_OK(FileOpenWriteable(path_, write_only, !append, &fd_)); is_open_ = true; mode_ = write_only ? 
FileMode::WRITE : FileMode::READWRITE; @@ -366,10 +372,11 @@ class OSFile { } Status OpenReadable(const std::string& path) { - RETURN_NOT_OK(FileOpenReadable(path, &fd_)); + RETURN_NOT_OK(PlatformFilename::Init(path, &path_)); + + RETURN_NOT_OK(FileOpenReadable(path_, &fd_)); RETURN_NOT_OK(FileGetSize(fd_, &size_)); - path_ = path; is_open_ = true; mode_ = FileMode::READ; return Status::OK(); @@ -408,14 +415,13 @@ class OSFile { int fd() const { return fd_; } bool is_open() const { return is_open_; } - const std::string& path() const { return path_; } int64_t size() const { return size_; } FileMode::type mode() const { return mode_; } protected: - std::string path_; + PlatformFilename path_; std::mutex lock_; diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index ba740f1e8f4a9..2a0e89ca325fa 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -40,10 +40,18 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { public: ~FileOutputStream(); - // When opening a new file, any existing file with the indicated path is - // truncated to 0 bytes, deleting any existing memory + /// \brief Open a local file for writing, truncating any existing file + /// \param[in] path with UTF8 encoding + /// \param[out] file a FileOutputStream instance + /// + /// When opening a new file, any existing file with the indicated path is + /// truncated to 0 bytes, deleting any existing memory static Status Open(const std::string& path, std::shared_ptr* file); + /// \brief Open a local file for writing + /// \param[in] path with UTF8 encoding + /// \param[in] append append to existing file, otherwise truncate to 0 bytes + /// \param[out] file a FileOutputStream instance static Status Open(const std::string& path, bool append, std::shared_ptr* file); @@ -68,10 +76,17 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile { public: ~ReadableFile(); - // Open file, allocate memory (if needed) from default memory pool + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[out] file ReadableFile instance + /// Open file, allocate memory (if needed) from default memory pool static Status Open(const std::string& path, std::shared_ptr* file); - // Open file with one's own memory pool for memory allocations + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[in] pool a MemoryPool for memory allocations + /// \param[out] file ReadableFile instance + /// Open file with one's own memory pool for memory allocations static Status Open(const std::string& path, MemoryPool* memory_pool, std::shared_ptr* file); diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc index 36c35700d6496..630356fa2af38 100644 --- a/cpp/src/arrow/io/io-file-test.cc +++ b/cpp/src/arrow/io/io-file-test.cc @@ -45,7 +45,7 @@ static bool FileExists(const std::string& path) { void InvalidParamHandler(const wchar_t* expr, const wchar_t* func, const wchar_t* source_file, unsigned int source_line, uintptr_t reserved) { - wprintf(L"Invalid parameter in funcion %s. Source: %s line %d expression %s", func, + wprintf(L"Invalid parameter in function %s. 
Source: %s line %d expression %s", func, source_file, source_line, expr); } #endif @@ -320,7 +320,12 @@ TEST_F(TestReadableFile, ReadAt) { } TEST_F(TestReadableFile, NonExistentFile) { - ASSERT_RAISES(IOError, ReadableFile::Open("0xDEADBEEF.txt", &file_)); + std::string path = "0xDEADBEEF.txt"; + Status s = ReadableFile::Open(path, &file_); + ASSERT_TRUE(s.IsIOError()); + + std::string message = s.message(); + ASSERT_NE(std::string::npos, message.find(path)); } class MyMemoryPool : public MemoryPool { diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 7be35dfc2c81f..2252e85e6ef77 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -132,7 +132,6 @@ def frombytes(o): def encode_file_path(path): import os - # Windows requires utf-16le encoding for unicode file names if isinstance(path, unicode_type): # POSIX systems can handle utf-8. UTF8 is converted to utf16-le in # libarrow @@ -140,6 +139,8 @@ def encode_file_path(path): else: encoded_path = path + # Windows file system requires utf-16le for file names; Arrow C++ libraries + # will convert utf8 to utf16 return encoded_path diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 9a570b9d5dab4..8a20f4c469200 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1127,3 +1127,14 @@ def test_write_error_deletes_incomplete_file(tmpdir): pass assert not os.path.exists(filename) + + +@parquet +def test_read_non_existent_file(tmpdir): + import pyarrow.parquet as pq + + path = 'non-existent-file.parquet' + try: + pq.read_table(path) + except Exception as e: + assert path in e.args[0] From 6e26701257be160fa95ce174d80b046adb493e57 Mon Sep 17 00:00:00 2001 From: fjetter Date: Tue, 8 Aug 2017 13:51:37 -0400 Subject: [PATCH 33/38] ARROW-439: [Python] Add option in "to_pandas" conversions to yield Categorical from String/Binary arrays I added support to cast Arrays and Columns to dictionaries with the possibility to extend the casting to different types. I intend to add more types to the casting, at least for trivial cases, but first I wanted to get some feedback on the current state. Author: fjetter Author: Wes McKinney Closes #909 from fjetter/feature/make_dictionary_array and squashes the following commits: d1189395 [Wes McKinney] Fix deprecated API usage 606724df [Wes McKinney] Handle ordered categories in arrow_to_pandas.cc. 
flake8 Cython fixes d2bb8d8e [Wes McKinney] Move dictionary index type dispatch and memory allocation into CategoricalBlock::Write 6ab28730 [fjetter] Remove dead code bea4cb9e [fjetter] Merge master bb3209ba [fjetter] Add pool to ConvertTableToPandas in python-test 24fbf424 [fjetter] Format arrow_to_pandas 39b22ff6 [fjetter] Allocate categorical blocks in write path b7f389f3 [fjetter] Pass memory pool from the outside c496cb5f [fjetter] Pass pandas options through to pandas write before conversion 4b12aa13 [fjetter] Push pandas options down b6fca35c [fjetter] Rename and add docs for EncodeDictionary 6479d292 [fjetter] add MakeDictionaryArray --- cpp/src/arrow/builder.cc | 81 +++- cpp/src/arrow/builder.h | 16 + cpp/src/arrow/python/arrow_to_pandas.cc | 504 ++++++++++---------- cpp/src/arrow/python/arrow_to_pandas.h | 20 +- cpp/src/arrow/python/python-test.cc | 4 +- cpp/src/arrow/util/parallel.h | 70 +++ python/pyarrow/array.pxi | 12 +- python/pyarrow/includes/libarrow.pxd | 15 +- python/pyarrow/pandas_compat.py | 7 +- python/pyarrow/table.pxi | 29 +- python/pyarrow/tests/test_convert_pandas.py | 29 +- 11 files changed, 502 insertions(+), 285 deletions(-) create mode 100644 cpp/src/arrow/util/parallel.h diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 889c64db9fdb8..e2054dbfde688 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -27,6 +27,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" @@ -1396,8 +1397,84 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder); DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder); DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder); - // DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); - // DICTIONARY_BUILDER_CASE(DECIMAL, DecimalBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} + +#define DICTIONARY_ARRAY_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + builder = std::make_shared(type, pool); \ + RETURN_NOT_OK(static_cast(*builder).AppendArray(input)); \ + RETURN_NOT_OK(builder->Finish(out)); \ + return Status::OK(); + +Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool, + std::shared_ptr* out) { + const std::shared_ptr& type = input.data()->type; + std::shared_ptr builder; + switch (type->id()) { + DICTIONARY_ARRAY_CASE(UINT8, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT8, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(UINT16, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT16, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(UINT32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(UINT64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(INT64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(DATE32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(DATE64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(TIME32, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(TIME64, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(TIMESTAMP, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(FLOAT, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(DOUBLE, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder); + DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} +#define DICTIONARY_COLUMN_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + builder = 
std::make_shared(type, pool); \ + chunks = input.data(); \ + for (auto chunk : chunks->chunks()) { \ + RETURN_NOT_OK(static_cast(*builder).AppendArray(*chunk)); \ + } \ + RETURN_NOT_OK(builder->Finish(&arr)); \ + *out = std::make_shared(input.name(), arr); \ + return Status::OK(); + +/// \brief Encodes a column to a suitable dictionary type +/// \param input Column to be encoded +/// \param pool MemoryPool to allocate the dictionary +/// \param out The new column +/// \return Status +Status EncodeColumnToDictionary(const Column& input, MemoryPool* pool, + std::shared_ptr* out) { + const std::shared_ptr& type = input.type(); + std::shared_ptr builder; + std::shared_ptr arr; + std::shared_ptr chunks; + switch (type->id()) { + DICTIONARY_COLUMN_CASE(UINT8, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT8, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(UINT16, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT16, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(UINT32, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT32, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(UINT64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(INT64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(DATE32, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(DATE64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(TIME32, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(TIME64, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(TIMESTAMP, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(FLOAT, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(DOUBLE, DictionaryBuilder); + DICTIONARY_COLUMN_CASE(STRING, StringDictionaryBuilder); + DICTIONARY_COLUMN_CASE(BINARY, BinaryDictionaryBuilder); default: return Status::NotImplemented(type->ToString()); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index b15005f62bc7e..46900fc7129c1 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -28,6 +28,7 @@ #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" @@ -913,6 +914,21 @@ Status ARROW_EXPORT MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); +/// \brief Convert Array to encoded DictionaryArray form +/// +/// \param[in] input The Array to be encoded +/// \param[in] pool MemoryPool to allocate memory for the hash table +/// \param[out] out Array encoded to DictionaryArray +Status ARROW_EXPORT EncodeArrayToDictionary(const Array& input, MemoryPool* pool, + std::shared_ptr* out); + +/// \brief Convert a Column's data internally to DictionaryArray +/// +/// \param[in] input The ChunkedArray to be encoded +/// \param[in] pool MemoryPool to allocate memory for the hash table +/// \param[out] out Column with data converted to DictionaryArray +Status ARROW_EXPORT EncodeColumnToDictionary(const Column& input, MemoryPool* pool, + std::shared_ptr* out); } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 8c769ee5eeaf8..23bef7bcae65d 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -22,14 +22,11 @@ #include "arrow/python/arrow_to_pandas.h" #include -#include #include #include #include -#include #include #include -#include #include #include @@ -42,6 +39,7 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/parallel.h" #include "arrow/visitor_inline.h" 
#include "arrow/python/builtin_convert.h" @@ -186,8 +184,8 @@ class PandasBlock { CATEGORICAL }; - PandasBlock(int64_t num_rows, int num_columns) - : num_rows_(num_rows), num_columns_(num_columns) {} + PandasBlock(PandasOptions options, int64_t num_rows, int num_columns) + : num_rows_(num_rows), num_columns_(num_columns), options_(options) {} virtual ~PandasBlock() {} virtual Status Allocate() = 0; @@ -255,6 +253,8 @@ class PandasBlock { OwnedRef block_arr_; uint8_t* block_data_; + PandasOptions options_; + // ndarray OwnedRef placement_arr_; int64_t* placement_data_; @@ -264,7 +264,8 @@ class PandasBlock { }; template -inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) { +inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& data, + double* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = static_cast(*data.chunk(c)); auto in_values = reinterpret_cast(arr.raw_values()); @@ -277,7 +278,8 @@ inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values } template -inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) { +inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedArray& data, + T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = static_cast(*data.chunk(c)); auto in_values = reinterpret_cast(arr.raw_values()); @@ -287,7 +289,8 @@ inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_value } template -inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) { +inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& data, + OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = static_cast(*data.chunk(c)); auto in_values = reinterpret_cast(arr.raw_values()); @@ -297,7 +300,8 @@ inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_val } } -static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) { +static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); @@ -321,7 +325,8 @@ static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_v return Status::OK(); } -static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) { +static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& data, + uint8_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); auto bool_arr = static_cast(arr.get()); @@ -332,7 +337,8 @@ static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) } template -inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) { +inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { @@ -362,7 +368,8 @@ inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) return Status::OK(); } -inline Status ConvertNulls(const ChunkedArray& data, PyObject** out_values) { +inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { std::shared_ptr 
arr = data.chunk(c); @@ -377,7 +384,8 @@ inline Status ConvertNulls(const ChunkedArray& data, PyObject** out_values) { return Status::OK(); } -inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_values) { +inline Status ConvertFixedSizeBinary(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { auto arr = static_cast(data.chunk(c).get()); @@ -407,7 +415,8 @@ inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_va return Status::OK(); } -inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) { +inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; if (data.num_chunks() <= 0) { return Status::OK(); } @@ -424,8 +433,8 @@ inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) { // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { PyObject* numpy_array; - RETURN_NOT_OK( - ConvertArrayToPandas(arr->field(static_cast(i)), nullptr, &numpy_array)); + RETURN_NOT_OK(ConvertArrayToPandas(options, arr->field(static_cast(i)), + nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } @@ -470,7 +479,7 @@ inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) { } template -inline Status ConvertListsLike(const std::shared_ptr& col, +inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr& col, PyObject** out_values) { const ChunkedArray& data = *col->data().get(); auto list_type = std::static_pointer_cast(col->type()); @@ -485,7 +494,7 @@ inline Status ConvertListsLike(const std::shared_ptr& col, // TODO(ARROW-489): Currently we don't have a Python reference for single columns. // Storing a reference to the whole Array would be too expensive. 
PyObject* numpy_array; - RETURN_NOT_OK(ConvertColumnToPandas(flat_column, nullptr, &numpy_array)); + RETURN_NOT_OK(ConvertColumnToPandas(options, flat_column, nullptr, &numpy_array)); PyAcquireGIL lock; @@ -560,7 +569,8 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) } template -static Status ConvertTimes(const ChunkedArray& data, PyObject** out_values) { +static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; PyAcquireGIL lock; @@ -629,7 +639,8 @@ Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, return Status::OK(); } -static Status ConvertDecimals(const ChunkedArray& data, PyObject** out_values) { +static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { PyAcquireGIL lock; OwnedRef decimal_ref; OwnedRef Decimal_ref; @@ -673,9 +684,9 @@ static Status ConvertDecimals(const ChunkedArray& data, PyObject** out_values) { return Status::OK(); } -#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ - case Type::ArrowEnum: \ - RETURN_NOT_OK((ConvertListsLike(col, out_buffer))); \ +#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ + case Type::ArrowEnum: \ + RETURN_NOT_OK((ConvertListsLike(options_, col, out_buffer))); \ break; class ObjectBlock : public PandasBlock { @@ -693,21 +704,21 @@ class ObjectBlock : public PandasBlock { const ChunkedArray& data = *col->data().get(); if (type == Type::BOOL) { - RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer)); + RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer)); } else if (type == Type::BINARY) { - RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::STRING) { - RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::FIXED_SIZE_BINARY) { - RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer)); + RETURN_NOT_OK(ConvertFixedSizeBinary(options_, data, out_buffer)); } else if (type == Type::TIME32) { - RETURN_NOT_OK(ConvertTimes(data, out_buffer)); + RETURN_NOT_OK(ConvertTimes(options_, data, out_buffer)); } else if (type == Type::TIME64) { - RETURN_NOT_OK(ConvertTimes(data, out_buffer)); + RETURN_NOT_OK(ConvertTimes(options_, data, out_buffer)); } else if (type == Type::DECIMAL) { - RETURN_NOT_OK(ConvertDecimals(data, out_buffer)); + RETURN_NOT_OK(ConvertDecimals(options_, data, out_buffer)); } else if (type == Type::NA) { - RETURN_NOT_OK(ConvertNulls(data, out_buffer)); + RETURN_NOT_OK(ConvertNulls(options_, data, out_buffer)); } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast(col->type()); switch (list_type->value_type()->id()) { @@ -732,7 +743,7 @@ class ObjectBlock : public PandasBlock { } } } else if (type == Type::STRUCT) { - RETURN_NOT_OK(ConvertStruct(data, out_buffer)); + RETURN_NOT_OK(ConvertStruct(options_, data, out_buffer)); } else { std::stringstream ss; ss << "Unsupported type for object array output: " << col->type()->ToString(); @@ -768,7 +779,7 @@ class IntBlock : public PandasBlock { return Status::NotImplemented(ss.str()); } - ConvertIntegerNoNullsSameType(data, out_buffer); + ConvertIntegerNoNullsSameType(options_, data, out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -821,8 +832,8 @@ class Float64Block : public PandasBlock { const ChunkedArray& data = *col->data().get(); 
-#define INTEGER_CASE(IN_TYPE) \ - ConvertIntegerWithNulls(data, out_buffer); \ +#define INTEGER_CASE(IN_TYPE) \ + ConvertIntegerWithNulls(options_, data, out_buffer); \ break; switch (type) { @@ -881,7 +892,7 @@ class BoolBlock : public PandasBlock { uint8_t* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - ConvertBooleanNoNulls(*col->data().get(), out_buffer); + ConvertBooleanNoNulls(options_, *col->data().get(), out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -946,8 +957,8 @@ class DatetimeBlock : public PandasBlock { class DatetimeTZBlock : public DatetimeBlock { public: - DatetimeTZBlock(const std::string& timezone, int64_t num_rows) - : DatetimeBlock(num_rows, 1), timezone_(timezone) {} + DatetimeTZBlock(PandasOptions options, const std::string& timezone, int64_t num_rows) + : DatetimeBlock(options, num_rows, 1), timezone_(timezone) {} // Like Categorical, the internal ndarray is 1-dimensional Status Allocate() override { return AllocateDatetime(1); } @@ -973,25 +984,25 @@ class DatetimeTZBlock : public DatetimeBlock { std::string timezone_; }; -template class CategoricalBlock : public PandasBlock { public: - explicit CategoricalBlock(int64_t num_rows) : PandasBlock(num_rows, 1) {} - Status Allocate() override { - constexpr int npy_type = internal::arrow_traits::npy_type; + explicit CategoricalBlock(PandasOptions options, MemoryPool* pool, int64_t num_rows) + : PandasBlock(options, num_rows, 1), pool_(pool) {} - if (!(npy_type == NPY_INT8 || npy_type == NPY_INT16 || npy_type == NPY_INT32 || - npy_type == NPY_INT64)) { - return Status::Invalid("Category indices must be signed integers"); - } - return AllocateNDArray(npy_type, 1); + Status Allocate() override { + return Status::NotImplemented( + "CategoricalBlock allocation happens when calling Write"); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, - int64_t rel_placement) override { - using T = typename internal::arrow_traits::T; + template + Status WriteIndices(const std::shared_ptr& col) { + using TRAITS = internal::arrow_traits; + using T = typename TRAITS::T; + constexpr int npy_type = TRAITS::npy_type; + RETURN_NOT_OK(AllocateNDArray(npy_type, 1)); - T* out_values = reinterpret_cast(block_data_) + rel_placement * num_rows_; + // No relative placement offset because a single column + T* out_values = reinterpret_cast(block_data_); const ChunkedArray& data = *col->data().get(); @@ -1008,13 +1019,48 @@ class CategoricalBlock : public PandasBlock { } } - placement_data_[rel_placement] = abs_placement; + return Status::OK(); + } + + Status Write(const std::shared_ptr& col, int64_t abs_placement, + int64_t rel_placement) override { + std::shared_ptr converted_col; + if (options_.strings_to_categorical && + (col->type()->id() == Type::STRING || col->type()->id() == Type::BINARY)) { + RETURN_NOT_OK(EncodeColumnToDictionary(static_cast(*col), pool_, + &converted_col)); + } else { + converted_col = col; + } - auto dict_type = static_cast(col->type().get()); + const auto& dict_type = static_cast(*converted_col->type()); + + switch (dict_type.index_type()->id()) { + case Type::INT8: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + case Type::INT16: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + case Type::INT32: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + case Type::INT64: + RETURN_NOT_OK(WriteIndices(converted_col)); + break; + default: { + std::stringstream ss; + ss << "Categorical index type not supported: " + << 
dict_type.index_type()->ToString(); + return Status::NotImplemented(ss.str()); + } + } + placement_data_[rel_placement] = abs_placement; PyObject* dict; - RETURN_NOT_OK(ConvertArrayToPandas(dict_type->dictionary(), nullptr, &dict)); + RETURN_NOT_OK(ConvertArrayToPandas(options_, dict_type.dictionary(), nullptr, &dict)); dictionary_.reset(dict); + ordered_ = dict_type.ordered(); return Status::OK(); } @@ -1027,20 +1073,26 @@ class CategoricalBlock : public PandasBlock { PyDict_SetItemString(result, "dictionary", dictionary_.obj()); PyDict_SetItemString(result, "placement", placement_arr_.obj()); + PyObject* py_ordered = ordered_ ? Py_True : Py_False; + Py_INCREF(py_ordered); + PyDict_SetItemString(result, "ordered", py_ordered); + *output = result; return Status::OK(); } protected: + MemoryPool* pool_; OwnedRef dictionary_; + bool ordered_; }; -Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, - std::shared_ptr* block) { -#define BLOCK_CASE(NAME, TYPE) \ - case PandasBlock::NAME: \ - *block = std::make_shared(num_rows, num_columns); \ +Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows, + int num_columns, std::shared_ptr* block) { +#define BLOCK_CASE(NAME, TYPE) \ + case PandasBlock::NAME: \ + *block = std::make_shared(options, num_rows, num_columns); \ break; switch (type) { @@ -1066,36 +1118,94 @@ Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, return (*block)->Allocate(); } -static inline Status MakeCategoricalBlock(const std::shared_ptr& type, - int64_t num_rows, - std::shared_ptr* block) { - // All categoricals become a block with a single column - auto dict_type = static_cast(type.get()); - switch (dict_type->index_type()->id()) { +using BlockMap = std::unordered_map>; + +static Status GetPandasBlockType(const Column& col, const PandasOptions& options, + PandasBlock::type* output_type) { + switch (col.type()->id()) { + case Type::BOOL: + *output_type = col.null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL; + break; + case Type::UINT8: + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8; + break; case Type::INT8: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8; + break; + case Type::UINT16: + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16; break; case Type::INT16: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16; + break; + case Type::UINT32: + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32; break; case Type::INT32: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32; break; case Type::INT64: - *block = std::make_shared>(num_rows); + *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64; + break; + case Type::UINT64: + *output_type = col.null_count() > 0 ? 
PandasBlock::DOUBLE : PandasBlock::UINT64; + break; + case Type::FLOAT: + *output_type = PandasBlock::FLOAT; + break; + case Type::DOUBLE: + *output_type = PandasBlock::DOUBLE; + break; + case Type::STRING: + case Type::BINARY: + if (options.strings_to_categorical) { + *output_type = PandasBlock::CATEGORICAL; + break; + } + case Type::NA: + case Type::FIXED_SIZE_BINARY: + case Type::STRUCT: + case Type::TIME32: + case Type::TIME64: + case Type::DECIMAL: + *output_type = PandasBlock::OBJECT; + break; + case Type::DATE32: + *output_type = PandasBlock::DATETIME; + break; + case Type::DATE64: + *output_type = PandasBlock::DATETIME; + break; + case Type::TIMESTAMP: { + const auto& ts_type = static_cast(*col.type()); + if (ts_type.timezone() != "") { + *output_type = PandasBlock::DATETIME_WITH_TZ; + } else { + *output_type = PandasBlock::DATETIME; + } + } break; + case Type::LIST: { + auto list_type = std::static_pointer_cast(col.type()); + if (!ListTypeSupported(*list_type->value_type())) { + std::stringstream ss; + ss << "Not implemented type for list in DataFrameBlock: " + << list_type->value_type()->ToString(); + return Status::NotImplemented(ss.str()); + } + *output_type = PandasBlock::OBJECT; + } break; + case Type::DICTIONARY: + *output_type = PandasBlock::CATEGORICAL; break; - default: { + default: std::stringstream ss; - ss << "Categorical index type not implemented: " - << dict_type->index_type()->ToString(); + ss << "No known equivalent Pandas block for Arrow data of type "; + ss << col.type()->ToString() << " is known."; return Status::NotImplemented(ss.str()); - } } - return (*block)->Allocate(); + return Status::OK(); } -using BlockMap = std::unordered_map>; - // Construct the exact pandas 0.x "BlockManager" memory layout // // * For each column determine the correct output pandas type @@ -1105,7 +1215,9 @@ using BlockMap = std::unordered_map>; // * placement arrays as we go class DataFrameBlockCreator { public: - explicit DataFrameBlockCreator(const std::shared_ptr
& table) : table_(table) {} + explicit DataFrameBlockCreator(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool) + : table_(table), options_(options), pool_(pool) {} Status Convert(int nthreads, PyObject** output) { column_types_.resize(table_->num_columns()); @@ -1123,94 +1235,17 @@ class DataFrameBlockCreator { for (int i = 0; i < table_->num_columns(); ++i) { std::shared_ptr col = table_->column(i); PandasBlock::type output_type; - - Type::type column_type = col->type()->id(); - switch (column_type) { - case Type::BOOL: - output_type = col->null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL; - break; - case Type::UINT8: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8; - break; - case Type::INT8: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8; - break; - case Type::UINT16: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16; - break; - case Type::INT16: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16; - break; - case Type::UINT32: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32; - break; - case Type::INT32: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32; - break; - case Type::INT64: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64; - break; - case Type::UINT64: - output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64; - break; - case Type::FLOAT: - output_type = PandasBlock::FLOAT; - break; - case Type::DOUBLE: - output_type = PandasBlock::DOUBLE; - break; - case Type::NA: - case Type::STRING: - case Type::BINARY: - case Type::FIXED_SIZE_BINARY: - case Type::STRUCT: - case Type::TIME32: - case Type::TIME64: - case Type::DECIMAL: - output_type = PandasBlock::OBJECT; - break; - case Type::DATE32: - output_type = PandasBlock::DATETIME; - break; - case Type::DATE64: - output_type = PandasBlock::DATETIME; - break; - case Type::TIMESTAMP: { - const auto& ts_type = static_cast(*col->type()); - if (ts_type.timezone() != "") { - output_type = PandasBlock::DATETIME_WITH_TZ; - } else { - output_type = PandasBlock::DATETIME; - } - } break; - case Type::LIST: { - auto list_type = std::static_pointer_cast(col->type()); - if (!ListTypeSupported(*list_type->value_type())) { - std::stringstream ss; - ss << "Not implemented type for list in DataFrameBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); - } - output_type = PandasBlock::OBJECT; - } break; - case Type::DICTIONARY: - output_type = PandasBlock::CATEGORICAL; - break; - default: - std::stringstream ss; - ss << "No known equivalent Pandas block for Arrow data of type "; - ss << col->type()->ToString() << " is known."; - return Status::NotImplemented(ss.str()); - } + RETURN_NOT_OK(GetPandasBlockType(*col, options_, &output_type)); int block_placement = 0; std::shared_ptr block; if (output_type == PandasBlock::CATEGORICAL) { - RETURN_NOT_OK(MakeCategoricalBlock(col->type(), table_->num_rows(), &block)); + block = std::make_shared(options_, pool_, table_->num_rows()); categorical_blocks_[i] = block; } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { const auto& ts_type = static_cast(*col->type()); - block = std::make_shared(ts_type.timezone(), table_->num_rows()); + block = std::make_shared(options_, ts_type.timezone(), + table_->num_rows()); RETURN_NOT_OK(block->Allocate()); datetimetz_blocks_[i] = block; } else { @@ -1224,92 +1259,61 @@ class DataFrameBlockCreator { type_counts_[output_type] = 1; } } - column_types_[i] = 
output_type; column_block_placement_[i] = block_placement; } // Create normal non-categorical blocks - for (const auto& it : type_counts_) { + for (const auto& it : this->type_counts_) { PandasBlock::type type = static_cast(it.first); std::shared_ptr block; - RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block)); - blocks_[type] = block; + RETURN_NOT_OK( + MakeBlock(this->options_, type, this->table_->num_rows(), it.second, &block)); + this->blocks_[type] = block; } return Status::OK(); } - Status WriteTableToBlocks(int nthreads) { - auto WriteColumn = [this](int i) { - std::shared_ptr col = this->table_->column(i); - PandasBlock::type output_type = this->column_types_[i]; + Status GetBlock(int i, std::shared_ptr* block) { + PandasBlock::type output_type = this->column_types_[i]; - int rel_placement = this->column_block_placement_[i]; + if (output_type == PandasBlock::CATEGORICAL) { + auto it = this->categorical_blocks_.find(i); + if (it == this->blocks_.end()) { + return Status::KeyError("No categorical block allocated"); + } + *block = it->second; + } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { + auto it = this->datetimetz_blocks_.find(i); + if (it == this->datetimetz_blocks_.end()) { + return Status::KeyError("No datetimetz block allocated"); + } + *block = it->second; + } else { + auto it = this->blocks_.find(output_type); + if (it == this->blocks_.end()) { + return Status::KeyError("No block allocated"); + } + *block = it->second; + } + return Status::OK(); + } + Status WriteTableToBlocks(int nthreads) { + auto WriteColumn = [this](int i) { std::shared_ptr block; - if (output_type == PandasBlock::CATEGORICAL) { - auto it = this->categorical_blocks_.find(i); - if (it == this->blocks_.end()) { - return Status::KeyError("No categorical block allocated"); - } - block = it->second; - } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { - auto it = this->datetimetz_blocks_.find(i); - if (it == this->datetimetz_blocks_.end()) { - return Status::KeyError("No datetimetz block allocated"); - } - block = it->second; - } else { - auto it = this->blocks_.find(output_type); - if (it == this->blocks_.end()) { - return Status::KeyError("No block allocated"); - } - block = it->second; - } - return block->Write(col, i, rel_placement); + RETURN_NOT_OK(this->GetBlock(i, &block)); + return block->Write(this->table_->column(i), i, this->column_block_placement_[i]); }; - nthreads = std::min(nthreads, table_->num_columns()); - + int num_tasks = table_->num_columns(); + nthreads = std::min(nthreads, num_tasks); if (nthreads == 1) { - for (int i = 0; i < table_->num_columns(); ++i) { + for (int i = 0; i < num_tasks; ++i) { RETURN_NOT_OK(WriteColumn(i)); } } else { - std::vector thread_pool; - thread_pool.reserve(nthreads); - std::atomic task_counter(0); - - std::mutex error_mtx; - bool error_occurred = false; - Status error; - - for (int thread_id = 0; thread_id < nthreads; ++thread_id) { - thread_pool.emplace_back( - [this, &error, &error_occurred, &error_mtx, &task_counter, &WriteColumn]() { - int column_num; - while (!error_occurred) { - column_num = task_counter.fetch_add(1); - if (column_num >= this->table_->num_columns()) { - break; - } - Status s = WriteColumn(column_num); - if (!s.ok()) { - std::lock_guard lock(error_mtx); - error_occurred = true; - error = s; - break; - } - } - }); - } - for (auto&& thread : thread_pool) { - thread.join(); - } - - if (error_occurred) { - return error; - } + RETURN_NOT_OK(ParallelFor(nthreads, num_tasks, WriteColumn)); } return 
Status::OK(); } @@ -1354,6 +1358,11 @@ class DataFrameBlockCreator { // block type -> type count std::unordered_map type_counts_; + PandasOptions options_; + + // Memory pool for dictionary encoding + MemoryPool* pool_; + // block type -> block BlockMap blocks_; @@ -1366,8 +1375,9 @@ class DataFrameBlockCreator { class ArrowDeserializer { public: - ArrowDeserializer(const std::shared_ptr& col, PyObject* py_ref) - : col_(col), data_(*col->data().get()), py_ref_(py_ref) {} + ArrowDeserializer(PandasOptions options, const std::shared_ptr& col, + PyObject* py_ref) + : col_(col), data_(*col->data().get()), options_(options), py_ref_(py_ref) {} Status AllocateOutput(int type) { PyAcquireGIL lock; @@ -1378,7 +1388,8 @@ class ArrowDeserializer { } template - Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr arr) { + Status ConvertValuesZeroCopy(PandasOptions options, int npy_type, + std::shared_ptr arr) { typedef typename internal::arrow_traits::T T; const auto& prim_arr = static_cast(*arr); @@ -1429,7 +1440,7 @@ class ArrowDeserializer { int npy_type = traits::npy_type; if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { - return ConvertValuesZeroCopy(npy_type, data_.chunk(0)); + return ConvertValuesZeroCopy(options_, npy_type, data_.chunk(0)); } RETURN_NOT_OK(AllocateOutput(npy_type)); @@ -1482,17 +1493,17 @@ class ArrowDeserializer { typedef typename traits::T T; if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { - return ConvertValuesZeroCopy(traits::npy_type, data_.chunk(0)); + return ConvertValuesZeroCopy(options_, traits::npy_type, data_.chunk(0)); } if (data_.null_count() > 0) { RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertIntegerWithNulls(data_, out_values); + ConvertIntegerWithNulls(options_, data_, out_values); } else { RETURN_NOT_OK(AllocateOutput(traits::npy_type)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertIntegerNoNullsSameType(data_, out_values); + ConvertIntegerNoNullsSameType(options_, data_, out_values); } return Status::OK(); @@ -1502,7 +1513,7 @@ class ArrowDeserializer { inline Status VisitObjects(FUNCTOR func) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - return func(data_, out_values); + return func(options_, data_, out_values); } // UTF8 strings @@ -1534,7 +1545,7 @@ class ArrowDeserializer { } else { RETURN_NOT_OK(AllocateOutput(internal::arrow_traits::npy_type)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertBooleanNoNulls(data_, out_values); + ConvertBooleanNoNulls(options_, data_, out_values); } return Status::OK(); } @@ -1542,7 +1553,7 @@ class ArrowDeserializer { Status Visit(const ListType& type) { #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ - return ConvertListsLike(col_, out_values); + return ConvertListsLike(options_, col_, out_values); RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); @@ -1572,8 +1583,7 @@ class ArrowDeserializer { } Status Visit(const DictionaryType& type) { - std::shared_ptr block; - RETURN_NOT_OK(MakeCategoricalBlock(col_->type(), col_->length(), &block)); + auto block = std::make_shared(options_, nullptr, col_->length()); RETURN_NOT_OK(block->Write(col_, 0, 0)); auto dict_type = static_cast(col_->type().get()); @@ -1587,7 +1597,8 @@ class ArrowDeserializer { // Release GIL before calling ConvertArrayToPandas, will be reacquired 
// there if needed lock.release(); - RETURN_NOT_OK(ConvertArrayToPandas(dict_type->dictionary(), nullptr, &dictionary)); + RETURN_NOT_OK( + ConvertArrayToPandas(options_, dict_type->dictionary(), nullptr, &dictionary)); lock.acquire(); PyDict_SetItemString(result_, "indices", block->block_arr()); @@ -1607,28 +1618,29 @@ class ArrowDeserializer { private: std::shared_ptr col_; const ChunkedArray& data_; + PandasOptions options_; PyObject* py_ref_; PyArrayObject* arr_; PyObject* result_; }; -Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, - PyObject** out) { +Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, + PyObject* py_ref, PyObject** out) { static std::string dummy_name = "dummy"; auto field = std::make_shared(dummy_name, arr->type()); auto col = std::make_shared(field, arr); - return ConvertColumnToPandas(col, py_ref, out); + return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, - PyObject** out) { - ArrowDeserializer converter(col, py_ref); +Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, + PyObject* py_ref, PyObject** out) { + ArrowDeserializer converter(options, col, py_ref); return converter.Convert(out); } -Status ConvertTableToPandas(const std::shared_ptr
& table, int nthreads, - PyObject** out) { - DataFrameBlockCreator helper(table); +Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr
& table, + int nthreads, MemoryPool* pool, PyObject** out) { + DataFrameBlockCreator helper(options, table, pool); return helper.Convert(nthreads, out); } diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 5a99274a33ee0..1d716a5c94fa6 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -39,18 +39,18 @@ class Table; namespace py { -ARROW_EXPORT -Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, - PyObject** out); - -ARROW_EXPORT -Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, - PyObject** out); - struct PandasOptions { bool strings_to_categorical; }; +ARROW_EXPORT +Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, + PyObject* py_ref, PyObject** out); + +ARROW_EXPORT +Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, + PyObject* py_ref, PyObject** out); + // Convert a whole table as efficiently as possible to a pandas.DataFrame. // // The returned Python object is a list of tuples consisting of the exact 2D @@ -58,8 +58,8 @@ struct PandasOptions { // // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) ARROW_EXPORT -Status ConvertTableToPandas(const std::shared_ptr
& table, int nthreads, - PyObject** out); +Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr
& table, + int nthreads, MemoryPool* pool, PyObject** out); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index dd956463fec76..0d830127ee9b0 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -92,7 +92,9 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) { PyObject* out; Py_BEGIN_ALLOW_THREADS; - ASSERT_RAISES(UnknownError, ConvertTableToPandas(table, 2, &out)); + PandasOptions options; + MemoryPool* pool = default_memory_pool(); + ASSERT_RAISES(UnknownError, ConvertTableToPandas(options, table, 2, pool, &out)); Py_END_ALLOW_THREADS; } diff --git a/cpp/src/arrow/util/parallel.h b/cpp/src/arrow/util/parallel.h new file mode 100644 index 0000000000000..9fec000c3ed35 --- /dev/null +++ b/cpp/src/arrow/util/parallel.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_PARALLEL_H +#define ARROW_UTIL_PARALLEL_H + +#include +#include +#include +#include + +#include "arrow/status.h" + +namespace arrow { + +template +Status ParallelFor(int nthreads, int num_tasks, FUNCTION&& func) { + std::vector thread_pool; + thread_pool.reserve(nthreads); + std::atomic task_counter(0); + + std::mutex error_mtx; + bool error_occurred = false; + Status error; + + for (int thread_id = 0; thread_id < nthreads; ++thread_id) { + thread_pool.emplace_back( + [&num_tasks, &task_counter, &error, &error_occurred, &error_mtx, &func]() { + int task_id; + while (!error_occurred) { + task_id = task_counter.fetch_add(1); + if (task_id >= num_tasks) { + break; + } + Status s = func(task_id); + if (!s.ok()) { + std::lock_guard lock(error_mtx); + error_occurred = true; + error = s; + break; + } + } + }); + } + for (auto&& thread : thread_pool) { + thread.join(); + } + if (error_occurred) { + return error; + } + return Status::OK(); +} + +} // namespace arrow + +#endif diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index c0c7ac6da09ed..20e778d068ff8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -274,10 +274,15 @@ cdef class Array: return pyarrow_wrap_array(result) - def to_pandas(self): + def to_pandas(self, c_bool strings_to_categorical=False): """ Convert to an array object suitable for use in pandas + Parameters + ---------- + strings_to_categorical : boolean, default False + Encode string (UTF8) and binary types to pandas.Categorical + See also -------- Column.to_pandas @@ -286,9 +291,12 @@ cdef class Array: """ cdef: PyObject* out + PandasOptions options + options = PandasOptions(strings_to_categorical=strings_to_categorical) with nogil: - check_status(ConvertArrayToPandas(self.sp_array, self, &out)) + check_status(ConvertArrayToPandas(options, self.sp_array, + self, &out)) 
return wrap_array_output(out) def to_pylist(self): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 3ea487385de76..eed9640861fac 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -740,14 +740,18 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus TensorToNdarray(const CTensor& tensor, object base, PyObject** out) - CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, + CStatus ConvertArrayToPandas(PandasOptions options, + const shared_ptr[CArray]& arr, object py_ref, PyObject** out) - CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, + CStatus ConvertColumnToPandas(PandasOptions options, + const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) - CStatus ConvertTableToPandas(const shared_ptr[CTable]& table, - int nthreads, PyObject** out) + CStatus ConvertTableToPandas(PandasOptions options, + const shared_ptr[CTable]& table, + int nthreads, CMemoryPool* pool, + PyObject** out) void c_set_default_memory_pool \ " arrow::py::set_default_memory_pool"(CMemoryPool* pool)\ @@ -767,6 +771,9 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyBytesReader(CBufferReader): PyBytesReader(object fo) + cdef struct PandasOptions: + c_bool strings_to_categorical + cdef extern from 'arrow/python/init.h': int arrow_init_numpy() except -1 diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index ddd562238e38a..434b1c9eab90e 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -269,7 +269,7 @@ def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False): return values, type_ -def table_to_blockmanager(table, nthreads=1): +def table_to_blockmanager(options, table, memory_pool, nthreads=1): import pandas.core.internals as _int from pyarrow.compat import DatetimeTZDtype import pyarrow.lib as lib @@ -305,17 +305,16 @@ def table_to_blockmanager(table, nthreads=1): block_table.schema.get_field_index(name) ) - result = lib.table_to_blocks(block_table, nthreads) + result = lib.table_to_blocks(options, block_table, nthreads, memory_pool) blocks = [] for item in result: block_arr = item['block'] placement = item['placement'] if 'dictionary' in item: - ordered = block_table.schema[placement[0]].type.ordered cat = pd.Categorical(block_arr, categories=item['dictionary'], - ordered=ordered, fastpath=True) + ordered=item['ordered'], fastpath=True) block = _int.make_block(cat, placement=placement, klass=_int.CategoricalBlock, fastpath=True) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index b9b08998b3372..976f4297d5228 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -157,7 +157,7 @@ cdef class Column: sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array)) return pyarrow_wrap_column(sp_column) - def to_pandas(self): + def to_pandas(self, strings_to_categorical=False): """ Convert the arrow::Column to a pandas.Series @@ -167,9 +167,13 @@ cdef class Column: """ cdef: PyObject* out + PandasOptions options + + options = PandasOptions(strings_to_categorical=strings_to_categorical) with nogil: - check_status(libarrow.ConvertColumnToPandas(self.sp_column, + check_status(libarrow.ConvertColumnToPandas(options, + self.sp_column, self, &out)) return pd.Series(wrap_array_output(out), name=self.name) @@ -580,15 +584,18 @@ cdef class RecordBatch: return pyarrow_wrap_batch(batch) -def table_to_blocks(Table table, int nthreads): +def 
table_to_blocks(PandasOptions options, Table table, int nthreads, + MemoryPool memory_pool): cdef: PyObject* result_obj shared_ptr[CTable] c_table = table.sp_table + CMemoryPool* pool + pool = maybe_unbox_memory_pool(memory_pool) with nogil: check_status( libarrow.ConvertTableToPandas( - c_table, nthreads, &result_obj + options, c_table, nthreads, pool, &result_obj ) ) @@ -790,7 +797,8 @@ cdef class Table: return pyarrow_wrap_table(c_table) - def to_pandas(self, nthreads=None): + def to_pandas(self, nthreads=None, strings_to_categorical=False, + memory_pool=None): """ Convert the arrow::Table to a pandas DataFrame @@ -800,16 +808,23 @@ cdef class Table: For the default, we divide the CPU count by 2 because most modern computers have hyperthreading turned on, so doubling the CPU count beyond the number of physical cores does not help + strings_to_categorical : boolean, default False + Encode string (UTF8) and binary types to pandas.Categorical + memory_pool: MemoryPool, optional + Specific memory pool to use to allocate casted columns Returns ------- pandas.DataFrame """ + cdef: + PandasOptions options + options = PandasOptions(strings_to_categorical=strings_to_categorical) self._check_nullptr() if nthreads is None: nthreads = cpu_count() - - mgr = pdcompat.table_to_blockmanager(self, nthreads) + mgr = pdcompat.table_to_blockmanager(options, self, memory_pool, + nthreads) return pd.DataFrame(mgr) def to_pydict(self): diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 93058fb0a47b4..8969777b526c0 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -327,7 +327,7 @@ def test_timestamps_notimezone_no_nulls(self): '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - }) + }) field = pa.field('datetime64', pa.timestamp('ms')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -342,7 +342,7 @@ def test_timestamps_notimezone_no_nulls(self): '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - }) + }) field = pa.field('datetime64', pa.timestamp('ns')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -369,7 +369,7 @@ def test_timestamps_notimezone_nulls(self): None, '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - }) + }) field = pa.field('datetime64', pa.timestamp('ms')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -384,7 +384,7 @@ def test_timestamps_notimezone_nulls(self): None, '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - }) + }) field = pa.field('datetime64', pa.timestamp('ns')) schema = pa.schema([field]) self._check_pandas_roundtrip( @@ -400,7 +400,7 @@ def test_timestamps_with_timezone(self): '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - }) + }) df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') .to_frame()) self._check_pandas_roundtrip(df, timestamps_to_ms=True) @@ -413,7 +413,7 @@ def test_timestamps_with_timezone(self): '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - }) + }) df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') .to_frame()) self._check_pandas_roundtrip(df, timestamps_to_ms=False) @@ -462,7 +462,7 @@ def test_date_objects_typed(self): table_pandas = table.to_pandas() ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04', - '2017-04-05'], + '2017-04-05'], dtype='datetime64[D]') .astype('datetime64[ns]')) ex_values[1] = 
pd.NaT.value @@ -491,10 +491,10 @@ def test_timedelta(self): # TODO(jreback): Pandas only support ns resolution # Arrow supports ??? for resolution df = pd.DataFrame({ - 'timedelta': np.arange(start=0, stop=3*86400000, + 'timedelta': np.arange(start=0, stop=3 * 86400000, step=86400000, dtype='timedelta64[ms]') - }) + }) pa.Table.from_pandas(df) def test_column_of_arrays(self): @@ -920,6 +920,17 @@ def test_decimal_metadata(self): assert data_column['numpy_type'] == 'object' assert data_column['metadata'] == {'precision': 26, 'scale': 11} + def test_table_str_to_categorical(self): + values = [None, 'a', 'b', np.nan] + df = pd.DataFrame({'strings': values}) + field = pa.field('strings', pa.string()) + schema = pa.schema([field]) + table = pa.Table.from_pandas(df, schema=schema) + + result = table.to_pandas(strings_to_categorical=True) + expected = pd.DataFrame({'strings': pd.Categorical(values)}) + tm.assert_frame_equal(result, expected, check_dtype=True) + def _pytime_from_micros(val): microseconds = val % 1000000 From a9c2f196e244e4073c750cbf07872a5ffccf48d2 Mon Sep 17 00:00:00 2001 From: Matt Darwin <(none)> Date: Tue, 8 Aug 2017 13:52:21 -0400 Subject: [PATCH 34/38] ARROW-1242: [JAVA] - upgrade jackson to mitigate security vulnerabilities As per #872 I am upgrading Jackson to the latest version on the current train (2.7.1 --> 2.7.9) Author: Matt Darwin <(none)> Author: Matt Closes #929 from mattdarwin/ARROW-1242-upgrade-jackson and squashes the following commits: d0595176 [Matt Darwin] 1242 upgraing jackson to 2.7.9 bc3b6a07 [Matt] Merge pull request #1 from apache/master --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index de2113e397e15..5702e960498fa 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 1.7.25 18.0 2 - 2.7.1 + 2.7.9 2.7.1 false From 7fdbcc60693884f701b40ee74ff86f5da71976af Mon Sep 17 00:00:00 2001 From: Matt Darwin <(none)> Date: Wed, 9 Aug 2017 09:10:08 -0400 Subject: [PATCH 35/38] ARROW-1243: [JAVA] update all libs to latest versions NB this commit excludes Jackson and logback upgrades, since they are dealt with in 871 and 872 Author: Matt Darwin <(none)> Author: Matt Darwin Author: Matt Closes #873 from mattdarwin/upgrade-libs and squashes the following commits: 9b51f46e [Matt Darwin] Merge branch 'master' into upgrade-libs 284a4ce3 [Matt Darwin] Merge branch 'master' of https://github.com/apache/arrow 79550b15 [Matt Darwin] rolling back lilith to 0.9.44 since 8 doesn't support java 7 c63eef64 [Matt Darwin] Merge branch 'master' into upgrade-libs bc3b6a07 [Matt] Merge pull request #1 from apache/master 8599ba06 [Matt Darwin] backing out guava upgrade 80d81e64 [Matt Darwin] downgrading guava to 20 for java 7 compatibility 806f3489 [Matt Darwin] Merge branch 'master' into upgrade-libs 8aafb7e9 [Matt Darwin] correcting indentation in BaseValueVector 94c14698 [Matt Darwin] upgrading netty to 4.0.49 cff5596a [Matt Darwin] reverting to netty 4.0.41.Final 568737d3 [Matt Darwin] switching to Collections from Guava for empty iterator c194e48d [Matt Darwin] upgraded hppc to 0.7.2 38be468c [Matt Darwin] upgrading libs except jackson and logback --- java/memory/pom.xml | 4 ++-- java/pom.xml | 6 +++--- java/tools/pom.xml | 4 ++-- java/vector/pom.xml | 6 +++--- .../main/java/org/apache/arrow/vector/BaseValueVector.java | 3 ++- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/java/memory/pom.xml b/java/memory/pom.xml index 9a8d2d7c9b56f..1070747691cfc 100644 --- a/java/memory/pom.xml +++ 
b/java/memory/pom.xml @@ -23,13 +23,13 @@ com.google.code.findbugs jsr305 - 3.0.1 + 3.0.2 com.carrotsearch hppc - 0.7.1 + 0.7.2 diff --git a/java/pom.xml b/java/pom.xml index 5702e960498fa..b0621c5a41ea6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -454,7 +454,7 @@ io.netty netty-handler - 4.0.41.Final + 4.0.49.Final @@ -495,7 +495,7 @@ com.googlecode.jmockit jmockit - 1.3 + 1.7 test @@ -507,7 +507,7 @@ org.mockito mockito-core - 1.9.5 + 2.7.22 test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 9d067ef1e9bc2..8aadb44f3f69f 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -38,12 +38,12 @@ org.apache.commons commons-lang3 - 3.4 + 3.6 commons-cli commons-cli - 1.2 + 1.4 ch.qos.logback diff --git a/java/vector/pom.xml b/java/vector/pom.xml index e15ab9a2497fc..a81bbd0f12cca 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -34,7 +34,7 @@ joda-time joda-time - 2.9 + 2.9.9 com.fasterxml.jackson.core @@ -49,12 +49,12 @@ com.carrotsearch hppc - 0.7.1 + 0.7.2 org.apache.commons commons-lang3 - 3.4 + 3.6 commons-codec diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 05d2aa933cbef..598e578e55a6d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -18,6 +18,7 @@ package org.apache.arrow.vector; +import java.util.Collections; import java.util.Iterator; import com.google.flatbuffers.FlatBufferBuilder; @@ -104,7 +105,7 @@ public void reset() { @Override public Iterator iterator() { - return Iterators.emptyIterator(); + return Collections.emptyIterator(); } public static boolean checkBufRefs(final ValueVector vv) { From 86154f0be3fbafcd27716f3b3f7058c31242a52f Mon Sep 17 00:00:00 2001 From: Emilio Lahr-Vivaz Date: Wed, 9 Aug 2017 09:11:52 -0400 Subject: [PATCH 36/38] ARROW-1340: [Java] Fix NullableMapVector field metadata Author: Emilio Lahr-Vivaz Closes #953 from elahrvivaz/ARROW-1340 and squashes the following commits: a307779e [Emilio Lahr-Vivaz] ARROW-1340: [Java] Fix NullableMapVector field metadata --- .../vector/complex/NullableMapVector.java | 3 +- .../apache/arrow/vector/TestMapVector.java | 57 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index e70a915561f8b..fda9c1471e589 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -86,7 +86,8 @@ public NullableMapVector(String name, BufferAllocator allocator, FieldType field @Override public Field getField() { Field f = super.getField(); - return new Field(f.getName(), true, f.getType(), f.getChildren()); + FieldType type = new FieldType(true, f.getType(), f.getFieldType().getDictionary(), f.getFieldType().getMetadata()); + return new Field(f.getName(), type, f.getChildren()); } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java new file mode 100644 index 0000000000000..357df96aa2efc --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java 
@@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.NullableMapVector; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + + +public class TestMapVector { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testFieldMetadata() throws Exception { + Map metadata = new HashMap<>(); + metadata.put("k1", "v1"); + FieldType type = new FieldType(true, Struct.INSTANCE, null, metadata); + try (NullableMapVector vector = new NullableMapVector("map", allocator, type, null)) { + Assert.assertEquals(vector.getField().getMetadata(), type.getMetadata()); + } + } +} From e44ede87c069087e11b4f57682090e01ae06a746 Mon Sep 17 00:00:00 2001 From: Emilio Lahr-Vivaz Date: Wed, 9 Aug 2017 09:36:18 -0400 Subject: [PATCH 37/38] ARROW-1343: [Java] Aligning serialized schema, end of buffers in RecordBatches Author: Emilio Lahr-Vivaz Author: Wes McKinney Closes #954 from elahrvivaz/align_end and squashes the following commits: 79ac1204 [Wes McKinney] Revert to NDEBUG because it's a standard define in release builds in MSVC ae6bc9f2 [Wes McKinney] Use __declspec(noreturn) in MSVC. 
Not sure why this suddenly showed up 74b29ccf [Wes McKinney] Add notes to IPC.md to make alignment contract more clear e2f0114b [Wes McKinney] Add C++ DCHECKs on read path for aligned buffers, aligned file block offset, lengths 3d64c9f5 [Wes McKinney] Align stream schema message in C++, DCHECKs for FileBlocks 4778ee1f [Emilio Lahr-Vivaz] adding padding to magic bytes in file format 53429159 [Emilio Lahr-Vivaz] using asserts instead of padding checks, adding padding to ArrowRecordBatch.calculateBodySize, moving align to writeBufferBatches a12b4ff8 [Emilio Lahr-Vivaz] comments 0b32265b [Emilio Lahr-Vivaz] aligning schema write 26bbc255 [Emilio Lahr-Vivaz] Merge branch 'ARROW-1340' into align_end a307779e [Emilio Lahr-Vivaz] ARROW-1340: [Java] Fix NullableMapVector field metadata b2bf86d4 [Emilio Lahr-Vivaz] WIP for aligning end of buffers --- cpp/src/arrow/ipc/metadata.cc | 15 ++++++++ cpp/src/arrow/ipc/reader.cc | 13 +++++++ cpp/src/arrow/ipc/util.h | 4 +- cpp/src/arrow/ipc/writer.cc | 37 ++++++++++--------- cpp/src/arrow/util/bit-util.h | 18 ++++++--- cpp/src/arrow/util/logging.h | 6 ++- cpp/src/arrow/util/macros.h | 6 +++ format/IPC.md | 10 ++++- .../arrow/vector/file/ArrowFileWriter.java | 4 +- .../apache/arrow/vector/file/ArrowMagic.java | 5 ++- .../arrow/vector/schema/ArrowRecordBatch.java | 3 ++ .../vector/stream/MessageSerializer.java | 34 +++++++++++++---- 12 files changed, 118 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index faf01a568483a..c9534217e927c 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -36,6 +36,7 @@ #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" +#include "arrow/util/logging.h" namespace arrow { @@ -773,6 +774,20 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti flatbuffers::Offset fb_schema; RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, dictionary_memo, &fb_schema)); +#ifndef NDEBUG + for (size_t i = 0; i < dictionaries.size(); ++i) { + DCHECK(BitUtil::IsMultipleOf8(dictionaries[i].offset)) << i; + DCHECK(BitUtil::IsMultipleOf8(dictionaries[i].metadata_length)) << i; + DCHECK(BitUtil::IsMultipleOf8(dictionaries[i].body_length)) << i; + } + + for (size_t i = 0; i < record_batches.size(); ++i) { + DCHECK(BitUtil::IsMultipleOf8(record_batches[i].offset)) << i; + DCHECK(BitUtil::IsMultipleOf8(record_batches[i].metadata_length)) << i; + DCHECK(BitUtil::IsMultipleOf8(record_batches[i].body_length)) << i; + } +#endif + auto fb_dictionaries = FileBlocksToFlatbuffer(fbb, dictionaries); auto fb_record_batches = FileBlocksToFlatbuffer(fbb, record_batches); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8ae82804c3164..6ea907e0ed09f 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -34,6 +34,7 @@ #include "arrow/table.h" #include "arrow/tensor.h" #include "arrow/type.h" +#include "arrow/util/bit-util.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -59,6 +60,9 @@ class IpcComponentSource { *out = nullptr; return Status::OK(); } else { + DCHECK(BitUtil::IsMultipleOf8(buffer->offset())) + << "Buffer " << buffer_index + << " did not start on 8-byte aligned offset: " << buffer->offset(); return file_->ReadAt(buffer->offset(), buffer->length(), out); } } @@ -550,6 +554,10 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { DCHECK_LT(i, num_record_batches()); FileBlock block = record_batch(i); + DCHECK(BitUtil::IsMultipleOf8(block.offset)); + 
DCHECK(BitUtil::IsMultipleOf8(block.metadata_length)); + DCHECK(BitUtil::IsMultipleOf8(block.body_length)); + std::unique_ptr message; RETURN_NOT_OK( ReadMessage(block.offset, block.metadata_length, file_.get(), &message)); @@ -564,6 +572,11 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { // Read all the dictionaries for (int i = 0; i < num_dictionaries(); ++i) { FileBlock block = dictionary(i); + + DCHECK(BitUtil::IsMultipleOf8(block.offset)); + DCHECK(BitUtil::IsMultipleOf8(block.metadata_length)); + DCHECK(BitUtil::IsMultipleOf8(block.body_length)); + std::unique_ptr message; RETURN_NOT_OK( ReadMessage(block.offset, block.metadata_length, file_.get(), &message)); diff --git a/cpp/src/arrow/ipc/util.h b/cpp/src/arrow/ipc/util.h index 49a7d01ac6f35..412f31215ed65 100644 --- a/cpp/src/arrow/ipc/util.h +++ b/cpp/src/arrow/ipc/util.h @@ -27,10 +27,12 @@ namespace arrow { namespace ipc { -// Align on 8-byte boundaries // Buffers are padded to 64-byte boundaries (for SIMD) static constexpr int kArrowAlignment = 64; +// Align on 8-byte boundaries in IPC +static constexpr int kArrowIpcAlignment = 8; + static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0}; static inline int64_t PaddedLength(int64_t nbytes, int64_t alignment = kArrowAlignment) { diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 163b27b443351..bc07dc659f601 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -156,7 +156,7 @@ class RecordBatchSerializer : public ArrayVisitor { // The buffer might be null if we are handling zero row lengths. if (buffer) { size = buffer->size(); - padding = BitUtil::RoundUpToMultipleOf64(size) - size; + padding = BitUtil::RoundUpToMultipleOf8(size) - size; } // TODO(wesm): We currently have no notion of shared memory page id's, @@ -172,7 +172,7 @@ class RecordBatchSerializer : public ArrayVisitor { } *body_length = offset - buffer_start_offset_; - DCHECK(BitUtil::IsMultipleOf64(*body_length)); + DCHECK(BitUtil::IsMultipleOf8(*body_length)); return Status::OK(); } @@ -216,7 +216,7 @@ class RecordBatchSerializer : public ArrayVisitor { // The buffer might be null if we are handling zero row lengths. 
if (buffer) { size = buffer->size(); - padding = BitUtil::RoundUpToMultipleOf64(size) - size; + padding = BitUtil::RoundUpToMultipleOf8(size) - size; } if (size > 0) { @@ -251,7 +251,7 @@ class RecordBatchSerializer : public ArrayVisitor { // Send padding if it's available const int64_t buffer_length = - std::min(BitUtil::RoundUpToMultipleOf64(array.length() * type_width), + std::min(BitUtil::RoundUpToMultipleOf8(array.length() * type_width), data->size() - byte_offset); data = SliceBuffer(data, byte_offset, buffer_length); } @@ -618,15 +618,7 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl { } virtual Status Start() { - std::shared_ptr schema_fb; - RETURN_NOT_OK(WriteSchemaMessage(*schema_, &dictionary_memo_, &schema_fb)); - - int32_t flatbuffer_size = static_cast(schema_fb->size()); - RETURN_NOT_OK( - Write(reinterpret_cast(&flatbuffer_size), sizeof(int32_t))); - - // Write the flatbuffer - RETURN_NOT_OK(Write(schema_fb->data(), flatbuffer_size)); + RETURN_NOT_OK(WriteSchema()); // If there are any dictionaries, write them as the next messages RETURN_NOT_OK(WriteDictionaries()); @@ -635,6 +627,17 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl { return Status::OK(); } + Status WriteSchema() { + std::shared_ptr schema_fb; + RETURN_NOT_OK(WriteSchemaMessage(*schema_, &dictionary_memo_, &schema_fb)); + + int32_t metadata_length = 0; + RETURN_NOT_OK(WriteMessage(*schema_fb, sink_, &metadata_length)); + RETURN_NOT_OK(UpdatePosition()); + DCHECK_EQ(0, position_ % 8) << "WriteSchema did not perform an aligned write"; + return Status::OK(); + } + virtual Status Close() { // Write the schema if not already written // User is responsible for closing the OutputStream @@ -701,9 +704,9 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl { &record_batches_[record_batches_.size() - 1]); } - // Adds padding bytes if necessary to ensure all memory blocks are written on - // 64-byte (or other alignment) boundaries. - Status Align(int64_t alignment = kArrowAlignment) { + Status Align(int64_t alignment = kArrowIpcAlignment) { + // Adds padding bytes if necessary to ensure all memory blocks are written on + // 8-byte (or other alignment) boundaries. int64_t remainder = PaddedLength(position_, alignment) - position_; if (remainder > 0) { return Write(kPaddingBytes, remainder); @@ -774,7 +777,7 @@ class RecordBatchFileWriter::RecordBatchFileWriterImpl // It is only necessary to align to 8-byte boundary at the start of the file RETURN_NOT_OK(Write(reinterpret_cast(kArrowMagicBytes), strlen(kArrowMagicBytes))); - RETURN_NOT_OK(Align(8)); + RETURN_NOT_OK(Align()); // We write the schema at the start of the file (and the end). This also // writes all the dictionaries at the beginning of the file diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index fc360bae4e451..5c3938aadbc66 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -217,13 +217,13 @@ static inline uint32_t RoundUpNumi64(uint32_t bits) { return (bits + 63) >> 6; } /// Returns the rounded down to 64 multiple. static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; } -static inline int64_t RoundUpToMultipleOf64(int64_t num) { +template +static inline int64_t RoundToPowerOfTwo(int64_t num) { // TODO(wesm): is this definitely needed? 
// DCHECK_GE(num, 0); - constexpr int64_t round_to = 64; - constexpr int64_t force_carry_addend = round_to - 1; - constexpr int64_t truncate_bitmask = ~(round_to - 1); - constexpr int64_t max_roundable_num = std::numeric_limits::max() - round_to; + constexpr int64_t force_carry_addend = ROUND_TO - 1; + constexpr int64_t truncate_bitmask = ~(ROUND_TO - 1); + constexpr int64_t max_roundable_num = std::numeric_limits::max() - ROUND_TO; if (num <= max_roundable_num) { return (num + force_carry_addend) & truncate_bitmask; } @@ -231,6 +231,14 @@ static inline int64_t RoundUpToMultipleOf64(int64_t num) { return num; } +static inline int64_t RoundUpToMultipleOf64(int64_t num) { + return RoundToPowerOfTwo<64>(num); +} + +static inline int64_t RoundUpToMultipleOf8(int64_t num) { + return RoundToPowerOfTwo<8>(num); +} + /// Non hw accelerated pop count. /// TODO: we don't use this in any perf sensitive code paths currently. There /// might be a much faster way to implement this. diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 89e69f932d52d..998f7ed7bfaaa 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -21,6 +21,8 @@ #include #include +#include "arrow/util/macros.h" + namespace arrow { // Stubbed versions of macros defined in glog/logging.h, intended for @@ -127,9 +129,9 @@ class CerrLog { class FatalLog : public CerrLog { public: explicit FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL){} // NOLINT + : CerrLog(ARROW_FATAL) {} // NOLINT - [[noreturn]] ~FatalLog() { + ARROW_NORETURN ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index a2f704f0c68bc..fe2d7689bf641 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -36,7 +36,13 @@ #if defined(__GNUC__) #define ARROW_PREDICT_FALSE(x) (__builtin_expect(x, 0)) #define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +#define ARROW_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define ARROW_NORETURN __declspec(noreturn) +#define ARROW_PREDICT_FALSE(x) x +#define ARROW_PREDICT_TRUE(x) x #else +#define ARROW_NORETURN #define ARROW_PREDICT_FALSE(x) x #define ARROW_PREDICT_TRUE(x) x #endif diff --git a/format/IPC.md b/format/IPC.md index 3fd234e4aa160..2f79031443b17 100644 --- a/format/IPC.md +++ b/format/IPC.md @@ -27,7 +27,7 @@ Data components in the stream and file formats are represented as encapsulated * A length prefix indicating the metadata size * The message metadata as a [Flatbuffer][3] * Padding bytes to an 8-byte boundary -* The message body +* The message body, which must be a multiple of 8 bytes Schematically, we have: @@ -38,6 +38,10 @@ Schematically, we have: ``` +The complete serialized message must be a multiple of 8 bytes so that messages +can be relocated between streams. Otherwise the amount of padding between the +metadata and the message body could be non-deterministic. + The `metadata_size` includes the size of the flatbuffer plus padding. The `Message` flatbuffer includes a version number, the particular message (as a flatbuffer union), and the size of the message body: @@ -154,6 +158,10 @@ struct Block { } ``` +The `metaDataLength` here includes the metadata length prefix, serialized +metadata, and any additional padding bytes, and by construction must be a +multiple of 8 bytes. + Some notes about this * The `Block` offset indicates the starting byte of the record batch. 
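
A quick illustration of the alignment contract described above. The following is a standalone sketch, not part of the patch: `PaddedLength` mirrors the helper of the same name in `cpp/src/arrow/ipc/util.h`, and the 123-byte flatbuffer size in `main` is an invented example. It shows why a 4-byte length prefix plus padded metadata always leaves the message body on an 8-byte boundary, which is the same arithmetic `MessageSerializer.serialize` performs on the Java side.

```cpp
// Standalone sketch of the IPC 8-byte alignment arithmetic (illustrative only).
#include <cassert>
#include <cstdint>
#include <iostream>

// Mirrors PaddedLength() from cpp/src/arrow/ipc/util.h: round nbytes up to
// the next multiple of alignment (assumed to be a power of two).
int64_t PaddedLength(int64_t nbytes, int64_t alignment = 8) {
  return ((nbytes + alignment - 1) / alignment) * alignment;
}

int main() {
  // Hypothetical schema flatbuffer of 123 bytes. The encapsulated message is
  // a 4-byte little-endian length prefix, the flatbuffer, then padding bytes.
  int64_t flatbuffer_size = 123;
  int64_t prefix = 4;

  // The metaDataLength written to the stream covers the flatbuffer plus
  // padding, chosen so that prefix + metadata ends on an 8-byte boundary.
  int64_t metadata_length = PaddedLength(prefix + flatbuffer_size) - prefix;
  assert((prefix + metadata_length) % 8 == 0);  // message body starts aligned

  std::cout << "padded metadata length: " << metadata_length << std::endl;  // 124
  return 0;
}
```
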
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java index 06519bc49fd1c..1d92d2bde1c6f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java @@ -38,7 +38,7 @@ public ArrowFileWriter(VectorSchemaRoot root, DictionaryProvider provider, Writa @Override protected void startInternal(WriteChannel out) throws IOException { - ArrowMagic.writeMagic(out); + ArrowMagic.writeMagic(out, true); } @Override @@ -54,7 +54,7 @@ protected void endInternal(WriteChannel out, } out.writeIntLittleEndian(footerLength); LOGGER.debug(String.format("Footer starts at %d, length: %d", footerStart, footerLength)); - ArrowMagic.writeMagic(out); + ArrowMagic.writeMagic(out, false); LOGGER.debug(String.format("magic written, now at %d", out.getCurrentPosition())); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java index 0d2da375295fe..68313e7878b71 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java @@ -28,8 +28,11 @@ public class ArrowMagic { public static final int MAGIC_LENGTH = MAGIC.length; - public static void writeMagic(WriteChannel out) throws IOException { + public static void writeMagic(WriteChannel out, boolean align) throws IOException { out.write(MAGIC); + if (align) { + out.align(); + } } public static boolean validateMagic(byte[] array) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java index d2f3782469597..c842d4c3f9a74 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java @@ -168,6 +168,9 @@ public int computeBodyLength() { ByteBuffer nioBuffer = buffer.nioBuffer(buffer.readerIndex(), buffer.readableBytes()); size += nioBuffer.remaining(); + if (size % 8 != 0) { + size += 8 - (size % 8); + } } return size; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java index a70d029389427..f69aa41e7f6bd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java @@ -78,12 +78,25 @@ public static int bytesToInt(byte[] bytes) { * @throws IOException if something went wrong */ public static long serialize(WriteChannel out, Schema schema) throws IOException { + long start = out.getCurrentPosition(); + assert start % 8 == 0; + FlatBufferBuilder builder = new FlatBufferBuilder(); int schemaOffset = schema.getSchema(builder); ByteBuffer serializedMessage = serializeMessage(builder, MessageHeader.Schema, schemaOffset, 0); - long size = out.writeIntLittleEndian(serializedMessage.remaining()); - size += out.write(serializedMessage); - return size; + + int size = serializedMessage.remaining(); + // ensure that message aligns to 8 byte padding - 4 bytes for size, then message body + if ((size + 4) % 8 != 0) { + size += 8 - (size + 4) % 8; + } + + out.writeIntLittleEndian(size); + 
out.write(serializedMessage); + out.align(); // any bytes written are already captured by our size modification above + + assert (size + 4) % 8 == 0; + return size + 4; } /** @@ -120,6 +133,7 @@ public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch) long start = out.getCurrentPosition(); int bodyLength = batch.computeBodyLength(); + assert bodyLength % 8 == 0; FlatBufferBuilder builder = new FlatBufferBuilder(); int batchOffset = batch.writeTo(builder); @@ -141,6 +155,7 @@ public static ArrowBlock serialize(WriteChannel out, ArrowRecordBatch batch) out.align(); long bufferLength = writeBatchBuffers(out, batch); + assert bufferLength % 8 == 0; // Metadata size in the Block account for the size prefix return new ArrowBlock(start, metadataLength + 4, bufferLength); @@ -164,6 +179,7 @@ public static long writeBatchBuffers(WriteChannel out, ArrowRecordBatch batch) t " != " + startPosition + layout.getSize()); } } + out.align(); return out.getCurrentPosition() - bufferStart; } @@ -268,6 +284,7 @@ public static ArrowRecordBatch deserializeRecordBatch(RecordBatch recordBatchFB, public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch) throws IOException { long start = out.getCurrentPosition(); int bodyLength = batch.computeBodyLength(); + assert bodyLength % 8 == 0; FlatBufferBuilder builder = new FlatBufferBuilder(); int batchOffset = batch.writeTo(builder); @@ -276,10 +293,10 @@ public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch) int metadataLength = serializedMessage.remaining(); - // Add extra padding bytes so that length prefix + metadata is a multiple - // of 8 after alignment - if ((start + metadataLength + 4) % 8 != 0) { - metadataLength += 8 - (start + metadataLength + 4) % 8; + // calculate alignment bytes so that metadata length points to the correct location after alignment + int padding = (int) ((start + metadataLength + 4) % 8); + if (padding != 0) { + metadataLength += (8 - padding); } out.writeIntLittleEndian(metadataLength); @@ -290,9 +307,10 @@ public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch) // write the embedded record batch long bufferLength = writeBatchBuffers(out, batch.getDictionary()); + assert bufferLength % 8 == 0; // Metadata size in the Block account for the size prefix - return new ArrowBlock(start, metadataLength + 4, bufferLength + 8); + return new ArrowBlock(start, metadataLength + 4, bufferLength); } /** From 2972c9d3a0d371dbdcf69c68a0109b83aa6fd944 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 9 Aug 2017 17:47:30 -0400 Subject: [PATCH 38/38] ARROW-1342: [Python] Support strided ndarrays in pandas conversion from nested lists This does drop the vector append to the builder that was there before. I'm going to do some local benchmarking to make sure this doesn't degrade performance unacceptably, will report back here Author: Wes McKinney Closes #956 from wesm/ARROW-1342 and squashes the following commits: f2ebeba8 [Wes McKinney] Fix cpplint issue f403f9dd [Wes McKinney] Fix test case to be platform independent, note ARROW-1345. 
Improve quality of error message f4f44c18 [Wes McKinney] Fix test case where inferred list type is null ae5c8312 [Wes McKinney] Drop striding check b4aecd3a [Wes McKinney] Support strided ndarrays in pandas conversion from nested lists --- cpp/src/arrow/python/numpy-internal.h | 41 +++++++++++++ cpp/src/arrow/python/pandas_to_arrow.cc | 67 +++++++++------------ cpp/src/arrow/table.cc | 4 +- python/pyarrow/error.pxi | 3 +- python/pyarrow/includes/common.pxd | 1 + python/pyarrow/tests/pandas_examples.py | 10 ++- python/pyarrow/tests/test_convert_pandas.py | 9 +++ 7 files changed, 91 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index f1ef7dadde084..db34d24d99da5 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -25,6 +25,7 @@ #include "arrow/python/platform.h" #include +#include namespace arrow { namespace py { @@ -51,7 +52,12 @@ class Ndarray1DIndexer { int64_t size() const { return PyArray_SIZE(arr_); } + T* data() const { return data_; } + + bool is_strided() const { return stride_ == 1; } + T& operator[](size_type index) { return *(data_ + index * stride_); } + T& operator[](size_type index) const { return *(data_ + index * stride_); } private: PyArrayObject* arr_; @@ -59,6 +65,41 @@ class Ndarray1DIndexer { int64_t stride_; }; +static inline std::string GetNumPyTypeName(int npy_type) { +#define TYPE_CASE(TYPE, NAME) \ + case NPY_##TYPE: \ + return NAME; + + switch (npy_type) { + TYPE_CASE(BOOL, "bool") + TYPE_CASE(INT8, "int8") + TYPE_CASE(INT16, "int16") + TYPE_CASE(INT32, "int32") + TYPE_CASE(INT64, "int64") +#if (NPY_INT64 != NPY_LONGLONG) + TYPE_CASE(LONGLONG, "longlong") +#endif + TYPE_CASE(UINT8, "uint8") + TYPE_CASE(UINT16, "uint16") + TYPE_CASE(UINT32, "uint32") + TYPE_CASE(UINT64, "uint64") +#if (NPY_UINT64 != NPY_ULONGLONG) + TYPE_CASE(ULONGLONG, "ulonglong") +#endif + TYPE_CASE(FLOAT16, "float16") + TYPE_CASE(FLOAT32, "float32") + TYPE_CASE(FLOAT64, "float64") + TYPE_CASE(DATETIME, "datetime64") + TYPE_CASE(OBJECT, "object") + TYPE_CASE(VOID, "void") + default: + break; + } + +#undef TYPE_CASE + return "unrecognized type in GetNumPyTypeName"; +} + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc index 060fcb2453800..b6cc16b4179b3 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -97,8 +97,6 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { int64_t null_count = 0; Ndarray1DIndexer values(arr); - - // TODO(wesm): striding for (int i = 0; i < values.size(); ++i) { if (traits::isnull(values[i])) { ++null_count; @@ -125,22 +123,27 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap return null_count; } -template -static int64_t ValuesToValidBytes(const void* data, int64_t length, - uint8_t* valid_bytes) { +template +static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { typedef internal::npy_traits traits; typedef typename traits::value_type T; - int64_t null_count = 0; - const T* values = reinterpret_cast(data); - - // TODO(wesm): striding - for (int i = 0; i < length; ++i) { - valid_bytes[i] = !traits::isnull(values[i]); - if (traits::isnull(values[i])) null_count++; + // TODO(wesm): Vector append when not strided + Ndarray1DIndexer values(array); + if (traits::supports_nulls) { + for (int64_t i = 0; i < values.size(); ++i) { + if 
(traits::isnull(values[i])) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->Append(values[i])); + } + } + } else { + for (int64_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(builder->Append(values[i])); + } } - - return null_count; + return Status::OK(); } Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { @@ -148,14 +151,14 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { return Status::Invalid("only handle 1-dimensional arrays"); } - if (PyArray_DESCR(numpy_array)->type_num != np_type) { - return Status::Invalid("can only handle exact conversions"); + const int received_type = PyArray_DESCR(numpy_array)->type_num; + if (received_type != np_type) { + std::stringstream ss; + ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got " + << GetNumPyTypeName(received_type); + return Status::Invalid(ss.str()); } - npy_intp* astrides = PyArray_STRIDES(numpy_array); - if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) { - return Status::Invalid("No support for strided arrays in lists yet"); - } return Status::OK(); } @@ -577,7 +580,7 @@ Status PandasConverter::ConvertDecimals() { RETURN_NOT_OK(ImportModule("decimal", &decimal)); RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); - PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + Ndarray1DIndexer objects(arr_); PyObject* object = objects[0]; int precision; @@ -618,7 +621,7 @@ Status PandasConverter::ConvertTimes() { PyAcquireGIL lock; PyDateTime_IMPORT; - PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + Ndarray1DIndexer objects(arr_); // datetime.time stores microsecond resolution Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_); @@ -906,7 +909,7 @@ Status LoopPySequence(PyObject* sequence, T func) { Py_ssize_t size = PySequence_Size(sequence); if (PyArray_Check(sequence)) { auto array = reinterpret_cast(sequence); - PyObject** objects = reinterpret_cast(PyArray_DATA(array)); + Ndarray1DIndexer objects(array); for (int64_t i = 0; i < size; ++i) { RETURN_NOT_OK(func(objects[i])); } @@ -934,7 +937,6 @@ template inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { typedef internal::npy_traits traits; - typedef typename traits::value_type T; typedef typename traits::BuilderClass BuilderT; PyAcquireGIL lock; @@ -956,24 +958,13 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr // TODO(uwe): Support more complex numpy array structures RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE)); - int64_t size = PyArray_DIM(numpy_array, 0); - auto data = reinterpret_cast(PyArray_DATA(numpy_array)); - if (traits::supports_nulls) { - RETURN_NOT_OK(null_bitmap_->Resize(size, false)); - // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't - // currently support this. 
- // ValuesToBitmap(data, size, null_bitmap_->mutable_data()); - ValuesToValidBytes(data, size, null_bitmap_->mutable_data()); - return value_builder->Append(data, size, null_bitmap_->data()); - } else { - return value_builder->Append(data, size); - } + return AppendNdarrayToBuilder(numpy_array, value_builder); } else if (PyList_Check(object)) { int64_t size; std::shared_ptr inferred_type; RETURN_NOT_OK(builder->Append(true)); RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type)); - if (inferred_type->id() != type->id()) { + if (inferred_type->id() != Type::NA && inferred_type->id() != type->id()) { std::stringstream ss; ss << inferred_type->ToString() << " cannot be converted to " << type->ToString(); return Status::TypeError(ss.str()); @@ -1064,7 +1055,7 @@ inline Status PandasConverter::ConvertTypedLists( std::shared_ptr inferred_type; RETURN_NOT_OK(builder->Append(true)); RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type)); - if (inferred_type->id() != Type::STRING) { + if (inferred_type->id() != Type::NA && inferred_type->id() != Type::STRING) { std::stringstream ss; ss << inferred_type->ToString() << " cannot be converted to STRING."; return Status::TypeError(ss.str()); diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 1f0c6d785448d..ae486987601bf 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -301,8 +301,8 @@ Table::Table(const std::shared_ptr& schema, columns_.resize(columns.size()); for (size_t i = 0; i < columns.size(); ++i) { - columns_[i] = std::make_shared(schema->field(static_cast(i)), - columns[i]); + columns_[i] = + std::make_shared(schema->field(static_cast(i)), columns[i]); } } diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi index 8a3f57d209ac0..8793c4e90f6b4 100644 --- a/python/pyarrow/error.pxi +++ b/python/pyarrow/error.pxi @@ -65,7 +65,7 @@ cdef int check_status(const CStatus& status) nogil except -1: return 0 with gil: - message = frombytes(status.ToString()) + message = frombytes(status.message()) if status.IsInvalid(): raise ArrowInvalid(message) elif status.IsIOError(): @@ -85,4 +85,5 @@ cdef int check_status(const CStatus& status) nogil except -1: elif status.IsPlasmaStoreFull(): raise PlasmaStoreFull(message) else: + message = frombytes(status.ToString()) raise ArrowException(message) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 637a133afb02b..6be08b0e59256 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -42,6 +42,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStatus() c_string ToString() + c_string message() c_bool ok() c_bool IsIOError() diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py index 17ad4b22b9cb2..c145e96342668 100644 --- a/python/pyarrow/tests/pandas_examples.py +++ b/python/pyarrow/tests/pandas_examples.py @@ -98,21 +98,25 @@ def dataframe_with_lists(include_index=False): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4], None, - [0] + [0], + np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2, + dtype=np.int64)[::2] ] fields.append(pa.field('double', pa.list_(pa.float64()))) arrays['double'] = [ [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], [0., 1., 2., 3., 4.], None, - [0.] + [0.], + np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] 
* 2)[::2],
     ]
     fields.append(pa.field('str_list', pa.list_(pa.string())))
     arrays['str_list'] = [
         [u"1", u"ä"],
         None,
         [u"1"],
-        [u"1", u"2", u"3"]
+        [u"1", u"2", u"3"],
+        [],
     ]
 
     if include_index:
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 8969777b526c0..61bd072f6bae9 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -534,6 +534,15 @@ def test_column_of_lists(self):
             field = schema.field_by_name(column)
             self._check_array_roundtrip(df[column], type=field.type)
 
+    def test_column_of_lists_strided(self):
+        df, schema = dataframe_with_lists()
+        df = pd.concat([df] * 6, ignore_index=True)
+
+        arr = df['int64'].values[::3]
+        assert arr.strides[0] != 8
+
+        self._check_array_roundtrip(arr)
+
     def test_nested_lists_all_none(self):
         data = np.array([[None, None], None], dtype=object)
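
To make the strided-conversion change above concrete, here is a standalone C++ sketch of the access pattern `Ndarray1DIndexer` provides; `Strided1DView` is an invented name for illustration, and the real indexer additionally derives the element stride from `PyArray_STRIDES`. The point is that once every read goes through `data[i * stride]`, a NumPy view such as `values[::3]` from the new test converts without an intermediate copy.

```cpp
// Standalone sketch of strided 1-D element access (illustrative only).
#include <cstdint>
#include <iostream>
#include <vector>

// A non-owning view over every stride-th element of a buffer, analogous to
// Ndarray1DIndexer in cpp/src/arrow/python/numpy-internal.h.
template <typename T>
class Strided1DView {
 public:
  Strided1DView(T* data, int64_t length, int64_t stride)
      : data_(data), length_(length), stride_(stride) {}

  int64_t size() const { return length_; }
  // Element i lives stride_ elements away from element i-1; stride_ is the
  // byte stride divided by sizeof(T), as in the NumPy indexer.
  T& operator[](int64_t i) const { return *(data_ + i * stride_); }

 private:
  T* data_;
  int64_t length_;
  int64_t stride_;
};

int main() {
  // Mimics df['int64'].values[::3] from the new test: view every third
  // element of a contiguous buffer without copying it.
  std::vector<int64_t> buffer = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  Strided1DView<int64_t> view(buffer.data(), /*length=*/4, /*stride=*/3);
  for (int64_t i = 0; i < view.size(); ++i) {
    std::cout << view[i] << " ";  // prints: 0 3 6 9
  }
  std::cout << std::endl;
  return 0;
}
```
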