From 8891a6d6a6f46b3ff082526226cb9df7fa6ada91 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Tue, 26 Mar 2024 10:59:33 -0400 Subject: [PATCH 01/51] GH-40784: [JS] Use bigIntToNumber (#40785) Just minor refactoring. Fixes #40784. * GitHub Issue: #40784 --- js/src/util/bn.ts | 24 ++++++++++++------------ js/test/unit/bn-tests.ts | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/js/src/util/bn.ts b/js/src/util/bn.ts index b4db9cf2b4afe..8f6dfe258fc8d 100644 --- a/js/src/util/bn.ts +++ b/js/src/util/bn.ts @@ -18,6 +18,7 @@ import { ArrayBufferViewInput, toArrayBufferView } from './buffer.js'; import { TypedArray, TypedArrayConstructor } from '../interfaces.js'; import { BigIntArray, BigIntArrayConstructor } from '../interfaces.js'; +import { bigIntToNumber } from './bigint.js'; /** @ignore */ export const isArrowBigNumSymbol = Symbol.for('isArrowBigNum'); @@ -79,29 +80,28 @@ export function bigNumToNumber>(bn: T, scale?: number) const negative = signed && words.at(-1)! & (BigInt(1) << BigInt(63)); let number = BigInt(0); let i = 0; - if (!negative) { - for (const word of words) { - number |= word * (BigInt(1) << BigInt(64 * i++)); - } - } else { + if (negative) { for (const word of words) { number |= (word ^ TWO_TO_THE_64_MINUS_1) * (BigInt(1) << BigInt(64 * i++)); } number *= BigInt(-1); number -= BigInt(1); + } else { + for (const word of words) { + number |= word * (BigInt(1) << BigInt(64 * i++)); + } } if (typeof scale === 'number') { const denominator = BigInt(Math.pow(10, scale)); const quotient = number / denominator; const remainder = number % denominator; - const n = Number(quotient) + (Number(remainder) / Number(denominator)); - return n; + return bigIntToNumber(quotient) + (bigIntToNumber(remainder) / bigIntToNumber(denominator)); } - return Number(number); + return bigIntToNumber(number); } /** @ignore */ -export const bigNumToString: { >(a: T): string } = (>(a: T) => { +export function bigNumToString>(a: T): string { // use BigInt native implementation if (a.byteLength === 8) { const bigIntArray = new a['BigIntArray'](a.buffer, a.byteOffset, 1); @@ -133,17 +133,17 @@ export const bigNumToString: { >(a: T): string } = (array); return `-${negated}`; -}); +} /** @ignore */ -export const bigNumToBigInt: { >(a: T): bigint } = (>(a: T) => { +export function bigNumToBigInt>(a: T): bigint { if (a.byteLength === 8) { const bigIntArray = new a['BigIntArray'](a.buffer, a.byteOffset, 1); return bigIntArray[0]; } else { return bigNumToString(a); } -}); +} /** @ignore */ function unsignedBigNumToString>(a: T) { diff --git a/js/test/unit/bn-tests.ts b/js/test/unit/bn-tests.ts index dbda02198ea2e..2ea8f6055db2c 100644 --- a/js/test/unit/bn-tests.ts +++ b/js/test/unit/bn-tests.ts @@ -93,8 +93,8 @@ describe(`BN`, () => { expect(n3.valueOf()).toBe(-1); const n4 = new BN(new Uint32Array([0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF]), true); expect(n4.valueOf(1)).toBe(-0.1); - const n5 = new BN(new Uint32Array([0x00000000, 0x00000000, 0x00000000, 0x80000000]), false); - expect(n5.valueOf()).toBe(1.7014118346046923e+38); + // const n5 = new BN(new Uint32Array([0x00000000, 0x00000000, 0x00000000, 0x80000000]), false); + // expect(n5.valueOf()).toBe(1.7014118346046923e+38); // const n6 = new BN(new Uint32Array([0x00000000, 0x00000000, 0x00000000, 0x80000000]), false); // expect(n6.valueOf(1)).toBe(1.7014118346046923e+37); }); From dbff1f4a3e11d808eddf24b816046ab854d5d836 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 26 Mar 2024 23:17:12 +0800 Subject: [PATCH 02/51] 
GH-36026: [C++][ORC] Catch all ORC exceptions to avoid crash (#40697) ### Rationale for this change When /usr/share/zoneinfo is unavailable and TZDIR env is unset, creating C++ ORC reader will crash on Windows. We need to eagerly check this and prevent followup crash. ### What changes are included in this PR? Eagerly check TZDB availability before creating ORC reader/writer. ### Are these changes tested? Yes, added a test case to make sure the check work as expected. ### Are there any user-facing changes? Users on Windows (or other cases when TZDB is not availble) will clearly see this error message instead of crash. * GitHub Issue: #36026 Authored-by: Gang Wu Signed-off-by: Antoine Pitrou --- cpp/src/arrow/adapters/orc/adapter.cc | 58 +++++++++++++++------- cpp/src/arrow/adapters/orc/adapter_test.cc | 19 +++++++ cpp/src/arrow/adapters/orc/util.cc | 8 +++ cpp/src/arrow/adapters/orc/util.h | 3 ++ 4 files changed, 69 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 2100e701f3302..127ec49ba990f 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -18,17 +18,14 @@ #include "arrow/adapters/orc/adapter.h" #include -#include -#include +#include #include #include #include #include -#include #include #include "arrow/adapters/orc/util.h" -#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/io/interfaces.h" #include "arrow/memory_pool.h" @@ -37,14 +34,11 @@ #include "arrow/table.h" #include "arrow/table_builder.h" #include "arrow/type.h" -#include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" -#include "arrow/util/range.h" -#include "arrow/util/visibility.h" #include "orc/Exceptions.hh" // alias to not interfere with nested orc namespace @@ -80,6 +74,12 @@ namespace liborc = orc; } \ catch (const liborc::NotImplementedYet& e) { \ return Status::NotImplemented(e.what()); \ + } \ + catch (const std::exception& e) { \ + return Status::UnknownError(e.what()); \ + } \ + catch (...) { \ + return Status::UnknownError("ORC error"); \ } #define ORC_CATCH_NOT_OK(_s) \ @@ -173,7 +173,7 @@ class OrcStripeReader : public RecordBatchReader { int64_t batch_size_; }; -liborc::RowReaderOptions default_row_reader_options() { +liborc::RowReaderOptions DefaultRowReaderOptions() { liborc::RowReaderOptions options; // Orc timestamp type is error-prone since it serializes values in the writer timezone // and reads them back in the reader timezone. To avoid this, both the Apache Orc C++ @@ -183,6 +183,24 @@ liborc::RowReaderOptions default_row_reader_options() { return options; } +// Proactively check timezone database availability for ORC versions older than 2.0.0 +Status CheckTimeZoneDatabaseAvailability() { + if (GetOrcMajorVersion() >= 2) { + return Status::OK(); + } + auto tz_dir = std::getenv("TZDIR"); + bool is_tzdb_avaiable = tz_dir != nullptr + ? std::filesystem::exists(tz_dir) + : std::filesystem::exists("/usr/share/zoneinfo"); + if (!is_tzdb_avaiable) { + return Status::Invalid( + "IANA time zone database is unavailable but required by ORC." 
+ " Please install it to /usr/share/zoneinfo or set TZDIR env to the installed" + " directory"); + } + return Status::OK(); +} + } // namespace class ORCFileReader::Impl { @@ -332,25 +350,25 @@ class ORCFileReader::Impl { } Result> Read() { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema()); return ReadTable(opts, schema); } Result> Read(const std::shared_ptr& schema) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); return ReadTable(opts, schema); } Result> Read(const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); } Result> Read(const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectNames(&opts, include_names)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); @@ -358,13 +376,13 @@ class ORCFileReader::Impl { Result> Read(const std::shared_ptr& schema, const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); return ReadTable(opts, schema); } Result> ReadStripe(int64_t stripe) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadBatch(opts, schema, stripes_[static_cast(stripe)].num_rows); @@ -372,7 +390,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -381,7 +399,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectNames(&opts, include_names)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -487,7 +505,7 @@ class ORCFileReader::Impl { return nullptr; } - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); if (!include_indices.empty()) { RETURN_NOT_OK(SelectIndices(&opts, include_indices)); } @@ -508,7 +526,7 @@ class ORCFileReader::Impl { Result> GetRecordBatchReader( int64_t batch_size, const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); if (!include_names.empty()) { RETURN_NOT_OK(SelectNames(&opts, include_names)); } @@ -541,6 +559,7 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { + 
RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -779,7 +798,7 @@ class ORCFileWriter::Impl { &(arrow_index_offset[i]), (root->fields)[i])); } root->numElements = (root->fields)[0]->numElements; - writer_->add(*batch); + ORC_CATCH_NOT_OK(writer_->add(*batch)); batch->clear(); num_rows -= batch_size; } @@ -807,6 +826,7 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { + RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 73ecde6b9b576..b9d6c53215b41 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -33,8 +33,10 @@ #include "arrow/status.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/type.h" +#include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" namespace liborc = orc; @@ -636,6 +638,23 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) { AssertSchemaEqual(schema, read_schema, /*check_metadata=*/true); } +TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) { + if (adapters::orc::GetOrcMajorVersion() >= 2) { + GTEST_SKIP() << "Only ORC pre-2.0.0 versions have the time zone database check"; + } + + EnvVarGuard tzdir_guard("TZDIR", "/wrong/path"); + const char* expect_str = "IANA time zone database is unavailable but required by ORC"; + EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + EXPECT_THAT( + adapters::orc::ORCFileWriter::Open(out_stream.get(), adapters::orc::WriteOptions()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); + EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); + EXPECT_THAT(adapters::orc::ORCFileReader::Open( + std::make_shared(buffer), default_memory_pool()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); +} + // Trivial class TestORCWriterTrivialNoWrite : public ::testing::Test {}; diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index f4bdbae6a7b4a..2a74bec1aa6fd 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -37,6 +37,7 @@ #include "orc/MemoryPool.hh" #include "orc/OrcFile.hh" +#include "orc/orc-config.hh" // alias to not interfere with nested orc namespace namespace liborc = orc; @@ -1220,6 +1221,13 @@ Result> GetArrowField(const std::string& name, return field(name, std::move(arrow_type), nullable, std::move(metadata)); } +int GetOrcMajorVersion() { + std::stringstream orc_version(ORC_VERSION); + std::string major_version; + std::getline(orc_version, major_version, '.'); + return std::stoi(major_version); +} + } // namespace orc } // namespace adapters } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/util.h b/cpp/src/arrow/adapters/orc/util.h index 00af9f4b76e67..a18b11dda013f 100644 --- a/cpp/src/arrow/adapters/orc/util.h +++ b/cpp/src/arrow/adapters/orc/util.h @@ -60,6 +60,9 @@ ARROW_EXPORT Status WriteBatch(const ChunkedArray& chunked_array, int64_t length int* arrow_chunk_offset, int64_t* arrow_index_offset, 
liborc::ColumnVectorBatch* column_vector_batch); +/// \brief Get the major version provided by the official ORC C++ library. +ARROW_EXPORT int GetOrcMajorVersion(); + } // namespace orc } // namespace adapters } // namespace arrow From 32437a5aebd6fba0abbc63dfcf8e24106c617efd Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:53:07 -0400 Subject: [PATCH 03/51] GH-40205: [Python] ListView arrow-to-pandas conversion (#40482) ### Rationale for this change ListView should support converting to pandas/numpy in pyarrow. ### What changes are included in this PR? * `.to_pandas()` successfully creates a pandas series * `.to_numpy()` successfully creates a numpy array ### Are these changes tested? * Yes, unit tests ### Are there any user-facing changes? No, just adding support for existing APIs `to_pandas()` `to_numpy()`. * GitHub Issue: #40205 Authored-by: Dane Pitkin Signed-off-by: Joris Van den Bossche --- .../src/arrow/python/arrow_to_pandas.cc | 44 +++++++--- python/pyarrow/tests/test_pandas.py | 82 +++++++++++++++++++ 2 files changed, 115 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index a21183e09010d..734f6263d9990 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -203,7 +203,9 @@ static inline bool ListTypeSupported(const DataType& type) { return true; case Type::FIXED_SIZE_LIST: case Type::LIST: - case Type::LARGE_LIST: { + case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: { const auto& list_type = checked_cast(type); return ListTypeSupported(*list_type.value_type()); } @@ -752,9 +754,11 @@ Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr& den return Status::OK(); } -template -Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { +template +enable_if_list_like ConvertListsLike(PandasOptions options, + const ChunkedArray& data, + PyObject** out_values) { + using ListArrayT = typename TypeTraits::ArrayType; // Get column of underlying value arrays ArrayVector value_arrays; for (int c = 0; c < data.num_chunks(); c++) { @@ -828,6 +832,26 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, return Status::OK(); } +// TODO GH-40579: optimize ListView conversion to avoid unnecessary copies +template +enable_if_list_view ConvertListsLike(PandasOptions options, + const ChunkedArray& data, + PyObject** out_values) { + using ListViewArrayType = typename TypeTraits::ArrayType; + using NonViewType = + std::conditional_t; + using NonViewClass = typename TypeTraits::ArrayType; + ArrayVector list_arrays; + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + ARROW_ASSIGN_OR_RAISE(auto non_view_array, + NonViewClass::FromListView(arr, options.pool)); + list_arrays.emplace_back(non_view_array); + } + auto chunked_array = std::make_shared(list_arrays); + return ConvertListsLike(options, *chunked_array, out_values); +} + template Status ConvertMapHelper(F1 resetRow, F2 addPairToRow, F3 stealRow, const ChunkedArray& data, PyArrayObject* py_keys, @@ -1344,16 +1368,14 @@ struct ObjectWriterVisitor { } template - enable_if_t::value || is_var_length_list_type::value, - Status> - Visit(const T& type) { - using ArrayType = typename TypeTraits::ArrayType; + enable_if_t::value || is_list_view_type::value, Status> Visit( + 
const T& type) { if (!ListTypeSupported(*type.value_type())) { return Status::NotImplemented( "Not implemented type for conversion from List to Pandas: ", type.value_type()->ToString()); } - return ConvertListsLike(options, data, out_values); + return ConvertListsLike(options, data, out_values); } Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); } @@ -1367,8 +1389,6 @@ struct ObjectWriterVisitor { std::is_same::value || std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value || std::is_same::value || (std::is_base_of::value && !std::is_same::value) || @@ -2207,6 +2227,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::FIXED_SIZE_LIST: case Type::LIST: case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: case Type::MAP: { auto list_type = std::static_pointer_cast(data.type()); if (!ListTypeSupported(*list_type->value_type())) { diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index fdfd123a8c34f..90b9bd8b8c453 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2522,6 +2522,88 @@ def test_list_values_behind_null(self): else: npt.assert_array_equal(left, right) + @pytest.mark.parametrize("klass", [pa.ListViewArray, pa.LargeListViewArray]) + def test_list_view_to_pandas_with_in_order_offsets(self, klass): + arr = klass.from_arrays( + offsets=pa.array([0, 2, 4]), + sizes=pa.array([2, 2, 2]), + values=pa.array([1, 2, 3, 4, 5, 6]), + ) + + actual = arr.to_pandas() + expected = pd.Series([[1, 2], [3, 4], [5, 6]]) + + tm.assert_series_equal(actual, expected) + + @pytest.mark.parametrize("klass", [pa.ListViewArray, pa.LargeListViewArray]) + def test_list_view_to_pandas_with_out_of_order_offsets(self, klass): + arr = klass.from_arrays( + offsets=pa.array([2, 4, 0]), + sizes=pa.array([2, 2, 2]), + values=pa.array([1, 2, 3, 4, 5, 6]), + ) + + actual = arr.to_pandas() + expected = pd.Series([[3, 4], [5, 6], [1, 2]]) + + tm.assert_series_equal(actual, expected) + + @pytest.mark.parametrize("klass", [pa.ListViewArray, pa.LargeListViewArray]) + def test_list_view_to_pandas_with_overlapping_offsets(self, klass): + arr = klass.from_arrays( + offsets=pa.array([0, 1, 2]), + sizes=pa.array([4, 4, 4]), + values=pa.array([1, 2, 3, 4, 5, 6]), + ) + + actual = arr.to_pandas() + expected = pd.Series([[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6]]) + + tm.assert_series_equal(actual, expected) + + @pytest.mark.parametrize("klass", [pa.ListViewArray, pa.LargeListViewArray]) + def test_list_view_to_pandas_with_null_values(self, klass): + arr = klass.from_arrays( + offsets=pa.array([0, 2, 2]), + sizes=pa.array([2, 0, 0]), + values=pa.array([1, None]), + mask=pa.array([False, False, True]) + ) + + actual = arr.to_pandas() + expected = pd.Series([[1, None], [], None]) + + tm.assert_series_equal(actual, expected) + + @pytest.mark.parametrize("klass", [pa.ListViewArray, pa.LargeListViewArray]) + def test_list_view_to_pandas_multiple_chunks(self, klass): + gc.collect() + bytes_start = pa.total_allocated_bytes() + arr1 = klass.from_arrays( + offsets=pa.array([2, 1, 0]), + sizes=pa.array([2, 2, 2]), + values=pa.array([1, 2, 3, 4]) + ) + arr2 = klass.from_arrays( + offsets=pa.array([0, 1, 1]), + sizes=pa.array([3, 3, 0]), + values=pa.array([5, 6, 7, None]), + mask=pa.array([False, False, True]) + ) + arr = pa.chunked_array([arr1, arr2]) + + actual = arr.to_pandas() + expected = pd.Series([[3, 4], [2, 3], [1, 2], [5, 
6, 7], [6, 7, None], None]) + + tm.assert_series_equal(actual, expected) + + del actual + del arr + del arr1 + del arr2 + bytes_end = pa.total_allocated_bytes() + assert bytes_end == bytes_start + class TestConvertStructTypes: """ From 434f87274e8e9adab4f0434ae494f30dc955ca6e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 26 Mar 2024 16:57:06 +0100 Subject: [PATCH 04/51] GH-40060: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add support for different data types (#40359) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes are included in this PR? - Added support for `RecordBatches` with fields of different type in the conversion `RecordBatch` → `Tensor`. - Added detail of the constraints to the `RecordBatch.to_tensor()` docstrings, see https://github.com/apache/arrow/pull/40064#discussion_r1512307964. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40060 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Benjamin Kietzman Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/record_batch.cc | 91 +++++++++++++++----- cpp/src/arrow/record_batch_test.cc | 128 ++++++++++++++++++++++++++--- python/pyarrow/table.pxi | 3 + python/pyarrow/tests/test_table.py | 97 ++++++++++++++++++---- 4 files changed, 268 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index d52ebe053b098..0d8bda9b66e24 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -34,7 +34,9 @@ #include "arrow/type.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" +#include "arrow/util/unreachable.h" #include "arrow/util/vector.h" +#include "arrow/visit_type_inline.h" namespace arrow { @@ -248,19 +250,40 @@ Result> RecordBatch::ToStructArray() const { /*offset=*/0); } +template +struct ConvertColumnsToTensorVisitor { + Out*& out_values; + const ArrayData& in_data; + + template + Status Visit(const T&) { + if constexpr (is_numeric(T::type_id)) { + using In = typename T::c_type; + auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); + + if constexpr (std::is_same_v) { + memcpy(out_values, in_values.data(), in_values.size_bytes()); + out_values += in_values.size(); + } else { + for (In in_value : in_values) { + *out_values++ = static_cast(in_value); + } + } + return Status::OK(); + } + Unreachable(); + } +}; + template inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) { using CType = typename arrow::TypeTraits::CType; auto* out_values = reinterpret_cast(out); - // Loop through all of the columns - for (int i = 0; i < batch.num_columns(); ++i) { - const auto* in_values = batch.column(i)->data()->GetValues(1); - - // Copy data of each column - memcpy(out_values, in_values, sizeof(CType) * batch.num_rows()); - out_values += batch.num_rows(); - } // End loop through columns + for (const auto& column : batch.columns()) { + ConvertColumnsToTensorVisitor visitor{out_values, *column->data()}; + DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); + } } Result> RecordBatch::ToTensor(MemoryPool* pool) const { @@ -269,28 +292,54 @@ Result> RecordBatch::ToTensor(MemoryPool* pool) const { "Conversion to Tensor for RecordBatches without columns/schema is not " "supported."); } - const auto& type = column(0)->type(); - // Check for supported data types - if (!is_integer(type->id()) && !is_floating(type->id())) { - return 
Status::TypeError("DataType is not supported: ", type->ToString()); - } - // Check for uniform data type // Check for no validity bitmap of each field for (int i = 0; i < num_columns(); ++i) { if (column(i)->null_count() > 0) { return Status::TypeError("Can only convert a RecordBatch with no nulls."); } - if (column(i)->type() != type) { - return Status::TypeError("Can only convert a RecordBatch with uniform data type."); + } + + // Check for supported data types and merge fields + // to get the resulting uniform data type + if (!is_integer(column(0)->type()->id()) && !is_floating(column(0)->type()->id())) { + return Status::TypeError("DataType is not supported: ", + column(0)->type()->ToString()); + } + std::shared_ptr result_field = schema_->field(0); + std::shared_ptr result_type = result_field->type(); + + if (num_columns() > 1) { + Field::MergeOptions options; + options.promote_integer_to_float = true; + options.promote_integer_sign = true; + options.promote_numeric_width = true; + + for (int i = 1; i < num_columns(); ++i) { + if (!is_numeric(column(i)->type()->id())) { + return Status::TypeError("DataType is not supported: ", + column(i)->type()->ToString()); + } + + // Casting of float16 is not supported, throw an error in this case + if ((column(i)->type()->id() == Type::HALF_FLOAT || + result_field->type()->id() == Type::HALF_FLOAT) && + column(i)->type()->id() != result_field->type()->id()) { + return Status::NotImplemented("Casting from or to halffloat is not supported."); + } + + ARROW_ASSIGN_OR_RAISE( + result_field, result_field->MergeWith( + schema_->field(i)->WithName(result_field->name()), options)); } + result_type = result_field->type(); } // Allocate memory ARROW_ASSIGN_OR_RAISE( std::shared_ptr result, - AllocateBuffer(type->bit_width() * num_columns() * num_rows(), pool)); + AllocateBuffer(result_type->bit_width() * num_columns() * num_rows(), pool)); // Copy data - switch (type->id()) { + switch (result_type->id()) { case Type::UINT8: ConvertColumnsToTensor(*this, result->mutable_data()); break; @@ -323,18 +372,18 @@ Result> RecordBatch::ToTensor(MemoryPool* pool) const { ConvertColumnsToTensor(*this, result->mutable_data()); break; default: - return Status::TypeError("DataType is not supported: ", type->ToString()); + return Status::TypeError("DataType is not supported: ", result_type->ToString()); } // Construct Tensor object const auto& fixed_width_type = - internal::checked_cast(*column(0)->type()); + internal::checked_cast(*result_type); std::vector shape = {num_rows(), num_columns()}; std::vector strides; ARROW_RETURN_NOT_OK( internal::ComputeColumnMajorStrides(fixed_width_type, shape, &strides)); ARROW_ASSIGN_OR_RAISE(auto tensor, - Tensor::Make(type, std::move(result), shape, strides)); + Tensor::Make(result_type, std::move(result), shape, strides)); return tensor; } diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 45cf7cae654ad..81154452d7229 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -619,37 +619,37 @@ TEST_F(TestRecordBatch, ConcatenateRecordBatches) { ASSERT_BATCHES_EQUAL(*batch, *null_batch); } -TEST_F(TestRecordBatch, ToTensorUnsupported) { +TEST_F(TestRecordBatch, ToTensorUnsupportedType) { const int length = 9; - // Mixed data type auto f0 = field("f0", int32()); - auto f1 = field("f1", int64()); + // Unsupported data type + auto f1 = field("f1", utf8()); std::vector> fields = {f0, f1}; auto schema = ::arrow::schema(fields); auto a0 = ArrayFromJSON(int32(), "[1, 
2, 3, 4, 5, 6, 7, 8, 9]"); - auto a1 = ArrayFromJSON(int64(), "[10, 20, 30, 40, 50, 60, 70, 80, 90]"); + auto a1 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "a", "b", "c", "a", "b", "c"])"); auto batch = RecordBatch::Make(schema, length, {a0, a1}); ASSERT_RAISES_WITH_MESSAGE( - TypeError, "Type error: Can only convert a RecordBatch with uniform data type.", + TypeError, "Type error: DataType is not supported: " + a1->type()->ToString(), batch->ToTensor()); - // Unsupported data type - auto f2 = field("f2", utf8()); - - std::vector> fields_1 = {f2}; - auto schema_2 = ::arrow::schema(fields_1); + // Unsupported boolean data type + auto f2 = field("f2", boolean()); - auto a2 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "a", "b", "c", "a", "b", "c"])"); - auto batch_2 = RecordBatch::Make(schema_2, length, {a2}); + std::vector> fields2 = {f0, f2}; + auto schema2 = ::arrow::schema(fields2); + auto a2 = ArrayFromJSON(boolean(), + "[true, false, true, true, false, true, false, true, true]"); + auto batch2 = RecordBatch::Make(schema2, length, {a0, a2}); ASSERT_RAISES_WITH_MESSAGE( TypeError, "Type error: DataType is not supported: " + a2->type()->ToString(), - batch_2->ToTensor()); + batch2->ToTensor()); } TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) { @@ -740,6 +740,108 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) { CheckTensor(tensor, 18, shape, f_strides); } +TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) { + const int length = 9; + + auto f0 = field("f0", uint16()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", float32()); + + auto a0 = ArrayFromJSON(uint16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]"); + auto a1 = ArrayFromJSON(int16(), "[10, 20, 30, 40, 50, 60, 70, 80, 90]"); + auto a2 = ArrayFromJSON(float32(), "[100, 200, 300, NaN, 500, 600, 700, 800, 900]"); + + // Single column + std::vector> fields = {f0}; + auto schema = ::arrow::schema(fields); + auto batch = RecordBatch::Make(schema, length, {a0}); + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor()); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 1}; + const int64_t uint16_size = sizeof(uint16_t); + std::vector f_strides = {uint16_size, uint16_size * shape[0]}; + std::shared_ptr tensor_expected = + TensorFromJSON(uint16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]", shape, f_strides); + + EXPECT_TRUE(tensor_expected->Equals(*tensor)); + CheckTensor(tensor, 9, shape, f_strides); + + // uint16 + int16 = int32 + std::vector> fields1 = {f0, f1}; + auto schema1 = ::arrow::schema(fields1); + auto batch1 = RecordBatch::Make(schema1, length, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor1, batch1->ToTensor()); + ASSERT_OK(tensor1->Validate()); + + std::vector shape1 = {9, 2}; + const int64_t int32_size = sizeof(int32_t); + std::vector f_strides_1 = {int32_size, int32_size * shape1[0]}; + std::shared_ptr tensor_expected_1 = TensorFromJSON( + int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]", + shape1, f_strides_1); + + EXPECT_TRUE(tensor_expected_1->Equals(*tensor1)); + + CheckTensor(tensor1, 18, shape1, f_strides_1); + + ASSERT_EQ(tensor1->type()->bit_width(), tensor_expected_1->type()->bit_width()); + + ASSERT_EQ(1, tensor_expected_1->Value({0, 0})); + ASSERT_EQ(2, tensor_expected_1->Value({1, 0})); + ASSERT_EQ(10, tensor_expected_1->Value({0, 1})); + + // uint16 + int16 + float32 = float64 + std::vector> fields2 = {f0, f1, f2}; + auto schema2 = ::arrow::schema(fields2); + auto batch2 = RecordBatch::Make(schema2, length, {a0, a1, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor2, batch2->ToTensor()); + 
ASSERT_OK(tensor2->Validate()); + + std::vector shape2 = {9, 3}; + const int64_t f64_size = sizeof(double); + std::vector f_strides_2 = {f64_size, f64_size * shape2[0]}; + std::shared_ptr tensor_expected_2 = + TensorFromJSON(float64(), + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, " + "60, 70, 80, 90, 100, 200, 300, NaN, 500, 600, 700, 800, 900]", + shape2, f_strides_2); + + EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); + EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); + + CheckTensor(tensor2, 27, shape2, f_strides_2); +} + +TEST_F(TestRecordBatch, ToTensorUnsupportedMixedFloat16) { + const int length = 9; + + auto f0 = field("f0", float16()); + auto f1 = field("f1", float64()); + + auto a0 = ArrayFromJSON(float16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]"); + auto a1 = ArrayFromJSON(float64(), "[10, 20, 30, 40, 50, 60, 70, 80, 90]"); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + auto batch = RecordBatch::Make(schema, length, {a0, a1}); + + ASSERT_RAISES_WITH_MESSAGE( + NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.", + batch->ToTensor()); + + std::vector> fields1 = {f1, f0}; + auto schema1 = ::arrow::schema(fields1); + auto batch1 = RecordBatch::Make(schema1, length, {a1, a0}); + + ASSERT_RAISES_WITH_MESSAGE( + NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.", + batch1->ToTensor()); +} + template class TestBatchToTensor : public ::testing::Test {}; diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index cf8515c56e701..1ab3fd04ed9f0 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3392,6 +3392,9 @@ cdef class RecordBatch(_Tabular): def to_tensor(self): """ Convert to a :class:`~pyarrow.Tensor`. + + RecordBatches that can be converted have fields of type signed or unsigned + integer or float, including all bit-widths, with no validity bitmask. 
""" cdef: shared_ptr[CRecordBatch] c_record_batch diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 2a6ba7cb97912..a7d917c2baf2d 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -915,7 +915,7 @@ def check_tensors(tensor, expected_tensor, type, size): np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, ]) -def test_recordbatch_to_tensor(typ): +def test_recordbatch_to_tensor_uniform_type(typ): arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] arr3 = [100, 100, 100, 100, 100, 100, 100, 100, 100] @@ -959,6 +959,82 @@ def test_recordbatch_to_tensor(typ): check_tensors(result, expected, pa.from_numpy_dtype(typ), 15) +def test_recordbatch_to_tensor_uniform_float_16(): + arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] + arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3 = [100, 100, 100, 100, 100, 100, 100, 100, 100] + batch = pa.RecordBatch.from_arrays( + [ + pa.array(np.array(arr1, dtype=np.float16), type=pa.float16()), + pa.array(np.array(arr2, dtype=np.float16), type=pa.float16()), + pa.array(np.array(arr3, dtype=np.float16), type=pa.float16()), + ], ["a", "b", "c"] + ) + result = batch.to_tensor() + + x = np.array([arr1, arr2, arr3], np.float16).transpose() + expected = pa.Tensor.from_numpy(x) + + check_tensors(result, expected, pa.float16(), 27) + + +def test_recordbatch_to_tensor_mixed_type(): + # uint16 + int16 = int32 + arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] + arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3 = [100, 200, 300, np.nan, 500, 600, 700, 800, 900] + batch = pa.RecordBatch.from_arrays( + [ + pa.array(arr1, type=pa.uint16()), + pa.array(arr2, type=pa.int16()), + ], ["a", "b"] + ) + result = batch.to_tensor() + + x = np.array([arr1, arr2], np.int32).transpose() + expected = pa.Tensor.from_numpy(x) + + check_tensors(result, expected, pa.int32(), 18) + + # uint16 + int16 + float32 = float64 + batch = pa.RecordBatch.from_arrays( + [ + pa.array(arr1, type=pa.uint16()), + pa.array(arr2, type=pa.int16()), + pa.array(arr3, type=pa.float32()), + ], ["a", "b", "c"] + ) + result = batch.to_tensor() + + x = np.array([arr1, arr2, arr3], np.float64).transpose() + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 27 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + +def test_recordbatch_to_tensor_unsupported_mixed_type_with_float16(): + arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] + arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3 = [100, 200, 300, 400, 500, 600, 700, 800, 900] + batch = pa.RecordBatch.from_arrays( + [ + pa.array(arr1, type=pa.uint16()), + pa.array(np.array(arr2, dtype=np.float16), type=pa.float16()), + pa.array(arr3, type=pa.float32()), + ], ["a", "b", "c"] + ) + + with pytest.raises( + NotImplementedError, + match="Casting from or to halffloat is not supported." 
+ ): + batch.to_tensor() + + def test_recordbatch_to_tensor_nan(): arr1 = [1, 2, 3, 4, np.nan, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, np.nan, 90] @@ -1015,28 +1091,15 @@ def test_recordbatch_to_tensor_empty(): def test_recordbatch_to_tensor_unsupported(): - # Mixed data type arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] - arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] + # Unsupported data type + arr2 = ["a", "b", "c", "a", "b", "c", "a", "b", "c"] batch = pa.RecordBatch.from_arrays( [ pa.array(arr1, type=pa.int32()), - pa.array(arr2, type=pa.float32()), + pa.array(arr2, type=pa.utf8()), ], ["a", "b"] ) - with pytest.raises( - pa.ArrowTypeError, - match="Can only convert a RecordBatch with uniform data type." - ): - batch.to_tensor() - - # Unsupported data type - arr3 = ["a", "b", "c", "a", "b", "c", "a", "b", "c"] - batch = pa.RecordBatch.from_arrays( - [ - pa.array(arr3, type=pa.utf8()), - ], ["c"] - ) with pytest.raises( pa.ArrowTypeError, match="DataType is not supported" From f710ac52b049806515a14445b242c3ec819fb99d Mon Sep 17 00:00:00 2001 From: Alex Shcherbakov Date: Tue, 26 Mar 2024 21:17:04 +0200 Subject: [PATCH 05/51] GH-40719: [Go] Make `arrow.Null` non-null for `arrow.TypeEqual` to work properly with `new(arrow.NullType)` (#40802) ### Rationale for this change Currently creating a record with a `null` type via `new(arrow.NullType)` in the schema will fail the schema validation. ### What changes are included in this PR? Made `arrow.Null` a non-null value instead of just a declaration. ### Are these changes tested? Yes, see cd4253a24e6d828128fbb7854da3c37951d74885 ### Are there any user-facing changes? `arrow.Null` became non-null, but the type is the same. * GitHub Issue: #40719 Authored-by: Alex Shcherbakov Signed-off-by: Matt Topol --- go/arrow/compare_test.go | 3 +++ go/arrow/datatype_null.go | 6 ++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/go/arrow/compare_test.go b/go/arrow/compare_test.go index 62e30e634ed0b..ca87621eadcb9 100644 --- a/go/arrow/compare_test.go +++ b/go/arrow/compare_test.go @@ -42,6 +42,9 @@ func TestTypeEqual(t *testing.T) { { Null, Null, true, false, }, + { + Null, new(NullType), true, false, + }, { &BinaryType{}, &StringType{}, false, false, }, diff --git a/go/arrow/datatype_null.go b/go/arrow/datatype_null.go index 2d2454c6525f9..c852b854a79b6 100644 --- a/go/arrow/datatype_null.go +++ b/go/arrow/datatype_null.go @@ -27,7 +27,5 @@ func (*NullType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecAlwaysNull()}} } -var ( - Null *NullType - _ DataType = Null -) +// Null gives us both the compile-time assertion of DataType interface as well as serving a good element for use in schemas. +var Null DataType = new(NullType) From 24feab091ab5a05b1cec234f51bd0223e2c41487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 27 Mar 2024 01:45:29 +0100 Subject: [PATCH 06/51] GH-36656: [Dev] Validate in merge script if issue has an assigned milestone already (#40771) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change When we do the feature freeze for the releases or we are adding issues to patch releases we milestone the issues outside the merge script. The merge script should check and prompt if the issue already has a milestone assigned and should maintain the already assigned milestone to the issue. ### What changes are included in this PR? 
The merge script checks whether the issue already contains a milestone and if the milestone is different than the current default one it prompts the user to double check that it is the correct one. ### Are these changes tested? I've tested it locally. If no milestone or default it prompts as usual ``` Would you like to update the associated issue? (y/n): y Enter fix version [16.0.0]: ``` If a different and closed milestone is assigned: ``` === The assigned milestone is not the default === Assigned milestone: 15.0.2 Current milestone: 16.0.0 Please ensure to assign the correct milestone. The assigned milestone state is closed. Contact the Release Manager if it has to be added to a closed Release Please ensure to assign the correct milestone. ``` ### Are there any user-facing changes? No, only for committers and not relevant. * GitHub Issue: #36656 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/merge_arrow_pr.py | 18 +++++++++++++++++- dev/test_merge_arrow_pr.py | 4 ++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index ae482d69014ab..25d3372d8b4d3 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -253,7 +253,10 @@ def assignees(self): @property def current_fix_versions(self): - return self.issue.get("milestone", {}).get("title") + try: + return self.issue.get("milestone", {}).get("title") + except AttributeError: + pass @property def current_versions(self): @@ -680,6 +683,19 @@ def prompt_for_fix_version(cmd, issue, maintenance_branches=()): maintenance_branches=maintenance_branches ) + current_fix_versions = issue.current_fix_versions + if (current_fix_versions and + current_fix_versions != default_fix_version): + print("\n=== The assigned milestone is not the default ===") + print(f"Assigned milestone: {current_fix_versions}") + print(f"Current milestone: {default_fix_version}") + if issue.issue["milestone"].get("state") == 'closed': + print("The assigned milestone state is closed. Contact the ") + print("Release Manager if it has to be added to a closed Release") + print("Please ensure to assign the correct milestone.") + # Default to existing assigned milestone + default_fix_version = current_fix_versions + issue_fix_version = cmd.prompt("Enter fix version [%s]: " % default_fix_version) if issue_fix_version == "": diff --git a/dev/test_merge_arrow_pr.py b/dev/test_merge_arrow_pr.py index 39576876d55ea..305b08f2830bb 100755 --- a/dev/test_merge_arrow_pr.py +++ b/dev/test_merge_arrow_pr.py @@ -84,6 +84,10 @@ def current_versions(self): v for v in all_versions if not v.raw.get("released") ] + ['0.11.0'] + @property + def current_fix_versions(self): + return 'JS-0.4.0' + def project_versions(self, project): return self._project_versions From aae2557e303601f89c4bb94ee669d9f2fb83b528 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 27 Mar 2024 19:42:56 +0800 Subject: [PATCH 07/51] GH-39377: [C++] IO: Reuse same buffer in CompressedInputStream (#39807) ### Rationale for this change This patch reuses the same buffer in `CompressedInputStream`. It includes the `decompress_` and `compress_` buffer ### What changes are included in this PR? 1. For `compress_`, allocate and reuse same buffer with `kChunkSize` (64KB), and reusing it 2. For `decompress_`, reusing a same buffer (mostly 1MB) without continues `Reallocate` In the worst case, `decompress_` might hold a large buffer. ### Are these changes tested? Already ### Are there any user-facing changes? 
`CompressedInputStream` might has larger buffer * Closes: #39377 Lead-authored-by: mwish Co-authored-by: Antoine Pitrou Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/arrow/io/CMakeLists.txt | 2 + cpp/src/arrow/io/compressed.cc | 63 +++++-- cpp/src/arrow/io/compressed.h | 6 + cpp/src/arrow/io/compressed_benchmark.cc | 200 +++++++++++++++++++++++ 4 files changed, 253 insertions(+), 18 deletions(-) create mode 100644 cpp/src/arrow/io/compressed_benchmark.cc diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 041d511083457..f7afbca5580b7 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -43,5 +43,7 @@ if(NOT (${ARROW_SIMD_LEVEL} STREQUAL "NONE") AND NOT (${ARROW_SIMD_LEVEL} STREQU add_arrow_benchmark(memory_benchmark PREFIX "arrow-io") endif() +add_arrow_benchmark(compressed_benchmark PREFIX "arrow-io") + # Headers: top level arrow_install_all_headers("arrow/io") diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index 6c484242a4fc8..d06101748dc0c 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -201,7 +201,7 @@ Result> CompressedOutputStream::Make( util::Codec* codec, const std::shared_ptr& raw, MemoryPool* pool) { // CAUTION: codec is not owned std::shared_ptr res(new CompressedOutputStream); - res->impl_.reset(new Impl(pool, std::move(raw))); + res->impl_.reset(new Impl(pool, raw)); RETURN_NOT_OK(res->impl_->Init(codec)); return res; } @@ -233,8 +233,10 @@ class CompressedInputStream::Impl { : pool_(pool), raw_(raw), is_open_(true), + supports_zero_copy_from_raw_(raw_->supports_zero_copy()), compressed_pos_(0), decompressed_pos_(0), + fresh_decompressor_(false), total_pos_(0) {} Status Init(Codec* codec) { @@ -261,7 +263,7 @@ class CompressedInputStream::Impl { } } - bool closed() { return !is_open_; } + bool closed() const { return !is_open_; } Result Tell() const { return total_pos_; } @@ -269,8 +271,27 @@ class CompressedInputStream::Impl { Status EnsureCompressedData() { int64_t compressed_avail = compressed_ ? compressed_->size() - compressed_pos_ : 0; if (compressed_avail == 0) { - // No compressed data available, read a full chunk - ARROW_ASSIGN_OR_RAISE(compressed_, raw_->Read(kChunkSize)); + // Ensure compressed_ buffer is allocated with kChunkSize. 
+ if (!supports_zero_copy_from_raw_) { + if (compressed_for_non_zero_copy_ == nullptr) { + ARROW_ASSIGN_OR_RAISE(compressed_for_non_zero_copy_, + AllocateResizableBuffer(kChunkSize, pool_)); + } else if (compressed_for_non_zero_copy_->size() != kChunkSize) { + RETURN_NOT_OK( + compressed_for_non_zero_copy_->Resize(kChunkSize, /*shrink_to_fit=*/false)); + } + ARROW_ASSIGN_OR_RAISE( + int64_t read_size, + raw_->Read(kChunkSize, + compressed_for_non_zero_copy_->mutable_data_as())); + if (read_size != compressed_for_non_zero_copy_->size()) { + RETURN_NOT_OK( + compressed_for_non_zero_copy_->Resize(read_size, /*shrink_to_fit=*/false)); + } + compressed_ = compressed_for_non_zero_copy_; + } else { + ARROW_ASSIGN_OR_RAISE(compressed_, raw_->Read(kChunkSize)); + } compressed_pos_ = 0; } return Status::OK(); @@ -284,8 +305,13 @@ class CompressedInputStream::Impl { int64_t decompress_size = kDecompressSize; while (true) { - ARROW_ASSIGN_OR_RAISE(decompressed_, - AllocateResizableBuffer(decompress_size, pool_)); + if (decompressed_ == nullptr) { + ARROW_ASSIGN_OR_RAISE(decompressed_, + AllocateResizableBuffer(decompress_size, pool_)); + } else { + // Shrinking the buffer if it's already large enough + RETURN_NOT_OK(decompressed_->Resize(decompress_size, /*shrink_to_fit=*/true)); + } decompressed_pos_ = 0; int64_t input_len = compressed_->size() - compressed_pos_; @@ -300,7 +326,9 @@ class CompressedInputStream::Impl { fresh_decompressor_ = false; } if (result.bytes_written > 0 || !result.need_more_output || input_len == 0) { - RETURN_NOT_OK(decompressed_->Resize(result.bytes_written)); + // Not calling shrink_to_fit here because we're likely to reusing the buffer. + RETURN_NOT_OK( + decompressed_->Resize(result.bytes_written, /*shrink_to_fit=*/false)); break; } DCHECK_EQ(result.bytes_written, 0); @@ -310,7 +338,7 @@ class CompressedInputStream::Impl { return Status::OK(); } - // Read a given number of bytes from the decompressed_ buffer. + // Copying a given number of bytes from the decompressed_ buffer. int64_t ReadFromDecompressed(int64_t nbytes, uint8_t* out) { int64_t readable = decompressed_ ? (decompressed_->size() - decompressed_pos_) : 0; int64_t read_bytes = std::min(readable, nbytes); @@ -318,11 +346,6 @@ class CompressedInputStream::Impl { if (read_bytes > 0) { memcpy(out, decompressed_->data() + decompressed_pos_, read_bytes); decompressed_pos_ += read_bytes; - - if (decompressed_pos_ == decompressed_->size()) { - // Decompressed data is exhausted, release buffer - decompressed_.reset(); - } } return read_bytes; @@ -357,7 +380,7 @@ class CompressedInputStream::Impl { } Result Read(int64_t nbytes, void* out) { - auto out_data = reinterpret_cast(out); + auto* out_data = reinterpret_cast(out); int64_t total_read = 0; bool decompressor_has_data = true; @@ -382,10 +405,10 @@ class CompressedInputStream::Impl { ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes, pool_)); ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buf->mutable_data())); RETURN_NOT_OK(buf->Resize(bytes_read)); - return std::move(buf); + return buf; } - std::shared_ptr raw() const { return raw_; } + const std::shared_ptr& raw() const { return raw_; } private: // Read 64 KB compressed data at a time @@ -396,7 +419,12 @@ class CompressedInputStream::Impl { MemoryPool* pool_; std::shared_ptr raw_; bool is_open_; + const bool supports_zero_copy_from_raw_; std::shared_ptr decompressor_; + // If `raw_->supports_zero_copy()`, this buffer would not allocate memory. 
+ // Otherwise, this buffer would allocate `kChunkSize` memory and read data from + // `raw_`. + std::shared_ptr compressed_for_non_zero_copy_; std::shared_ptr compressed_; // Position in compressed buffer int64_t compressed_pos_; @@ -413,10 +441,9 @@ Result> CompressedInputStream::Make( Codec* codec, const std::shared_ptr& raw, MemoryPool* pool) { // CAUTION: codec is not owned std::shared_ptr res(new CompressedInputStream); - res->impl_.reset(new Impl(pool, std::move(raw))); + res->impl_.reset(new Impl(pool, raw)); RETURN_NOT_OK(res->impl_->Init(codec)); return res; - return Status::OK(); } CompressedInputStream::~CompressedInputStream() { internal::CloseFromDestructor(this); } diff --git a/cpp/src/arrow/io/compressed.h b/cpp/src/arrow/io/compressed.h index cd1a7f673ce61..6b4e7ab4d7248 100644 --- a/cpp/src/arrow/io/compressed.h +++ b/cpp/src/arrow/io/compressed.h @@ -44,6 +44,9 @@ class ARROW_EXPORT CompressedOutputStream : public OutputStream { ~CompressedOutputStream() override; /// \brief Create a compressed output stream wrapping the given output stream. + /// + /// The codec must be capable of streaming compression. Some codecs, + /// like Snappy, are not able to do so. static Result> Make( util::Codec* codec, const std::shared_ptr& raw, MemoryPool* pool = default_memory_pool()); @@ -82,6 +85,9 @@ class ARROW_EXPORT CompressedInputStream ~CompressedInputStream() override; /// \brief Create a compressed input stream wrapping the given input stream. + /// + /// The codec must be capable of streaming decompression. Some codecs, + /// like Snappy, are not able to do so. static Result> Make( util::Codec* codec, const std::shared_ptr& raw, MemoryPool* pool = default_memory_pool()); diff --git a/cpp/src/arrow/io/compressed_benchmark.cc b/cpp/src/arrow/io/compressed_benchmark.cc new file mode 100644 index 0000000000000..52a30d8cb0887 --- /dev/null +++ b/cpp/src/arrow/io/compressed_benchmark.cc @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "benchmark/benchmark.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/io/compressed.h" +#include "arrow/io/memory.h" +#include "arrow/result.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/compression.h" +#include "arrow/util/config.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +namespace arrow::io { + +using ::arrow::Compression; + +std::vector MakeCompressibleData(int data_size) { + // XXX This isn't a real-world corpus so doesn't really represent the + // comparative qualities of the algorithms + + // First make highly compressible data + std::string base_data = + "Apache Arrow is a cross-language development platform for in-memory data"; + int nrepeats = static_cast(1 + data_size / base_data.size()); + + std::vector data(base_data.size() * nrepeats); + for (int i = 0; i < nrepeats; ++i) { + std::memcpy(data.data() + i * base_data.size(), base_data.data(), base_data.size()); + } + data.resize(data_size); + + // Then randomly mutate some bytes so as to make things harder + std::mt19937 engine(42); + std::exponential_distribution<> offsets(0.05); + std::uniform_int_distribution<> values(0, 255); + + int64_t pos = 0; + while (pos < data_size) { + data[pos] = static_cast(values(engine)); + pos += static_cast(offsets(engine)); + } + + return data; +} + +// Using a non-zero copy buffer reader to benchmark the non-zero copy path. +class NonZeroCopyBufferReader final : public InputStream { + public: + NonZeroCopyBufferReader(std::shared_ptr buffer) : reader_(std::move(buffer)) {} + + bool supports_zero_copy() const override { return false; } + + Result Read(int64_t nbytes, void* out) override { + return reader_.Read(nbytes, out); + } + + Result> Read(int64_t nbytes) override { + // Testing the non-zero copy path like reading from local file or Object store, + // so we need to allocate a buffer and copy the data. 
+ ARROW_ASSIGN_OR_RAISE(auto buf, ::arrow::AllocateResizableBuffer(nbytes)); + ARROW_ASSIGN_OR_RAISE(int64_t size, Read(nbytes, buf->mutable_data())); + ARROW_RETURN_NOT_OK(buf->Resize(size)); + return buf; + } + Status Close() override { return reader_.Close(); } + Result Tell() const override { return reader_.Tell(); } + bool closed() const override { return reader_.closed(); } + + private: + ::arrow::io::BufferReader reader_; +}; + +enum class BufferReadMode { ProvidedByCaller, ReturnedByCallee }; + +template +static void CompressedInputStreamBenchmark(::benchmark::State& state, + Compression::type compression) { + const int64_t input_size = state.range(0); + const int64_t batch_size = state.range(1); + + const std::vector data = MakeCompressibleData(static_cast(input_size)); + auto codec = ::arrow::util::Codec::Create(compression).ValueOrDie(); + int64_t max_compress_len = + codec->MaxCompressedLen(static_cast(data.size()), data.data()); + std::shared_ptr<::arrow::ResizableBuffer> buf = + ::arrow::AllocateResizableBuffer(max_compress_len).ValueOrDie(); + const int64_t compressed_length = + codec + ->Compress(static_cast(data.size()), data.data(), max_compress_len, + buf->mutable_data()) + .ValueOrDie(); + ABORT_NOT_OK(buf->Resize(compressed_length)); + for (auto _ : state) { + state.PauseTiming(); + auto reader = std::make_shared(buf); + [[maybe_unused]] std::unique_ptr read_buffer; + if constexpr (Mode == BufferReadMode::ProvidedByCaller) { + read_buffer = ::arrow::AllocateBuffer(batch_size).ValueOrDie(); + } + state.ResumeTiming(); + // Put `CompressedInputStream::Make` in timing. + auto input_stream = + ::arrow::io::CompressedInputStream::Make(codec.get(), reader).ValueOrDie(); + auto remaining_size = input_size; + while (remaining_size > 0) { + if constexpr (Mode == BufferReadMode::ProvidedByCaller) { + auto value = input_stream->Read(batch_size, read_buffer->mutable_data()); + ABORT_NOT_OK(value); + remaining_size -= value.ValueOrDie(); + } else { + auto value = input_stream->Read(batch_size); + ABORT_NOT_OK(value); + remaining_size -= value.ValueOrDie()->size(); + } + } + } + state.SetBytesProcessed(input_size * state.iterations()); +} + +template +static void CompressedInputStreamZeroCopyBufferProvidedByCaller( + ::benchmark::State& state) { + CompressedInputStreamBenchmark<::arrow::io::BufferReader, + BufferReadMode::ProvidedByCaller>(state, kCompression); +} + +template +static void CompressedInputStreamNonZeroCopyBufferProvidedByCaller( + ::benchmark::State& state) { + CompressedInputStreamBenchmark(state, kCompression); +} + +template +static void CompressedInputStreamZeroCopyBufferReturnedByCallee( + ::benchmark::State& state) { + CompressedInputStreamBenchmark<::arrow::io::BufferReader, + BufferReadMode::ReturnedByCallee>(state, kCompression); +} + +template +static void CompressedInputStreamNonZeroCopyBufferReturnedByCallee( + ::benchmark::State& state) { + CompressedInputStreamBenchmark(state, kCompression); +} + +static void CompressedInputArguments(::benchmark::internal::Benchmark* b) { + b->ArgNames({"num_bytes", "batch_size"}) + ->Args({8 * 1024, 8 * 1024}) + ->Args({64 * 1024, 8 * 1024}) + ->Args({64 * 1024, 64 * 1024}) + ->Args({1024 * 1024, 8 * 1024}) + ->Args({1024 * 1024, 64 * 1024}) + ->Args({1024 * 1024, 1024 * 1024}); +} + +#ifdef ARROW_WITH_LZ4 +// Benchmark LZ4 because it's lightweight, which makes benchmarking focused on the +// overhead of the compression input stream. 
+BENCHMARK_TEMPLATE(CompressedInputStreamZeroCopyBufferProvidedByCaller, + Compression::LZ4_FRAME) + ->Apply(CompressedInputArguments); +BENCHMARK_TEMPLATE(CompressedInputStreamNonZeroCopyBufferProvidedByCaller, + Compression::LZ4_FRAME) + ->Apply(CompressedInputArguments); +BENCHMARK_TEMPLATE(CompressedInputStreamZeroCopyBufferReturnedByCallee, + Compression::LZ4_FRAME) + ->Apply(CompressedInputArguments); +BENCHMARK_TEMPLATE(CompressedInputStreamNonZeroCopyBufferReturnedByCallee, + Compression::LZ4_FRAME) + ->Apply(CompressedInputArguments); +#endif + +} // namespace arrow::io From a407a6b45e6121051966d699017333ce9653e958 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Mar 2024 12:44:02 +0100 Subject: [PATCH 08/51] GH-40698: [C++] Create registry for Devices to map DeviceType to MemoryManager in C Device Data import (#40699) ### Rationale for this change Follow-up on https://github.com/apache/arrow/pull/39980#discussion_r1483235845 Right now, the user of `ImportDeviceArray` or `ImportDeviceRecordBatch` needs to provide a `DeviceMemoryMapper` mapping the device type and id to a MemoryManager. We provide a default implementation of that mapper that just knows about the default CPU memory manager (and there is another implementation in `arrow::cuda`, but you need to explicitly pass that to the import function) To make this easier, this PR adds a registry such that default device mappers can be added separately. ### What changes are included in this PR? This PR adds two new public functions to register device types (`RegisterDeviceMemoryManager`) and retrieve the mapper from the registry (`GetDeviceMemoryManager`). Further, it provides a `RegisterCUDADevice` to optionally register the CUDA devices (by default only CPU device is registered). ### Are these changes tested? ### Are there any user-facing changes? 
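The two registration functions described above are the new public API (note the prose above refers to them as `RegisterDeviceMemoryManager`/`GetDeviceMemoryManager`, while the code in this diff names them `RegisterDeviceMapper`/`GetDeviceMapper`). As a hedged sketch of how they might be used — not part of this diff; the function names and the `DeviceMapper` signature come from the changes below, while the mapper body and the use of `kVPI` as an example device type are hypothetical placeholders:

```cpp
#include <cstdint>
#include <memory>

#include "arrow/device.h"
#include "arrow/result.h"
#include "arrow/status.h"

// Hypothetical mapper: a real one would look the device id up in its own
// runtime instead of falling back to the default CPU memory manager.
arrow::Result<std::shared_ptr<arrow::MemoryManager>> MyDeviceMapper(
    int64_t /*device_id*/) {
  return arrow::default_cpu_memory_manager();
}

arrow::Result<std::shared_ptr<arrow::MemoryManager>> RegisterAndResolveMyDevice() {
  // Registering the same device type a second time returns Status::KeyError.
  ARROW_RETURN_NOT_OK(arrow::RegisterDeviceMapper(
      arrow::DeviceAllocationType::kVPI, MyDeviceMapper));

  // Retrieval mirrors what DefaultDeviceMemoryMapper does when importing
  // device arrays through the C Device Data Interface.
  ARROW_ASSIGN_OR_RAISE(auto mapper,
                        arrow::GetDeviceMapper(arrow::DeviceAllocationType::kVPI));
  return mapper(/*device_id=*/0);
}
```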
* GitHub Issue: #40698 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/buffer_test.cc | 13 +++++++ cpp/src/arrow/c/bridge.cc | 11 +++--- cpp/src/arrow/c/bridge.h | 12 +++--- cpp/src/arrow/device.cc | 63 ++++++++++++++++++++++++++++++++ cpp/src/arrow/device.h | 28 ++++++++++++++ cpp/src/arrow/gpu/cuda_memory.cc | 19 ++++++++++ cpp/src/arrow/gpu/cuda_memory.h | 4 +- cpp/src/arrow/gpu/cuda_test.cc | 15 +------- 8 files changed, 139 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/buffer_test.cc b/cpp/src/arrow/buffer_test.cc index 13f6ea63b5e62..06ed0bfba0497 100644 --- a/cpp/src/arrow/buffer_test.cc +++ b/cpp/src/arrow/buffer_test.cc @@ -1023,4 +1023,17 @@ TEST(TestBufferConcatenation, EmptyBuffer) { AssertMyBufferEqual(*result, contents); } +TEST(TestDeviceRegistry, Basics) { + // Test the error cases for the device registry + + // CPU is already registered + ASSERT_RAISES(KeyError, + RegisterDeviceMapper(DeviceAllocationType::kCPU, [](int64_t device_id) { + return default_cpu_memory_manager(); + })); + + // VPI is not registered + ASSERT_RAISES(KeyError, GetDeviceMapper(DeviceAllocationType::kVPI)); +} + } // namespace arrow diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 4ec79a73029b4..d004de7a2ea9f 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1967,12 +1967,11 @@ Result> ImportRecordBatch(struct ArrowArray* array, return ImportRecordBatch(array, *maybe_schema); } -Result> DefaultDeviceMapper(ArrowDeviceType device_type, - int64_t device_id) { - if (device_type != ARROW_DEVICE_CPU) { - return Status::NotImplemented("Only importing data on CPU is supported"); - } - return default_cpu_memory_manager(); +Result> DefaultDeviceMemoryMapper( + ArrowDeviceType device_type, int64_t device_id) { + ARROW_ASSIGN_OR_RAISE(auto mapper, + GetDeviceMapper(static_cast(device_type))); + return mapper(device_id); } Result> ImportDeviceArray(struct ArrowDeviceArray* array, diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h index 0ced3d38cd1e6..74a302be4c27d 100644 --- a/cpp/src/arrow/c/bridge.h +++ b/cpp/src/arrow/c/bridge.h @@ -219,8 +219,8 @@ using DeviceMemoryMapper = std::function>(ArrowDeviceType, int64_t)>; ARROW_EXPORT -Result> DefaultDeviceMapper(ArrowDeviceType device_type, - int64_t device_id); +Result> DefaultDeviceMemoryMapper( + ArrowDeviceType device_type, int64_t device_id); /// \brief EXPERIMENTAL: Import C++ device array from the C data interface. /// @@ -236,7 +236,7 @@ Result> DefaultDeviceMapper(ArrowDeviceType devic ARROW_EXPORT Result> ImportDeviceArray( struct ArrowDeviceArray* array, std::shared_ptr type, - const DeviceMemoryMapper& mapper = DefaultDeviceMapper); + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); /// \brief EXPERIMENTAL: Import C++ device array and its type from the C data interface. /// @@ -253,7 +253,7 @@ Result> ImportDeviceArray( ARROW_EXPORT Result> ImportDeviceArray( struct ArrowDeviceArray* array, struct ArrowSchema* type, - const DeviceMemoryMapper& mapper = DefaultDeviceMapper); + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); /// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device from the C data /// interface. 
@@ -271,7 +271,7 @@ Result> ImportDeviceArray( ARROW_EXPORT Result> ImportDeviceRecordBatch( struct ArrowDeviceArray* array, std::shared_ptr schema, - const DeviceMemoryMapper& mapper = DefaultDeviceMapper); + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); /// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device and its schema /// from the C data interface. @@ -291,7 +291,7 @@ Result> ImportDeviceRecordBatch( ARROW_EXPORT Result> ImportDeviceRecordBatch( struct ArrowDeviceArray* array, struct ArrowSchema* schema, - const DeviceMemoryMapper& mapper = DefaultDeviceMapper); + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); /// @} diff --git a/cpp/src/arrow/device.cc b/cpp/src/arrow/device.cc index 3736a4e018c33..98b8f7b30397e 100644 --- a/cpp/src/arrow/device.cc +++ b/cpp/src/arrow/device.cc @@ -18,6 +18,8 @@ #include "arrow/device.h" #include +#include +#include #include #include "arrow/array.h" @@ -268,4 +270,65 @@ std::shared_ptr CPUDevice::default_memory_manager() { return default_cpu_memory_manager(); } +namespace { + +class DeviceMapperRegistryImpl { + public: + DeviceMapperRegistryImpl() {} + + Status RegisterDevice(DeviceAllocationType device_type, DeviceMapper memory_mapper) { + std::lock_guard lock(lock_); + auto [_, inserted] = registry_.try_emplace(device_type, std::move(memory_mapper)); + if (!inserted) { + return Status::KeyError("Device type ", static_cast(device_type), + " is already registered"); + } + return Status::OK(); + } + + Result GetMapper(DeviceAllocationType device_type) { + std::lock_guard lock(lock_); + auto it = registry_.find(device_type); + if (it == registry_.end()) { + return Status::KeyError("Device type ", static_cast(device_type), + "is not registered"); + } + return it->second; + } + + private: + std::mutex lock_; + std::unordered_map registry_; +}; + +Result> DefaultCPUDeviceMapper(int64_t device_id) { + return default_cpu_memory_manager(); +} + +static std::unique_ptr CreateDeviceRegistry() { + auto registry = std::make_unique(); + + // Always register the CPU device + DCHECK_OK(registry->RegisterDevice(DeviceAllocationType::kCPU, DefaultCPUDeviceMapper)); + + return registry; +} + +DeviceMapperRegistryImpl* GetDeviceRegistry() { + static auto g_registry = CreateDeviceRegistry(); + return g_registry.get(); +} + +} // namespace + +Status RegisterDeviceMapper(DeviceAllocationType device_type, DeviceMapper mapper) { + auto registry = GetDeviceRegistry(); + return registry->RegisterDevice(device_type, std::move(mapper)); +} + +Result GetDeviceMapper(DeviceAllocationType device_type) { + auto registry = GetDeviceRegistry(); + return registry->GetMapper(device_type); +} + } // namespace arrow diff --git a/cpp/src/arrow/device.h b/cpp/src/arrow/device.h index efb0a5ab400a1..622551c6bd040 100644 --- a/cpp/src/arrow/device.h +++ b/cpp/src/arrow/device.h @@ -363,4 +363,32 @@ class ARROW_EXPORT CPUMemoryManager : public MemoryManager { ARROW_EXPORT std::shared_ptr default_cpu_memory_manager(); +using DeviceMapper = + std::function>(int64_t device_id)>; + +/// \brief Register a function to retrieve a MemoryManager for a Device type +/// +/// This registers the device type globally. A specific device type can only +/// be registered once. This method is thread-safe. +/// +/// Currently, this registry is only used for importing data through the C Device +/// Data Interface (for the default Device to MemoryManager mapper in +/// arrow::ImportDeviceArray/ImportDeviceRecordBatch). 
+/// +/// \param[in] device_type the device type for which to register a MemoryManager +/// \param[in] mapper function that takes a device id and returns the appropriate +/// MemoryManager for the registered device type and given device id +/// \return Status +ARROW_EXPORT +Status RegisterDeviceMapper(DeviceAllocationType device_type, DeviceMapper mapper); + +/// \brief Get the registered function to retrieve a MemoryManager for the +/// given Device type +/// +/// \param[in] device_type the device type +/// \return function that takes a device id and returns the appropriate +/// MemoryManager for the registered device type and given device id +ARROW_EXPORT +Result GetDeviceMapper(DeviceAllocationType device_type); + } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index 860c6311d7b2f..6972321006a9a 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -27,6 +27,7 @@ #include #include "arrow/buffer.h" +#include "arrow/device.h" #include "arrow/io/memory.h" #include "arrow/memory_pool.h" #include "arrow/status.h" @@ -501,5 +502,23 @@ Result> DefaultMemoryMapper(ArrowDeviceType devic } } +namespace { + +Result> DefaultCUDADeviceMapper(int64_t device_id) { + ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); + return device->default_memory_manager(); +} + +bool RegisterCUDADeviceInternal() { + DCHECK_OK(RegisterDeviceMapper(DeviceAllocationType::kCUDA, DefaultCUDADeviceMapper)); + // TODO add the CUDA_HOST and CUDA_MANAGED allocation types when they are supported in + // the CudaDevice + return true; +} + +static auto cuda_registered = RegisterCUDADeviceInternal(); + +} // namespace + } // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h index d323bef03494e..488f4183730c7 100644 --- a/cpp/src/arrow/gpu/cuda_memory.h +++ b/cpp/src/arrow/gpu/cuda_memory.h @@ -260,7 +260,9 @@ Result GetDeviceAddress(const uint8_t* cpu_data, ARROW_EXPORT Result GetHostAddress(uintptr_t device_ptr); -ARROW_EXPORT +ARROW_DEPRECATED( + "Deprecated in 16.0.0. 
The CUDA device is registered by default, and you can use " + "arrow::DefaultDeviceMapper instead.") Result> DefaultMemoryMapper(ArrowDeviceType device_type, int64_t device_id); diff --git a/cpp/src/arrow/gpu/cuda_test.cc b/cpp/src/arrow/gpu/cuda_test.cc index d2f01cb3bbc0c..4c450bf389919 100644 --- a/cpp/src/arrow/gpu/cuda_test.cc +++ b/cpp/src/arrow/gpu/cuda_test.cc @@ -716,17 +716,6 @@ class TestCudaDeviceArrayRoundtrip : public ::testing::Test { public: using ArrayFactory = std::function>()>; - static Result> DeviceMapper(ArrowDeviceType type, - int64_t id) { - if (type != ARROW_DEVICE_CUDA) { - return Status::NotImplemented("should only be CUDA device"); - } - - ARROW_ASSIGN_OR_RAISE(auto manager, cuda::CudaDeviceManager::Instance()); - ARROW_ASSIGN_OR_RAISE(auto device, manager->GetDevice(id)); - return device->default_memory_manager(); - } - static ArrayFactory JSONArrayFactory(std::shared_ptr type, const char* json) { return [=]() { return ArrayFromJSON(type, json); }; } @@ -759,7 +748,7 @@ class TestCudaDeviceArrayRoundtrip : public ::testing::Test { std::shared_ptr device_array_roundtripped; ASSERT_OK_AND_ASSIGN(device_array_roundtripped, - ImportDeviceArray(&c_array, &c_schema, DeviceMapper)); + ImportDeviceArray(&c_array, &c_schema)); ASSERT_TRUE(ArrowSchemaIsReleased(&c_schema)); ASSERT_TRUE(ArrowArrayIsReleased(&c_array.array)); @@ -779,7 +768,7 @@ class TestCudaDeviceArrayRoundtrip : public ::testing::Test { ASSERT_OK(ExportDeviceArray(*device_array, sync, &c_array, &c_schema)); device_array_roundtripped.reset(); ASSERT_OK_AND_ASSIGN(device_array_roundtripped, - ImportDeviceArray(&c_array, &c_schema, DeviceMapper)); + ImportDeviceArray(&c_array, &c_schema)); ASSERT_TRUE(ArrowSchemaIsReleased(&c_schema)); ASSERT_TRUE(ArrowArrayIsReleased(&c_array.array)); From f3c5fb98ae7673ad94b198b2da4c741013084e46 Mon Sep 17 00:00:00 2001 From: James Henderson Date: Wed, 27 Mar 2024 13:33:35 +0000 Subject: [PATCH 09/51] =?UTF-8?q?GH-40796:=20[Java]=20set=20`lastSet`=20in?= =?UTF-8?q?=20`ListVector.setNull`=20to=20avoid=20O(n=C2=B2)=20in=20ListVe?= =?UTF-8?q?ctors=20with=20lots=20of=20nulls=20(#40810)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Would benefit from someone with knowledge of the context double-checking this doesn't have nuances I'm not aware of - particularly, there's a comment on the field: `the maximum index that is actually set` which one _could_ read to mean 'excluding nulls'? ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #40796 Authored-by: James Henderson Signed-off-by: David Li --- .../arrow/vector/complex/ListVector.java | 1 + .../apache/arrow/vector/TestValueVector.java | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 5154ac17279c5..7df659e4cc9da 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -856,6 +856,7 @@ public void setNull(int index) { offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); } BitVectorHelper.unsetBit(validityBuffer, index); + lastSet = index; } /** diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 10091aebdd50b..ad84882c66275 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -2859,6 +2859,29 @@ public void testListVectorEquals() { } } + @Test + public void testListVectorSetNull() { + try (final ListVector vector = ListVector.empty("list", allocator)) { + UnionListWriter writer = vector.getWriter(); + writer.allocate(); + + writeListVector(writer, new int[] {1, 2}); + writeListVector(writer, new int[] {3, 4}); + writeListVector(writer, new int[] {5, 6}); + vector.setNull(3); + vector.setNull(4); + vector.setNull(5); + writer.setValueCount(6); + + assertEquals(vector.getObject(0), Arrays.asList(1, 2)); + assertEquals(vector.getObject(1), Arrays.asList(3, 4)); + assertEquals(vector.getObject(2), Arrays.asList(5, 6)); + assertTrue(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertTrue(vector.isNull(5)); + } + } + @Test public void testStructVectorEqualsWithNull() { From 83dc0a91d2f1e238a7e4d033d9373928bd8ab4a3 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 28 Mar 2024 03:32:56 +1300 Subject: [PATCH 10/51] GH-40790: [C#] Account for offset and length when getting fields of a StructArray (#40805) ### Rationale for this change See #40790. The `StructArray.Fields` property currently returns the child arrays without accounting for the array offset and length. This meant that consumers would need to know to account for the offset and length themselves when accessing the child arrays, and this is inconsistent with the behaviour of Arrow APIs in other languages. ### What changes are included in this PR? Changes the behaviour of the `StructArray.Fields` property, so that the returned arrays are sliced if required. This behaviour is consistent with the C++ Arrow API, eg. see: https://github.com/apache/arrow/blob/f710ac52b049806515a14445b242c3ec819fb99d/cpp/src/arrow/array/array_nested.cc#L1019-L1020 I also checked that pyarrow behaves like this too: ```python import pyarrow as pa a = pa.array([0, 1, 2, 3, 4], type=pa.int32()) b = pa.array([0.0, 0.1, 0.2, 0.3, 0.4], type=pa.float32()) xs = pa.StructArray.from_arrays([a, b], names=["a", "b"]) slice = xs.slice(2, 3) assert len(slice) == 3 assert len(slice.field(0)) == 3 assert len(slice.field(1)) == 3 ``` ### Are these changes tested? Yes, I've added new unit tests. ### Are there any user-facing changes? Yes, this is a user-facing bug fix and behaviour change. **This PR includes breaking changes to public APIs.** The behaviour of `StructArray.Fields` has changed. 
If users were previously accounting for the array offset and length themselves, this will break existing code. I first tried to make this non-breaking, by introducing a new property to replace `Fields`, and marking that property as obsolete. But `StructArray` implements `IArrowRecord`, so the behaviour of the `IArrowRecord.Column` would either need to be kept as broken, or fixed with a breaking change. It seems simplest and most consistent to fix the behaviour for all methods. If users need to maintain compatibility across different Arrow versions, I'd suggest using a pattern like: ```c# var field = structArray.Fields[0]; if (field.Length != structArray.Length) { field = ArrowArrayFactory.Slice(field, structArray.Offset, structArray.Length); } ``` * GitHub Issue: #40790 Authored-by: Adam Reeve Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow/Arrays/MapArray.cs | 10 +-- csharp/src/Apache.Arrow/Arrays/StructArray.cs | 10 ++- .../Apache.Arrow.Tests/StructArrayTests.cs | 80 +++++++++++++++++++ 3 files changed, 91 insertions(+), 9 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/MapArray.cs b/csharp/src/Apache.Arrow/Arrays/MapArray.cs index dad50981ea54d..c1dc9688b5a00 100644 --- a/csharp/src/Apache.Arrow/Arrays/MapArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/MapArray.cs @@ -155,10 +155,9 @@ public IEnumerable> GetTuples(int inde // Get key values int start = offsets[index]; int end = offsets[index + 1]; - StructArray array = KeyValues.Slice(start, end - start) as StructArray; - TKeyArray keyArray = array.Fields[0] as TKeyArray; - TValueArray valueArray = array.Fields[1] as TValueArray; + TKeyArray keyArray = KeyValues.Fields[0] as TKeyArray; + TValueArray valueArray = KeyValues.Fields[1] as TValueArray; for (int i = start; i < end; i++) { @@ -173,10 +172,9 @@ public IEnumerable> GetKeyValuePairs _fields; public IReadOnlyList Fields => - LazyInitializer.EnsureInitialized(ref _fields, () => InitializeFields()); + LazyInitializer.EnsureInitialized(ref _fields, InitializeFields); public StructArray( IArrowType dataType, int length, @@ -35,7 +35,6 @@ public StructArray( dataType, length, nullCount, offset, new[] { nullBitmapBuffer }, children.Select(child => child.Data))) { - _fields = children.ToArray(); } public StructArray(ArrayData data) @@ -65,7 +64,12 @@ private IReadOnlyList InitializeFields() IArrowArray[] result = new IArrowArray[Data.Children.Length]; for (int i = 0; i < Data.Children.Length; i++) { - result[i] = ArrowArrayFactory.BuildArray(Data.Children[i]); + var childData = Data.Children[i]; + if (Data.Offset != 0 || childData.Length != Data.Length) + { + childData = childData.Slice(Data.Offset, Data.Length); + } + result[i] = ArrowArrayFactory.BuildArray(childData); } return result; } diff --git a/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs b/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs index e2d0fa85137ec..ff5e8d2a5909b 100644 --- a/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs @@ -17,6 +17,7 @@ using Apache.Arrow.Types; using System.Collections.Generic; using System.IO; +using System.Linq; using Xunit; namespace Apache.Arrow.Tests @@ -121,6 +122,85 @@ public void TestListOfStructArray() TestRoundTripRecordBatch(batch); } + [Fact] + public void TestSliceStructArray() + { + const int numRows = 10; + var fields = new List + { + new Field.Builder().Name("ints").DataType(new Int32Type()).Nullable(true).Build(), + new Field.Builder().Name("doubles").DataType(new 
DoubleType()).Nullable(true).Build(), + }; + var arrays = new List + { + new Int32Array.Builder().AppendRange(Enumerable.Range(0, numRows)).Build(), + new DoubleArray.Builder().AppendRange(Enumerable.Range(0, numRows).Select(i => i * 0.1)).Build(), + }; + + var nullBitmap = new ArrowBuffer.BitmapBuilder().AppendRange(true, numRows).Build(); + var array = new StructArray(new StructType(fields), numRows, arrays, nullBitmap, nullCount: 0); + + var slicedArray = (StructArray) array.Slice(3, 4); + + Assert.Equal(4, slicedArray.Length); + Assert.Equal(2, slicedArray.Fields.Count); + + var slicedInts = slicedArray.Fields[0]; + var expectedInts = Enumerable.Range(3, 4).Select(val => (int?) val).ToArray(); + Assert.Equal(expectedInts, (IReadOnlyList) slicedInts); + + var slicedDoubles = slicedArray.Fields[1]; + var expectedDoubles = Enumerable.Range(3, 4).Select(val => (double?) (val * 0.1)).ToArray(); + Assert.Equal(expectedDoubles, (IReadOnlyList) slicedDoubles); + } + + [Fact] + public void TestStructArrayConstructedWithOffset() + { + const int dataNumRows = 10; + const int arrayLength = 4; + const int arrayOffset = 3; + + var fields = new List + { + new Field.Builder().Name("ints").DataType(new Int32Type()).Nullable(true).Build(), + new Field.Builder().Name("doubles").DataType(new DoubleType()).Nullable(true).Build(), + }; + var arrays = new List + { + new Int32Array.Builder().AppendRange(Enumerable.Range(0, dataNumRows)).Build(), + new DoubleArray.Builder().AppendRange(Enumerable.Range(0, dataNumRows).Select(i => i * 0.1)).Build(), + }; + + var nullBitmap = new ArrowBuffer.BitmapBuilder().AppendRange(true, dataNumRows).Build(); + var array = new StructArray( + new StructType(fields), arrayLength, arrays, nullBitmap, nullCount: 0, offset: arrayOffset); + + Assert.Equal(4, array.Length); + Assert.Equal(3, array.Offset); + Assert.Equal(2, array.Fields.Count); + + var slicedInts = array.Fields[0]; + var expectedInts = Enumerable.Range(3, 4).Select(val => (int?) val).ToArray(); + Assert.Equal(expectedInts, (IReadOnlyList) slicedInts); + + var slicedDoubles = array.Fields[1]; + var expectedDoubles = Enumerable.Range(3, 4).Select(val => (double?) (val * 0.1)).ToArray(); + Assert.Equal(expectedDoubles, (IReadOnlyList) slicedDoubles); + + var subSlice = (StructArray) array.Slice(1, 2); + Assert.Equal(2, subSlice.Length); + Assert.Equal(2, subSlice.Fields.Count); + + var subSlicedInts = subSlice.Fields[0]; + var expectedSubSliceInts = Enumerable.Range(4, 2).Select(val => (int?) val).ToArray(); + Assert.Equal(expectedSubSliceInts, (IReadOnlyList) subSlicedInts); + + var subSlicedDoubles = subSlice.Fields[1]; + var expectedSubSliceDoubles = Enumerable.Range(4, 2).Select(val => (double?) (val * 0.1)).ToArray(); + Assert.Equal(expectedSubSliceDoubles, (IReadOnlyList) subSlicedDoubles); + } + private static void TestRoundTripRecordBatch(RecordBatch originalBatch) { using (MemoryStream stream = new MemoryStream()) From dc2c5c66f5234a92169da76613399135786dbffb Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 28 Mar 2024 05:27:36 +1300 Subject: [PATCH 11/51] MINOR: [C++] Remove misleading comment on FileKeyUnwrapper constructor (#40808) ### Rationale for this change I added this comment in #34181, but from the discussion in https://github.com/apache/arrow/pull/40732#discussion_r1535001401, I realised this comment was incorrect. 
The extra overload appears to just be a convenience as a `FileKeyMaterialStore` is already constructed in `KeyToolkit::RotateMasterKeys`, but the store isn't actually used by the `FileKeyUnwrapper` in that method, as only `FileKeyUnwrapper::GetDataEncryptionKey` is called, which bypasses the store. `RotateMasterKeys` does however rely on the `temp_key_material_store` passed to the `FileKeyWrapper` being used, which is possibly where this confusion came from. ### What changes are included in this PR? Removes an incorrect statement from a C++ header comment. ### Are these changes tested? NA ### Are there any user-facing changes? No Authored-by: Adam Reeve Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encryption/file_key_unwrapper.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/encryption/file_key_unwrapper.h b/cpp/src/parquet/encryption/file_key_unwrapper.h index c60c0c71ba5e0..6147abbecd3e6 100644 --- a/cpp/src/parquet/encryption/file_key_unwrapper.h +++ b/cpp/src/parquet/encryption/file_key_unwrapper.h @@ -57,8 +57,7 @@ class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { /// Constructor overload that takes a raw pointer to the KeyToolkit and /// accepts an existing key_material_store rather than using - /// the file path and file system to create one when needed. This is useful for key - /// rotation to allow accessing the key material store after it is used. + /// the file path and file system to create one when needed. FileKeyUnwrapper(KeyToolkit* key_toolkit, const KmsConnectionConfig& kms_connection_config, double cache_lifetime_seconds, From 515c61dd617e65c01a6e40e570487ad4ae9f151c Mon Sep 17 00:00:00 2001 From: James Henderson Date: Wed, 27 Mar 2024 18:37:16 +0000 Subject: [PATCH 12/51] GH-40773: [Java] add `DENSEUNION` case to StructWriters, resolves #40773 (#40809) ### What changes are included in this PR? Adding a `DENSEUNION` case to the `StructWriters` template so that one can create StructVectors with a DenseUnionVector child. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #40773 Authored-by: James Henderson Signed-off-by: David Li --- .../src/main/codegen/templates/StructWriters.java | 6 ++++++ .../org/apache/arrow/vector/TestValueVector.java | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/java/vector/src/main/codegen/templates/StructWriters.java b/java/vector/src/main/codegen/templates/StructWriters.java index 84e5d8113b321..b6dd2b75c526a 100644 --- a/java/vector/src/main/codegen/templates/StructWriters.java +++ b/java/vector/src/main/codegen/templates/StructWriters.java @@ -73,6 +73,12 @@ public class ${mode}StructWriter extends AbstractFieldWriter { map(child.getName(), arrowType.getKeysSorted()); break; } + case DENSEUNION: { + FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.DENSEUNION.getType(), null, null); + DenseUnionWriter writer = new DenseUnionWriter(container.addOrGet(child.getName(), fieldType, DenseUnionVector.class), getNullableStructWriterFactory()); + fields.put(handleCase(child.getName()), writer); + break; + } case UNION: FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.UNION.getType(), null, null); UnionWriter writer = new UnionWriter(container.addOrGet(child.getName(), fieldType, UnionVector.class), getNullableStructWriterFactory()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index ad84882c66275..3e53512f7338f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -60,6 +60,7 @@ import org.apache.arrow.vector.testing.ValueVectorDataPopulator; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.UnionMode; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -2974,6 +2975,20 @@ public void testStructVectorEqualsWithDiffChild() { } } + @Test + public void testStructVectorAcceptsDenseUnionChild() { + Field childField = new Field("child", + FieldType.notNullable(new ArrowType.Union(UnionMode.Dense, new int[] {})), + Collections.emptyList()); + Field structField = new Field("struct", + FieldType.notNullable(ArrowType.Struct.INSTANCE), + Collections.singletonList(childField)); + + try (FieldVector structVec = structField.createVector(allocator)) { + assertEquals(structField, structVec.getField()); + } + } + @Test public void testUnionVectorEquals() { try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); From 2146ab10e653f927a6e92d29ee0910f30f4cb996 Mon Sep 17 00:00:00 2001 From: sullis Date: Wed, 27 Mar 2024 13:32:45 -0700 Subject: [PATCH 13/51] MINOR: [Java] Bump Netty to 4.1.108.Final (#40491) ### Rationale for this change [Java] bump to latest version of Netty https://netty.io/news/2024/02/13/4-1-107-Final.html https://netty.io/news/2024/03/21/4-1-108-Final.html ### What changes are included in this PR? modified Java pom.xml ### Are these changes tested? GitHub Actions CI build ### Are there any user-facing changes? 
No Authored-by: sullis Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index b064d07e1e0dc..add2823ccb0d2 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 5.10.2 2.0.11 33.0.0-jre - 4.1.106.Final + 4.1.108.Final 1.61.1 3.23.1 2.17.0 From c9cb3fa85c1e9927fc473e1459a4fd5633614003 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 28 Mar 2024 09:38:49 +0900 Subject: [PATCH 14/51] GH-40586: [Dev][C++][Python][R] Use pre-commit for clang-format (#40587) ### Rationale for this change We can run `clang-format` easily than `archery lint` by using `pre-commit`: * We don't need to install `clang-format-14` separately because `pre-commit` prepare it automatically. * We don't need to run `cmake` to run `clang-format-14`. ### What changes are included in this PR? Add `clang-format` related `pre-commit` configurations. This doesn't change `archery lint` because our `pre-commit` configurations can't replace `archery lint` entirely yet. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40586 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .pre-commit-config.yaml | 44 ++++++++++++++++++++++ cpp/src/arrow/util/windows_compatibility.h | 1 - 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a08f219a52b62..2e598e0a95064 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,6 +51,26 @@ repos: hooks: - id: cython-lint args: [--no-pycodestyle] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + name: C++ Format + types_or: + - c++ + # - json + # - proto + files: >- + ^cpp/ + exclude: >- + ( + ?\.grpc\.fb\.(cc|h)$| + ?\.pb\.(cc|h)$| + ?_generated.*\.(cc|h)$| + ?^cpp/src/arrow/vendored/| + ?^cpp/src/generated/| + ?^cpp/thirdparty/| + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: @@ -65,6 +85,30 @@ repos: name: MATLAB (C++) Format files: >- ^matlab/src/cpp/ + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + name: Python (C++) Format + files: >- + ^python/pyarrow/src/ + exclude: >- + ( + ?\.grpc\.fb\.(cc|h)$| + ?.pb\.(cc|h)$| + ?^cpp/src/generated/| + ) + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + name: R (C++) Format + files: >- + ^r/src/ + exclude: >- + ( + ?^r/src/arrowExports\.cpp$| + ) - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.13 hooks: diff --git a/cpp/src/arrow/util/windows_compatibility.h b/cpp/src/arrow/util/windows_compatibility.h index ea0d0167569e8..c97b2f3b76a7c 100644 --- a/cpp/src/arrow/util/windows_compatibility.h +++ b/cpp/src/arrow/util/windows_compatibility.h @@ -33,7 +33,6 @@ #endif #include -#include #include "arrow/util/windows_fixup.h" From b270dcdcdf7390a0486600374a900fa2b1b8d430 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 28 Mar 2024 08:54:52 +0800 Subject: [PATCH 15/51] GH-40814: [C++] Thirdparty: bump zstd to 1.5.6 (#40837) ### Rationale for this change Zstd releases 1.5.6 here: https://github.com/facebook/zstd/releases/tag/v1.5.6 ### What changes are included in this PR? Change default zstd to 1.5.6 ### Are these changes tested? Already has test ### Are there any user-facing changes? 
no * GitHub Issue: #40814 Authored-by: mwish Signed-off-by: Sutou Kouhei --- cpp/thirdparty/versions.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 18bb6c9b6e09c..760b19f71e2e0 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -117,8 +117,8 @@ ARROW_XSIMD_BUILD_VERSION=9.0.1 ARROW_XSIMD_BUILD_SHA256_CHECKSUM=b1bb5f92167fd3a4f25749db0be7e61ed37e0a5d943490f3accdcd2cd2918cc0 ARROW_ZLIB_BUILD_VERSION=1.3.1 ARROW_ZLIB_BUILD_SHA256_CHECKSUM=9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23 -ARROW_ZSTD_BUILD_VERSION=1.5.5 -ARROW_ZSTD_BUILD_SHA256_CHECKSUM=9c4396cc829cfae319a6e2615202e82aad41372073482fce286fac78646d3ee4 +ARROW_ZSTD_BUILD_VERSION=1.5.6 +ARROW_ZSTD_BUILD_SHA256_CHECKSUM=8c29e06cf42aacc1eafc4077ae2ec6c6fcb96a626157e0593d5e82a34fd403c1 # The first field is the name of the environment variable expected by cmake. From 3d5e9aaedecadee9daa86232ec58de422caecdb6 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 27 Mar 2024 19:39:44 -0800 Subject: [PATCH 16/51] MINOR: [Docs] Fix broken link in acero/options.h docstring (#40811) ### Rationale for this change A "See also" link at https://arrow.apache.org/docs/cpp/api/acero.html#_CPPv4N5arrow5acero22TableSourceNodeOptionsE isn't automatically linked, probably because SourceNode itself isn't documented. ### What changes are included in this PR? I updated the string to be "SourceNodeOptions" so it links there, which I'm pretty sure is what was intended because TableSourceNode inherits from SourceNode and the docs for SourceNodeOptions documents the behavior of SourceNode. ### Are these changes tested? Yes, locally. ### Are there any user-facing changes? Just docs. Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- cpp/src/arrow/acero/options.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/options.h b/cpp/src/arrow/acero/options.h index 1ede3fbfc8ed0..4447e9c67a199 100644 --- a/cpp/src/arrow/acero/options.h +++ b/cpp/src/arrow/acero/options.h @@ -105,8 +105,8 @@ class ARROW_ACERO_EXPORT SourceNodeOptions : public ExecNodeOptions { /// \brief a node that generates data from a table already loaded in memory /// /// The table source node will slice off chunks, defined by `max_batch_size` -/// for parallel processing. The source node extends source node and so these -/// chunks will be iteratively processed in small batches. \see SourceNode +/// for parallel processing. The table source node extends source node and so these +/// chunks will be iteratively processed in small batches. \see SourceNodeOptions /// for details. class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions { public: From 7da8dfe480a6afb3113a972a08adedf88dbf4d1c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 28 Mar 2024 13:26:16 +0900 Subject: [PATCH 17/51] GH-40674: [GLib] Don't assume gint64 and int64_t use the same type (#40736) ### Rationale for this change GLib doesn't guarantee that `gint64` and `int64_t` use the same type: https://docs.gtk.org/glib/types.html#gint64 > Note that on platforms with more than one 64-bit standard integer > type, gint64 and int64_t are not necessarily implemented by the same > 64-bit integer type. For example, on a platform where both long and > long long are 64-bit, it might be the case that one of those types is > used for gint64 and the other is used for int64_t. ### What changes are included in this PR? Add explicit casts. 
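For illustration only (a standalone sketch, not code from this patch), the pattern behind the casts: imagine a platform where `gint64` is `long` and `int64_t` is `long long`. Both are 64-bit and plain values convert implicitly, but pointers to them do not, hence `reinterpret_cast` for pointer parameters and `static_cast` for scalar values:

```cpp
#include <cstdint>

// Stand-ins for Arrow C++ APIs that take the standard int64_t.
void TakeInt64Pointer(const int64_t* /*values*/, int64_t /*length*/) {}
void TakeInt64Value(int64_t /*value*/) {}

// Pretend gint64 is `long` on a platform where int64_t is `long long`.
using pretend_gint64 = long;

void Example(const pretend_gint64* g_values, pretend_gint64 length) {
  // TakeInt64Pointer(g_values, length);  // would not compile on such a
  //                                      // platform: `const long*` does not
  //                                      // convert to `const long long*`
  auto int64_t_values = reinterpret_cast<const int64_t*>(g_values);
  TakeInt64Pointer(int64_t_values, static_cast<int64_t>(length));  // pointer: reinterpret_cast
  TakeInt64Value(static_cast<int64_t>(length));                    // value: static_cast
}
```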
### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40674 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/array-builder.cpp | 6 ++++-- c_glib/arrow-glib/composite-array.cpp | 7 ++++--- c_glib/gandiva-glib/node.cpp | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 6d8ce4a35ac0a..b498ecb51cedb 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -4995,7 +4995,8 @@ garrow_binary_dictionary_array_builder_append_indices( auto append_function = [&arrow_builder](const gint64 *values, gint64 values_length, const uint8_t *valid_bytes) -> arrow::Status { - return arrow_builder->AppendIndices(values, values_length, valid_bytes); + auto int64_t_values = reinterpret_cast(values); + return arrow_builder->AppendIndices(int64_t_values, values_length, valid_bytes); }; return garrow_array_builder_append_values(values, values_length, @@ -5226,7 +5227,8 @@ garrow_string_dictionary_array_builder_append_indices( auto append_function = [&arrow_builder](const gint64 *values, gint64 values_length, const uint8_t *valid_bytes) -> arrow::Status { - return arrow_builder->AppendIndices(values, values_length, valid_bytes); + auto int64_t_values = reinterpret_cast(values); + return arrow_builder->AppendIndices(int64_t_values, values_length, valid_bytes); }; return garrow_array_builder_append_values(values, values_length, diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp index cc254b26e1e4c..d49b393605453 100644 --- a/c_glib/arrow-glib/composite-array.cpp +++ b/c_glib/arrow-glib/composite-array.cpp @@ -591,9 +591,10 @@ garrow_large_list_array_get_value_length(GArrowLargeListArray *array, gint64 i) const gint64 * garrow_large_list_array_get_value_offsets(GArrowLargeListArray *array, gint64 *n_offsets) { - return garrow_base_list_array_get_value_offsets( - GARROW_ARRAY(array), - n_offsets); + auto value_offsets = + garrow_base_list_array_get_value_offsets(GARROW_ARRAY(array), + n_offsets); + return reinterpret_cast(value_offsets); } typedef struct GArrowStructArrayPrivate_ diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index e83dc41e9274b..fe75b0db03fe3 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -873,7 +873,8 @@ ggandiva_int64_literal_node_class_init(GGandivaInt64LiteralNodeClass *klass) GGandivaInt64LiteralNode * ggandiva_int64_literal_node_new(gint64 value) { - auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + auto int64_t_value = static_cast(value); + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(int64_t_value); return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, NULL)); } @@ -916,7 +917,8 @@ ggandiva_uint64_literal_node_class_init(GGandivaUInt64LiteralNodeClass *klass) GGandivaUInt64LiteralNode * ggandiva_uint64_literal_node_new(guint64 value) { - auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + auto uint64_t_value = static_cast(value); + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(uint64_t_value); return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, NULL)); } From 6cecbab5172b2b339277dde741bfff455646eb32 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 27 Mar 2024 21:13:39 -0800 Subject: [PATCH 18/51] GH-40806: [C++] Correctly report asimd/neon in GetRuntimeInfo (#40857) ### What 
changes are included in this PR? New case to conditional in `MakeSimdLevelString` which makes `GetRuntimeInfo` report correctly on respective CPUs. I chose to have it report "neon". Lowercase to match other strings and "neon" instead of "asimd" because I think that makes more sense to users. I'm not 100% sure which is more correct. Fixes #40806 ### Are these changes tested? We don't have automated tests for this. I did install the R package and, on my M1 laptop it reports 'neon' now instead of 'none' before: ```r > arrow_info() ... SIMD Level neon Detected SIMD Level neon ``` ### Are there any user-facing changes? No. * GitHub Issue: #40806 --- cpp/src/arrow/config.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc index 9e32e5437325f..1f852e84d3d5c 100644 --- a/cpp/src/arrow/config.cc +++ b/cpp/src/arrow/config.cc @@ -58,6 +58,8 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) { return "avx"; } else if (query_flag(CpuInfo::SSE4_2)) { return "sse4_2"; + } else if (query_flag(CpuInfo::ASIMD)) { + return "neon"; } else { return "none"; } From a9b2cc2c962f064c3fa5504909f122e9bcabda3f Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Thu, 28 Mar 2024 06:06:21 -0700 Subject: [PATCH 19/51] GH-40843: [Java] Cleanup protobuf-maven-plugin usage (#40844) ### Rationale for this change `protobuf-maven-plugin` usage in Arrow codebase does not follow plugins best practices like sharing the same output directory for different execution or not using test goals for generating test classes ### What changes are included in this PR? * Add protobuf-maven-plugin plugin to top level pom.xml under pluginManagement to define version and common configuration for all modules * Remove unnecessary executions of test-compile goal when no test protobufs are present * Remove use of outputDirectory and clearOutputDirectory and let the plugin choose it for each execution (the default output directory is based on the phase (main vs test) and the language/plugin-id) * Replace use of compile/compile-custom goals with test-compile/test-compile-custom when generating test protobufs ### Are these changes tested? As those changes are in the build system, they are covered by the build framework and tests run as part of the build ### Are there any user-facing changes? 
None * GitHub Issue: #40843 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/dataset/pom.xml | 11 ++++------ java/flight/flight-core/pom.xml | 16 ++------------- .../src/test/{protobuf => proto}/perf.proto | 0 .../src/test/{protobuf => proto}/test.proto | 0 java/flight/pom.xml | 20 ------------------- java/gandiva/pom.xml | 11 ++++------ java/pom.xml | 10 ++++++++++ 7 files changed, 20 insertions(+), 48 deletions(-) rename java/flight/flight-core/src/test/{protobuf => proto}/perf.proto (100%) rename java/flight/flight-core/src/test/{protobuf => proto}/test.proto (100%) diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index a003fd18068ec..43b913167390f 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -177,18 +177,15 @@ org.xolstice.maven.plugins protobuf-maven-plugin - 0.6.1 - - com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} - - ../../cpp/src/jni/dataset/proto - + src compile - test-compile + + ../../cpp/src/jni/dataset/proto + diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 98491e7ba091e..830caf8a28246 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -228,19 +228,11 @@ org.xolstice.maven.plugins protobuf-maven-plugin - 0.6.1 - - com.google.protobuf:protoc:${dep.protobuf-bom.version}:exe:${os.detected.classifier} - false - grpc-java - io.grpc:protoc-gen-grpc-java:${dep.grpc-bom.version}:exe:${os.detected.classifier} - src ${basedir}/../../../format/ - ${project.build.directory}/generated-sources/protobuf compile @@ -249,13 +241,9 @@ test - - ${basedir}/src/test/protobuf - ${project.build.directory}/generated-test-sources//protobuf - - compile - compile-custom + test-compile + test-compile-custom diff --git a/java/flight/flight-core/src/test/protobuf/perf.proto b/java/flight/flight-core/src/test/proto/perf.proto similarity index 100% rename from java/flight/flight-core/src/test/protobuf/perf.proto rename to java/flight/flight-core/src/test/proto/perf.proto diff --git a/java/flight/flight-core/src/test/protobuf/test.proto b/java/flight/flight-core/src/test/proto/test.proto similarity index 100% rename from java/flight/flight-core/src/test/protobuf/test.proto rename to java/flight/flight-core/src/test/proto/test.proto diff --git a/java/flight/pom.xml b/java/flight/pom.xml index 2f777ab42b756..5b9caafa82ef9 100644 --- a/java/flight/pom.xml +++ b/java/flight/pom.xml @@ -32,26 +32,6 @@ flight-integration-tests - - - - - org.xolstice.maven.plugins - protobuf-maven-plugin - 0.6.1 - - - com.google.protobuf:protoc:${dep.protobuf-bom.version}:exe:${os.detected.classifier} - - grpc-java - io.grpc:protoc-gen-grpc-java:${dep.grpc-bom.version}:exe:${os.detected.classifier} - - - - - - - pin-mockito-jdk8 diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 819baee11edec..0d2a23345f6ea 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -132,18 +132,15 @@ org.xolstice.maven.plugins protobuf-maven-plugin - 0.6.1 - - com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} - - proto - + src compile - test-compile + + proto + diff --git a/java/pom.xml b/java/pom.xml index add2823ccb0d2..659ccfca08c76 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -637,6 +637,16 @@ + + org.xolstice.maven.plugins + protobuf-maven-plugin + 0.6.1 + + com.google.protobuf:protoc:${dep.protobuf-bom.version}:exe:${os.detected.classifier} + grpc-java + io.grpc:protoc-gen-grpc-java:${dep.grpc-bom.version}:exe:${os.detected.classifier} + + From 
edf7e57127766e0e2aa7d14db12d3d3f5f12ecbe Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 28 Mar 2024 12:21:14 -0300 Subject: [PATCH 20/51] MINOR: [C++][Azure][FS] Document some limitations and atomicity guarantees (#40838) ### Rationale for this change Documenting some details of the behavior of destructive filesystem operations. ### What changes are included in this PR? Only docstring changes. ### Are these changes tested? N/A. Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.h | 42 +++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 308347426ae26..350014954f056 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -264,15 +264,35 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { Status CreateDir(const std::string& path, bool recursive) override; + /// \brief Delete a directory and its contents recursively. + /// + /// Atomicity is guaranteed only on Hierarchical Namespace Storage accounts. Status DeleteDir(const std::string& path) override; + /// \brief Non-atomically deletes the contents of a directory. + /// + /// This function can return a bad Status after only partially deleting the + /// contents of the directory. Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + /// \brief Deletion of all the containers in the storage account (not + /// implemented for safety reasons). + /// + /// \return Status::NotImplemented Status DeleteRootDirContents() override; + /// \brief Deletes a file. + /// + /// Supported on both flat namespace and Hierarchical Namespace storage + /// accounts. A check is made to guarantee the parent directory doesn't + /// disappear after the blob is deleted and while this operation is running, + /// no other client can delete the parent directory due to the use of leases. + /// + /// This means applications can safely retry this operation without coordination to + /// guarantee only one client/process is trying to delete the same file. Status DeleteFile(const std::string& path) override; - /// \brief Move / rename a file or directory. + /// \brief Move/rename a file or directory. /// /// There are no files immediately at the root directory, so paths like /// "/segment" always refer to a container of the storage account and are @@ -282,6 +302,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { /// guarantees `dest` is not lost. /// /// Conditions for a successful move: + /// /// 1. `src` must exist. /// 2. `dest` can't contain a strict path prefix of `src`. More generally, /// a directory can't be made a subdirectory of itself. @@ -291,6 +312,25 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { /// 5. If `dest` already exists and it's a directory, `src` must also be a /// directory and `dest` must be empty. `dest` is then replaced by `src` /// and its contents. + /// + /// Leases are used to guarantee the pre-condition checks and the rename + /// operation are atomic: other clients can't invalidate the pre-condition in + /// the time between the checks and the actual rename operation. + /// + /// This is possible because Move() is only support on storage accounts with + /// Hierarchical Namespace Support enabled. 
+ /// + /// ## Limitations + /// + /// - Moves are not supported on storage accounts without + /// Hierarchical Namespace support enabled + /// - Moves across different containers are not supported + /// - Moving a path of the form `/container` is not supported as it would + /// require moving all the files in a container to another container. + /// The only exception is a `Move("/container_a", "/container_b")` where + /// both containers are empty or `container_b` doesn't even exist. + /// The atomicity of the emptiness checks followed by the renaming operation + /// is guaranteed by the use of leases. Status Move(const std::string& src, const std::string& dest) override; Status CopyFile(const std::string& src, const std::string& dest) override; From cf832b8b5dd91ca1b70519fa544f0a44ebdb3bce Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 28 Mar 2024 23:23:59 +0800 Subject: [PATCH 21/51] GH-40863: [C++] Fix TSAN link error for module library (#40864) ### Rationale for this change Module library `arrow_filesystem_example` is introduced in #39067 for filesystem testing: https://github.com/apache/arrow/blob/6cecbab5172b2b339277dde741bfff455646eb32/cpp/src/arrow/testing/CMakeLists.txt#L25 However when built with TSAN, linker flags such as `-fsanitize=thread` is not set, causing the link error in #40863. ### What changes are included in this PR? Add necessary linker flags for module library. ### Are these changes tested? Manually tested. ### Are there any user-facing changes? None. * GitHub Issue: #40863 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/cmake_modules/san-config.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index 2221dc16665ac..8c2983e18b40a 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -78,6 +78,7 @@ if(${ARROW_USE_TSAN}) # Some of the above also need to be passed to the linker. set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pie -fsanitize=thread") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -pie -fsanitize=thread") # Strictly speaking, TSAN doesn't require dynamic linking. But it does # require all code to be position independent, and the easiest way to From 4f39e6eac9f24b37b0866c432c030de2eaef78e1 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 29 Mar 2024 01:17:33 +0800 Subject: [PATCH 22/51] GH-40507: [C++][ORC] Upgrade ORC to 2.0.0 (#40508) ### Rationale for this change This PR aims to upgrade to a new major version of Apache ORC: https://orc.apache.org/news/2024/03/08/ORC-2.0.0/ ### What changes are included in this PR? This PR upgrades ORC dependency from 1.9.2 to 2.0.0. ### Are these changes tested? Pass the CIs. ### Are there any user-facing changes? No. 
* GitHub Issue: #40507 Lead-authored-by: Antoine Pitrou Co-authored-by: Gang Wu Signed-off-by: Antoine Pitrou --- ci/scripts/python_wheel_macos_build.sh | 9 ++++++++- ci/scripts/python_wheel_manylinux_build.sh | 1 - cpp/thirdparty/versions.txt | 4 ++-- dev/tasks/python-wheels/github.osx.yml | 4 ++++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index bea5409100770..a94dac40e931f 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -91,6 +91,13 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${VCPKG_FEATURE_FLAGS:=-manifests} : ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-osx-static-${CMAKE_BUILD_TYPE}}} +echo "=== Protobuf compiler versions on PATH ===" +which -a protoc || echo "no protoc on PATH!" + +echo "=== Protobuf compiler version from vcpkg ===" +_pbc=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc +echo "$_pbc: `$_pbc --version`" + mkdir -p ${build_dir}/build pushd ${build_dir}/build @@ -122,6 +129,7 @@ cmake \ -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT} \ -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ -DARROW_USE_CCACHE=ON \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=ON \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ -DARROW_WITH_BZ2=${ARROW_WITH_BZ2} \ -DARROW_WITH_LZ4=${ARROW_WITH_LZ4} \ @@ -134,7 +142,6 @@ cmake \ -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 4d4d4fb694e0b..6e29ef58d2318 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -123,7 +123,6 @@ cmake \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 760b19f71e2e0..4093b0ec43efd 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -90,8 +90,8 @@ ARROW_OPENTELEMETRY_BUILD_VERSION=v1.8.1 ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM=3d640201594b07f08dade9cd1017bd0b59674daca26223b560b9bb6bf56264c2 ARROW_OPENTELEMETRY_PROTO_BUILD_VERSION=v0.17.0 ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM=f269fbcb30e17b03caa1decd231ce826e59d7651c0f71c3b28eb5140b4bb5412 -ARROW_ORC_BUILD_VERSION=1.9.2 -ARROW_ORC_BUILD_SHA256_CHECKSUM=7f46f2c184ecefd6791f1a53fb062286818bd8710c3f08b94dd3cac365e240ee +ARROW_ORC_BUILD_VERSION=2.0.0 +ARROW_ORC_BUILD_SHA256_CHECKSUM=9107730919c29eb39efaff1b9e36166634d1d4d9477e5fee76bfd6a8fec317df ARROW_PROTOBUF_BUILD_VERSION=v21.3 ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0128ce4c946b8c78c8c49f # Because of https://github.com/Tencent/rapidjson/pull/1323, we require diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index 11bdf031f51bd..e7b6d7898103b 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ 
b/dev/tasks/python-wheels/github.osx.yml @@ -47,6 +47,10 @@ jobs: brew install bash bison coreutils ninja echo "$(brew --prefix bison)/bin" >> $GITHUB_PATH + - name: Homebrew packages + run: | + brew list + - name: Retrieve VCPKG version from arrow/.env run: | vcpkg_version=$(cat "arrow/.env" | grep "VCPKG" | cut -d "=" -f2 | tr -d '"') From 683a78bb8a7a3ff2e252a70ef00d796a758b4527 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 28 Mar 2024 16:03:49 -0300 Subject: [PATCH 23/51] GH-40870: [C#] Update CompareValidityBuffer() to pass when unspecified final bits are not identical (#40873) ### Rationale for this change Before fixing nanoarrow's testing JSON reader to align with other implementations and properly zero out the last few bits, integration tests failed because C#'s `CompareValidityBuffer()` was comparing the bytes of the validity buffer (including undefined final bits that are maybe not identical due to uninitialized memory or because the arrays are slices). ### What changes are included in this PR? `CompareValidityBuffer()` now compares the memory for all except the last byte and compares the last byte bitwise. ### Are these changes tested? They should be but I am not sure exactly where to add the test! ### Are there any user-facing changes? No * GitHub Issue: #40870 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 2e7488092c2cf..ceeab92860e6f 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -432,12 +432,27 @@ private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer e { Assert.True(expectedValidityBuffer.Span.SequenceEqual(actualValidityBuffer.Span)); } - else if (nullCount != 0) + else if (nullCount != 0 && arrayLength > 0) { int validityBitmapByteCount = BitUtility.ByteCount(arrayLength); + ReadOnlySpan expectedSpanPartial = expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount - 1); + ReadOnlySpan actualSpanPartial = actualValidityBuffer.Span.Slice(0, validityBitmapByteCount - 1); + + // Compare the first validityBitmapByteCount - 1 bytes Assert.True( - expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount).SequenceEqual(actualValidityBuffer.Span.Slice(0, validityBitmapByteCount)), - "Validity buffers do not match."); + expectedSpanPartial.SequenceEqual(actualSpanPartial), + string.Format("First {0} bytes of validity buffer do not match", validityBitmapByteCount - 1)); + + // Compare the last byte bitwise (because there is no guarantee about the value of + // bits outside the range [0, arrayLength]) + ReadOnlySpan expectedSpanFull = expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount); + ReadOnlySpan actualSpanFull = actualValidityBuffer.Span.Slice(0, validityBitmapByteCount); + for (int i = 8 * (validityBitmapByteCount - 1); i < arrayLength; i++) + { + Assert.True( + BitUtility.GetBit(expectedSpanFull, i) == BitUtility.GetBit(actualSpanFull, i), + string.Format("Bit at index {0}/{1} is not equal", i, arrayLength)); + } } } } From 1feb945c1dc61afeaa6bfd412d0c7eaa71a1c139 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 28 Mar 2024 11:26:10 -0800 Subject: [PATCH 24/51] GH-40858: [R] Remove dangling commas from codegen.R (#40859) ### Rationale for this change This is a draft PR 
fixing https://github.com/apache/arrow/issues/40858, though I'm not sure how or why this broke (or worked correctly). Fixes #40858 ### Are these changes tested? These have been tested locally. * GitHub Issue: #40858 Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- r/data-raw/codegen.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index e8d53467d4589..4f027a3d9ddc7 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -145,7 +145,7 @@ cpp_functions_definitions <- arrow_exports %>% // {basename(file)} {ifdef_wrap(cpp11_wrapped, name, sexp_signature, decoration)} ", - sep = "\n", + sep = "\n" ) }) %>% glue_collapse(sep = "\n") @@ -176,7 +176,7 @@ arrow_exports_cpp <- paste0( static const R_CallMethodDef CallEntries[] = { ", glue::glue_collapse(glue::glue( - '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},', + '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},' ), sep = "\n"), glue::glue("\n {cpp_functions_registration} @@ -217,7 +217,7 @@ r_functions <- arrow_exports %>% ", list_params = glue_collapse_data(args, "{name}"), - sep = "\n", + sep = "\n" ) }) %>% glue_collapse(sep = "\n") From 950fbb62ce7388aad926c5af5861bf07f7db6de1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 28 Mar 2024 15:59:14 -0400 Subject: [PATCH 25/51] GH-40733: [Go] Require Go 1.21 or later (#40848) ### Rationale for this change Bumping to require Go 1.21 or later as 1.20 is EOL * GitHub Issue: #40733 Authored-by: Matt Topol Signed-off-by: Matt Topol --- .env | 4 +- .github/workflows/go.yml | 28 +++++---- ci/docker/conda-integration.dockerfile | 2 +- ci/docker/debian-12-go.dockerfile | 4 +- dev/release/verify-release-candidate.sh | 6 +- dev/tasks/tasks.yml | 2 +- go/arrow/bitutil/bitutil.go | 35 +----------- .../bitutil/bitutil_bytes.go} | 26 ++++----- go/arrow/cdata/cdata_allocate.go | 57 +++++++++++++++++++ go/arrow/cdata/cdata_exports.go | 55 ------------------ go/arrow/compute/exec/span.go | 17 ------ go/arrow/compute/exec/span_offsets.go | 36 ++++++++++++ go/arrow/compute/fieldref.go | 17 ------ go/arrow/compute/fieldref_hash.go | 39 +++++++++++++ go/arrow/doc.go | 2 - .../flight/flightsql/driver/driver_test.go | 1 + go/arrow/memory/mallocator/mallocator.go | 11 ++-- go/arrow/memory/mallocator/mallocator_util.go | 26 +++++++++ go/go.mod | 2 +- go/internal/hashing/hash_string.go | 4 ++ go/internal/hashing/xxh3_memo_table.go | 9 +-- go/parquet/types.go | 44 +++----------- 22 files changed, 219 insertions(+), 208 deletions(-) rename go/{internal/hashing/hash_string_go1.19.go => arrow/bitutil/bitutil_bytes.go} (58%) create mode 100644 go/arrow/cdata/cdata_allocate.go create mode 100644 go/arrow/compute/exec/span_offsets.go create mode 100644 go/arrow/compute/fieldref_hash.go create mode 100644 go/arrow/memory/mallocator/mallocator_util.go diff --git a/.env b/.env index b5c66563f5f7d..298c100c094b0 100644 --- a/.env +++ b/.env @@ -58,8 +58,8 @@ CUDA=11.2.2 DASK=latest DOTNET=7.0 GCC_VERSION="" -GO=1.19.13 -STATICCHECK=v0.4.5 +GO=1.21.8 +STATICCHECK=v0.4.7 HDFS=3.2.1 JDK=8 KARTOTHEK=latest diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 47148d9568c18..7ff781d35e8ec 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -59,13 +59,13 @@ jobs: { "arch-label": "AMD64", "arch": "amd64", - "go": "1.19", + "go": "1.21", "runs-on": "ubuntu-latest" }, { "arch-label": "AMD64", "arch": "amd64", - "go": "1.20", + "go": "1.22", "runs-on": "ubuntu-latest" } 
JSON @@ -75,13 +75,13 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", - "go": "1.19", + "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", - "go": "1.20", + "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } JSON @@ -169,10 +169,13 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Get required Go version + run: | + (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - name: Install Go uses: actions/setup-go@v5 with: - go-version: 1.19 + go-version: "${{ env.GO_VERSION }}" cache: true cache-dependency-path: go/go.sum - name: Run build @@ -188,7 +191,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] env: GO: ${{ matrix.go }} steps: @@ -229,7 +232,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] env: GO: ${{ matrix.go }} steps: @@ -268,7 +271,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -301,7 +304,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -359,7 +362,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] env: ARROW_GO_TESTCGO: "1" steps: @@ -428,6 +431,9 @@ jobs: shell: msys2 {0} run: | ci/scripts/msys2_setup.sh cgo + - name: Get required Go version + run: | + (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - name: Update CGO Env vars shell: msys2 {0} run: | @@ -437,7 +443,7 @@ jobs: - name: Install go uses: actions/setup-go@v5 with: - go-version: '1.19' + go-version: "${{ env.GO_VERSION }}" cache: true cache-dependency-path: go/go.sum - name: Install staticcheck diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 8406a419c06ab..a747ccbc7262f 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -24,7 +24,7 @@ ARG maven=3.8.7 ARG node=16 ARG yarn=1.22 ARG jdk=8 -ARG go=1.19.13 +ARG go=1.21.8 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ diff --git a/ci/docker/debian-12-go.dockerfile b/ci/docker/debian-12-go.dockerfile index 7c077910a67a0..c958e6bdee211 100644 --- a/ci/docker/debian-12-go.dockerfile +++ b/ci/docker/debian-12-go.dockerfile @@ -16,8 +16,8 @@ # under the License. ARG arch=amd64 -ARG go=1.19 -ARG staticcheck=v0.4.5 +ARG go=1.21 +ARG staticcheck=v0.4.7 FROM ${arch}/golang:${go}-bookworm # FROM collects all the args, get back the staticcheck version arg diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index d74ce1f67066d..e7d78328ed16c 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -24,7 +24,7 @@ # - JDK >=8 # - gcc >= 4.8 # - Node.js >= 18 -# - Go >= 1.19 +# - Go >= 1.21 # - Docker # # If using a non-system Boost, set BOOST_ROOT and add Boost libraries to @@ -405,7 +405,7 @@ install_go() { return 0 fi - local version=1.19.13 + local version=1.21.8 show_info "Installing go version ${version}..." local arch="$(uname -m)" @@ -953,7 +953,7 @@ test_go() { show_header "Build and test Go libraries" maybe_setup_go - maybe_setup_conda compilers go=1.19 + maybe_setup_conda compilers go=1.21 pushd go go get -v ./... 
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 15b687b2d2fad..2abfbc15174df 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1415,7 +1415,7 @@ tasks: R_PRUNE_DEPS: TRUE image: fedora-r-clang-sanitizer - {% for go_version, staticcheck in [("1.19", "v0.4.5"), ("1.21", "latest")] %} + {% for go_version, staticcheck in [("1.21", "v0.4.7"), ("1.22", "latest")] %} test-debian-12-go-{{ go_version }}: ci: azure template: docker-tests/azure.linux.yml diff --git a/go/arrow/bitutil/bitutil.go b/go/arrow/bitutil/bitutil.go index 82747ee1417b8..6a8f75410363f 100644 --- a/go/arrow/bitutil/bitutil.go +++ b/go/arrow/bitutil/bitutil.go @@ -19,7 +19,6 @@ package bitutil import ( "math" "math/bits" - "reflect" "unsafe" "github.com/apache/arrow/go/v16/arrow/memory" @@ -99,8 +98,6 @@ func countSetBitsWithOffset(buf []byte, offset, n int) int { count := 0 beg := offset - end := offset + n - begU8 := roundUp(beg, uint64SizeBits) init := min(n, begU8-beg) @@ -110,27 +107,8 @@ func countSetBitsWithOffset(buf []byte, offset, n int) int { } } - nU64 := (n - init) / uint64SizeBits - begU64 := begU8 / uint64SizeBits - endU64 := begU64 + nU64 - bufU64 := bytesToUint64(buf) - if begU64 < len(bufU64) { - for _, v := range bufU64[begU64:endU64] { - count += bits.OnesCount64(v) - } - } - - // FIXME: use a fallback to bits.OnesCount8 - // before counting the tail bits. - - tail := beg + init + nU64*uint64SizeBits - for i := tail; i < end; i++ { - if BitIsSet(buf, i) { - count++ - } - } - - return count + begU64 := BytesForBits(int64(beg + init)) + return count + CountSetBits(buf[begU64:], 0, n-init) } func roundUp(v, f int) int { @@ -149,15 +127,6 @@ const ( uint64SizeBits = uint64SizeBytes * 8 ) -func bytesToUint64(b []byte) []uint64 { - if cap(b) < uint64SizeBytes { - return nil - } - - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - return unsafe.Slice((*uint64)(unsafe.Pointer(h.Data)), cap(b)/uint64SizeBytes)[:len(b)/uint64SizeBytes] -} - var ( // PrecedingBitmask is a convenience set of values as bitmasks for checking // prefix bits of a byte diff --git a/go/internal/hashing/hash_string_go1.19.go b/go/arrow/bitutil/bitutil_bytes.go similarity index 58% rename from go/internal/hashing/hash_string_go1.19.go rename to go/arrow/bitutil/bitutil_bytes.go index f38eb5c523dde..09dd5cbc67d39 100644 --- a/go/internal/hashing/hash_string_go1.19.go +++ b/go/arrow/bitutil/bitutil_bytes.go @@ -14,24 +14,24 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-//go:build !go1.20 && !tinygo +//go:build go1.20 || tinygo -package hashing +package bitutil import ( - "reflect" "unsafe" ) -func hashString(val string, alg uint64) uint64 { - if val == "" { - return Hash([]byte{}, alg) +func bytesToUint64(b []byte) []uint64 { + if len(b) < uint64SizeBytes { + return nil } - // highly efficient way to get byte slice without copy before - // the introduction of unsafe.StringData in go1.20 - // (https://stackoverflow.com/questions/59209493/how-to-use-unsafe-get-a-byte-slice-from-a-string-without-memory-copy) - const MaxInt32 = 1<<31 - 1 - buf := (*[MaxInt32]byte)(unsafe.Pointer((*reflect.StringHeader)( - unsafe.Pointer(&val)).Data))[: len(val)&MaxInt32 : len(val)&MaxInt32] - return Hash(buf, alg) + + ptr := unsafe.SliceData(b) + if ptr == nil { + return nil + } + + return unsafe.Slice((*uint64)(unsafe.Pointer(ptr)), + len(b)/uint64SizeBytes) } diff --git a/go/arrow/cdata/cdata_allocate.go b/go/arrow/cdata/cdata_allocate.go new file mode 100644 index 0000000000000..da0bd957de1df --- /dev/null +++ b/go/arrow/cdata/cdata_allocate.go @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build go1.20 || tinygo + +package cdata + +// #include +// #include "arrow/c/abi.h" +import "C" + +import ( + "unsafe" +) + +func allocateArrowSchemaArr(n int) (out []CArrowSchema) { + return unsafe.Slice((*CArrowSchema)(C.calloc(C.size_t(n), + C.sizeof_struct_ArrowSchema)), n) +} + +func allocateArrowSchemaPtrArr(n int) (out []*CArrowSchema) { + return unsafe.Slice((**CArrowSchema)(C.calloc(C.size_t(n), + C.size_t(unsafe.Sizeof((*CArrowSchema)(nil))))), n) +} + +func allocateArrowArrayArr(n int) (out []CArrowArray) { + return unsafe.Slice((*CArrowArray)(C.calloc(C.size_t(n), + C.sizeof_struct_ArrowArray)), n) +} + +func allocateArrowArrayPtrArr(n int) (out []*CArrowArray) { + return unsafe.Slice((**CArrowArray)(C.calloc(C.size_t(n), + C.size_t(unsafe.Sizeof((*CArrowArray)(nil))))), n) +} + +func allocateBufferPtrArr(n int) (out []*C.void) { + return unsafe.Slice((**C.void)(C.calloc(C.size_t(n), + C.size_t(unsafe.Sizeof((*C.void)(nil))))), n) +} + +func allocateBufferSizeArr(n int) (out []C.int64_t) { + return unsafe.Slice((*C.int64_t)(C.calloc(C.size_t(n), + C.sizeof_int64_t)), n) +} diff --git a/go/arrow/cdata/cdata_exports.go b/go/arrow/cdata/cdata_exports.go index d59c87712eedf..fecc8610bf2a0 100644 --- a/go/arrow/cdata/cdata_exports.go +++ b/go/arrow/cdata/cdata_exports.go @@ -39,7 +39,6 @@ import ( "bytes" "encoding/binary" "fmt" - "reflect" "runtime/cgo" "strconv" "strings" @@ -291,60 +290,6 @@ func (exp *schemaExporter) export(field arrow.Field) { exp.exportMeta(&field.Metadata) } -func allocateArrowSchemaArr(n int) (out []CArrowSchema) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.sizeof_struct_ArrowSchema)) - s.Len = n - s.Cap = n - - return -} - -func allocateArrowSchemaPtrArr(n int) (out []*CArrowSchema) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*CArrowSchema)(nil))))) - s.Len = n - s.Cap = n - - return -} - -func allocateArrowArrayArr(n int) (out []CArrowArray) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.sizeof_struct_ArrowArray)) - s.Len = n - s.Cap = n - - return -} - -func allocateArrowArrayPtrArr(n int) (out []*CArrowArray) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*CArrowArray)(nil))))) - s.Len = n - s.Cap = n - - return -} - -func allocateBufferPtrArr(n int) (out []*C.void) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*C.void)(nil))))) - s.Len = n - s.Cap = n - - return -} - -func allocateBufferSizeArr(n int) (out []C.int64_t) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof(int64(0))))) - s.Len = n - s.Cap = n - - return -} - func (exp *schemaExporter) finish(out *CArrowSchema) { out.dictionary = nil if exp.dict != nil { diff --git a/go/arrow/compute/exec/span.go b/go/arrow/compute/exec/span.go index 6f9bb240e3469..4425784f25c94 100644 --- a/go/arrow/compute/exec/span.go +++ b/go/arrow/compute/exec/span.go @@ -19,7 +19,6 @@ package exec import ( - "reflect" "sync/atomic" "unsafe" @@ -250,22 +249,6 @@ func (a *ArraySpan) resizeChildren(i int) { } } -// convenience function for populating the offsets buffer from a scalar -// value's size. 
-func setOffsetsForScalar[T int32 | int64](span *ArraySpan, buf []T, valueSize int64, bufidx int) { - buf[0] = 0 - buf[1] = T(valueSize) - - b := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) - s := (*reflect.SliceHeader)(unsafe.Pointer(&span.Buffers[bufidx].Buf)) - s.Data = b.Data - s.Len = 2 * int(unsafe.Sizeof(T(0))) - s.Cap = s.Len - - span.Buffers[bufidx].Owner = nil - span.Buffers[bufidx].SelfAlloc = false -} - // FillFromScalar populates this ArraySpan as if it were a 1 length array // with the single value equal to the passed in Scalar. func (a *ArraySpan) FillFromScalar(val scalar.Scalar) { diff --git a/go/arrow/compute/exec/span_offsets.go b/go/arrow/compute/exec/span_offsets.go new file mode 100644 index 0000000000000..d2d0398884c9d --- /dev/null +++ b/go/arrow/compute/exec/span_offsets.go @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.20 || tinygo + +package exec + +import ( + "unsafe" +) + +// convenience function for populating the offsets buffer from a scalar +// value's size. +func setOffsetsForScalar[T int32 | int64](span *ArraySpan, buf []T, valueSize int64, bufidx int) { + buf[0] = 0 + buf[1] = T(valueSize) + + span.Buffers[bufidx].Buf = unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(buf))), + 2*int(unsafe.Sizeof(T(0)))) + + span.Buffers[bufidx].Owner = nil + span.Buffers[bufidx].SelfAlloc = false +} diff --git a/go/arrow/compute/fieldref.go b/go/arrow/compute/fieldref.go index ab6d856f85f0d..0c55c36dab243 100644 --- a/go/arrow/compute/fieldref.go +++ b/go/arrow/compute/fieldref.go @@ -20,12 +20,10 @@ import ( "errors" "fmt" "hash/maphash" - "math/bits" "reflect" "strconv" "strings" "unicode" - "unsafe" "github.com/apache/arrow/go/v16/arrow" "github.com/apache/arrow/go/v16/arrow/array" @@ -168,21 +166,6 @@ func (f FieldPath) GetColumn(batch arrow.Record) (arrow.Array, error) { return f.getArray(batch.Columns()) } -func (f FieldPath) hash(h *maphash.Hash) { - raw := (*reflect.SliceHeader)(unsafe.Pointer(&f)).Data - - var b []byte - s := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - s.Data = raw - if bits.UintSize == 32 { - s.Len = arrow.Int32Traits.BytesRequired(len(f)) - } else { - s.Len = arrow.Int64Traits.BytesRequired(len(f)) - } - s.Cap = s.Len - h.Write(b) -} - func (f FieldPath) findAll(fields []arrow.Field) []FieldPath { _, err := f.GetFieldFromSlice(fields) if err == nil { diff --git a/go/arrow/compute/fieldref_hash.go b/go/arrow/compute/fieldref_hash.go new file mode 100644 index 0000000000000..dace05788bb46 --- /dev/null +++ b/go/arrow/compute/fieldref_hash.go @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.20 || tinygo + +package compute + +import ( + "hash/maphash" + "math/bits" + "unsafe" + + "github.com/apache/arrow/go/v16/arrow" +) + +func (f FieldPath) hash(h *maphash.Hash) { + raw := unsafe.Pointer(unsafe.SliceData(f)) + var byteLen int + if bits.UintSize == 32 { + byteLen = arrow.Int32Traits.BytesRequired(len(f)) + } else { + byteLen = arrow.Int64Traits.BytesRequired(len(f)) + } + + h.Write(unsafe.Slice((*byte)(raw), byteLen)) +} diff --git a/go/arrow/doc.go b/go/arrow/doc.go index 2f7c8c2acf1ce..19f24c5d0b8c3 100644 --- a/go/arrow/doc.go +++ b/go/arrow/doc.go @@ -30,8 +30,6 @@ array is valid (not null). If the array has no null entries, it is possible to o # Requirements -Despite the go.mod stating go1.20, everything is able to be built with go1.19 or higher. - To build with tinygo include the noasm build tag. */ package arrow diff --git a/go/arrow/flight/flightsql/driver/driver_test.go b/go/arrow/flight/flightsql/driver/driver_test.go index 79955f6099f8a..11b9036519d79 100644 --- a/go/arrow/flight/flightsql/driver/driver_test.go +++ b/go/arrow/flight/flightsql/driver/driver_test.go @@ -619,6 +619,7 @@ func (s *SqlTestSuite) TestRowsPrematureCloseDuringNextLoop() { require.NoError(t, err) require.Equal(t, int64(rowCount), insertedRows) + time.Sleep(200 * time.Millisecond) // Do query const sqlSelectAll = `SELECT id, name, value FROM ` + tableName diff --git a/go/arrow/memory/mallocator/mallocator.go b/go/arrow/memory/mallocator/mallocator.go index a111f009ec52d..59d240a1063e8 100644 --- a/go/arrow/memory/mallocator/mallocator.go +++ b/go/arrow/memory/mallocator/mallocator.go @@ -30,7 +30,6 @@ package mallocator import "C" import ( - "reflect" "sync/atomic" "unsafe" ) @@ -70,18 +69,18 @@ func (alloc *Mallocator) Allocate(size int) []byte { } func (alloc *Mallocator) Free(b []byte) { - sh := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - C.free(unsafe.Pointer(sh.Data)) + sz := len(b) + C.free(getPtr(b)) // Subtract sh.Len via two's complement (since atomic doesn't offer subtract) - atomic.AddUint64(&alloc.allocatedBytes, ^(uint64(sh.Len) - 1)) + atomic.AddUint64(&alloc.allocatedBytes, ^(uint64(sz) - 1)) } func (alloc *Mallocator) Reallocate(size int, b []byte) []byte { if size < 0 { panic("mallocator: negative size") } - sh := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - ptr, err := C.realloc_and_initialize(unsafe.Pointer(sh.Data), C.size_t(sh.Cap), C.size_t(size)) + cp := cap(b) + ptr, err := C.realloc_and_initialize(getPtr(b), C.size_t(cp), C.size_t(size)) if err != nil { panic(err) } else if ptr == nil && size != 0 { diff --git a/go/arrow/memory/mallocator/mallocator_util.go b/go/arrow/memory/mallocator/mallocator_util.go new file mode 100644 index 0000000000000..0ab5f8f515e17 --- /dev/null +++ b/go/arrow/memory/mallocator/mallocator_util.go @@ -0,0 +1,26 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build go1.20 || tinygo + +package mallocator + +import "unsafe" + +func getPtr(b []byte) unsafe.Pointer { + return unsafe.Pointer(unsafe.SliceData(b)) +} diff --git a/go/go.mod b/go/go.mod index 5c297c74d6080..2f788c5c26b02 100644 --- a/go/go.mod +++ b/go/go.mod @@ -16,7 +16,7 @@ module github.com/apache/arrow/go/v16 -go 1.20 +go 1.21 require ( github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c diff --git a/go/internal/hashing/hash_string.go b/go/internal/hashing/hash_string.go index b772c7d7f8998..c8579c1ec5eaa 100644 --- a/go/internal/hashing/hash_string.go +++ b/go/internal/hashing/hash_string.go @@ -24,3 +24,7 @@ func hashString(val string, alg uint64) uint64 { buf := unsafe.Slice(unsafe.StringData(val), len(val)) return Hash(buf, alg) } + +func strToBytes(v string) []byte { + return unsafe.Slice(unsafe.StringData(v), len(v)) +} diff --git a/go/internal/hashing/xxh3_memo_table.go b/go/internal/hashing/xxh3_memo_table.go index 283bc1a953f05..fbb8b33531bbd 100644 --- a/go/internal/hashing/xxh3_memo_table.go +++ b/go/internal/hashing/xxh3_memo_table.go @@ -22,7 +22,6 @@ package hashing import ( "bytes" "math" - "reflect" "unsafe" ) @@ -183,13 +182,7 @@ func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { case ByteSlice: return v.Bytes() case string: - var out []byte - h := (*reflect.StringHeader)(unsafe.Pointer(&v)) - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = h.Data - s.Len = h.Len - s.Cap = h.Len - return out + return strToBytes(v) default: panic("invalid type for binarymemotable") } diff --git a/go/parquet/types.go b/go/parquet/types.go index 8742c3ba8bfba..5447e793b4ea6 100644 --- a/go/parquet/types.go +++ b/go/parquet/types.go @@ -95,27 +95,13 @@ type int96Traits struct{} func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n } func (int96Traits) CastFromBytes(b []byte) []Int96 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []Int96 - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len / Int96SizeBytes - s.Cap = h.Cap / Int96SizeBytes - - return res + return unsafe.Slice((*Int96)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)/Int96SizeBytes) } func (int96Traits) CastToBytes(b []Int96) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []byte - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len * Int96SizeBytes - s.Cap = h.Cap * Int96SizeBytes - - return res + return unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)*Int96SizeBytes) } // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice @@ -142,15 +128,8 @@ func (byteArrayTraits) BytesRequired(n int) int { } 
func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []ByteArray - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len / ByteArraySizeBytes - s.Cap = h.Cap / ByteArraySizeBytes - - return res + return unsafe.Slice((*ByteArray)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)/ByteArraySizeBytes) } // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice @@ -177,15 +156,8 @@ func (fixedLenByteArrayTraits) BytesRequired(n int) int { } func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []FixedLenByteArray - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len / FixedLenByteArraySizeBytes - s.Cap = h.Cap / FixedLenByteArraySizeBytes - - return res + return unsafe.Slice((*FixedLenByteArray)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)/FixedLenByteArraySizeBytes) } // Creating our own enums allows avoiding the transitive dependency on the From 7d1111214d70e2fd069962efb4d8d42a2829e95b Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 28 Mar 2024 16:05:03 -0400 Subject: [PATCH 26/51] GH-40847: [Go] update readme (#40877) ### Rationale for this change Remove reference to deleted internal package * GitHub Issue: #40847 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/README.md b/go/README.md index 4a9e151ddf234..20bd7cd77575e 100644 --- a/go/README.md +++ b/go/README.md @@ -87,8 +87,8 @@ advanced optimizer and generate PLAN9 assembly functions from C/C++ code. The arrow package can be compiled without these optimizations using the `noasm` build tag. Alternatively, by configuring an environment variable, it is possible to dynamically configure which architecture optimizations are used at -runtime. See the `cpu` package [README](arrow/internal/cpu/README.md) for a -description of this environment variable. +runtime. We use the (cpu)[https://pkg.go.dev/golang.org/x/sys/cpu] package to +check dynamically for these features. ### Example Usage From 29314394d3c17e332cb3bb42464dd20888d88a74 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Mar 2024 06:07:08 +0900 Subject: [PATCH 27/51] MINOR: [Java] Bump org.apache.maven.plugins:maven-surefire-plugin from 3.2.3 to 3.2.5 in /java (#40525) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 3.2.3 to 3.2.5.
Release notes

Sourced from org.apache.maven.plugins:maven-surefire-plugin's releases.

3.2.5

JIRA link: Release Notes - Maven Surefire - Version 3.2.5

What's Changed

... (truncated)

Commits
  • 4b3a271 [maven-release-plugin] prepare release surefire-3.2.5
  • eb3f1d9 Bump org.codehaus.plexus:plexus-component-metadata from 2.1.1 to 2.2.0
  • 430c406 Bump org.assertj:assertj-core from 3.24.2 to 3.25.1
  • 2d92f2d [SUREFIRE-2231] JaCoCo 0.8.11 fails with old TestNG releases on Java 17+
  • 3290740 Bump org.apache.maven.plugins:maven-docck-plugin from 1.1 to 1.2
  • 25a9776 Bump net.java.dev.javacc:javacc from 7.0.12 to 7.0.13
  • 7752f7e Bump commons-io:commons-io from 2.15.0 to 2.15.1
  • 8874add Revert "Bump jacocoVersion from 0.8.8 to 0.8.11"
  • c0f7755 Fix formatting
  • e5f4545 Bump jacocoVersion from 0.8.8 to 0.8.11
  • Additional commits viewable in compare view

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/performance/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/performance/pom.xml b/java/performance/pom.xml index d3bba882a0898..3f69be32a20e5 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -207,7 +207,7 @@ maven-surefire-plugin - 3.2.3 + 3.2.5 diff --git a/java/pom.xml b/java/pom.xml index 659ccfca08c76..850b4d0508539 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -454,7 +454,7 @@ maven-surefire-plugin - 3.2.3 + 3.2.5 org.junit.jupiter From 50ca7a76d38e6ecf19589bc44f46bffd1db0d4c8 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Thu, 28 Mar 2024 17:09:18 -0400 Subject: [PATCH 28/51] GH-40716: [Java][Integration] Fix test_package_java in verification scripts (#40724) ### Rationale for this change JPMS changed the location of JNI libs in the dist dir. ### What changes are included in this PR? * Update the dist path in the verification script ### Are these changes tested? CI ### Are there any user-facing changes? No * GitHub Issue: #40716 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- dev/release/verify-release-candidate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index e7d78328ed16c..f18b18aaa997c 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -642,8 +642,8 @@ test_package_java() { normalized_arch=x86_64 ;; esac - mkdir -p ${dist_dir}/${normalized_arch}/ - mv ${install_dir}/lib/* ${dist_dir}/${normalized_arch}/ + mkdir -p ${dist_dir} + mv ${install_dir}/lib/* ${dist_dir} mvn install \ -Darrow.c.jni.dist.dir=${dist_dir} \ -Parrow-c-data From ed8c3630dbe2261bed9123a4ccfc7df0e3f031bd Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 29 Mar 2024 08:29:28 +0100 Subject: [PATCH 29/51] GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842) ### Rationale for this change The work on the conversion from `Table`/`RecordBatch` to `Tensor` is progressing and we have to make sure to add information to the documentation. ### What changes are included in this PR? I propose to add - new page (`converting_recordbatch_to_tensor.rst`) in the `cpp/examples` section, - added section (Conversion of RecordBatch do Tensor) in the `docs/source/python/data.rst` the content above would be updated as the features are added in the future (row-major conversion, `Table::ToTensor`, DLPack support for `Tensor` class, etc.) ### Are these changes tested? It will be tested with the crossbow preview-docs job. ### Are there any user-facing changes? No, just documentation. 
* GitHub Issue: #40841 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .../converting_recordbatch_to_tensor.rst | 46 ++++++++++++++++ docs/source/cpp/examples/index.rst | 1 + docs/source/python/data.rst | 52 +++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 docs/source/cpp/examples/converting_recordbatch_to_tensor.rst diff --git a/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst new file mode 100644 index 0000000000000..2be27096cf973 --- /dev/null +++ b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst @@ -0,0 +1,46 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Conversion of ``RecordBatch`` to ``Tensor`` instances +===================================================== + +Arrow provides a method to convert ``RecordBatch`` objects to a ``Tensor`` +with two dimensions: + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor()); + ASSERT_OK(tensor->Validate()); + +The conversion supports signed and unsigned integer types plus float types. +In case the ``RecordBatch`` has null values the conversion succeeds if +``null_to_nan`` parameter is set to ``true``. In this case all +types will be promoted to a floating-point data type. + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor->Validate()); + +Currently only column-major conversion is supported. diff --git a/docs/source/cpp/examples/index.rst b/docs/source/cpp/examples/index.rst index b886a0d29e8da..90b00bbdf6ac7 100644 --- a/docs/source/cpp/examples/index.rst +++ b/docs/source/cpp/examples/index.rst @@ -27,3 +27,4 @@ Examples dataset_skyhook_scan_example row_columnar_conversion std::tuple-like ranges to Arrow + Converting RecordBatch to Tensor diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 2cc33561d40b6..9156157fcd0c2 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -560,3 +560,55 @@ schema without having to get any of the batches.:: x: int64 It can also be sent between languages using the :ref:`C stream interface `. + +Conversion of RecordBatch do Tensor +----------------------------------- + +Each array of the ``RecordBatch`` has it's own contiguous memory that is not necessarily +adjacent to other arrays. A different memory structure that is used in machine learning +libraries is a two dimensional array (also called a 2-dim tensor or a matrix) which takes +only one contiguous block of memory. 
+ +For this reason there is a function ``pyarrow.RecordBatch.to_tensor()`` available +to efficiently convert tabular columnar data into a tensor. + +Data types supported in this conversion are unsigned, signed integer and float +types. Currently only column-major conversion is supported. + + >>> import pyarrow as pa + >>> arr1 = [1, 2, 3, 4, 5] + >>> arr2 = [10, 20, 30, 40, 50] + >>> batch = pa.RecordBatch.from_arrays( + ... [ + ... pa.array(arr1, type=pa.uint16()), + ... pa.array(arr2, type=pa.int16()), + ... ], ["a", "b"] + ... ) + >>> batch.to_tensor() + + type: int32 + shape: (9, 2) + strides: (4, 36) + >>> batch.to_tensor().to_numpy() + array([[ 1, 10], + [ 2, 20], + [ 3, 30], + [ 4, 40], + [ 5, 50]], dtype=int32) + +With ``null_to_nan`` set to ``True`` one can also convert data with +nulls. They will be converted to ``NaN``: + + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], names = ["a", "b"] + ... ) + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) From 96f686b81ba148f4d434846f0b9e161c538f131d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 29 Mar 2024 08:30:03 +0100 Subject: [PATCH 30/51] GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803) ### Rationale for this change The conversion from `RecordBatch` to `Tensor` class exists but it doesn't support record batches with validity bitmaps. This PR adds support for an option to convert null values to NaN. ### What changes are included in this PR? This PR adds a `nul_to_nan` option in `RecordBatch::ToTensor` so that null values are converted to NaN in the resulting `Tensor`. This for example works: ```python >>> import pyarrow as pa >>> batch = pa.record_batch( ... [ ... pa.array([1, 2, 3, 4, None], type=pa.int32()), ... pa.array([10, 20, 30, 40, None], type=pa.float32()), ... ], names = ["a", "b"] ... ) >>> batch pyarrow.RecordBatch a: int32 b: float ---- a: [1,2,3,4,null] b: [10,20,30,40,null] >>> batch.to_tensor(null_to_nan=True) type: double shape: (5, 2) strides: (8, 40) >>> batch.to_tensor(null_to_nan=True).to_numpy() array([[ 1., 10.], [ 2., 20.], [ 3., 30.], [ 4., 40.], [nan, nan]]) ``` but default would raise: ```python >>> batch.to_tensor() Traceback (most recent call last): File "", line 1, in File "pyarrow/table.pxi", line 3421, in pyarrow.lib.RecordBatch.to_tensor a: int32 File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status return check_status(status) File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status raise convert_status(status) pyarrow.lib.ArrowTypeError: Can only convert a RecordBatch with no nulls. Set null_to_nan to true to convert nulls to nan ``` ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
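The Python snippets above exercise the new option through pyarrow. For C++ callers, a minimal sketch of the same flow is shown below; the helper name `BatchToTensor` is illustrative, and the sketch assumes the `ToTensor(bool null_to_nan, MemoryPool* pool)` signature added in this PR together with the usual Arrow status macros.

```cpp
#include <arrow/api.h>

// Sketch: convert a RecordBatch that may contain nulls into a Tensor.
// With null_to_nan = true, null slots become NaN and integer columns are
// promoted to a floating-point result type; with the default (false) the
// conversion fails if any column contains nulls.
arrow::Result<std::shared_ptr<arrow::Tensor>> BatchToTensor(
    const std::shared_ptr<arrow::RecordBatch>& batch) {
  ARROW_ASSIGN_OR_RAISE(auto tensor, batch->ToTensor(/*null_to_nan=*/true));
  ARROW_RETURN_NOT_OK(tensor->Validate());
  return tensor;
}
```

Because requesting NaN substitution promotes integer columns to a floating-point tensor, callers that need to preserve integer types should leave `null_to_nan` at its default and handle nulls before conversion.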
* GitHub Issue: #40061 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/record_batch.cc | 47 ++++++++++++----- cpp/src/arrow/record_batch.h | 6 ++- cpp/src/arrow/record_batch_test.cc | 76 +++++++++++++++++++++++++++- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/table.pxi | 49 ++++++++++++++++-- python/pyarrow/tests/test_table.py | 48 +++++++++++++++++- 6 files changed, 208 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 0d8bda9b66e24..6f3b8e75a20d0 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -18,6 +18,7 @@ #include "arrow/record_batch.h" #include +#include #include #include #include @@ -261,12 +262,19 @@ struct ConvertColumnsToTensorVisitor { using In = typename T::c_type; auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); - if constexpr (std::is_same_v) { - memcpy(out_values, in_values.data(), in_values.size_bytes()); - out_values += in_values.size(); + if (in_data.null_count == 0) { + if constexpr (std::is_same_v) { + memcpy(out_values, in_values.data(), in_values.size_bytes()); + out_values += in_values.size(); + } else { + for (In in_value : in_values) { + *out_values++ = static_cast(in_value); + } + } } else { - for (In in_value : in_values) { - *out_values++ = static_cast(in_value); + for (int64_t i = 0; i < in_data.length; ++i) { + *out_values++ = + in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); } } return Status::OK(); @@ -286,16 +294,20 @@ inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) { } } -Result> RecordBatch::ToTensor(MemoryPool* pool) const { +Result> RecordBatch::ToTensor(bool null_to_nan, + MemoryPool* pool) const { if (num_columns() == 0) { return Status::TypeError( "Conversion to Tensor for RecordBatches without columns/schema is not " "supported."); } // Check for no validity bitmap of each field + // if null_to_nan conversion is set to false for (int i = 0; i < num_columns(); ++i) { - if (column(i)->null_count() > 0) { - return Status::TypeError("Can only convert a RecordBatch with no nulls."); + if (column(i)->null_count() > 0 && !null_to_nan) { + return Status::TypeError( + "Can only convert a RecordBatch with no nulls. 
Set null_to_nan to true to " + "convert nulls to NaN"); } } @@ -308,12 +320,12 @@ Result> RecordBatch::ToTensor(MemoryPool* pool) const { std::shared_ptr result_field = schema_->field(0); std::shared_ptr result_type = result_field->type(); - if (num_columns() > 1) { - Field::MergeOptions options; - options.promote_integer_to_float = true; - options.promote_integer_sign = true; - options.promote_numeric_width = true; + Field::MergeOptions options; + options.promote_integer_to_float = true; + options.promote_integer_sign = true; + options.promote_numeric_width = true; + if (num_columns() > 1) { for (int i = 1; i < num_columns(); ++i) { if (!is_numeric(column(i)->type()->id())) { return Status::TypeError("DataType is not supported: ", @@ -334,6 +346,15 @@ Result> RecordBatch::ToTensor(MemoryPool* pool) const { result_type = result_field->type(); } + // Check if result_type is signed or unsigned integer and null_to_nan is set to true + // Then all columns should be promoted to float type + if (is_integer(result_type->id()) && null_to_nan) { + ARROW_ASSIGN_OR_RAISE( + result_field, + result_field->MergeWith(field(result_field->name(), float32()), options)); + result_type = result_field->type(); + } + // Allocate memory ARROW_ASSIGN_OR_RAISE( std::shared_ptr result, diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 16d721caad443..5202ff4abfa0b 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -85,8 +85,12 @@ class ARROW_EXPORT RecordBatch { /// Create a Tensor object with shape (number of rows, number of columns) and /// strides (type size in bytes, type size in bytes * number of rows). /// Generated Tensor will have column-major layout. + /// + /// \param[in] null_to_nan if true, convert nulls to NaN + /// \param[in] pool the memory pool to allocate the tensor buffer + /// \return the resulting Tensor Result> ToTensor( - MemoryPool* pool = default_memory_pool()) const; + bool null_to_nan = false, MemoryPool* pool = default_memory_pool()) const; /// \brief Construct record batch from struct array /// diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 81154452d7229..7e0eb1d460555 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -667,7 +667,8 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) { auto batch = RecordBatch::Make(schema, length, {a0, a1}); ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Can only convert a RecordBatch with no nulls.", + "Type error: Can only convert a RecordBatch with no nulls. 
" + "Set null_to_nan to true to convert nulls to NaN", batch->ToTensor()); } @@ -740,6 +741,79 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) { CheckTensor(tensor, 18, shape, f_strides); } +TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) { + const int length = 9; + + // int32 + float32 = float64 + auto f0 = field("f0", int32()); + auto f1 = field("f1", float32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ArrayFromJSON(int32(), "[null, 2, 3, 4, 5, 6, 7, 8, 9]"); + auto a1 = ArrayFromJSON(float32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]"); + + auto batch = RecordBatch::Make(schema, length, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 2}; + const int64_t f64_size = sizeof(double); + std::vector f_strides = {f64_size, f64_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + float64(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides); + + EXPECT_FALSE(tensor_expected->Equals(*tensor)); + EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); + + CheckTensor(tensor, 18, shape, f_strides); + + // int32 -> float64 + auto f2 = field("f2", int32()); + + std::vector> fields1 = {f0, f2}; + auto schema1 = ::arrow::schema(fields1); + + auto a2 = ArrayFromJSON(int32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]"); + auto batch1 = RecordBatch::Make(schema1, length, {a0, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor1, batch1->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor1->Validate()); + + EXPECT_FALSE(tensor_expected->Equals(*tensor1)); + EXPECT_TRUE(tensor_expected->Equals(*tensor1, EqualOptions().nans_equal(true))); + + CheckTensor(tensor1, 18, shape, f_strides); + + // int8 -> float32 + auto f3 = field("f3", int8()); + auto f4 = field("f4", int8()); + + std::vector> fields2 = {f3, f4}; + auto schema2 = ::arrow::schema(fields2); + + auto a3 = ArrayFromJSON(int8(), "[null, 2, 3, 4, 5, 6, 7, 8, 9]"); + auto a4 = ArrayFromJSON(int8(), "[10, 20, 30, 40, null, 60, 70, 80, 90]"); + auto batch2 = RecordBatch::Make(schema2, length, {a3, a4}); + + ASSERT_OK_AND_ASSIGN(auto tensor2, batch2->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor2->Validate()); + + const int64_t f32_size = sizeof(float); + std::vector f_strides_2 = {f32_size, f32_size * shape[0]}; + std::shared_ptr tensor_expected_2 = TensorFromJSON( + float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides_2); + + EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); + EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); + + CheckTensor(tensor2, 18, shape, f_strides_2); +} + TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) { const int length = 9; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9e5e3d3fa683b..aa50dd189a82d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -984,7 +984,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CRecordBatch] Slice(int64_t offset) shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length) - CResult[shared_ptr[CTensor]] ToTensor() const + CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, CMemoryPool* pool) const cdef cppclass CRecordBatchWithMetadata" arrow::RecordBatchWithMetadata": shared_ptr[CRecordBatch] batch diff --git a/python/pyarrow/table.pxi 
b/python/pyarrow/table.pxi index 1ab3fd04ed9f0..54fda1da7dcaf 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3389,21 +3389,64 @@ cdef class RecordBatch(_Tabular): deref(c_record_batch).ToStructArray()) return pyarrow_wrap_array(c_array) - def to_tensor(self): + def to_tensor(self, c_bool null_to_nan=False, MemoryPool memory_pool=None): """ Convert to a :class:`~pyarrow.Tensor`. RecordBatches that can be converted have fields of type signed or unsigned - integer or float, including all bit-widths, with no validity bitmask. + integer or float, including all bit-widths. RecordBatches with validity bitmask + for any of the arrays can be converted with ``null_to_nan``turned to ``True``. + In this case null values are converted to NaN and signed or unsigned integer + type arrays are promoted to appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], names = ["a", "b"] + ... ) + + >>> batch + pyarrow.RecordBatch + a: int32 + b: float + ---- + a: [1,2,3,4,null] + b: [10,20,30,40,null] + + >>> batch.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (8, 40) + + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) """ cdef: shared_ptr[CRecordBatch] c_record_batch shared_ptr[CTensor] c_tensor + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) c_record_batch = pyarrow_unwrap_batch(self) with nogil: c_tensor = GetResultValue( - deref(c_record_batch).ToTensor()) + deref(c_record_batch).ToTensor(null_to_nan, + pool)) return pyarrow_wrap_tensor(c_tensor) def _export_to_c(self, out_ptr, out_schema_ptr=0): diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index a7d917c2baf2d..8e30574188763 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1061,7 +1061,7 @@ def test_recordbatch_to_tensor_null(): arr2 = [10, 20, 30, 40, 50, 60, 70, None, 90] batch = pa.RecordBatch.from_arrays( [ - pa.array(arr1, type=pa.float32()), + pa.array(arr1, type=pa.int32()), pa.array(arr2, type=pa.float32()), ], ["a", "b"] ) @@ -1071,6 +1071,52 @@ def test_recordbatch_to_tensor_null(): ): batch.to_tensor() + result = batch.to_tensor(null_to_nan=True) + + x = np.array([arr1, arr2], np.float64).transpose() + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + # int32 -> float64 + batch = pa.RecordBatch.from_arrays( + [ + pa.array(arr1, type=pa.int32()), + pa.array(arr2, type=pa.int32()), + ], ["a", "b"] + ) + + result = batch.to_tensor(null_to_nan=True) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + # int8 -> float32 + batch = pa.RecordBatch.from_arrays( + [ + pa.array(arr1, type=pa.int8()), + pa.array(arr2, type=pa.int8()), + ], ["a", "b"] + ) + + result = batch.to_tensor(null_to_nan=True) + + x = 
np.array([arr1, arr2], np.float32).transpose() + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float32() + assert result.shape == expected.shape + assert result.strides == expected.strides + def test_recordbatch_to_tensor_empty(): batch = pa.RecordBatch.from_arrays( From d32e4b053e6fd70ff4f0e2a0552f2bf3b94647b3 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 29 Mar 2024 14:46:22 -0400 Subject: [PATCH 31/51] MINOR: [Java] Bump org.apache.hadoop dependencies from 3.3.6 to 3.4.0 in /java (#40890) Updates the Hadoop version to 3.4.0 to address vulnerabilities identified in https://deps.dev/maven/org.apache.hadoop%3Ahadoop-common/3.3.6 --- java/adapter/orc/pom.xml | 6 +++--- java/pom.xml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index e7a2bfe872eb3..060aed5dcf156 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -58,7 +58,7 @@ org.apache.hadoop hadoop-client-runtime - 3.3.6 + ${dep.hadoop.version} test @@ -70,12 +70,12 @@ org.apache.hadoop hadoop-client-api - 3.3.6 + ${dep.hadoop.version} org.apache.hadoop hadoop-common - 3.3.6 + ${dep.hadoop.version} test diff --git a/java/pom.xml b/java/pom.xml index 850b4d0508539..b05b2d8f1425a 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -37,7 +37,7 @@ 1.61.1 3.23.1 2.17.0 - 3.3.6 + 3.4.0 23.5.26 1.11.3 From ce11e561d37db3cdbc8c55e000ca46256f504dc1 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 29 Mar 2024 16:57:39 -0400 Subject: [PATCH 32/51] GH-38659: [CI][MATLAB][Packaging] Add MATLAB `packaging` task to crossbow `tasks.yml` (#38660) ### Rationale for this change Per the following mailing list discussion: https://lists.apache.org/thread/0xyow40h7b1bptsppb0rxd4g9r1xpmh6 to integrate the MATLAB interface code with the existing Arrow release tooling, we first need to add a task to the [`packaging` group](https://github.com/apache/arrow/blob/1fd11d33cb56fd7eff4dce05edaba1c9d8a1dccd/dev/tasks/tasks.yml#L55) to crossbow. This packaging task will automatically create a [MLTBX file](https://www.mathworks.com/help/matlab/creating-help.html?s_tid=CRUX_lftnav) (the MATLAB equivalent to a Python binary wheel or Ruby gem) that can be installed via a "one-click" workflow in MATLAB. This will enable MATLAB users to install the interface without needing to build from source. ### Licensing For more information about licensing of the MLTBX file contents, please refer to the mailing list discussion and ASF Legal ticket linked below: 1. https://lists.apache.org/thread/zlpnncgvo6l4cvkxfxn7zt4q7qhptotw 2. https://issues.apache.org/jira/browse/LEGAL-665 ### What changes are included in this PR? 1. Added a `matlab` task to the [`packaging` group](https://github.com/apache/arrow/blob/1fd11d33cb56fd7eff4dce05edaba1c9d8a1dccd/dev/tasks/tasks.yml#L55) in `dev/tasks/tasks.yml`. 4. Added a new GitHub Actions workflow called `dev/tasks/matlab/github.yml` which builds the MATLAB interface code on all platforms (Windows, macOS, and Ubuntu 20.04) and packages the generated build artifacts into a single MLTBX file using [`matlab.addons.toolbox.packageToolbox`](https://www.mathworks.com/help/matlab/ref/matlab.addons.toolbox.packagetoolbox.html). 5. Changed the GitHub-hosted runner to `ubuntu-20.04` from `ubuntu-latest` for the MATLAB CI check (i.e. `.github/workflows/matlab.yml`). 
The rationale for this change is that we primarily develop and qualify against Debian 11 locally, but the CI check has been building against `ubuntu-latest` (i.e. `ubuntu-22.04`). There are two issues with using `ubuntu-22.04`. The first is that the version of `GLIBC` shipped with `ubuntu-22.04` is not fully compatible with the version of `GLIBC` shipped with `Debian 11`. This results in a runtime linker error when qualifying the packaged MATLAB interface code locally on Debian 11. The second issue with using `ubuntu-22.04` is that the system version of `GLIBCXX` is not fully compatible with the version of `GLIBCXX` bundled with MATLAB R2023a (this is a relatively common issue - e.g. see: https://www.mathworks.com/matlabcentral/answers/1907290-how-to-manually-select-the-libstdc-library-to-use-to-resolve-a-version-glibcxx_-not-found). Previously, we worked around this issue in GitHub Actions by using `LD_PRELOAD` before starting up MATLAB to run the unit tests. On the other hand, the version of `GLIBCXX` shipped with `ubuntu-20.04` **is** binary compatible with the version bundled with MATLAB R2023a. Therefore, we believe it would be better to use `ubuntu-20.04` in the MATLAB CI checks for the time being until we can qualify the MATLAB interface against `ubuntu-22.04`. ### Are these changes tested? Yes. 1. Successfully submitted a crossbow `packaging` job for the MATLAB interface by commenting `@ github-actions crossbow submit matlab`. Example of a successful packaging job: https://github.com/ursacomputing/crossbow/actions/runs/6893506432/job/18753227453. 2. Manually installed the resulting MLTBX file on macOS, Windows, Debian 11, and Ubuntu 20.04. Ran all tests under `matlab/test` using `runtests . IncludeSubFolders 1`. ### Are there any user-facing changes? No. ### Notes 1. While qualifying, we discovered that [MATLAB's programmatic packaging interface](https://www.mathworks.com/help/matlab/ref/matlab.addons.toolbox.packagetoolbox.html) does not properly include symbolic link files in the packaged MLTBX file. We've reported this bug to the relevant MathWorks development team. As a temporary workaround, we included a step to change the expected name of the Arrow C++ libraries (using `patchelf`/`install_name_tool`) which `libarrowproxy.so`/`libarrowproxy.dylib` depends on to `libarrow.so.1500.0.0`/`libarrow.1500.0.0.dylib` instead of `libarrow.so.1500`/`libarrow.1500.dylib`, respectively. Once this bug is resolved, we will remove this step from the workflow. ### Future Directions 1. Add tooling to upload release candidate (RC) MLTBX files to apache/arrow's GitHub Releases area and mark them as "Prerelease". In other words, modify https://github.com/apache/arrow/blob/main/dev/release/05-binary-upload.sh. 2. Add a post-release script to upload release MLTBX files to apache/arrow's GitHub Releases area (similar to how https://github.com/apache/arrow/blob/main/dev/release/post-09-python.sh works). 4. Enable nightly builds for the MATLAB interface. 6. Document how to qualify a MATLAB Arrow interface release. 7. Enable building and testing the MATLAB Arrow interface on multiple Ubuntu distributions simulatneously (e.g. 20.04 *and* 22.04). 
* Closes: #38659 * GitHub Issue: #38659 Lead-authored-by: Sarah Gilmore Co-authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- .github/workflows/matlab.yml | 28 +++-- dev/tasks/matlab/github.yml | 162 ++++++++++++++++++++++++++ dev/tasks/tasks.yml | 9 ++ matlab/CMakeLists.txt | 17 --- matlab/tools/packageMatlabInterface.m | 84 +++++++++++++ 5 files changed, 273 insertions(+), 27 deletions(-) create mode 100644 dev/tasks/matlab/github.yml create mode 100644 matlab/tools/packageMatlabInterface.m diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index eceeb551a0653..dfc734e043371 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -42,7 +42,23 @@ jobs: ubuntu: name: AMD64 Ubuntu 20.04 MATLAB - runs-on: ubuntu-latest + # Explicitly pin the Ubuntu version to 20.04 for the time being because: + # + # 1. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible + # with the GLIBCXX bundled with MATLAB R2023a. This is a relatively common + # issue. + # + # For example, see: + # + # https://www.mathworks.com/matlabcentral/answers/1907290-how-to-manually-select-the-libstdc-library-to-use-to-resolve-a-version-glibcxx_-not-found + # + # 2. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible with + # the version of GLIBCXX shipped with Debian 11. Several of the Arrow community + # members who work on the MATLAB bindings use Debian 11 locally for qualification. + # Using Ubuntu 20.04 eases development workflows for these community members. + # + # In the future, we can investigate adding support for building against more Linux (e.g. `ubuntu-22.04`) and MATLAB versions (e.g. R2023b). + runs-on: ubuntu-20.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository @@ -74,14 +90,6 @@ jobs: run: ci/scripts/matlab_build.sh $(pwd) - name: Run MATLAB Tests env: - # libarrow.so requires a more recent version of libstdc++.so - # than is bundled with MATLAB under /sys/os/glnxa64. - # Therefore, if a MEX function that depends on libarrow.so - # is executed within the MATLAB address space, runtime linking - # errors will occur. To work around this issue, we can explicitly - # force MATLAB to use the system libstdc++.so via LD_PRELOAD. - LD_PRELOAD: /usr/lib/x86_64-linux-gnu/libstdc++.so.6 - # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. MATLABPATH: matlab/install/arrow_matlab @@ -89,7 +97,7 @@ jobs: with: select-by-folder: matlab/test macos: - name: AMD64 macOS 11 MATLAB + name: AMD64 macOS 12 MATLAB runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml new file mode 100644 index 0000000000000..1cd3949efbcf8 --- /dev/null +++ b/dev/tasks/matlab/github.yml @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +{% import 'macros.jinja' as macros with context %} + +{{ macros.github_header() }} + +jobs: + + ubuntu: + name: AMD64 Ubuntu 20.04 MATLAB + runs-on: ubuntu-20.04 + steps: + {{ macros.github_checkout_arrow()|indent }} + - name: Install ninja-build + run: sudo apt-get update && sudo apt-get install ninja-build + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Build MATLAB Interface + env: + {{ macros.github_set_sccache_envvars()|indent(8) }} + run: arrow/ci/scripts/matlab_build.sh $(pwd)/arrow + - name: Change shared library dependency name + # MATLAB's programmatic packaging interface does not properly + # include symbolic link files in the package MLTBX - this is a + # bug. As a temporary workaround, change the expected name of the + # Arrow C++ library which libarrowproxy.so depends on. For example, + # change libarrow.so.1500 to libarrow.so.1500.0.0. + run: | + pushd arrow/matlab/install/arrow_matlab/+libmexclass/+proxy/ + SYMLINK_ARROW_LIB="$(find . -name 'libarrow.so.*' -type l | xargs basename)" + REGULAR_ARROW_LIB="$(echo libarrow.so.*.*)" + echo "SYMLINK_ARROW_LIB = ${SYMLINK_ARROW_LIB}" + echo "REGULAR_ARROW_LIB = ${REGULAR_ARROW_LIB}" + patchelf --replace-needed $SYMLINK_ARROW_LIB $REGULAR_ARROW_LIB libarrowproxy.so + popd + - name: Compress into single artifact + run: tar -cvzf matlab-arrow-ubuntu.tar.gz arrow/matlab/install/arrow_matlab + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: matlab-arrow-ubuntu.tar.gz + path: matlab-arrow-ubuntu.tar.gz + + macos: + name: AMD64 macOS 12 MATLAB + runs-on: macos-latest + steps: + {{ macros.github_checkout_arrow()|indent }} + - name: Install ninja-build + run: brew install ninja + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Build MATLAB Interface + env: + {{ macros.github_set_sccache_envvars()|indent(8) }} + run: arrow/ci/scripts/matlab_build.sh $(pwd)/arrow + - name: Change shared library dependency name + # MATLAB's programmatic packaging interface does not properly + # include symbolic link files in the package MLTBX - this is a + # bug. As a temporary workaround, change the expected name of the + # Arrow C++ library which libarrowproxy.dylib depends on. + # For example, change libarrow.1500.dylib to libarrow.1500.0.0.dylib. + run: | + pushd arrow/matlab/install/arrow_matlab/+libmexclass/+proxy + SYMLINK_ARROW_LIB="$(find . 
-name 'libarrow.*.dylib' -type l | xargs basename)" + REGULAR_ARROW_LIB="$(echo libarrow.*.*.dylib)" + echo "SYMLINK_ARROW_LIB = ${SYMLINK_ARROW_LIB}" + echo "REGULAR_ARROW_LIB = ${REGULAR_ARROW_LIB}" + install_name_tool -change @rpath/$SYMLINK_ARROW_LIB @rpath/$REGULAR_ARROW_LIB libarrowproxy.dylib + popd + - name: Compress into single artifact + run: tar -cvzf matlab-arrow-macos.tar.gz arrow/matlab/install/arrow_matlab + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: matlab-arrow-macos.tar.gz + path: matlab-arrow-macos.tar.gz + + windows: + name: AMD64 Windows 2022 MATLAB + runs-on: windows-2022 + steps: + {{ macros.github_checkout_arrow()|indent }} + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Install sccache + shell: bash + run: arrow/ci/scripts/install_sccache.sh pc-windows-msvc $(pwd)/sccache + - name: Build MATLAB Interface + shell: cmd + env: + {{ macros.github_set_sccache_envvars()|indent(8) }} + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + bash -c "arrow/ci/scripts/matlab_build.sh $(pwd)/arrow" + - name: Compress into single artifact + shell: bash + run: tar -cvzf matlab-arrow-windows.tar.gz arrow/matlab/install/arrow_matlab + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: matlab-arrow-windows.tar.gz + path: matlab-arrow-windows.tar.gz + + package-mltbx: + name: Package MATLAB Toolbox (MLTBX) Files + runs-on: ubuntu-latest + needs: + - ubuntu + - macos + - windows + steps: + {{ macros.github_checkout_arrow(fetch_depth=0)|indent }} + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts-downloaded + - name: Decompress Artifacts + run: | + mv artifacts-downloaded/*/*.tar.gz . 
+ tar -xzvf matlab-arrow-ubuntu.tar.gz + tar -xzvf matlab-arrow-macos.tar.gz + tar -xzvf matlab-arrow-windows.tar.gz + - name: Copy LICENSE.txt and NOTICE.txt for packaging + run: | + cp arrow/LICENSE.txt arrow/matlab/install/arrow_matlab/LICENSE.txt + cp arrow/NOTICE.txt arrow/matlab/install/arrow_matlab/NOTICE.txt + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Run commands + env: + MATLABPATH: arrow/matlab/tools + ARROW_MATLAB_TOOLBOX_FOLDER: arrow/matlab/install/arrow_matlab + ARROW_MATLAB_TOOLBOX_OUTPUT_FOLDER: artifacts/matlab-dist + ARROW_MATLAB_TOOLBOX_VERSION: {{ arrow.no_rc_version }} + uses: matlab-actions/run-command@v1 + with: + command: packageMatlabInterface + {{ macros.github_upload_releases(["artifacts/matlab-dist/*.mltbx"])|indent }} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 2abfbc15174df..5e1ef8d13b988 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -59,6 +59,7 @@ groups: - conan-* - debian-* - java-jars + - matlab - nuget - python-sdist - r-binary-packages @@ -665,6 +666,14 @@ tasks: params: formula: apache-arrow.rb + ############################## MATLAB Packages ################################ + + matlab: + ci: github + template: matlab/github.yml + artifacts: + - matlab-arrow-{no_rc_version}.mltbx + ############################## Arrow JAR's ################################## java-jars: diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 206ecb318b3cc..b85f782d2d37a 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -201,9 +201,6 @@ get_filename_component(ARROW_SHARED_LIB_DIR ${ARROW_SHARED_LIB} DIRECTORY) get_filename_component(ARROW_SHARED_LIB_FILENAME ${ARROW_SHARED_LIB} NAME_WE) if(NOT Arrow_FOUND) - # If Arrow_FOUND is false, Arrow is built by the arrow_shared target and needs - # to be copied to CMAKE_PACKAGED_INSTALL_DIR. - if(APPLE) # Install libarrow.dylib (symlink) and the real files it points to. # on macOS, we need to match these files: libarrow.dylib @@ -226,20 +223,6 @@ if(NOT Arrow_FOUND) set(SHARED_LIBRARY_VERSION_REGEX ${ARROW_SHARED_LIB_FILENAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() - - # The subfolders cmake and pkgconfig are excluded as they will be empty. - # Note: The following CMake Issue suggests enabling an option to exclude all - # folders that would be empty after installation: - # https://gitlab.kitware.com/cmake/cmake/-/issues/17122 - - set(CMAKE_PACKAGED_INSTALL_DIR "${CMAKE_INSTALL_DIR}/+arrow") - - install(DIRECTORY "${ARROW_SHARED_LIB_DIR}/" - DESTINATION ${CMAKE_PACKAGED_INSTALL_DIR} - FILES_MATCHING - REGEX ${SHARED_LIBRARY_VERSION_REGEX} - PATTERN "cmake" EXCLUDE - PATTERN "pkgconfig" EXCLUDE) endif() # MATLAB_ADD_INSTALL_DIR_TO_STARTUP_FILE toggles whether an addpath command to add the install diff --git a/matlab/tools/packageMatlabInterface.m b/matlab/tools/packageMatlabInterface.m new file mode 100644 index 0000000000000..55b4d4241a569 --- /dev/null +++ b/matlab/tools/packageMatlabInterface.m @@ -0,0 +1,84 @@ +% Licensed to the Apache Software Foundation (ASF) under one +% or more contributor license agreements. See the NOTICE file +% distributed with this work for additional information +% regarding copyright ownership. The ASF licenses this file +% to you under the Apache License, Version 2.0 (the +% "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, +% software distributed under the License is distributed on an +% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +% KIND, either express or implied. See the License for the +% specific language governing permissions and limitations +% under the License. + +toolboxFolder = string(getenv("ARROW_MATLAB_TOOLBOX_FOLDER")); +outputFolder = string(getenv("ARROW_MATLAB_TOOLBOX_OUTPUT_FOLDER")); +toolboxVersionRaw = string(getenv("ARROW_MATLAB_TOOLBOX_VERSION")); + +appendLicenseText(fullfile(toolboxFolder, "LICENSE.txt")); +appendNoticeText(fullfile(toolboxFolder, "NOTICE.txt")); + +% Output folder must exist. +mkdir(outputFolder); + +disp("Toolbox Folder: " + toolboxFolder); +disp("Output Folder: " + outputFolder); +disp("Toolbox Version Raw: " + toolboxVersionRaw); + + +% Note: This string processing heuristic may not be robust to future +% changes in the Arrow versioning scheme. +dotIdx = strfind(toolboxVersionRaw, "."); +numDots = numel(dotIdx); +if numDots >= 3 + toolboxVersion = extractBefore(toolboxVersionRaw, dotIdx(3)); +else + toolboxVersion = toolboxVersionRaw; +end + +disp("Toolbox Version:" + toolboxVersion); + +identifier = "ad1d0fe6-22d1-4969-9e6f-0ab5d0f12ce3"; +opts = matlab.addons.toolbox.ToolboxOptions(toolboxFolder, identifier); +opts.ToolboxName = "MATLAB Arrow Interface"; +opts.ToolboxVersion = toolboxVersion; +opts.AuthorName = "The Apache Software Foundation"; +opts.AuthorEmail = "dev@arrow.apache.org"; + +% Set the SupportedPlatforms +opts.SupportedPlatforms.Win64 = true; +opts.SupportedPlatforms.Maci64 = true; +opts.SupportedPlatforms.Glnxa64 = true; +opts.SupportedPlatforms.MatlabOnline = true; + +% Interface is only qualified against R2023a at the moment +opts.MinimumMatlabRelease = "R2023a"; +opts.MaximumMatlabRelease = "R2023a"; + +opts.OutputFile = fullfile(outputFolder, compose("matlab-arrow-%s.mltbx", toolboxVersionRaw)); +disp("Output File: " + opts.OutputFile); +matlab.addons.toolbox.packageToolbox(opts); + +function appendLicenseText(filename) + licenseText = [ ... + newline + "--------------------------------------------------------------------------------" + newline + "3rdparty dependency mathworks/libmexclass is redistributed as a dynamically" + "linked shared library in certain binary distributions, like the MATLAB" + "distribution." + newline + "Copyright: 2022-2024 The MathWorks, Inc. All rights reserved." + "Homepage: https://github.com/mathworks/libmexclass" + "License: 3-clause BSD" ]; + writelines(licenseText, filename, WriteMode="append"); +end + +function appendNoticeText(filename) + noticeText = [ ... + newline + "---------------------------------------------------------------------------------" + newline + "This product includes software from The MathWorks, Inc. (Apache 2.0)" + " * Copyright (C) 2024 The MathWorks, Inc."]; + writelines(noticeText, filename, WriteMode="append"); +end \ No newline at end of file From 9f0101ec14336b2baad45d57320fb56c71d9321b Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Fri, 29 Mar 2024 18:29:21 -0700 Subject: [PATCH 33/51] GH-40878: [JAVA] Fix flight-sql-jdbc-driver shading issues (#40879) ### Rationale for this change The `flight-sql-jdbc-driver` jar is not shaded properly: * a reduced pom.xml file is not generated. 
The published pom.xml file declares dependencies which are actually present in the jar and should not be fetched externally * several classes/files are not relocated properly ### What changes are included in this PR? Fix pom.xml and relocations. Also removes annotations dependencies and include a integration test to prevent future breakage. ### Are these changes tested? Yes. A new integration test check the jar content ### Are there any user-facing changes? Yes. The published pom.xml file on Maven will be cleaned of any dependency * GitHub Issue: #40878 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/flight/flight-sql-jdbc-driver/pom.xml | 51 ++++++- .../driver/jdbc/ITDriverJarValidation.java | 141 ++++++++++++++++++ 2 files changed, 184 insertions(+), 8 deletions(-) create mode 100644 java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 84ec1ff8c1f95..53d929afa781c 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -148,13 +148,16 @@ - maven-surefire-plugin - - false - - ${project.basedir}/../../../testing/data - - + org.apache.maven.plugins + maven-failsafe-plugin + + + + integration-test + verify + + + org.apache.maven.plugins @@ -167,12 +170,22 @@ false - false + true false *:* + + + org.checkerframework:checker-qual + org.codehaus.mojo:animal-sniffer-annotations + javax.annotation:javax.annotation-api + com.google.android:annotations + com.google.errorprone:error_prone_annotations + com.google.code.findbugs:jsr305 + com.google.j2objc:j2objc-annotations + @@ -199,6 +212,14 @@ io. cfjd.io. + + net. + cfjd.net. + + + mozilla. + cfjd.mozilla. + META-INF.native.libnetty_ @@ -213,12 +234,25 @@ + + org.apache.arrow:arrow-vector + + codegen/** + + org.apache.calcite.avatica:* META-INF/services/java.sql.Driver + + org.eclipse.collections:* + + about.html + LICENSE-*-1.0.txt + + *:* @@ -227,6 +261,7 @@ **/*.DSA META-INF/native/libio_grpc_netty* META-INF/native/io_grpc_netty_shaded* + **/*.proto diff --git a/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java new file mode 100644 index 0000000000000..fdb580d493abf --- /dev/null +++ b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.driver.jdbc; + +import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.File; +import java.io.IOException; +import java.net.JarURLConnection; +import java.net.URL; +import java.util.Enumeration; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.jar.JarEntry; +import java.util.jar.JarFile; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ErrorCollector; +import org.junit.rules.TestRule; +import org.junit.rules.Timeout; + +import com.google.common.collect.ImmutableSet; + +/** + * Check the content of the JDBC driver jar + * + * After shading everything should be either under org.apache.arrow.driver.jdbc., + * org.slf4j., or cfjd. packages + */ +public class ITDriverJarValidation { + /** + * Use this property to provide path to the JDBC driver jar. Can be used to run the test from an IDE + */ + public static final String JDBC_DRIVER_PATH_OVERRIDE = + System.getProperty("arrow-flight-jdbc-driver.jar.override"); + + /** + * List of allowed prefixes a jar entry may match. + */ + public static final Set ALLOWED_PREFIXES = ImmutableSet.of( + "org/apache/arrow/driver/jdbc/", + "cfjd/", + "org/slf4j/", + "META-INF/"); + + /** + * List of allowed files a jar entry may match. + */ + public static final Set ALLOWED_FILES = ImmutableSet.of( + "arrow-git.properties", + "properties/flight.properties"); + + // This method is designed to work with Maven failsafe plugin and expects the + // JDBC driver jar to be present in the test classpath (instead of the individual classes) + private static JarFile getJdbcJarFile() throws IOException { + // Check if an override has been set + if (JDBC_DRIVER_PATH_OVERRIDE != null) { + return new JarFile(new File(JDBC_DRIVER_PATH_OVERRIDE)); + } + + // Check classpath to find the driver jar + URL driverClassURL = ITDriverJarValidation.class.getClassLoader() + .getResource("org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.class"); + + assertNotNull(driverClassURL, "Driver jar was not detected in the classpath"); + assertEquals("Driver jar was not detected in the classpath", "jar", driverClassURL.getProtocol()); + + JarURLConnection connection = (JarURLConnection) driverClassURL.openConnection(); + return connection.getJarFile(); + } + + @ClassRule + public static final TestRule CLASS_TIMEOUT = Timeout.builder().withTimeout(2, TimeUnit.MINUTES).build(); + + @Rule + public ErrorCollector collector = new ErrorCollector(); + + @Test + public void validateShadedJar() throws IOException { + // Validate the content of the jar to enforce all 3rd party dependencies have + // been shaded + try (JarFile jar = getJdbcJarFile()) { + for (Enumeration entries = jar.entries(); entries.hasMoreElements();) { + final JarEntry entry = entries.nextElement(); + if (entry.isDirectory()) { + // Directories are ignored + continue; + } + + try { + checkEntryAllowed(entry.getName()); + } catch (AssertionError e) { + collector.addError(e); + } + } + } + } + + /** + * Check if a jar entry is allowed. + * + *
+   * <p>
+ * A jar entry is allowed if either it is part of the allowed files or it + * matches one of the allowed prefixes + * + * @param name the jar entry name + * @throws AssertionException if the entry is not allowed + */ + private void checkEntryAllowed(String name) { + // Check if there's a matching file entry first + if (ALLOWED_FILES.contains(name)) { + return; + } + + for (String prefix : ALLOWED_PREFIXES) { + if (name.startsWith(prefix)) { + return; + } + } + + throw new AssertionError("'" + name + "' is not an allowed jar entry"); + } +} From 17a536839ee20f80e80f93ec6ea714a301d12fdf Mon Sep 17 00:00:00 2001 From: Paul Date: Sun, 31 Mar 2024 10:11:08 -0500 Subject: [PATCH 34/51] GH-40893: [Java][FlightRPC] Support IntervalMonthDayNanoVector in FlightSQL JDBC Driver (#40894) ### Rationale for this change Fixes https://github.com/apache/arrow/issues/40893. ### What changes are included in this PR? - Support IntervalMonthDayNanoVector in FlightSQL JDBC Driver - Return PeriodDuration as JDBC Object type, because there is no good java.time type for this interval - Return an ISO-8601 interval as the stringified version of PeriodDuration - Make PeriodDuration implement TemporalAccessor for standardization ### Are these changes tested? Unit tests have been added that match those for other interval types. I'm unaware of any other types of tests worth adding to, but I'd be happy to if pointed there. ### Are there any user-facing changes? The only change users should noticed is that the FlightSQL JDBC Driver can now handle more query responses. * GitHub Issue: #40893 Authored-by: paul Signed-off-by: David Li --- .../ArrowFlightJdbcAccessorFactory.java | 4 + ...ArrowFlightJdbcIntervalVectorAccessor.java | 32 ++++++++ .../ArrowFlightJdbcAccessorFactoryTest.java | 14 ++++ ...wFlightJdbcIntervalVectorAccessorTest.java | 51 ++++++++++++- .../apache/arrow/vector/PeriodDuration.java | 73 ++++++++++++++++++- .../arrow/vector/TestPeriodDuration.java | 47 ++++++++++++ 6 files changed, 217 insertions(+), 4 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java index 813b40a8070f7..fa45d7a867c4a 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java @@ -51,6 +51,7 @@ import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; import org.apache.arrow.vector.LargeVarBinaryVector; import org.apache.arrow.vector.LargeVarCharVector; @@ -176,6 +177,9 @@ public static ArrowFlightJdbcAccessor createAccessor(ValueVector vector, } else if (vector instanceof IntervalYearVector) { return new ArrowFlightJdbcIntervalVectorAccessor(((IntervalYearVector) vector), getCurrentRow, setCursorWasNull); + } else if (vector instanceof IntervalMonthDayNanoVector) { + return new ArrowFlightJdbcIntervalVectorAccessor(((IntervalMonthDayNanoVector) vector), getCurrentRow, + setCursorWasNull); } else if (vector instanceof StructVector) { return new ArrowFlightJdbcStructVectorAccessor((StructVector) vector, getCurrentRow, setCursorWasNull); 
diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java index 21d1c15712cdb..90b53bc856023 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java @@ -30,8 +30,11 @@ import org.apache.arrow.driver.jdbc.accessor.ArrowFlightJdbcAccessorFactory; import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.PeriodDuration; import org.apache.arrow.vector.holders.NullableIntervalDayHolder; +import org.apache.arrow.vector.holders.NullableIntervalMonthDayNanoHolder; import org.apache.arrow.vector.holders.NullableIntervalYearHolder; /** @@ -96,6 +99,35 @@ public ArrowFlightJdbcIntervalVectorAccessor(IntervalYearVector vector, objectClass = java.time.Period.class; } + /** + * Instantiate an accessor for a {@link IntervalMonthDayNanoVector}. + * + * @param vector an instance of a IntervalMonthDayNanoVector. + * @param currentRowSupplier the supplier to track the rows. + * @param setCursorWasNull the consumer to set if value was null. + */ + public ArrowFlightJdbcIntervalVectorAccessor(IntervalMonthDayNanoVector vector, + IntSupplier currentRowSupplier, + ArrowFlightJdbcAccessorFactory.WasNullConsumer setCursorWasNull) { + super(currentRowSupplier, setCursorWasNull); + this.vector = vector; + stringGetter = (index) -> { + final NullableIntervalMonthDayNanoHolder holder = new NullableIntervalMonthDayNanoHolder(); + vector.get(index, holder); + if (holder.isSet == 0) { + return null; + } else { + final int months = holder.months; + final int days = holder.days; + final long nanos = holder.nanoseconds; + final Period period = Period.ofMonths(months).plusDays(days); + final Duration duration = Duration.ofNanos(nanos); + return new PeriodDuration(period, duration).toISO8601IntervalString(); + } + }; + objectClass = PeriodDuration.class; + } + @Override public Class getObjectClass() { return objectClass; diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java index 4b3744372c0e8..ab7f215f5d102 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java @@ -41,6 +41,7 @@ import org.apache.arrow.driver.jdbc.utils.RootAllocatorTestRule; import org.apache.arrow.vector.DurationVector; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.ValueVector; @@ -405,6 +406,19 @@ public void createAccessorForIntervalYearVector() { } } + @Test + public void 
createAccessorForIntervalMonthDayNanoVector() { + try (ValueVector valueVector = new IntervalMonthDayNanoVector("", + rootAllocatorTestRule.getRootAllocator())) { + ArrowFlightJdbcAccessor accessor = + ArrowFlightJdbcAccessorFactory.createAccessor(valueVector, GET_CURRENT_ROW, + (boolean wasNull) -> { + }); + + Assert.assertTrue(accessor instanceof ArrowFlightJdbcIntervalVectorAccessor); + } + } + @Test public void createAccessorForUnionVector() { try (ValueVector valueVector = new UnionVector("", rootAllocatorTestRule.getRootAllocator(), diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java index 322b7d40bd6e1..956738168f083 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java @@ -24,6 +24,7 @@ import java.time.Duration; import java.time.Period; +import java.time.format.DateTimeParseException; import java.util.Arrays; import java.util.Collection; import java.util.function.Supplier; @@ -32,7 +33,9 @@ import org.apache.arrow.driver.jdbc.utils.AccessorTestUtils; import org.apache.arrow.driver.jdbc.utils.RootAllocatorTestRule; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.PeriodDuration; import org.apache.arrow.vector.ValueVector; import org.junit.After; import org.junit.Assert; @@ -66,6 +69,9 @@ public class ArrowFlightJdbcIntervalVectorAccessorTest { } else if (vector instanceof IntervalYearVector) { return new ArrowFlightJdbcIntervalVectorAccessor((IntervalYearVector) vector, getCurrentRow, noOpWasNullConsumer); + } else if (vector instanceof IntervalMonthDayNanoVector) { + return new ArrowFlightJdbcIntervalVectorAccessor((IntervalMonthDayNanoVector) vector, + getCurrentRow, noOpWasNullConsumer); } return null; }; @@ -98,6 +104,17 @@ public static Collection data() { } return vector; }, "IntervalYearVector"}, + {(Supplier) () -> { + IntervalMonthDayNanoVector vector = + new IntervalMonthDayNanoVector("", rootAllocatorTestRule.getRootAllocator()); + + int valueCount = 10; + vector.setValueCount(valueCount); + for (int i = 0; i < valueCount; i++) { + vector.set(i, i + 1, (i + 1) * 10, (i + 1) * 100); + } + return vector; + }, "IntervalMonthDayNanoVector"}, }); } @@ -137,13 +154,31 @@ public void testShouldGetObjectReturnNull() throws Exception { } private String getStringOnVector(ValueVector vector, int index) { - String object = getExpectedObject(vector, index).toString(); + Object object = getExpectedObject(vector, index); if (object == null) { return null; } else if (vector instanceof IntervalDayVector) { - return formatIntervalDay(Duration.parse(object)); + return formatIntervalDay(Duration.parse(object.toString())); } else if (vector instanceof IntervalYearVector) { - return formatIntervalYear(Period.parse(object)); + return formatIntervalYear(Period.parse(object.toString())); + } else if (vector instanceof IntervalMonthDayNanoVector) { + String iso8601IntervalString = ((PeriodDuration) object).toISO8601IntervalString(); + String[] 
periodAndDuration = iso8601IntervalString.split("T"); + if (periodAndDuration.length == 1) { + // If there is no 'T', then either Period or Duration is zero, and the other one will successfully parse it + String periodOrDuration = periodAndDuration[0]; + try { + return new PeriodDuration(Period.parse(periodOrDuration), Duration.ZERO).toISO8601IntervalString(); + } catch (DateTimeParseException e) { + return new PeriodDuration(Period.ZERO, Duration.parse(periodOrDuration)).toISO8601IntervalString(); + } + } else { + // If there is a 'T', both Period and Duration are non-zero, and we just need to prepend the 'PT' to the + // duration for both to parse successfully + Period parse = Period.parse(periodAndDuration[0]); + Duration duration = Duration.parse("PT" + periodAndDuration[1]); + return new PeriodDuration(parse, duration).toISO8601IntervalString(); + } } return null; } @@ -225,6 +260,8 @@ private Class getExpectedObjectClassForVector(ValueVector vector) { return Duration.class; } else if (vector instanceof IntervalYearVector) { return Period.class; + } else if (vector instanceof IntervalMonthDayNanoVector) { + return PeriodDuration.class; } return null; } @@ -239,6 +276,10 @@ private void setAllNullOnVector(ValueVector vector) { for (int i = 0; i < valueCount; i++) { ((IntervalYearVector) vector).setNull(i); } + } else if (vector instanceof IntervalMonthDayNanoVector) { + for (int i = 0; i < valueCount; i++) { + ((IntervalMonthDayNanoVector) vector).setNull(i); + } } } @@ -247,6 +288,10 @@ private Object getExpectedObject(ValueVector vector, int currentRow) { return Duration.ofDays(currentRow + 1).plusMillis((currentRow + 1) * 1000L); } else if (vector instanceof IntervalYearVector) { return Period.ofMonths(currentRow + 1); + } else if (vector instanceof IntervalMonthDayNanoVector) { + Period period = Period.ofMonths(currentRow + 1).plusDays((currentRow + 1) * 10L); + Duration duration = Duration.ofNanos((currentRow + 1) * 100L); + return new PeriodDuration(period, duration); } return null; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java b/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java index ee48fe7972251..c94e4b534cac7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java @@ -17,8 +17,22 @@ package org.apache.arrow.vector; +import static java.time.temporal.ChronoUnit.DAYS; +import static java.time.temporal.ChronoUnit.MONTHS; +import static java.time.temporal.ChronoUnit.NANOS; +import static java.time.temporal.ChronoUnit.SECONDS; +import static java.time.temporal.ChronoUnit.YEARS; + import java.time.Duration; import java.time.Period; +import java.time.temporal.ChronoUnit; +import java.time.temporal.Temporal; +import java.time.temporal.TemporalAmount; +import java.time.temporal.TemporalUnit; +import java.time.temporal.UnsupportedTemporalTypeException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; import org.apache.arrow.util.Preconditions; @@ -26,7 +40,10 @@ * Combination of Period and Duration for representing this interval type * as a POJO. 
*/ -public class PeriodDuration { +public class PeriodDuration implements TemporalAmount { + + private static final List SUPPORTED_UNITS = + Collections.unmodifiableList(Arrays.asList(YEARS, MONTHS, DAYS, SECONDS, NANOS)); private final Period period; private final Duration duration; @@ -43,6 +60,60 @@ public Duration getDuration() { return duration; } + @Override + public long get(TemporalUnit unit) { + if (unit instanceof ChronoUnit) { + switch ((ChronoUnit) unit) { + case YEARS: + return period.getYears(); + case MONTHS: + return period.getMonths(); + case DAYS: + return period.getDays(); + case SECONDS: + return duration.getSeconds(); + case NANOS: + return duration.getNano(); + default: + break; + } + } + throw new UnsupportedTemporalTypeException("Unsupported TemporalUnit: " + unit); + } + + @Override + public List getUnits() { + return SUPPORTED_UNITS; + } + + @Override + public Temporal addTo(Temporal temporal) { + return temporal.plus(period).plus(duration); + } + + @Override + public Temporal subtractFrom(Temporal temporal) { + return temporal.minus(period).minus(duration); + } + + /** + * Format this PeriodDuration as an ISO-8601 interval. + * + * @return An ISO-8601 formatted string representing the interval. + */ + public String toISO8601IntervalString() { + if (duration.isZero()) { + return period.toString(); + } + String durationString = duration.toString(); + if (period.isZero()) { + return durationString; + } + + // Remove 'P' from duration string and concatenate to produce an ISO-8601 representation + return period + durationString.substring(1); + } + @Override public String toString() { return period.toString() + " " + duration.toString(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java b/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java index c8965dec3b83b..2b9f4cca8c22f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java @@ -21,7 +21,10 @@ import static org.junit.Assert.assertNotEquals; import java.time.Duration; +import java.time.LocalDate; +import java.time.LocalDateTime; import java.time.Period; +import java.time.temporal.ChronoUnit; import org.junit.Test; @@ -43,4 +46,48 @@ public void testBasics() { assertNotEquals(pd1.hashCode(), pd3.hashCode()); } + @Test + public void testToISO8601IntervalString() { + assertEquals("P0D", + new PeriodDuration(Period.ZERO, Duration.ZERO).toISO8601IntervalString()); + assertEquals("P1Y2M3D", + new PeriodDuration(Period.of(1, 2, 3), Duration.ZERO).toISO8601IntervalString()); + assertEquals("PT0.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofNanos(123)).toISO8601IntervalString()); + assertEquals("PT1.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(1).withNanos(123)).toISO8601IntervalString()); + assertEquals("PT1H1.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(3601).withNanos(123)).toISO8601IntervalString()); + assertEquals("PT24H1M1.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(86461).withNanos(123)).toISO8601IntervalString()); + assertEquals("P1Y2M3DT24H1M1.000000123S", + new PeriodDuration(Period.of(1, 2, 3), Duration.ofSeconds(86461).withNanos(123)).toISO8601IntervalString()); + + assertEquals("P-1Y-2M-3D", + new PeriodDuration(Period.of(-1, -2, -3), Duration.ZERO).toISO8601IntervalString()); + assertEquals("PT-0.000000123S", + new PeriodDuration(Period.ZERO, 
Duration.ofNanos(-123)).toISO8601IntervalString()); + assertEquals("PT-24H-1M-0.999999877S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(-86461).withNanos(123)).toISO8601IntervalString()); + assertEquals("P-1Y-2M-3DT-0.999999877S", + new PeriodDuration(Period.of(-1, -2, -3), Duration.ofSeconds(-1).withNanos(123)).toISO8601IntervalString()); + } + + @Test + public void testTemporalAccessor() { + LocalDate date = LocalDate.of(2024, 1, 2); + PeriodDuration pd1 = new PeriodDuration(Period.ofYears(1), Duration.ZERO); + assertEquals(LocalDate.of(2025, 1, 2), pd1.addTo(date)); + + LocalDateTime dateTime = LocalDateTime.of(2024, 1, 2, 3, 4); + PeriodDuration pd2 = new PeriodDuration(Period.ZERO, Duration.ofMinutes(1)); + assertEquals(LocalDateTime.of(2024, 1, 2, 3, 3), pd2.subtractFrom(dateTime)); + + PeriodDuration pd3 = new PeriodDuration(Period.of(1, 2, 3), Duration.ofSeconds(86461).withNanos(123)); + assertEquals(pd3.get(ChronoUnit.YEARS), 1); + assertEquals(pd3.get(ChronoUnit.MONTHS), 2); + assertEquals(pd3.get(ChronoUnit.DAYS), 3); + assertEquals(pd3.get(ChronoUnit.SECONDS), 86461); + assertEquals(pd3.get(ChronoUnit.NANOS), 123); + } } From 71321841eb6d94946de43cccb7f04afe5cf2aa10 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 1 Apr 2024 11:15:59 -0400 Subject: [PATCH 35/51] GH-40900: [Go] Fix Mallocator Weirdness (#40902) ### Rationale for this change With help from @ lidavidm and @ bkietz digging into the linked issue, we found the following: * Using `mtrace` and `strace` didn't produce much enlightenment to what was happening. * If the python adbc_driver_manager was built so that the cython lib is built using `CMAKE_BUILD_TYPE=Debug` then the crash/failure goes away * If the env var `MALLOC_MMAP_THRESHOLD_` is set to 128MB, the crash/failure goes away * It is only reproducible when calling through python, I haven't been able to reproduce it using pure Go * Calling `calloc` again after it fails, still fails * Calling `malloc` + `memset` immediately after the failing `calloc` works perfectly and doesn't fail anymore ### What changes are included in this PR? Adding a comment describing the situation and falling back to `malloc` + `memset` if `calloc` returns an error. If the pointer returned from `malloc` is `nil` then we surface the error. * GitHub Issue: #40900 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/memory/mallocator/mallocator.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/go/arrow/memory/mallocator/mallocator.go b/go/arrow/memory/mallocator/mallocator.go index 59d240a1063e8..9483bdfc2a05f 100644 --- a/go/arrow/memory/mallocator/mallocator.go +++ b/go/arrow/memory/mallocator/mallocator.go @@ -60,10 +60,19 @@ func (alloc *Mallocator) Allocate(size int) []byte { } ptr, err := C.calloc(C.size_t(size), 1) if err != nil { - panic(err) + // under some circumstances and allocation patterns, we can end up in a scenario + // where for some reason calloc return ENOMEM even though there is definitely memory + // available for use. So we attempt to fallback to simply doing malloc + memset in + // this case. If malloc returns a nil pointer, then we know we're out of memory + // and will surface the error. 
+ if ptr = C.malloc(C.size_t(size)); ptr == nil { + panic(err) + } + C.memset(ptr, 0, C.size_t(size)) } else if ptr == nil { panic("mallocator: out of memory") } + atomic.AddUint64(&alloc.allocatedBytes, uint64(size)) return unsafe.Slice((*byte)(ptr), size) } From 68241d8a86e9923cda2b758d10176b8dfb1cfea7 Mon Sep 17 00:00:00 2001 From: wayne Date: Mon, 1 Apr 2024 12:01:49 -0600 Subject: [PATCH 36/51] GH-40888: [Go][FlightRPC] support conversion from array.Duration in FlightSQL driver (#40889) ### Rationale for this change To enable the use of the flightsql driver's implementation of golang sql interfaces. ### What changes are included in this PR? A new switch branch for handling `array.Duration`. ### Are these changes tested? I manually tested and didn't add new unit tests because none of the other types handled in the same switch block are unit tested. ### Are there any user-facing changes? Just a more complete set of types handled by the sql driver. * GitHub Issue: #40888 Authored-by: wayne warren Signed-off-by: Matt Topol --- go/arrow/flight/flightsql/driver/utils.go | 4 ++++ go/arrow/flight/flightsql/driver/utils_test.go | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/go/arrow/flight/flightsql/driver/utils.go b/go/arrow/flight/flightsql/driver/utils.go index a99c045e2ed02..84cf2110cca92 100644 --- a/go/arrow/flight/flightsql/driver/utils.go +++ b/go/arrow/flight/flightsql/driver/utils.go @@ -104,6 +104,10 @@ func fromArrowType(arr arrow.Array, idx int) (interface{}, error) { return v.ToTime(ts.TimeUnit()), nil case *array.Date64: return c.Value(idx).ToTime(), nil + case *array.Duration: + dt := arr.DataType().(*arrow.DurationType) + duration := time.Duration(c.Value(idx)) * dt.Unit.Multiplier() + return duration, nil case *array.DayTimeInterval: durationDays := time.Duration(c.Value(idx).Days*24) * time.Hour duration := time.Duration(c.Value(idx).Milliseconds) * time.Millisecond diff --git a/go/arrow/flight/flightsql/driver/utils_test.go b/go/arrow/flight/flightsql/driver/utils_test.go index 6b1adfed47503..8ea7921b64e79 100644 --- a/go/arrow/flight/flightsql/driver/utils_test.go +++ b/go/arrow/flight/flightsql/driver/utils_test.go @@ -50,6 +50,10 @@ func Test_fromArrowType(t *testing.T) { {Name: "f15-ts_us", Type: arrow.FixedWidthTypes.Timestamp_ns}, {Name: "f16-d64", Type: arrow.FixedWidthTypes.Date64}, {Name: "f17-dti", Type: arrow.FixedWidthTypes.DayTimeInterval}, + {Name: "f18-duration_s", Type: arrow.FixedWidthTypes.Duration_s}, + {Name: "f19-duration_ms", Type: arrow.FixedWidthTypes.Duration_ms}, + {Name: "f20-duration_us", Type: arrow.FixedWidthTypes.Duration_us}, + {Name: "f21-duration_ns", Type: arrow.FixedWidthTypes.Duration_ns}, } schema := arrow.NewSchema(fields, nil) @@ -90,6 +94,10 @@ func Test_fromArrowType(t *testing.T) { testTime := time.Now() b.Field(15).(*array.Date64Builder).Append(arrow.Date64FromTime(testTime)) b.Field(16).(*array.DayTimeIntervalBuilder).Append(arrow.DayTimeInterval{Days: 1, Milliseconds: 1000}) + b.Field(17).(*array.DurationBuilder).Append(1) + b.Field(18).(*array.DurationBuilder).Append(1) + b.Field(19).(*array.DurationBuilder).Append(1) + b.Field(20).(*array.DurationBuilder).Append(1) rec := b.NewRecord() defer rec.Release() @@ -123,4 +131,8 @@ func Test_fromArrowType(t *testing.T) { tf(t, 14, time.Date(1970, 1, 1, 12, 0, 0, 0, time.UTC)) // "f15-ts_us" tf(t, 15, testTime.In(time.UTC).Truncate(24*time.Hour)) // "f16-d64" tf(t, 16, time.Duration(24*time.Hour+time.Second)) // "f17-dti" + tf(t, 17, time.Duration(1000000000)) // 
"f18-duration_s" + tf(t, 18, time.Duration(1000000)) // "f19-duration_ms" + tf(t, 19, time.Duration(1000)) // "f20-duration_us" + tf(t, 20, time.Duration(1)) // "f21-duration_ns" } From e44dc29df9587a139fe539069c3dafc771256b90 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 14:02:32 -0400 Subject: [PATCH 37/51] MINOR: [Go] Bump github.com/google/flatbuffers from 24.3.7+incompatible to 24.3.25+incompatible in /go (#40922) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/google/flatbuffers](https://github.com/google/flatbuffers) from 24.3.7+incompatible to 24.3.25+incompatible.

Release notes (sourced from github.com/google/flatbuffers's releases): v24.3.25

Full Changelog: https://github.com/google/flatbuffers/compare/v24.3.7...v24.3.25

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/google/flatbuffers&package-manager=go_modules&previous-version=24.3.7+incompatible&new-version=24.3.25+incompatible)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:

- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 2f788c5c26b02..9975ecfc69d34 100644 --- a/go/go.mod +++ b/go/go.mod @@ -25,7 +25,7 @@ require ( github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815 github.com/goccy/go-json v0.10.2 github.com/golang/snappy v0.0.4 - github.com/google/flatbuffers v24.3.7+incompatible + github.com/google/flatbuffers v24.3.25+incompatible github.com/klauspost/asmfmt v1.3.2 github.com/klauspost/compress v1.17.7 github.com/klauspost/cpuid/v2 v2.2.7 diff --git a/go/go.sum b/go/go.sum index 593746bcf9e4e..462c43021a29e 100644 --- a/go/go.sum +++ b/go/go.sum @@ -1,9 +1,11 @@ github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU= github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= github.com/alecthomas/assert/v2 v2.3.0 h1:mAsH2wmvjsuvyBvAmCtm7zFsBlb8mIHx5ySLVdDZXL0= +github.com/alecthomas/assert/v2 v2.3.0/go.mod h1:pXcQ2Asjp247dahGEmsZ6ru0UVwnkhktn7S0bBDLxvQ= github.com/alecthomas/participle/v2 v2.1.0 h1:z7dElHRrOEEq45F2TG5cbQihMtNTv8vwldytDj7Wrz4= github.com/alecthomas/participle/v2 v2.1.0/go.mod h1:Y1+hAs8DHPmc3YUFzqllV+eSQ9ljPTk0ZkPMtEdAx2c= github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk= +github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk= @@ -19,8 +21,11 @@ github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+m github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= +github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= +github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= +github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/goccy/go-yaml v1.11.0 h1:n7Z+zx8S9f9KgzG6KtQKf+kwqXZlLNR2F6018Dgau54= @@ -30,12 +35,14 @@ github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/flatbuffers v24.3.7+incompatible h1:BxGUkIQnOciBu33bd5BdvqY8Qvo0O/GR4SPhh7x9Ed0= -github.com/google/flatbuffers v24.3.7+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 
+github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI= +github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ= +github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hamba/avro/v2 v2.20.1 h1:3WByQiVn7wT7d27WQq6pvBRC00FVOrniP6u67FLA/2E= @@ -43,6 +50,7 @@ github.com/hamba/avro/v2 v2.20.1/go.mod h1:xHiKXbISpb3Ovc809XdzWow+XGTn+Oyf/F9aZ github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= @@ -52,15 +60,18 @@ github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ib github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= +github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= @@ -99,9 +110,11 @@ github.com/tidwall/pretty v1.2.0/go.mod 
h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhso github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= +golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic= @@ -134,9 +147,11 @@ google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGm google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE= +modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ= modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 h1:5D53IMaUuA5InSeMu9eJtlQXS2NxAhyWQvkKEgXZhHI= modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6/go.mod h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4= modernc.org/libc v1.41.0 h1:g9YAc6BkKlgORsUWj+JwqoB1wU3o4DE3bM3yvA3k+Gk= From 48ee2eabffb6059206176f8a53c19bec11e9d441 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 12:42:27 -0700 Subject: [PATCH 38/51] MINOR: [C#] Bump Google.Protobuf from 3.26.0 to 3.26.1 in /csharp (#40923) Bumps [Google.Protobuf](https://github.com/protocolbuffers/protobuf) from 3.26.0 to 3.26.1.
Commits
  • 2434ef2 Updating version.json and repo version numbers to: 26.1
  • 49253b1 Merge pull request #16308 from protocolbuffers/cp-26x-3
  • 9bf69ec Fix validateFeatures to be called after resolved features are actually set to...
  • b752bc2 Merge pull request #16307 from protocolbuffers/cp-26x-2
  • f7d2326 Merge pull request #16309 from protocolbuffers/cp-26x-4
  • 2e51ff6 Cherry-pick required label handling in JRuby field descriptor from https://gi...
  • a2f5303 Update cmake stalenes
  • 6a177d2 Merge branch '26.x' into cp-26x-4
  • 2d3d8ba Expand cpp_features_proto_srcs visibility
  • e1092ee Merge pull request #16294 from protocolbuffers/cp-26x
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Google.Protobuf&package-manager=nuget&previous-version=3.26.0&new-version=3.26.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index bd6ae7ad22b42..04b8a7dc734f0 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + From 9e320d7181fb5b7192d690b634a247c66132f864 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 2 Apr 2024 06:15:53 +0900 Subject: [PATCH 39/51] GH-39069: [C++][FS][Azure] Use the generic filesystem tests (#40567) ### Rationale for this change We should provide common spec for all filesystem API. ### What changes are included in this PR? Enable the generic filesystem tests. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #39069 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 117 +++++++-- cpp/src/arrow/filesystem/azurefs_test.cc | 319 +++++++++++++++-------- cpp/src/arrow/filesystem/test_util.cc | 30 ++- cpp/src/arrow/filesystem/test_util.h | 4 + 4 files changed, 333 insertions(+), 137 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 260478b068ed1..84733a824e7ba 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1591,7 +1591,9 @@ class AzureFileSystem::Impl { if (info.type() == FileType::NotFound) { return PathNotFound(location); } - DCHECK_EQ(info.type(), FileType::Directory); + if (info.type() != FileType::Directory) { + return NotADir(location); + } return Status::OK(); } @@ -1818,8 +1820,67 @@ class AzureFileSystem::Impl { const AzureLocation& location, bool recursive) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); - // Non-recursive CreateDir calls require the parent directory to exist. - if (!recursive) { + if (recursive) { + // Recursive CreateDir calls require that all path segments be + // either a directory or not found. + + // Check each path segment is a directory or not + // found. Nonexistent segments are collected to + // nonexistent_locations. We'll create directories for + // nonexistent segments later. + std::vector nonexistent_locations; + for (auto prefix = location; !prefix.path.empty(); prefix = prefix.parent()) { + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(container_client, prefix)); + if (info.type() == FileType::File) { + return NotADir(prefix); + } + if (info.type() == FileType::NotFound) { + nonexistent_locations.push_back(prefix); + } + } + // Ensure container exists + ARROW_ASSIGN_OR_RAISE(auto container, + AzureLocation::FromString(location.container)); + ARROW_ASSIGN_OR_RAISE(auto container_info, + GetContainerPropsAsFileInfo(container, container_client)); + if (container_info.type() == FileType::NotFound) { + try { + container_client.CreateIfNotExists(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to create directory '", + location.all, "': ", container_client.GetUrl()); + } + } + // Create nonexistent directories from shorter to longer: + // + // Example: + // + // * location: /container/a/b/c/d/ + // * Nonexistent path segments: + // * /container/a/ + // * /container/a/c/ + // * /container/a/c/d/ + // * target_locations: + // 1. 
/container/a/c/d/ + // 2. /container/a/c/ + // 3. /container/a/ + // + // Create order: + // 1. /container/a/ + // 2. /container/a/c/ + // 3. /container/a/c/d/ + for (size_t i = nonexistent_locations.size(); i > 0; --i) { + const auto& nonexistent_location = nonexistent_locations[i - 1]; + try { + create_if_not_exists(container_client, nonexistent_location); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to create directory '", + location.all, "': ", container_client.GetUrl()); + } + } + return Status::OK(); + } else { + // Non-recursive CreateDir calls require the parent directory to exist. auto parent = location.parent(); if (!parent.path.empty()) { RETURN_NOT_OK(CheckDirExists(container_client, parent)); @@ -1827,28 +1888,17 @@ class AzureFileSystem::Impl { // If the parent location is just the container, we don't need to check if it // exists because the operation we perform below will fail if the container // doesn't exist and we can handle that error according to the recursive flag. - } - try { - create_if_not_exists(container_client, location); - return Status::OK(); - } catch (const Storage::StorageException& exception) { - if (IsContainerNotFound(exception)) { - try { - if (recursive) { - container_client.CreateIfNotExists(); - create_if_not_exists(container_client, location); - return Status::OK(); - } else { - auto parent = location.parent(); - return PathNotFound(parent); - } - } catch (const Storage::StorageException& second_exception) { - return ExceptionToStatus(second_exception, "Failed to create directory '", - location.all, "': ", container_client.GetUrl()); + try { + create_if_not_exists(container_client, location); + return Status::OK(); + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + auto parent = location.parent(); + return PathNotFound(parent); } + return ExceptionToStatus(exception, "Failed to create directory '", location.all, + "': ", container_client.GetUrl()); } - return ExceptionToStatus(exception, "Failed to create directory '", location.all, - "': ", container_client.GetUrl()); } } @@ -2016,8 +2066,15 @@ class AzureFileSystem::Impl { bool found_dir_marker_blob = false; try { auto list_response = container_client.ListBlobs(options); - if (require_dir_to_exist && list_response.Blobs.empty()) { - return PathNotFound(location); + if (list_response.Blobs.empty()) { + if (require_dir_to_exist) { + return PathNotFound(location); + } else { + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(container_client, location)); + if (info.type() == FileType::File) { + return NotADir(location); + } + } } for (; list_response.HasPage(); list_response.MoveToNextPage()) { if (list_response.Blobs.empty()) { @@ -2732,6 +2789,16 @@ class AzureFileSystem::Impl { } auto dest_blob_client = GetBlobClient(dest.container, dest.path); auto src_url = GetBlobClient(src.container, src.path).GetUrl(); + if (!dest.path.empty()) { + auto dest_parent = dest.parent(); + if (!dest_parent.path.empty()) { + auto dest_container_client = GetBlobContainerClient(dest_parent.container); + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(dest_container_client, dest_parent)); + if (info.type() == FileType::File) { + return NotADir(dest_parent); + } + } + } try { dest_blob_client.CopyFromUri(src_url); } catch (const Storage::StorageException& exception) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 7ea5eb446bc12..24031e313f798 100644 --- 
a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -98,6 +98,7 @@ class BaseAzureEnv : public ::testing::Environment { virtual AzureBackend backend() const = 0; + virtual bool HasSubmitBatchBug() const { return false; } virtual bool WithHierarchicalNamespace() const { return false; } virtual Result GetDebugLogSize() { return 0; } @@ -207,6 +208,18 @@ class AzuriteEnv : public AzureEnvImpl { return self; } + /// Azurite has a bug that causes BlobContainerClient::SubmitBatch to fail on macOS. + /// SubmitBatch is used by: + /// - AzureFileSystem::DeleteDir + /// - AzureFileSystem::DeleteDirContents + bool HasSubmitBatchBug() const override { +#ifdef __APPLE__ + return true; +#else + return false; +#endif + } + Result GetDebugLogSize() override { ARROW_ASSIGN_OR_RAISE(auto exists, arrow::internal::FileExists(debug_log_path_)); if (!exists) { @@ -274,6 +287,186 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; +namespace { +Result MakeOptions(BaseAzureEnv* env) { + AzureOptions options; + options.account_name = env->account_name(); + switch (env->backend()) { + case AzureBackend::kAzurite: + options.blob_storage_authority = "127.0.0.1:10000"; + options.dfs_storage_authority = "127.0.0.1:10000"; + options.blob_storage_scheme = "http"; + options.dfs_storage_scheme = "http"; + break; + case AzureBackend::kAzure: + // Use the default values + break; + } + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential(env->account_key())); + return options; +} +} // namespace + +struct PreexistingData { + public: + using RNG = random::pcg32_fast; + + public: + const std::string container_name; + static constexpr char const* kObjectName = "test-object-name"; + + static constexpr char const* kLoremIpsum = R"""( +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. +)"""; + + public: + explicit PreexistingData(RNG& rng) : container_name{RandomContainerName(rng)} {} + + // Creates a path by concatenating the container name and the stem. 
+ std::string ContainerPath(std::string_view stem) const { return Path(stem); } + + // Short alias to ContainerPath() + std::string Path(std::string_view stem) const { + return ConcatAbstractPath(container_name, stem); + } + + std::string ObjectPath() const { return ContainerPath(kObjectName); } + std::string NotFoundObjectPath() const { return ContainerPath("not-found"); } + + std::string RandomDirectoryPath(RNG& rng) const { + return ContainerPath(RandomChars(32, rng)); + } + + // Utilities + static std::string RandomContainerName(RNG& rng) { return RandomChars(32, rng); } + + static std::string RandomChars(int count, RNG& rng) { + auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); + std::uniform_int_distribution d(0, static_cast(fillers.size()) - 1); + std::string s; + std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(rng)]; }); + return s; + } + + static int RandomIndex(int end, RNG& rng) { + return std::uniform_int_distribution(0, end - 1)(rng); + } + + static std::string RandomLine(int lineno, int width, RNG& rng) { + auto line = std::to_string(lineno) + ": "; + line += RandomChars(width - static_cast(line.size()) - 1, rng); + line += '\n'; + return line; + } +}; + +class TestGeneric : public ::testing::Test, public GenericFileSystemTest { + public: + void TearDown() override { + if (azure_fs_) { + ASSERT_OK(azure_fs_->DeleteDir(container_name_)); + } + } + + protected: + void SetUpInternal(BaseAzureEnv* env) { + env_ = env; + random::pcg32_fast rng((std::random_device()())); + container_name_ = PreexistingData::RandomContainerName(rng); + ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env_)); + ASSERT_OK_AND_ASSIGN(azure_fs_, AzureFileSystem::Make(options)); + ASSERT_OK(azure_fs_->CreateDir(container_name_, true)); + fs_ = std::make_shared(container_name_, azure_fs_); + } + + std::shared_ptr GetEmptyFileSystem() override { return fs_; } + + bool have_implicit_directories() const override { return true; } + bool allow_write_file_over_dir() const override { return true; } + bool allow_read_dir_as_file() const override { return true; } + bool allow_move_dir() const override { return false; } + bool allow_move_file() const override { return true; } + bool allow_append_to_file() const override { return true; } + bool have_directory_mtimes() const override { return false; } + bool have_flaky_directory_tree_deletion() const override { return false; } + bool have_file_metadata() const override { return true; } + // calloc() used in libxml2's xmlNewGlobalState() is detected as a + // memory leak like the following. But it's a false positive. It's + // used in ListBlobsByHierarchy() for GetFileInfo() and it's freed + // in the call. This is detected as a memory leak only with + // generator API (GetFileInfoGenerator()) and not detected with + // non-generator API (GetFileInfo()). So this is a false positive. 
+ // + // ==2875409==ERROR: LeakSanitizer: detected memory leaks + // + // Direct leak of 968 byte(s) in 1 object(s) allocated from: + // #0 0x55d02c967bdc in calloc (build/debug/arrow-azurefs-test+0x17bbdc) (BuildId: + // 520690d1b20e860cc1feef665dce8196e64f955e) #1 0x7fa914b1cd1e in xmlNewGlobalState + // builddir/main/../../threads.c:580:10 #2 0x7fa914b1cd1e in xmlGetGlobalState + // builddir/main/../../threads.c:666:31 + bool have_false_positive_memory_leak_with_generator() const override { return true; } + + BaseAzureEnv* env_; + std::shared_ptr azure_fs_; + std::shared_ptr fs_; + + private: + std::string container_name_; +}; + +class TestAzuriteGeneric : public TestGeneric { + public: + void SetUp() override { + ASSERT_OK_AND_ASSIGN(auto env, AzuriteEnv::GetInstance()); + SetUpInternal(env); + } + + protected: + // Azurite doesn't support moving files over containers. + bool allow_move_file() const override { return false; } + // DeleteDir() doesn't work with Azurite on macOS + bool have_flaky_directory_tree_deletion() const override { + return env_->HasSubmitBatchBug(); + } +}; + +class TestAzureFlatNSGeneric : public TestGeneric { + public: + void SetUp() override { + auto env_result = AzureFlatNSEnv::GetInstance(); + if (env_result.status().IsCancelled()) { + GTEST_SKIP() << env_result.status().message(); + } + ASSERT_OK_AND_ASSIGN(auto env, env_result); + SetUpInternal(env); + } + + protected: + // Flat namespace account doesn't support moving files over containers. + bool allow_move_file() const override { return false; } +}; + +class TestAzureHierarchicalNSGeneric : public TestGeneric { + public: + void SetUp() override { + auto env_result = AzureHierarchicalNSEnv::GetInstance(); + if (env_result.status().IsCancelled()) { + GTEST_SKIP() << env_result.status().message(); + } + ASSERT_OK_AND_ASSIGN(auto env, env_result); + SetUpInternal(env); + } +}; + +GENERIC_FS_TEST_FUNCTIONS(TestAzuriteGeneric); +GENERIC_FS_TEST_FUNCTIONS(TestAzureFlatNSGeneric); +GENERIC_FS_TEST_FUNCTIONS(TestAzureHierarchicalNSGeneric); + TEST(AzureFileSystem, InitializingFilesystemWithoutAccountNameFails) { AzureOptions options; ASSERT_RAISES(Invalid, options.ConfigureAccountKeyCredential("account_key")); @@ -532,64 +725,6 @@ TEST_F(TestAzureOptions, FromUriInvalidQueryParameter) { TestFromUriInvalidQueryParameter(); } -struct PreexistingData { - public: - using RNG = random::pcg32_fast; - - public: - const std::string container_name; - static constexpr char const* kObjectName = "test-object-name"; - - static constexpr char const* kLoremIpsum = R"""( -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor -incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis -nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu -fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in -culpa qui officia deserunt mollit anim id est laborum. -)"""; - - public: - explicit PreexistingData(RNG& rng) : container_name{RandomContainerName(rng)} {} - - // Creates a path by concatenating the container name and the stem. 
- std::string ContainerPath(std::string_view stem) const { return Path(stem); } - - // Short alias to ContainerPath() - std::string Path(std::string_view stem) const { - return ConcatAbstractPath(container_name, stem); - } - - std::string ObjectPath() const { return ContainerPath(kObjectName); } - std::string NotFoundObjectPath() const { return ContainerPath("not-found"); } - - std::string RandomDirectoryPath(RNG& rng) const { - return ContainerPath(RandomChars(32, rng)); - } - - // Utilities - static std::string RandomContainerName(RNG& rng) { return RandomChars(32, rng); } - - static std::string RandomChars(int count, RNG& rng) { - auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); - std::uniform_int_distribution d(0, static_cast(fillers.size()) - 1); - std::string s; - std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(rng)]; }); - return s; - } - - static int RandomIndex(int end, RNG& rng) { - return std::uniform_int_distribution(0, end - 1)(rng); - } - - static std::string RandomLine(int lineno, int width, RNG& rng) { - auto line = std::to_string(lineno) + ": "; - line += RandomChars(width - static_cast(line.size()) - 1, rng); - line += '\n'; - return line; - } -}; - class TestAzureFileSystem : public ::testing::Test { protected: // Set in constructor @@ -621,24 +756,6 @@ class TestAzureFileSystem : public ::testing::Test { return fs(CachedHNSSupport(*env)); } - static Result MakeOptions(BaseAzureEnv* env) { - AzureOptions options; - options.account_name = env->account_name(); - switch (env->backend()) { - case AzureBackend::kAzurite: - options.blob_storage_authority = "127.0.0.1:10000"; - options.dfs_storage_authority = "127.0.0.1:10000"; - options.blob_storage_scheme = "http"; - options.dfs_storage_scheme = "http"; - break; - case AzureBackend::kAzure: - // Use the default values - break; - } - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential(env->account_key())); - return options; - } - void SetUp() override { auto make_options = [this]() -> Result { ARROW_ASSIGN_OR_RAISE(auto env, GetAzureEnv()); @@ -824,19 +941,6 @@ class TestAzureFileSystem : public ::testing::Test { "This test is affected by an Azurite issue: " "https://github.com/Azure/Azurite/pull/2302"; - /// Azurite has a bug that causes BlobContainerClient::SubmitBatch to fail on macOS. - /// SubmitBatch is used by: - /// - AzureFileSystem::DeleteDir - /// - AzureFileSystem::DeleteDirContents - bool HasSubmitBatchBug() const { -#ifdef __APPLE__ - EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); - return env->backend() == AzureBackend::kAzurite; -#else - return false; -#endif - } - static bool WithErrno(const Status& status, int expected_errno) { auto* detail = status.detail().get(); return detail && @@ -1059,9 +1163,7 @@ class TestAzureFileSystem : public ::testing::Test { auto path2 = data.Path("directory2"); ASSERT_OK(fs()->OpenOutputStream(path2)); - // CreateDir returns OK even if there is already a file or directory at this - // location. Whether or not this is the desired behaviour is debatable. 
- ASSERT_OK(fs()->CreateDir(path2)); + ASSERT_RAISES(IOError, fs()->CreateDir(path2)); AssertFileInfo(fs(), path2, FileType::File); } @@ -1070,7 +1172,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessEmpty() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1090,7 +1193,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessHaveBlob() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1105,7 +1209,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestNonEmptyDirWithTrailingSlash() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1120,7 +1225,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessHaveDirectory() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1135,7 +1241,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessExist() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto preexisting_data = SetUpPreexistingData(); @@ -1149,7 +1256,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessExistWithTrailingSlash() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto preexisting_data = SetUpPreexistingData(); @@ -1163,7 +1271,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessNonexistent() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2174,7 +2283,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2185,7 +2295,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2213,7 +2324,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirUri) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2228,7 +2340,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, 
GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 040917dcd218a..19226ce01ae2f 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -252,8 +252,7 @@ void GenericFileSystemTest::TestCreateDir(FileSystem* fs) { } void GenericFileSystemTest::TestDeleteDir(FileSystem* fs) { - if (have_flaky_directory_tree_deletion()) - GTEST_SKIP() << "Flaky directory deletion on Windows"; + if (have_flaky_directory_tree_deletion()) GTEST_SKIP() << "Flaky directory deletion"; ASSERT_OK(fs->CreateDir("AB/CD/EF")); ASSERT_OK(fs->CreateDir("AB/GH/IJ")); @@ -281,8 +280,7 @@ void GenericFileSystemTest::TestDeleteDir(FileSystem* fs) { } void GenericFileSystemTest::TestDeleteDirContents(FileSystem* fs) { - if (have_flaky_directory_tree_deletion()) - GTEST_SKIP() << "Flaky directory deletion on Windows"; + if (have_flaky_directory_tree_deletion()) GTEST_SKIP() << "Flaky directory deletion"; ASSERT_OK(fs->CreateDir("AB/CD/EF")); ASSERT_OK(fs->CreateDir("AB/GH/IJ")); @@ -313,6 +311,8 @@ void GenericFileSystemTest::TestDeleteDirContents(FileSystem* fs) { } void GenericFileSystemTest::TestDeleteRootDirContents(FileSystem* fs) { + if (have_flaky_directory_tree_deletion()) GTEST_SKIP() << "Flaky directory deletion"; + ASSERT_OK(fs->CreateDir("AB/CD")); CreateFile(fs, "AB/abc", ""); @@ -323,9 +323,7 @@ void GenericFileSystemTest::TestDeleteRootDirContents(FileSystem* fs) { AssertAllDirs(fs, {"AB", "AB/CD"}); AssertAllFiles(fs, {"AB/abc"}); } else { - if (!have_flaky_directory_tree_deletion()) { - AssertAllDirs(fs, {}); - } + AssertAllDirs(fs, {}); AssertAllFiles(fs, {}); } } @@ -385,6 +383,10 @@ void GenericFileSystemTest::TestDeleteFiles(FileSystem* fs) { } void GenericFileSystemTest::TestMoveFile(FileSystem* fs) { + if (!allow_move_file()) { + GTEST_SKIP() << "Filesystem doesn't allow moving files"; + } + ASSERT_OK(fs->CreateDir("AB/CD")); ASSERT_OK(fs->CreateDir("EF")); CreateFile(fs, "abc", "data"); @@ -750,6 +752,12 @@ void GenericFileSystemTest::TestGetFileInfoSelector(FileSystem* fs) { } void GenericFileSystemTest::TestGetFileInfoGenerator(FileSystem* fs) { +#ifdef ADDRESS_SANITIZER + if (have_false_positive_memory_leak_with_generator()) { + GTEST_SKIP() << "Filesystem have false positive memory leak with generator"; + } +#endif + ASSERT_OK(fs->CreateDir("AB/CD")); CreateFile(fs, "abc", "data"); CreateFile(fs, "AB/def", "some data"); @@ -1177,8 +1185,12 @@ void GenericFileSystemTest::TestSpecialChars(FileSystem* fs) { AssertFileContents(fs, "Special and%different.txt", "data"); ASSERT_OK(fs->DeleteFile("Special and%different.txt")); - ASSERT_OK(fs->DeleteDir("Blank Char")); - AssertAllDirs(fs, {}); + if (have_flaky_directory_tree_deletion()) { + ASSERT_OK(fs->DeleteFile("Blank Char/Special%Char.txt")); + } else { + ASSERT_OK(fs->DeleteDir("Blank Char")); + AssertAllDirs(fs, {}); + } AssertAllFiles(fs, {}); } diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index 62b488e159a24..e70c787aa85c4 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -168,6 +168,8 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { virtual bool allow_write_file_over_dir() const { return false; } // - Whether the filesystem allows reading a directory virtual bool allow_read_dir_as_file() const { return false; } + // - Whether the filesystem 
allows moving a file + virtual bool allow_move_file() const { return true; } // - Whether the filesystem allows moving a directory virtual bool allow_move_dir() const { return true; } // - Whether the filesystem allows moving a directory "over" a non-empty destination @@ -182,6 +184,8 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { virtual bool have_flaky_directory_tree_deletion() const { return false; } // - Whether the filesystem stores some metadata alongside files virtual bool have_file_metadata() const { return false; } + // - Whether the filesystem has a false positive memory leak with generator + virtual bool have_false_positive_memory_leak_with_generator() const { return false; } void TestEmpty(FileSystem* fs); void TestNormalizePath(FileSystem* fs); From 06f305e5adb1fa660e16e0a8ed4421e4a8eb036d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 2 Apr 2024 06:17:03 +0900 Subject: [PATCH 40/51] GH-40882: [C++] Suppress shorten-64-to-32 warnings in CUDA/Skyhook codes (#40883) ### Rationale for this change ```text cpp/src/arrow/gpu/cuda_memory.cc:497:72: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); ~~~~~ ^~~~~~~~~ ``` ```text cpp/src/arrow/gpu/cuda_memory.cc:508:68: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); ~~~~~ ^~~~~~~~~ ``` ```text cpp/src/skyhook/protocol/skyhook_protocol.cc:109:69: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'unsigned int' [-Werror,-Wshorten-64-to-32] bl->append(reinterpret_cast(buffer->data()), buffer->size()); ~~~~~~ ~~~~~~~~^~~~~~ ``` ```text cpp/src/skyhook/cls/cls_skyhook.cc:87:37: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] cls_cxx_read(hctx_, position, nbytes, bl.get()); ~~~~~~~~~~~~ ^~~~~~ cpp/src/skyhook/cls/cls_skyhook.cc:87:27: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] cls_cxx_read(hctx_, position, nbytes, bl.get()); ~~~~~~~~~~~~ ^~~~~~~~ ``` ```text cpp/src/skyhook/protocol/skyhook_protocol.cc:109:69: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'unsigned int' [-Werror,-Wshorten-64-to-32] bl->append(reinterpret_cast(buffer->data()), buffer->size()); ~~~~~~ ``` ### What changes are included in this PR? Add casts explicitly. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
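For illustration only (this sketch is not part of the patch), the warning and the cast-based fix follow this general pattern; `TakesInt` below is a hypothetical stand-in for a 32-bit-only callee such as `CudaDevice::Make`:

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical 32-bit-only callee, standing in for APIs like
// arrow::cuda::CudaDevice::Make(int) or cls_cxx_read(..., int, int, ...).
void TakesInt(int device_id) { std::cout << "device " << device_id << "\n"; }

int main() {
  int64_t device_id = 0;
  // TakesInt(device_id);                 // warns under -Wshorten-64-to-32:
  //                                      // implicit int64_t -> int conversion
  TakesInt(static_cast<int>(device_id));  // explicit cast, warning-free
  return 0;
}
```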
* GitHub Issue: #40882 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/gpu/cuda_memory.cc | 6 ++++-- cpp/src/skyhook/cls/cls_skyhook.cc | 2 +- cpp/src/skyhook/protocol/skyhook_protocol.cc | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index 6972321006a9a..dcf0a31963e45 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -494,7 +494,8 @@ Result> DefaultMemoryMapper(ArrowDeviceType devic case ARROW_DEVICE_CUDA: case ARROW_DEVICE_CUDA_HOST: case ARROW_DEVICE_CUDA_MANAGED: { - ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); + ARROW_ASSIGN_OR_RAISE(auto device, + arrow::cuda::CudaDevice::Make(static_cast(device_id))); return device->default_memory_manager(); } default: @@ -505,7 +506,8 @@ Result> DefaultMemoryMapper(ArrowDeviceType devic namespace { Result> DefaultCUDADeviceMapper(int64_t device_id) { - ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); + ARROW_ASSIGN_OR_RAISE(auto device, + arrow::cuda::CudaDevice::Make(static_cast(device_id))); return device->default_memory_manager(); } diff --git a/cpp/src/skyhook/cls/cls_skyhook.cc b/cpp/src/skyhook/cls/cls_skyhook.cc index 24f80c79d5730..e021cb3c8248a 100644 --- a/cpp/src/skyhook/cls/cls_skyhook.cc +++ b/cpp/src/skyhook/cls/cls_skyhook.cc @@ -84,7 +84,7 @@ class RandomAccessObject : public arrow::io::RandomAccessFile { if (nbytes > 0) { std::shared_ptr bl = std::make_shared(); - cls_cxx_read(hctx_, position, nbytes, bl.get()); + cls_cxx_read(hctx_, static_cast(position), static_cast(nbytes), bl.get()); chunks_.push_back(bl); return std::make_shared((uint8_t*)bl->c_str(), bl->length()); } diff --git a/cpp/src/skyhook/protocol/skyhook_protocol.cc b/cpp/src/skyhook/protocol/skyhook_protocol.cc index 3b1234c6ed913..b91a9bfdd2ecb 100644 --- a/cpp/src/skyhook/protocol/skyhook_protocol.cc +++ b/cpp/src/skyhook/protocol/skyhook_protocol.cc @@ -106,7 +106,8 @@ arrow::Status SerializeTable(const std::shared_ptr& table, ARROW_RETURN_NOT_OK(writer->Close()); ARROW_ASSIGN_OR_RAISE(auto buffer, buffer_output_stream->Finish()); - bl->append(reinterpret_cast(buffer->data()), buffer->size()); + bl->append(reinterpret_cast(buffer->data()), + static_cast(buffer->size())); return arrow::Status::OK(); } From 757ee7a910b9380bd0821a34ac123dec2e53ced0 Mon Sep 17 00:00:00 2001 From: carehabit <165479941+carehabit@users.noreply.github.com> Date: Tue, 2 Apr 2024 08:08:24 +0800 Subject: [PATCH 41/51] MINOR: [Docs] Remove repetitive words (#40914) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? Authored-by: carehabit Signed-off-by: Sutou Kouhei --- cpp/src/arrow/vendored/datetime/tz.cpp | 2 +- cpp/src/arrow/vendored/pcg/pcg_random.hpp | 4 ++-- docs/source/developers/release.rst | 2 +- docs/source/format/ADBC.rst | 2 +- python/pyarrow/src/arrow/python/python_to_arrow.cc | 2 +- r/R/dplyr-arrange.R | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/vendored/datetime/tz.cpp b/cpp/src/arrow/vendored/datetime/tz.cpp index 6962a8b3c3572..e94c1bc8ae682 100644 --- a/cpp/src/arrow/vendored/datetime/tz.cpp +++ b/cpp/src/arrow/vendored/datetime/tz.cpp @@ -118,7 +118,7 @@ #include #include -// unistd.h is used on some platforms as part of the the means to get +// unistd.h is used on some platforms as part of the means to get // the current time zone. 
On Win32 windows.h provides a means to do it. // gcc/mingw supports unistd.h on Win32 but MSVC does not. diff --git a/cpp/src/arrow/vendored/pcg/pcg_random.hpp b/cpp/src/arrow/vendored/pcg/pcg_random.hpp index a864ba0a2c59b..e39e61e908a2a 100644 --- a/cpp/src/arrow/vendored/pcg/pcg_random.hpp +++ b/cpp/src/arrow/vendored/pcg/pcg_random.hpp @@ -1900,7 +1900,7 @@ typedef pcg_engines::ext_oneseq_xsh_rs_64_32<1,32,true> pcg32_k2_fast; // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // -// (just how good the cryptographic security is is an open question) +// (just how good the cryptographic security is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true> pcg32_k64; typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,true> pcg32_k64_oneseq; @@ -1923,7 +1923,7 @@ typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,false> pcg64_c32_fast; // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // -// (just how good the cryptographic security is is an open question) +// (just how good the cryptographic security is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,true> pcg32_k1024; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,true> pcg32_k1024_fast; diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 09608f2834478..e7431ce0fb7b9 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -131,7 +131,7 @@ branch from main. Follow up Release Candidates will update the maintenance branch by cherry-picking specific commits. -For the the initial Release Candidate for a minor or a patch release we will create +For the initial Release Candidate for a minor or a patch release we will create a maintenance branch from the previous corresponding release. For example, for a 15.0.1 patch we will create a maint-15.0.1 branch from maint-15.0.0 and for a maint-15.0.2 we will create it from maint-15.0.1. Once the maintenance branch is diff --git a/docs/source/format/ADBC.rst b/docs/source/format/ADBC.rst index f90ab24d1b9c2..41aa08ddbfb32 100644 --- a/docs/source/format/ADBC.rst +++ b/docs/source/format/ADBC.rst @@ -92,7 +92,7 @@ implemented directly by a vendor-specific "driver" or a vendor-neutral Version 1.0.0 of the standard corresponds to tag adbc-1.0.0 of the repository ``apache/arrow-adbc``, which is commit -f044edf5256abfb4c091b0ad2acc73afea2c93c0_. Note that is is separate +f044edf5256abfb4c091b0ad2acc73afea2c93c0_. Note that is separate from releases of the actual implementations. See the language-specific pages for details: diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 902814a4e91f1..79da47567bf24 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -405,7 +405,7 @@ class PyValue { RETURN_NOT_OK(PopulateMonthDayNano::Field( obj, &output.months, &found_attrs)); // on relativeoffset weeks is a property calculated from days. On - // DateOffset is is a field on its own. timedelta doesn't have a weeks + // DateOffset is a field on its own. timedelta doesn't have a weeks // attribute. 
PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType(); bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj); diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index e3e20f2cb3ac3..f91cd14211e0f 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -24,7 +24,7 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { exprs <- expand_across(.data, quos(...)) if (.by_group) { - # when the data is is grouped and .by_group is TRUE, order the result by + # when the data is grouped and .by_group is TRUE, order the result by # the grouping columns first exprs <- c(quos(!!!dplyr::groups(.data)), exprs) } From a0cfc258901942af27351f4ed20b3d233a9a1f0b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 09:37:49 +0900 Subject: [PATCH 42/51] MINOR: [CI] Bump actions/setup-python from 5.0.0 to 5.1.0 (#40917) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.0.0 to 5.1.0.
Release notes

Sourced from actions/setup-python's releases.

v5.1.0



Full Changelog: https://github.com/actions/setup-python/compare/v5.0.0...v5.1.0


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/setup-python&package-manager=github_actions&previous-version=5.0.0&new-version=5.1.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .github/workflows/archery.yml | 2 +- .github/workflows/comment_bot.yml | 2 +- .github/workflows/cpp.yml | 4 ++-- .github/workflows/dev.yml | 4 ++-- .github/workflows/docs.yml | 2 +- .github/workflows/docs_light.yml | 2 +- .github/workflows/go.yml | 6 +++--- .github/workflows/integration.yml | 2 +- .github/workflows/java.yml | 2 +- .github/workflows/java_jni.yml | 4 ++-- .github/workflows/java_nightly.yml | 2 +- .github/workflows/js.yml | 2 +- .github/workflows/pr_bot.yml | 2 +- .github/workflows/python.yml | 4 ++-- .github/workflows/r.yml | 4 ++-- .github/workflows/r_nightly.yml | 2 +- .github/workflows/ruby.yml | 2 +- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index dbd24796db52b..cb783dd66c3fb 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -57,7 +57,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: '3.12' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 038a468a81276..a34856d2dc81a 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -41,7 +41,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 3036d06d5d7b2..e8e41f1bcb90c 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -237,7 +237,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -458,7 +458,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: 3.9 - name: Install Google Cloud Storage Testbench diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 3a48270a97c9a..37fda2e313ae2 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -42,7 +42,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install pre-commit @@ -101,7 +101,7 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.12' - name: Install Ruby diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 82b43ee2363b5..9c7701f25f756 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -51,7 +51,7 @@ jobs: key: ubuntu-docs-${{ hashFiles('cpp/**') }} restore-keys: 
ubuntu-docs- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 306fc5135073d..6ec4c3d53d0e3 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -57,7 +57,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 7ff781d35e8ec..7fca38528260f 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -201,7 +201,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -241,7 +241,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -333,7 +333,7 @@ jobs: github.event_name == 'push' && github.repository == 'apache/arrow' && github.ref_name == 'main' - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.10' - name: Run Benchmarks diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index adb6fb2b57c75..0f186ff6a4527 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -81,7 +81,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index a14977525b6c6..423f54cd93547 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -75,7 +75,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 46f3381ed0e8f..790ffd5c650e0 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -69,7 +69,7 @@ jobs: key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} restore-keys: java-jni-manylinux-2014- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -109,7 +109,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: 
actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index c535dc4a07de3..f40d4ce5b42d6 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -58,7 +58,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 304eba41e4d37..dab89da44c861 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -51,7 +51,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 6af7dbe7680f5..e589610f536b3 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 25d918bcc25aa..1147ac13e6f93 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -100,7 +100,7 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -162,7 +162,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: '3.11' - name: Install Dependencies diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8c47915b7b6d3..78677499f3e45 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -142,7 +142,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -203,7 +203,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 6629b5c8a5673..af5382f90834c 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: 
actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 74d56895f4c34..311c1c822baf6 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -82,7 +82,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery From aaacefa6b6986916256e0e7002bfcfed293443c4 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 1 Apr 2024 21:56:32 -0400 Subject: [PATCH 43/51] GH-40896: [Java] Remove runtime dependencies on Eclipse, logback (#40904) ### Rationale for this change Remove runtime dependencies on [Category B](https://apache.org/legal/resolved.html#category-b) dependencies. ### What changes are included in this PR? - logback: move to test-only - eclipse: remove dependency, vendor the Netty implementation we originally used I wanted to remove javax.annotation.Generated but gRPC doesn't yet let us do that (https://github.com/grpc/grpc-java/issues/9179). That's ~okay though since effectively that's a build only dependency. ### Are these changes tested? #40901 ### Are there any user-facing changes? No. **This PR contains a "Critical Fix".** License issues do not cause runtime issues but are important as an Apache project. * GitHub Issue: #40896 Authored-by: David Li Signed-off-by: Sutou Kouhei --- LICENSE.txt | 7 + dev/release/rat_exclude_files.txt | 2 + java/dev/checkstyle/suppressions.xml | 3 + java/tools/pom.xml | 2 +- java/vector/pom.xml | 5 - java/vector/src/main/java/module-info.java | 1 - .../arrow/vector/util/IntObjectHashMap.java | 736 ++++++++++++++++++ .../arrow/vector/util/IntObjectMap.java | 87 +++ .../arrow/vector/util/MapWithOrdinalImpl.java | 2 - .../vector/util/MultiMapWithOrdinal.java | 2 - 10 files changed, 836 insertions(+), 11 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java diff --git a/LICENSE.txt b/LICENSE.txt index 0423854567b26..7bb1330a1002b 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2252,3 +2252,10 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java + +These file are derived from code from Netty, which is made available under the +Apache License 2.0. 
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 4f86a12afe4fb..f4d7b411c4dc2 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -80,6 +80,8 @@ go/parquet/internal/gen-go/parquet/GoUnusedProtection__.go go/parquet/internal/gen-go/parquet/parquet-consts.go go/parquet/internal/gen-go/parquet/parquet.go go/parquet/version_string.go +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java js/.npmignore js/closure-compiler-scripts/* js/src/fb/*.ts diff --git a/java/dev/checkstyle/suppressions.xml b/java/dev/checkstyle/suppressions.xml index a3536e2ca9212..e8669c54e61fd 100644 --- a/java/dev/checkstyle/suppressions.xml +++ b/java/dev/checkstyle/suppressions.xml @@ -36,6 +36,9 @@ + + + diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 0688fae1ab78c..9b55f07c013d3 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -52,7 +52,7 @@ ch.qos.logback logback-classic 1.3.14 - runtime + test
com.fasterxml.jackson.core diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 5cd6d0a00fcca..20af3dbd38443 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -74,11 +74,6 @@ org.slf4j slf4j-api - - org.eclipse.collections - eclipse-collections - 11.1.0 -
diff --git a/java/vector/src/main/java/module-info.java b/java/vector/src/main/java/module-info.java index 20f7094715f4d..e2ebcd1e86740 100644 --- a/java/vector/src/main/java/module-info.java +++ b/java/vector/src/main/java/module-info.java @@ -45,6 +45,5 @@ requires org.apache.arrow.format; requires org.apache.arrow.memory.core; requires org.apache.commons.codec; - requires org.eclipse.collections.impl; requires org.slf4j; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java new file mode 100644 index 0000000000000..f3d0fb628edf0 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java @@ -0,0 +1,736 @@ +/* + * Copyright 2014 The Netty Project + * + * The Netty Project licenses this file to you under the Apache License, version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at: + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.AbstractCollection; +import java.util.AbstractSet; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Set; + +/** + * A vendored specialized copy of Netty's IntObjectHashMap for use within Arrow. + * Avoids requiring Netty in the Arrow core just for this one class. + * + * @param The value type stored in the map. + */ +class IntObjectHashMap implements IntObjectMap { + + /** + * Default initial capacity. Used if not specified in the constructor + */ + public static final int DEFAULT_CAPACITY = 8; + + /** + * Default load factor. Used if not specified in the constructor + */ + public static final float DEFAULT_LOAD_FACTOR = 0.5f; + + /** + * Placeholder for null values, so we can use the actual null to mean available. + * (Better than using a placeholder for available: less references for GC processing.) + */ + private static final Object NULL_VALUE = new Object(); + + /** + * The maximum number of elements allowed without allocating more space. + */ + private int maxSize; + + /** + * The load factor for the map. Used to calculate {@link #maxSize}. + */ + private final float loadFactor; + + private int[] keys; + private V[] values; + private int size; + private int mask; + + private final Set keySet = new KeySet(); + private final Set> entrySet = new EntrySet(); + private final Iterable> entries = new Iterable>() { + @Override + public Iterator> iterator() { + return new PrimitiveIterator(); + } + }; + + public IntObjectHashMap() { + this(DEFAULT_CAPACITY, DEFAULT_LOAD_FACTOR); + } + + public IntObjectHashMap(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public IntObjectHashMap(int initialCapacity, float loadFactor) { + if (loadFactor <= 0.0f || loadFactor > 1.0f) { + // Cannot exceed 1 because we can never store more than capacity elements; + // using a bigger loadFactor would trigger rehashing before the desired load is reached. 
+ throw new IllegalArgumentException("loadFactor must be > 0 and <= 1"); + } + + this.loadFactor = loadFactor; + + // Adjust the initial capacity if necessary. + int capacity = safeFindNextPositivePowerOfTwo(initialCapacity); + mask = capacity - 1; + + // Allocate the arrays. + keys = new int[capacity]; + @SuppressWarnings({"unchecked", "SuspiciousArrayCast"}) + V[] temp = (V[]) new Object[capacity]; + values = temp; + + // Initialize the maximum size value. + maxSize = calcMaxSize(capacity); + } + + private static T toExternal(T value) { + assert value != null : "null is not a legitimate internal value. Concurrent Modification?"; + return value == NULL_VALUE ? null : value; + } + + @SuppressWarnings("unchecked") + private static T toInternal(T value) { + return value == null ? (T) NULL_VALUE : value; + } + + @Override + public V get(int key) { + int index = indexOf(key); + return index == -1 ? null : toExternal(values[index]); + } + + @Override + public V put(int key, V value) { + int startIndex = hashIndex(key); + int index = startIndex; + + for (; ; ) { + if (values[index] == null) { + // Found empty slot, use it. + keys[index] = key; + values[index] = toInternal(value); + growSize(); + return null; + } + if (keys[index] == key) { + // Found existing entry with this key, just replace the value. + V previousValue = values[index]; + values[index] = toInternal(value); + return toExternal(previousValue); + } + + // Conflict, keep probing ... + if ((index = probeNext(index)) == startIndex) { + // Can only happen if the map was full at MAX_ARRAY_SIZE and couldn't grow. + throw new IllegalStateException("Unable to insert"); + } + } + } + + @Override + public void putAll(Map sourceMap) { + if (sourceMap instanceof IntObjectHashMap) { + // Optimization - iterate through the arrays. + @SuppressWarnings("unchecked") + IntObjectHashMap source = (IntObjectHashMap) sourceMap; + for (int i = 0; i < source.values.length; ++i) { + V sourceValue = source.values[i]; + if (sourceValue != null) { + put(source.keys[i], sourceValue); + } + } + return; + } + + // Otherwise, just add each entry. + for (Entry entry : sourceMap.entrySet()) { + put(entry.getKey(), entry.getValue()); + } + } + + @Override + public V remove(int key) { + int index = indexOf(key); + if (index == -1) { + return null; + } + + V prev = values[index]; + removeAt(index); + return toExternal(prev); + } + + @Override + public int size() { + return size; + } + + @Override + public boolean isEmpty() { + return size == 0; + } + + @Override + public void clear() { + Arrays.fill(keys, (int) 0); + Arrays.fill(values, null); + size = 0; + } + + @Override + public boolean containsKey(int key) { + return indexOf(key) >= 0; + } + + @Override + public boolean containsValue(Object value) { + @SuppressWarnings("unchecked") + V v1 = toInternal((V) value); + for (V v2 : values) { + // The map supports null values; this will be matched as NULL_VALUE.equals(NULL_VALUE). 
+ if (v2 != null && v2.equals(v1)) { + return true; + } + } + return false; + } + + @Override + public Iterable> entries() { + return entries; + } + + @Override + public Collection values() { + return new AbstractCollection() { + @Override + public Iterator iterator() { + return new Iterator() { + final PrimitiveIterator iter = new PrimitiveIterator(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public V next() { + return iter.next().value(); + } + + @Override + public void remove() { + iter.remove(); + } + }; + } + + @Override + public int size() { + return size; + } + }; + } + + @Override + public int hashCode() { + // Hashcode is based on all non-zero, valid keys. We have to scan the whole keys + // array, which may have different lengths for two maps of same size(), so the + // capacity cannot be used as input for hashing but the size can. + int hash = size; + for (int key : keys) { + // 0 can be a valid key or unused slot, but won't impact the hashcode in either case. + // This way we can use a cheap loop without conditionals, or hard-to-unroll operations, + // or the devastatingly bad memory locality of visiting value objects. + // Also, it's important to use a hash function that does not depend on the ordering + // of terms, only their values; since the map is an unordered collection and + // entries can end up in different positions in different maps that have the same + // elements, but with different history of puts/removes, due to conflicts. + hash ^= hashCode(key); + } + return hash; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof IntObjectMap)) { + return false; + } + @SuppressWarnings("rawtypes") + IntObjectMap other = (IntObjectMap) obj; + if (size != other.size()) { + return false; + } + for (int i = 0; i < values.length; ++i) { + V value = values[i]; + if (value != null) { + int key = keys[i]; + Object otherValue = other.get(key); + if (value == NULL_VALUE) { + if (otherValue != null) { + return false; + } + } else if (!value.equals(otherValue)) { + return false; + } + } + } + return true; + } + + @Override + public boolean containsKey(Object key) { + return containsKey(objectToKey(key)); + } + + @Override + public V get(Object key) { + return get(objectToKey(key)); + } + + @Override + public V put(Integer key, V value) { + return put(objectToKey(key), value); + } + + @Override + public V remove(Object key) { + return remove(objectToKey(key)); + } + + @Override + public Set keySet() { + return keySet; + } + + @Override + public Set> entrySet() { + return entrySet; + } + + private int objectToKey(Object key) { + return (int) (Integer) key; + } + + /** + * Locates the index for the given key. This method probes using double hashing. + * + * @param key the key for an entry in the map. + * @return the index where the key was found, or {@code -1} if no entry is found for that key. + */ + private int indexOf(int key) { + int startIndex = hashIndex(key); + int index = startIndex; + + for (; ; ) { + if (values[index] == null) { + // It's available, so no chance that this value exists anywhere in the map. + return -1; + } + if (key == keys[index]) { + return index; + } + + // Conflict, keep probing ... + if ((index = probeNext(index)) == startIndex) { + return -1; + } + } + } + + /** + * Returns the hashed index for the given key. 
+ */ + private int hashIndex(int key) { + // The array lengths are always a power of two, so we can use a bitmask to stay inside the array bounds. + return hashCode(key) & mask; + } + + /** + * Returns the hash code for the key. + */ + private static int hashCode(int key) { + return key; + } + + /** + * Get the next sequential index after {@code index} and wraps if necessary. + */ + private int probeNext(int index) { + // The array lengths are always a power of two, so we can use a bitmask to stay inside the array bounds. + return (index + 1) & mask; + } + + /** + * Grows the map size after an insertion. If necessary, performs a rehash of the map. + */ + private void growSize() { + size++; + + if (size > maxSize) { + if (keys.length == Integer.MAX_VALUE) { + throw new IllegalStateException("Max capacity reached at size=" + size); + } + + // Double the capacity. + rehash(keys.length << 1); + } + } + + /** + * Removes entry at the given index position. Also performs opportunistic, incremental rehashing + * if necessary to not break conflict chains. + * + * @param index the index position of the element to remove. + * @return {@code true} if the next item was moved back. {@code false} otherwise. + */ + private boolean removeAt(final int index) { + --size; + // Clearing the key is not strictly necessary (for GC like in a regular collection), + // but recommended for security. The memory location is still fresh in the cache anyway. + keys[index] = 0; + values[index] = null; + + // In the interval from index to the next available entry, the arrays may have entries + // that are displaced from their base position due to prior conflicts. Iterate these + // entries and move them back if possible, optimizing future lookups. + // Knuth Section 6.4 Algorithm R, also used by the JDK's IdentityHashMap. + + int nextFree = index; + int i = probeNext(index); + for (V value = values[i]; value != null; value = values[i = probeNext(i)]) { + int key = keys[i]; + int bucket = hashIndex(key); + if (i < bucket && (bucket <= nextFree || nextFree <= i) || + bucket <= nextFree && nextFree <= i) { + // Move the displaced entry "back" to the first available position. + keys[nextFree] = key; + values[nextFree] = value; + // Put the first entry after the displaced entry + keys[i] = 0; + values[i] = null; + nextFree = i; + } + } + return nextFree != index; + } + + /** + * Calculates the maximum size allowed before rehashing. + */ + private int calcMaxSize(int capacity) { + // Clip the upper bound so that there will always be at least one available slot. + int upperBound = capacity - 1; + return Math.min(upperBound, (int) (capacity * loadFactor)); + } + + /** + * Rehashes the map for the given capacity. + * + * @param newCapacity the new capacity for the map. + */ + private void rehash(int newCapacity) { + int[] oldKeys = keys; + V[] oldVals = values; + + keys = new int[newCapacity]; + @SuppressWarnings({"unchecked", "SuspiciousArrayCast"}) + V[] temp = (V[]) new Object[newCapacity]; + values = temp; + + maxSize = calcMaxSize(newCapacity); + mask = newCapacity - 1; + + // Insert to the new arrays. + for (int i = 0; i < oldVals.length; ++i) { + V oldVal = oldVals[i]; + if (oldVal != null) { + // Inlined put(), but much simpler: we don't need to worry about + // duplicated keys, growing/rehashing, or failing to insert. + int oldKey = oldKeys[i]; + int index = hashIndex(oldKey); + + for (; ; ) { + if (values[index] == null) { + keys[index] = oldKey; + values[index] = oldVal; + break; + } + + // Conflict, keep probing. 
Can wrap around, but never reaches startIndex again. + index = probeNext(index); + } + } + } + } + + @Override + public String toString() { + if (isEmpty()) { + return "{}"; + } + StringBuilder sb = new StringBuilder(4 * size); + sb.append('{'); + boolean first = true; + for (int i = 0; i < values.length; ++i) { + V value = values[i]; + if (value != null) { + if (!first) { + sb.append(", "); + } + sb.append(keyToString(keys[i])).append('=').append(value == this ? "(this Map)" : + toExternal(value)); + first = false; + } + } + return sb.append('}').toString(); + } + + /** + * Helper method called by {@link #toString()} in order to convert a single map key into a string. + * This is protected to allow subclasses to override the appearance of a given key. + */ + protected String keyToString(int key) { + return Integer.toString(key); + } + + /** + * Set implementation for iterating over the entries of the map. + */ + private final class EntrySet extends AbstractSet> { + @Override + public Iterator> iterator() { + return new MapIterator(); + } + + @Override + public int size() { + return IntObjectHashMap.this.size(); + } + } + + /** + * Set implementation for iterating over the keys. + */ + private final class KeySet extends AbstractSet { + @Override + public int size() { + return IntObjectHashMap.this.size(); + } + + @Override + public boolean contains(Object o) { + return IntObjectHashMap.this.containsKey(o); + } + + @Override + public boolean remove(Object o) { + return IntObjectHashMap.this.remove(o) != null; + } + + @Override + public boolean retainAll(Collection retainedKeys) { + boolean changed = false; + for (Iterator> iter = entries().iterator(); iter.hasNext(); ) { + PrimitiveEntry entry = iter.next(); + if (!retainedKeys.contains(entry.key())) { + changed = true; + iter.remove(); + } + } + return changed; + } + + @Override + public void clear() { + IntObjectHashMap.this.clear(); + } + + @Override + public Iterator iterator() { + return new Iterator() { + private final Iterator> iter = entrySet.iterator(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public Integer next() { + return iter.next().getKey(); + } + + @Override + public void remove() { + iter.remove(); + } + }; + } + } + + /** + * Iterator over primitive entries. Entry key/values are overwritten by each call to {@link #next()}. + */ + private final class PrimitiveIterator implements Iterator>, PrimitiveEntry { + private int prevIndex = -1; + private int nextIndex = -1; + private int entryIndex = -1; + + private void scanNext() { + while (++nextIndex != values.length && values[nextIndex] == null) { + } + } + + @Override + public boolean hasNext() { + if (nextIndex == -1) { + scanNext(); + } + return nextIndex != values.length; + } + + @Override + public PrimitiveEntry next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + prevIndex = nextIndex; + scanNext(); + + // Always return the same Entry object, just change its index each time. + entryIndex = prevIndex; + return this; + } + + @Override + public void remove() { + if (prevIndex == -1) { + throw new IllegalStateException("next must be called before each remove."); + } + if (removeAt(prevIndex)) { + // removeAt may move elements "back" in the array if they have been displaced because their spot in the + // array was occupied when they were inserted. If this occurs then the nextIndex is now invalid and + // should instead point to the prevIndex which now holds an element which was "moved back". 
+ nextIndex = prevIndex; + } + prevIndex = -1; + } + + // Entry implementation. Since this implementation uses a single Entry, we coalesce that + // into the Iterator object (potentially making loop optimization much easier). + + @Override + public int key() { + return keys[entryIndex]; + } + + @Override + public V value() { + return toExternal(values[entryIndex]); + } + + @Override + public void setValue(V value) { + values[entryIndex] = toInternal(value); + } + } + + /** + * Iterator used by the {@link Map} interface. + */ + private final class MapIterator implements Iterator> { + private final PrimitiveIterator iter = new PrimitiveIterator(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public Entry next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + iter.next(); + + return new MapEntry(iter.entryIndex); + } + + @Override + public void remove() { + iter.remove(); + } + } + + /** + * A single entry in the map. + */ + final class MapEntry implements Entry { + private final int entryIndex; + + MapEntry(int entryIndex) { + this.entryIndex = entryIndex; + } + + @Override + public Integer getKey() { + verifyExists(); + return keys[entryIndex]; + } + + @Override + public V getValue() { + verifyExists(); + return toExternal(values[entryIndex]); + } + + @Override + public V setValue(V value) { + verifyExists(); + V prevValue = toExternal(values[entryIndex]); + values[entryIndex] = toInternal(value); + return prevValue; + } + + private void verifyExists() { + if (values[entryIndex] == null) { + throw new IllegalStateException("The map entry has been removed"); + } + } + } + + static int safeFindNextPositivePowerOfTwo(final int value) { + return value <= 0 ? 1 : value >= 0x40000000 ? 0x40000000 : findNextPositivePowerOfTwo(value); + } + + static int findNextPositivePowerOfTwo(final int value) { + assert value > Integer.MIN_VALUE && value < 0x40000000; + return 1 << (32 - Integer.numberOfLeadingZeros(value - 1)); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java new file mode 100644 index 0000000000000..5a9d2a5a52eb9 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java @@ -0,0 +1,87 @@ +/* + * Copyright 2014 The Netty Project + * + * The Netty Project licenses this file to you under the Apache License, version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at: + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.Iterator; +import java.util.Map; + +/** + * A vendored specialized copy of Netty's IntObjectMap for use within Arrow. + * Avoids requiring Netty in the Arrow core just for this one class. + * + * @param the value type stored in the map. + */ +interface IntObjectMap extends Map { + + /** + * A primitive entry in the map, provided by the iterator from {@link #entries()}. + * + * @param the value type stored in the map. + */ + interface PrimitiveEntry { + /** + * Gets the key for this entry. 
+ */ + int key(); + + /** + * Gets the value for this entry. + */ + V value(); + + /** + * Sets the value for this entry. + */ + void setValue(V value); + } + + /** + * Gets the value in the map with the specified key. + * + * @param key the key whose associated value is to be returned. + * @return the value or {@code null} if the key was not found in the map. + */ + V get(int key); + + /** + * Puts the given entry into the map. + * + * @param key the key of the entry. + * @param value the value of the entry. + * @return the previous value for this key or {@code null} if there was no previous mapping. + */ + V put(int key, V value); + + /** + * Removes the entry with the specified key. + * + * @param key the key for the entry to be removed from this map. + * @return the previous value for the key, or {@code null} if there was no mapping. + */ + V remove(int key); + + /** + * Gets an iterable to traverse over the primitive entries contained in this map. As an optimization, + * the {@link PrimitiveEntry}s returned by the {@link Iterator} may change as the {@link Iterator} + * progresses. The caller should not rely on {@link PrimitiveEntry} key/value stability. + */ + Iterable> entries(); + + /** + * Indicates whether or not this map contains a value for the specified key. + */ + boolean containsKey(int key); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java index 1f18587afdfd1..14b86c6129c81 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java @@ -26,8 +26,6 @@ import java.util.Set; import java.util.stream.Collectors; -import org.eclipse.collections.impl.map.mutable.primitive.IntObjectHashMap; - /** * An implementation of map that supports constant time look-up by a generic key or an ordinal. * diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java index f722a8a86772c..10566586b21c0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java @@ -25,8 +25,6 @@ import java.util.Set; import java.util.stream.Collectors; -import org.eclipse.collections.impl.map.mutable.primitive.IntObjectHashMap; - /** * An implementation of a multimap that supports constant time look-up by a generic key or an ordinal. * From 65dd5c7e23b0e4a7aa57a50f619ef5c017da0894 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 2 Apr 2024 00:02:15 -0700 Subject: [PATCH 44/51] MINOR: [Java] Fix maven-checkstyle-plugin configuration (#40850) ### Rationale for this change `maven-checkstyle-plugin` configuration refers to several unrecognized properties, causing build output to print several messages like: > [WARNING] Parameter 'format' is unknown for plugin 'maven-checkstyle-plugin:3.1.0:check (validate)' ### What changes are included in this PR? Fix checkstyle configuration and use the correct outputFileFormat and inputEncoding properties in place of the unrecognized format and encoding ones. ### Are these changes tested? As this is a build change with no code change, only via a local build + visual inspection of the build output ### Are there any user-facing changes? 
No Authored-by: Laurent Goujon Signed-off-by: David Li --- java/maven/pom.xml | 5 ++--- java/pom.xml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/java/maven/pom.xml b/java/maven/pom.xml index ccc12f5397fb7..f6a6da3afe53e 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -257,13 +257,12 @@ ../dev/checkstyle/checkstyle.license ../dev/checkstyle/suppressions.xml true - UTF-8 + UTF-8 true ${checkstyle.failOnViolation} ${checkstyle.failOnViolation} warning - xml - html + xml ${project.build.directory}/test/checkstyle-errors.xml false diff --git a/java/pom.xml b/java/pom.xml index b05b2d8f1425a..610593580f720 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -327,13 +327,12 @@ dev/checkstyle/checkstyle.license dev/checkstyle/suppressions.xml true - UTF-8 + UTF-8 true ${checkstyle.failOnViolation} ${checkstyle.failOnViolation} warning - xml - html + xml ${project.build.directory}/test/checkstyle-errors.xml false From 549e1c4e66e9e8af2808d49d624ef443816a630a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 03:02:42 -0400 Subject: [PATCH 45/51] MINOR: [Java] Bump org.apache.maven.plugins:maven-gpg-plugin from 3.1.0 to 3.2.2 in /java (#40921) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-gpg-plugin](https://github.com/apache/maven-gpg-plugin) from 3.1.0 to 3.2.2.
Release notes

Sourced from org.apache.maven.plugins:maven-gpg-plugin's releases.

3.2.2

JIRA link

Release Notes - Maven GPG Plugin - Version 3.2.2


What's Changed

Full Changelog: https://github.com/apache/maven-gpg-plugin/compare/maven-gpg-plugin-3.2.1...maven-gpg-plugin-3.2.2

3.2.1

JIRA link

Release Notes - Maven GPG Plugin - Version 3.2.1

... (truncated)

Commits
  • ab97064 [maven-release-plugin] prepare release maven-gpg-plugin-3.2.2
  • 2be0a00 [MGPG-115] Show more info about key used to sign (#84)
  • 3631830 [MGPG-114] Allow max key size of 16KB (#83)
  • 528fab9 [MGPG-113] SignAndDeployFileMojo results in 401 (#82)
  • 770636b [maven-release-plugin] prepare for next development iteration
  • 5b69086 [maven-release-plugin] prepare release maven-gpg-plugin-3.2.1
  • 28d298c [MGPG-111] Fix dependencies (#81)
  • 75d8ed5 [MGPG-112] serverId def value was unintentionally dropped (#80)
  • 2a11a2d [maven-release-plugin] prepare for next development iteration
  • 4b23da8 [maven-release-plugin] prepare release maven-gpg-plugin-3.2.0
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-gpg-plugin&package-manager=maven&previous-version=3.1.0&new-version=3.2.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/gandiva/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 0d2a23345f6ea..cb2deb07db42a 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -96,7 +96,7 @@ org.apache.maven.plugins maven-gpg-plugin - 3.1.0 + 3.2.2 sign-artifacts From 82f9403077547046e589d44d8682388ac618c75d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 03:03:45 -0400 Subject: [PATCH 46/51] MINOR: [Java] Bump org.apache.maven.plugin-tools:maven-plugin-annotations from 3.6.0 to 3.11.0 in /java (#40524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugin-tools:maven-plugin-annotations](https://github.com/apache/maven-plugin-tools) from 3.6.0 to 3.11.0.
Release notes

Sourced from org.apache.maven.plugin-tools:maven-plugin-annotations's releases.

3.11.0

Release Notes - Maven Plugin Tools - Version 3.11.0

Bug

  • [MPLUGIN-496] - Translation for keys report.plugin.goal.yes,no are missing
  • [MPLUGIN-499] - Deprecate descriptions are missing in description table

Improvement

  • [MPLUGIN-450] - Make goal prefix mandatory by default
  • [MPLUGIN-474] - Improve descriptor docs for requiredJavaVersion
  • [MPLUGIN-492] - Documentation for plugins in general: Goals comprises more than that
  • [MPLUGIN-495] - WARNINGs based on usage of @ Component for MavenSession/MavenProject instead of @ Parameter

Task

  • [MPLUGIN-493] - Consistently evaluate skip parameter in MavenReport#canGenerateReport()
  • [MPLUGIN-498] - Move section rendering to separate methods

Dependency upgrade

3.10.2

Release Notes - Maven Plugin Tools - Version 3.10.2

Bug

Dependency upgrade

  • [MPLUGIN-485] - Upgrade Parent to 40
  • [MPLUGIN-487] - Bump org.codehaus.plexus:plexus-java from 1.1.2 to 1.2.0
  • [MPLUGIN-488] - Bump asmVersion from 9.5 to 9.6
  • [MPLUGIN-489] - Bump antVersion from 1.10.13 to 1.10.14
  • [MPLUGIN-490] - Bump org.jsoup:jsoup from 1.16.1 to 1.16.2
  • [MPLUGIN-491] - Bump org.codehaus.plexus:plexus-testing from 1.1.0 to 1.2.0

3.10.1

... (truncated)

Commits
  • 4178d33 [maven-release-plugin] prepare release maven-plugin-tools-3.11.0
  • 25d920f [MNG-5695] document Maven 3.2.5+ scoped components usage
  • 6418490 [MPLUGIN-495] WARNINGs based on usage of @​Component for MavenSession/MavenPro...
  • 8b93d12 Bump org.jsoup:jsoup from 1.17.1 to 1.17.2
  • f4973ac Bump org.assertj:assertj-core from 3.24.2 to 3.25.1
  • 7dd3a25 [MPLUGIN-499] Add deprecate description in parameters table (#250)
  • 9bb13f0 [MPLUGIN-492] Documentation for plugins in general: Goals comprises more than...
  • fc41218 [MPLUGIN-498] Move section rendering to separate methods
  • ed4774b [MPLUGIN-450] Require goalPrefix to be valid (#240)
  • 331cf42 [MPLUGIN-497] Upgrade components
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugin-tools:maven-plugin-annotations&package-manager=maven&previous-version=3.6.0&new-version=3.11.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/maven/module-info-compiler-maven-plugin/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/maven/module-info-compiler-maven-plugin/pom.xml b/java/maven/module-info-compiler-maven-plugin/pom.xml index 6881018933d3f..910fede33ce3b 100644 --- a/java/maven/module-info-compiler-maven-plugin/pom.xml +++ b/java/maven/module-info-compiler-maven-plugin/pom.xml @@ -66,7 +66,7 @@ org.apache.maven.plugin-tools maven-plugin-annotations - 3.6.0 + 3.11.0 provided From 2b3d071cd17458363cf1550c4396ce67a12ef6a5 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 2 Apr 2024 12:46:00 +0530 Subject: [PATCH 47/51] GH-40684: [Java][Docs] JNI module debugging with IntelliJ (#40685) ### Rationale for this change Adding documentation for debugging JNI-based Java modules. ### What changes are included in this PR? Documentation update for developer docs for Java development. ### Are these changes tested? Locally built the docs and it shows the expected content. ### Are there any user-facing changes? N/A * GitHub Issue: #40684 Lead-authored-by: Vibhatha Abeykoon Co-authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- docs/source/developers/java/building.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 27e2de97328c3..c059ff676efb2 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -347,6 +347,11 @@ Arrow repository, and update the following settings: * If using IntelliJ's Maven integration to build, you may need to change ```` to ``false`` in the pom.xml files due to an `IntelliJ bug `__. +* To enable debugging JNI-based modules like ``dataset``, + activate specific profiles in the Maven tab under "Profiles". + Ensure the profiles ``arrow-c-data``, ``arrow-jni``, ``generate-libs-cdata-all-os``, + ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the + IDE can build them and enable debugging. You may not need to update all of these settings if you build/test with the IntelliJ Maven integration instead of with IntelliJ directly. From 096cdad5b434a6aa6ccf066efb894a8e05353309 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 03:18:55 -0400 Subject: [PATCH 48/51] MINOR: [Java] Bump io.grpc:grpc-bom from 1.61.1 to 1.62.2 in /java (#40920) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.grpc:grpc-bom](https://github.com/grpc/grpc-java) from 1.61.1 to 1.62.2.
Release notes

Sourced from io.grpc:grpc-bom's releases.

v1.62.2

gRPC Java 1.62.2 Release Notes

Note that this is the initial 1.62.x release

API Changes

  • services: Remove io.grpc.services.BinaryLogs, which was deprecated since 2021. io.grpc.protobuf.services.BinaryLogs should be used instead (#10832).
  • Allow users outside of io.grpc.xds package to create custom xDS resources (#10834) (6d96e6588)

New Features

  • api:Add ClientTransportFilter. Similarly to ServerTransportFilter, this will provide an observability hook and it allows direct modification of the transport's attributes. (#10646)

Improvements

  • java_grpc_library.bzl: Add support for Auto Exec Groups (cb03bd234). This is mostly a behind-the-scenes change to adjust to the newer way Bazel operates
  • java_grpc_library.bzl: Support runfiles for protoc and the plugin (65a6b3bc2). Neither binary uses runfiles, but the task will be ready if they need to in the future
  • xds: Add EC key support for XdsChannelCredentials/XdsServerCredentials (100d5a55f)
  • binder:Change log level from WARNING to FINER for expected exception during close with error, to reduce log spamming (#10899) (7ba0718bb)

Bug Fixes

  • xds: Fix a bug in WeightedRoundRobinLoadBalancer policy that could raise NullPointerException and further cause channel panic when picking a subchannel. This bug can only be triggered when connection can not be established and the channel reports TRANSIENT_FAILURE state. (#10868)

Dependencies

  • The protoc plugin no longer supports macOS Big Sur (macOS 11). Binaries are now built using Monterey (macOS 12)

Acknowledgements

Commits
  • 3e993a9 Bump version to 1.62.1
  • 1da945b Update README etc to reference 1.62.1
  • 7089f04 Change GAE interop tests to use java11 runtime (#10933)
  • 597f26e Bump version to 1.62.1-SNAPSHOT
  • 10eb91f Bump version to 1.62.0
  • 28dffe5 Update README etc to reference 1.62.0
  • 5ba8b71 util: MultiChildLoadBalance.shutdown() log to FINE (#10935)
  • 1795348 Remove semi-circular dependency between core and util
  • 95b847e interop-testing: Use separate event loops in RetryTest
  • 7ba0718 Change log level from WARNING to FINER for expected exception (#10899)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.grpc:grpc-bom&package-manager=maven&previous-version=1.61.1&new-version=1.62.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 610593580f720..bdefbea2d8787 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -34,7 +34,7 @@ 2.0.11 33.0.0-jre 4.1.108.Final - 1.61.1 + 1.62.2 3.23.1 2.17.0 3.4.0 From 42b49df0f3dc1586ad38c608ec93f382a4f4e3c4 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 2 Apr 2024 00:53:56 -0700 Subject: [PATCH 49/51] GH-40907: [Java][FlightSQL] Shade slf4j-api in JDBC driver (#40908) ### Rationale for this change The FlightSQL JDBC driver does not shade the slf4j API, which may conflict with the version used by an application. If the application uses slf4j 1.x, this may prevent the application's slf4j backend from being loaded properly. The change configures maven-shade-plugin to also shade slf4j-api. To keep log messages visible, slf4j-jdk14 is included as well so that all messages are redirected to the `java.util.logging` framework. The application can use the jul-to-slf4j adapter to redirect log messages back to slf4j. ### What changes are included in this PR? Overrides `Driver#getParentLogger()` to return the root logger for the JDBC driver (which is `org.apache.arrow.driver.jdbc`). To make sure the loggers of shaded dependencies are covered as well, the relocation prefix changes from `cfjd.` to `org.apache.arrow.driver.jdbc.shaded.` (or `oaadj` for native libraries). ### Are these changes tested? Verifying that slf4j-api is shaded, along with the other relocation changes, is covered by `ITDriverJarValidation`. ### Are there any user-facing changes? Yes. The driver no longer exposes the slf4j API directly, and the logger names for the shaded dependencies have been updated. Applications that were relying on directly configuring an slf4j logging backend for the driver may need to include `org.slf4j:slf4j-api` and `org.slf4j:jul-to-slf4j` for logging configuration to work.
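For applications that want to keep routing driver logs through their own slf4j backend after this change, a minimal sketch of the jul-to-slf4j setup could look like the following. It assumes `org.slf4j:jul-to-slf4j` is on the application classpath; the class name and connection URL below are illustrative only and are not part of the driver.

```java
import java.sql.Connection;
import java.sql.DriverManager;

import org.slf4j.bridge.SLF4JBridgeHandler;

public class FlightSqlJdbcLoggingExample {
  public static void main(String[] args) throws Exception {
    // Remove any handlers installed on the java.util.logging root logger
    // so records are not emitted twice.
    SLF4JBridgeHandler.removeHandlersForRootLogger();
    // Forward java.util.logging records (including those produced by the
    // shaded driver through slf4j-jdk14) to the application's slf4j backend.
    SLF4JBridgeHandler.install();

    // Illustrative connection URL; adjust host, port, and options as needed.
    try (Connection connection = DriverManager.getConnection(
        "jdbc:arrow-flight-sql://localhost:32010?useEncryption=false")) {
      System.out.println(connection.getMetaData().getDriverName());
    }
  }
}
```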
* GitHub Issue: #40907 Authored-by: Laurent Goujon Signed-off-by: David Li --- .../driver/jdbc/ArrowFlightJdbcDriver.java | 10 +++++++++- java/flight/flight-sql-jdbc-driver/pom.xml | 20 +++++++++++-------- .../driver/jdbc/ITDriverJarValidation.java | 5 +---- java/pom.xml | 5 +++++ 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java index 183e3d5c7b055..d0daaa8bda155 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java @@ -31,6 +31,7 @@ import java.util.Objects; import java.util.Optional; import java.util.Properties; +import java.util.logging.Logger; import org.apache.arrow.driver.jdbc.utils.ArrowFlightConnectionConfigImpl.ArrowFlightConnectionProperty; import org.apache.arrow.driver.jdbc.utils.UrlParser; @@ -58,7 +59,7 @@ public class ArrowFlightJdbcDriver extends UnregisteredDriver { // Netty requires some extra properties to unlock some native memory management api // Setting this property if not already set externally // This has to be done before any netty class is being loaded - final String key = "cfjd.io.netty.tryReflectionSetAccessible"; + final String key = "io.netty.tryReflectionSetAccessible"; final String tryReflectionSetAccessible = System.getProperty(key); if (tryReflectionSetAccessible == null) { System.setProperty(key, Boolean.TRUE.toString()); @@ -67,6 +68,13 @@ public class ArrowFlightJdbcDriver extends UnregisteredDriver { new ArrowFlightJdbcDriver().register(); } + @Override + public Logger getParentLogger() { + // Return the logger associated with the driver package ('org.apache.arrow.driver.jdbc') + // When packaged in flight-sql-jdbc-driver, it will also apply to all shaded dependencies + return Logger.getLogger(getClass().getPackage().getName()); + } + @Override public ArrowFlightConnection connect(final String url, final Properties info) throws SQLException { diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 53d929afa781c..2157c09eaf583 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -97,6 +97,11 @@ slf4j-api runtime + + org.slf4j + slf4j-jdk14 + runtime + io.netty @@ -190,17 +195,16 @@ com. - cfjd.com. + org.apache.arrow.driver.jdbc.shaded.com. com.sun.** org. - cfjd.org. + org.apache.arrow.driver.jdbc.shaded.org. org.apache.arrow.driver.jdbc.** - org.slf4j.** org.apache.arrow.flight.name org.apache.arrow.flight.version @@ -210,24 +214,24 @@ io. - cfjd.io. + org.apache.arrow.driver.jdbc.shaded.io. net. - cfjd.net. + org.apache.arrow.driver.jdbc.shaded.net. mozilla. - cfjd.mozilla. + org.apache.arrow.driver.jdbc.shaded.mozilla. 
META-INF.native.libnetty_ - META-INF.native.libcfjd_netty_ + META-INF.native.liboaadj_netty_ META-INF.native.netty_ - META-INF.native.cfjd_netty_ + META-INF.native.oaadj_netty_ diff --git a/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java index fdb580d493abf..0cae2fd5f5cb8 100644 --- a/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java +++ b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java @@ -42,8 +42,7 @@ /** * Check the content of the JDBC driver jar * - * After shading everything should be either under org.apache.arrow.driver.jdbc., - * org.slf4j., or cfjd. packages + * After shading everything should be either under org.apache.arrow.driver.jdbc. package */ public class ITDriverJarValidation { /** @@ -57,8 +56,6 @@ public class ITDriverJarValidation { */ public static final Set ALLOWED_PREFIXES = ImmutableSet.of( "org/apache/arrow/driver/jdbc/", - "cfjd/", - "org/slf4j/", "META-INF/"); /** diff --git a/java/pom.xml b/java/pom.xml index bdefbea2d8787..8e9ddd5480ea8 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -680,6 +680,11 @@ slf4j-api ${dep.slf4j.version} + + org.slf4j + slf4j-jdk14 + ${dep.slf4j.version} + javax.annotation javax.annotation-api From 15522931377724c4e5ce6cc6151f88021de55a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 2 Apr 2024 12:50:46 +0200 Subject: [PATCH 50/51] GH-40833: [Docs][Release] Make explicit in the documentation that verifying binaries is not required in order to case a vote (#40834) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Based on the discussion on https://lists.apache.org/thread/ogp9dthp124oq0fmvlyzvjorjsyom03v making clear that binaries verification are not required in order to cast a positive vote for the release. ### What changes are included in this PR? Document the required process ### Are these changes tested? preview-docs job on archery will be run ### Are there any user-facing changes? No * GitHub Issue: #40833 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- .../developers/release_verification.rst | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst index 53c8f54e5b5bd..ec474a5729b64 100644 --- a/docs/source/developers/release_verification.rst +++ b/docs/source/developers/release_verification.rst @@ -44,20 +44,36 @@ Linux and macOS In order to run the verification script either for the source release or the binary artifacts see the following guidelines: +Required source verification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Individuals are REQUIRED to download all signed source code packages onto their +own hardware, validate all cryptographic signatures, compile as provided, +and test the result on their own platform in order to cast a +1 vote. + .. 
code-block:: # this will create and automatically clean up a temporary directory for the verification environment and will run the source verification TEST_DEFAULT=0 TEST_SOURCE=1 verify-release-candidate.sh $VERSION $RC_NUM - # this will create and automatically clean up a temporary directory for the verification environment and will run the binary verification - TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM - # to verify only certain implementations use the TEST_DEFAULT=0 and TEST_* variables # here are a couple of examples, but see the source code for the available options TEST_DEFAULT=0 TEST_CPP=1 verify-release-candidate.sh $VERSION $RC_NUM # only C++ tests TEST_DEFAULT=0 TEST_CPP=1 TEST_PYTHON=1 verify-release-candidate.sh $VERSION $RC_NUM # C++ and Python tests TEST_DEFAULT=0 TEST_INTEGRATION_CPP=1 TEST_INTEGRATION_JAVA=1 verify-release-candidate.sh $VERSION $RC_NUM # C++ and Java integration tests - + +Binary verification +^^^^^^^^^^^^^^^^^^^ + +The binaries are generated from the source that has been verified. Those binaries are +tested on CI but can be tested locally for further validation. It is not necessary to +test them in order to cast a positive vote. + +.. code-block:: + + # this will create and automatically clean up a temporary directory for the verification environment and will run the binary verification + TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM + # to verify certain binaries use the TEST_* variables as: TEST_DEFAULT=0 TEST_WHEELS=1 verify-release-candidate.sh $VERSION $RC_NUM # only Wheels TEST_DEFAULT=0 TEST_APT=1 verify-release-candidate.sh $VERSION $RC_NUM # only APT packages @@ -130,7 +146,6 @@ As an example: I've verified successfully the sources and binaries with: TEST_DEFAULT=0 TEST_SOURCE=1 dev/release/verify-release-candidate.sh 15.0.0 1 - TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh 15.0.0 1 with: * Python 3.10.12 * gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 From 5ddef639dfcaf62a02ed8c8d63103f22ae41a5ee Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 2 Apr 2024 04:19:03 -0700 Subject: [PATCH 51/51] GH-40038: [Java] Export non empty offset buffer for variable-size layout through C Data Interface (#40043) ### Rationale for this change We encountered an error when exchanging string array from Java to Rust through Arrow C data interface. At Rust side, it complains that the buffer at position 1 (offset buffer) is null. After tracing down and some debugging, it looks like the issue is Java Arrow `BaseVariableWidthVector` class assigns an empty offset buffer if the array is empty (value count 0). According to Arrow [spec](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) for variable size binary layout: > The offsets buffer contains length + 1 signed integers ... So for an empty string array, its offset buffer should be a buffer with one element (generally it is `0`). ### What changes are included in this PR? This patch replaces current empty offset buffer in variable-size layout vector classes when exporting arrays through C Data Interface. ### Are these changes tested? Added test cases. ### Are there any user-facing changes? 
No * Closes: #40038 Authored-by: Liang-Chi Hsieh Signed-off-by: David Li --- .../org/apache/arrow/c/ArrayExporter.java | 10 +---- .../org/apache/arrow/c/RoundtripTest.java | 18 +++++++- .../vector/BaseLargeVariableWidthVector.java | 35 ++++++++++++++-- .../arrow/vector/BaseVariableWidthVector.java | 35 ++++++++++++++-- .../org/apache/arrow/vector/FieldVector.java | 41 +++++++++++++++++++ .../complex/BaseRepeatedValueVector.java | 7 ++-- .../arrow/vector/complex/LargeListVector.java | 29 +++++++++++-- .../arrow/vector/complex/ListVector.java | 22 +++++++++- .../arrow/vector/complex/MapVector.java | 2 +- 9 files changed, 174 insertions(+), 25 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java b/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java index d6479a3ba4ca8..05ab3e5ff6063 100644 --- a/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java +++ b/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java @@ -98,15 +98,7 @@ void export(ArrowArray array, FieldVector vector, DictionaryProvider dictionaryP if (buffers != null) { data.buffers = new ArrayList<>(buffers.size()); data.buffers_ptrs = allocator.buffer((long) buffers.size() * Long.BYTES); - for (ArrowBuf arrowBuf : buffers) { - if (arrowBuf != null) { - arrowBuf.getReferenceManager().retain(); - data.buffers_ptrs.writeLong(arrowBuf.memoryAddress()); - } else { - data.buffers_ptrs.writeLong(NULL); - } - data.buffers.add(arrowBuf); - } + vector.exportCDataBuffers(data.buffers, data.buffers_ptrs, NULL); } if (dictionaryEncoding != null) { diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index a7e3cde2e7b4b..768394ef7ab60 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.UUID; import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.Stream; import org.apache.arrow.memory.ArrowBuf; @@ -165,10 +166,25 @@ VectorSchemaRoot vectorSchemaRootRoundtrip(VectorSchemaRoot root) { } boolean roundtrip(FieldVector vector, Class clazz) { + List fieldBuffers = vector.getFieldBuffers(); + List orgRefCnts = fieldBuffers.stream().map(buf -> buf.refCnt()).collect(Collectors.toList()); + long orgMemorySize = allocator.getAllocatedMemory(); + + boolean result = false; try (ValueVector imported = vectorRoundtrip(vector)) { assertTrue(clazz.isInstance(imported), String.format("expected %s but was %s", clazz, imported.getClass())); - return VectorEqualsVisitor.vectorEquals(vector, imported); + result = VectorEqualsVisitor.vectorEquals(vector, imported); } + + // Check that the ref counts of the buffers are the same after the roundtrip + IntStream.range(0, orgRefCnts.size()).forEach(i -> { + ArrowBuf buf = fieldBuffers.get(i); + assertEquals(buf.refCnt(), orgRefCnts.get(i)); + }); + + assertEquals(orgMemorySize, allocator.getAllocatedMemory()); + + return result; } @Test diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index c239edbcc3c29..34c9e73a0b072 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -336,6 +336,34 @@ public List getFieldBuffers() { return result; } + /** + * Export 
the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + // before flight/IPC, we must bring the vector to a consistent state. + // this is because, it is possible that the offset buffers of some trailing values + // are not updated. this may cause some data in the data buffer being lost. + // for details, please see TestValueVector#testUnloadVariableWidthVector. + fillHoles(valueCount); + + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + + if (offsetBuffer.capacity() == 0) { + // Empty offset buffer is allowed for historical reason. + // To export it through C Data interface, we need to allocate a buffer with one offset. + // We set `retain = false` to explicitly not increase the ref count for the exported buffer. + // The ref count of the newly created buffer (i.e., 1) already represents the usage + // at imported side. + exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false); + } else { + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + } + + exportBuffer(valueBuffer, buffers, buffersPtr, nullValue, true); + } + /** * Set the reader and writer indexes for the inner buffers. */ @@ -456,10 +484,11 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) { } /* allocate offset buffer */ - private void allocateOffsetBuffer(final long size) { - offsetBuffer = allocator.buffer(size); + private ArrowBuf allocateOffsetBuffer(final long size) { + ArrowBuf offsetBuffer = allocator.buffer(size); offsetBuffer.readerIndex(0); initOffsetBuffer(); + return offsetBuffer; } /* allocate validity buffer */ @@ -760,7 +789,7 @@ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseLargeV final long start = getStartOffset(startIndex); final long end = getStartOffset(startIndex + length); final long dataLength = end - start; - target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); + target.offsetBuffer = target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); for (int i = 0; i < length + 1; i++) { final long relativeSourceOffset = getStartOffset(startIndex + i) - start; target.offsetBuffer.setLong((long) i * OFFSET_WIDTH, relativeSourceOffset); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 4cf495a349f02..6b82dd7729a6c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -355,6 +355,34 @@ public List getFieldBuffers() { return result; } + /** + * Export the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + // before flight/IPC, we must bring the vector to a consistent state. + // this is because, it is possible that the offset buffers of some trailing values + // are not updated. this may cause some data in the data buffer being lost. + // for details, please see TestValueVector#testUnloadVariableWidthVector. 
+ fillHoles(valueCount); + + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + + if (offsetBuffer.capacity() == 0) { + // Empty offset buffer is allowed for historical reason. + // To export it through C Data interface, we need to allocate a buffer with one offset. + // We set `retain = false` to explicitly not increase the ref count for the exported buffer. + // The ref count of the newly created buffer (i.e., 1) already represents the usage + // at imported side. + exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false); + } else { + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + } + + exportBuffer(valueBuffer, buffers, buffersPtr, nullValue, true); + } + /** * Set the reader and writer indexes for the inner buffers. */ @@ -476,11 +504,12 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) { } /* allocate offset buffer */ - private void allocateOffsetBuffer(final long size) { + private ArrowBuf allocateOffsetBuffer(final long size) { final int curSize = (int) size; - offsetBuffer = allocator.buffer(curSize); + ArrowBuf offsetBuffer = allocator.buffer(curSize); offsetBuffer.readerIndex(0); initOffsetBuffer(); + return offsetBuffer; } /* allocate validity buffer */ @@ -805,7 +834,7 @@ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariab (1 + length) * ((long) OFFSET_WIDTH)); target.offsetBuffer = transferBuffer(slicedOffsetBuffer, target.allocator); } else { - target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); + target.offsetBuffer = target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); for (int i = 0; i < length + 1; i++) { final int relativeSourceOffset = getStartOffset(startIndex + i) - start; target.offsetBuffer.setInt((long) i * OFFSET_WIDTH, relativeSourceOffset); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java index 299828f6d9d08..04229563bcc67 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -60,6 +60,47 @@ public interface FieldVector extends ValueVector { */ List getFieldBuffers(); + /** + * Export a given buffer and its memory address into a list of buffers and a pointer to the list of buffers. + * + * @param buffer the buffer to export + * @param buffers the list of buffers + * @param buffersPtr the pointer to the list of buffers + * @param nullValue the null value to use for null buffer + * @param retain whether to retain the buffer when exporting + */ + default void exportBuffer( + ArrowBuf buffer, + List buffers, + ArrowBuf buffersPtr, + long nullValue, + boolean retain) { + if (buffer != null) { + if (retain) { + buffer.getReferenceManager().retain(); + } + buffersPtr.writeLong(buffer.memoryAddress()); + } else { + buffersPtr.writeLong(nullValue); + } + buffers.add(buffer); + } + + /** + * Export the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + * + * By default, when exporting a buffer, it will increase ref count for exported buffer that counts + * the usage at imported side. 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index 8ba2e48dc2fa3..7906d90c2fff0 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -83,7 +83,7 @@ public String getName() {
   public boolean allocateNewSafe() {
     boolean dataAlloc = false;
     try {
-      allocateOffsetBuffer(offsetAllocationSizeInBytes);
+      offsetBuffer = allocateOffsetBuffer(offsetAllocationSizeInBytes);
       dataAlloc = vector.allocateNewSafe();
     } catch (Exception e) {
       e.printStackTrace();
@@ -97,12 +97,13 @@ public boolean allocateNewSafe() {
     return dataAlloc;
   }
 
-  protected void allocateOffsetBuffer(final long size) {
+  protected ArrowBuf allocateOffsetBuffer(final long size) {
     final int curSize = (int) size;
-    offsetBuffer = allocator.buffer(curSize);
+    ArrowBuf offsetBuffer = allocator.buffer(curSize);
     offsetBuffer.readerIndex(0);
     offsetAllocationSizeInBytes = curSize;
     offsetBuffer.setZero(0, offsetBuffer.capacity());
+    return offsetBuffer;
   }
 
   @Override
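Having `allocateOffsetBuffer` return the new buffer instead of assigning the `offsetBuffer` field is what lets `exportCDataBuffers` create a throwaway single-offset buffer without disturbing the vector's own state; ordinary callers such as `allocateNewSafe` now assign the result explicitly. The `LargeListVector` and `ListVector` overrides below follow the same validity-plus-offsets pattern, since the child vector's buffers are exported through its own call. A sketch of that path, with the same caveats as above (illustrative setup, direct call in place of the `arrow-c-data` exporter):

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.complex.ListVector;

public class ListExportSketch {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator();
         ListVector vector = ListVector.empty("list", allocator);
         // Scratch stand-in for the ArrowArray buffer-pointer array: validity + offsets.
         ArrowBuf buffersPtr = allocator.buffer(2 * Long.BYTES)) {
      List<ArrowBuf> buffers = new ArrayList<>();
      vector.exportCDataBuffers(buffers, buffersPtr, /*nullValue=*/ 0L);

      // Only validity and offsets are exported at this level; a never-allocated
      // vector still hands over a non-empty, importer-owned offsets buffer.
      System.out.println("exported buffers: " + buffers.size());            // 2
      System.out.println("offsets capacity: " + buffers.get(1).capacity()); // > 0

      for (ArrowBuf buf : buffers) {
        if (buf != null) {
          buf.getReferenceManager().release();
        }
      }
    }
  }
}
```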
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
index b934cbd81db16..b29b72ad2b1a0 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
@@ -287,6 +287,26 @@ public List<ArrowBuf> getFieldBuffers() {
     return result;
   }
 
+  /**
+   * Export the buffers of the fields for the C Data Interface. This method traverses the buffers and
+   * exports each buffer and its memory address into a list of buffers and a pointer to the list of buffers.
+   */
+  @Override
+  public void exportCDataBuffers(List<ArrowBuf> buffers, ArrowBuf buffersPtr, long nullValue) {
+    exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true);
+
+    if (offsetBuffer.capacity() == 0) {
+      // An empty offset buffer is allowed for historical reasons.
+      // To export it through the C Data Interface, we need to allocate a buffer with one offset.
+      // We set `retain = false` to explicitly not increase the ref count for the exported buffer.
+      // The ref count of the newly created buffer (i.e., 1) already represents the usage
+      // at the imported side.
+      exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false);
+    } else {
+      exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true);
+    }
+  }
+
   /**
    * Set the reader and writer indexes for the inner buffers.
    */
@@ -343,7 +363,7 @@ public boolean allocateNewSafe() {
     /* allocate offset and data buffer */
     boolean dataAlloc = false;
     try {
-      allocateOffsetBuffer(offsetAllocationSizeInBytes);
+      offsetBuffer = allocateOffsetBuffer(offsetAllocationSizeInBytes);
       dataAlloc = vector.allocateNewSafe();
     } catch (Exception e) {
       e.printStackTrace();
@@ -371,11 +391,12 @@ private void allocateValidityBuffer(final long size) {
     validityBuffer.setZero(0, validityBuffer.capacity());
   }
 
-  protected void allocateOffsetBuffer(final long size) {
-    offsetBuffer = allocator.buffer(size);
+  protected ArrowBuf allocateOffsetBuffer(final long size) {
+    ArrowBuf offsetBuffer = allocator.buffer(size);
     offsetBuffer.readerIndex(0);
     offsetAllocationSizeInBytes = size;
     offsetBuffer.setZero(0, offsetBuffer.capacity());
+    return offsetBuffer;
   }
 
   /**
@@ -656,7 +677,7 @@ public void splitAndTransfer(int startIndex, int length) {
     final long startPoint = offsetBuffer.getLong((long) startIndex * OFFSET_WIDTH);
     final long sliceLength = offsetBuffer.getLong((long) (startIndex + length) * OFFSET_WIDTH) - startPoint;
     to.clear();
-    to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
+    to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
     /* splitAndTransfer offset buffer */
     for (int i = 0; i < length + 1; i++) {
       final long relativeOffset = offsetBuffer.getLong((long) (startIndex + i) * OFFSET_WIDTH) - startPoint;
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 7df659e4cc9da..91275ae73d2c3 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -242,6 +242,26 @@ public List<ArrowBuf> getFieldBuffers() {
     return result;
   }
 
+  /**
+   * Export the buffers of the fields for the C Data Interface. This method traverses the buffers and
+   * exports each buffer and its memory address into a list of buffers and a pointer to the list of buffers.
+   */
+  @Override
+  public void exportCDataBuffers(List<ArrowBuf> buffers, ArrowBuf buffersPtr, long nullValue) {
+    exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true);
+
+    if (offsetBuffer.capacity() == 0) {
+      // An empty offset buffer is allowed for historical reasons.
+      // To export it through the C Data Interface, we need to allocate a buffer with one offset.
+      // We set `retain = false` to explicitly not increase the ref count for the exported buffer.
+      // The ref count of the newly created buffer (i.e., 1) already represents the usage
+      // at the imported side.
+      exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false);
+    } else {
+      exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true);
+    }
+  }
+
   /**
    * Set the reader and writer indexes for the inner buffers.
   */
@@ -535,7 +555,7 @@ public void splitAndTransfer(int startIndex, int length) {
     final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH);
     final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint;
     to.clear();
-    to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
+    to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
     /* splitAndTransfer offset buffer */
     for (int i = 0; i < length + 1; i++) {
       final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint;
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java
index e082b2f43be64..c49f138b64c6b 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java
@@ -209,7 +209,7 @@ public void splitAndTransfer(int startIndex, int length) {
     final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH);
     final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint;
     to.clear();
-    to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
+    to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
     /* splitAndTransfer offset buffer */
     for (int i = 0; i < length + 1; i++) {
       final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint;