From 989508418b59e19037cbc58b46fb72ec2adf6234 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 05:28:50 +0900 Subject: [PATCH 01/11] MINOR: [CI] Bump actions/download-artifact from 4.1.7 to 4.1.8 (#44020) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 4.1.7 to 4.1.8.
Release notes

Sourced from actions/download-artifact's releases.

v4.1.8

What's Changed

Full Changelog: https://github.com/actions/download-artifact/compare/v4...v4.1.8

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/download-artifact&package-manager=github_actions&previous-version=4.1.7&new-version=4.1.8)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index fbc2ebe0bc5f1..bd1631db4f617 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -332,7 +332,7 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4.1.8 with: name: libarrow-rtools40-ucrt64.zip path: r/windows From 23c46879d20eabfaf010e34fb03131999a02c30f Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 10 Sep 2024 05:55:20 +0530 Subject: [PATCH 02/11] GH-44013: [Java] Consider warnings as errors for Dataset Module (#44014) ### Rationale for this change This PR configs the build such that warnings are considered as errors in the C module. And corresponding code changes have also been made. ### What changes are included in this PR? Adding flags to consider warnings as errors in javac and fixing the corresponding errors. ### Are these changes tested? Tested by existing test cases. ### Are there any user-facing changes? N/A * GitHub Issue: #44013 Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/dataset/pom.xml | 9 ++++++ .../arrow/dataset/TextBasedWriteSupport.java | 12 ++++++-- .../dataset/file/TestFileSystemDataset.java | 7 ++--- .../dataset/jni/TestReservationListener.java | 1 + .../substrait/TestAceroSubstraitConsumer.java | 28 +++++-------------- 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index a19e934f0de98..92b67825517c6 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -202,6 +202,15 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java index 9e6559824ce7f..e3495bd81ca79 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java @@ -17,10 +17,13 @@ package org.apache.arrow.dataset; import java.io.File; -import java.io.FileWriter; import java.io.IOException; +import java.io.Writer; import java.net.URI; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.StandardOpenOption; import java.util.Random; public class TextBasedWriteSupport { @@ -43,7 +46,12 @@ public static TextBasedWriteSupport writeTempFile( File outputFolder, String fileExtension, String... values) throws URISyntaxException, IOException { TextBasedWriteSupport writer = new TextBasedWriteSupport(outputFolder, fileExtension); - try (FileWriter addValues = new FileWriter(new File(writer.uri), true)) { + try (Writer addValues = + Files.newBufferedWriter( + new File(writer.uri).toPath(), + StandardCharsets.UTF_8, + StandardOpenOption.CREATE, + StandardOpenOption.APPEND)) { for (Object value : values) { addValues.write(value + "\n"); } diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java index 0b085d25b32eb..89ce208e8c6f6 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java @@ -29,7 +29,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; -import java.util.LinkedList; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -407,7 +406,7 @@ public void testBaseArrowIpcRead() throws Exception { try (VectorSchemaRoot root = VectorSchemaRoot.create(sourceSchema, rootAllocator()); FileOutputStream sink = new FileOutputStream(dataFile); ArrowFileWriter writer = - new ArrowFileWriter(root, /*dictionaryProvider=*/ null, sink.getChannel())) { + new ArrowFileWriter(root, /* provider= */ null, sink.getChannel())) { IntVector ints = (IntVector) root.getVector(0); ints.setSafe(0, 0); ints.setSafe(1, 1024); @@ -562,7 +561,7 @@ private void checkParquetReadResult( Schema schema, List expected, List actual) { assertEquals(expected.size(), actual.stream().mapToInt(ArrowRecordBatch::getLength).sum()); final int fieldCount = schema.getFields().size(); - LinkedList expectedRemovable = new LinkedList<>(expected); + ArrayList expectedRemovable = new ArrayList<>(expected); try (VectorSchemaRoot vsr = VectorSchemaRoot.create(schema, rootAllocator())) { VectorLoader loader = new VectorLoader(vsr); for (ArrowRecordBatch batch : actual) { @@ -578,7 +577,7 @@ private void checkParquetReadResult( } } for (int i = 0; i < batchRowCount; i++) { - expectedRemovable.poll(); + expectedRemovable.remove(0); } } assertTrue(expectedRemovable.isEmpty()); diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/jni/TestReservationListener.java b/java/dataset/src/test/java/org/apache/arrow/dataset/jni/TestReservationListener.java index f366c824d2ded..9fabc4a257fb3 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/jni/TestReservationListener.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/jni/TestReservationListener.java @@ -91,6 +91,7 @@ public void unreserve(long size) { } @Test + @SuppressWarnings("UnnecessaryAsync") public void testErrorThrownFromReservationListener() throws Exception { final String errorMessage = "ERROR_MESSAGE"; ParquetWriteSupport writeSupport = diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java b/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java index 97c185d7053d5..eec6570a639f2 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java @@ -23,6 +23,7 @@ import java.io.File; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; import java.util.Arrays; @@ -87,7 +88,8 @@ public void testRunQueryLocalFiles() throws Exception { TestAceroSubstraitConsumer.class .getClassLoader() .getResource("substrait/local_files_users.json") - .toURI()))) + .toURI())), + StandardCharsets.UTF_8) .replace("FILENAME_PLACEHOLDER", writeSupport.getOutputURI()))) { assertEquals(schema, arrowReader.getVectorSchemaRoot().getSchema()); int rowcount = 0; @@ -134,7 +136,8 @@ public void testRunQueryNamedTable() throws Exception { TestAceroSubstraitConsumer.class .getClassLoader() .getResource("substrait/named_table_users.json") - .toURI()))), + .toURI())), + StandardCharsets.UTF_8), mapTableToArrowReader)) { assertEquals(schema, arrowReader.getVectorSchemaRoot().getSchema()); assertEquals(arrowReader.getVectorSchemaRoot().getSchema(), schema); @@ -186,7 +189,8 @@ public void testRunQueryNamedTableWithException() throws Exception { TestAceroSubstraitConsumer.class .getClassLoader() .getResource("substrait/named_table_users.json") - .toURI()))), + .toURI())), + StandardCharsets.UTF_8), mapTableToArrowReader)) { assertEquals(schema, arrowReader.getVectorSchemaRoot().getSchema()); int rowcount = 0; @@ -311,12 +315,6 @@ public void testRunExtendedExpressionsFilter() throws Exception { @Test public void testRunExtendedExpressionsFilterWithProjectionsInsteadOfFilterException() throws Exception { - final Schema schema = - new Schema( - Arrays.asList( - Field.nullable("id", new ArrowType.Int(32, true)), - Field.nullable("name", new ArrowType.Utf8())), - null); // Substrait Extended Expression: Project New Column: // Expression ADD: id + 2 // Expression CONCAT: name + '-' + name @@ -360,12 +358,6 @@ public void testRunExtendedExpressionsFilterWithProjectionsInsteadOfFilterExcept @Test public void testRunExtendedExpressionsFilterWithEmptyFilterException() throws Exception { - final Schema schema = - new Schema( - Arrays.asList( - Field.nullable("id", new ArrowType.Int(32, true)), - Field.nullable("name", new ArrowType.Utf8())), - null); String base64EncodedSubstraitFilter = ""; ByteBuffer substraitExpressionFilter = getByteBuffer(base64EncodedSubstraitFilter); ParquetWriteSupport writeSupport = @@ -529,12 +521,6 @@ public void testRunExtendedExpressionsProjectionWithFilterInsteadOfProjectionExc @Test public void testRunExtendedExpressionsProjectionWithEmptyProjectionException() throws Exception { - final Schema schema = - new Schema( - Arrays.asList( - Field.nullable("id", new ArrowType.Int(32, true)), - Field.nullable("name", new ArrowType.Utf8())), - null); String base64EncodedSubstraitFilter = ""; ByteBuffer substraitExpressionProjection = getByteBuffer(base64EncodedSubstraitFilter); ParquetWriteSupport writeSupport = From 09ed5e5c4ed9e7fd1619bf3bf3a5c88be4540a40 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 10 Sep 2024 05:56:18 +0530 Subject: [PATCH 03/11] GH-44011: [Java] Consider warnings as errors for C Module (#44012) ### Rationale for this change This PR configs the build such that warnings are considered as errors in the C module. And corresponding code changes have also been made. ### What changes are included in this PR? Adding flags to consider warnings as errors in javac and fixing the corresponding errors. ### Are these changes tested? Tested by existing test cases. ### Are there any user-facing changes? N/A * GitHub Issue: #44011 Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/c/pom.xml | 11 +++++++++++ .../main/java/org/apache/arrow/c/ArrayExporter.java | 2 +- .../org/apache/arrow/c/BufferImportTypeVisitor.java | 2 +- .../test/java/org/apache/arrow/c/DictionaryTest.java | 8 ++++---- .../test/java/org/apache/arrow/c/RoundtripTest.java | 8 -------- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/java/c/pom.xml b/java/c/pom.xml index 52962354047b1..fe57bd2ea0ec5 100644 --- a/java/c/pom.xml +++ b/java/c/pom.xml @@ -91,5 +91,16 @@ under the License. + + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + + diff --git a/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java b/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java index 820a1522749c6..0c6b5de4486bc 100644 --- a/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java +++ b/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java @@ -90,7 +90,7 @@ void export(ArrowArray array, FieldVector vector, DictionaryProvider dictionaryP data.buffers = new ArrayList<>(vector.getExportedCDataBufferCount()); data.buffers_ptrs = - allocator.buffer((long) (vector.getExportedCDataBufferCount()) * Long.BYTES); + allocator.buffer((long) vector.getExportedCDataBufferCount() * Long.BYTES); vector.exportCDataBuffers(data.buffers, data.buffers_ptrs, NULL); if (dictionaryEncoding != null) { diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 93fef6d7ca801..e47d27bf091ee 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -232,7 +232,7 @@ public List visit(ArrowType.Utf8 type) { private List visitVariableWidthView(ArrowType type) { final int viewBufferIndex = 1; final int variadicSizeBufferIndex = this.buffers.length - 1; - final long numOfVariadicBuffers = this.buffers.length - 3; + final long numOfVariadicBuffers = this.buffers.length - 3L; final long variadicSizeBufferCapacity = numOfVariadicBuffers * Long.BYTES; List buffers = new ArrayList<>(); diff --git a/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java b/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java index ce0e82586b766..8cd4913f22dd2 100644 --- a/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java @@ -247,8 +247,8 @@ private void createStructVector(StructVector vector) { // Write the values to child 1 child1.allocateNew(); - child1.set(0, "01234567890".getBytes()); - child1.set(1, "012345678901234567".getBytes()); + child1.set(0, "01234567890".getBytes(StandardCharsets.UTF_8)); + child1.set(1, "012345678901234567".getBytes(StandardCharsets.UTF_8)); vector.setIndexDefined(0); // Write the values to child 2 @@ -269,8 +269,8 @@ private void createStructVectorInline(StructVector vector) { // Write the values to child 1 child1.allocateNew(); - child1.set(0, "012345678".getBytes()); - child1.set(1, "01234".getBytes()); + child1.set(0, "012345678".getBytes(StandardCharsets.UTF_8)); + child1.set(1, "01234".getBytes(StandardCharsets.UTF_8)); vector.setIndexDefined(0); // Write the values to child 2 diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 18b2e94adde47..d8286465e475f 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -528,14 +528,6 @@ public void testVarBinaryVector() { } } - private String generateString(String str, int repetition) { - StringBuilder aRepeated = new StringBuilder(); - for (int i = 0; i < repetition; i++) { - aRepeated.append(str); - } - return aRepeated.toString(); - } - @Test public void testViewVector() { // ViewVarCharVector with short strings From f3dd298bd32f6dc38654680c49b3f1fbf97e3d5f Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 10 Sep 2024 05:58:10 +0530 Subject: [PATCH 04/11] GH-43576: [Java] Gandiva Tests are failing due to linking issues (#43978) ### Rationale for this change Gandiva tests are failing due to a linking issue and it is failing the Java CIs. But for most of the made PRs, we cannot verify the overall workflow given that those PRs are independent of the Gandiva component. ### What changes are included in this PR? This PR disables such failing tests temporarily such that once the Gandiva issue is fixed, re-enabling the tests. Re-enabling task will be tracked using https://github.com/apache/arrow/issues/43981 ### Are these changes tested? Yes, by existing CIs and tests. ### Are there any user-facing changes? No * GitHub Issue: #43576 Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- .../java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java index f2590226b1a74..0d86bd9e72923 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java @@ -62,6 +62,7 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +@Disabled("Disabled until GH-43981 is solved") public class ProjectorTest extends BaseEvaluatorTest { private Charset utf8Charset = Charset.forName("UTF-8"); From b6316c091f416967c5e7c9a9284601fa4507aa72 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 10 Sep 2024 13:23:26 +0800 Subject: [PATCH 05/11] GH-44036: [C++] IPC: ipc reader/writer code enhancement (#44019) ### Rationale for this change So minor ipc code enhancement when I read the code ### What changes are included in this PR? Avoid copying shared_ptr in some naive space ### Are these changes tested? Covered by existence ### Are there any user-facing changes? no * GitHub Issue: #44036 Authored-by: mwish Signed-off-by: Sutou Kouhei --- cpp/src/arrow/ipc/reader.cc | 12 ++++++------ cpp/src/arrow/ipc/writer.cc | 21 +++++++++------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index da84f2f2dc87d..98214c1debb86 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -305,7 +305,7 @@ class ArrayLoader { RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); } else { buffer_index_++; - out_->buffers[1].reset(new Buffer(nullptr, 0)); + out_->buffers[1] = std::make_shared(nullptr, 0); } return Status::OK(); } @@ -644,11 +644,11 @@ Result> LoadRecordBatch( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, const std::vector& inclusion_mask, const IpcReadContext& context, io::RandomAccessFile* file) { - if (inclusion_mask.size() > 0) { - return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file); - } else { + if (inclusion_mask.empty()) { return LoadRecordBatchSubset(metadata, schema, /*inclusion_mask=*/nullptr, context, file); + } else { + return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file); } } @@ -1447,7 +1447,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { // Prebuffering's read patterns are also slightly worse than the alternative // when doing whole-file reads because the logic is not in place to recognize // we can just read the entire file up-front - if (options_.included_fields.size() != 0 && + if (!options_.included_fields.empty() && options_.included_fields.size() != schema_->fields().size() && !file_->supports_zero_copy()) { RETURN_NOT_OK(state->PreBufferMetadata({})); @@ -1907,7 +1907,7 @@ Result> RecordBatchFileReader::Open( Future> RecordBatchFileReader::OpenAsync( const std::shared_ptr& file, const IpcReadOptions& options) { ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return OpenAsync(std::move(file), footer_offset, options); + return OpenAsync(file, footer_offset, options); } Future> RecordBatchFileReader::OpenAsync( diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index f603e60c66555..88aa3f3f8a47a 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -86,7 +86,7 @@ bool HasNestedDict(const ArrayData& data) { } Status GetTruncatedBitmap(int64_t offset, int64_t length, - const std::shared_ptr input, MemoryPool* pool, + const std::shared_ptr& input, MemoryPool* pool, std::shared_ptr* buffer) { if (!input) { *buffer = input; @@ -103,7 +103,7 @@ Status GetTruncatedBitmap(int64_t offset, int64_t length, } Status GetTruncatedBuffer(int64_t offset, int64_t length, int32_t byte_width, - const std::shared_ptr input, MemoryPool* pool, + const std::shared_ptr& input, MemoryPool* pool, std::shared_ptr* buffer) { if (!input) { *buffer = input; @@ -252,7 +252,7 @@ class RecordBatchSerializer { } Status Assemble(const RecordBatch& batch) { - if (field_nodes_.size() > 0) { + if (!field_nodes_.empty()) { field_nodes_.clear(); buffer_meta_.clear(); out_->body_buffers.clear(); @@ -335,8 +335,7 @@ class RecordBatchSerializer { ARROW_ASSIGN_OR_RAISE(auto shifted_offsets, AllocateBuffer(required_bytes, options_.memory_pool)); - offset_type* dest_offsets = - reinterpret_cast(shifted_offsets->mutable_data()); + auto dest_offsets = shifted_offsets->mutable_span_as(); const offset_type start_offset = array.value_offset(0); for (int i = 0; i < array.length(); ++i) { @@ -362,7 +361,6 @@ class RecordBatchSerializer { offset_type* out_min_offset, offset_type* out_max_end) { auto offsets = array.value_offsets(); - auto sizes = array.value_sizes(); const int64_t required_bytes = sizeof(offset_type) * array.length(); if (array.offset() != 0) { @@ -572,7 +570,7 @@ class RecordBatchSerializer { Status Visit(const StructArray& array) { --max_recursion_depth_; for (int i = 0; i < array.num_fields(); ++i) { - std::shared_ptr field = array.field(i); + const auto& field = array.field(i); RETURN_NOT_OK(VisitArray(*field)); } ++max_recursion_depth_; @@ -641,8 +639,7 @@ class RecordBatchSerializer { ARROW_ASSIGN_OR_RAISE( auto shifted_offsets_buffer, AllocateBuffer(length * sizeof(int32_t), options_.memory_pool)); - int32_t* shifted_offsets = - reinterpret_cast(shifted_offsets_buffer->mutable_data()); + auto shifted_offsets = shifted_offsets_buffer->mutable_span_as(); // Offsets are guaranteed to be increasing according to the spec, so // the first offset we find for a child is the initial offset and @@ -899,7 +896,7 @@ Status GetContiguousTensor(const Tensor& tensor, MemoryPool* pool, RETURN_NOT_OK(WriteStridedTensorData(0, 0, elem_size, tensor, scratch_space->mutable_data(), &stream)); - out->reset(new Tensor(tensor.type(), contiguous_data, tensor.shape())); + *out = std::make_unique(tensor.type(), contiguous_data, tensor.shape()); return Status::OK(); } @@ -1005,7 +1002,7 @@ class SparseTensorSerializer { } Status Assemble(const SparseTensor& sparse_tensor) { - if (buffer_meta_.size() > 0) { + if (!buffer_meta_.empty()) { buffer_meta_.clear(); out_->body_buffers.clear(); } @@ -1169,7 +1166,7 @@ Status RecordBatchWriter::WriteTable(const Table& table) { return WriteTable(tab namespace internal { -IpcPayloadWriter::~IpcPayloadWriter() {} +IpcPayloadWriter::~IpcPayloadWriter() = default; Status IpcPayloadWriter::Start() { return Status::OK(); } From fed5fcbbd5f1ca578f2eb3661a8260b4e21965ea Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 10 Sep 2024 00:30:54 -0700 Subject: [PATCH 06/11] GH-43996: [Java] Mark new allocated ArrowSchema as released (#43997) ### Rationale for this change As described in #43996. ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? * GitHub Issue: #43996 Authored-by: Liang-Chi Hsieh Signed-off-by: David Li --- .../src/main/java/org/apache/arrow/c/ArrowSchema.java | 10 +++++++++- .../java/org/apache/arrow/c/ArrowArrayUtilityTest.java | 7 +++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/ArrowSchema.java b/java/c/src/main/java/org/apache/arrow/c/ArrowSchema.java index 06e401627ef01..ad9f16ae9ceed 100644 --- a/java/c/src/main/java/org/apache/arrow/c/ArrowSchema.java +++ b/java/c/src/main/java/org/apache/arrow/c/ArrowSchema.java @@ -52,6 +52,7 @@ */ public class ArrowSchema implements BaseStruct { private static final int SIZE_OF = 72; + private static final int INDEX_RELEASE_CALLBACK = 56; private ArrowBuf data; @@ -103,7 +104,9 @@ public static ArrowSchema wrap(long memoryAddress) { * @return A new ArrowSchema instance */ public static ArrowSchema allocateNew(BufferAllocator allocator) { - return new ArrowSchema(allocator.buffer(ArrowSchema.SIZE_OF)); + ArrowSchema schema = new ArrowSchema(allocator.buffer(ArrowSchema.SIZE_OF)); + schema.markReleased(); + return schema; } ArrowSchema(ArrowBuf data) { @@ -111,6 +114,11 @@ public static ArrowSchema allocateNew(BufferAllocator allocator) { this.data = data; } + /** Mark the schema as released. */ + public void markReleased() { + directBuffer().putLong(INDEX_RELEASE_CALLBACK, NULL); + } + @Override public long memoryAddress() { checkNotNull(data, "ArrowSchema is already closed"); diff --git a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java index 1d4cb411fab45..511358a5e62fa 100644 --- a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java @@ -50,6 +50,13 @@ void afterEach() { allocator.close(); } + @Test + void arraySchemaInit() { + ArrowSchema schema = ArrowSchema.allocateNew(allocator); + assertThat(schema.snapshot().release).isEqualTo(0); + schema.close(); + } + // ------------------------------------------------------------ // BufferImportTypeVisitor From b4b22a4635fcf9bb1a36bf26411c6bde2ae01732 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:31:08 -0700 Subject: [PATCH 07/11] MINOR: [C#] Bump Microsoft.NET.Test.Sdk from 17.11.0 to 17.11.1 in /csharp (#44025) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Microsoft.NET.Test.Sdk](https://github.com/microsoft/vstest) from 17.11.0 to 17.11.1.
Release notes

Sourced from Microsoft.NET.Test.Sdk's releases.

v17.11.1

What's Changed

Full Changelog: https://github.com/microsoft/vstest/compare/v17.11.0...v17.11.1

Commits
  • 58dbd02 Revert "Do not publish to BAR when RTM version is built"
  • aa62848 Do not publish to BAR when RTM version is built
  • d824a2f Bump 17.11.1
  • ed4ac92 Forward error output from testhost as info (#5193)
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Microsoft.NET.Test.Sdk&package-manager=nuget&previous-version=17.11.0&new-version=17.11.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 4ea02e0ed21c0..baa9ca1188cef 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index fd8274230ec64..fb546c213f8a6 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index eae9ab746f283..71f54aa539e14 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index ee71b203218f8..aabe787b1f8c3 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -16,7 +16,7 @@ - + all From 22a44962bb3a641051cf449d9598477dd7dc820d Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 10 Sep 2024 21:06:50 +0530 Subject: [PATCH 08/11] GH-44016: [Java] Consider warnings as errors for Format Module (#44017) ### Rationale for this change This PR configs the build such that warnings are considered errors in the Format module. And corresponding code changes have also been made. ### What changes are included in this PR? Adding flags to consider warnings as errors in javac and fixing the corresponding errors. ### Are these changes tested? Tested by existing test cases. ### Are there any user-facing changes? N/A * GitHub Issue: #44016 Lead-authored-by: Vibhatha Lakmal Abeykoon Co-authored-by: Vibhatha Lakmal Abeykoon Co-authored-by: David Li Signed-off-by: Dane Pitkin --- java/format/pom.xml | 9 +++++++++ java/format/src/main/java/module-info.java | 2 ++ 2 files changed, 11 insertions(+) diff --git a/java/format/pom.xml b/java/format/pom.xml index 1121930da42d2..f767215b12807 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -61,6 +61,15 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + diff --git a/java/format/src/main/java/module-info.java b/java/format/src/main/java/module-info.java index bda779c91afbc..f8d740b726fde 100644 --- a/java/format/src/main/java/module-info.java +++ b/java/format/src/main/java/module-info.java @@ -15,6 +15,8 @@ * limitations under the License. */ +// TODO(https://github.com/apache/arrow/issues/44037): Google hasn't reviewed Flatbuffers fix +@SuppressWarnings({ "requires-automatic", "requires-transitive-automatic" }) module org.apache.arrow.format { exports org.apache.arrow.flatbuf; requires transitive flatbuffers.java; From c469da4d8494dbeffac6a43d662d5277b606942d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:12:31 -0400 Subject: [PATCH 09/11] MINOR: [Java] Bump logback.version from 1.5.7 to 1.5.8 in /java (#44023) Bumps `logback.version` from 1.5.7 to 1.5.8. Updates `ch.qos.logback:logback-classic` from 1.5.7 to 1.5.8
Commits
  • 92e1a5e prepare release 1.5.8
  • 76d8dd8 Update README.md, comment out CI action results
  • d7e0d59 Merge branch 'master' of github.com:qos-ch/logback
  • fe3bf9d os.name property is expected to be Mac OS X on Apple computers
  • 9806273 Update README.md
  • c45f110 check for Mac OS X
  • 00c6f5e what is the os.name
  • 7d03a42 update actions/setup
  • edacb3b skip email sent termination test on MacOs
  • 3b5d041 allow more time for timetout
  • Additional commits viewable in compare view

Updates `ch.qos.logback:logback-core` from 1.5.7 to 1.5.8
Commits
  • 92e1a5e prepare release 1.5.8
  • 76d8dd8 Update README.md, comment out CI action results
  • d7e0d59 Merge branch 'master' of github.com:qos-ch/logback
  • fe3bf9d os.name property is expected to be Mac OS X on Apple computers
  • 9806273 Update README.md
  • c45f110 check for Mac OS X
  • 00c6f5e what is the os.name
  • 7d03a42 update actions/setup
  • edacb3b skip email sent termination test on MacOs
  • 3b5d041 allow more time for timetout
  • Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index c6b1876873f30..1c68fde535879 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -111,7 +111,7 @@ under the License. 5.11.0 5.2.0 3.46.0 - 1.5.7 + 1.5.8 none -Xdoclint:none From a87a8e0efe1650b01ac85f7a7331ccfcffc088a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:12:58 -0400 Subject: [PATCH 10/11] MINOR: [Java] Bump io.netty:netty-bom from 4.1.112.Final to 4.1.113.Final in /java (#44022) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.netty:netty-bom](https://github.com/netty/netty) from 4.1.112.Final to 4.1.113.Final.
Commits
  • d0a109e [maven-release-plugin] prepare release netty-4.1.113.Final
  • e1d6384 Cleanup fields on AdaptiveByteBuf::deallocate (#14273)
  • 8a02f45 Upload hidden files for staging (#14275)
  • c0fdb8e adjust continuation frame header length (#14245)
  • 95d86bb chore: clean code DefaultChannelPipeline add method (#14249)
  • 1c1da9f Fix netty-all artifact snapshot deployments (#14264)
  • 235eb6f Upgrade to netty-tcnative 2.0.66.Final (#14254)
  • ceade95 Ensure flushes are not discarded by ChunkedWriteHandler for passed th… (#14248)
  • dc30c33 Add new SslHandler.isEncrypted(...) variant that will not produce fal… (#14243)
  • 31d1592 Remove reference to parent in recycled buffers for leak detection (#14250)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.netty:netty-bom&package-manager=maven&previous-version=4.1.112.Final&new-version=4.1.113.Final)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1c68fde535879..1e22b6b973b9f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -96,7 +96,7 @@ under the License. 5.11.0 2.0.16 33.2.1-jre - 4.1.112.Final + 4.1.113.Final 1.65.0 3.25.4 2.17.2 From 44b72d5c2518b7dc70b67b588432fb06ea3896c7 Mon Sep 17 00:00:00 2001 From: larry98 Date: Tue, 10 Sep 2024 15:08:00 -0400 Subject: [PATCH 11/11] GH-43187: [C++] Support basic is_in predicate simplification (#43761) ### Rationale for this change Prior to https://github.com/apache/arrow/pull/43256, this PR adds a basic implementation that does a linear scan filter over the value set on each guarantee. This isolates the correctness/semantics of `is_in` predicate simplification from the binary search performance optimization. ### What changes are included in this PR? `SimplifyWithGuarantee` now handles `is_in` expressions. ### Are these changes tested? A new unit test was added to arrow-compute-expression-test testing this change. ### Are there any user-facing changes? No. * GitHub Issue: #43187 Lead-authored-by: Larry Wang Co-authored-by: larry98 Co-authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/compute/expression.cc | 73 ++++++++++ cpp/src/arrow/compute/expression_test.cc | 173 +++++++++++++++++++++++ 2 files changed, 246 insertions(+) diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index 33e5928c2865d..12fda5d58f3bf 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -23,6 +23,7 @@ #include #include "arrow/chunked_array.h" +#include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/expression_internal.h" @@ -1242,6 +1243,72 @@ struct Inequality { /*insert_implicit_casts=*/false, &exec_context); } + /// Simplify an `is_in` call against an inequality guarantee. + /// + /// We avoid the complexity of fully simplifying EQUAL comparisons to true + /// literals (e.g., 'x is_in [1, 2, 3]' given the guarantee 'x = 2') due to + /// potential complications with null matching behavior. This is ok for the + /// predicate pushdown use case because the overall aim is to simplify to an + /// unsatisfiable expression. + /// + /// \pre `is_in_call` is a call to the `is_in` function + /// \return a simplified expression, or nullopt if no simplification occurred + static Result> SimplifyIsIn( + const Inequality& guarantee, const Expression::Call* is_in_call) { + DCHECK_EQ(is_in_call->function_name, "is_in"); + + auto options = checked_pointer_cast(is_in_call->options); + + const auto& lhs = Comparison::StripOrderPreservingCasts(is_in_call->arguments[0]); + if (!lhs.field_ref()) return std::nullopt; + if (*lhs.field_ref() != guarantee.target) return std::nullopt; + + FilterOptions::NullSelectionBehavior null_selection; + switch (options->null_matching_behavior) { + case SetLookupOptions::MATCH: + null_selection = + guarantee.nullable ? FilterOptions::EMIT_NULL : FilterOptions::DROP; + break; + case SetLookupOptions::SKIP: + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::EMIT_NULL: + if (guarantee.nullable) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::INCONCLUSIVE: + if (guarantee.nullable) return std::nullopt; + ARROW_ASSIGN_OR_RAISE(Datum is_null, IsNull(options->value_set)); + ARROW_ASSIGN_OR_RAISE(Datum any_null, Any(is_null)); + if (any_null.scalar_as().value) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + } + + std::string func_name = Comparison::GetName(guarantee.cmp); + DCHECK_NE(func_name, "na"); + std::vector args{options->value_set, guarantee.bound}; + ARROW_ASSIGN_OR_RAISE(Datum filter_mask, CallFunction(func_name, args)); + FilterOptions filter_options(null_selection); + ARROW_ASSIGN_OR_RAISE(Datum simplified_value_set, + Filter(options->value_set, filter_mask, filter_options)); + + if (simplified_value_set.length() == 0) return literal(false); + if (simplified_value_set.length() == options->value_set.length()) return std::nullopt; + + ExecContext exec_context; + Expression::Call simplified_call; + simplified_call.function_name = "is_in"; + simplified_call.arguments = is_in_call->arguments; + simplified_call.options = std::make_shared( + simplified_value_set, options->null_matching_behavior); + ARROW_ASSIGN_OR_RAISE( + Expression simplified_expr, + BindNonRecursive(std::move(simplified_call), + /*insert_implicit_casts=*/false, &exec_context)); + return simplified_expr; + } + /// \brief Simplify the given expression given this inequality as a guarantee. Result Simplify(Expression expr) { const auto& guarantee = *this; @@ -1258,6 +1325,12 @@ struct Inequality { return call->function_name == "is_valid" ? literal(true) : literal(false); } + if (call->function_name == "is_in") { + ARROW_ASSIGN_OR_RAISE(std::optional result, + SimplifyIsIn(guarantee, call)); + return result.value_or(expr); + } + auto cmp = Comparison::Get(expr); if (!cmp) return expr; diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index d94a17b6ffadf..0b7e8a9c23b13 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -27,6 +27,7 @@ #include #include +#include "arrow/array/builder_primitive.h" #include "arrow/compute/expression_internal.h" #include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" @@ -1616,6 +1617,144 @@ TEST(Expression, SimplifyWithComparisonAndNullableCaveat) { true_unless_null(field_ref("i32")))); // not satisfiable, will drop row group } +TEST(Expression, SimplifyIsIn) { + auto is_in = [](Expression field, std::shared_ptr value_set_type, + std::string json_array, + SetLookupOptions::NullMatchingBehavior null_matching_behavior) { + SetLookupOptions options{ArrayFromJSON(value_set_type, json_array), + null_matching_behavior}; + return call("is_in", {field}, options); + }; + + for (SetLookupOptions::NullMatchingBehavior null_matching : { + SetLookupOptions::MATCH, + SetLookupOptions::SKIP, + SetLookupOptions::EMIT_NULL, + SetLookupOptions::INCONCLUSIVE, + }) { + Simplify{is_in(field_ref("i32"), int32(), "[]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(equal(field_ref("i32"), literal(6))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(3))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(9))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(0))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(0))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(9))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(and_(less_equal(field_ref("i32"), literal(7)), + greater(field_ref("i32"), literal(4)))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7]", null_matching)); + + Simplify{is_in(field_ref("u32"), int8(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int8(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("u32"), int64(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int64(), "[5,7,9]", null_matching)); + } + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3,null]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::EMIT_NULL)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); +} + TEST(Expression, SimplifyThenExecute) { auto filter = or_({equal(field_ref("f32"), literal(0)), @@ -1643,6 +1782,40 @@ TEST(Expression, SimplifyThenExecute) { AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); } +TEST(Expression, SimplifyIsInThenExecute) { + auto input = RecordBatchFromJSON(kBoringSchema, R"([ + {"i64": 2, "i32": 5}, + {"i64": 5, "i32": 6}, + {"i64": 3, "i32": 6}, + {"i64": 3, "i32": 5}, + {"i64": 4, "i32": 5}, + {"i64": 2, "i32": 7}, + {"i64": 5, "i32": 5} + ])"); + + std::vector guarantees{greater(field_ref("i64"), literal(1)), + greater_equal(field_ref("i32"), literal(5)), + less_equal(field_ref("i64"), literal(5))}; + + for (const Expression& guarantee : guarantees) { + auto filter = + call("is_in", {guarantee.call()->arguments[0]}, + compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2,3]"), true}); + ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(filter, guarantee)); + + Datum evaluated, simplified_evaluated; + ExpectExecute(filter, input, &evaluated); + ExpectExecute(simplified, input, &simplified_evaluated); + if (simplified_evaluated.is_scalar()) { + ASSERT_OK_AND_ASSIGN( + simplified_evaluated, + MakeArrayFromScalar(*simplified_evaluated.scalar(), evaluated.length())); + } + AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); + } +} + TEST(Expression, Filter) { auto ExpectFilter = [](Expression filter, std::string batch_json) { ASSERT_OK_AND_ASSIGN(auto s, kBoringSchema->AddField(0, field("in", boolean())));