diff --git a/cpp/src/gandiva/expression_cache_key.h b/cpp/src/gandiva/expression_cache_key.h
index 476dc25992464..9c884decacdd7 100644
--- a/cpp/src/gandiva/expression_cache_key.h
+++ b/cpp/src/gandiva/expression_cache_key.h
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include
 #include
 #include
 
@@ -77,11 +78,11 @@ class ExpressionCacheKey {
   size_t Hash() const { return hash_code_; }
 
   std::string ToString() {
-    std::stringstream stringstream;
+    std::stringstream ss;
     for (const auto &item : expressions_as_strings_) {
-      stringstream << item << " || ";
+      ss << item << " || ";
     }
-    return stringstream.str();
+    return ss.str();
   }
 
   bool operator==(const ExpressionCacheKey& other) const {
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
index 1ccdb0dddfcc7..a34a02c8659ae 100644
--- a/cpp/src/gandiva/gdv_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -68,12 +68,6 @@ static char mask_array[256] = {
     'x', 'x', 'x', (char)123, (char)124, (char)125, (char)126, (char)127};
 
-const uint8_t* gdv_fn_get_json_object_utf8_utf8(int64_t ptr, const char* data, int data_len,
-                                                const char* pattern, int pattern_len, int32_t* out_len) {
-  gandiva::JsonHolder* holder = reinterpret_cast<gandiva::JsonHolder*>(ptr);
-  return (*holder)(std::string(data, data_len), std::string(pattern, pattern_len), out_len);
-}
-
 const uint8_t* gdv_fn_get_json_object_utf8_utf8(int64_t ptr, int64_t holder_ptr, const char* data, int data_len, bool in1_valid,
                                                 const char* pattern, int pattern_len, bool in2_valid, bool* out_valid, int32_t* out_len) {
   if (!in1_valid || !in2_valid) {
diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops.cc b/cpp/src/gandiva/precompiled/arithmetic_ops.cc
index c303ddf50f63d..fdb7fdb302bc7 100644
--- a/cpp/src/gandiva/precompiled/arithmetic_ops.cc
+++ b/cpp/src/gandiva/precompiled/arithmetic_ops.cc
@@ -275,8 +275,6 @@ CEIL(int64, int64)
 
 CAST_UNARY(castBIGINT, int32, int64)
 CAST_UNARY(castBIGINT, date64, int64)
-CAST_UNARY(castBIGINT, float32, int64)
-CAST_UNARY(castBIGINT, float64, int64)
 CAST_UNARY(castBIGINT, boolean, int64)
 CAST_UNARY(castINT, int8, int32)
 CAST_UNARY(castINT, int16, int32)
diff --git a/cpp/src/gandiva/precompiled/time.cc b/cpp/src/gandiva/precompiled/time.cc
index b9def96162991..1dd9dacc7f7fd 100644
--- a/cpp/src/gandiva/precompiled/time.cc
+++ b/cpp/src/gandiva/precompiled/time.cc
@@ -657,7 +657,6 @@ gdv_date64 castDATE_nullsafe_utf8(int64_t context, const char* input, gdv_int32
     // store the last value
     dateFields[dateIndex++] = value;
   }
-  const char* msg = "Not a valid date value ";
   if (dateIndex != 3) {
     *out_valid = false;
     return 0;
diff --git a/cpp/src/gandiva/translate_holder.cc b/cpp/src/gandiva/translate_holder.cc
index a8b0d4bfb2e97..fa6e7b343d39c 100644
--- a/cpp/src/gandiva/translate_holder.cc
+++ b/cpp/src/gandiva/translate_holder.cc
@@ -35,7 +35,7 @@ const uint8_t* TranslateHolder::operator()(gandiva::ExecutionContext* ctx, std::
                                            std::string matching_str, std::string replace_str, int32_t* out_len) {
   char res[text.length()];
   std::unordered_map<char, char> replace_map;
-  for (int i = 0; i < matching_str.length(); i++) {
+  for (uint64_t i = 0; i < matching_str.length(); i++) {
     if (i >= replace_str.length()) {
       replace_map[matching_str[i]] = '\0';
     } else {
@@ -43,7 +43,7 @@ const uint8_t* TranslateHolder::operator()(gandiva::ExecutionContext* ctx, std::
     }
   }
   int j = 0;
-  for (int i = 0; i < text.length(); i++) {
+  for (uint64_t i = 0; i < text.length(); i++) {
     if (replace_map.find(text[i]) == replace_map.end()) {
       res[j++] = text[i];
       continue;
     }
diff --git a/cpp/src/jni/dataset/jni_wrapper.cc b/cpp/src/jni/dataset/jni_wrapper.cc
index 28ebebf40855a..e9f9400ebd1f3 100644
--- a/cpp/src/jni/dataset/jni_wrapper.cc
+++ b/cpp/src/jni/dataset/jni_wrapper.cc
@@ -26,8 +26,6 @@
 #include "arrow/jniutil/jni_util.h"
 #include "arrow/util/iterator.h"
 
-#include "jni/dataset/DTypes.pb.h"
-
 #include "org_apache_arrow_dataset_file_JniWrapper.h"
 #include "org_apache_arrow_dataset_jni_JniWrapper.h"
 #include "org_apache_arrow_dataset_jni_NativeMemoryPool.h"
@@ -186,7 +184,10 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) {
       CreateGlobalClassReference(env, "Ljava/lang/IllegalArgumentException;");
   runtime_exception_class =
       CreateGlobalClassReference(env, "Ljava/lang/RuntimeException;");
-
+  java_reservation_listener_class =
+      CreateGlobalClassReference(env,
+                                 "Lorg/apache/arrow/"
+                                 "dataset/jni/ReservationListener;");
   reserve_memory_method =
       JniGetOrThrow(GetMethodID(env, java_reservation_listener_class, "reserve", "(J)V"));
   unreserve_memory_method = JniGetOrThrow(
diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml
index 437d5a467ce66..9ada436539ef4 100644
--- a/java/dataset/pom.xml
+++ b/java/dataset/pom.xml
@@ -38,12 +38,25 @@
       <scope>compile</scope>
       <classifier>${arrow.vector.classifier}</classifier>
     </dependency>
+    <dependency>
+      <groupId>org.apache.arrow</groupId>
+      <artifactId>arrow-format</artifactId>
+      <version>${project.version}</version>
+      <scope>compile</scope>
+      <classifier>${arrow.vector.classifier}</classifier>
+    </dependency>
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-memory-core</artifactId>
       <version>${project.version}</version>
       <scope>compile</scope>
     </dependency>
+    <dependency>
+      <groupId>com.google.flatbuffers</groupId>
+      <artifactId>flatbuffers-java</artifactId>
+      <version>1.12.0</version>
+      <scope>compile</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-memory-netty</artifactId>
diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/filter/Filter.java b/java/dataset/src/main/java/org/apache/arrow/dataset/filter/Filter.java
new file mode 100644
index 0000000000000..fc12216bf96f5
--- /dev/null
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/filter/Filter.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset.filter;
+
+// todo filter tree implementation
+// todo see also https://issues.apache.org/jira/browse/ARROW-6953
+
+/**
+ * Datasets filter.
+ */
+public interface Filter {
+
+  Filter EMPTY = new Filter() {
+    @Override
+    public byte[] toByteArray() {
+      return new byte[0];
+    }
+  };
+
+  byte[] toByteArray();
+
+}
\ No newline at end of file
diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java
index f5a1af384b24e..2d2a3491ba48c 100644
--- a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java
@@ -17,56 +17,38 @@
 
 package org.apache.arrow.dataset.scanner;
 
-import java.util.Optional;
+import org.apache.arrow.dataset.filter.Filter;
 
-import org.apache.arrow.util.Preconditions;
+import java.util.Optional;
 
 /**
  * Options used during scanning.
  */
 public class ScanOptions {
-  private final Optional<String[]> columns;
+  private final String[] columns;
+  private final Filter filter;
   private final long batchSize;
 
   /**
    * Constructor.
    * @param columns Projected columns. Empty for scanning all columns.
    * @param batchSize Maximum row number of each returned {@link org.apache.arrow.vector.ipc.message.ArrowRecordBatch}
-   *
-   * @deprecated Deprecated. Use {@link #ScanOptions(long, Optional)} instead.
-   */
-  @Deprecated
-  public ScanOptions(String[] columns, long batchSize) {
-    this(batchSize, Optional.of(columns).map(present -> {
-      if (present.length == 0) {
-        // Backwards compatibility: See ARROW-13257, in the new constructor, we now use null to scan for all columns.
-        return null;
-      }
-      return present;
-    }));
-  }
-
-  /**
-   * Constructor.
-   * @param batchSize Maximum row number of each returned {@link org.apache.arrow.vector.ipc.message.ArrowRecordBatch}
-   * @param columns (Optional) Projected columns. {@link Optional#empty()} for scanning all columns. Otherwise,
-   *                Only columns present in the Array will be scanned.
    */
-  public ScanOptions(long batchSize, Optional<String[]> columns) {
-    Preconditions.checkNotNull(columns);
-    this.batchSize = batchSize;
+  public ScanOptions(String[] columns, Filter filter, long batchSize) {
     this.columns = columns;
+    this.filter = filter;
+    this.batchSize = batchSize;
   }
 
-  public ScanOptions(long batchSize) {
-    this(batchSize, Optional.empty());
+  public Optional<String[]> getColumns() {
+    return Optional.of(columns);
   }
 
-  public Optional<String[]> getColumns() {
-    return columns;
+  public Filter getFilter() {
+    return filter;
   }
 
   public long getBatchSize() {
     return batchSize;
   }
-}
+}
\ No newline at end of file
diff --git a/java/dataset/src/main/java/org/apache/arrow/memory/NativeUnderlyingMemory.java b/java/dataset/src/main/java/org/apache/arrow/memory/NativeUnderlyingMemory.java
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestDatasetFileWriter.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestDatasetFileWriter.java
deleted file mode 100644
index 5dd0bcf0a782a..0000000000000
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestDatasetFileWriter.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.arrow.dataset.file;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.nio.channels.Channels;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Objects;
-import java.util.Set;
-import java.util.stream.Collectors;
-
-import org.apache.arrow.dataset.ParquetWriteSupport;
-import org.apache.arrow.dataset.TestDataset;
-import org.apache.arrow.dataset.file.format.ParquetFileFormat;
-import org.apache.arrow.dataset.filter.Filter;
-import org.apache.arrow.dataset.jni.NativeMemoryPool;
-import org.apache.arrow.dataset.scanner.ScanOptions;
-import org.apache.arrow.dataset.scanner.Scanner;
-import org.apache.arrow.dataset.source.Dataset;
-import org.apache.arrow.util.AutoCloseables;
-import org.apache.arrow.vector.ipc.WriteChannel;
-import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
-import org.apache.arrow.vector.ipc.message.MessageSerializer;
-import org.apache.commons.io.FileUtils;
-import org.junit.Assert;
-import org.junit.ClassRule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-
-public class TestDatasetFileWriter extends TestDataset {
-
-  @ClassRule
-  public static final TemporaryFolder TMP = new TemporaryFolder();
-
-  public static final String AVRO_SCHEMA_USER = "user.avsc";
-
-  @Test
-  public void testParquetWriteSimple() throws Exception {
-    ParquetWriteSupport writeSupport = ParquetWriteSupport.writeTempFile(AVRO_SCHEMA_USER, TMP.newFolder(),
-        1, "a", 2, "b", 3, "c", 2, "d");
-    String sampleParquet = writeSupport.getOutputURI();
-    FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
-        ParquetFileFormat.createDefault(), sampleParquet);
-    ScanOptions options = new ScanOptions(new String[0], Filter.EMPTY, 100);
-    final Dataset dataset = factory.finish();
-    final Scanner scanner = dataset.newScan(options);
-    final File writtenFolder = TMP.newFolder();
-    final String writtenParquet = writtenFolder.toURI().toString();
-    try {
-      DatasetFileWriter.write(scanner, ParquetFileFormat.createDefault(), writtenParquet);
-      assertParquetFileEquals(sampleParquet, Objects.requireNonNull(writtenFolder.listFiles())[0].toURI().toString());
-    } finally {
-      AutoCloseables.close(factory, scanner, dataset);
-    }
-  }
-
-  @Test
-  public void testParquetWriteWithPartitions() throws Exception {
-    ParquetWriteSupport writeSupport = ParquetWriteSupport.writeTempFile(AVRO_SCHEMA_USER, TMP.newFolder(),
-        1, "a", 2, "b", 3, "c", 2, "d");
-    String sampleParquet = writeSupport.getOutputURI();
-    FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
-        ParquetFileFormat.createDefault(), sampleParquet);
-    ScanOptions options = new ScanOptions(new String[0], Filter.EMPTY, 100);
-    final Dataset dataset = factory.finish();
-    final Scanner scanner = dataset.newScan(options);
-    final File writtenFolder = TMP.newFolder();
-    final String writtenParquet = writtenFolder.toURI().toString();
-    try {
-      DatasetFileWriter.write(scanner, ParquetFileFormat.createDefault(), writtenParquet, new String[]{"id", "name"}, 100, "dat_{i}");
-      final Set<String> expectedOutputFiles = new HashSet<>(
-          Arrays.asList("id=1/name=a/dat_0", "id=2/name=b/dat_1", "id=3/name=c/dat_2", "id=2/name=d/dat_3"));
-      final Set<String> outputFiles = FileUtils.listFiles(writtenFolder, null, true)
-          .stream()
-          .map(file -> {
-            return writtenFolder.toURI().relativize(file.toURI()).toString();
-          })
-          .collect(Collectors.toSet());
-      Assert.assertEquals(expectedOutputFiles, outputFiles);
-    } finally {
-      AutoCloseables.close(factory, scanner, dataset);
-    }
-  }
-
-  private void assertParquetFileEquals(String expectedURI, String actualURI) throws Exception {
-    final FileSystemDatasetFactory expectedFactory = new FileSystemDatasetFactory(
-        rootAllocator(), NativeMemoryPool.getDefault(), ParquetFileFormat.createDefault(), expectedURI);
-    List<ArrowRecordBatch> expectedBatches = collectResultFromFactory(expectedFactory,
-        new ScanOptions(new String[0], Filter.EMPTY, 100));
-    final FileSystemDatasetFactory actualFactory = new FileSystemDatasetFactory(
-        rootAllocator(), NativeMemoryPool.getDefault(), ParquetFileFormat.createDefault(), actualURI);
-    List<ArrowRecordBatch> actualBatches = collectResultFromFactory(actualFactory,
-        new ScanOptions(new String[0], Filter.EMPTY, 100));
-    // fast-fail by comparing metadata
-    Assert.assertEquals(expectedBatches.toString(), actualBatches.toString());
-    // compare buffers
-    Assert.assertEquals(serialize(expectedBatches), serialize(actualBatches));
-    AutoCloseables.close(expectedBatches, actualBatches);
-  }
-
-  private String serialize(List<ArrowRecordBatch> batches) throws IOException {
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    for (ArrowRecordBatch batch : batches) {
-      MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), batch);
-    }
-    return Arrays.toString(out.toByteArray());
-  }
-}
diff --git a/java/pom.xml b/java/pom.xml
index b149f6ca34c85..b269097503e4b 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -752,19 +752,19 @@
         <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-compiler-plugin</artifactId>
-          <configuration>
-            <compilerArgs>
-              <arg>-XDcompilePolicy=simple</arg>
-              <arg>-Xplugin:ErrorProne</arg>
-            </compilerArgs>
-            <annotationProcessorPaths>
-              <path>
-                <groupId>com.google.errorprone</groupId>
-                <artifactId>error_prone_core</artifactId>
-                <version>2.4.0</version>
-              </path>
-            </annotationProcessorPaths>
-          </configuration>
+
+
+
+
+
+
+
+
+
+
+
+
+
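
Usage note (not part of the patch): a minimal sketch of how the reworked Java scan API above would be exercised, based only on the ScanOptions and Filter signatures in this diff. The class name ScanOptionsExample is hypothetical; the constructor, Filter.EMPTY, and the getters come from the changed files.

// Hypothetical illustration of the new ScanOptions(String[], Filter, long) constructor.
import java.util.Optional;

import org.apache.arrow.dataset.filter.Filter;
import org.apache.arrow.dataset.scanner.ScanOptions;

public class ScanOptionsExample {
  public static void main(String[] args) {
    // Project two columns, apply no filter, cap each record batch at 100 rows.
    ScanOptions options = new ScanOptions(new String[]{"id", "name"}, Filter.EMPTY, 100);

    Optional<String[]> columns = options.getColumns();  // Optional.of(new String[]{"id", "name"})
    Filter filter = options.getFilter();                // Filter.EMPTY serializes to an empty byte[]
    long batchSize = options.getBatchSize();            // 100

    System.out.println(columns.map(c -> c.length).orElse(0) + " columns, batch size " + batchSize
        + ", filter bytes " + filter.toByteArray().length);
  }
}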