From dececfda9fe3ca58d99d1d75c9f81d6a8d954059 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 25 Oct 2023 13:48:58 -0700 Subject: [PATCH 01/46] Fix merge --- .../src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index d0a5a9945ce20..8364b4a258889 100644 --- a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -161,7 +161,7 @@ public InnerAllocator() { } private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCapacity) { - PoolThreadCache cache = threadCache(); + PoolArenasCache cache = threadCache(); PoolArena directArena = cache.directArena; if (directArena != null) { From e56dba6c9c5f930e8064e682362c085c608a9ca3 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Tue, 25 Jul 2023 17:31:37 -0700 Subject: [PATCH 02/46] Test geo function return types --- cpp/src/arrow/type.cc | 5 ++ cpp/src/arrow/type.h | 3 ++ cpp/src/arrow/type_fwd.h | 6 +++ cpp/src/gandiva/function_registry_string.cc | 8 +++ cpp/src/gandiva/precompiled/string_ops.cc | 58 +++++++++++++++++++++ cpp/src/gandiva/precompiled/types.h | 12 +++++ 6 files changed, 92 insertions(+) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4804570bdf52f..60b71cbb71df7 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -856,6 +856,10 @@ StructType::StructType(const std::vector>& fields) children_ = fields; } +StructType::StructType() + : NestedType(Type::STRUCT) { +} + StructType::~StructType() {} std::string StructType::ToString() const { @@ -2527,6 +2531,7 @@ TYPE_FACTORY(float16, HalfFloatType) TYPE_FACTORY(float32, FloatType) TYPE_FACTORY(float64, DoubleType) TYPE_FACTORY(utf8, StringType) 
+TYPE_FACTORY(structType, StructType) TYPE_FACTORY(large_utf8, LargeStringType) TYPE_FACTORY(binary, BinaryType) TYPE_FACTORY(large_binary, LargeBinaryType) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 560805535dc4f..2957065f18e77 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1079,6 +1079,9 @@ class ARROW_EXPORT StructType : public NestedType { static constexpr const char* type_name() { return "struct"; } explicit StructType(const std::vector>& fields); + explicit StructType(); + StructType(const StructType& rhs) = delete; + StructType& operator=(const StructType& rhs) = delete; ~StructType() override; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 657abbaecc42b..450ed9a136d26 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -123,6 +123,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StructType; +class StructArray; +class StructBuilder; +struct StructScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -454,6 +459,7 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +ARROW_EXPORT const std::shared_ptr& structType(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 2bc6936d77b3c..1aff48fc456de 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -263,6 +263,14 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsFunctionHolder | NativeFunction::kCanReturnErrors), + NativeFunction("st_geohash", {}, DataTypeVector{float64(), float64()}, + utf8(), kResultNullIfNull, "gdv_fn_geo_hash_encode_float64_float64", + NativeFunction::kNeedsContext), 
+ + NativeFunction("st_fromgeohash", {}, DataTypeVector{utf8()}, + arrow::structType(), kResultNullIfNull, "gdv_fn_geo_hash_decode_utf8", + NativeFunction::kNeedsContext), + NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index c255b9a11c084..fc73aa7a55819 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -827,6 +827,64 @@ const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in return substr_utf8_int64_int64(context, input, in_len, offset64, in_len, out_len); } +FORCE_INLINE +const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float64 lat, gdv_float64 lon, + gdv_int32* out_len) { + //if (repeat_number == 0 || in_len <= 0) { + // *out_len = 0; + // return ""; + //} + + + //Gandiva-blarg + *out_len = 14; + char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + std::string out_string = "Gandiva-blarg"; + memcpy(ret, out_string.c_str(), *out_len); + return ret; +} + +FORCE_INLINE +const gdv_struct* gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len) { + gdv_struct* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, sizeof(gdv_struct))); + ret->lat = 42; + ret->lon = 142; + return ret; + + //if (repeat_number == 0 || in_len <= 0) { + // *out_len = 0; + // return ""; + //} + + /*auto s = arrow::struct_({field("a", arrow::int32(), false), field("b", arrow::int32(), false)}); + + MemoryPool* pool_ = default_memory_pool(); + std::unique_ptr tmp; + MakeBuilder(pool_, s, &tmp); + + + +//std::vector list_lengths = {42, 43}; +//std::vector list_offsets = {142, 
143}; +//410 ListBuilder* list_vb = checked_cast(builder_->field_builder(0)); + Int32Builder* int_vb = checked_cast(builder_->field_builder(0)); + Int32Builder* int_vb2 = checked_cast(builder_->field_builder(1)); +//420 ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(), +//421 list_is_valid.data())); + + int_vb->UnsafeAppend(42); + int_vb->UnsafeAppend(43); + int_vb2->UnsafeAppend(142); + int_vb2->UnsafeAppend(143); +*/ +} + FORCE_INLINE const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len, gdv_int32 repeat_number, gdv_int32* out_len) { diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 83bbdee208562..2f42355b4ca50 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -41,6 +41,13 @@ using gdv_utf8 = char*; using gdv_binary = char*; using gdv_day_time_interval = int64_t; +struct GeoStruct { + int32_t lat; + int32_t lon; +}; + +using gdv_struct = GeoStruct; + #ifdef GANDIVA_UNIT_TEST // unit tests may be compiled without O2, so inlining may not happen. 
#define FORCE_INLINE @@ -464,6 +471,11 @@ gdv_int64 truncate_int64_int32(gdv_int64 in, gdv_int32 out_scale); const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len, gdv_int32 repeat_times, gdv_int32* out_len); +const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float64 lat, gdv_float64 lon, + gdv_int32* out_len); + +const gdv_struct* gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len); + const char* substr_utf8_int64_int64(gdv_int64 context, const char* input, gdv_int32 in_len, gdv_int64 offset64, gdv_int64 length, gdv_int32* out_len); From 05fb64f92c14eb9dca33647fbe5fb49579f25baf Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 26 Jul 2023 09:55:20 -0700 Subject: [PATCH 03/46] Fix unix build --- cpp/src/arrow/c/bridge.cc | 2 +- cpp/src/arrow/type.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 85a5156d11db2..32dbc088a7118 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -195,7 +195,7 @@ struct SchemaExporter { } Status ExportSchema(const Schema& schema) { - static const StructType dummy_struct_type({}); + static const StructType dummy_struct_type = StructType(); flags_ = 0; RETURN_NOT_OK(ExportFormat(dummy_struct_type)); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 2957065f18e77..ddeb45b721f89 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1081,7 +1081,7 @@ class ARROW_EXPORT StructType : public NestedType { explicit StructType(const std::vector>& fields); explicit StructType(); StructType(const StructType& rhs) = delete; - StructType& operator=(const StructType& rhs) = delete; + StructType& operator=(const StructType& rhs) = delete; ~StructType() override; From 0593acd14954a53dfd6f1497c87ccbfbaec24be5 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 3 Aug 2023 14:21:52 -0700 Subject: [PATCH 04/46] Struct testing --- 
cpp/src/gandiva/expression_registry.cc | 3 +++ cpp/src/gandiva/function_registry_string.cc | 3 ++- cpp/src/gandiva/llvm_generator.cc | 14 ++++++++++++++ cpp/src/gandiva/llvm_types.cc | 1 + cpp/src/gandiva/llvm_types.h | 4 ++++ cpp/src/gandiva/precompiled/string_ops.cc | 9 +++++---- cpp/src/gandiva/precompiled/types.h | 6 +++--- cpp/src/gandiva/tree_expr_builder.cc | 7 ++++--- java/gandiva/CMakeLists.txt | 2 +- .../src/main/cpp/expression_registry_helper.cc | 10 +++++++++- java/gandiva/src/main/cpp/jni_common.cc | 3 ++- .../gandiva/evaluator/ExpressionRegistry.java | 3 ++- .../arrow/gandiva/evaluator/Projector.java | 18 +++++++++++------- .../gandiva/expression/ArrowTypeHelper.java | 6 ++++++ 14 files changed, 67 insertions(+), 22 deletions(-) diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc index 9bff97f5ad269..b5b79af7f818a 100644 --- a/cpp/src/gandiva/expression_registry.cc +++ b/cpp/src/gandiva/expression_registry.cc @@ -166,6 +166,9 @@ static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector case arrow::Type::type::INTERVAL_DAY_TIME: vector.push_back(arrow::day_time_interval()); break; + case arrow::Type::type::STRUCT: + vector.push_back(arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)})); + break; default: // Unsupported types. test ensures that // when one of these are added build breaks. 
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 1aff48fc456de..442cdecbde7d3 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -268,7 +268,8 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsContext), NativeFunction("st_fromgeohash", {}, DataTypeVector{utf8()}, - arrow::structType(), kResultNullIfNull, "gdv_fn_geo_hash_decode_utf8", + arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)}), kResultNullIfNull, "gdv_fn_geo_hash_decode_utf8", + //arrow::structType(), kResultNullIfNull, "gdv_fn_geo_hash_decode_utf8", NativeFunction::kNeedsContext), NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 1615eece1f2c7..b757ef23f93cc 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -17,6 +17,7 @@ #include "gandiva/llvm_generator.h" +#include #include #include #include @@ -397,6 +398,10 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, AddFunctionCall("gdv_fn_populate_varlen_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length()}); + } else if (output_type_id == arrow::Type::STRUCT) { + std::cout << "LR creating struct type to store the result." 
<< std::endl; + auto slot_offset = builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var); + builder->CreateStore(output_value->data(), slot_offset); } else { return Status::NotImplemented("output type ", output->Type()->ToString(), " not supported"); @@ -530,6 +535,15 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, value = ir_builder()->CreateCall(fn, args); } else { value = ir_builder()->CreateCall(fn, args, full_name); + + std::string str; + llvm::raw_string_ostream output(str); + std::string str2; + llvm::raw_string_ostream output2(str2); + ret_type->print(output); + value->getType()->print(output2); + std::cout << "LR ret type " << str << " value ret type " << str2 << std::endl; + DCHECK(value->getType() == ret_type); } diff --git a/cpp/src/gandiva/llvm_types.cc b/cpp/src/gandiva/llvm_types.cc index de322a8c0fcb5..f68fd098f6bef 100644 --- a/cpp/src/gandiva/llvm_types.cc +++ b/cpp/src/gandiva/llvm_types.cc @@ -42,6 +42,7 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) { {arrow::Type::type::BINARY, i8_ptr_type()}, {arrow::Type::type::DECIMAL, i128_type()}, {arrow::Type::type::INTERVAL_MONTHS, i32_type()}, + {arrow::Type::type::STRUCT, struct_type()}, {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}}; } diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index d6f0952713efc..77480ab57d06a 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -46,6 +46,10 @@ class GANDIVA_EXPORT LLVMTypes { llvm::Type* i128_type() { return llvm::Type::getInt128Ty(context_); } + llvm::StructType* struct_type() { + return llvm::StructType::get(context_, {double_type(), double_type()}, false); + } + llvm::StructType* i128_split_type() { // struct with high/low bits (see decimal_ops.cc:DecimalSplit) return llvm::StructType::get(context_, {i64_type(), i64_type()}, false); diff --git a/cpp/src/gandiva/precompiled/string_ops.cc 
b/cpp/src/gandiva/precompiled/string_ops.cc index fc73aa7a55819..9c4458ea1b705 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -851,10 +851,11 @@ const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float6 } FORCE_INLINE -const gdv_struct* gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len) { - gdv_struct* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, sizeof(gdv_struct))); - ret->lat = 42; - ret->lon = 142; +const gdv_struct gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len) { + //gdv_struct* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, sizeof(gdv_struct))); + gdv_struct ret; + ret.lattitude = 42; + ret.longitude = 142; return ret; //if (repeat_number == 0 || in_len <= 0) { diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 2f42355b4ca50..1867f5b785eed 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -42,8 +42,8 @@ using gdv_binary = char*; using gdv_day_time_interval = int64_t; struct GeoStruct { - int32_t lat; - int32_t lon; + double lattitude; + double longitude; }; using gdv_struct = GeoStruct; @@ -474,7 +474,7 @@ const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_le const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float64 lat, gdv_float64 lon, gdv_int32* out_len); -const gdv_struct* gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len); +const gdv_struct gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len); const char* substr_utf8_int64_int64(gdv_int64 context, const char* input, gdv_int32 in_len, gdv_int64 offset64, diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 82bb661ecda80..45c7d7bfe7647 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ 
b/cpp/src/gandiva/tree_expr_builder.cc @@ -144,12 +144,13 @@ NodePtr TreeExprBuilder::MakeOr(const NodeVector& children) { static bool print_expr = false; ExpressionPtr TreeExprBuilder::MakeExpression(NodePtr root_node, FieldPtr result_field) { + if (true || print_expr) { + std::cout << "Expression: " << root_node->ToString() << "\n"; + } if (result_field == nullptr) { + std::cout << "LR MakeExpression result_field is null" << std::endl; return nullptr; } - if (print_expr) { - std::cout << "Expression: " << root_node->ToString() << "\n"; - } return ExpressionPtr(new Expression(root_node, result_field)); } diff --git a/java/gandiva/CMakeLists.txt b/java/gandiva/CMakeLists.txt index 629ab2fb347d8..60762f6307c06 100644 --- a/java/gandiva/CMakeLists.txt +++ b/java/gandiva/CMakeLists.txt @@ -38,7 +38,7 @@ set(GANDIVA_PROTO_DIR ${CMAKE_CURRENT_SOURCE_DIR}/proto) get_filename_component(GANDIVA_PROTO_FILE_ABSOLUTE ${GANDIVA_PROTO_DIR}/Types.proto ABSOLUTE) -find_package(Protobuf REQUIRED) +find_package(Protobuf CONFIG REQUIRED) add_custom_command(OUTPUT ${GANDIVA_PROTO_OUTPUT_FILES} COMMAND protobuf::protoc --proto_path ${GANDIVA_PROTO_DIR} --cpp_out ${GANDIVA_PROTO_OUTPUT_DIR} ${GANDIVA_PROTO_FILE_ABSOLUTE} diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index 6765df3b9727f..9bf5a07426d98 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -136,10 +136,15 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) gandiva_data_type->set_type(types::GandivaType::INTERVAL); gandiva_data_type->set_intervaltype(types::IntervalType::DAY_TIME); break; + case arrow::Type::STRUCT: + gandiva_data_type->set_type(types::GandivaType::STRUCT); + break; default: // un-supported types. test ensures that // when one of these are added build breaks. 
- DCHECK(false); + //DCHECK(false); + printf("LR Found unsupported type %d\n", type->id()); + fflush(stdout); } } @@ -172,6 +177,9 @@ Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSu types::GandivaFunctions gandiva_functions; for (auto function = expr_registry.function_signature_begin(); function != expr_registry.function_signature_end(); function++) { + printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).base_name().c_str()); + fflush(stdout); + types::FunctionSignature* function_signature = gandiva_functions.add_function(); function_signature->set_name((*function).base_name()); types::ExtGandivaType* return_type = function_signature->mutable_returntype(); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index d5e54f38e3692..35dcdf2ee2d61 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -268,9 +268,10 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { return ProtoTypeToTimestamp(ext_type); case types::INTERVAL: return ProtoTypeToInterval(ext_type); + case types::STRUCT: + return arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)}); case types::FIXED_SIZE_BINARY: case types::LIST: - case types::STRUCT: case types::UNION: case types::DICTIONARY: case types::MAP: diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 0155af08234ad..32990330ee310 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -178,11 +178,12 @@ private static ArrowType getArrowType(ExtGandivaType type) { return new ArrowType.Decimal(0, 0, 128); case GandivaType.INTERVAL_VALUE: return new 
ArrowType.Interval(mapArrowIntervalUnit(type.getIntervalType())); + case GandivaType.STRUCT_VALUE: + return new ArrowType.Struct(); case GandivaType.FIXED_SIZE_BINARY_VALUE: case GandivaType.MAP_VALUE: case GandivaType.DICTIONARY_VALUE: case GandivaType.LIST_VALUE: - case GandivaType.STRUCT_VALUE: case GandivaType.UNION_VALUE: default: assert false; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index c146fce26c150..5982ea31d6239 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -22,16 +22,15 @@ import org.apache.arrow.gandiva.exceptions.EvaluatorClosedException; import org.apache.arrow.gandiva.exceptions.GandivaException; -import org.apache.arrow.gandiva.exceptions.UnsupportedTypeException; import org.apache.arrow.gandiva.expression.ArrowTypeHelper; import org.apache.arrow.gandiva.expression.ExpressionTree; import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.gandiva.ipc.GandivaTypes.SelectionVectorType; import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.vector.BaseVariableWidthVector; -import org.apache.arrow.vector.FixedWidthVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; +import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.message.ArrowBuffer; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Schema; @@ -357,12 +356,12 @@ private void evaluate(int numRows, List buffers, List buf idx = 0; int outColumnIdx = 0; for (ValueVector valueVector : outColumns) { - boolean isFixedWith = valueVector instanceof FixedWidthVector; + /*boolean isFixedWith = valueVector instanceof FixedWidthVector;*/ boolean isVarWidth = valueVector instanceof 
VariableWidthVector; - if (!isFixedWith && !isVarWidth) { + /*if (!isFixedWith && !isVarWidth) { throw new UnsupportedTypeException( "Unsupported value vector type " + valueVector.getField().getFieldType()); - } + }*/ outAddrs[idx] = valueVector.getValidityBuffer().memoryAddress(); outSizes[idx++] = valueVector.getValidityBuffer().capacity(); @@ -374,8 +373,13 @@ private void evaluate(int numRows, List buffers, List buf // save vector to allow for resizing. resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; } - outAddrs[idx] = valueVector.getDataBuffer().memoryAddress(); - outSizes[idx++] = valueVector.getDataBuffer().capacity(); + if (valueVector instanceof StructVector) { + outAddrs[idx] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().memoryAddress(); + outSizes[idx++] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().capacity(); + } else { + outAddrs[idx] = valueVector.getDataBuffer().memoryAddress(); + outSizes[idx++] = valueVector.getDataBuffer().capacity(); + } valueVector.setValueCount(selectionVectorRecordCount); outColumnIdx++; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 90f8684b455a8..40b975d064351 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -143,6 +143,11 @@ private static void initArrowTypeDate(ArrowType.Date dateType, } } + private static void initArrowTypeStruct(ArrowType.Struct structType, + GandivaTypes.ExtGandivaType.Builder builder) { + builder.setType(GandivaTypes.GandivaType.STRUCT); + } + private static void initArrowTypeTime(ArrowType.Time timeType, GandivaTypes.ExtGandivaType.Builder builder) { short timeUnit = timeType.getUnit().getFlatbufID(); @@ -287,6 +292,7 @@ public static 
GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowTyp break; } case Type.Struct_: { // 13 + ArrowTypeHelper.initArrowTypeStruct((ArrowType.Struct) arrowType, builder); break; } case Type.Union: { // 14 From 690f5ea8c8506216541c5c48edf9d58f3bb22f4d Mon Sep 17 00:00:00 2001 From: Jiangtao Peng Date: Fri, 1 Jan 2021 20:30:30 +0800 Subject: [PATCH 05/46] Merge in list/array stuff --- cpp/src/gandiva/annotator.cc | 58 +++++- cpp/src/gandiva/dex.h | 28 +++ cpp/src/gandiva/dex_visitor.h | 6 + cpp/src/gandiva/expr_decomposer.cc | 12 +- cpp/src/gandiva/expr_validator.cc | 4 +- cpp/src/gandiva/field_descriptor.h | 12 +- cpp/src/gandiva/gdv_function_stubs.cc | 126 +++++++++++++ cpp/src/gandiva/llvm_generator.cc | 151 +++++++++++++++- cpp/src/gandiva/llvm_generator.h | 13 +- cpp/src/gandiva/llvm_types.h | 11 ++ cpp/src/gandiva/llvm_types_test.cc | 10 + cpp/src/gandiva/lvalue.h | 23 +++ cpp/src/gandiva/projector.cc | 160 +++++++++++++++- cpp/src/gandiva/tests/CMakeLists.txt | 1 + cpp/src/gandiva/tests/list_test.cc | 171 ++++++++++++++++++ .../tests/projector_build_validation_test.cc | 5 +- 16 files changed, 768 insertions(+), 23 deletions(-) create mode 100644 cpp/src/gandiva/tests/list_test.cc diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index b341fdde3a3f4..540420563ae1d 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -46,15 +46,23 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { int data_idx = buffer_count_++; int validity_idx = buffer_count_++; int offsets_idx = FieldDescriptor::kInvalidIdx; + int child_offsets_idx = FieldDescriptor::kInvalidIdx; if (arrow::is_binary_like(field->type()->id())) { offsets_idx = buffer_count_++; } + + if (field->type()->id() == arrow::Type::LIST) { + offsets_idx = buffer_count_++; + if (arrow::is_binary_like(field->type()->field(0)->type()->id())) { + child_offsets_idx = buffer_count_++; + } + } int data_buffer_ptr_idx = 
FieldDescriptor::kInvalidIdx; if (is_output) { data_buffer_ptr_idx = buffer_count_++; } return std::make_shared(field, data_idx, validity_idx, offsets_idx, - data_buffer_ptr_idx); + data_buffer_ptr_idx, child_offsets_idx); } int Annotator::AddHolderPointer(void* holder) { @@ -80,16 +88,52 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, if (desc.HasOffsetsIdx()) { uint8_t* offsets_buf = const_cast(array_data.buffers[buffer_idx]->data()); eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset); - ++buffer_idx; + + if (desc.HasChildOffsetsIdx()) { + if (is_output) { + // if list field is output field, we should put buffer pointer into eval batch + // for resizing + uint8_t* child_offsets_buf = reinterpret_cast( + array_data.child_data.at(0)->buffers[buffer_idx].get()); + eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, + array_data.child_data.at(0)->offset); + } else { + // if list field is input field, just put buffer data into eval batch + uint8_t* child_offsets_buf = const_cast( + array_data.child_data.at(0)->buffers[buffer_idx]->data()); + eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, + array_data.child_data.at(0)->offset); + } + } + if (array_data.type->id() != arrow::Type::LIST || + arrow::is_binary_like(array_data.type->field(0)->type()->id())) + // primitive type list data buffer index is 1 + // binary like type list data buffer index is 2 + ++buffer_idx; + } + + if (array_data.type->id() != arrow::Type::LIST) { + uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); + eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); + } else { + uint8_t* data_buf = + const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); + eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); } - uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); - eval_batch->SetBuffer(desc.data_idx(), data_buf, 
array_data.offset); if (is_output) { // pass in the Buffer object for output data buffers. Can be used for resizing. - uint8_t* data_buf_ptr = - reinterpret_cast(array_data.buffers[buffer_idx].get()); - eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); + if (array_data.type->id() != arrow::Type::LIST) { + uint8_t* data_buf_ptr = + reinterpret_cast(array_data.buffers[buffer_idx].get()); + eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); + } else { + // list data buffer is in child data buffer + uint8_t* data_buf_ptr = reinterpret_cast( + array_data.child_data.at(0)->buffers[buffer_idx].get()); + eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, + array_data.child_data.at(0)->offset); + } } } diff --git a/cpp/src/gandiva/dex.h b/cpp/src/gandiva/dex.h index 2998c2131769a..c35ee93dc03a2 100644 --- a/cpp/src/gandiva/dex.h +++ b/cpp/src/gandiva/dex.h @@ -80,6 +80,19 @@ class GANDIVA_EXPORT VectorReadFixedLenValueDex : public VectorReadBaseDex { void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } }; +/// value component of a fixed-len list ValueVector +class GANDIVA_EXPORT VectorReadFixedLenValueListDex : public VectorReadBaseDex { + public: + explicit VectorReadFixedLenValueListDex(FieldDescriptorPtr field_desc) + : VectorReadBaseDex(field_desc) {} + + int DataIdx() const { return field_desc_->data_idx(); } + + int OffsetsIdx() const { return field_desc_->offsets_idx(); } + + void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } +}; + /// value component of a variable-len ValueVector class GANDIVA_EXPORT VectorReadVarLenValueDex : public VectorReadBaseDex { public: @@ -93,6 +106,21 @@ class GANDIVA_EXPORT VectorReadVarLenValueDex : public VectorReadBaseDex { void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } }; +/// value component of a variable-len list ValueVector +class GANDIVA_EXPORT VectorReadVarLenValueListDex : public VectorReadBaseDex 
{ + public: + explicit VectorReadVarLenValueListDex(FieldDescriptorPtr field_desc) + : VectorReadBaseDex(field_desc) {} + + int DataIdx() const { return field_desc_->data_idx(); } + + int OffsetsIdx() const { return field_desc_->offsets_idx(); } + + int ChildOffsetsIdx() const { return field_desc_->child_data_offsets_idx(); } + + void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } +}; + /// validity based on a local bitmap. class GANDIVA_EXPORT LocalBitMapValidityDex : public Dex { public: diff --git a/cpp/src/gandiva/dex_visitor.h b/cpp/src/gandiva/dex_visitor.h index 5d160bb22ca68..4a03b9c21fc8a 100644 --- a/cpp/src/gandiva/dex_visitor.h +++ b/cpp/src/gandiva/dex_visitor.h @@ -28,7 +28,9 @@ namespace gandiva { class VectorReadValidityDex; class VectorReadFixedLenValueDex; +class VectorReadFixedLenValueListDex; class VectorReadVarLenValueDex; +class VectorReadVarLenValueListDex; class LocalBitMapValidityDex; class LiteralDex; class TrueDex; @@ -49,7 +51,9 @@ class GANDIVA_EXPORT DexVisitor { virtual void Visit(const VectorReadValidityDex& dex) = 0; virtual void Visit(const VectorReadFixedLenValueDex& dex) = 0; + virtual void Visit(const VectorReadFixedLenValueListDex& dex) = 0; virtual void Visit(const VectorReadVarLenValueDex& dex) = 0; + virtual void Visit(const VectorReadVarLenValueListDex& dex) = 0; virtual void Visit(const LocalBitMapValidityDex& dex) = 0; virtual void Visit(const TrueDex& dex) = 0; virtual void Visit(const FalseDex& dex) = 0; @@ -75,7 +79,9 @@ class GANDIVA_EXPORT DexVisitor { class GANDIVA_EXPORT DexDefaultVisitor : public DexVisitor { VISIT_DCHECK(VectorReadValidityDex) VISIT_DCHECK(VectorReadFixedLenValueDex) + VISIT_DCHECK(VectorReadFixedLenValueListDex) VISIT_DCHECK(VectorReadVarLenValueDex) + VISIT_DCHECK(VectorReadVarLenValueListDex) VISIT_DCHECK(LocalBitMapValidityDex) VISIT_DCHECK(TrueDex) VISIT_DCHECK(FalseDex) diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 
957d9d046bd57..8ff48e5b5957f 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -39,8 +39,16 @@ Status ExprDecomposer::Visit(const FieldNode& node) { DexPtr validity_dex = std::make_shared(desc); DexPtr value_dex; - if (desc->HasOffsetsIdx()) { - value_dex = std::make_shared(desc); + if (desc->HasChildOffsetsIdx()) { + // handle list type + value_dex = std::make_shared(desc); + } else if (desc->HasOffsetsIdx()) { + if (desc->field()->type()->id() == arrow::Type::LIST) { + // handle list type + value_dex = std::make_shared(desc); + } else { + value_dex = std::make_shared(desc); + } } else { value_dex = std::make_shared(desc); } diff --git a/cpp/src/gandiva/expr_validator.cc b/cpp/src/gandiva/expr_validator.cc index 35a13494523d0..265f2c119cd0e 100644 --- a/cpp/src/gandiva/expr_validator.cc +++ b/cpp/src/gandiva/expr_validator.cc @@ -67,7 +67,7 @@ Status ExprValidator::Validate(const ExpressionPtr& expr) { } Status ExprValidator::Visit(const FieldNode& node) { - auto llvm_type = types_->IRType(node.return_type()->id()); + auto llvm_type = types_->DataVecType(node.return_type()); ARROW_RETURN_IF(llvm_type == nullptr, Status::ExpressionValidationError("Field ", node.field()->name(), " has unsupported data type ", @@ -136,7 +136,7 @@ Status ExprValidator::Visit(const IfNode& node) { } Status ExprValidator::Visit(const LiteralNode& node) { - auto llvm_type = types_->IRType(node.return_type()->id()); + auto llvm_type = types_->DataVecType(node.return_type()); ARROW_RETURN_IF(llvm_type == nullptr, Status::ExpressionValidationError("Value ", ToString(node.holder()), " has unsupported data type ", diff --git a/cpp/src/gandiva/field_descriptor.h b/cpp/src/gandiva/field_descriptor.h index 0fe6fe37f4dd3..7b2d0c3b4fa92 100644 --- a/cpp/src/gandiva/field_descriptor.h +++ b/cpp/src/gandiva/field_descriptor.h @@ -30,12 +30,14 @@ class FieldDescriptor { static const int kInvalidIdx = -1; FieldDescriptor(FieldPtr field, int data_idx, int 
validity_idx = kInvalidIdx, - int offsets_idx = kInvalidIdx, int data_buffer_ptr_idx = kInvalidIdx) + int offsets_idx = kInvalidIdx, int data_buffer_ptr_idx = kInvalidIdx, + int child_offsets_idx = kInvalidIdx) : field_(field), data_idx_(data_idx), validity_idx_(validity_idx), offsets_idx_(offsets_idx), - data_buffer_ptr_idx_(data_buffer_ptr_idx) {} + data_buffer_ptr_idx_(data_buffer_ptr_idx), + child_offsets_idx_(child_offsets_idx) {} /// Index of validity array in the array-of-buffers int validity_idx() const { return validity_idx_; } @@ -49,6 +51,9 @@ class FieldDescriptor { /// Index of data buffer pointer in the array-of-buffers int data_buffer_ptr_idx() const { return data_buffer_ptr_idx_; } + /// Index of list type child data offsets + int child_data_offsets_idx() const { return child_offsets_idx_; } + FieldPtr field() const { return field_; } const std::string& Name() const { return field_->name(); } @@ -58,12 +63,15 @@ class FieldDescriptor { bool HasDataBufferPtrIdx() const { return data_buffer_ptr_idx_ != kInvalidIdx; } + bool HasChildOffsetsIdx() const { return child_offsets_idx_ != kInvalidIdx; } + private: FieldPtr field_; int data_idx_; int validity_idx_; int offsets_idx_; int data_buffer_ptr_idx_; + int child_offsets_idx_; }; } // namespace gandiva diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 5146f7fa1990a..a7c33ee6831a9 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -37,6 +37,31 @@ #include "gandiva/to_date_holder.h" /// Stub functions that can be accessed from LLVM or the pre-compiled library. 
+#define POPULATE_NUMERIC_LIST_TYPE_VECTOR(TYPE, SCALE) \ + int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ + int32_t* offsets, int64_t slot, \ + TYPE* entry_buf, int32_t entry_len) { \ + auto buffer = reinterpret_cast(data_ptr); \ + int32_t offset = static_cast(buffer->size()); \ + auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ + if (!status.ok()) { \ + gandiva::ExecutionContext* context = \ + reinterpret_cast(context_ptr); \ + context->set_error_msg(status.message().c_str()); \ + return -1; \ + } \ + memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ + offsets[slot] = offset / SCALE; \ + offsets[slot + 1] = offset / SCALE + entry_len; \ + return 0; \ + } + +#define ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(LLVM_TYPE, DATA_TYPE) \ + args = {types->i64_type(), types->i8_ptr_type(), types->i32_ptr_type(), \ + types->i64_type(), types->LLVM_TYPE##_ptr_type(), types->i32_type()}; \ + engine->AddGlobalMappingForFunc( \ + "gdv_fn_populate_list_" #DATA_TYPE "_vector", types->i32_type() /*return_type*/, \ + args, reinterpret_cast(gdv_fn_populate_list_##DATA_TYPE##_vector)); extern "C" { @@ -161,6 +186,71 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, return 0; } +POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) +POPULATE_NUMERIC_LIST_TYPE_VECTOR(int64_t, 8) +POPULATE_NUMERIC_LIST_TYPE_VECTOR(float, 4) +POPULATE_NUMERIC_LIST_TYPE_VECTOR(double, 8) + +int32_t gdv_fn_populate_list_varlen_vector(int64_t context_ptr, int8_t* data_ptr, + int32_t* offsets, int32_t* child_offsets, + int64_t slot, const char* entry_buf, + int32_t* entry_child_offsets, + int32_t entry_offsets_len) { + // we should calculate varlen list type varlen offset + // copy from entry child offsets + // it should be noted that, + // buffer size unit is byte(8 bit), + // offset element unit is int32(32 bit) + auto child_offsets_buffer = reinterpret_cast(child_offsets); + 
int32_t child_offsets_buffer_offset = + static_cast(child_offsets_buffer->size()); + + // data buffer element is char (8 bit) + auto data_buffer = reinterpret_cast(data_ptr); + int32_t data_buffer_offset = static_cast(data_buffer->size()); + + // sets the size in the child offsets buffer + // offsets element is int32, we should resize buffer by extra offsets_len * 4 + auto status = child_offsets_buffer->Resize( + child_offsets_buffer_offset + entry_offsets_len * 4, false /*shrink*/); + if (!status.ok()) { + gandiva::ExecutionContext* context = + reinterpret_cast(context_ptr); + + context->set_error_msg(status.message().c_str()); + return -1; + } + + // append the new child offsets entry to child offsets buffer + // offsets buffer last offset number indicating data length + // we should take this extra offset into consideration + // so the initial child_offsets_buffer length is 1 (int32) + memcpy(child_offsets_buffer->mutable_data() + child_offsets_buffer_offset - 4, + (char*)entry_child_offsets, (entry_offsets_len + 1) * 4); + + // compute data length + int32_t data_length = + *(entry_child_offsets + entry_offsets_len) - *(entry_child_offsets); + + // sets the size of the data buffer. + status = data_buffer->Resize(data_buffer_offset + data_length, false /*shrink*/); + if (!status.ok()) { + gandiva::ExecutionContext* context = + reinterpret_cast(context_ptr); + + context->set_error_msg(status.message().c_str()); + return -1; + } + + // append the new entry data to the data buffer + memcpy(data_buffer->mutable_data() + data_buffer_offset, entry_buf, data_length); + + // update offsets buffer.
+ offsets[slot] = child_offsets_buffer_offset / 4 - 1; + offsets[slot + 1] = child_offsets_buffer_offset / 4 - 1 + entry_offsets_len; + return 0; +} + #define CRC_FUNCTION(TYPE) \ GANDIVA_EXPORT \ int64_t gdv_fn_crc_32_##TYPE(int64_t ctx, const char* input, int32_t input_len) { \ @@ -1174,6 +1264,24 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i32_type() /*return_type*/, args, reinterpret_cast(gdv_fn_cast_intervalyear_utf8)); + engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_utf8", + types->i1_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_in_expr_lookup_utf8)); + + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i32, int32_t) + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i64, int64_t) + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(float, float) + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(double, double) + + // gdv_fn_populate_varlen_vector + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* offsets ptr + types->i64_type(), // int64_t slot + types->i8_ptr_type(), // const char* entry_buf + types->i32_type()}; // int32_t entry__len + + engine->AddGlobalMappingForFunc("gdv_fn_populate_varlen_vector", // gdv_fn_cast_intervalyear_utf8_int32 args = { types->i64_type(), // context @@ -1190,6 +1298,24 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { "gdv_fn_cast_intervalyear_utf8_int32", types->i32_type() /*return_type*/, args, reinterpret_cast(gdv_fn_cast_intervalyear_utf8_int32)); + // gdv_fn_populate_list_varlen_vector + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* offsets ptr + types->i32_ptr_type(), // int32_t* child offsets ptr + types->i64_type(), // int64_t slot + types->i8_ptr_type(), // const char* entry_buf + types->i32_ptr_type(), // int32_t* entry child offsets ptr + types->i32_type()}; // 
int32_t entry child offsets length + + engine->AddGlobalMappingForFunc( + "gdv_fn_populate_list_varlen_vector", types->i32_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_populate_list_varlen_vector)); + + // gdv_fn_random + args = {types->i64_type()}; + engine->AddGlobalMappingForFunc("gdv_fn_random", types->double_type(), args, + reinterpret_cast(gdv_fn_random)); // to_utc_timezone_timestamp args = { types->i64_type(), // context diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index b757ef23f93cc..fc973acb6fc97 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -31,7 +31,6 @@ #include "gandiva/lvalue.h" namespace gandiva { - #define ADD_TRACE(...) \ if (enable_ir_traces_) { \ AddTrace(__VA_ARGS__); \ @@ -211,6 +210,14 @@ llvm::Value* LLVMGenerator::GetOffsetsReference(llvm::Value* arg_addrs, int idx, return ir_builder()->CreateIntToPtr(load, types()->i32_ptr_type(), name + "_oarray"); } +/// Get reference to child offsets array at specified index in the args list. +llvm::Value* LLVMGenerator::GetChildOffsetsReference(llvm::Value* arg_addrs, int idx, + FieldPtr field) { + const std::string& name = field->name(); + llvm::Value* load = LoadVectorAtIndex(arg_addrs, idx, name); + return ir_builder()->CreateIntToPtr(load, types()->i32_ptr_type(), name + "_coarray"); +} + /// Get reference to local bitmap array at specified index in the args list. llvm::Value* LLVMGenerator::GetLocalBitMapReference(llvm::Value* arg_bitmaps, int idx) { llvm::Value* load = LoadVectorAtIndex(arg_bitmaps, types()->i64_type(), idx, ""); @@ -402,6 +409,38 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, std::cout << "LR creating struct type to store the result." 
<< std::endl; auto slot_offset = builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var); builder->CreateStore(output_value->data(), slot_offset); + } else if (output_type_id == arrow::Type::LIST) { + auto output_list_internal_type = output->Type()->field(0)->type()->id(); + if (arrow::is_binary_like(output_list_internal_type)) { + auto output_list_value = std::dynamic_pointer_cast(output_value); + llvm::Value* child_output_offset_ref = GetChildOffsetsReference( + arg_addrs, output->child_data_offsets_idx(), output->field()); + AddFunctionCall( + "gdv_fn_populate_list_varlen_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + child_output_offset_ref, loop_var, output_list_value->data(), + output_list_value->child_offsets(), output_list_value->offsets_length()}); + } else if (output_list_internal_type == arrow::Type::INT32) { + AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else if (output_list_internal_type == arrow::Type::INT64) { + AddFunctionCall("gdv_fn_populate_list_int64_t_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else if (output_list_internal_type == arrow::Type::FLOAT) { + AddFunctionCall("gdv_fn_populate_list_float_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else if (output_list_internal_type == arrow::Type::DOUBLE) { + AddFunctionCall("gdv_fn_populate_list_double_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else { + return Status::NotImplemented("list internal type ", + 
output->Type()->field(0)->type()->ToString(), + " not supported"); + } } else { return Status::NotImplemented("output type ", output->Type()->ToString(), " not supported"); @@ -620,6 +659,46 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { result_ = lvalue; } +void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { + llvm::IRBuilder<>* builder = ir_builder(); + llvm::Value* slot; + + // compute list len from the offsets array. + llvm::Value* offsets_slot_ref = + GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); + llvm::Value* offsets_slot_index = + builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); + + // => offset_start = offsets[loop_var] + slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index); + llvm::Value* offset_start = builder->CreateLoad(slot, "offset_start"); + + // => offset_end = offsets[loop_var + 1] + llvm::Value* offsets_slot_index_next = builder->CreateAdd( + offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); + slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index_next); + llvm::Value* offset_end = builder->CreateLoad(slot, "offset_end"); + + // => offsets_len_value = offset_end - offset_start + llvm::Value* list_len = builder->CreateSub(offset_end, offset_start, "offsets_len"); + + // get data array + llvm::Value* slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); + // do not forget slice offset + llvm::Value* offset_start_int64 = + builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); + llvm::Value* slot_index = + builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.DataIdx())); + llvm::Value* data_list = builder->CreateGEP(slot_ref, slot_index); + + // TODO: handle bool type bitmap + // TODO: handle decimal precision and scale + + ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " length %T", + list_len); + result_.reset(new LValue(data_list, 
list_len)); +} + void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot; @@ -655,6 +734,65 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { result_.reset(new LValue(data_value, len_value)); } +/* + * create list type field context for each loop + */ +void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { + /* Example + * list_data: [["var_len_val11"], ["var_len_val211", "var_len_val22"], + * ["var_len_val3331"]] loop_var: 0, 1, 2 data_buffer: + * var_len_val11var_len_val211var_len_val22var_len_val3331 offsets_buffer: 0, 1, 3, 4 + * list_element_len = offsets[loop_var+1]-offsets[loop_var] => 1, 2, 1 + * child_offsets_buffer: 0, 13, 27, 40, 55 + * for i in list_element_len: + * data_buffer[child_offsets_buffer[offsets[i+1]] - child_offsets_buffer[offsets[i]]] + * => list_data[loop_var][i] + */ + llvm::IRBuilder<>* builder = ir_builder(); + llvm::Value* slot; + + // compute list length from the offsets array + llvm::Value* offsets_slot_ref = + GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); + llvm::Value* offsets_slot_index = + builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); + + // => offset_start = offsets[loop_var] + slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index); + llvm::Value* offset_start = builder->CreateLoad(slot, "offset_start"); + + // => offset_end = offsets[loop_var + 1] + llvm::Value* offsets_slot_index_next = builder->CreateAdd( + offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); + slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index_next); + llvm::Value* offset_end = builder->CreateLoad(slot, "offset_end"); + + // => list_data_length = offset_end - offset_start + llvm::Value* list_data_length = + builder->CreateSub(offset_end, offset_start, "offsets_len"); + + // get the child offsets array from the child offsets array, + // start from 
offset 'offset_start' + llvm::Value* child_offset_slot_ref = + GetBufferReference(dex.ChildOffsetsIdx(), kBufferTypeChildOffsets, dex.Field()); + // do not forget slice offset + llvm::Value* offset_start_int64 = + builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); + llvm::Value* child_offset_slot_index = + builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.ChildOffsetsIdx())); + llvm::Value* child_offsets = + builder->CreateGEP(child_offset_slot_ref, child_offset_slot_index); + llvm::Value* child_offset_start = + builder->CreateLoad(child_offsets, "child_offset_start"); + + // get the data array + llvm::Value* data_slot_ref = + GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); + llvm::Value* data_value = builder->CreateGEP(data_slot_ref, child_offset_start); + + result_.reset(new ListLValue(data_value, child_offsets, list_data_length)); +} + void LLVMGenerator::Visitor::Visit(const VectorReadValidityDex& dex) { llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot_ref = @@ -791,7 +929,7 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { llvm::IRBuilder<>* builder = ir_builder(); LLVMTypes* types = generator_->types(); auto arrow_type_id = arrow_return_type->id(); - auto result_type = types->IRType(arrow_type_id); + auto result_type = types->DataVecType(arrow_return_type); // Build combined validity of the args. llvm::Value* is_valid = types->true_constant(); @@ -1194,7 +1332,7 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, // Emit the merge block. 
builder->SetInsertPoint(merge_bb); - auto llvm_type = types->IRType(result_type->id()); + auto llvm_type = types->DataVecType(result_type); llvm::PHINode* result_value = builder->CreatePHI(llvm_type, 2, "res_value"); result_value->addIncoming(then_lvalue->data(), then_bb); result_value->addIncoming(else_lvalue->data(), else_bb); @@ -1240,7 +1378,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, std::vector* params) { auto types = generator_->types(); auto arrow_return_type_id = arrow_return_type->id(); - auto llvm_return_type = types->IRType(arrow_return_type_id); + auto llvm_return_type = types->DataVecType(arrow_return_type); DecimalIR decimalIR(generator_->engine_.get()); if (arrow_return_type_id == arrow::Type::DECIMAL) { @@ -1370,6 +1508,10 @@ llvm::Value* LLVMGenerator::Visitor::GetBufferReference(int idx, BufferType buff case kBufferTypeOffsets: slot_ref = generator_->GetOffsetsReference(arg_addrs_, idx, field); break; + + case kBufferTypeChildOffsets: + slot_ref = generator_->GetChildOffsetsReference(arg_addrs_, idx, field); + break; } // Revert to the saved block. 
@@ -1468,5 +1610,4 @@ void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { } AddFunctionCall(print_fn_name, types()->i32_type(), args); } - } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 04f9b854b1d29..2d10871a81f98 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -102,7 +102,9 @@ class GANDIVA_EXPORT LLVMGenerator { void Visit(const VectorReadValidityDex& dex) override; void Visit(const VectorReadFixedLenValueDex& dex) override; + void Visit(const VectorReadFixedLenValueListDex& dex) override; void Visit(const VectorReadVarLenValueDex& dex) override; + void Visit(const VectorReadVarLenValueListDex& dex) override; void Visit(const LocalBitMapValidityDex& dex) override; void Visit(const TrueDex& dex) override; void Visit(const FalseDex& dex) override; @@ -127,7 +129,12 @@ class GANDIVA_EXPORT LLVMGenerator { bool has_arena_allocs() { return has_arena_allocs_; } private: - enum BufferType { kBufferTypeValidity = 0, kBufferTypeData, kBufferTypeOffsets }; + enum BufferType { + kBufferTypeValidity = 0, + kBufferTypeData, + kBufferTypeOffsets, + kBufferTypeChildOffsets + }; llvm::IRBuilder<>* ir_builder() { return generator_->ir_builder(); } llvm::Module* module() { return generator_->module(); } @@ -195,6 +202,10 @@ class GANDIVA_EXPORT LLVMGenerator { /// Generate code to load the vector at specified index and cast it as offsets array. llvm::Value* GetOffsetsReference(llvm::Value* arg_addrs, int idx, FieldPtr field); + /// Generate code to load the vector at specified index and cast it as child offsets + /// array. + llvm::Value* GetChildOffsetsReference(llvm::Value* arg_addrs, int idx, FieldPtr field); + /// Generate code to load the vector at specified index and cast it as buffer pointer. 
llvm::Value* GetDataBufferPtrReference(llvm::Value* arg_addrs, int idx, FieldPtr field); diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index 77480ab57d06a..059f1d051f8ca 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -69,6 +69,10 @@ class GANDIVA_EXPORT LLVMTypes { llvm::PointerType* i128_ptr_type() { return ptr_type(i128_type()); } + llvm::PointerType* float_ptr_type() { return ptr_type(float_type()); } + + llvm::PointerType* double_ptr_type() { return ptr_type(double_type()); } + template llvm::Constant* int_constant(ctype val) { return llvm::ConstantInt::get(context_, llvm::APInt(N, val)); @@ -108,6 +112,13 @@ class GANDIVA_EXPORT LLVMTypes { /// For a given data type, find the ir type used for the data vector slot. llvm::Type* DataVecType(const DataTypePtr& data_type) { + // support list type + // list type data is formed by base type buffer, wrapped with offsets buffer + // offsets buffer is to separate data into list + // not support nested list + if (data_type->id() == arrow::Type::LIST) { + return IRType(data_type->field(0)->type()->id()); + } return IRType(data_type->id()); } diff --git a/cpp/src/gandiva/llvm_types_test.cc b/cpp/src/gandiva/llvm_types_test.cc index 6669683061825..665a82d133fad 100644 --- a/cpp/src/gandiva/llvm_types_test.cc +++ b/cpp/src/gandiva/llvm_types_test.cc @@ -50,12 +50,22 @@ TEST_F(TestLLVMTypes, TestFound) { types_->i64_type()); EXPECT_EQ(types_->DataVecType(arrow::timestamp(arrow::TimeUnit::MILLI)), types_->i64_type()); + + EXPECT_EQ(types_->IRType(arrow::Type::STRING), types_->i8_ptr_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::boolean())), types_->i1_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::int32())), types_->i32_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::int64())), types_->i64_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::float32())), types_->float_type()); + 
EXPECT_EQ(types_->DataVecType(arrow::list(arrow::float64())), types_->double_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::utf8())), types_->i8_ptr_type()); } TEST_F(TestLLVMTypes, TestNotFound) { EXPECT_EQ(types_->IRType(arrow::Type::SPARSE_UNION), nullptr); EXPECT_EQ(types_->IRType(arrow::Type::DENSE_UNION), nullptr); EXPECT_EQ(types_->DataVecType(arrow::null()), nullptr); + // not support nested list type + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::list(arrow::utf8()))), nullptr); } } // namespace gandiva diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index df292855b69af..4d1dca8f7cf4e 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -74,4 +74,27 @@ class GANDIVA_EXPORT DecimalLValue : public LValue { llvm::Value* scale_; }; +class GANDIVA_EXPORT ListLValue : public LValue { + public: + ListLValue(llvm::Value* data, llvm::Value* child_offsets, llvm::Value* offsets_length, + llvm::Value* validity = NULLPTR) + : LValue(data, NULLPTR, validity), + child_offsets_(child_offsets), + offsets_length_(offsets_length) {} + + llvm::Value* child_offsets() { return child_offsets_; } + + llvm::Value* offsets_length() { return offsets_length_; } + + void AppendFunctionParams(std::vector* params) override { + LValue::AppendFunctionParams(params); + params->push_back(child_offsets_); + params->push_back(offsets_length_); + } + + private: + llvm::Value* child_offsets_; + llvm::Value* offsets_length_; +}; + } // namespace gandiva diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 54de03963f7e7..7dd4442081ef4 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -29,6 +29,96 @@ namespace gandiva { +class ProjectorCacheKey { + public: + ProjectorCacheKey(SchemaPtr schema, std::shared_ptr configuration, + ExpressionVector expression_vector, SelectionVector::Mode mode) + : schema_(schema), configuration_(configuration), mode_(mode), uniqifier_(0) { + static const int 
kSeedValue = 4; + size_t result = kSeedValue; + for (auto& expr : expression_vector) { + std::string expr_as_string = expr->ToString(); + expressions_as_strings_.push_back(expr_as_string); + arrow::internal::hash_combine(result, expr_as_string); + UpdateUniqifier(expr_as_string); + } + arrow::internal::hash_combine(result, static_cast(mode)); + arrow::internal::hash_combine(result, configuration->Hash()); + arrow::internal::hash_combine(result, schema_->ToString()); + arrow::internal::hash_combine(result, uniqifier_); + hash_code_ = result; + } + + std::size_t Hash() const { return hash_code_; } + + bool operator==(const ProjectorCacheKey& other) const { + // arrow schema does not overload equality operators. + if (!(schema_->Equals(*other.schema().get(), true))) { + return false; + } + + if (*configuration_ != *other.configuration_) { + return false; + } + + if (expressions_as_strings_ != other.expressions_as_strings_) { + return false; + } + + if (mode_ != other.mode_) { + return false; + } + + if (uniqifier_ != other.uniqifier_) { + return false; + } + return true; + } + + bool operator!=(const ProjectorCacheKey& other) const { return !(*this == other); } + + SchemaPtr schema() const { return schema_; } + + std::string ToString() const { + std::stringstream ss; + // indent, window, indent_size, null_rep and skip new lines. + arrow::PrettyPrintOptions options{0, 10, 2, "null", true}; + DCHECK_OK(PrettyPrint(*schema_.get(), options, &ss)); + + ss << "Expressions: ["; + bool first = true; + for (auto& expr : expressions_as_strings_) { + if (first) { + first = false; + } else { + ss << ", "; + } + + ss << expr; + } + ss << "]"; + return ss.str(); + } + + private: + void UpdateUniqifier(const std::string& expr) { + if (uniqifier_ == 0) { + // caching of expressions with re2 patterns causes lock contention. So, use + // multiple instances to reduce contention. 
+ if (expr.find(" like(") != std::string::npos) { + uniqifier_ = std::hash()(std::this_thread::get_id()) % 16; + } + } + } + + const SchemaPtr schema_; + const std::shared_ptr configuration_; + SelectionVector::Mode mode_; + std::vector expressions_as_strings_; + size_t hash_code_; + uint32_t uniqifier_; +}; + Projector::Projector(std::unique_ptr llvm_generator, SchemaPtr schema, const FieldVector& output_fields, std::shared_ptr configuration) @@ -217,6 +307,35 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, // Create and return array arrays. output->clear(); for (auto& array_data : output_data_vecs) { + if (array_data->type->id() == arrow::Type::LIST) { + auto child_data = array_data->child_data[0]; + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + /* when allocate array data, child data length is an initialized value, + * after calculating, child data offsets buffer has been resized for results, + * but array data length is unchanged. 
+ * We should recalculate child data length and make ArrayData with new length + * + * Otherwise, child data offsets buffer length is data length + 1 + * and offset data is int32_t, need use buffer->size()/4 - 1 + */ + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } + auto new_child_data = arrow::ArrayData::Make( + child_data->type, child_data_size, child_data->buffers, child_data->offset); + array_data = arrow::ArrayData::Make(array_data->type, array_data->length, + array_data->buffers, {new_child_data}, + array_data->null_count, array_data->offset); + } + output->push_back(arrow::MakeArray(array_data)); } return Status::OK(); @@ -243,6 +362,23 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, buffers.push_back(std::move(offsets_buffer)); } + if (type_id == arrow::Type::LIST) { + auto offsets_len = arrow::BitUtil::BytesForBits((num_records + 1) * 32); + + ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, arrow::AllocateBuffer(offsets_len, pool)); + buffers.push_back(std::move(offsets_buffer)); + + if (arrow::is_binary_like(type->field(0)->type()->id())) { + // child offsets length is internal data length + 1 + // offsets element is int32 + // so here i just allocate extra 32 bit for extra 1 length + ARROW_ASSIGN_OR_RAISE( + auto child_offsets_buffer, + arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(32), pool)); + buffers.push_back(std::move(child_offsets_buffer)); + } + } + // The output vector always has a data array. 
int64_t data_len; if (arrow::is_primitive(type_id) || type_id == arrow::Type::DECIMAL) { @@ -251,6 +387,8 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } else if (arrow::is_binary_like(type_id)) { // we don't know the expected size for varlen output vectors. data_len = 0; + } else if (type_id == arrow::Type::LIST) { + data_len = 0; } else { return Status::Invalid("Unsupported output data type " + type->ToString()); } @@ -263,7 +401,24 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); - *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); + if (type->id() == arrow::Type::LIST) { + auto internal_type = type->field(0)->type(); + ArrayDataPtr child_data; + if (arrow::is_primitive(internal_type->id())) { + child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, + {nullptr, std::move(buffers[2])}, 0); + } + if (arrow::is_binary_like(internal_type->id())) { + child_data = arrow::ArrayData::Make( + internal_type, 0 /*initialize length*/, + {nullptr, std::move(buffers[2]), std::move(buffers[3])}, 0); + } + *array_data = arrow::ArrayData::Make( + type, num_records, {std::move(buffers[0]), std::move(buffers[1])}, {child_data}); + + } else { + *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); + } return Status::OK(); } @@ -290,7 +445,7 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, min_bitmap_len, " actual size ", bitmap_len)); auto type_id = field.type()->id(); - if (arrow::is_binary_like(type_id)) { + if (arrow::is_binary_like(type_id) || type_id == arrow::Type::LIST) { // validate size of offsets buffer. 
int64_t min_offsets_len = arrow::bit_util::BytesForBits((num_records + 1) * 32); int64_t offsets_len = array_data.buffers[1]->capacity(); @@ -339,4 +494,5 @@ std::shared_ptr Projector::GetSecondaryCacheKey(std::string prima return arrow::Buffer::FromString(key); } + } // namespace gandiva diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index b89c0ac225209..bc607702126af 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -25,6 +25,7 @@ add_gandiva_test(binary_test) add_gandiva_test(date_time_test) add_gandiva_test(to_string_test) add_gandiva_test(utf8_test) +add_gandiva_test(list_test) add_gandiva_test(hash_test) add_gandiva_test(in_expr_test) add_gandiva_test(null_validity_test) diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc new file mode 100644 index 0000000000000..1abf22db5bc3f --- /dev/null +++ b/cpp/src/gandiva/tests/list_test.cc @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "gandiva/projector.h" +#include "gandiva/tests/test_util.h" +#include "gandiva/tree_expr_builder.h" + +namespace gandiva { + +using arrow::boolean; +using arrow::float32; +using arrow::float64; +using arrow::int32; +using arrow::int64; +using arrow::utf8; +using std::string; +using std::vector; + +class TestList : public ::testing::Test { + public: + void SetUp() { pool_ = arrow::default_memory_pool(); } + + protected: + arrow::MemoryPool* pool_; +}; + +template +void _build_list_array(const vector& values, const vector& length, + const vector& validity, arrow::MemoryPool* pool, + ArrayPtr* array) { + size_t sum = 0; + for (auto& len : length) { + sum += len; + } + EXPECT_TRUE(values.size() == sum); + EXPECT_TRUE(length.size() == validity.size()); + + auto value_builder = std::make_shared(pool); + auto builder = std::make_shared(pool, value_builder); + int i = 0; + for (size_t l = 0; l < length.size(); l++) { + if (validity[l]) { + auto status = builder->Append(); + for (int j = 0; j < length[l]; j++) { + ASSERT_OK(value_builder->Append(values[i])); + i++; + } + } else { + ASSERT_OK(builder->AppendNull()); + for (int j = 0; j < length[l]; j++) { + i++; + } + } + } + ASSERT_OK(builder->Finish(array)); +} + +/* + * expression: + * input: a + * output: res + * typeof(a) can be list / list / list + */ +void _test_list_type_field_alias(DataTypePtr type, ArrayPtr array, + arrow::MemoryPool* pool) { + auto field_a = field("a", type); + auto schema = arrow::schema({field_a}); + auto result = field("res", type); + + auto num_records = 5; + assert(array->length() == num_records); + + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array}); + + // Make expression + auto field_a_node = TreeExprBuilder::MakeField(field_a); + auto expr = TreeExprBuilder::MakeExpression(field_a_node, result); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + EXPECT_ARROW_ARRAY_EQUALS(array, outputs[0]); + // EXPECT_ARROW_ARRAY_EQUALS will not check the length of child data, but + // ArrayData::Slice method will check length. ArrayData::ToString method will call + // ArrayData::Slice method + EXPECT_TRUE(array->ToString() == outputs[0]->ToString()); + EXPECT_TRUE(array->null_count() == outputs[0]->null_count()); +} + +TEST_F(TestList, TestListUtf8) { + ArrayPtr array; + _build_list_array( + {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", + "eeee", "eeeee"}, + {1, 4, 3, 2, 5}, {true, true, false, true, true}, pool_, &array); + _test_list_type_field_alias(list(utf8()), array, pool_); +} + +TEST_F(TestList, TestListUtf8WithInvalidData) { + ArrayPtr array; + _build_list_array( + {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", + "eeee", "eeeee"}, + {1, 2, 3, 4, 5}, {true, false, true, true, false}, pool_, &array); + _test_list_type_field_alias(list(utf8()), array, pool_); +} + +TEST_F(TestList, TestListInt64) { + ArrayPtr array; + _build_list_array( + {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, + 50000}, + {1, 2, 5, 4, 3}, {true, true, true, true, false}, pool_, &array); + _test_list_type_field_alias(list(int64()), array, pool_); +} + +TEST_F(TestList, TestListInt32) { + ArrayPtr array; + _build_list_array( + {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, + 50000}, + {5, 2, 3, 4, 1}, {true, false, true, true, true}, pool_, &array); + _test_list_type_field_alias(list(int32()), array, pool_); +} + +TEST_F(TestList, TestListFloat32) { + ArrayPtr array; + _build_list_array( + 
{1.1f, 11.1f, 22.2f, 111.1f, 222.2f, 333.3f, 1111.1f, 2222.2f, 3333.3f, 4444.4f, + 11111.1f, 22222.2f, 33333.3f, 44444.4f, 55555.5f}, + {1, 2, 3, 4, 5}, {true, true, true, true, true}, pool_, &array); + _test_list_type_field_alias(list(float32()), array, pool_); +} + +TEST_F(TestList, TestListFloat64) { + ArrayPtr array; + _build_list_array( + {1.1, 1.11, 2.22, 1.111, 2.222, 3.333, 1.1111, 2.2222, 3.3333, 4.4444, 1.11111, + 2.22222, 3.33333, 4.44444, 5.55555}, + {1, 2, 4, 3, 5}, {true, false, true, true, true}, pool_, &array); + _test_list_type_field_alias(list(float64()), array, pool_); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/tests/projector_build_validation_test.cc b/cpp/src/gandiva/tests/projector_build_validation_test.cc index 5b86844f940bf..82b59ef19ad75 100644 --- a/cpp/src/gandiva/tests/projector_build_validation_test.cc +++ b/cpp/src/gandiva/tests/projector_build_validation_test.cc @@ -26,6 +26,7 @@ namespace gandiva { using arrow::boolean; using arrow::float32; using arrow::int32; +using arrow::utf8; class TestProjector : public ::testing::Test { public: @@ -80,7 +81,7 @@ TEST_F(TestProjector, TestNotMatchingDataType) { TEST_F(TestProjector, TestNotSupportedDataType) { // schema for input fields - auto field0 = field("f0", list(int32())); + auto field0 = field("f0", map(utf8(), int32())); auto schema = arrow::schema({field0}); // output fields @@ -94,7 +95,7 @@ TEST_F(TestProjector, TestNotSupportedDataType) { std::shared_ptr projector; auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Field f0 has unsupported data type list"; + std::string expected_error = "Field f0 has unsupported data type map"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } From 41c5083369b6a7830347bf72e84eb3e6a6acf6d6 Mon Sep 17 00:00:00 2001 From: Jiangtao Peng Date: Sat, 2 Jan 2021 15:23:15 +0800 Subject: [PATCH 06/46] add 
two list type input gandiva function --- cpp/src/gandiva/CMakeLists.txt | 15 +++ cpp/src/gandiva/array_ops.cc | 76 ++++++++++++ cpp/src/gandiva/array_ops.h | 33 ++++++ cpp/src/gandiva/array_ops_test.cc | 52 +++++++++ cpp/src/gandiva/exported_funcs.h | 6 + cpp/src/gandiva/function_registry.cc | 14 ++- cpp/src/gandiva/function_registry_array.cc | 35 ++++++ cpp/src/gandiva/function_registry_array.h | 28 +++++ cpp/src/gandiva/function_registry_test.cc | 10 ++ cpp/src/gandiva/gdv_function_stubs.cc | 58 ++++----- cpp/src/gandiva/precompiled/types.h | 2 + cpp/src/gandiva/tests/list_test.cc | 129 +++++++++++++++++++++ 12 files changed, 427 insertions(+), 31 deletions(-) create mode 100644 cpp/src/gandiva/array_ops.cc create mode 100644 cpp/src/gandiva/array_ops.h create mode 100644 cpp/src/gandiva/array_ops_test.cc create mode 100644 cpp/src/gandiva/function_registry_array.cc create mode 100644 cpp/src/gandiva/function_registry_array.h diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 6a92224e9113d..174ad91041724 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -61,9 +61,11 @@ set(SRC_FILES expression_registry.cc exported_funcs_registry.cc filter.cc + array_ops.cc function_ir_builder.cc function_registry.cc function_registry_arithmetic.cc + function_registry_array.cc function_registry_datetime.cc function_registry_hash.cc function_registry_math_ops.cc @@ -249,7 +251,20 @@ add_gandiva_test(internals-test random_generator_holder_test.cc hash_utils_test.cc gdv_function_stubs_test.cc +<<<<<<< HEAD interval_holder_test.cc) +======= + array_ops_test.cc + EXTRA_DEPENDENCIES + LLVM::LLVM_INTERFACE + EXTRA_INCLUDES + $ + ${GANDIVA_INTERNALS_TEST_ARGUMENTS}) + +if(ARROW_GANDIVA_JAVA) + add_subdirectory(jni) +endif() +>>>>>>> cdcde4e08 (add two list type input gandiva function) add_subdirectory(precompiled) add_subdirectory(tests) diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc new file mode 
100644 index 0000000000000..6763839fd947c --- /dev/null +++ b/cpp/src/gandiva/array_ops.cc @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/array_ops.h" + +#include "arrow/util/value_parsing.h" +#include "gandiva/engine.h" +#include "gandiva/exported_funcs.h" + +/// Stub functions that can be accessed from LLVM or the pre-compiled library. 
+ +extern "C" { + +bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, + int32_t* entry_child_offsets, int32_t entry_offsets_len, + const char* contains_data, int32_t contains_data_length) { + for (int i = 0; i < entry_offsets_len; i++) { + int32_t entry_len = *(entry_child_offsets + i + 1) - *(entry_child_offsets + i); + if (entry_len != contains_data_length) { + entry_buf = entry_buf + entry_len; + continue; + } + if (strncmp(entry_buf, contains_data, contains_data_length) == 0) { + return true; + } + entry_buf = entry_buf + entry_len; + } + return false; +} + +int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, + int32_t* entry_child_offsets, int32_t entry_offsets_len) { + int64_t res = entry_offsets_len; + return res; +} +} + +namespace gandiva { +void ExportedArrayFunctions::AddMappings(Engine* engine) const { + std::vector args; + auto types = engine->types(); + + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* child offsets ptr + types->i32_type()}; // int32_t child offsets length + + engine->AddGlobalMappingForFunc("array_utf8_length", types->i64_type() /*return_type*/, + args, reinterpret_cast(array_utf8_length)); + + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* child offsets ptr + types->i32_type(), // int32_t child offsets length + types->i8_ptr_type(), // const char* contains data buf + types->i32_type()}; // int32_t contains data length + + engine->AddGlobalMappingForFunc("array_utf8_contains_utf8", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_utf8_contains_utf8)); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h new file mode 100644 index 0000000000000..19bf35e9f4f02 --- /dev/null +++ b/cpp/src/gandiva/array_ops.h @@ -0,0 +1,33 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "gandiva/visibility.h" + +/// Array functions that can be accessed from LLVM. +extern "C" { +GANDIVA_EXPORT +bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, + int32_t* entry_child_offsets, int32_t entry_offsets_len, + const char* contains_data, int32_t contains_data_length); +GANDIVA_EXPORT +int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, + int32_t* entry_child_offsets, int32_t entry_offsets_len); +} diff --git a/cpp/src/gandiva/array_ops_test.cc b/cpp/src/gandiva/array_ops_test.cc new file mode 100644 index 0000000000000..f2209c8e63408 --- /dev/null +++ b/cpp/src/gandiva/array_ops_test.cc @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "gandiva/execution_context.h" +#include "gandiva/precompiled/types.h" + +namespace gandiva { + +TEST(TestArrayOps, TestUtf8ContainsUtf8) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + const char* entry_buf = "trianglecirclerectangle"; + int32_t entry_child_offsets[] = {0, 8, 14, 24}; + int32_t entry_offsets_len = 3; + const char* contains_data = "triangle"; + int32_t contains_data_length = 8; + + EXPECT_EQ( + array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets, entry_offsets_len, + contains_data, contains_data_length), + true); +} + +TEST(TestArrayOps, TestUtf8Length) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + const char* entry_buf = "trianglecirclerectangle"; + int32_t entry_child_offsets[] = {0, 8, 14, 24}; + int32_t entry_offsets_len = 3; + + EXPECT_EQ(array_utf8_length(ctx_ptr, entry_buf, entry_child_offsets, entry_offsets_len), + 3); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/exported_funcs.h b/cpp/src/gandiva/exported_funcs.h index 5a14c52162156..55145b301e78c 100644 --- a/cpp/src/gandiva/exported_funcs.h +++ b/cpp/src/gandiva/exported_funcs.h @@ -32,6 +32,12 @@ class ExportedFuncsBase { virtual void AddMappings(Engine* engine) const = 0; }; +// Class for exporting Array functions +class ExportedArrayFunctions : public ExportedFuncsBase { + void AddMappings(Engine* engine) const override; +}; +REGISTER_EXPORTED_FUNCS(ExportedArrayFunctions); + // Class for exporting Stub functions class ExportedStubFunctions 
: public ExportedFuncsBase { void AddMappings(Engine* engine) const override; diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 67b7b404b325c..9180e8c33ca33 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -16,17 +16,19 @@ // under the License. #include "gandiva/function_registry.h" + +#include +#include +#include + #include "gandiva/function_registry_arithmetic.h" +#include "gandiva/function_registry_array.h" #include "gandiva/function_registry_datetime.h" #include "gandiva/function_registry_hash.h" #include "gandiva/function_registry_math_ops.h" #include "gandiva/function_registry_string.h" #include "gandiva/function_registry_timestamp_arithmetic.h" -#include -#include -#include - namespace gandiva { FunctionRegistry::iterator FunctionRegistry::begin() const { @@ -64,6 +66,10 @@ SignatureMap FunctionRegistry::InitPCMap() { auto v6 = GetDateTimeArithmeticFunctionRegistry(); pc_registry_.insert(std::end(pc_registry_), v6.begin(), v6.end()); + + auto v7 = GetArrayFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v7.begin(), v7.end()); + for (auto& elem : pc_registry_) { for (auto& func_signature : elem.signatures()) { map.insert(std::make_pair(&(func_signature), &elem)); diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc new file mode 100644 index 0000000000000..057115caf04e0 --- /dev/null +++ b/cpp/src/gandiva/function_registry_array.cc @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_array.h" + +#include "gandiva/function_registry_common.h" + +namespace gandiva { +std::vector GetArrayFunctionRegistry() { + static std::vector array_fn_registry_ = { + NativeFunction("array_contains", {}, DataTypeVector{list(utf8()), utf8()}, + boolean(), kResultNullIfNull, "array_utf8_contains_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("array_length", {}, DataTypeVector{list(utf8())}, int64(), + kResultNullIfNull, "array_utf8_length", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + }; + return array_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_array.h b/cpp/src/gandiva/function_registry_array.h new file mode 100644 index 0000000000000..9b8e4553702a8 --- /dev/null +++ b/cpp/src/gandiva/function_registry_array.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetArrayFunctionRegistry(); + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_test.cc b/cpp/src/gandiva/function_registry_test.cc index e3c1e85f79cba..f33388e0aefdf 100644 --- a/cpp/src/gandiva/function_registry_test.cc +++ b/cpp/src/gandiva/function_registry_test.cc @@ -93,4 +93,14 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { "different precompiled functions:\n" << stream.str(); } + +TEST_F(TestFunctionRegistry, TestFound2) { + FunctionSignature array_length("array_length", {list(utf8())}, arrow::int64()); + + const NativeFunction* function = registry_.LookupSignature(array_length); + EXPECT_NE(function, nullptr); + EXPECT_THAT(function->signatures(), testing::Contains(array_length)); + EXPECT_EQ(function->pc_name(), "array_utf8_length"); +} + } // namespace gandiva diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index a7c33ee6831a9..94079587d6ad2 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -36,33 +36,6 @@ #include "gandiva/random_generator_holder.h" #include "gandiva/to_date_holder.h" -/// Stub functions that can be accessed from LLVM or the pre-compiled library. 
-#define POPULATE_NUMERIC_LIST_TYPE_VECTOR(TYPE, SCALE) \ - int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ - int32_t* offsets, int64_t slot, \ - TYPE* entry_buf, int32_t entry_len) { \ - auto buffer = reinterpret_cast(data_ptr); \ - int32_t offset = static_cast(buffer->size()); \ - auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ - if (!status.ok()) { \ - gandiva::ExecutionContext* context = \ - reinterpret_cast(context_ptr); \ - context->set_error_msg(status.message().c_str()); \ - return -1; \ - } \ - memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ - offsets[slot] = offset / SCALE; \ - offsets[slot + 1] = offset / SCALE + entry_len; \ - return 0; \ - } - -#define ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(LLVM_TYPE, DATA_TYPE) \ - args = {types->i64_type(), types->i8_ptr_type(), types->i32_ptr_type(), \ - types->i64_type(), types->LLVM_TYPE##_ptr_type(), types->i32_type()}; \ - engine->AddGlobalMappingForFunc( \ - "gdv_fn_populate_list_" #DATA_TYPE "_vector", types->i32_type() /*return_type*/, \ - args, reinterpret_cast(gdv_fn_populate_list_##DATA_TYPE##_vector)); - extern "C" { static char mask_array[256] = { @@ -186,6 +159,26 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, return 0; } +/// Stub functions that can be accessed from LLVM or the pre-compiled library. 
+#define POPULATE_NUMERIC_LIST_TYPE_VECTOR(TYPE, SCALE) \ + int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ + int32_t* offsets, int64_t slot, \ + TYPE* entry_buf, int32_t entry_len) { \ + auto buffer = reinterpret_cast(data_ptr); \ + int32_t offset = static_cast(buffer->size()); \ + auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ + if (!status.ok()) { \ + gandiva::ExecutionContext* context = \ + reinterpret_cast(context_ptr); \ + context->set_error_msg(status.message().c_str()); \ + return -1; \ + } \ + memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ + offsets[slot] = offset / SCALE; \ + offsets[slot + 1] = offset / SCALE + entry_len; \ + return 0; \ + } + POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) POPULATE_NUMERIC_LIST_TYPE_VECTOR(int64_t, 8) POPULATE_NUMERIC_LIST_TYPE_VECTOR(float, 4) @@ -928,6 +921,8 @@ const char* gdv_mask_show_last_n_utf8_int32(int64_t context, const char* data, int32_t n_to_mask = num_of_chars - n_to_show; return gdv_mask_first_n_utf8_int32(context, data, data_len, n_to_mask, out_len); } + +#undef POPULATE_NUMERIC_LIST_TYPE_VECTOR } namespace gandiva { @@ -1268,6 +1263,13 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(gdv_fn_in_expr_lookup_utf8)); +#define ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(LLVM_TYPE, DATA_TYPE) \ + args = {types->i64_type(), types->i8_ptr_type(), types->i32_ptr_type(), \ + types->i64_type(), types->LLVM_TYPE##_ptr_type(), types->i32_type()}; \ + engine->AddGlobalMappingForFunc( \ + "gdv_fn_populate_list_" #DATA_TYPE "_vector", types->i32_type() /*return_type*/, \ + args, reinterpret_cast(gdv_fn_populate_list_##DATA_TYPE##_vector)); + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i32, int32_t) ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i64, int64_t) ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(float, float) @@ 
-1415,4 +1417,6 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("mask_utf8", types->i8_ptr_type() /*return_type*/, args, reinterpret_cast(mask_utf8)); } + +#undef ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 1867f5b785eed..15fba4867e650 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -19,6 +19,8 @@ #include + +#include "gandiva/array_ops.h" #include "gandiva/gdv_function_stubs.h" // Use the same names as in arrow data types. Makes it easy to write pre-processor macros. diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index 1abf22db5bc3f..d8f44a5c235bb 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -168,4 +168,133 @@ TEST_F(TestList, TestListFloat64) { _test_list_type_field_alias(list(float64()), array, pool_); } +/* + * array_length(a) + */ +TEST_F(TestList, TestListUtf8Length) { + // schema for input fields + auto field_a = field("a", list(utf8())); + auto schema = arrow::schema({field_a}); + + // output fields + auto res = field("res", int64()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", + "eeee", "eeeee"}, + {1, 2, 3, 4, 5}, {true, true, true, true, true}, pool_, &array_a); + + // expected output + auto exp = MakeArrowArrayInt64({1, 2, 3, 4, 5}, {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // build expressions. + // array_length(a) + auto expr = TreeExprBuilder::MakeExpression("array_length", {field_a}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +TEST_F(TestList, TestListUtf8LengthWithInvalidData) { + // schema for input fields + auto field_a = field("a", list(utf8())); + auto schema = arrow::schema({field_a}); + + // output fields + auto res = field("res", int64()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {"a", "b", "bb", "cc", "cc", "ccc", "d", "dd", "ddd"}, {1, 2, 2, 3, 1}, + {true, false, true, false, true}, pool_, &array_a); + + // expected output + auto exp = MakeArrowArrayInt64({1, 2, 2, 3, 1}, {true, false, true, false, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // build expressions. + // array_length(a) + auto expr = TreeExprBuilder::MakeExpression("array_length", {field_a}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +/* + * array_contains(a, "element") + */ +TEST_F(TestList, TestListUtf8Contains) { + // schema for input fields + auto field_a = field("a", list(utf8())); + auto field_b = field("b", utf8()); + auto schema = arrow::schema({field_a, field_b}); + + // output fields + auto res = field("res", boolean()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {"rectangle", "circle", "rectangle", "circle", "triangle", "triangle", "circle", + "rectangle"}, + {2, 3, 1, 1, 1}, {true, true, true, true, true}, pool_, &array_a); + auto array_b = + MakeArrowArrayUtf8({"rectangle", "circle", "circle", "circle", "rectangll"}); + + // expected output + auto exp = MakeArrowArrayBool({true, true, false, true, false}, + {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); + + // build expressions. + // array_contains(a, b) + auto expr = TreeExprBuilder::MakeExpression("array_contains", {field_a, field_b}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + } // namespace gandiva From aa0a15ccfebdd485c4b9a3575e684a0e9f638dce Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 3 Aug 2023 14:40:37 -0700 Subject: [PATCH 07/46] Fix leftover merge --- cpp/src/gandiva/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 174ad91041724..2fb34cbe53c92 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -251,9 +251,7 @@ add_gandiva_test(internals-test random_generator_holder_test.cc hash_utils_test.cc gdv_function_stubs_test.cc -<<<<<<< HEAD interval_holder_test.cc) -======= array_ops_test.cc EXTRA_DEPENDENCIES LLVM::LLVM_INTERFACE @@ -264,7 +262,6 @@ add_gandiva_test(internals-test if(ARROW_GANDIVA_JAVA) add_subdirectory(jni) endif() ->>>>>>> cdcde4e08 (add two list type input gandiva function) add_subdirectory(precompiled) add_subdirectory(tests) From 0fdb09b8bdb19e3f78be580a7d64a0cef385964a Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 3 Aug 2023 14:44:28 -0700 Subject: [PATCH 08/46] Fix merge --- cpp/src/gandiva/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 2fb34cbe53c92..ca377b6f45b60 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -251,8 +251,8 @@ add_gandiva_test(internals-test random_generator_holder_test.cc hash_utils_test.cc gdv_function_stubs_test.cc - interval_holder_test.cc) - array_ops_test.cc + interval_holder_test.cc + array_ops_test.cc) 
EXTRA_DEPENDENCIES LLVM::LLVM_INTERFACE EXTRA_INCLUDES From 1976d7c064d42024b3df1171ca33d4f5d5666850 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 10 Aug 2023 14:55:07 -0700 Subject: [PATCH 09/46] Somewhat working for int32 --- cpp/src/gandiva/CMakeLists.txt | 9 -- cpp/src/gandiva/annotator.cc | 34 +++- cpp/src/gandiva/array_ops.cc | 26 +++ cpp/src/gandiva/array_ops.h | 4 + cpp/src/gandiva/array_ops_test.cc | 13 ++ cpp/src/gandiva/decimal_ir.h | 2 +- cpp/src/gandiva/expr_decomposer.cc | 7 + cpp/src/gandiva/expression_registry.cc | 4 + cpp/src/gandiva/function_registry.cc | 4 + cpp/src/gandiva/function_registry_array.cc | 7 +- cpp/src/gandiva/function_registry_test.cc | 2 +- cpp/src/gandiva/gdv_function_stubs.cc | 3 + cpp/src/gandiva/llvm_generator.cc | 148 +++++++++++++++--- cpp/src/gandiva/projector.cc | 14 +- cpp/src/gandiva/tests/list_test.cc | 127 ++++++++++++++- .../main/cpp/expression_registry_helper.cc | 7 + java/gandiva/src/main/cpp/jni_common.cc | 83 +++++++++- .../gandiva/evaluator/ExpressionRegistry.java | 3 +- .../gandiva/expression/ArrowTypeHelper.java | 6 + java/pom.xml | 10 -- 20 files changed, 453 insertions(+), 60 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index ca377b6f45b60..dc0c427f48d23 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -253,15 +253,6 @@ add_gandiva_test(internals-test gdv_function_stubs_test.cc interval_holder_test.cc array_ops_test.cc) - EXTRA_DEPENDENCIES - LLVM::LLVM_INTERFACE - EXTRA_INCLUDES - $ - ${GANDIVA_INTERNALS_TEST_ARGUMENTS}) - -if(ARROW_GANDIVA_JAVA) - add_subdirectory(jni) -endif() add_subdirectory(precompiled) add_subdirectory(tests) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index 540420563ae1d..fad439823db93 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -17,6 +17,7 @@ #include "gandiva/annotator.h" +#include #include #include @@ -52,6 +53,7 @@ 
FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { } if (field->type()->id() == arrow::Type::LIST) { + std::cout << "LR Annotator::MakeDesc 1" << std::endl; offsets_idx = buffer_count_++; if (arrow::is_binary_like(field->type()->field(0)->type()->id())) { child_offsets_idx = buffer_count_++; @@ -90,6 +92,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset); if (desc.HasChildOffsetsIdx()) { + std::cout << "LR Annotator::PrepareBuffersForField 1 for field " << desc.Name() << " type is " << array_data.type->id() << std::endl; if (is_output) { // if list field is output field, we should put buffer pointer into eval batch // for resizing @@ -98,6 +101,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); } else { + std::cout << "LR Annotator::PrepareBuffersForField 2" << std::endl; // if list field is input field, just put buffer data into eval batch uint8_t* child_offsets_buf = const_cast( array_data.child_data.at(0)->buffers[buffer_idx]->data()); @@ -106,19 +110,31 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, } } if (array_data.type->id() != arrow::Type::LIST || - arrow::is_binary_like(array_data.type->field(0)->type()->id())) - // primitive type list data buffer index is 1 - // binary like type list data buffer index is 2 - ++buffer_idx; + arrow::is_binary_like(array_data.type->field(0)->type()->id())) { + std::cout << "LR Annotator::PrepareBuffersForField 3" << std::endl; + + // primitive type list data buffer index is 1 + // binary like type list data buffer index is 2 + ++buffer_idx; + } } if (array_data.type->id() != arrow::Type::LIST) { + std::cout << "LR Annotator::PrepareBuffersForField 4" << std::endl; + + std::cout << "LR Annotator::PrepareBuffersForField 4 buffer_idx " << buffer_idx << 
std::endl; uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField 4a" << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); + std::cout << "LR Annotator::PrepareBuffersForField 4b" << std::endl; } else { + std::cout << "LR Annotator::PrepareBuffersForField 5 buffer_idx " << buffer_idx << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; + uint8_t* data_buf = const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); + std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; } if (is_output) { @@ -142,6 +158,7 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, EvalBatchPtr eval_batch = std::make_shared( record_batch.num_rows(), buffer_count_, local_bitmap_count_); + std::cout << "LR PrepareEvalBatch 1" << std::endl; // Fill in the entries for the input fields. 
for (int i = 0; i < record_batch.num_columns(); ++i) { const std::string& name = record_batch.column_name(i); @@ -151,6 +168,14 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, continue; } + std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch schema " << record_batch.schema()->ToString() + << " num rows " << record_batch.num_rows() + << " num columns " << record_batch.num_columns() + << " data size " << record_batch.column_data().size() + << " col 1 " << record_batch.column(0)->ToString() + << std::endl; + + std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch data " << record_batch.ToString() << std::endl; PrepareBuffersForField(*(found->second), *(record_batch.column_data(i)), eval_batch.get(), false /*is_output*/); } @@ -162,6 +187,7 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, PrepareBuffersForField(*desc, *arraydata, eval_batch.get(), true /*is_output*/); ++idx; } + std::cout << "LR PrepareEvalBatch 2" << std::endl; return eval_batch; } diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 6763839fd947c..0a2546567b914 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -17,6 +17,8 @@ #include "gandiva/array_ops.h" +#include + #include "arrow/util/value_parsing.h" #include "gandiva/engine.h" #include "gandiva/exported_funcs.h" @@ -42,6 +44,21 @@ bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, return false; } +bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, + int32_t contains_data) { + std::cout << "LR array_int32_contains_int32 offset length=" << entry_offsets_len << std::endl; + for (int i = 0; i < entry_offsets_len; i++) { + std::cout << "LR going to check " << entry_buf + i << std::endl; + int32_t entry_len = *(entry_buf + i); + std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; 
+ if (entry_len == contains_data) { + return true; + } + } + return false; +} + int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len) { int64_t res = entry_offsets_len; @@ -72,5 +89,14 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("array_utf8_contains_utf8", types->i1_type() /*return_type*/, args, reinterpret_cast(array_utf8_contains_utf8)); + + args = {types->i64_type(), // int64_t execution_context + types->i32_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t child offsets length + types->i32_type()}; // int32_t contains data length + + engine->AddGlobalMappingForFunc("array_int32_contains_int32", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_int32_contains_int32)); } } // namespace gandiva diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index 19bf35e9f4f02..7a32e303b3b08 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -30,4 +30,8 @@ bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, GANDIVA_EXPORT int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len); +GANDIVA_EXPORT +bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, + int32_t contains_data); } diff --git a/cpp/src/gandiva/array_ops_test.cc b/cpp/src/gandiva/array_ops_test.cc index f2209c8e63408..12dd6f9c56d30 100644 --- a/cpp/src/gandiva/array_ops_test.cc +++ b/cpp/src/gandiva/array_ops_test.cc @@ -23,6 +23,19 @@ namespace gandiva { +TEST(TestArrayOps, TestInt32ContainsInt32) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + int32_t data[] = {1, 2, 3, 4}; + int32_t entry_offsets_len = 3; + int32_t contains_data = 2; + + EXPECT_EQ( + array_int32_contains_int32(ctx_ptr, data, entry_offsets_len, + contains_data), + 
true); +} + TEST(TestArrayOps, TestUtf8ContainsUtf8) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); diff --git a/cpp/src/gandiva/decimal_ir.h b/cpp/src/gandiva/decimal_ir.h index b11730f1e231e..1a7cad7107036 100644 --- a/cpp/src/gandiva/decimal_ir.h +++ b/cpp/src/gandiva/decimal_ir.h @@ -29,7 +29,7 @@ namespace gandiva { class DecimalIR : public FunctionIRBuilder { public: explicit DecimalIR(Engine* engine) - : FunctionIRBuilder(engine), enable_ir_traces_(false) {} + : FunctionIRBuilder(engine), enable_ir_traces_(true) {} /// Build decimal IR functions and add them to the engine. static Status AddFunctions(Engine* engine); diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 8ff48e5b5957f..41cb64b3eba87 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -17,6 +17,7 @@ #include "gandiva/expr_decomposer.h" +#include #include #include #include @@ -37,19 +38,25 @@ namespace gandiva { Status ExprDecomposer::Visit(const FieldNode& node) { auto desc = annotator_.CheckAndAddInputFieldDescriptor(node.field()); + std::cout << "LR ExprDecomposer" << std::endl; DexPtr validity_dex = std::make_shared(desc); DexPtr value_dex; if (desc->HasChildOffsetsIdx()) { + std::cout << "LR ExprDecomposer 1" << std::endl; // handle list type value_dex = std::make_shared(desc); } else if (desc->HasOffsetsIdx()) { + std::cout << "LR ExprDecomposer 2" << std::endl; if (desc->field()->type()->id() == arrow::Type::LIST) { // handle list type + std::cout << "LR ExprDecomposer 3" << std::endl; value_dex = std::make_shared(desc); } else { + std::cout << "LR ExprDecomposer 4" << std::endl; value_dex = std::make_shared(desc); } } else { + std::cout << "LR ExprDecomposer 5" << std::endl; value_dex = std::make_shared(desc); } result_ = std::make_shared(validity_dex, value_dex); diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc index 
b5b79af7f818a..20be12548e0f9 100644 --- a/cpp/src/gandiva/expression_registry.cc +++ b/cpp/src/gandiva/expression_registry.cc @@ -169,6 +169,10 @@ static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector case arrow::Type::type::STRUCT: vector.push_back(arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)})); break; + case arrow::Type::type::LIST: + //vector.push_back(arrow::list(arrow::utf8())); + vector.push_back(arrow::list(arrow::int32())); + break; default: // Unsupported types. test ensures that // when one of these are added build breaks. diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 9180e8c33ca33..a2ae2426b9235 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -17,6 +17,7 @@ #include "gandiva/function_registry.h" +#include #include #include #include @@ -71,7 +72,10 @@ SignatureMap FunctionRegistry::InitPCMap() { pc_registry_.insert(std::end(pc_registry_), v7.begin(), v7.end()); for (auto& elem : pc_registry_) { + std::cout << "LR pc_registry_ item " << elem.pc_name() << " first signature name " << elem.signatures()[0].base_name() << std::endl; for (auto& func_signature : elem.signatures()) { + std::cout << "LR Adding function to map " << func_signature.base_name() << std::endl; + //std::cout << " LR args " << func_signature.param_types map.insert(std::make_pair(&(func_signature), &elem)); } } diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index 057115caf04e0..1f37b9d612b3a 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -22,12 +22,15 @@ namespace gandiva { std::vector GetArrayFunctionRegistry() { static std::vector array_fn_registry_ = { - NativeFunction("array_contains", {}, DataTypeVector{list(utf8()), utf8()}, + NativeFunction("array_containsGandiva", {}, 
DataTypeVector{list(utf8()), utf8()}, boolean(), kResultNullIfNull, "array_utf8_contains_utf8", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), - NativeFunction("array_length", {}, DataTypeVector{list(utf8())}, int64(), + NativeFunction("array_lengthGandiva", {}, DataTypeVector{list(utf8())}, int64(), kResultNullIfNull, "array_utf8_length", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int32()), int32()}, + boolean(), kResultNullIfNull, "array_int32_contains_int32", + NativeFunction::kNeedsContext), }; return array_fn_registry_; } diff --git a/cpp/src/gandiva/function_registry_test.cc b/cpp/src/gandiva/function_registry_test.cc index f33388e0aefdf..63ede751b44e3 100644 --- a/cpp/src/gandiva/function_registry_test.cc +++ b/cpp/src/gandiva/function_registry_test.cc @@ -95,7 +95,7 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { } TEST_F(TestFunctionRegistry, TestFound2) { - FunctionSignature array_length("array_length", {list(utf8())}, arrow::int64()); + FunctionSignature array_length("array_lengthGandiva", {list(utf8())}, arrow::int64()); const NativeFunction* function = registry_.LookupSignature(array_length); EXPECT_NE(function, nullptr); diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 94079587d6ad2..793d4f68feb74 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -1284,6 +1284,9 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i32_type()}; // int32_t entry__len engine->AddGlobalMappingForFunc("gdv_fn_populate_varlen_vector", + types->i32_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_populate_varlen_vector)); + // gdv_fn_cast_intervalyear_utf8_int32 args = { types->i64_type(), // context diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index fc973acb6fc97..8bc24666166f4 100644 --- 
a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -36,7 +36,28 @@ namespace gandiva { AddTrace(__VA_ARGS__); \ } -LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(false) {} +namespace { + std::string printType(llvm::Type* t) { + if (t == nullptr) { + return std::string("null"); + } + std::string str; + llvm::raw_string_ostream output(str); + t->print(output); + return str; + } + std::string printType(llvm::Value* t) { + if (t == nullptr) { + return std::string("null"); + } + std::string str; + llvm::raw_string_ostream output(str); + t->print(output); + return str; + } +} + +LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(true) {} Status LLVMGenerator::Make(std::shared_ptr config, bool cached, std::unique_ptr* llvm_generator) { @@ -123,10 +144,13 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, const SelectionVector* selection_vector, const ArrayDataVector& output_vector) const { DCHECK_GT(record_batch.num_rows(), 0); + int jello = 0; + std::cout << "LR LLVMGenerator::Execute " << jello++ << std::endl; auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector); DCHECK_GT(eval_batch->GetNumBuffers(), 0); + std::cout << "LR LLVMGenerator::Execute " << jello++ << std::endl; auto mode = SelectionVector::MODE_NONE; if (selection_vector != nullptr) { mode = selection_vector->GetMode(); @@ -136,6 +160,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, selection_vector_mode_, " received vector with mode ", mode); } + std::cout << "LR LLVMGenerator::Execute " << jello++ << std::endl; for (auto& compiled_expr : compiled_exprs_) { // generate data/offset vectors. 
const uint8_t* selection_buffer = nullptr; @@ -145,6 +170,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, num_output_rows = selection_vector->GetNumSlots(); } + std::cout << "LR LLVMGenerator::Execute A" << jello++ << std::endl; EvalFunc jit_function = compiled_expr->GetJITFunction(mode); jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(), eval_batch->GetLocalBitMapArray(), annotator_.GetHolderPointersArray(), @@ -156,6 +182,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, eval_batch->GetExecutionContext()->has_error(), Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); + std::cout << "LR LLVMGenerator::Execute A" << jello++ << std::endl; // generate validity vectors. ComputeBitMapsForExpr(*compiled_expr, selection_vector, eval_batch.get()); } @@ -214,7 +241,7 @@ llvm::Value* LLVMGenerator::GetOffsetsReference(llvm::Value* arg_addrs, int idx, llvm::Value* LLVMGenerator::GetChildOffsetsReference(llvm::Value* arg_addrs, int idx, FieldPtr field) { const std::string& name = field->name(); - llvm::Value* load = LoadVectorAtIndex(arg_addrs, idx, name); + llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); return ir_builder()->CreateIntToPtr(load, types()->i32_ptr_type(), name + "_coarray"); } @@ -278,6 +305,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, FieldDescriptorPtr output, int suffix_idx, std::string& fn_name, SelectionVector::Mode selection_vector_mode) { + std::cout << "LR CodeGenExprValue" << std::endl; + try { llvm::IRBuilder<>* builder = ir_builder(); // Create fn prototype : // int expr_1 (long **addrs, long *offsets, long **bitmaps, @@ -411,6 +440,7 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, builder->CreateStore(output_value->data(), slot_offset); } else if (output_type_id == arrow::Type::LIST) { auto output_list_internal_type = 
output->Type()->field(0)->type()->id(); + std::cout << "LR creating list type to store the result with internal type " << output_list_internal_type << std::endl; if (arrow::is_binary_like(output_list_internal_type)) { auto output_list_value = std::dynamic_pointer_cast(output_value); llvm::Value* child_output_offset_ref = GetChildOffsetsReference( @@ -445,8 +475,13 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, return Status::NotImplemented("output type ", output->Type()->ToString(), " not supported"); } - ADD_TRACE("saving result " + output->Name() + " value %T", output_value->data()); + //LR HACK somehow this caused a crash???? + //std::cout << "LR saving result " << output->Name() << " value " << + // printType(output_value->data()) << std::endl; + //ADD_TRACE("saving result 2 " + output->Name() + " value %T", output_value->data()); + int jello = 0; + std::cout << "LR CodeGenExprValue " << jello++ << std::endl; if (visitor.has_arena_allocs()) { // Reset allocations to avoid excessive memory usage. 
Once the result is copied to // the output vector (store instruction above), any memory allocations in this @@ -456,20 +491,28 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, AddFunctionCall("gdv_fn_context_arena_reset", types()->void_type(), reset_args); } + std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // check loop_var loop_var->addIncoming(types()->i64_constant(0), loop_entry); llvm::Value* loop_update = builder->CreateAdd(loop_var, types()->i64_constant(1), "loop_var+1"); loop_var->addIncoming(loop_update, loop_body_tail); + std::cout << "LR CodeGenExprValue " << jello++ << std::endl; llvm::Value* loop_var_check = builder->CreateICmpSLT(loop_update, arg_nrecords, "loop_var < nrec"); builder->CreateCondBr(loop_var_check, loop_body, loop_exit); + std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // Loop exit builder->SetInsertPoint(loop_exit); builder->CreateRet(types()->i32_constant(0)); + std::cout << "LR CodeGenExprValue " << jello++ << std::endl; return Status::OK(); + } catch (std::exception& e) { + std::cout << e.what() << std::endl; + throw e; + } } /// Return value of a bit in bitMap. 
@@ -626,6 +669,7 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi } void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { + ADD_VISITOR_TRACE("VectorReadFixedLenValueDex"); llvm::IRBuilder<>* builder = ir_builder(); auto types = generator_->types(); llvm::Value* slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); @@ -633,6 +677,7 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { llvm::Value* slot_value; std::shared_ptr lvalue; + ADD_VISITOR_TRACE("VectorReadFixedLenValueDex"); switch (dex.FieldType()->id()) { case arrow::Type::BOOL: slot_value = generator_->GetPackedBitValue(slot_ref, slot_index); @@ -660,24 +705,37 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { } void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { + ADD_VISITOR_TRACE("VectorReadFixedLenValueListDex"); llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot; + auto types = generator_->types(); + auto type = types->IRType(dex.FieldType()->id()); + + std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; + std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; + arrow::Type::type at = arrow::Type::INT32; + type = types->IRType(at); + //type = types->DataVecType(dex.FieldType()); + std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; // compute list len from the offsets array. 
llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); + std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << + printType(offsets_slot_index) << std::endl; // => offset_start = offsets[loop_var] - slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index); - llvm::Value* offset_start = builder->CreateLoad(slot, "offset_start"); + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); + llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); // => offset_end = offsets[loop_var + 1] llvm::Value* offsets_slot_index_next = builder->CreateAdd( offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); - slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index_next); - llvm::Value* offset_end = builder->CreateLoad(slot, "offset_end"); + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index_next); + llvm::Value* offset_end = builder->CreateLoad(type,slot, "offset_end"); // => offsets_len_value = offset_end - offset_start llvm::Value* list_len = builder->CreateSub(offset_end, offset_start, "offsets_len"); @@ -689,11 +747,14 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); llvm::Value* slot_index = builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.DataIdx())); - llvm::Value* data_list = builder->CreateGEP(slot_ref, slot_index); + llvm::Value* data_list = builder->CreateGEP(type, slot_ref, slot_index); // TODO: handle bool type bitmap // TODO: handle decimal precision and scale + std::cout << "LR VectorReadFixedLenValueListDex slot_ref " << printType(slot_ref) << std::endl; + std::cout << "LR VectorReadFixedLenValueListDex visit fixed-len data list vector " << dex.FieldName() << + " 
length " << printType(list_len) << " data_list " << printType(data_list) << std::endl; ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " length %T", list_len); result_.reset(new LValue(data_list, list_len)); @@ -703,7 +764,7 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot; auto types = generator_->types(); - + ADD_VISITOR_TRACE("VectorReadVarLenValueDex"); // compute len from the offsets array. llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); @@ -748,8 +809,17 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { * data_buffer[child_offsets_buffer[offsets[i+1]] - child_offsets_buffer[offsets[i]]] * => list_data[loop_var][i] */ + ADD_VISITOR_TRACE("VectorReadVarLenValueListDex"); llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot; + auto types = generator_->types(); + auto type = types->IRType(dex.FieldType()->id()); + std::cout << "LR dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + std::cout << "LR IRType is " << printType(type) << std::endl; + //type = types->DataVecType(dex.FieldType()); + //LR HACK. 
Original was type = types->DataVecType(dex.FieldType()); + arrow::Type::type at = arrow::Type::INT32; + type = types->IRType(at); // compute list length from the offsets array llvm::Value* offsets_slot_ref = @@ -757,43 +827,54 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); + int i = 0; + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => offset_start = offsets[loop_var] - slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index); - llvm::Value* offset_start = builder->CreateLoad(slot, "offset_start"); - + std::cout << "LR Type is " << printType(type) << std::endl; + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); + + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => offset_end = offsets[loop_var + 1] llvm::Value* offsets_slot_index_next = builder->CreateAdd( offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); - slot = builder->CreateGEP(offsets_slot_ref, offsets_slot_index_next); - llvm::Value* offset_end = builder->CreateLoad(slot, "offset_end"); + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index_next); + llvm::Value* offset_end = builder->CreateLoad(type, slot, "offset_end"); + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => list_data_length = offset_end - offset_start llvm::Value* list_data_length = builder->CreateSub(offset_end, offset_start, "offsets_len"); + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // get the child offsets array from the child offsets array, // start from offset 'offset_start' llvm::Value* child_offset_slot_ref = GetBufferReference(dex.ChildOffsetsIdx(), kBufferTypeChildOffsets, dex.Field()); + std::cout << 
"VectorReadVarLenValueListDex " << i++ << std::endl; // do not forget slice offset llvm::Value* offset_start_int64 = builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); llvm::Value* child_offset_slot_index = builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.ChildOffsetsIdx())); llvm::Value* child_offsets = - builder->CreateGEP(child_offset_slot_ref, child_offset_slot_index); + builder->CreateGEP(type, child_offset_slot_ref, child_offset_slot_index); llvm::Value* child_offset_start = - builder->CreateLoad(child_offsets, "child_offset_start"); + builder->CreateLoad(type, child_offsets, "child_offset_start"); + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // get the data array llvm::Value* data_slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); - llvm::Value* data_value = builder->CreateGEP(data_slot_ref, child_offset_start); - + llvm::Value* data_value = builder->CreateGEP(type, data_slot_ref, child_offset_start); + + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; result_.reset(new ListLValue(data_value, child_offsets, list_data_length)); } void LLVMGenerator::Visitor::Visit(const VectorReadValidityDex& dex) { + ADD_VISITOR_TRACE("VectorReadValidityDex"); llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot_ref = GetBufferReference(dex.ValidityIdx(), kBufferTypeValidity, dex.Field()); @@ -806,6 +887,7 @@ void LLVMGenerator::Visitor::Visit(const VectorReadValidityDex& dex) { } void LLVMGenerator::Visitor::Visit(const LocalBitMapValidityDex& dex) { + ADD_VISITOR_TRACE("LocalBitMapValidityDex"); llvm::Value* slot_ref = GetLocalBitMapReference(dex.local_bitmap_idx()); llvm::Value* validity = generator_->GetPackedBitValue(slot_ref, loop_var_); @@ -816,18 +898,22 @@ void LLVMGenerator::Visitor::Visit(const LocalBitMapValidityDex& dex) { } void LLVMGenerator::Visitor::Visit(const TrueDex& dex) { + ADD_VISITOR_TRACE("TrueDex"); result_.reset(new 
LValue(generator_->types()->true_constant())); } void LLVMGenerator::Visitor::Visit(const FalseDex& dex) { + ADD_VISITOR_TRACE("FalseDex"); result_.reset(new LValue(generator_->types()->false_constant())); } void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { + ADD_VISITOR_TRACE("LiteralDex"); LLVMTypes* types = generator_->types(); llvm::Value* value = nullptr; llvm::Value* len = nullptr; + std::cout << "LR LiteralDex type " << dex.type()->id() << std::endl; switch (dex.type()->id()) { case arrow::Type::BOOL: value = types->i1_constant(std::get(dex.holder())); @@ -868,7 +954,7 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { case arrow::Type::STRING: case arrow::Type::BINARY: { const std::string& str = std::get(dex.holder()); - + std::cout << "LR Literal string " << str << std::endl; value = ir_builder()->CreateGlobalStringPtr(str.c_str()); len = types->i32_constant(static_cast(str.length())); break; @@ -922,6 +1008,8 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { native_function->NeedsContext()); auto arrow_return_type = dex.func_descriptor()->return_type(); + std::cout << "LR NonNullableFunc 1 result_type " << printType(generator_->types()->DataVecType(arrow_return_type)) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << printType(generator_->types()->IRType(arrow_return_type->id())) << std::endl; + if (native_function->CanReturnErrors()) { // slow path : if a function can return errors, skip invoking the function // unless all of the input args are valid. Otherwise, it can cause spurious errors. 
@@ -930,6 +1018,7 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { LLVMTypes* types = generator_->types(); auto arrow_type_id = arrow_return_type->id(); auto result_type = types->DataVecType(arrow_return_type); + std::cout << "LR NonNullableFunc 2 result_type " << printType(result_type) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << types->IRType(arrow_type_id) << std::endl; // Build combined validity of the args. llvm::Value* is_valid = types->true_constant(); @@ -1277,25 +1366,31 @@ void LLVMGenerator::Visitor::VisitInExpression( } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } @@ -1303,6 +1398,7 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, std::function then_func, std::function else_func, DataTypePtr result_type) { + ADD_VISITOR_TRACE("BuildIfElse"); llvm::IRBuilder<>* builder = ir_builder(); llvm::LLVMContext* context = generator_->context(); LLVMTypes* types = generator_->types(); @@ -1433,11 +1529,13 @@ std::vector LLVMGenerator::Visitor::BuildParams( bool with_context) { std::vector params; + ADD_VISITOR_TRACE("LLVMGenerator::Visitor::BuildParams"); // add context if required. 
if (with_context) { params.push_back(arg_context_ptr_); } + std::cout << "LR BuildParams1" << std::endl; // if the function has holder, add the holder pointer. if (holder_idx != -1) { auto builder = ir_builder(); @@ -1446,6 +1544,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( llvm::BasicBlock* saved_block = builder->GetInsertBlock(); builder->SetInsertPoint(entry_block_); + std::cout << "LR BuildParams1a" << std::endl; auto holder = generator_->LoadVectorAtIndex( arg_holder_ptrs_, generator_->types()->i64_type(), holder_idx, "holder"); @@ -1453,16 +1552,20 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(holder); } + std::cout << "LR BuildParams2" << std::endl; // build the function params, along with the validities. for (auto& pair : args) { // build value. DexPtr value_expr = pair->value_expr(); + std::cout << "LR BuildParams2a" << std::endl; value_expr->Accept(*this); + std::cout << "LR BuildParams2b" << std::endl; LValue& result_ref = *result(); // append all the parameters corresponding to this LValue. result_ref.AppendFunctionParams(¶ms); + std::cout << "LR BuildParams2c" << std::endl; // build validity. if (with_validity) { llvm::Value* validity_expr = BuildCombinedValidity(pair->validity_exprs()); @@ -1586,9 +1689,9 @@ std::string LLVMGenerator::ReplaceFormatInTrace(const std::string& in_msg, } void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { - if (!enable_ir_traces_) { - return; - } + //if (!enable_ir_traces_) { + // return; + //} std::string dmsg = "IR_TRACE:: " + msg + "\n"; std::string print_fn_name = "printf"; @@ -1596,6 +1699,7 @@ void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { dmsg = ReplaceFormatInTrace(dmsg, value, &print_fn_name); } trace_strings_.push_back(dmsg); + std::cout << dmsg << std::endl; // cast this to an llvm pointer. 
const char* str = trace_strings_.back().c_str(); diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 7dd4442081ef4..a95e80b148824 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -17,6 +17,7 @@ #include "gandiva/projector.h" +#include #include #include #include @@ -285,12 +286,14 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* p Status Projector::Evaluate(const arrow::RecordBatch& batch, const SelectionVector* selection_vector, arrow::MemoryPool* pool, arrow::ArrayVector* output) const { + std::cout << "LR Projector::Evaluate" << std::endl; ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null.")); ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null.")); auto num_rows = selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); + std::cout << "LR Projector::Evaluate num_rows" << num_rows << std::endl; // Allocate the output data vecs. ArrayDataVector output_data_vecs; for (auto& field : output_fields_) { @@ -348,6 +351,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, arrow::Status astatus; std::vector> buffers; + std::cout << "LR Projector::AllocArrayData Enter" << std::endl; // The output vector always has a null bitmap. 
int64_t size = arrow::bit_util::BytesForBits(num_records); ARROW_ASSIGN_OR_RAISE(auto bitmap_buffer, arrow::AllocateBuffer(size, pool)); @@ -363,7 +367,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } if (type_id == arrow::Type::LIST) { - auto offsets_len = arrow::BitUtil::BytesForBits((num_records + 1) * 32); + auto offsets_len = arrow::bit_util::BytesForBits((num_records + 1) * 32); ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, arrow::AllocateBuffer(offsets_len, pool)); buffers.push_back(std::move(offsets_buffer)); @@ -374,7 +378,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, // so here i just allocate extra 32 bit for extra 1 length ARROW_ASSIGN_OR_RAISE( auto child_offsets_buffer, - arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(32), pool)); + arrow::AllocateResizableBuffer(arrow::bit_util::BytesForBits(32), pool)); buffers.push_back(std::move(child_offsets_buffer)); } } @@ -401,14 +405,18 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); + std::cout << "LR Projector::AllocArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { + std::cout << "LR Projector::AllocArrayData List" << std::endl; auto internal_type = type->field(0)->type(); ArrayDataPtr child_data; if (arrow::is_primitive(internal_type->id())) { + std::cout << "LR Projector::AllocArrayData List 1" << std::endl; child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, {nullptr, std::move(buffers[2])}, 0); } if (arrow::is_binary_like(internal_type->id())) { + std::cout << "LR Projector::AllocArrayData List 2" << std::endl; child_data = arrow::ArrayData::Make( internal_type, 0 /*initialize length*/, {nullptr, std::move(buffers[2]), std::move(buffers[3])}, 0); @@ -419,6 +427,8 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } else { *array_data = arrow::ArrayData::Make(type, 
num_records, std::move(buffers)); } + + std::cout << "LR Projector::AllocArrayData Done" << std::endl; return Status::OK(); } diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index d8f44a5c235bb..4906bbab7e47c 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -21,6 +21,8 @@ #include "arrow/memory_pool.h" #include "arrow/status.h" +#include "gandiva/execution_context.h" +#include "gandiva/precompiled/types.h" #include "gandiva/projector.h" #include "gandiva/tests/test_util.h" #include "gandiva/tree_expr_builder.h" @@ -93,19 +95,24 @@ void _test_list_type_field_alias(DataTypePtr type, ArrayPtr array, auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array}); // Make expression + std::cout << "Make expression" << std::endl; auto field_a_node = TreeExprBuilder::MakeField(field_a); auto expr = TreeExprBuilder::MakeExpression(field_a_node, result); + std::cout << "Build a projector for the expressions." << std::endl; // Build a projector for the expressions. std::shared_ptr projector; auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + std::cout << "status message: " << status.message() << std::endl; EXPECT_TRUE(status.ok()) << status.message(); + std::cout << "Evaluate expression" << std::endl; // Evaluate expression arrow::ArrayVector outputs; status = projector->Evaluate(*in_batch, pool, &outputs); EXPECT_TRUE(status.ok()) << status.message(); + std::cout << "Check results" << std::endl; EXPECT_ARROW_ARRAY_EQUALS(array, outputs[0]); // EXPECT_ARROW_ARRAY_EQUALS will not check the length of child data, but // ArrayData::Slice method will check length. 
ArrayData::ToString method will call @@ -114,6 +121,7 @@ void _test_list_type_field_alias(DataTypePtr type, ArrayPtr array, EXPECT_TRUE(array->null_count() == outputs[0]->null_count()); } +/* TEST_F(TestList, TestListUtf8) { ArrayPtr array; _build_list_array( @@ -140,6 +148,19 @@ TEST_F(TestList, TestListInt64) { {1, 2, 5, 4, 3}, {true, true, true, true, false}, pool_, &array); _test_list_type_field_alias(list(int64()), array, pool_); } +*/ +TEST_F(TestList, TestListArrayInt32) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + int32_t data[] = {11, 2, 23, 42}; + int32_t entry_offsets_len = 4; + int32_t contains_data = 42; + + EXPECT_EQ( + array_int32_contains_int32(ctx_ptr, data, entry_offsets_len, + contains_data), + true); +} TEST_F(TestList, TestListInt32) { ArrayPtr array; @@ -150,6 +171,106 @@ TEST_F(TestList, TestListInt32) { _test_list_type_field_alias(list(int32()), array, pool_); } +TEST_F(TestList, TestListInt32LiteralContains) { + // schema for input fields + auto field_a = field("a", list(int32())); + auto field_b = field("b", int32()); + auto schema = arrow::schema({field_a, field_b}); + + // output fields + auto res = field("res", boolean()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {1, 5, 19, 42, 57}, + {1, 1, 1, 1, 1}, {true, true, true, true, true}, pool_, &array_a); + + auto array_b = + MakeArrowArrayInt32({42, 42, 42, 42, 42}); + + // expected output + auto exp = MakeArrowArrayBool({false, false, false, true, false}, + {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); + + // build expressions. 
+ // array_contains(a, b) + + //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + std::vector field_nodes; + auto node = TreeExprBuilder::MakeField(field_a); + field_nodes.push_back(node); + + auto node2 = TreeExprBuilder::MakeLiteral(42); + field_nodes.push_back(node2); + + auto func_node = TreeExprBuilder::MakeFunction("array_containsGandiva", field_nodes, res->type()); + auto expr = TreeExprBuilder::MakeExpression(func_node, res); + //////// + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +TEST_F(TestList, TestListInt32Contains) { + // schema for input fields + auto field_a = field("a", list(int32())); + auto field_b = field("b", int32()); + auto schema = arrow::schema({field_a, field_b}); + + // output fields + auto res = field("res", boolean()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {1, 5, 19, 42, 57}, + {1, 1, 1, 1, 1}, {true, true, true, true, true}, pool_, &array_a); + + auto array_b = + MakeArrowArrayInt32({42, 42, 42, 42, 42}); + + // expected output + auto exp = MakeArrowArrayBool({false, false, false, true, false}, + {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); + + // build expressions. + // array_contains(a, b) + auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + TEST_F(TestList, TestListFloat32) { ArrayPtr array; _build_list_array( @@ -195,7 +316,7 @@ TEST_F(TestList, TestListUtf8Length) { // build expressions. // array_length(a) - auto expr = TreeExprBuilder::MakeExpression("array_length", {field_a}, res); + auto expr = TreeExprBuilder::MakeExpression("array_lengthGandiva", {field_a}, res); // Build a projector for the expressions. std::shared_ptr projector; @@ -234,7 +355,7 @@ TEST_F(TestList, TestListUtf8LengthWithInvalidData) { // build expressions. // array_length(a) - auto expr = TreeExprBuilder::MakeExpression("array_length", {field_a}, res); + auto expr = TreeExprBuilder::MakeExpression("array_lengthGandiva", {field_a}, res); // Build a projector for the expressions. std::shared_ptr projector; @@ -281,7 +402,7 @@ TEST_F(TestList, TestListUtf8Contains) { // build expressions. // array_contains(a, b) - auto expr = TreeExprBuilder::MakeExpression("array_contains", {field_a, field_b}, res); + auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); // Build a projector for the expressions. 
std::shared_ptr projector; diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index 9bf5a07426d98..0efb2e412e873 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -139,6 +139,9 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) case arrow::Type::STRUCT: gandiva_data_type->set_type(types::GandivaType::STRUCT); break; + case arrow::Type::LIST: + gandiva_data_type->set_type(types::GandivaType::LIST); + break; default: // un-supported types. test ensures that // when one of these are added build breaks. @@ -173,11 +176,15 @@ Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSu JNIEXPORT jbyteArray JNICALL Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSupportedFunctions( // NOLINT JNIEnv* env, jobject types_helper) { + printf("LR Entering JNI call getGandivaSupportedFunctions\n"); + fflush(stdout); + ExpressionRegistry expr_registry; types::GandivaFunctions gandiva_functions; for (auto function = expr_registry.function_signature_begin(); function != expr_registry.function_signature_end(); function++) { printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).base_name().c_str()); + printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).ToString().c_str()); fflush(stdout); types::FunctionSignature* function_signature = gandiva_functions.add_function(); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 35dcdf2ee2d61..42cb62ec401ba 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -270,8 +270,10 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { return ProtoTypeToInterval(ext_type); case types::STRUCT: return arrow::struct_({field("lattitude", arrow::float64(), false), 
field("longitude", arrow::float64(), false)}); - case types::FIXED_SIZE_BINARY: case types::LIST: + return arrow::list(arrow::int32()); + //return arrow::list(arrow::utf8()); + case types::FIXED_SIZE_BINARY: case types::UNION: case types::DICTIONARY: case types::MAP: @@ -297,6 +299,7 @@ FieldPtr ProtoTypeToField(const types::Field& f) { NodePtr ProtoTypeToFieldNode(const types::FieldNode& node) { FieldPtr field_ptr = ProtoTypeToField(node.field()); + std::cout << "LR created field " << field_ptr->ToString(true) << std::endl; if (field_ptr == nullptr) { std::cerr << "Unable to create field node from protobuf\n"; return nullptr; @@ -468,6 +471,7 @@ NodePtr ProtoTypeToNullNode(const types::NullNode& node) { NodePtr ProtoTypeToNode(const types::TreeNode& node) { if (node.has_fieldnode()) { + std::cout << "LR Found ProtoTypeToNode fieldnode " << std::endl; return ProtoTypeToFieldNode(node.fieldnode()); } @@ -516,6 +520,7 @@ NodePtr ProtoTypeToNode(const types::TreeNode& node) { } if (node.has_stringnode()) { + std::cout << "LR Found StringNode" << std::endl; return TreeExprBuilder::MakeStringLiteral(node.stringnode().value()); } @@ -625,10 +630,73 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); } +////////// +auto type = field->type(); +auto type_id = type->id(); +//num_rows = num_records or ?? + if (type_id == arrow::Type::LIST) { + + if (buf_idx >= in_bufs_len) { + return Status::Invalid("insufficient number of in_buf_addrs"); + } - auto array_data = arrow::ArrayData::Make(field->type(), num_rows, std::move(buffers)); + // add offsets buffer for variable-len fields. 
+ jlong offsets_addr = in_buf_addrs[buf_idx++]; + jlong offsets_size = in_buf_sizes[sz_idx++]; + auto offsets = std::shared_ptr( + new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); + buffers.push_back(offsets); + + + if (arrow::is_binary_like(type->field(0)->type()->id())) { + // child offsets length is internal data length + 1 + // offsets element is int32 + // so here i just allocate extra 32 bit for extra 1 length + jlong offsets_addr = in_buf_addrs[buf_idx++]; + jlong offsets_size = in_buf_sizes[sz_idx++]; + + auto child_offsets_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); + + buffers.push_back(std::move(child_offsets_buffer)); + } + } + + jlong offsets_addr = in_buf_addrs[buf_idx++]; + jlong offsets_size = in_buf_sizes[sz_idx++]; + auto data_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); + + std::cout << "LR New ArrayData 1" << std::endl; + if (type->id() == arrow::Type::LIST) { + std::cout << "LR New ArrayData List" << std::endl; + auto internal_type = type->field(0)->type(); + std::shared_ptr child_data; + if (arrow::is_primitive(internal_type->id())) { + std::cout << "LR New ArrayData List 1" << std::endl; + child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, + {nullptr, std::move(data_buffer)}, 0); + } + if (arrow::is_binary_like(internal_type->id())) { + std::cout << "LR New ArrayData List NYI 2" << std::endl; + /*child_data = arrow::ArrayData::Make( + internal_type, 0, + {nullptr, std::move(data_buffer), std::move(child_data)}, 0);*/ + } + + auto array_data = arrow::ArrayData::Make(type, num_rows, {std::move(buffers[0]), std::move(buffers[1])}, {child_data}); + columns.push_back(array_data); + + } else { + auto array_data = arrow::ArrayData::Make(type, num_rows, std::move(buffers)); columns.push_back(array_data); } + +///////// +//TODO use unique_ptr +//Was +//auto array_data = arrow::ArrayData::Make(field->type(), num_rows, 
std::move(buffers)); +//columns.push_back(array_data); + + } *batch = arrow::RecordBatch::Make(schema, num_rows, columns); return Status::OK(); } @@ -776,7 +844,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build status = Projector::Make(schema_ptr, expr_vector, mode, config, sec_cache, &projector); if (!status.ok()) { - ss << "Failed to make LLVM module due to " << status.message() << "\n"; + ss << "Failed to make LLVM module [1]cdue to " << status.message() << "\n"; releaseProjectorInput(schema_arr, schema_bytes, exprs_arr, exprs_bytes, env); goto err_out; } @@ -864,6 +932,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( jlongArray buf_addrs, jlongArray buf_sizes, jint sel_vec_type, jint sel_vec_rows, jlong sel_vec_addr, jlong sel_vec_size, jlongArray out_buf_addrs, jlongArray out_buf_sizes) { + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " << std::endl; Status status; std::shared_ptr holder = projector_modules_.Lookup(module_id); if (holder == nullptr) { @@ -899,7 +968,11 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( if (!status.ok()) { break; } - + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + << " Made a recordbatch num_rows " << num_rows + << in_batch->ToString() + << std::endl; + std::shared_ptr selection_vector; auto selection_buffer = std::make_shared( reinterpret_cast(sel_vec_addr), sel_vec_size); @@ -1062,7 +1135,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build // good to invoke the filter builder now status = Filter::Make(schema_ptr, condition_ptr, config, sec_cache, &filter); if (!status.ok()) { - ss << "Failed to make LLVM module due to " << status.message() << "\n"; + ss << "Failed to make LLVM module [2] due to " << status.message() << "\n"; releaseFilterInput(schema_arr, schema_bytes, condition_arr, condition_bytes, env); goto err_out; } diff 
--git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 32990330ee310..39358a084ba98 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -180,10 +180,11 @@ private static ArrowType getArrowType(ExtGandivaType type) { return new ArrowType.Interval(mapArrowIntervalUnit(type.getIntervalType())); case GandivaType.STRUCT_VALUE: return new ArrowType.Struct(); + case GandivaType.LIST_VALUE: + return new ArrowType.List(); case GandivaType.FIXED_SIZE_BINARY_VALUE: case GandivaType.MAP_VALUE: case GandivaType.DICTIONARY_VALUE: - case GandivaType.LIST_VALUE: case GandivaType.UNION_VALUE: default: assert false; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 40b975d064351..47d97c6b0dca8 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -148,6 +148,11 @@ private static void initArrowTypeStruct(ArrowType.Struct structType, builder.setType(GandivaTypes.GandivaType.STRUCT); } + private static void initArrowTypeList(ArrowType.List listType, + GandivaTypes.ExtGandivaType.Builder builder) { + builder.setType(GandivaTypes.GandivaType.LIST); + } + private static void initArrowTypeTime(ArrowType.Time timeType, GandivaTypes.ExtGandivaType.Builder builder) { short timeUnit = timeType.getUnit().getFlatbufID(); @@ -289,6 +294,7 @@ public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowTyp break; } case Type.List: { // 12 + ArrowTypeHelper.initArrowTypeList((ArrowType.List) arrowType, builder); break; } case Type.Struct_: { 
// 13 diff --git a/java/pom.xml b/java/pom.xml index 747320d2f8a40..8237c8f06f271 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -713,16 +713,6 @@ - format - memory - vector - tools - adapter/jdbc - flight - performance - algorithm - adapter/avro - compression From b2a838094697bff2519b13534686df2f6c43c162 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 1 Sep 2023 14:21:04 -0700 Subject: [PATCH 10/46] Almost working --- cpp/src/gandiva/annotator.cc | 3 + cpp/src/gandiva/array_ops.cc | 60 +++++- cpp/src/gandiva/array_ops.h | 13 ++ cpp/src/gandiva/engine.cc | 4 +- cpp/src/gandiva/function_registry_array.cc | 9 + cpp/src/gandiva/llvm_generator.cc | 69 ++++-- cpp/src/gandiva/llvm_types.cc | 3 +- cpp/src/gandiva/llvm_types.h | 13 +- cpp/src/gandiva/lvalue.h | 5 +- cpp/src/gandiva/projector.cc | 60 +++++- cpp/src/gandiva/projector.h | 6 +- cpp/src/gandiva/tests/list_test.cc | 197 ++++++++++++++++-- cpp/src/gandiva/tree_expr_builder.cc | 7 +- java/gandiva/src/main/cpp/jni_common.cc | 63 +++++- .../arrow/gandiva/evaluator/Projector.java | 30 +++ .../gandiva/evaluator/ProjectorTest.java | 50 +++++ 16 files changed, 543 insertions(+), 49 deletions(-) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index fad439823db93..d586de7076f7c 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -144,6 +144,8 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, reinterpret_cast(array_data.buffers[buffer_idx].get()); eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); } else { + std::cout << "LR Annotator::PrepareBuffersForField is_output index " << desc.data_buffer_ptr_idx() << std::endl; + // list data buffer is in child data buffer uint8_t* data_buf_ptr = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); @@ -181,6 +183,7 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, } // Fill in the entries for the output fields. 
+ std::cout << "LR PrepareEvalBatch preparing output fields" << std::endl; int idx = 0; for (auto& arraydata : out_vector) { const FieldDescriptorPtr& desc = out_descs_.at(idx); diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 0a2546567b914..c1dbf3c380691 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -18,8 +18,11 @@ #include "gandiva/array_ops.h" #include +#include #include "arrow/util/value_parsing.h" + +#include "gandiva/gdv_function_stubs.h" #include "gandiva/engine.h" #include "gandiva/exported_funcs.h" @@ -50,7 +53,25 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, std::cout << "LR array_int32_contains_int32 offset length=" << entry_offsets_len << std::endl; for (int i = 0; i < entry_offsets_len; i++) { std::cout << "LR going to check " << entry_buf + i << std::endl; - int32_t entry_len = *(entry_buf + i); + //LR TODO + //int32_t entry_len = *(entry_buf + i); + //coming as int64 for some reason. *2 + int32_t entry_len = *(entry_buf + (i * 2)); + std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; + if (entry_len == contains_data) { + return true; + } + } + return false; +} + +bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, + int32_t entry_offsets_len, + int64_t contains_data) { + std::cout << "LR array_int64_contains_int64 offset length=" << entry_offsets_len << std::endl; + for (int i = 0; i < entry_offsets_len; i++) { + std::cout << "LR going to check " << entry_buf + i << std::endl; + int64_t entry_len = *(entry_buf + (i*2)); //LR TODO sizeof int64? 
std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; if (entry_len == contains_data) { return true; @@ -59,6 +80,23 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, return false; } + +int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int32_t* out_len) { + std::cout << "LR array_int32_make_array offset data=" << contains_data << std::endl; + + int integers[] = { 1, 2, 3, contains_data, 5 }; + *out_len = 5;// * 4; + //length is number of items, but buffers must account for byte size. + uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); + memcpy(ret, integers, *out_len * 4); + std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + + + //return reinterpret_cast(ret); + return reinterpret_cast(ret); +} + + int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len) { int64_t res = entry_offsets_len; @@ -98,5 +136,25 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("array_int32_contains_int32", types->i1_type() /*return_type*/, args, reinterpret_cast(array_int32_contains_int32)); + + args = {types->i64_type(), // int64_t execution_context + types->i64_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t child offsets length + types->i64_type()}; // int32_t contains data length + + engine->AddGlobalMappingForFunc("array_int64_contains_int64", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_int64_contains_int64)); + + + args = {types->i64_type(), // int64_t execution_context + types->i32_type(), + types->i32_ptr_type()}; // int32_t contains data length + + engine->AddGlobalMappingForFunc("array_int32_make_array", + types->i32_ptr_type(), args, + reinterpret_cast(array_int32_make_array)); + + } } // namespace gandiva diff --git 
a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index 7a32e303b3b08..b41fe2a086e8f 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -21,6 +21,10 @@ #include "gandiva/visibility.h" +namespace llvm { +class VectorType; +} + /// Array functions that can be accessed from LLVM. extern "C" { GANDIVA_EXPORT @@ -34,4 +38,13 @@ GANDIVA_EXPORT bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_offsets_len, int32_t contains_data); +GANDIVA_EXPORT +bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, + int32_t entry_offsets_len, + int64_t contains_data); + +GANDIVA_EXPORT +int32_t* array_int32_make_array(int64_t context_ptr, + int32_t contains_data, + int32_t* out_len); } diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index f5f9460ddd1f2..8c4b03dd55a06 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -300,6 +300,8 @@ Status Engine::FinalizeModule() { if (!cached_) { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); +/* + //LR Turning this off seems to provide better error messages with compilation/generation failures. if (optimize_) { // misc passes to allow for inlining, vectorization, .. 
std::unique_ptr pass_manager( @@ -324,7 +326,7 @@ Status Engine::FinalizeModule() { pass_builder.populateModulePassManager(*pass_manager); pass_manager->run(*module_); } - +*/ ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()), Status::CodeGenError("Module verification failed after optimizer")); } diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index 1f37b9d612b3a..638c09b72a534 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -31,6 +31,15 @@ std::vector GetArrayFunctionRegistry() { NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int32()), int32()}, boolean(), kResultNullIfNull, "array_int32_contains_int32", NativeFunction::kNeedsContext), + NativeFunction("array_contains", {}, DataTypeVector{list(int32()), int32()}, + boolean(), kResultNullIfNull, "array_int32_contains_int32", + NativeFunction::kNeedsContext), + NativeFunction("array_makeGandiva", {}, DataTypeVector{int32()}, + list(int32()), kResultNullIfNull, "array_int32_make_array", + NativeFunction::kNeedsContext), + /*NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int64()), int64()}, + boolean(), kResultNullIfNull, "array_int64_contains_int64", + NativeFunction::kNeedsContext),*/ }; return array_fn_registry_; } diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 8bc24666166f4..f31913a326674 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -92,6 +92,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out std::unique_ptr compiled_expr(new CompiledExpr(value_validity, output)); std::string fn_name = "expr_" + std::to_string(idx) + "_" + std::to_string(static_cast(selection_vector_mode_)); + std::cout << "LR LLVMGenerator::Add " << fn_name << std::endl; if (!cached_) { ARROW_RETURN_NOT_OK(engine_->LoadFunctionIRs()); 
ARROW_RETURN_NOT_OK(CodeGenExprValue(value_validity->value_expr(), @@ -100,6 +101,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out } compiled_expr->SetFunctionName(selection_vector_mode_, fn_name); compiled_exprs_.push_back(std::move(compiled_expr)); + std::cout << "LR LLVMGenerator::Add Done" << std::endl; return Status::OK(); } @@ -108,14 +110,19 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode mode) { selection_vector_mode_ = mode; + std::cout << "LR LLVMGenerator::Build " << std::endl; for (auto& expr : exprs) { auto output = annotator_.AddOutputFieldDescriptor(expr->result()); ARROW_RETURN_NOT_OK(Add(expr, output)); } + std::cout << "LR LLVMGenerator::Build 2" << std::endl; + //Too much logging. needle in haystack? + //std::cout << "LR LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; // Compile and inject into the process' memory the generated function. ARROW_RETURN_NOT_OK(engine_->FinalizeModule()); - + std::cout << "LR LLVMGenerator::Build FinalizeModule" << std::endl; + // setup the jit functions for each expression. 
for (auto& compiled_expr : compiled_exprs_) { auto fn_name = compiled_expr->GetFunctionName(mode); @@ -123,6 +130,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode compiled_expr->SetJITFunction(selection_vector_mode_, jit_fn); } + std::cout << "LR LLVMGenerator::Build Done" << std::endl; return Status::OK(); } @@ -144,13 +152,12 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, const SelectionVector* selection_vector, const ArrayDataVector& output_vector) const { DCHECK_GT(record_batch.num_rows(), 0); - int jello = 0; - std::cout << "LR LLVMGenerator::Execute " << jello++ << std::endl; + std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector); DCHECK_GT(eval_batch->GetNumBuffers(), 0); - std::cout << "LR LLVMGenerator::Execute " << jello++ << std::endl; + std::cout << "LR LLVMGenerator::Execute 2" << std::endl; auto mode = SelectionVector::MODE_NONE; if (selection_vector != nullptr) { mode = selection_vector->GetMode(); @@ -160,7 +167,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, selection_vector_mode_, " received vector with mode ", mode); } - std::cout << "LR LLVMGenerator::Execute " << jello++ << std::endl; + std::cout << "LR LLVMGenerator::Execute 3" << std::endl; for (auto& compiled_expr : compiled_exprs_) { // generate data/offset vectors. 
const uint8_t* selection_buffer = nullptr; @@ -170,7 +177,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, num_output_rows = selection_vector->GetNumSlots(); } - std::cout << "LR LLVMGenerator::Execute A" << jello++ << std::endl; + std::cout << "LR LLVMGenerator::Execute A1" << std::endl; EvalFunc jit_function = compiled_expr->GetJITFunction(mode); jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(), eval_batch->GetLocalBitMapArray(), annotator_.GetHolderPointersArray(), @@ -182,7 +189,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, eval_batch->GetExecutionContext()->has_error(), Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); - std::cout << "LR LLVMGenerator::Execute A" << jello++ << std::endl; + std::cout << "LR LLVMGenerator::Execute A2" << std::endl; // generate validity vectors. ComputeBitMapsForExpr(*compiled_expr, selection_vector, eval_batch.get()); } @@ -305,7 +312,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, FieldDescriptorPtr output, int suffix_idx, std::string& fn_name, SelectionVector::Mode selection_vector_mode) { - std::cout << "LR CodeGenExprValue" << std::endl; + std::cout << "LR CodeGenExprValue for output field " << output->Name() + << " type " << output->Type()->ToString() << " output type id " << output->Type()->id() << std::endl; try { llvm::IRBuilder<>* builder = ir_builder(); // Create fn prototype : @@ -404,6 +412,7 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, } // The visitor can add code to both the entry/loop blocks. 
+ std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; Visitor visitor(this, fn, loop_entry, arg_addrs, arg_local_bitmaps, arg_holder_ptrs, slice_offsets, arg_context_ptr, position_var); value_expr->Accept(visitor); @@ -441,6 +450,7 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, } else if (output_type_id == arrow::Type::LIST) { auto output_list_internal_type = output->Type()->field(0)->type()->id(); std::cout << "LR creating list type to store the result with internal type " << output_list_internal_type << std::endl; + if (arrow::is_binary_like(output_list_internal_type)) { auto output_list_value = std::dynamic_pointer_cast(output_value); llvm::Value* child_output_offset_ref = GetChildOffsetsReference( @@ -451,6 +461,19 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, child_output_offset_ref, loop_var, output_list_value->data(), output_list_value->child_offsets(), output_list_value->offsets_length()}); } else if (output_list_internal_type == arrow::Type::INT32) { + + + std::string str1; + llvm::raw_string_ostream output1(str1); + output_value->data()->print(output1); + + std::string str2; + llvm::raw_string_ostream output2(str2); + output_value->length()->print(output2); + + + std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," + << output_offset_ref << "," << loop_var << "[[" << str1 << "]] [[" << str2 << "]]" << std::endl; AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length()}); @@ -604,7 +627,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, llvm::Function* fn = module()->getFunction(full_name); DCHECK_NE(fn, nullptr) << "missing function " << full_name; - if (enable_ir_traces_ && !full_name.compare("printf") && + if 
(!full_name.compare("printf") && !full_name.compare("printff")) { // Trace for debugging ADD_TRACE("invoke native fn " + full_name); @@ -624,7 +647,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, llvm::raw_string_ostream output2(str2); ret_type->print(output); value->getType()->print(output2); - std::cout << "LR ret type " << str << " value ret type " << str2 << std::endl; + std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; DCHECK(value->getType() == ret_type); } @@ -644,9 +667,7 @@ std::shared_ptr LLVMGenerator::BuildDecimalLValue(llvm::Value* va } #define ADD_VISITOR_TRACE(...) \ - if (generator_->enable_ir_traces_) { \ generator_->AddTrace(__VA_ARGS__); \ - } // Visitor for generating the code for a decomposed expression. LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* function, @@ -1018,6 +1039,8 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { LLVMTypes* types = generator_->types(); auto arrow_type_id = arrow_return_type->id(); auto result_type = types->DataVecType(arrow_return_type); + //Result type array/list is special. + //auto result_type = types->IRType(arrow_type_id); std::cout << "LR NonNullableFunc 2 result_type " << printType(result_type) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << types->IRType(arrow_type_id) << std::endl; // Build combined validity of the args. @@ -1477,6 +1500,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, auto llvm_return_type = types->DataVecType(arrow_return_type); DecimalIR decimalIR(generator_->engine_.get()); + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; if (arrow_return_type_id == arrow::Type::DECIMAL) { // For decimal fns, the output precision/scale are passed along as parameters. 
// @@ -1504,12 +1528,31 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, // add extra arg for return length for variable len return types (allocated on stack). llvm::AllocaInst* result_len_ptr = nullptr; if (arrow::is_binary_like(arrow_return_type_id)) { + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is binary like" << std::endl; result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, "result_len", entry_block_); params->push_back(result_len_ptr); has_arena_allocs_ = true; } + if (arrow_return_type_id == arrow::Type::LIST) { + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is list" << std::endl; + result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, + "result_len", entry_block_); + params->push_back(result_len_ptr); + has_arena_allocs_ = true; + + + } + + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall params are: " << std::endl; + for (auto p : *params) { + std::string str1; + llvm::raw_string_ostream output1(str1); + p->print(output1); + std::cout << str1 << std::endl; + } + // Make the function call llvm::IRBuilder<>* builder = ir_builder(); auto value = @@ -1520,6 +1563,8 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, (result_len_ptr == nullptr) ? 
nullptr : builder->CreateLoad(result_len_ptr->getAllocatedType(), result_len_ptr); + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE" << std::endl; + return std::make_shared(value, value_len); } } diff --git a/cpp/src/gandiva/llvm_types.cc b/cpp/src/gandiva/llvm_types.cc index f68fd098f6bef..68be62816f60e 100644 --- a/cpp/src/gandiva/llvm_types.cc +++ b/cpp/src/gandiva/llvm_types.cc @@ -43,7 +43,8 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) { {arrow::Type::type::DECIMAL, i128_type()}, {arrow::Type::type::INTERVAL_MONTHS, i32_type()}, {arrow::Type::type::STRUCT, struct_type()}, - {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}}; + {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}, + {arrow::Type::type::LIST, list_type()}}; } } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index 059f1d051f8ca..f235535423536 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -50,6 +51,8 @@ class GANDIVA_EXPORT LLVMTypes { return llvm::StructType::get(context_, {double_type(), double_type()}, false); } + llvm::VectorType* list_type() { return llvm::ScalableVectorType::get(i8_type(), (unsigned int)0); } + llvm::StructType* i128_split_type() { // struct with high/low bits (see decimal_ops.cc:DecimalSplit) return llvm::StructType::get(context_, {i64_type(), i64_type()}, false); @@ -95,6 +98,10 @@ class GANDIVA_EXPORT LLVMTypes { return llvm::ConstantFP::get(float_type(), val); } + llvm::LLVMContext* get_context() { + return &context_; + } + llvm::Constant* double_constant(double val) { return llvm::ConstantFP::get(double_type(), val); } @@ -117,7 +124,11 @@ class GANDIVA_EXPORT LLVMTypes { // offsets buffer is to separate data into list // not support nested list if (data_type->id() == arrow::Type::LIST) { - return IRType(data_type->field(0)->type()->id()); + //LR HACK + //std::cout << "LR Returning list 
type as type " << data_type->field(0)->type()->id()<< " for IR " << std::endl; + //return IRType(data_type->field(0)->type()->id()); + //return IRType(data_type->id()); + return i32_ptr_type(); } return IRType(data_type->id()); } diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index 4d1dca8f7cf4e..2b0f1ca2d51ae 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "arrow/util/macros.h" @@ -80,7 +81,9 @@ class GANDIVA_EXPORT ListLValue : public LValue { llvm::Value* validity = NULLPTR) : LValue(data, NULLPTR, validity), child_offsets_(child_offsets), - offsets_length_(offsets_length) {} + offsets_length_(offsets_length) { + std::cout << "LR Creating ListLValue " << std::endl; + } llvm::Value* child_offsets() { return child_offsets_; } diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index a95e80b148824..7ac3e499b2bf4 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -169,6 +169,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, ARROW_RETURN_IF(configuration == nullptr, Status::Invalid("Configuration cannot be null")); + std::cout << "LR Projector::Make 1" << std::endl; // see if equivalent projector was already built std::shared_ptr>> cache = LLVMGenerator::GetCache(); @@ -191,6 +192,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, std::unique_ptr llvm_gen; ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); + std::cout << "LR Projector::Make 2" << std::endl; if (!is_cached && sec_cache != nullptr) { std::shared_ptr arrow_buffer = sec_cache->Get(GetSecondaryCacheKey(cache_key.ToString())); @@ -208,6 +210,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, // Run the validation on the expressions. // Return if any of the expression is invalid since // we will not be able to process further. 
+ std::cout << "LR Projector::Make 3" << std::endl; if (!is_cached) { ExprValidator expr_validator(llvm_gen->types(), schema); for (auto& expr : exprs) { @@ -227,11 +230,13 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, output_fields.push_back(expr->result()); } + std::cout << "LR Projector::Make 4" << std::endl; // Instantiate the projector with the completely built llvm generator *projector = std::shared_ptr( new Projector(std::move(llvm_gen), schema, output_fields, configuration)); projector->get()->SetBuiltFromCache(is_cached); + std::cout << "LR Projector::Make 5" << std::endl; if (sec_cache != nullptr && is_cached == false) { std::shared_ptr sec_cached_obj = cache->GetObjectCode(cache_key); llvm::StringRef string_buffer = sec_cached_obj->getBuffer(); @@ -240,6 +245,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, sec_cache->Set(GetSecondaryCacheKey(cache_key.ToString()), arrow_buffer); } + std::cout << "LR Projector::Make DONE" << std::endl; return Status::OK(); } @@ -253,6 +259,7 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output_data_vecs) const { ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + std::cout << "LR the other Projector::Evaluate" << std::endl; if (output_data_vecs.size() != output_fields_.size()) { std::stringstream ss; ss << "number of buffers for output_data_vecs is " << output_data_vecs.size() @@ -260,8 +267,10 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, return Status::Invalid(ss.str()); } +std::cout << "LR the other Projector::Evaluate 1a" << std::endl; int idx = 0; for (auto& array_data : output_data_vecs) { + std::cout << "LR the other Projector::Evaluate checking array_data" << std::endl; if (array_data == nullptr) { std::stringstream ss; ss << "array for output field " << output_fields_[idx]->name() << "is null."; @@ -271,11 +280,48 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, auto num_rows = 
selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); + std::cout << "LR the other Projector::Evaluate about to validate capacity" << std::endl; ARROW_RETURN_NOT_OK( ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), num_rows)); ++idx; } - return llvm_generator_->Execute(batch, selection_vector, output_data_vecs); + std::cout << "LR the other Projector::Evaluate 2" << std::endl; + ARROW_RETURN_NOT_OK( + llvm_generator_->Execute(batch, selection_vector, output_data_vecs)); + + // Create and return array arrays. + + for (auto& array_data : output_data_vecs) { + if (array_data->type->id() == arrow::Type::LIST) { + auto child_data = array_data->child_data[0]; + std::cout << "LR the other Projector::Evaluate modifying child array " << + child_data->buffers[1]->ToString() << std::endl; + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } + auto new_child_data = arrow::ArrayData::Make( + child_data->type, child_data_size, child_data->buffers, child_data->offset); + array_data->child_data.clear(); + array_data->child_data.push_back(new_child_data); + //array_data = arrow::ArrayData::Make(array_data->type, array_data->length, + // array_data->buffers, {new_child_data}, + // array_data->null_count, array_data->offset); + } + + } + + + return Status::OK(); } Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* pool, @@ -447,15 +493,20 @@ Status 
Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, ARROW_RETURN_IF(array_data.buffers.size() < 2, Status::Invalid("ArrayData must have at least 2 buffers")); +std::cout << "LR ValidateArrayDataCapacity" << std::endl; int64_t min_bitmap_len = arrow::bit_util::BytesForBits(num_records); + std::cout << "LR ValidateArrayDataCapacity arra_data 0 is " << array_data.buffers[0] << std::endl; int64_t bitmap_len = array_data.buffers[0]->capacity(); + std::cout << "LR ValidateArrayDataCapacity" << std::endl; ARROW_RETURN_IF( bitmap_len < min_bitmap_len, Status::Invalid("Bitmap buffer too small for ", field.name(), " expected minimum ", min_bitmap_len, " actual size ", bitmap_len)); auto type_id = field.type()->id(); - if (arrow::is_binary_like(type_id) || type_id == arrow::Type::LIST) { + std::cout << "LR ValidateArrayDataCapacity" << std::endl; + //LR TODO + if (arrow::is_binary_like(type_id)) { //|| type_id == arrow::Type::LIST) { // validate size of offsets buffer. int64_t min_offsets_len = arrow::bit_util::BytesForBits((num_records + 1) * 32); int64_t offsets_len = array_data.buffers[1]->capacity(); @@ -477,7 +528,10 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, int64_t data_len = array_data.buffers[1]->capacity(); ARROW_RETURN_IF(data_len < min_data_len, Status::Invalid("Data buffer too small for ", field.name())); - } else { + } else if (type_id == arrow::Type::LIST) { + return Status::OK(); + } + else { return Status::Invalid("Unsupported output data type " + field.type()->ToString()); } diff --git a/cpp/src/gandiva/projector.h b/cpp/src/gandiva/projector.h index 24ec11e3eab59..53d0ef6d62431 100644 --- a/cpp/src/gandiva/projector.h +++ b/cpp/src/gandiva/projector.h @@ -154,14 +154,14 @@ class GANDIVA_EXPORT Projector { bool GetBuiltFromCache(); void Clear(); + /// Allocate an ArrowData of length 'length'. 
+ Status AllocArrayData(const DataTypePtr& type, int64_t num_records, + arrow::MemoryPool* pool, ArrayDataPtr* array_data) const; private: Projector(std::unique_ptr llvm_generator, SchemaPtr schema, const FieldVector& output_fields, std::shared_ptr); - /// Allocate an ArrowData of length 'length'. - Status AllocArrayData(const DataTypePtr& type, int64_t num_records, - arrow::MemoryPool* pool, ArrayDataPtr* array_data) const; /// Validate that the ArrayData has sufficient capacity to accommodate 'num_records'. Status ValidateArrayDataCapacity(const arrow::ArrayData& array_data, diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index 4906bbab7e47c..cc2daf0e59d93 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -149,6 +149,185 @@ TEST_F(TestList, TestListInt64) { _test_list_type_field_alias(list(int64()), array, pool_); } */ + + +TEST_F(TestList, TestListInt32) { + ArrayPtr array; + _build_list_array( + {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, + 50000}, + {5, 2, 3, 4, 1}, {true, false, true, true, true}, pool_, &array); + _test_list_type_field_alias(list(int32()), array, pool_); +} + +TEST_F(TestList, TestMakeArray) { + // schema for input fields + auto field_b = field("b", int32()); + auto schema = arrow::schema({field_b}); + + // output fields + auto res = field("res", list(int32())); + + // Create a row-batch with some sample data + int num_records = 5; + auto array_b = + MakeArrowArrayInt32({42, 43, 44, 45, 46}); + + // expected output + auto exp1 = MakeArrowArrayInt32({ 1, 2, 3, 42, 5}, + {true, true, true, true, true}); + + // auto exp = MakeArrowArrayArray({ 42, 42, 44, 45, 46}, + // {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_b}); + + // build expressions. 
+ // array_contains(a, b) + + //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + //std::vector field_nodes; + //auto node2 = TreeExprBuilder::MakeLiteral(42); + //field_nodes.push_back(node2); + + //auto func_node = TreeExprBuilder::MakeFunction("array_makeGandiva", {field_b}, res->type()); + //auto expr = TreeExprBuilder::MakeExpression(func_node, res); + std::cout << "LR test is about to make expression " << std::endl; + auto expr = TreeExprBuilder::MakeExpression("array_makeGandiva", {field_b}, res); + //////// + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + std::cout << "LR Test 2 " << std::endl; + //std::cout << "LR IR IS " << projector->DumpIR() << std::endl; + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); + + std::cout << "LR ==================================================================================== " << std::endl; + + + + //Try the second method. 
+ arrow::ArrayDataVector outputs2; + std::shared_ptr listDt = std::make_shared(); + std::shared_ptr dt = std::make_shared(listDt); + + + int num_records2 = 5; + std::vector> buffers; + + + + //int64_t size = arrow::bit_util::BytesForBits(num_records2); + int64_t size = 20; + auto bitmap_buffer = arrow::AllocateBuffer(size, pool_); + buffers.push_back(*std::move(bitmap_buffer)); + auto offsets_len = arrow::bit_util::BytesForBits((num_records2 + 1) * 32); + + auto offsets_buffer = arrow::AllocateBuffer(offsets_len*10, pool_); + buffers.push_back(*std::move(offsets_buffer)); + + std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; + //auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, 0, offsets_len); + //outputs2.push_back(array_data); + + + +std::vector> buffers2; +auto bitmap_buffer2 = arrow::AllocateBuffer(size, pool_); + buffers2.push_back(*std::move(bitmap_buffer2)); + + auto offsets_buffer2 = arrow::AllocateBuffer(offsets_len, pool_); + buffers2.push_back(*std::move(offsets_buffer2)); +std::shared_ptr dt2 = std::make_shared(); + + auto array_data_child = arrow::ArrayData::Make(dt2, num_records2, buffers2, 0, 0); + array_data_child->buffers = std::move(buffers2); + + std::vector> kids; + kids.push_back(array_data_child); + + +auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, kids, 0, 0); +array_data->buffers = std::move(buffers); +outputs2.push_back(array_data); + +std::cout << "LR Test " << array_data << " arra_data 0 is " << array_data->buffers[0] << std::endl; + //std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; + std::cout << "LR about to evaluate 2nd " << std::endl; + + status = projector->Evaluate(*(in_batch.get()), outputs2); + EXPECT_TRUE(status.ok()) << status.message(); + arrow::ArrayData ad = *outputs2.at(0); + arrow::ArraySpan sp(*ad.child_data.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); + + + + +for (auto& array_data : outputs2) { + auto child_data = 
array_data->child_data[0]; + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + /* when allocate array data, child data length is an initialized value, + * after calculating, child data offsets buffer has been resized for results, + * but array data length is unchanged. + * We should recalculate child data length and make ArrayData with new length + * + * Otherwise, child data offsets buffer length is data length + 1 + * and offset data is int32_t, need use buffer->size()/4 - 1 + */ + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } + auto new_child_data = arrow::ArrayData::Make( + child_data->type, child_data_size, child_data->buffers, child_data->offset); + array_data = arrow::ArrayData::Make(array_data->type, array_data->length, + array_data->buffers, {new_child_data}, + array_data->null_count, array_data->offset); + + + auto newArray = arrow::MakeArray(array_data); + //arrow::ArraySpan sp(newArray); + EXPECT_ARROW_ARRAY_EQUALS(exp1, newArray); +} + + + + std::cout << "LR ====================THIRD=WAY================================== " << std::endl; + { + std::shared_ptr listDt = std::make_shared(); + std::shared_ptr dt = std::make_shared(listDt); + +ArrayDataPtr output_data; + auto s = projector->AllocArrayData(dt, num_records2, pool_, &output_data); + ArrayDataVector output_data_vecs; + output_data_vecs.push_back(output_data); + + status = projector->Evaluate(*(in_batch.get()), output_data_vecs); + EXPECT_TRUE(status.ok()) << status.message(); + 
arrow::ArraySpan sp(*output_data_vecs.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); + } +} + +/* TEST_F(TestList, TestListArrayInt32) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); @@ -162,14 +341,6 @@ TEST_F(TestList, TestListArrayInt32) { true); } -TEST_F(TestList, TestListInt32) { - ArrayPtr array; - _build_list_array( - {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, - 50000}, - {5, 2, 3, 4, 1}, {true, false, true, true, true}, pool_, &array); - _test_list_type_field_alias(list(int32()), array, pool_); -} TEST_F(TestList, TestListInt32LiteralContains) { // schema for input fields @@ -289,9 +460,7 @@ TEST_F(TestList, TestListFloat64) { _test_list_type_field_alias(list(float64()), array, pool_); } -/* - * array_length(a) - */ + TEST_F(TestList, TestListUtf8Length) { // schema for input fields auto field_a = field("a", list(utf8())); @@ -371,9 +540,7 @@ TEST_F(TestList, TestListUtf8LengthWithInvalidData) { EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); } -/* - * array_contains(a, "element") - */ + TEST_F(TestList, TestListUtf8Contains) { // schema for input fields auto field_a = field("a", list(utf8())); @@ -417,5 +584,5 @@ TEST_F(TestList, TestListUtf8Contains) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); } - +*/ } // namespace gandiva diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 45c7d7bfe7647..87f7bb16fe12a 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -144,9 +144,8 @@ NodePtr TreeExprBuilder::MakeOr(const NodeVector& children) { static bool print_expr = false; ExpressionPtr TreeExprBuilder::MakeExpression(NodePtr root_node, FieldPtr result_field) { - if (true || print_expr) { - std::cout << "Expression: " << root_node->ToString() << "\n"; - } + std::cout << "LR Expression: " << root_node->ToString() << "\n"; + if (result_field == nullptr) { std::cout << "LR 
MakeExpression result_field is null" << std::endl; return nullptr; @@ -165,7 +164,9 @@ ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function, auto node = MakeField(field); field_nodes.push_back(node); } + std::cout << "LR MakeExpression making function for " << function << std::endl; auto func_node = MakeFunction(function, field_nodes, out_field->type()); + std::cout << "LR MakeExpression function is " << func_node->ToString() << std::endl; return MakeExpression(func_node, out_field); } diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 42cb62ec401ba..4b4fd27e0c995 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -631,6 +631,9 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, buffers.push_back(offsets); } ////////// + + + auto type = field->type(); auto type_id = type->id(); //num_rows = num_records or ?? @@ -661,25 +664,27 @@ auto type_id = type->id(); } } - jlong offsets_addr = in_buf_addrs[buf_idx++]; - jlong offsets_size = in_buf_sizes[sz_idx++]; - auto data_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); - + std::cout << "LR New ArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { + jlong offsets_addr = in_buf_addrs[buf_idx++]; + jlong offsets_size = in_buf_sizes[sz_idx++]; + auto data_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); + + std::cout << "LR New ArrayData List" << std::endl; auto internal_type = type->field(0)->type(); std::shared_ptr child_data; if (arrow::is_primitive(internal_type->id())) { std::cout << "LR New ArrayData List 1" << std::endl; - child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, + child_data = arrow::ArrayData::Make(internal_type, 0, {nullptr, std::move(data_buffer)}, 0); } if (arrow::is_binary_like(internal_type->id())) { std::cout << "LR New ArrayData List 
NYI 2" << std::endl; - /*child_data = arrow::ArrayData::Make( - internal_type, 0, - {nullptr, std::move(data_buffer), std::move(child_data)}, 0);*/ + //child_data = arrow::ArrayData::Make( + // internal_type, 0, + // {nullptr, std::move(data_buffer), std::move(child_data)}, 0); } auto array_data = arrow::ArrayData::Make(type, num_rows, {std::move(buffers[0]), std::move(buffers[1])}, {child_data}); @@ -971,6 +976,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " << " Made a recordbatch num_rows " << num_rows << in_batch->ToString() + << " there are " << out_bufs_len << " buffers " << std::endl; std::shared_ptr selection_vector; @@ -1030,15 +1036,56 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( "null"); break; } + + buffers.push_back(std::make_shared( + env, jexpander, output_vector_idx, value_buf, data_sz)); + } else if (field->type()->id() == arrow::Type::LIST) { buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, value_buf, data_sz)); } else { buffers.push_back(std::make_shared(value_buf, data_sz)); } + if (field->type()->id() == arrow::Type::LIST) { + + std::vector> child_buffers; + + CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); + uint8_t* child_valid_buf = reinterpret_cast(out_bufs[buf_idx++]); + child_buffers.push_back(std::make_shared( + env, jexpander, output_vector_idx, child_valid_buf, data_sz)); + + CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); + uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); + child_buffers.push_back(std::make_shared( + env, jexpander, output_vector_idx, child_offset_buf, data_sz)); + + CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); + uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); + child_buffers.push_back(std::make_shared( + env, jexpander, output_vector_idx, child_data_buf, data_sz)); + + std::shared_ptr dt2 = 
std::make_shared(); + auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); + //array_data_child-> + + + std::vector> kids; + kids.push_back(array_data_child); + //auto array_data = std::make_shared(field->type(), output_row_count); + auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers, kids); + array_data->child_data = std::move(kids); + output.push_back(array_data); + ++output_vector_idx; + + std::cout << "LR jni_common there are " << buffers.size() << " buffers" << std::endl; + + } else { auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers); output.push_back(array_data); ++output_vector_idx; + } + } if (!status.ok()) { break; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 5982ea31d6239..8981ed8569c1c 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -30,6 +30,7 @@ import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.message.ArrowBuffer; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; @@ -331,6 +332,7 @@ private void evaluate(int numRows, List buffers, List buf throw new EvaluatorClosedException(); } + logger.error("LR Projector.java evaluate"); if (numExprs != outColumns.size()) { logger.info("Expected " + numExprs + " columns, got " + outColumns.size()); throw new GandivaException("Incorrect number of columns for the output vector"); @@ -351,11 +353,20 @@ private void evaluate(int numRows, List buffers, List buf boolean hasVariableWidthColumns = false; 
BaseVariableWidthVector[] resizableVectors = new BaseVariableWidthVector[outColumns.size()]; + long[] outAddrs = new long[3 * outColumns.size()]; long[] outSizes = new long[3 * outColumns.size()]; + idx = 0; int outColumnIdx = 0; for (ValueVector valueVector : outColumns) { + if (valueVector instanceof ListVector) { + //LR HACK there is only one column. + logger.error("LR Projector.java evaluate out columns=" + outColumns.size()); + outAddrs = new long[5 * outColumns.size()]; + outSizes = new long[5 * outColumns.size()]; + } + /*boolean isFixedWith = valueVector instanceof FixedWidthVector;*/ boolean isVarWidth = valueVector instanceof VariableWidthVector; /*if (!isFixedWith && !isVarWidth) { @@ -376,6 +387,25 @@ private void evaluate(int numRows, List buffers, List buf if (valueVector instanceof StructVector) { outAddrs[idx] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().memoryAddress(); outSizes[idx++] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().capacity(); + } + if (valueVector instanceof ListVector) { + outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); + outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); + + //vector valid + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); + + + //vector offset + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).capacity(); + + //vector data + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).capacity(); + //LR HACK TODO ((ListVector) valueVector).getDataVector().capacity(); + } else { outAddrs[idx] = 
valueVector.getDataBuffer().memoryAddress(); outSizes[idx++] = valueVector.getDataBuffer().capacity(); diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java index 8dd759ee885d2..df0fd8639b231 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java @@ -48,6 +48,7 @@ import org.apache.arrow.vector.IntervalYearVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.holders.NullableIntervalDayHolder; import org.apache.arrow.vector.holders.NullableIntervalYearHolder; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -57,6 +58,7 @@ import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; import org.junit.Ignore; @@ -288,6 +290,54 @@ public void testEvaluate() throws GandivaException, Exception { eval.close(); } + @Test + public void testEvaluateArray() throws GandivaException, Exception { + ArrowType int32 = new ArrowType.Int(32, true); + ArrowType listInt32 = new ArrowType.List(); + + Field a = Field.nullable("a", int32); + List args = Lists.newArrayList(a); + + Field retType = Field.nullable("c", listInt32); + ExpressionTree root = TreeBuilder.makeExpression("array_makeGandiva", args, retType); + + List exprs = Lists.newArrayList(root); + + Schema schema = new Schema(args); + Projector eval = Projector.make(schema, exprs); + + int numRows = 16; + byte[] validity = new byte[]{(byte) 255, 0}; + // second half is "undefined" + int[] aValues = new int[]{1, 2, 3, 42, 5}; + + + 
ArrowBuf validitya = buf(validity); + ArrowBuf valuesa = intBuf(aValues); + ArrowRecordBatch batch = + new ArrowRecordBatch( + numRows, + Lists.newArrayList(new ArrowFieldNode(numRows, 5)), + Lists.newArrayList(validitya, valuesa)); + + FieldType ft = new FieldType(true, int32, null); + ListVector intVector = new ListVector("result", allocator, ft, null); + //ListVector.allocateNew(numRows); + + List output = new ArrayList(); + output.add(intVector); + eval.evaluate(batch, output); + + System.out.println(intVector.getDataVector()); + + + + // free buffers + releaseRecordBatch(batch); + releaseValueVectors(output); + eval.close(); + } + @Test public void testEvaluateDivZero() throws GandivaException, Exception { Field a = Field.nullable("a", int32); From 0db2c7a7facfb04fc21f593b6447fe642ee799be Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 7 Sep 2023 17:55:05 -0700 Subject: [PATCH 11/46] lists kind of working --- cpp/src/gandiva/annotator.cc | 4 +- cpp/src/gandiva/array_ops.cc | 43 +++++++++- cpp/src/gandiva/array_ops.h | 6 ++ cpp/src/gandiva/function_registry_array.cc | 3 + cpp/src/gandiva/gdv_function_stubs.cc | 7 ++ cpp/src/gandiva/projector.cc | 13 +++- cpp/src/gandiva/tests/list_test.cc | 2 +- java/gandiva/src/main/cpp/jni_common.cc | 58 ++++++++++++-- .../arrow/gandiva/evaluator/Projector.java | 78 ++++++++++++++++--- 9 files changed, 191 insertions(+), 23 deletions(-) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index d586de7076f7c..81f217307481c 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -128,7 +128,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); std::cout << "LR Annotator::PrepareBuffersForField 4b" << std::endl; } else { - std::cout << "LR Annotator::PrepareBuffersForField 5 buffer_idx " << buffer_idx << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField 5 " << desc.Name() << " 
buffer_idx " << buffer_idx << std::endl; std::cout << "LR Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; uint8_t* data_buf = @@ -139,6 +139,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, if (is_output) { // pass in the Buffer object for output data buffers. Can be used for resizing. + if (array_data.type->id() != arrow::Type::LIST) { uint8_t* data_buf_ptr = reinterpret_cast(array_data.buffers[buffer_idx].get()); @@ -153,6 +154,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, array_data.child_data.at(0)->offset); } } + } EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index c1dbf3c380691..dda2129998dbf 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -84,7 +84,7 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int32_t* out_len) { std::cout << "LR array_int32_make_array offset data=" << contains_data << std::endl; - int integers[] = { 1, 2, 3, contains_data, 5 }; + int integers[] = { contains_data, 21, 3, contains_data, 5 }; *out_len = 5;// * 4; //length is number of items, but buffers must account for byte size. 
uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); @@ -96,6 +96,34 @@ int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int3 return reinterpret_cast(ret); } +int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { + std::cout << "LR array_int32_remove offset data=" << remove_data << std::endl; + + //LR sizes are HACK + int* integers = new int[5]; + int j = 0; + for (int i = 0; i < entry_offsets_len; i++) { + std::cout << "LR going to check " << entry_buf + i << std::endl; + int32_t entry_len = *(entry_buf + (i * 2)); + std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + if (entry_len == remove_data) { + continue; + } else { + integers[j++] = entry_len; + } + } + + *out_len = 5;// * 4; + //length is number of items, but buffers must account for byte size. + uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); + memcpy(ret, integers, *out_len * 4); + std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + + delete [] integers; + //return reinterpret_cast(ret); + return reinterpret_cast(ret); +} int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len) { @@ -148,13 +176,22 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { args = {types->i64_type(), // int64_t execution_context - types->i32_type(), - types->i32_ptr_type()}; // int32_t contains data length + types->i32_type(), // array item input + types->i32_ptr_type()}; // out array length engine->AddGlobalMappingForFunc("array_int32_make_array", types->i32_ptr_type(), args, reinterpret_cast(array_int32_make_array)); + args = {types->i64_type(), // int64_t execution_context + types->i32_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t child offsets length + 
types->i32_type(), //value to remove from input + types->i32_ptr_type()}; // out array length + + engine->AddGlobalMappingForFunc("array_int32_remove", + types->i32_ptr_type(), args, + reinterpret_cast(array_int32_remove)); } } // namespace gandiva diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index b41fe2a086e8f..76c158f0e27f3 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -47,4 +47,10 @@ GANDIVA_EXPORT int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int32_t* out_len); + +GANDIVA_EXPORT +int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, + int32_t remove_data, + int32_t* out_len); } diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index 638c09b72a534..dc81b6b4601c3 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -37,6 +37,9 @@ std::vector GetArrayFunctionRegistry() { NativeFunction("array_makeGandiva", {}, DataTypeVector{int32()}, list(int32()), kResultNullIfNull, "array_int32_make_array", NativeFunction::kNeedsContext), + NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int32()), int32()}, + list(int32()), kResultNullIfNull, "array_int32_remove", + NativeFunction::kNeedsContext), /*NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int64()), int64()}, boolean(), kResultNullIfNull, "array_int64_contains_int64", NativeFunction::kNeedsContext),*/ diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 793d4f68feb74..ac157a90d4300 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -164,8 +164,11 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ int32_t* offsets, int64_t slot, \ TYPE* entry_buf, 
int32_t entry_len) { \ + std::cout << "gdv_fn_populate 1" << std::endl; \ auto buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ + std::cout << "gdv_fn_populate 2 data_ptr" << data_ptr << " buffer " << buffer << \ + " offset " << offset << " entry_len " << entry_len << " scale " << SCALE << std::endl; \ auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ if (!status.ok()) { \ gandiva::ExecutionContext* context = \ @@ -173,7 +176,11 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, context->set_error_msg(status.message().c_str()); \ return -1; \ } \ + std::cout << "gdv_fn_populate resized buffer to =" << offset + entry_len * SCALE << std::endl; \ + std::cout << "gdv_fn_populate copying bytes =" << entry_len * SCALE << std::endl; \ + std::cout << "gdv_fn_populate buffer =" << buffer->ToString() << " offeset " << offset << std::endl; \ memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ + std::cout << "gdv_fn_populate buffer after =" << buffer->ToString() << std::endl; \ offsets[slot] = offset / SCALE; \ offsets[slot + 1] = offset / SCALE + entry_len; \ return 0; \ diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 7ac3e499b2bf4..34abedf1081e4 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -292,10 +292,16 @@ std::cout << "LR the other Projector::Evaluate 1a" << std::endl; // Create and return array arrays. 
for (auto& array_data : output_data_vecs) { + if (array_data->type->id() == arrow::Type::LIST) { auto child_data = array_data->child_data[0]; std::cout << "LR the other Projector::Evaluate modifying child array " << child_data->buffers[1]->ToString() << std::endl; + std::cout << "LR the other Projector::Evaluate child array[3] " << + int32_t( (*child_data->buffers[1])[3*4]) << std::endl; + //std::cout << "LR the other Projector::Evaluate modifying child0 array " << + //child_data->buffers[0]->ToString() << std::endl; + int64_t child_data_size = 1; if (arrow::is_binary_like(child_data->type->id())) { @@ -313,6 +319,11 @@ std::cout << "LR the other Projector::Evaluate 1a" << std::endl; child_data->type, child_data_size, child_data->buffers, child_data->offset); array_data->child_data.clear(); array_data->child_data.push_back(new_child_data); + + std::cout << "LR the other Projector::Evaluate child data size " << child_data_size << std::endl; + std::cout << "LR the other Projector::Evaluate after modifying child array[3] " << + int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; + //array_data = arrow::ArrayData::Make(array_data->type, array_data->length, // array_data->buffers, {new_child_data}, // array_data->null_count, array_data->offset); @@ -453,7 +464,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, std::cout << "LR Projector::AllocArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { - std::cout << "LR Projector::AllocArrayData List" << std::endl; + std::cout << "LR Projector::AllocArrayData List. 
There are number of buffers=" << buffers.size() << std::endl; auto internal_type = type->field(0)->type(); ArrayDataPtr child_data; if (arrow::is_primitive(internal_type->id())) { diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index cc2daf0e59d93..7936873d073c0 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -212,7 +212,7 @@ TEST_F(TestList, TestMakeArray) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); - std::cout << "LR ==================================================================================== " << std::endl; + std::cout << "LR ==============================SECOND=WAY==================================================== " << std::endl; diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 4b4fd27e0c995..c737309b2deac 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -1013,12 +1013,14 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( for (FieldPtr field : ret_types) { std::vector> buffers; + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer" << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* validity_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong bitmap_sz = out_sizes[sz_idx++]; buffers.push_back(std::make_shared(validity_buf, bitmap_sz)); if (arrow::is_binary_like(field->type()->id())) { + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding buffer" << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* offsets_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong offsets_sz = out_sizes[sz_idx++]; @@ -1037,29 +1039,30 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 
adding buffer size=" << data_sz << std::endl; buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, value_buf, data_sz)); } else if (field->type()->id() == arrow::Type::LIST) { + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding buffer size=" << data_sz << std::endl; buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, value_buf, data_sz)); } else { buffers.push_back(std::make_shared(value_buf, data_sz)); } - + if (field->type()->id() == arrow::Type::LIST) { std::vector> child_buffers; - CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); - uint8_t* child_valid_buf = reinterpret_cast(out_bufs[buf_idx++]); - child_buffers.push_back(std::make_shared( - env, jexpander, output_vector_idx, child_valid_buf, data_sz)); + + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, child_offset_buf, data_sz)); + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( @@ -1080,7 +1083,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( std::cout << "LR jni_common there are " << buffers.size() << " buffers" << std::endl; - } else { + } else { auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers); output.push_back(array_data); ++output_vector_idx; @@ -1090,9 +1093,52 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( if (!status.ok()) { break; } + + std::cout << "LR jni_common calling evaluate" << std::endl; status = 
holder->projector()->Evaluate(*in_batch, selection_vector.get(), output); + //LRtest1 + std::cout << "LR jni_common after evaluating the output size is " << output.size() << std::endl; + arrow::ArraySpan sp(*(output[0])); + std::cout << "LR jni_common after evaluating the output 0 is " << sp.ToArray()->ToString() << std::endl; + auto array_data = output[0]; + if (array_data->type->id() == arrow::Type::LIST) { + auto child_data = array_data->child_data[0]; + std::cout << "LR jni_common child array[3] " << + int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; + std::cout << "LR jni_common child array[0] " << + int32_t( (*(array_data->child_data[0])->buffers[1])[0*4]) << std::endl; + std::cout << "LR jni_common child via data ptr array[0] " << + int32_t( *(*(array_data->child_data[0])->buffers[1]).data()) << std::endl; + std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" + << (array_data->child_data[0])->length << std::endl; + + //LRTest1 Start + int numRecords = 5; + //int numRecords = (array_data->child_data[0])->length * array_data->length; + int recordSize = numRecords * 4; + + memcpy(&out_bufs[3], (array_data->child_data[0])->buffers[1]->data(), recordSize); + out_sizes[3] = recordSize; + + + //validity buffer? + bool valid[] = {true, true, true, true, true}; + memcpy(&out_bufs[2], valid, 5); + out_sizes[2] = 5; + + //offset buffer is not needed. 
+ //int32_t offsetsBuffer[] = {0}; + //memcpy(&out_bufs[1], offsetsBuffer, 1 * 4); + //out_sizes[1] = 1; + + std::cout << "LR jni_common after copy parent buff child array[0] " << + int32_t( (out_bufs[3])) << std::endl; + //LRTest1 End + } + } while (0); + env->ReleaseLongArrayElements(buf_addrs, in_buf_addrs, JNI_ABORT); env->ReleaseLongArrayElements(buf_sizes, in_buf_sizes, JNI_ABORT); env->ReleaseLongArrayElements(out_buf_addrs, out_bufs, JNI_ABORT); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 8981ed8569c1c..0625b4830b4f8 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -27,12 +27,15 @@ import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.gandiva.ipc.GandivaTypes.SelectionVectorType; import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.ReferenceManager; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.message.ArrowBuffer; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Schema; @@ -360,12 +363,12 @@ private void evaluate(int numRows, List buffers, List buf idx = 0; int outColumnIdx = 0; for (ValueVector valueVector : outColumns) { - if (valueVector instanceof ListVector) { - //LR HACK there is only one column. 
- logger.error("LR Projector.java evaluate out columns=" + outColumns.size()); - outAddrs = new long[5 * outColumns.size()]; - outSizes = new long[5 * outColumns.size()]; - } + if (valueVector instanceof ListVector) { + //LR HACK there is only one column. + logger.error("LR Projector.java evaluate out columns=" + outColumns.size()); + outAddrs = new long[5 * outColumns.size()]; + outSizes = new long[5 * outColumns.size()]; + } /*boolean isFixedWith = valueVector instanceof FixedWidthVector;*/ boolean isVarWidth = valueVector instanceof VariableWidthVector; @@ -389,22 +392,40 @@ private void evaluate(int numRows, List buffers, List buf outSizes[idx++] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().capacity(); } if (valueVector instanceof ListVector) { + + + List fieldBufs = ((ListVector) valueVector).getDataVector().getFieldBuffers(); + logger.error("LR Projector.java evaluate ListVector has buffers=" + fieldBufs.size()); + + outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); //vector valid - outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getValidityBufferAddress(); outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); //vector offset - outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).memoryAddress(); - outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).capacity(); + logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); //vector data - outAddrs[idx] = ((ListVector) 
valueVector).getDataVector().getFieldBuffers().get(2).memoryAddress(); - outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).capacity(); + //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).memoryAddress(); + //outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).capacity(); + //LR HACK TODO ((ListVector) valueVector).getDataVector().capacity(); + + + + + + + + + + } else { outAddrs[idx] = valueVector.getDataBuffer().memoryAddress(); @@ -415,12 +436,47 @@ private void evaluate(int numRows, List buffers, List buf outColumnIdx++; } + logger.error("LR Projector.java evaluate calling evaluateProjector with buffers=" + idx); wrapper.evaluateProjector( hasVariableWidthColumns ? new VectorExpander(resizableVectors) : null, this.moduleId, numRows, bufAddrs, bufSizes, selectionVectorType, selectionVectorRecordCount, selectionVectorAddr, selectionVectorSize, outAddrs, outSizes); + + //outColumns.clear(); + //FieldType ft = new FieldType(true, int32, null); + //ListVector lv = new ListVector("res", allocator, ft, null); + //System.out.println(intVector.getDataVector()); + + + //logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3][3 * 4]); + for (ValueVector valueVector : outColumns) { + if (valueVector instanceof ListVector) { + //LR HACK + + int numRecordsFound = 5; + //int numRecordsFound = Math.toIntExact(outSizes[3]) / 4; + logger.error("LR Projector.java using outsizes numRecords=" + numRecordsFound); + + ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + List outBufsNew = new ArrayList(); + + outBufsNew.add(ab); + outBufsNew.add(ab2); + ArrowFieldNode afn = new ArrowFieldNode(numRecordsFound, 0); + ((ListVector) valueVector).getDataVector().clear(); + ((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, 
outBufsNew); + //byte[] valid = new byte[outsizes[2]]; + //LR HACK + //for (int i = 0; i < outSizes[2]; i++) { + for (int i = 0; i < numRecordsFound * 10; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); + } + } + } + } /** From 89d9f2dd6b6827885984d7a6c765faf2c29013c3 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 8 Sep 2023 16:20:28 -0700 Subject: [PATCH 12/46] add stuff. 100 rows --- build_release.sh | 33 +++++++++++++++++ build_testing.sh | 37 +++++++++++++++++++ cpp/src/gandiva/engine.cc | 3 +- java/gandiva/src/main/cpp/jni_common.cc | 2 +- .../arrow/gandiva/evaluator/Projector.java | 6 ++- 5 files changed, 76 insertions(+), 5 deletions(-) create mode 100755 build_release.sh create mode 100755 build_testing.sh diff --git a/build_release.sh b/build_release.sh new file mode 100755 index 0000000000000..533467d0484d0 --- /dev/null +++ b/build_release.sh @@ -0,0 +1,33 @@ +rm -rf cpp-jni java-dist java-jni cpp/debug +mkdir cpp/debug +cd cpp/debug + +arch -x86_64 cmake -DCMAKE_BUILD_TYPE=RELEASE -DARROW_GANDIVA=ON -DARROW_JEMALLOC=OFF -DARROW_GANDIVA_JAVA=ON -DARROW_BUILD_TESTS=OFF .. +arch -x86_64 make -j 8 +if [ $? -ne 0 ] +then + echo "failed" + exit 1 +fi + +cd ../../ +mkdir -p java-jni cpp-jni + +arch -x86_64 cmake -S cpp -B cpp-jni -DARROW_BUILD_SHARED=OFF -DARROW_JEMALLOC=OFF -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON -DARROW_GANDIVA=ON -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON -DARROW_ORC=ON -DARROW_PARQUET=ON -DARROW_S3=ON -DARROW_USE_CCACHE=ON -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_UNITY_BUILD=ON +arch -x86_64 cmake --build cpp-jni --target install --config Release +if [ $? 
-ne 0 ] +then + echo "failed" + exit 1 +fi + +arch -x86_64 cmake -S java -B java-jni -DARROW_JAVA_JNI_ENABLE_C=OFF -DARROW_JEMALLOC=OFF -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_PREFIX_PATH=$PWD/java-dist/lib/x86_64/cmake +arch -x86_64 cmake --build java-jni --target install --config Release +if [ $? -ne 0 ] +then + echo "failed" + exit 1 +fi + +cd java +/opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Parrow-jni clean install diff --git a/build_testing.sh b/build_testing.sh new file mode 100755 index 0000000000000..e270f3758101c --- /dev/null +++ b/build_testing.sh @@ -0,0 +1,37 @@ +rm -rf cpp-jni java-dist java-jni cpp/debug +mkdir cpp/debug +cd cpp/debug + +echo "====CPP====" +arch -x86_64 cmake -DCMAKE_BUILD_TYPE=DEBUG -DARROW_GANDIVA=ON -DARROW_JEMALLOC=OFF -DARROW_GANDIVA_JAVA=ON -DARROW_BUILD_TESTS=ON .. +arch -x86_64 make -j 8 +if [ $? -ne 0 ] +then + echo "failed" + exit 1 +fi + +cd ../../ +mkdir -p java-jni cpp-jni + +echo "====CPP-JNI====" +arch -x86_64 cmake -S cpp -B cpp-jni -DARROW_BUILD_SHARED=OFF -DARROW_JEMALLOC=OFF -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON -DARROW_GANDIVA=ON -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON -DARROW_ORC=ON -DARROW_PARQUET=ON -DARROW_S3=ON -DARROW_USE_CCACHE=ON -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_UNITY_BUILD=ON +arch -x86_64 cmake --build cpp-jni --target install --config Debug +if [ $? 
-ne 0 ] +then + echo "failed" + exit 1 +fi + +echo "====JAVA-JNI====" +arch -x86_64 cmake -S java -B java-jni -DARROW_JAVA_JNI_ENABLE_C=OFF -DARROW_JEMALLOC=OFF -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_PREFIX_PATH=$PWD/java-dist/lib/x86_64/cmake -DArrowTesting_DIR=$PWD/cpp/debug/src/arrow +arch -x86_64 cmake --build java-jni --target install --config Debug +if [ $? -ne 0 ] +then + echo "failed" + exit 1 +fi + +echo "====JARS====" +cd java +/opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Parrow-jni clean install diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 8c4b03dd55a06..80e60ab7ba721 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -300,7 +300,6 @@ Status Engine::FinalizeModule() { if (!cached_) { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); -/* //LR Turning this off seems to provide better error messages with compilation/generation failures. if (optimize_) { // misc passes to allow for inlining, vectorization, .. 
@@ -326,7 +325,7 @@ Status Engine::FinalizeModule() { pass_builder.populateModulePassManager(*pass_manager); pass_manager->run(*module_); } -*/ + ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()), Status::CodeGenError("Module verification failed after optimizer")); } diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index c737309b2deac..3d02488a9f973 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -1113,7 +1113,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( << (array_data->child_data[0])->length << std::endl; //LRTest1 Start - int numRecords = 5; + int numRecords = 5 * 100; //int numRecords = (array_data->child_data[0])->length * array_data->length; int recordSize = numRecords * 4; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 0625b4830b4f8..fe040ab4382e8 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -455,14 +455,16 @@ private void evaluate(int numRows, List buffers, List buf if (valueVector instanceof ListVector) { //LR HACK - int numRecordsFound = 5; + int numRecordsFound = 5 * 100; //int numRecordsFound = Math.toIntExact(outSizes[3]) / 4; logger.error("LR Projector.java using outsizes numRecords=" + numRecordsFound); + //ArrowBuf ab0 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); List outBufsNew = new ArrayList(); + //outBufsNew.add(ab0); outBufsNew.add(ab); outBufsNew.add(ab2); ArrowFieldNode afn = new ArrowFieldNode(numRecordsFound, 0); @@ -471,7 +473,7 @@ private void evaluate(int 
numRows, List buffers, List buf //byte[] valid = new byte[outsizes[2]]; //LR HACK //for (int i = 0; i < outSizes[2]; i++) { - for (int i = 0; i < numRecordsFound * 10; i++) { + for (int i = 0; i < numRecordsFound; i++) { BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); } } From 4249251a3b0a6f941c19d9c4b86555efe1b42032 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Tue, 12 Sep 2023 09:30:53 -0700 Subject: [PATCH 13/46] Removed logging and testing bigger size --- cpp/src/gandiva/annotator.cc | 46 ++++--- cpp/src/gandiva/array_ops.cc | 24 ++-- cpp/src/gandiva/expr_decomposer.cc | 17 ++- cpp/src/gandiva/function_registry.cc | 4 +- cpp/src/gandiva/gdv_function_stubs.cc | 9 +- cpp/src/gandiva/llvm_generator.cc | 126 +++++++++--------- cpp/src/gandiva/lvalue.h | 2 +- cpp/src/gandiva/projector.cc | 60 ++++----- cpp/src/gandiva/tree_expr_builder.cc | 8 +- java/gandiva/src/main/cpp/jni_common.cc | 72 +++++----- .../arrow/gandiva/evaluator/Projector.java | 18 ++- 11 files changed, 207 insertions(+), 179 deletions(-) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index 81f217307481c..2d91ba43ab435 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -53,7 +53,7 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { } if (field->type()->id() == arrow::Type::LIST) { - std::cout << "LR Annotator::MakeDesc 1" << std::endl; + //std::cout << "LR Annotator::MakeDesc 1" << std::endl; offsets_idx = buffer_count_++; if (arrow::is_binary_like(field->type()->field(0)->type()->id())) { child_offsets_idx = buffer_count_++; @@ -81,37 +81,42 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // The validity buffer is optional. Use nullptr if it does not have one. 
if (array_data.buffers[buffer_idx]) { uint8_t* validity_buf = const_cast(array_data.buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -6 " << &validity_buf << std::endl; eval_batch->SetBuffer(desc.validity_idx(), validity_buf, array_data.offset); } else { + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -5 null " << std::endl; eval_batch->SetBuffer(desc.validity_idx(), nullptr, array_data.offset); } ++buffer_idx; if (desc.HasOffsetsIdx()) { uint8_t* offsets_buf = const_cast(array_data.buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -4 " << &offsets_buf << std::endl; eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset); if (desc.HasChildOffsetsIdx()) { - std::cout << "LR Annotator::PrepareBuffersForField 1 for field " << desc.Name() << " type is " << array_data.type->id() << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 1 for field " << desc.Name() << " type is " << array_data.type->id() << std::endl; if (is_output) { // if list field is output field, we should put buffer pointer into eval batch // for resizing uint8_t* child_offsets_buf = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3 " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); } else { - std::cout << "LR Annotator::PrepareBuffersForField 2" << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 2" << std::endl; // if list field is input field, just put buffer data into eval batch uint8_t* child_offsets_buf = const_cast( array_data.child_data.at(0)->buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2 " << &child_offsets_buf << std::endl; 
eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); } } if (array_data.type->id() != arrow::Type::LIST || arrow::is_binary_like(array_data.type->field(0)->type()->id())) { - std::cout << "LR Annotator::PrepareBuffersForField 3" << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 3" << std::endl; // primitive type list data buffer index is 1 // binary like type list data buffer index is 2 @@ -120,21 +125,23 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, } if (array_data.type->id() != arrow::Type::LIST) { - std::cout << "LR Annotator::PrepareBuffersForField 4" << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 4" << std::endl; - std::cout << "LR Annotator::PrepareBuffersForField 4 buffer_idx " << buffer_idx << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 4 buffer_idx " << buffer_idx << std::endl; uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField 4a" << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 4a" << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -1 " << &data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); - std::cout << "LR Annotator::PrepareBuffersForField 4b" << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 4b" << std::endl; } else { - std::cout << "LR Annotator::PrepareBuffersForField 5 " << desc.Name() << " buffer_idx " << buffer_idx << std::endl; - std::cout << "LR Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 5 " << desc.Name() << " buffer_idx " << buffer_idx << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; uint8_t* data_buf = 
const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 0 " << &data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); - std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; } if (is_output) { @@ -143,13 +150,16 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, if (array_data.type->id() != arrow::Type::LIST) { uint8_t* data_buf_ptr = reinterpret_cast(array_data.buffers[buffer_idx].get()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 1 " << &data_buf_ptr << std::endl; eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); } else { - std::cout << "LR Annotator::PrepareBuffersForField is_output index " << desc.data_buffer_ptr_idx() << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField is_output index " << desc.data_buffer_ptr_idx() << std::endl; // list data buffer is in child data buffer uint8_t* data_buf_ptr = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 2 " << &data_buf_ptr << std::endl; + eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.child_data.at(0)->offset); } @@ -162,7 +172,7 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, EvalBatchPtr eval_batch = std::make_shared( record_batch.num_rows(), buffer_count_, local_bitmap_count_); - std::cout << "LR PrepareEvalBatch 1" << std::endl; + //std::cout << "LR PrepareEvalBatch 1" << std::endl; // Fill in the entries for the input fields. 
for (int i = 0; i < record_batch.num_columns(); ++i) { const std::string& name = record_batch.column_name(i); @@ -172,27 +182,27 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, continue; } - std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch schema " << record_batch.schema()->ToString() + /*std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch schema " << record_batch.schema()->ToString() << " num rows " << record_batch.num_rows() << " num columns " << record_batch.num_columns() << " data size " << record_batch.column_data().size() << " col 1 " << record_batch.column(0)->ToString() - << std::endl; + << std::endl;*/ - std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch data " << record_batch.ToString() << std::endl; + //std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch data " << record_batch.ToString() << std::endl; PrepareBuffersForField(*(found->second), *(record_batch.column_data(i)), eval_batch.get(), false /*is_output*/); } // Fill in the entries for the output fields. 
- std::cout << "LR PrepareEvalBatch preparing output fields" << std::endl; + //std::cout << "LR PrepareEvalBatch preparing output fields" << std::endl; int idx = 0; for (auto& arraydata : out_vector) { const FieldDescriptorPtr& desc = out_descs_.at(idx); PrepareBuffersForField(*desc, *arraydata, eval_batch.get(), true /*is_output*/); ++idx; } - std::cout << "LR PrepareEvalBatch 2" << std::endl; + //std::cout << "LR PrepareEvalBatch 2" << std::endl; return eval_batch; } diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index dda2129998dbf..b957bdebcecab 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -50,14 +50,14 @@ bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_offsets_len, int32_t contains_data) { - std::cout << "LR array_int32_contains_int32 offset length=" << entry_offsets_len << std::endl; + //std::cout << "LR array_int32_contains_int32 offset length=" << entry_offsets_len << std::endl; for (int i = 0; i < entry_offsets_len; i++) { - std::cout << "LR going to check " << entry_buf + i << std::endl; + //std::cout << "LR going to check " << entry_buf + i << std::endl; //LR TODO //int32_t entry_len = *(entry_buf + i); //coming as int64 for some reason. 
*2 int32_t entry_len = *(entry_buf + (i * 2)); - std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; + //std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; if (entry_len == contains_data) { return true; } @@ -68,11 +68,11 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int32_t entry_offsets_len, int64_t contains_data) { - std::cout << "LR array_int64_contains_int64 offset length=" << entry_offsets_len << std::endl; + //std::cout << "LR array_int64_contains_int64 offset length=" << entry_offsets_len << std::endl; for (int i = 0; i < entry_offsets_len; i++) { - std::cout << "LR going to check " << entry_buf + i << std::endl; + //std::cout << "LR going to check " << entry_buf + i << std::endl; int64_t entry_len = *(entry_buf + (i*2)); //LR TODO sizeof int64? - std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; + //std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; if (entry_len == contains_data) { return true; } @@ -82,14 +82,14 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int32_t* out_len) { - std::cout << "LR array_int32_make_array offset data=" << contains_data << std::endl; + //std::cout << "LR array_int32_make_array offset data=" << contains_data << std::endl; int integers[] = { contains_data, 21, 3, contains_data, 5 }; *out_len = 5;// * 4; //length is number of items, but buffers must account for byte size. 
uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); memcpy(ret, integers, *out_len * 4); - std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; //return reinterpret_cast(ret); @@ -98,15 +98,15 @@ int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int3 int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { - std::cout << "LR array_int32_remove offset data=" << remove_data << std::endl; + //std::cout << "LR array_int32_remove offset data=" << remove_data << std::endl; //LR sizes are HACK int* integers = new int[5]; int j = 0; for (int i = 0; i < entry_offsets_len; i++) { - std::cout << "LR going to check " << entry_buf + i << std::endl; + //std::cout << "LR going to check " << entry_buf + i << std::endl; int32_t entry_len = *(entry_buf + (i * 2)); - std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; if (entry_len == remove_data) { continue; } else { @@ -118,7 +118,7 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, //length is number of items, but buffers must account for byte size. 
uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); memcpy(ret, integers, *out_len * 4); - std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; delete [] integers; //return reinterpret_cast(ret); diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 41cb64b3eba87..ec56d30c51589 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -38,25 +38,28 @@ namespace gandiva { Status ExprDecomposer::Visit(const FieldNode& node) { auto desc = annotator_.CheckAndAddInputFieldDescriptor(node.field()); - std::cout << "LR ExprDecomposer" << std::endl; + //std::cout << "LR ExprDecomposer" << std::endl; DexPtr validity_dex = std::make_shared(desc); DexPtr value_dex; if (desc->HasChildOffsetsIdx()) { - std::cout << "LR ExprDecomposer 1" << std::endl; + //std::cout << "LR ExprDecomposer 1" << std::endl; // handle list type value_dex = std::make_shared(desc); } else if (desc->HasOffsetsIdx()) { - std::cout << "LR ExprDecomposer 2" << std::endl; + //std::cout << "LR ExprDecomposer 2" << std::endl; if (desc->field()->type()->id() == arrow::Type::LIST) { // handle list type - std::cout << "LR ExprDecomposer 3" << std::endl; - value_dex = std::make_shared(desc); + //std::cout << "LR ExprDecomposer 3" << std::endl; + auto p = std::make_shared(desc); + value_dex = p; + int v = p->DataIdx(); + //std::cout << "LR primitive list type " v << " " << } else { - std::cout << "LR ExprDecomposer 4" << std::endl; + //std::cout << "LR ExprDecomposer 4" << std::endl; value_dex = std::make_shared(desc); } } else { - std::cout << "LR ExprDecomposer 5" << std::endl; + //std::cout << "LR ExprDecomposer 5" << std::endl; value_dex = std::make_shared(desc); } result_ = std::make_shared(validity_dex, value_dex); diff --git 
a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index a2ae2426b9235..021100678a08e 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -72,9 +72,9 @@ SignatureMap FunctionRegistry::InitPCMap() { pc_registry_.insert(std::end(pc_registry_), v7.begin(), v7.end()); for (auto& elem : pc_registry_) { - std::cout << "LR pc_registry_ item " << elem.pc_name() << " first signature name " << elem.signatures()[0].base_name() << std::endl; + //std::cout << "LR pc_registry_ item " << elem.pc_name() << " first signature name " << elem.signatures()[0].base_name() << std::endl; for (auto& func_signature : elem.signatures()) { - std::cout << "LR Adding function to map " << func_signature.base_name() << std::endl; + //std::cout << "LR Adding function to map " << func_signature.base_name() << std::endl; //std::cout << " LR args " << func_signature.param_types map.insert(std::make_pair(&(func_signature), &elem)); } diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index ac157a90d4300..876f5b72b9941 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -164,18 +164,19 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ int32_t* offsets, int64_t slot, \ TYPE* entry_buf, int32_t entry_len) { \ - std::cout << "gdv_fn_populate 1" << std::endl; \ + std::cout << "gdv_fn_populate 1 data_ptr is " << data_ptr << std::endl; \ auto buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ std::cout << "gdv_fn_populate 2 data_ptr" << data_ptr << " buffer " << buffer << \ - " offset " << offset << " entry_len " << entry_len << " scale " << SCALE << std::endl; \ - auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ + " offset " << offset << " entry_len " << entry_len << " scale 
" << SCALE << \ + " want to resize to " << (offset + entry_len * SCALE) << std::endl; \ + /*auto status = buffer->Resize(offset + entry_len * SCALE, false); \ if (!status.ok()) { \ gandiva::ExecutionContext* context = \ reinterpret_cast(context_ptr); \ context->set_error_msg(status.message().c_str()); \ return -1; \ - } \ + } */ \ std::cout << "gdv_fn_populate resized buffer to =" << offset + entry_len * SCALE << std::endl; \ std::cout << "gdv_fn_populate copying bytes =" << entry_len * SCALE << std::endl; \ std::cout << "gdv_fn_populate buffer =" << buffer->ToString() << " offeset " << offset << std::endl; \ diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index f31913a326674..b3de4ac524387 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -36,7 +36,7 @@ namespace gandiva { AddTrace(__VA_ARGS__); \ } -namespace { +/*namespace { std::string printType(llvm::Type* t) { if (t == nullptr) { return std::string("null"); @@ -55,7 +55,7 @@ namespace { t->print(output); return str; } -} +}*/ LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(true) {} @@ -92,7 +92,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out std::unique_ptr compiled_expr(new CompiledExpr(value_validity, output)); std::string fn_name = "expr_" + std::to_string(idx) + "_" + std::to_string(static_cast(selection_vector_mode_)); - std::cout << "LR LLVMGenerator::Add " << fn_name << std::endl; + //std::cout << "LR LLVMGenerator::Add " << fn_name << std::endl; if (!cached_) { ARROW_RETURN_NOT_OK(engine_->LoadFunctionIRs()); ARROW_RETURN_NOT_OK(CodeGenExprValue(value_validity->value_expr(), @@ -101,7 +101,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out } compiled_expr->SetFunctionName(selection_vector_mode_, fn_name); compiled_exprs_.push_back(std::move(compiled_expr)); - std::cout << "LR LLVMGenerator::Add Done" << std::endl; + 
//std::cout << "LR LLVMGenerator::Add Done" << std::endl; return Status::OK(); } @@ -110,18 +110,18 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode mode) { selection_vector_mode_ = mode; - std::cout << "LR LLVMGenerator::Build " << std::endl; + //std::cout << "LR LLVMGenerator::Build " << std::endl; for (auto& expr : exprs) { auto output = annotator_.AddOutputFieldDescriptor(expr->result()); ARROW_RETURN_NOT_OK(Add(expr, output)); } - std::cout << "LR LLVMGenerator::Build 2" << std::endl; + //std::cout << "LR LLVMGenerator::Build 2" << std::endl; //Too much logging. needle in haystack? //std::cout << "LR LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; // Compile and inject into the process' memory the generated function. ARROW_RETURN_NOT_OK(engine_->FinalizeModule()); - std::cout << "LR LLVMGenerator::Build FinalizeModule" << std::endl; + //std::cout << "LR LLVMGenerator::Build FinalizeModule" << std::endl; // setup the jit functions for each expression. 
for (auto& compiled_expr : compiled_exprs_) { @@ -130,7 +130,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode compiled_expr->SetJITFunction(selection_vector_mode_, jit_fn); } - std::cout << "LR LLVMGenerator::Build Done" << std::endl; + //std::cout << "LR LLVMGenerator::Build Done" << std::endl; return Status::OK(); } @@ -152,12 +152,12 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, const SelectionVector* selection_vector, const ArrayDataVector& output_vector) const { DCHECK_GT(record_batch.num_rows(), 0); - std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; + //std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector); DCHECK_GT(eval_batch->GetNumBuffers(), 0); - std::cout << "LR LLVMGenerator::Execute 2" << std::endl; + //std::cout << "LR LLVMGenerator::Execute 2" << std::endl; auto mode = SelectionVector::MODE_NONE; if (selection_vector != nullptr) { mode = selection_vector->GetMode(); @@ -167,7 +167,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, selection_vector_mode_, " received vector with mode ", mode); } - std::cout << "LR LLVMGenerator::Execute 3" << std::endl; + //std::cout << "LR LLVMGenerator::Execute 3" << std::endl; for (auto& compiled_expr : compiled_exprs_) { // generate data/offset vectors. 
const uint8_t* selection_buffer = nullptr; @@ -177,7 +177,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, num_output_rows = selection_vector->GetNumSlots(); } - std::cout << "LR LLVMGenerator::Execute A1" << std::endl; + //std::cout << "LR LLVMGenerator::Execute A1" << std::endl; EvalFunc jit_function = compiled_expr->GetJITFunction(mode); jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(), eval_batch->GetLocalBitMapArray(), annotator_.GetHolderPointersArray(), @@ -189,7 +189,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, eval_batch->GetExecutionContext()->has_error(), Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); - std::cout << "LR LLVMGenerator::Execute A2" << std::endl; + //std::cout << "LR LLVMGenerator::Execute A2" << std::endl; // generate validity vectors. ComputeBitMapsForExpr(*compiled_expr, selection_vector, eval_batch.get()); } @@ -312,8 +312,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, FieldDescriptorPtr output, int suffix_idx, std::string& fn_name, SelectionVector::Mode selection_vector_mode) { - std::cout << "LR CodeGenExprValue for output field " << output->Name() - << " type " << output->Type()->ToString() << " output type id " << output->Type()->id() << std::endl; + //std::cout << "LR CodeGenExprValue for output field " << output->Name() + // << " type " << output->Type()->ToString() << " output type id " << output->Type()->id() << std::endl; try { llvm::IRBuilder<>* builder = ir_builder(); // Create fn prototype : @@ -412,7 +412,7 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, } // The visitor can add code to both the entry/loop blocks. 
- std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; + //std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; Visitor visitor(this, fn, loop_entry, arg_addrs, arg_local_bitmaps, arg_holder_ptrs, slice_offsets, arg_context_ptr, position_var); value_expr->Accept(visitor); @@ -444,12 +444,12 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length()}); } else if (output_type_id == arrow::Type::STRUCT) { - std::cout << "LR creating struct type to store the result." << std::endl; + //std::cout << "LR creating struct type to store the result." << std::endl; auto slot_offset = builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var); builder->CreateStore(output_value->data(), slot_offset); } else if (output_type_id == arrow::Type::LIST) { auto output_list_internal_type = output->Type()->field(0)->type()->id(); - std::cout << "LR creating list type to store the result with internal type " << output_list_internal_type << std::endl; + //std::cout << "LR creating list type to store the result with internal type " << output_list_internal_type << std::endl; if (arrow::is_binary_like(output_list_internal_type)) { auto output_list_value = std::dynamic_pointer_cast(output_value); @@ -472,8 +472,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, output_value->length()->print(output2); - std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," - << output_offset_ref << "," << loop_var << "[[" << str1 << "]] [[" << str2 << "]]" << std::endl; + //std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," + // << output_offset_ref << "," << loop_var << "[[" << str1 << "]] [[" << str2 << "]]" << 
std::endl; AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length()}); @@ -503,8 +503,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, // printType(output_value->data()) << std::endl; //ADD_TRACE("saving result 2 " + output->Name() + " value %T", output_value->data()); - int jello = 0; - std::cout << "LR CodeGenExprValue " << jello++ << std::endl; + //int jello = 0; + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; if (visitor.has_arena_allocs()) { // Reset allocations to avoid excessive memory usage. Once the result is copied to // the output vector (store instruction above), any memory allocations in this @@ -514,23 +514,23 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, AddFunctionCall("gdv_fn_context_arena_reset", types()->void_type(), reset_args); } - std::cout << "LR CodeGenExprValue " << jello++ << std::endl; + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // check loop_var loop_var->addIncoming(types()->i64_constant(0), loop_entry); llvm::Value* loop_update = builder->CreateAdd(loop_var, types()->i64_constant(1), "loop_var+1"); loop_var->addIncoming(loop_update, loop_body_tail); - std::cout << "LR CodeGenExprValue " << jello++ << std::endl; + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; llvm::Value* loop_var_check = builder->CreateICmpSLT(loop_update, arg_nrecords, "loop_var < nrec"); builder->CreateCondBr(loop_var_check, loop_body, loop_exit); - std::cout << "LR CodeGenExprValue " << jello++ << std::endl; + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // Loop exit builder->SetInsertPoint(loop_exit); builder->CreateRet(types()->i32_constant(0)); - std::cout << "LR CodeGenExprValue " << jello++ << std::endl; + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; return Status::OK(); } 
catch (std::exception& e) { std::cout << e.what() << std::endl; @@ -647,7 +647,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, llvm::raw_string_ostream output2(str2); ret_type->print(output); value->getType()->print(output2); - std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; + //std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; DCHECK(value->getType() == ret_type); } @@ -732,21 +732,21 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; - std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; - std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; + //std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; arrow::Type::type at = arrow::Type::INT32; type = types->IRType(at); //type = types->DataVecType(dex.FieldType()); - std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; // compute list len from the offsets array. 
llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); - std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << - printType(offsets_slot_index) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << + // printType(offsets_slot_index) << std::endl; // => offset_start = offsets[loop_var] slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); @@ -773,9 +773,9 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { // TODO: handle bool type bitmap // TODO: handle decimal precision and scale - std::cout << "LR VectorReadFixedLenValueListDex slot_ref " << printType(slot_ref) << std::endl; - std::cout << "LR VectorReadFixedLenValueListDex visit fixed-len data list vector " << dex.FieldName() << - " length " << printType(list_len) << " data_list " << printType(data_list) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex slot_ref " << printType(slot_ref) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex visit fixed-len data list vector " << dex.FieldName() << + // " length " << printType(list_len) << " data_list " << printType(data_list) << std::endl; ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " length %T", list_len); result_.reset(new LValue(data_list, list_len)); @@ -835,8 +835,8 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { llvm::Value* slot; auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - std::cout << "LR dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; - std::cout << "LR IRType is " << printType(type) << std::endl; + //std::cout << "LR 
dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + //std::cout << "LR IRType is " << printType(type) << std::endl; //type = types->DataVecType(dex.FieldType()); //LR HACK. Original was type = types->DataVecType(dex.FieldType()); arrow::Type::type at = arrow::Type::INT32; @@ -851,29 +851,29 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { int i = 0; std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => offset_start = offsets[loop_var] - std::cout << "LR Type is " << printType(type) << std::endl; + //std::cout << "LR Type is " << printType(type) << std::endl; slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => offset_end = offsets[loop_var + 1] llvm::Value* offsets_slot_index_next = builder->CreateAdd( offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index_next); llvm::Value* offset_end = builder->CreateLoad(type, slot, "offset_end"); - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => list_data_length = offset_end - offset_start llvm::Value* list_data_length = builder->CreateSub(offset_end, offset_start, "offsets_len"); - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // get the child offsets array from the child offsets array, // start from offset 'offset_start' llvm::Value* child_offset_slot_ref = GetBufferReference(dex.ChildOffsetsIdx(), kBufferTypeChildOffsets, 
dex.Field()); - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // do not forget slice offset llvm::Value* offset_start_int64 = builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); @@ -884,13 +884,13 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { llvm::Value* child_offset_start = builder->CreateLoad(type, child_offsets, "child_offset_start"); - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // get the data array llvm::Value* data_slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); llvm::Value* data_value = builder->CreateGEP(type, data_slot_ref, child_offset_start); - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; result_.reset(new ListLValue(data_value, child_offsets, list_data_length)); } @@ -934,7 +934,7 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { llvm::Value* value = nullptr; llvm::Value* len = nullptr; - std::cout << "LR LiteralDex type " << dex.type()->id() << std::endl; + //std::cout << "LR LiteralDex type " << dex.type()->id() << std::endl; switch (dex.type()->id()) { case arrow::Type::BOOL: value = types->i1_constant(std::get(dex.holder())); @@ -975,7 +975,7 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { case arrow::Type::STRING: case arrow::Type::BINARY: { const std::string& str = std::get(dex.holder()); - std::cout << "LR Literal string " << str << std::endl; + //std::cout << "LR Literal string " << str << std::endl; value = ir_builder()->CreateGlobalStringPtr(str.c_str()); len = types->i32_constant(static_cast(str.length())); break; @@ -1029,7 +1029,7 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { native_function->NeedsContext()); auto arrow_return_type 
= dex.func_descriptor()->return_type(); - std::cout << "LR NonNullableFunc 1 result_type " << printType(generator_->types()->DataVecType(arrow_return_type)) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << printType(generator_->types()->IRType(arrow_return_type->id())) << std::endl; + //std::cout << "LR NonNullableFunc 1 result_type " << printType(generator_->types()->DataVecType(arrow_return_type)) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << printType(generator_->types()->IRType(arrow_return_type->id())) << std::endl; if (native_function->CanReturnErrors()) { // slow path : if a function can return errors, skip invoking the function @@ -1041,7 +1041,7 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { auto result_type = types->DataVecType(arrow_return_type); //Result type array/list is special. //auto result_type = types->IRType(arrow_type_id); - std::cout << "LR NonNullableFunc 2 result_type " << printType(result_type) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << types->IRType(arrow_type_id) << std::endl; + //std::cout << "LR NonNullableFunc 2 result_type " << printType(result_type) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << types->IRType(arrow_type_id) << std::endl; // Build combined validity of the args. 
llvm::Value* is_valid = types->true_constant(); @@ -1500,7 +1500,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, auto llvm_return_type = types->DataVecType(arrow_return_type); DecimalIR decimalIR(generator_->engine_.get()); - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; if (arrow_return_type_id == arrow::Type::DECIMAL) { // For decimal fns, the output precision/scale are passed along as parameters. // @@ -1528,7 +1528,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, // add extra arg for return length for variable len return types (allocated on stack). llvm::AllocaInst* result_len_ptr = nullptr; if (arrow::is_binary_like(arrow_return_type_id)) { - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is binary like" << std::endl; + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is binary like" << std::endl; result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, "result_len", entry_block_); params->push_back(result_len_ptr); @@ -1536,7 +1536,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, } if (arrow_return_type_id == arrow::Type::LIST) { - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is list" << std::endl; + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is list" << std::endl; result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, "result_len", entry_block_); params->push_back(result_len_ptr); @@ -1545,7 +1545,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, } - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall params are: " << std::endl; + //std::cout << "LR 
LLVMGenerator::Visitor::BuildFunctionCall params are: " << std::endl; for (auto p : *params) { std::string str1; llvm::raw_string_ostream output1(str1); @@ -1563,7 +1563,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, (result_len_ptr == nullptr) ? nullptr : builder->CreateLoad(result_len_ptr->getAllocatedType(), result_len_ptr); - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE" << std::endl; + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE" << std::endl; return std::make_shared(value, value_len); } @@ -1580,7 +1580,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(arg_context_ptr_); } - std::cout << "LR BuildParams1" << std::endl; + //std::cout << "LR BuildParams1" << std::endl; // if the function has holder, add the holder pointer. if (holder_idx != -1) { auto builder = ir_builder(); @@ -1589,7 +1589,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( llvm::BasicBlock* saved_block = builder->GetInsertBlock(); builder->SetInsertPoint(entry_block_); - std::cout << "LR BuildParams1a" << std::endl; + //std::cout << "LR BuildParams1a" << std::endl; auto holder = generator_->LoadVectorAtIndex( arg_holder_ptrs_, generator_->types()->i64_type(), holder_idx, "holder"); @@ -1597,20 +1597,20 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(holder); } - std::cout << "LR BuildParams2" << std::endl; + //std::cout << "LR BuildParams2" << std::endl; // build the function params, along with the validities. for (auto& pair : args) { // build value. DexPtr value_expr = pair->value_expr(); - std::cout << "LR BuildParams2a" << std::endl; + //std::cout << "LR BuildParams2a" << std::endl; value_expr->Accept(*this); - std::cout << "LR BuildParams2b" << std::endl; + //std::cout << "LR BuildParams2b" << std::endl; LValue& result_ref = *result(); // append all the parameters corresponding to this LValue. 
result_ref.AppendFunctionParams(¶ms); - std::cout << "LR BuildParams2c" << std::endl; + //std::cout << "LR BuildParams2c" << std::endl; // build validity. if (with_validity) { llvm::Value* validity_expr = BuildCombinedValidity(pair->validity_exprs()); @@ -1734,9 +1734,9 @@ std::string LLVMGenerator::ReplaceFormatInTrace(const std::string& in_msg, } void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { - //if (!enable_ir_traces_) { - // return; - //} + if (!enable_ir_traces_) { + return; + } std::string dmsg = "IR_TRACE:: " + msg + "\n"; std::string print_fn_name = "printf"; diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index 2b0f1ca2d51ae..7e6a5c2fb96eb 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -82,7 +82,7 @@ class GANDIVA_EXPORT ListLValue : public LValue { : LValue(data, NULLPTR, validity), child_offsets_(child_offsets), offsets_length_(offsets_length) { - std::cout << "LR Creating ListLValue " << std::endl; + //std::cout << "LR Creating ListLValue " << std::endl; } llvm::Value* child_offsets() { return child_offsets_; } diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 34abedf1081e4..5a5dc7c60869c 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -169,7 +169,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, ARROW_RETURN_IF(configuration == nullptr, Status::Invalid("Configuration cannot be null")); - std::cout << "LR Projector::Make 1" << std::endl; + //std::cout << "LR Projector::Make 1" << std::endl; // see if equivalent projector was already built std::shared_ptr>> cache = LLVMGenerator::GetCache(); @@ -192,7 +192,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, std::unique_ptr llvm_gen; ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); - std::cout << "LR Projector::Make 2" << std::endl; + //std::cout << "LR Projector::Make 2" << std::endl; if 
(!is_cached && sec_cache != nullptr) { std::shared_ptr arrow_buffer = sec_cache->Get(GetSecondaryCacheKey(cache_key.ToString())); @@ -210,7 +210,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, // Run the validation on the expressions. // Return if any of the expression is invalid since // we will not be able to process further. - std::cout << "LR Projector::Make 3" << std::endl; + //std::cout << "LR Projector::Make 3" << std::endl; if (!is_cached) { ExprValidator expr_validator(llvm_gen->types(), schema); for (auto& expr : exprs) { @@ -230,13 +230,13 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, output_fields.push_back(expr->result()); } - std::cout << "LR Projector::Make 4" << std::endl; + //std::cout << "LR Projector::Make 4" << std::endl; // Instantiate the projector with the completely built llvm generator *projector = std::shared_ptr( new Projector(std::move(llvm_gen), schema, output_fields, configuration)); projector->get()->SetBuiltFromCache(is_cached); - std::cout << "LR Projector::Make 5" << std::endl; + //std::cout << "LR Projector::Make 5" << std::endl; if (sec_cache != nullptr && is_cached == false) { std::shared_ptr sec_cached_obj = cache->GetObjectCode(cache_key); llvm::StringRef string_buffer = sec_cached_obj->getBuffer(); @@ -245,7 +245,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, sec_cache->Set(GetSecondaryCacheKey(cache_key.ToString()), arrow_buffer); } - std::cout << "LR Projector::Make DONE" << std::endl; + //std::cout << "LR Projector::Make DONE" << std::endl; return Status::OK(); } @@ -259,7 +259,7 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output_data_vecs) const { ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); - std::cout << "LR the other Projector::Evaluate" << std::endl; + //std::cout << "LR the other Projector::Evaluate" << std::endl; if (output_data_vecs.size() != output_fields_.size()) { 
std::stringstream ss; ss << "number of buffers for output_data_vecs is " << output_data_vecs.size() @@ -267,10 +267,10 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, return Status::Invalid(ss.str()); } -std::cout << "LR the other Projector::Evaluate 1a" << std::endl; +//std::cout << "LR the other Projector::Evaluate 1a" << std::endl; int idx = 0; for (auto& array_data : output_data_vecs) { - std::cout << "LR the other Projector::Evaluate checking array_data" << std::endl; + //std::cout << "LR the other Projector::Evaluate checking array_data" << std::endl; if (array_data == nullptr) { std::stringstream ss; ss << "array for output field " << output_fields_[idx]->name() << "is null."; @@ -280,12 +280,12 @@ std::cout << "LR the other Projector::Evaluate 1a" << std::endl; auto num_rows = selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); - std::cout << "LR the other Projector::Evaluate about to validate capacity" << std::endl; + //std::cout << "LR the other Projector::Evaluate about to validate capacity" << std::endl; ARROW_RETURN_NOT_OK( ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), num_rows)); ++idx; } - std::cout << "LR the other Projector::Evaluate 2" << std::endl; + //std::cout << "LR the other Projector::Evaluate 2" << std::endl; ARROW_RETURN_NOT_OK( llvm_generator_->Execute(batch, selection_vector, output_data_vecs)); @@ -295,10 +295,10 @@ std::cout << "LR the other Projector::Evaluate 1a" << std::endl; if (array_data->type->id() == arrow::Type::LIST) { auto child_data = array_data->child_data[0]; - std::cout << "LR the other Projector::Evaluate modifying child array " << - child_data->buffers[1]->ToString() << std::endl; - std::cout << "LR the other Projector::Evaluate child array[3] " << - int32_t( (*child_data->buffers[1])[3*4]) << std::endl; + //std::cout << "LR the other Projector::Evaluate modifying child array " << + //child_data->buffers[1]->ToString() << std::endl; + //std::cout << "LR the 
other Projector::Evaluate child array[3] " << + //int32_t( (*child_data->buffers[1])[3*4]) << std::endl; //std::cout << "LR the other Projector::Evaluate modifying child0 array " << //child_data->buffers[0]->ToString() << std::endl; @@ -320,9 +320,9 @@ std::cout << "LR the other Projector::Evaluate 1a" << std::endl; array_data->child_data.clear(); array_data->child_data.push_back(new_child_data); - std::cout << "LR the other Projector::Evaluate child data size " << child_data_size << std::endl; - std::cout << "LR the other Projector::Evaluate after modifying child array[3] " << - int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; + //std::cout << "LR the other Projector::Evaluate child data size " << child_data_size << std::endl; + //std::cout << "LR the other Projector::Evaluate after modifying child array[3] " << + //int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; //array_data = arrow::ArrayData::Make(array_data->type, array_data->length, // array_data->buffers, {new_child_data}, @@ -343,14 +343,14 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* p Status Projector::Evaluate(const arrow::RecordBatch& batch, const SelectionVector* selection_vector, arrow::MemoryPool* pool, arrow::ArrayVector* output) const { - std::cout << "LR Projector::Evaluate" << std::endl; + //std::cout << "LR Projector::Evaluate" << std::endl; ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null.")); ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null.")); auto num_rows = selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); - std::cout << "LR Projector::Evaluate num_rows" << num_rows << std::endl; + //std::cout << "LR Projector::Evaluate num_rows" << num_rows << std::endl; // Allocate the output data vecs. 
ArrayDataVector output_data_vecs; for (auto& field : output_fields_) { @@ -408,7 +408,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, arrow::Status astatus; std::vector> buffers; - std::cout << "LR Projector::AllocArrayData Enter" << std::endl; + //std::cout << "LR Projector::AllocArrayData Enter" << std::endl; // The output vector always has a null bitmap. int64_t size = arrow::bit_util::BytesForBits(num_records); ARROW_ASSIGN_OR_RAISE(auto bitmap_buffer, arrow::AllocateBuffer(size, pool)); @@ -462,18 +462,18 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); - std::cout << "LR Projector::AllocArrayData 1" << std::endl; + //std::cout << "LR Projector::AllocArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { - std::cout << "LR Projector::AllocArrayData List. There are number of buffers=" << buffers.size() << std::endl; + // std::cout << "LR Projector::AllocArrayData List. 
There are number of buffers=" << buffers.size() << std::endl; auto internal_type = type->field(0)->type(); ArrayDataPtr child_data; if (arrow::is_primitive(internal_type->id())) { - std::cout << "LR Projector::AllocArrayData List 1" << std::endl; + //std::cout << "LR Projector::AllocArrayData List 1" << std::endl; child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, {nullptr, std::move(buffers[2])}, 0); } if (arrow::is_binary_like(internal_type->id())) { - std::cout << "LR Projector::AllocArrayData List 2" << std::endl; + //std::cout << "LR Projector::AllocArrayData List 2" << std::endl; child_data = arrow::ArrayData::Make( internal_type, 0 /*initialize length*/, {nullptr, std::move(buffers[2]), std::move(buffers[3])}, 0); @@ -485,7 +485,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); } - std::cout << "LR Projector::AllocArrayData Done" << std::endl; + // std::cout << "LR Projector::AllocArrayData Done" << std::endl; return Status::OK(); } @@ -504,18 +504,18 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, ARROW_RETURN_IF(array_data.buffers.size() < 2, Status::Invalid("ArrayData must have at least 2 buffers")); -std::cout << "LR ValidateArrayDataCapacity" << std::endl; +//std::cout << "LR ValidateArrayDataCapacity" << std::endl; int64_t min_bitmap_len = arrow::bit_util::BytesForBits(num_records); - std::cout << "LR ValidateArrayDataCapacity arra_data 0 is " << array_data.buffers[0] << std::endl; + //std::cout << "LR ValidateArrayDataCapacity arra_data 0 is " << array_data.buffers[0] << std::endl; int64_t bitmap_len = array_data.buffers[0]->capacity(); - std::cout << "LR ValidateArrayDataCapacity" << std::endl; + //std::cout << "LR ValidateArrayDataCapacity" << std::endl; ARROW_RETURN_IF( bitmap_len < min_bitmap_len, Status::Invalid("Bitmap buffer too small for ", field.name(), " expected minimum ", 
min_bitmap_len, " actual size ", bitmap_len)); auto type_id = field.type()->id(); - std::cout << "LR ValidateArrayDataCapacity" << std::endl; + //std::cout << "LR ValidateArrayDataCapacity" << std::endl; //LR TODO if (arrow::is_binary_like(type_id)) { //|| type_id == arrow::Type::LIST) { // validate size of offsets buffer. diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 87f7bb16fe12a..c43285843a1ee 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -144,10 +144,10 @@ NodePtr TreeExprBuilder::MakeOr(const NodeVector& children) { static bool print_expr = false; ExpressionPtr TreeExprBuilder::MakeExpression(NodePtr root_node, FieldPtr result_field) { - std::cout << "LR Expression: " << root_node->ToString() << "\n"; + //std::cout << "LR Expression: " << root_node->ToString() << "\n"; if (result_field == nullptr) { - std::cout << "LR MakeExpression result_field is null" << std::endl; + //std::cout << "LR MakeExpression result_field is null" << std::endl; return nullptr; } return ExpressionPtr(new Expression(root_node, result_field)); @@ -164,9 +164,9 @@ ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function, auto node = MakeField(field); field_nodes.push_back(node); } - std::cout << "LR MakeExpression making function for " << function << std::endl; + //std::cout << "LR MakeExpression making function for " << function << std::endl; auto func_node = MakeFunction(function, field_nodes, out_field->type()); - std::cout << "LR MakeExpression function is " << func_node->ToString() << std::endl; + //std::cout << "LR MakeExpression function is " << func_node->ToString() << std::endl; return MakeExpression(func_node, out_field); } diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 3d02488a9f973..56cb1015fad48 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -299,7 
+299,7 @@ FieldPtr ProtoTypeToField(const types::Field& f) { NodePtr ProtoTypeToFieldNode(const types::FieldNode& node) { FieldPtr field_ptr = ProtoTypeToField(node.field()); - std::cout << "LR created field " << field_ptr->ToString(true) << std::endl; + //std::cout << "LR created field " << field_ptr->ToString(true) << std::endl; if (field_ptr == nullptr) { std::cerr << "Unable to create field node from protobuf\n"; return nullptr; @@ -471,7 +471,7 @@ NodePtr ProtoTypeToNullNode(const types::NullNode& node) { NodePtr ProtoTypeToNode(const types::TreeNode& node) { if (node.has_fieldnode()) { - std::cout << "LR Found ProtoTypeToNode fieldnode " << std::endl; + //std::cout << "LR Found ProtoTypeToNode fieldnode " << std::endl; return ProtoTypeToFieldNode(node.fieldnode()); } @@ -520,7 +520,7 @@ NodePtr ProtoTypeToNode(const types::TreeNode& node) { } if (node.has_stringnode()) { - std::cout << "LR Found StringNode" << std::endl; + //std::cout << "LR Found StringNode" << std::endl; return TreeExprBuilder::MakeStringLiteral(node.stringnode().value()); } @@ -665,23 +665,23 @@ auto type_id = type->id(); } - std::cout << "LR New ArrayData 1" << std::endl; + //std::cout << "LR New ArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { jlong offsets_addr = in_buf_addrs[buf_idx++]; jlong offsets_size = in_buf_sizes[sz_idx++]; auto data_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); - std::cout << "LR New ArrayData List" << std::endl; + //std::cout << "LR New ArrayData List" << std::endl; auto internal_type = type->field(0)->type(); std::shared_ptr child_data; if (arrow::is_primitive(internal_type->id())) { - std::cout << "LR New ArrayData List 1" << std::endl; + //std::cout << "LR New ArrayData List 1" << std::endl; child_data = arrow::ArrayData::Make(internal_type, 0, {nullptr, std::move(data_buffer)}, 0); } if (arrow::is_binary_like(internal_type->id())) { - std::cout << "LR New ArrayData List NYI 2" << std::endl; + 
//std::cout << "LR New ArrayData List NYI 2" << std::endl; //child_data = arrow::ArrayData::Make( // internal_type, 0, // {nullptr, std::move(data_buffer), std::move(child_data)}, 0); @@ -894,10 +894,16 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { // callback into java to expand the buffer jobject ret = env_->CallObjectMethod(jexpander_, vector_expander_method_, vector_idx_, new_capacity); + std::cout << "Buffer expand: New capacity is " << new_capacity << + " vector id " << vector_idx_ << " expander method " << vector_expander_method_ << + " jexpander_ " << jexpander_ << std::endl; if (env_->ExceptionCheck()) { env_->ExceptionDescribe(); env_->ExceptionClear(); - return Status::OutOfMemory("buffer expand failed in java"); + std::cout << "Buffer expand failed. New capacity is " << new_capacity << + " vector id " << vector_idx_ << " expander method " << vector_expander_method_ << + " jexpander_ " << jexpander_ << std::endl; + return Status::OutOfMemory("buffer expand failed in java."); } jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); @@ -937,7 +943,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( jlongArray buf_addrs, jlongArray buf_sizes, jint sel_vec_type, jint sel_vec_rows, jlong sel_vec_addr, jlong sel_vec_size, jlongArray out_buf_addrs, jlongArray out_buf_sizes) { - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " << std::endl; + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " << std::endl; Status status; std::shared_ptr holder = projector_modules_.Lookup(module_id); if (holder == nullptr) { @@ -973,11 +979,11 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( if (!status.ok()) { break; } - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + /*std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " << " Made a 
recordbatch num_rows " << num_rows << in_batch->ToString() << " there are " << out_bufs_len << " buffers " - << std::endl; + << std::endl;*/ std::shared_ptr selection_vector; auto selection_buffer = std::make_shared( @@ -1013,14 +1019,14 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( for (FieldPtr field : ret_types) { std::vector> buffers; - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer" << std::endl; + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer" << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* validity_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong bitmap_sz = out_sizes[sz_idx++]; buffers.push_back(std::make_shared(validity_buf, bitmap_sz)); if (arrow::is_binary_like(field->type()->id())) { - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding buffer" << std::endl; + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding buffer" << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* offsets_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong offsets_sz = out_sizes[sz_idx++]; @@ -1030,7 +1036,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* value_buf = reinterpret_cast(out_bufs[buf_idx++]); - jlong data_sz = out_sizes[sz_idx++]; + jlong data_sz = out_sizes[sz_idx++] * 1000; if (arrow::is_binary_like(field->type()->id())) { if (jexpander == nullptr) { status = Status::Invalid( @@ -1039,11 +1045,11 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer size=" << data_sz << std::endl; + //std::cout << "LR 
Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer size=" << data_sz << std::endl; buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, value_buf, data_sz)); } else if (field->type()->id() == arrow::Type::LIST) { - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding buffer size=" << data_sz << std::endl; + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding buffer size=" << data_sz << std::endl; buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, value_buf, data_sz)); } else { @@ -1056,13 +1062,13 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer size=" << data_sz << std::endl; + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, child_offset_buf, data_sz)); - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer size=" << data_sz << std::endl; + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( @@ -1081,7 +1087,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( output.push_back(array_data); ++output_vector_idx; - std::cout << "LR jni_common there are " << buffers.size() << " buffers" << std::endl; + //std::cout << "LR jni_common there are " << buffers.size() << " buffers" << std::endl; } 
else { auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers); @@ -1094,26 +1100,26 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } - std::cout << "LR jni_common calling evaluate" << std::endl; + //std::cout << "LR jni_common calling evaluate" << std::endl; status = holder->projector()->Evaluate(*in_batch, selection_vector.get(), output); //LRtest1 - std::cout << "LR jni_common after evaluating the output size is " << output.size() << std::endl; + //std::cout << "LR jni_common after evaluating the output size is " << output.size() << std::endl; arrow::ArraySpan sp(*(output[0])); - std::cout << "LR jni_common after evaluating the output 0 is " << sp.ToArray()->ToString() << std::endl; + //std::cout << "LR jni_common after evaluating the output 0 is " << sp.ToArray()->ToString() << std::endl; auto array_data = output[0]; if (array_data->type->id() == arrow::Type::LIST) { auto child_data = array_data->child_data[0]; - std::cout << "LR jni_common child array[3] " << - int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; - std::cout << "LR jni_common child array[0] " << - int32_t( (*(array_data->child_data[0])->buffers[1])[0*4]) << std::endl; - std::cout << "LR jni_common child via data ptr array[0] " << - int32_t( *(*(array_data->child_data[0])->buffers[1]).data()) << std::endl; - std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" - << (array_data->child_data[0])->length << std::endl; + //std::cout << "LR jni_common child array[3] " << + //int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; + //std::cout << "LR jni_common child array[0] " << + //int32_t( (*(array_data->child_data[0])->buffers[1])[0*4]) << std::endl; + //std::cout << "LR jni_common child via data ptr array[0] " << + //int32_t( *(*(array_data->child_data[0])->buffers[1]).data()) << std::endl; + //std::cout << "LR jni_common there are records=" << 
array_data->length << " and the first one is=" + // << (array_data->child_data[0])->length << std::endl; //LRTest1 Start - int numRecords = 5 * 100; + int numRecords = 5 * 1000000; //int numRecords = (array_data->child_data[0])->length * array_data->length; int recordSize = numRecords * 4; @@ -1131,8 +1137,8 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //memcpy(&out_bufs[1], offsetsBuffer, 1 * 4); //out_sizes[1] = 1; - std::cout << "LR jni_common after copy parent buff child array[0] " << - int32_t( (out_bufs[3])) << std::endl; + //std::cout << "LR jni_common after copy parent buff child array[0] " << + //int32_t( (out_bufs[3])) << std::endl; //LRTest1 End } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index fe040ab4382e8..a12569b1a0175 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -393,7 +393,11 @@ private void evaluate(int numRows, List buffers, List buf } if (valueVector instanceof ListVector) { - + hasVariableWidthColumns = true; + //LR TODO figure out what to use here resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; + //resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; + //resizeableVectors[outColumnIdx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); + List fieldBufs = ((ListVector) valueVector).getDataVector().getFieldBuffers(); logger.error("LR Projector.java evaluate ListVector has buffers=" + fieldBufs.size()); @@ -455,9 +459,9 @@ private void evaluate(int numRows, List buffers, List buf if (valueVector instanceof ListVector) { //LR HACK - int numRecordsFound = 5 * 100; + int numRecordsFound = 5 * 1000000; //int numRecordsFound = Math.toIntExact(outSizes[3]) / 4; - logger.error("LR Projector.java using outsizes 
numRecords=" + numRecordsFound); + //logger.error("LR Projector.java using outsizes numRecords=" + numRecordsFound); //ArrowBuf ab0 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); @@ -473,8 +477,12 @@ private void evaluate(int numRows, List buffers, List buf //byte[] valid = new byte[outsizes[2]]; //LR HACK //for (int i = 0; i < outSizes[2]; i++) { - for (int i = 0; i < numRecordsFound; i++) { - BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); + try { + for (int i = 0; i < numRecordsFound; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); + } + } catch (IndexOutOfBoundsException e) { + return; } } } From 4832d63e68e0b0483372eee104e6b8ea039212cd Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 13 Sep 2023 16:21:22 -0700 Subject: [PATCH 14/46] Working version with 100 rows and correct data --- cpp/src/gandiva/array_ops.cc | 13 ++- cpp/src/gandiva/gdv_function_stubs.cc | 8 +- cpp/src/gandiva/projector.cc | 1 + java/gandiva/src/main/cpp/jni_common.cc | 8 +- .../arrow/gandiva/evaluator/Projector.java | 107 ++++++++++++++++-- 5 files changed, 116 insertions(+), 21 deletions(-) diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index b957bdebcecab..7e5931c18cd42 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -54,9 +54,9 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, for (int i = 0; i < entry_offsets_len; i++) { //std::cout << "LR going to check " << entry_buf + i << std::endl; //LR TODO - //int32_t entry_len = *(entry_buf + i); + int32_t entry_len = *(entry_buf + i); //coming as int64 for some reason. 
*2 - int32_t entry_len = *(entry_buf + (i * 2)); + //int32_t entry_len = *(entry_buf + (i * 2)); //std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; if (entry_len == contains_data) { return true; @@ -98,15 +98,16 @@ int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int3 int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { - //std::cout << "LR array_int32_remove offset data=" << remove_data << std::endl; + //std::cout << "LR array_int32_remove data=" << remove_data + // << " entry_offsets_len " << entry_offsets_len << std::endl; //LR sizes are HACK int* integers = new int[5]; int j = 0; for (int i = 0; i < entry_offsets_len; i++) { - //std::cout << "LR going to check " << entry_buf + i << std::endl; - int32_t entry_len = *(entry_buf + (i * 2)); - //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + std::cout << "LR going to check " << entry_buf + i << std::endl; + int32_t entry_len = *(entry_buf + (i * 1)); + std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; if (entry_len == remove_data) { continue; } else { diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 876f5b72b9941..c739861fcf492 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -168,15 +168,15 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, auto buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ std::cout << "gdv_fn_populate 2 data_ptr" << data_ptr << " buffer " << buffer << \ - " offset " << offset << " entry_len " << entry_len << " scale " << SCALE << \ - " want to resize to " << (offset + entry_len * SCALE) << std::endl; \ - /*auto status = buffer->Resize(offset + entry_len * SCALE, 
false); \ + " offset " << offset << " entry_len " << entry_len << " scale " \ + << SCALE << " slot " << slot<< std::endl; \ + auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ if (!status.ok()) { \ gandiva::ExecutionContext* context = \ reinterpret_cast(context_ptr); \ context->set_error_msg(status.message().c_str()); \ return -1; \ - } */ \ + } \ std::cout << "gdv_fn_populate resized buffer to =" << offset + entry_len * SCALE << std::endl; \ std::cout << "gdv_fn_populate copying bytes =" << entry_len * SCALE << std::endl; \ std::cout << "gdv_fn_populate buffer =" << buffer->ToString() << " offeset " << offset << std::endl; \ diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 5a5dc7c60869c..d5c0ef6f9d638 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -394,6 +394,7 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, array_data = arrow::ArrayData::Make(array_data->type, array_data->length, array_data->buffers, {new_child_data}, array_data->null_count, array_data->offset); + std::cout << "LR Making array data length " << array_data->length << std::endl; } output->push_back(arrow::MakeArray(array_data)); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 56cb1015fad48..f26763fc0c03f 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -1061,13 +1061,14 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( std::vector> child_buffers; - + data_sz = out_sizes[sz_idx++]; //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( env, jexpander, output_vector_idx, child_offset_buf, data_sz)); + data_sz = 
out_sizes[sz_idx++]; //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); @@ -1138,7 +1139,10 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //out_sizes[1] = 1; //std::cout << "LR jni_common after copy parent buff child array[0] " << - //int32_t( (out_bufs[3])) << std::endl; + //"," << int32_t( (out_bufs[3])) << + //"," << int32_t( (out_bufs[3]+4)) << + //"," << int32_t( (out_bufs[3])+8) << + //"," << int32_t( (out_bufs[3])+12) << std::endl; //LRTest1 End } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index a12569b1a0175..91729b7e519f0 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -29,13 +29,12 @@ import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.ReferenceManager; import org.apache.arrow.vector.BaseVariableWidthVector; -import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.ipc.message.ArrowBuffer; -import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Schema; @@ -412,8 +411,14 @@ private void evaluate(int numRows, List buffers, List buf //vector offset logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); + ((ListVector) valueVector).reAlloc(); + 
((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc(); + //The realloc avoids dynamic resizing, will have to be fixed later. outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); + logger.error("LR Projector.java evaluate ListVector set buffer " + idx + + " as ptr=" + outAddrs[idx - 1] + " size " + outSizes[idx - 1]); //vector data //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).memoryAddress(); @@ -461,29 +466,113 @@ private void evaluate(int numRows, List buffers, List buf int numRecordsFound = 5 * 1000000; //int numRecordsFound = Math.toIntExact(outSizes[3]) / 4; - //logger.error("LR Projector.java using outsizes numRecords=" + numRecordsFound); + //logger.error("LR Projector.java using outsizes numRecords=" + numRecordsFound + " outSizes[3]=" + outSizes[3]); + //LR HACK 9-13 10:34 + /*public void startList() { + vector.startNewValue(idx()); + writer.setPosition(vector.getOffsetBuffer().getInt((idx() + 1L) * OFFSET_WIDTH)); + listStarted = true; + } + + @Override + public void endList() { + vector.getOffsetBuffer().setInt((idx() + 1L) * OFFSET_WIDTH, writer.idx()); + setPosition(idx() + 1); + listStarted = false; + */ + + /*ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + for (int i = 0; i < 50; i++) { + System.out.println("LR arrowbuf=" + Integer.reverseBytes(ab2.getInt(i))); + System.out.println("LR arrowbuf=" + ab2.getInt(i)); + System.out.println("LR arrowbuf=" + ab2.getShort(i)); + System.out.println("LR arrowbuf=" + ab2.getInt(i * 4)); + System.out.println("LR arrowbuf======"); + }*/ + + /*ArrowBuf ab = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); + for (int i = 0; i < 50; i++) { + System.out.println("LR arrowbuf2=" + Integer.reverseBytes(ab.getInt(i))); + System.out.println("LR arrowbuf2=" + 
ab.getInt(i)); + System.out.println("LR arrowbuf2=" + ab.getShort(i)); + }*/ + + + UnionListWriter writer = ((ListVector) valueVector).getWriter(); + for (int i = 0; i < 100; i++) { + writer.startList(); + writer.setPosition(i); + for (int j = 0; j < 5; j++) { + writer.writeInt(ab2.getInt((j + (5 * i)) * 4)); + } + writer.setValueCount(5); + writer.endList(); + } + ((ListVector) valueVector).setValueCount(100); + + + //LR HACK 9-13 10:34 All the multiline comment + /* + import org.apache.arrow.memory.ReferenceManager; + import org.apache.arrow.vector.BitVectorHelper; + import org.apache.arrow.vector.ipc.message.ArrowFieldNode; + */ //ArrowBuf ab0 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + /*ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + ArrowBuf abb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); List outBufsNew = new ArrayList(); + StringBuilder sbb = new StringBuilder(); + abb.print(sbb, 1); + System.out.println("LR abb=" + sbb); + //outBufsNew.add(ab0); - outBufsNew.add(ab); - outBufsNew.add(ab2); + outBufsNew.add(abb); + outBufsNew.add(abb2); ArrowFieldNode afn = new ArrowFieldNode(numRecordsFound, 0); ((ListVector) valueVector).getDataVector().clear(); ((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); + + //LR HACK 9-12 10:09 + //ArrowBuf offBuff = ((ListVector) valueVector).getOffsetBuffer(); + //for (int i = 0; i < 101; i++) { + // offBuff.setInt(i, 5 * i * 4); + //} + + + + + //byte[] valid = new byte[outsizes[2]]; //LR HACK //for (int i = 0; i < outSizes[2]; i++) { + int simple = 0; try { - for (int i = 0; i < numRecordsFound; i++) { + for (int i = 0; i < numRecordsFound * 4; i++) { BitVectorHelper.setBit(((ListVector) 
valueVector).getDataVector().getValidityBuffer(), i); + simple++; + //BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); } } catch (IndexOutOfBoundsException e) { - return; + simple = 0; + } + ArrowBuf ab3 = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); + for (int i = 0; i < 50; i++) { + System.out.println("LR arrowbuf after=" + Integer.reverseBytes(ab3.getInt(i))); + System.out.println("LR arrowbuf after=" + ab3.getInt(i)); + System.out.println("LR arrowbuf after=" + ab3.getShort(i)); + } + ArrowBuf ab3a = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1); + for (int i = 0; i < 50; i++) { + System.out.println("LR arrowbuf aftera=" + Integer.reverseBytes(ab3a.getInt(i))); + System.out.println("LR arrowbuf aftera=" + ab3a.getInt(i)); + System.out.println("LR arrowbuf aftera=" + ab3a.getShort(i)); } + IntVector iv = (IntVector) ((ListVector) valueVector).getDataVector(); + for (int i = 0; i < 50; i++) { + System.out.println("LR IntVector=" + iv.get(i)); + }*/ } } From bcfcd8803485618b1cde909b239069fd67685e04 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 13 Sep 2023 16:21:49 -0700 Subject: [PATCH 15/46] update scripts --- build_release.sh | 3 ++- build_testing.sh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/build_release.sh b/build_release.sh index 533467d0484d0..5afaff588237c 100755 --- a/build_release.sh +++ b/build_release.sh @@ -30,4 +30,5 @@ then fi cd java -/opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Parrow-jni clean install +/opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow/java-dist/lib -Parrow-jni clean install +cp gandiva/target/arrow-gandiva-12.0.1.jar 
/Users/logan.riggs/github/dremio/enterprise/distribution/server/target/dremio-enterprise-24.3.0-SNAPSHOT/dremio-enterprise-24.3.0-SNAPSHOT/jars/3rdparty/ diff --git a/build_testing.sh b/build_testing.sh index e270f3758101c..9604ba5678ff2 100755 --- a/build_testing.sh +++ b/build_testing.sh @@ -35,3 +35,4 @@ fi echo "====JARS====" cd java /opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Parrow-jni clean install +cp java/gandiva/target/arrow-gandiva-12.0.1.jar /Users/logan.riggs/github/dremio/enterprise/distribution/server/target/dremio-enterprise-24.3.0-SNAPSHOT/dremio-enterprise-24.3.0-SNAPSHOT/jars/3rdparty/ From a82ef6a580014ae83d71bdabc33dc5ed72d94eec Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 14 Sep 2023 16:57:22 -0700 Subject: [PATCH 16/46] Using some dynamic sizes --- cpp/src/gandiva/array_ops.cc | 4 +- cpp/src/gandiva/expr_decomposer.cc | 2 +- cpp/src/gandiva/gdv_function_stubs.cc | 8 --- java/gandiva/src/main/cpp/jni_common.cc | 22 ++++-- .../arrow/gandiva/evaluator/JniWrapper.java | 3 +- .../gandiva/evaluator/ListVectorExpander.java | 69 +++++++++++++++++++ .../arrow/gandiva/evaluator/Projector.java | 37 +++++++--- 7 files changed, 118 insertions(+), 27 deletions(-) create mode 100644 java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 7e5931c18cd42..fd04a84986974 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -105,9 +105,9 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int* integers = new int[5]; int j = 0; for (int i = 0; i < entry_offsets_len; i++) { - std::cout << "LR going to check " << entry_buf + i << std::endl; + //std::cout << "LR going to check " << entry_buf + i << std::endl; int32_t entry_len = *(entry_buf + (i * 1)); 
- std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; if (entry_len == remove_data) { continue; } else { diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index ec56d30c51589..72c992df11c7e 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -52,7 +52,7 @@ Status ExprDecomposer::Visit(const FieldNode& node) { //std::cout << "LR ExprDecomposer 3" << std::endl; auto p = std::make_shared(desc); value_dex = p; - int v = p->DataIdx(); + //int v = p->DataIdx(); //std::cout << "LR primitive list type " v << " " << } else { //std::cout << "LR ExprDecomposer 4" << std::endl; diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index c739861fcf492..045b97c698086 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -164,12 +164,8 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ int32_t* offsets, int64_t slot, \ TYPE* entry_buf, int32_t entry_len) { \ - std::cout << "gdv_fn_populate 1 data_ptr is " << data_ptr << std::endl; \ auto buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ - std::cout << "gdv_fn_populate 2 data_ptr" << data_ptr << " buffer " << buffer << \ - " offset " << offset << " entry_len " << entry_len << " scale " \ - << SCALE << " slot " << slot<< std::endl; \ auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ if (!status.ok()) { \ gandiva::ExecutionContext* context = \ @@ -177,11 +173,7 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, context->set_error_msg(status.message().c_str()); \ return -1; \ } \ - std::cout << "gdv_fn_populate resized buffer to 
=" << offset + entry_len * SCALE << std::endl; \ - std::cout << "gdv_fn_populate copying bytes =" << entry_len * SCALE << std::endl; \ - std::cout << "gdv_fn_populate buffer =" << buffer->ToString() << " offeset " << offset << std::endl; \ memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ - std::cout << "gdv_fn_populate buffer after =" << buffer->ToString() << std::endl; \ offsets[slot] = offset / SCALE; \ offsets[slot + 1] = offset / SCALE + entry_len; \ return 0; \ diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index f26763fc0c03f..1b96109f256cc 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -939,7 +939,7 @@ Status JavaResizableBuffer::Resize(const int64_t new_size, bool shrink_to_fit) { JNIEXPORT void JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( - JNIEnv* env, jobject object, jobject jexpander, jlong module_id, jint num_rows, + JNIEnv* env, jobject object, jobject jexpander, jobject jListExpander, jlong module_id, jint num_rows, jlongArray buf_addrs, jlongArray buf_sizes, jint sel_vec_type, jint sel_vec_rows, jlong sel_vec_addr, jlong sel_vec_size, jlongArray out_buf_addrs, jlongArray out_buf_sizes) { @@ -1060,20 +1060,29 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( std::vector> child_buffers; + if (jListExpander == nullptr) { + status = Status::Invalid( + "expression has variable len output columns, but the jListExpander object is " + "null"); + break; + } + + + //LR TODO the two buffers... 
data_sz = out_sizes[sz_idx++]; //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( - env, jexpander, output_vector_idx, child_offset_buf, data_sz)); + env, jListExpander, output_vector_idx, child_offset_buf, data_sz)); data_sz = out_sizes[sz_idx++]; //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( - env, jexpander, output_vector_idx, child_data_buf, data_sz)); + env, jListExpander, output_vector_idx, child_data_buf, data_sz)); std::shared_ptr dt2 = std::make_shared(); auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); @@ -1120,9 +1129,12 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( // << (array_data->child_data[0])->length << std::endl; //LRTest1 Start - int numRecords = 5 * 1000000; + int numRecords = (array_data->child_data[0])->length; //int numRecords = (array_data->child_data[0])->length * array_data->length; - int recordSize = numRecords * 4; + int recordSize = numRecords * 4; //LR TODO HACK + + std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" + << (array_data->child_data[0])->length << " using numRecords=" << numRecords << std::endl; memcpy(&out_bufs[3], (array_data->child_data[0])->buffers[1]->data(), recordSize); out_sizes[3] = recordSize; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java index 293d51a87a5fd..f883ed7081547 100644 --- 
a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java @@ -50,6 +50,7 @@ native long buildProjector(Object cache, byte[] schemaBuf, byte[] exprListBuf, * and store the output in ValueVectors. Throws an exception in case of errors * * @param expander VectorExpander object. Used for callbacks from cpp. + * @param listExpander ListVectorExpander object. Used for callbacks from cpp. * @param moduleId moduleId representing expressions. Created using a call to * buildNativeCode * @param numRows Number of rows in the record batch @@ -63,7 +64,7 @@ native long buildProjector(Object cache, byte[] schemaBuf, byte[] exprListBuf, * @param outSizes The allocated size of the output buffers. On successful evaluation, * the result is stored in the output buffers */ - native void evaluateProjector(Object expander, long moduleId, int numRows, + native void evaluateProjector(Object expander, Object listExpander, long moduleId, int numRows, long[] bufAddrs, long[] bufSizes, int selectionVectorType, int selectionVectorSize, long selectionVectorBufferAddr, long selectionVectorBufferSize, diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java new file mode 100644 index 0000000000000..7019f396b9677 --- /dev/null +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.evaluator; + +import org.apache.arrow.vector.complex.ListVector; + +/** + * This class provides the functionality to expand output vectors using a callback mechanism from + * gandiva. + */ +public class ListVectorExpander { + private final ListVector[] vectors; + + public ListVectorExpander(ListVector[] vectors) { + this.vectors = vectors; + } + + /** + * Result of vector expansion. + */ + public static class ExpandResult { + public long address; + public long capacity; + + public ExpandResult(long address, long capacity) { + this.address = address; + this.capacity = capacity; + } + } + + /** + * Expand vector at specified index. This is used as a back call from jni, and is only + * relevant for ListVectors. + * + * @param index index of buffer in the list passed to jni. + * @param toCapacity the size to which the buffer should be expanded to. + * + * @return address and size of the buffer after expansion. 
+ */ + public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { + if (index >= vectors.length || vectors[index] == null) { + throw new IllegalArgumentException("invalid index " + index); + } + + ListVector vector = vectors[index]; + while (vector.getDataVector().getFieldBuffers().get(0).capacity() < toCapacity) { + vector.reAlloc(); + } + return new ExpandResult( + vector.getDataVector().getFieldBuffers().get(0).memoryAddress(), + vector.getDataVector().getFieldBuffers().get(0).capacity()); + } + +} diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 91729b7e519f0..ae904bb64a9e1 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -355,7 +355,8 @@ private void evaluate(int numRows, List buffers, List buf boolean hasVariableWidthColumns = false; BaseVariableWidthVector[] resizableVectors = new BaseVariableWidthVector[outColumns.size()]; - + ListVector[] resizableListVectors = new ListVector[outColumns.size()]; + long[] outAddrs = new long[3 * outColumns.size()]; long[] outSizes = new long[3 * outColumns.size()]; @@ -393,6 +394,7 @@ private void evaluate(int numRows, List buffers, List buf if (valueVector instanceof ListVector) { hasVariableWidthColumns = true; + resizableListVectors[outColumnIdx] = (ListVector) valueVector; //LR TODO figure out what to use here resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; //resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; //resizeableVectors[outColumnIdx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); @@ -413,7 +415,14 @@ private void evaluate(int numRows, List buffers, List buf logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); ((ListVector) valueVector).reAlloc(); 
((ListVector) valueVector).reAlloc(); - ((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc(); //100 rows + + //This doesnt actually allocate any memory. + //((ListVector) valueVector).setInitialCapacity(1000000); + //while (((ListVector) valueVector).getValueCapacity() < 1000000) { + // ((ListVector) valueVector).reAlloc(); + //} + //The realloc avoids dynamic resizing, will have to be fixed later. outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); @@ -448,6 +457,7 @@ private void evaluate(int numRows, List buffers, List buf logger.error("LR Projector.java evaluate calling evaluateProjector with buffers=" + idx); wrapper.evaluateProjector( hasVariableWidthColumns ? new VectorExpander(resizableVectors) : null, + hasVariableWidthColumns ? new ListVectorExpander(resizableListVectors) : null, this.moduleId, numRows, bufAddrs, bufSizes, selectionVectorType, selectionVectorRecordCount, selectionVectorAddr, selectionVectorSize, @@ -464,9 +474,9 @@ private void evaluate(int numRows, List buffers, List buf if (valueVector instanceof ListVector) { //LR HACK - int numRecordsFound = 5 * 1000000; + //int numRecordsFound = 5 * 100; //int numRecordsFound = Math.toIntExact(outSizes[3]) / 4; - //logger.error("LR Projector.java using outsizes numRecords=" + numRecordsFound + " outSizes[3]=" + outSizes[3]); + //logger.error("LR Projector.java using numRecords=" + numRecordsFound + " outSizes[3]=" + outSizes[3]); //LR HACK 9-13 10:34 /*public void startList() { @@ -482,8 +492,8 @@ public void endList() { listStarted = false; */ - /*ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); - for (int i = 0; i < 50; i++) { + ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + /*for (int i = 0; i < 50; i++) { System.out.println("LR arrowbuf=" + 
Integer.reverseBytes(ab2.getInt(i))); System.out.println("LR arrowbuf=" + ab2.getInt(i)); System.out.println("LR arrowbuf=" + ab2.getShort(i)); @@ -498,18 +508,25 @@ public void endList() { System.out.println("LR arrowbuf2=" + ab.getShort(i)); }*/ - + logger.error("LR Projector.java using numRecords=" + + selectionVectorRecordCount + " outSizes[3]=" + outSizes[3]); UnionListWriter writer = ((ListVector) valueVector).getWriter(); - for (int i = 0; i < 100; i++) { + for (int i = 0; i < selectionVectorRecordCount; i++) { writer.startList(); writer.setPosition(i); for (int j = 0; j < 5; j++) { - writer.writeInt(ab2.getInt((j + (5 * i)) * 4)); + int index = ((j + (5 * i)) * 4); + //Not sure whats going on. Buffer too small? + try { + writer.writeInt(ab2.getInt(index)); + } catch (IndexOutOfBoundsException e) { + continue; + } } writer.setValueCount(5); writer.endList(); } - ((ListVector) valueVector).setValueCount(100); + ((ListVector) valueVector).setValueCount(selectionVectorRecordCount); //LR HACK 9-13 10:34 All the multiline comment From 5b38893f8cec1fb9d096c16937c481f4de8afd1a Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Tue, 19 Sep 2023 16:27:15 -0700 Subject: [PATCH 17/46] Working with 1million rows --- cpp/src/gandiva/gdv_function_stubs.cc | 2 + cpp/src/gandiva/projector.cc | 4 +- java/gandiva/src/main/cpp/jni_common.cc | 90 +++++++++++++++---- .../gandiva/evaluator/ListVectorExpander.java | 2 + .../arrow/gandiva/evaluator/Projector.java | 13 ++- 5 files changed, 88 insertions(+), 23 deletions(-) diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 045b97c698086..a82ba23974c09 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -173,6 +173,8 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, context->set_error_msg(status.message().c_str()); \ return -1; \ } \ + std::cout << "LR populate_list slot " << slot << " offset = " << offset << " 
buffer = " << \ + (int64_t)(buffer->mutable_data() + offset) << std::endl; \ memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ offsets[slot] = offset / SCALE; \ offsets[slot + 1] = offset / SCALE + entry_len; \ diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index d5c0ef6f9d638..97f28f652ea22 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -291,7 +291,7 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, // Create and return array arrays. - for (auto& array_data : output_data_vecs) { + /* for (auto& array_data : output_data_vecs) { if (array_data->type->id() == arrow::Type::LIST) { auto child_data = array_data->child_data[0]; @@ -329,7 +329,7 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, // array_data->null_count, array_data->offset); } - } + }*/ return Status::OK(); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 1b96109f256cc..32ecf6beee1f8 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -82,8 +82,10 @@ jclass configuration_builder_class_; // refs for self. 
static jclass gandiva_exception_; static jclass vector_expander_class_; +static jclass listvector_expander_class_; static jclass vector_expander_ret_class_; static jmethodID vector_expander_method_; +static jmethodID listvector_expander_method_; static jfieldID vector_expander_ret_address_; static jfieldID vector_expander_ret_capacity_; @@ -125,6 +127,15 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { vector_expander_class_, "expandOutputVectorAtIndex", "(IJ)Lorg/apache/arrow/gandiva/evaluator/VectorExpander$ExpandResult;"); + jclass local_listexpander_class = + env->FindClass("org/apache/arrow/gandiva/evaluator/ListVectorExpander"); + listvector_expander_class_ = (jclass)env->NewGlobalRef(local_listexpander_class); + env->DeleteLocalRef(local_listexpander_class); + + listvector_expander_method_ = env->GetMethodID( + listvector_expander_class_, "expandOutputVectorAtIndex", + "(IJ)Lorg/apache/arrow/gandiva/evaluator/ListVectorExpander$ExpandResult;"); + jclass local_expander_ret_class = env->FindClass("org/apache/arrow/gandiva/evaluator/VectorExpander$ExpandResult"); vector_expander_ret_class_ = (jclass)env->NewGlobalRef(local_expander_ret_class); @@ -164,6 +175,7 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { env->DeleteGlobalRef(configuration_builder_class_); env->DeleteGlobalRef(gandiva_exception_); env->DeleteGlobalRef(vector_expander_class_); + env->DeleteGlobalRef(vector_expander_class_); env->DeleteGlobalRef(vector_expander_ret_class_); env->DeleteGlobalRef(secondary_cache_class_); env->DeleteGlobalRef(cache_buf_ret_class_); @@ -871,12 +883,13 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build /// class JavaResizableBuffer : public arrow::ResizableBuffer { public: - JavaResizableBuffer(JNIEnv* env, jobject jexpander, int32_t vector_idx, uint8_t* buffer, + JavaResizableBuffer(JNIEnv* env, jobject jexpander, jmethodID jmethod, int32_t vector_idx, uint8_t* buffer, int32_t len) : ResizableBuffer(buffer, len), env_(env), 
jexpander_(jexpander), - vector_idx_(vector_idx) { + vector_idx_(vector_idx), + method_(jmethod) { size_ = 0; } @@ -887,21 +900,21 @@ class JavaResizableBuffer : public arrow::ResizableBuffer { private: JNIEnv* env_; jobject jexpander_; + jmethodID method_; int32_t vector_idx_; }; Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { // callback into java to expand the buffer - jobject ret = env_->CallObjectMethod(jexpander_, vector_expander_method_, vector_idx_, + + //LR TODO listvector_expander_method_ vector_expander_method_ + jobject ret = env_->CallObjectMethod(jexpander_, method_, vector_idx_, new_capacity); - std::cout << "Buffer expand: New capacity is " << new_capacity << - " vector id " << vector_idx_ << " expander method " << vector_expander_method_ << - " jexpander_ " << jexpander_ << std::endl; if (env_->ExceptionCheck()) { env_->ExceptionDescribe(); env_->ExceptionClear(); std::cout << "Buffer expand failed. New capacity is " << new_capacity << - " vector id " << vector_idx_ << " expander method " << vector_expander_method_ << + " vector id " << vector_idx_ << " expander method " << method_ << " jexpander_ " << jexpander_ << std::endl; return Status::OutOfMemory("buffer expand failed in java."); } @@ -909,8 +922,14 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); + std::cout << "Buffer expand: New capacity is " << new_capacity << + " vector id " << vector_idx_ << " expander method " << method_ << + " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << + " and the original buffer ptr=" << data_ << " and the new ptr=" << ret_address << std::endl; + data_ = reinterpret_cast(ret_address); capacity_ = ret_capacity; + return Status::OK(); } @@ -984,6 +1003,15 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( << in_batch->ToString() << " 
there are " << out_bufs_len << " buffers " << std::endl;*/ + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + << " there are " << out_bufs_len << " buffers " + << std::endl; + for (int i = 0; i < out_bufs_len; i++) { + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + << " buffer " << i + << "length " << out_sizes[i] + << std::endl; + } std::shared_ptr selection_vector; auto selection_buffer = std::make_shared( @@ -1047,11 +1075,11 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer size=" << data_sz << std::endl; buffers.push_back(std::make_shared( - env, jexpander, output_vector_idx, value_buf, data_sz)); + env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else if (field->type()->id() == arrow::Type::LIST) { //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding buffer size=" << data_sz << std::endl; buffers.push_back(std::make_shared( - env, jexpander, output_vector_idx, value_buf, data_sz)); + env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else { buffers.push_back(std::make_shared(value_buf, data_sz)); } @@ -1071,18 +1099,22 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //LR TODO the two buffers... 
data_sz = out_sizes[sz_idx++]; - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer size=" << data_sz << std::endl; + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer " << buf_idx + << " size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( - env, jListExpander, output_vector_idx, child_offset_buf, data_sz)); + env, jListExpander, listvector_expander_method_, output_vector_idx, child_offset_buf, data_sz)); + + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer " << buf_idx + << " size=" << out_sizes[sz_idx] << " outsize index=" << sz_idx << " address " << out_bufs[buf_idx] + << " output_vector_idx=" << output_vector_idx << std::endl; data_sz = out_sizes[sz_idx++]; - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( - env, jListExpander, output_vector_idx, child_data_buf, data_sz)); + env, jListExpander, listvector_expander_method_, output_vector_idx, child_data_buf, data_sz)); std::shared_ptr dt2 = std::make_shared(); auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); @@ -1135,15 +1167,37 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" << (array_data->child_data[0])->length << " using numRecords=" << numRecords << std::endl; + std::cout << "LR jni_common out_bufs[3]=" << out_bufs[3] << " after eval=" + << (jlong)(array_data->child_data[0])->buffers[1]->data() << std::endl; 
+ //LR test1 + out_bufs[3] = (jlong)(array_data->child_data[0])->buffers[1]->data(); + out_sizes[3] = (jlong)(array_data->child_data[0])->buffers[1]->capacity(); + + //Copy the new buffer ptr back to Java. The above two lines don't copy it to java, just to the local array. + env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); + env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); + //env->ReleaseLongArrayElements(out_buf_addrs, out_bufs, JNI_ABORT); + //memcpy((void*)out_bufs[3], (array_data->child_data[0])->buffers[1]->data(), recordSize); + //out_sizes[3] = recordSize; + //int test[] = {42,21,42,21,42}; + //memcpy((void *)out_bufs[3], test, 20); + - memcpy(&out_bufs[3], (array_data->child_data[0])->buffers[1]->data(), recordSize); - out_sizes[3] = recordSize; + + //std::cout << "LR jni_common the (validity)? buffer has size=" << out_sizes[2] << " and the first thing is " + //<< out_bufs[2] << " and the second is " << (out_bufs[2])[1] std::endl; //validity buffer? - bool valid[] = {true, true, true, true, true}; - memcpy(&out_bufs[2], valid, 5); - out_sizes[2] = 5; + //bool valid[] = {true, true, true, true, true}; + //memcpy(&out_bufs[2], valid, 5); + //out_sizes[2] = 5; + + + + + + //offset buffer is not needed. //int32_t offsetsBuffer[] = {0}; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java index 7019f396b9677..85ff261e3d85e 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -61,6 +61,8 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { while (vector.getDataVector().getFieldBuffers().get(0).capacity() < toCapacity) { vector.reAlloc(); } + System.out.println("LR Expanding ListVector. 
New capacity=" + + vector.getDataVector().getFieldBuffers().get(0).capacity()); return new ExpandResult( vector.getDataVector().getFieldBuffers().get(0).memoryAddress(), vector.getDataVector().getFieldBuffers().get(0).capacity()); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index ae904bb64a9e1..c412b5f1baae1 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -413,10 +413,13 @@ private void evaluate(int numRows, List buffers, List buf //vector offset logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); - ((ListVector) valueVector).reAlloc(); + /*((ListVector) valueVector).reAlloc(); ((ListVector) valueVector).reAlloc(); ((ListVector) valueVector).reAlloc(); //100 rows - + ((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc(); + */ + //This doesnt actually allocate any memory. //((ListVector) valueVector).setInitialCapacity(1000000); //while (((ListVector) valueVector).getValueCapacity() < 1000000) { @@ -455,6 +458,7 @@ private void evaluate(int numRows, List buffers, List buf } logger.error("LR Projector.java evaluate calling evaluateProjector with buffers=" + idx); + logger.error("LR Projector.java before evaluateProjector buffer[3]=" + outAddrs[3]); wrapper.evaluateProjector( hasVariableWidthColumns ? new VectorExpander(resizableVectors) : null, hasVariableWidthColumns ? 
new ListVectorExpander(resizableListVectors) : null, @@ -469,7 +473,7 @@ private void evaluate(int numRows, List buffers, List buf //System.out.println(intVector.getDataVector()); - //logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3][3 * 4]); + logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3]); for (ValueVector valueVector : outColumns) { if (valueVector instanceof ListVector) { //LR HACK @@ -492,6 +496,8 @@ public void endList() { listStarted = false; */ + //ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); /*for (int i = 0; i < 50; i++) { System.out.println("LR arrowbuf=" + Integer.reverseBytes(ab2.getInt(i))); @@ -519,6 +525,7 @@ public void endList() { //Not sure whats going on. Buffer too small? try { writer.writeInt(ab2.getInt(index)); + //writer.writeInt(42); } catch (IndexOutOfBoundsException e) { continue; } From 2f33cda9ce41c7dcae893b37445d0b8d72bf7b76 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 27 Sep 2023 12:49:36 -0700 Subject: [PATCH 18/46] Everything working. 
--- cpp/src/arrow/buffer.h | 15 +- cpp/src/gandiva/annotator.cc | 8 +- cpp/src/gandiva/array_ops.cc | 32 ++- cpp/src/gandiva/decimal_ir.h | 2 +- cpp/src/gandiva/gdv_function_stubs.cc | 10 +- cpp/src/gandiva/llvm_generator.cc | 5 +- java/gandiva/src/main/cpp/jni_common.cc | 131 +++++++++--- .../gandiva/evaluator/ListVectorExpander.java | 33 ++- .../arrow/gandiva/evaluator/Projector.java | 192 +++++++++++++++--- 9 files changed, 354 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 9270c4dea3fb6..66da004c2beb4 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -444,10 +444,21 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { return Reserve(sizeof(T) * new_nb_elements); } + public: + uint8_t* offsetBuffer; + int64_t offsetCapacity; + protected: - ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} + ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) { + offsetBuffer = nullptr; + offsetCapacity = 0; + + } ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr mm) - : MutableBuffer(data, size, std::move(mm)) {} + : MutableBuffer(data, size, std::move(mm)) { + offsetBuffer = nullptr; + offsetCapacity = 0; + } }; /// \defgroup buffer-allocation-functions Functions for allocating buffers diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index 2d91ba43ab435..4cc0e1dc29bb8 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -53,7 +53,7 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { } if (field->type()->id() == arrow::Type::LIST) { - //std::cout << "LR Annotator::MakeDesc 1" << std::endl; + std::cout << "LR Annotator::MakeDesc 1" << std::endl; offsets_idx = buffer_count_++; if (arrow::is_binary_like(field->type()->field(0)->type()->id())) { child_offsets_idx = buffer_count_++; @@ -91,7 +91,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, if 
(desc.HasOffsetsIdx()) { uint8_t* offsets_buf = const_cast(array_data.buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -4 " << &offsets_buf << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -4 " << &offsets_buf << " using idx=" << buffer_idx << std::endl; eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset); if (desc.HasChildOffsetsIdx()) { @@ -139,7 +139,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, uint8_t* data_buf = const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 0 " << &data_buf << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting offset eval buffer idx=" << buffer_idx << " data=" << &data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); //std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; } @@ -158,7 +158,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // list data buffer is in child data buffer uint8_t* data_buf_ptr = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 2 " << &data_buf_ptr << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval data buffer " << buffer_idx << " data=" << &data_buf_ptr << std::endl; eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.child_data.at(0)->offset); diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index fd04a84986974..0cbac7942bb06 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -95,7 +95,7 @@ int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int3 //return reinterpret_cast(ret); return reinterpret_cast(ret); } - +/* int32_t* 
array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { //std::cout << "LR array_int32_remove data=" << remove_data @@ -125,6 +125,36 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, //return reinterpret_cast(ret); return reinterpret_cast(ret); } +*/ +int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { + //std::cout << "LR array_int32_remove data=" << remove_data + // << " entry_offsets_len " << entry_offsets_len << std::endl; + + std::vector newInts; + + for (int i = 0; i < entry_offsets_len; i++) { + //std::cout << "LR going to check " << entry_buf + i << std::endl; + int32_t entry_item = *(entry_buf + (i * 1)); + //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + if (entry_item == remove_data) { + continue; + } else { + newInts.push_back(entry_item); + } + } + + *out_len = newInts.size(); + int32_t outBufferLength = *out_len * sizeof(int); + //length is number of items, but buffers must account for byte size. 
+ uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); + memcpy(ret, newInts.data(), outBufferLength); + //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + + + //return reinterpret_cast(ret); + return reinterpret_cast(ret); +} int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len) { diff --git a/cpp/src/gandiva/decimal_ir.h b/cpp/src/gandiva/decimal_ir.h index 1a7cad7107036..b11730f1e231e 100644 --- a/cpp/src/gandiva/decimal_ir.h +++ b/cpp/src/gandiva/decimal_ir.h @@ -29,7 +29,7 @@ namespace gandiva { class DecimalIR : public FunctionIRBuilder { public: explicit DecimalIR(Engine* engine) - : FunctionIRBuilder(engine), enable_ir_traces_(true) {} + : FunctionIRBuilder(engine), enable_ir_traces_(false) {} /// Build decimal IR functions and add them to the engine. static Status AddFunctions(Engine* engine); diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index a82ba23974c09..3e506f83a33c3 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -173,14 +173,20 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, context->set_error_msg(status.message().c_str()); \ return -1; \ } \ - std::cout << "LR populate_list slot " << slot << " offset = " << offset << " buffer = " << \ - (int64_t)(buffer->mutable_data() + offset) << std::endl; \ memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ + std::cout << "LR gdv_fn_populate buffer=" << buffer->data() << std::endl; \ + std::cout << " and offset=" << offsets << " * =" << *offsets << std::endl; \ + std::cout << "Setting offset slot=" << slot << "=" << offset / SCALE << std::endl; \ + std::cout << "Setting offset slot+1=" << slot + 1 << "=" << offset / SCALE + entry_len << std::endl; \ + offsets = reinterpret_cast(buffer->offsetBuffer); \ 
offsets[slot] = offset / SCALE; \ offsets[slot + 1] = offset / SCALE + entry_len; \ return 0; \ } + //buffer->offsetBuffer[slot] = offset / SCALE; + //buffer->offsetBuffer[slot + 1] = offset / SCALE + entry_len; + POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) POPULATE_NUMERIC_LIST_TYPE_VECTOR(int64_t, 8) POPULATE_NUMERIC_LIST_TYPE_VECTOR(float, 4) diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index b3de4ac524387..a97c8b02b07ac 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -57,7 +57,7 @@ namespace gandiva { } }*/ -LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(true) {} +LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(false) {} Status LLVMGenerator::Make(std::shared_ptr config, bool cached, std::unique_ptr* llvm_generator) { @@ -472,7 +472,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, output_value->length()->print(output2); - //std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," + std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," + << output_offset_ref << "," << loop_var << std::endl; // << output_offset_ref << "," << loop_var << "[[" << str1 << "]] [[" << str2 << "]]" << std::endl; AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 32ecf6beee1f8..1f647e0e2797b 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -84,10 +84,15 @@ static jclass gandiva_exception_; static jclass vector_expander_class_; static jclass listvector_expander_class_; static jclass vector_expander_ret_class_; +static jclass 
list_expander_ret_class_; static jmethodID vector_expander_method_; static jmethodID listvector_expander_method_; static jfieldID vector_expander_ret_address_; static jfieldID vector_expander_ret_capacity_; +static jfieldID list_expander_ret_address_; +static jfieldID list_expander_ret_capacity_; +static jfieldID list_expander_offset_ret_address_; +static jfieldID list_expander_offset_ret_capacity_; static jclass secondary_cache_class_; static jmethodID cache_get_method_; @@ -141,11 +146,25 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { vector_expander_ret_class_ = (jclass)env->NewGlobalRef(local_expander_ret_class); env->DeleteLocalRef(local_expander_ret_class); + jclass local_list_expander_ret_class = + env->FindClass("org/apache/arrow/gandiva/evaluator/ListVectorExpander$ExpandResult"); + list_expander_ret_class_ = (jclass)env->NewGlobalRef(local_list_expander_ret_class); + env->DeleteLocalRef(local_list_expander_ret_class); + vector_expander_ret_address_ = env->GetFieldID(vector_expander_ret_class_, "address", "J"); vector_expander_ret_capacity_ = env->GetFieldID(vector_expander_ret_class_, "capacity", "J"); + list_expander_ret_address_ = + env->GetFieldID(list_expander_ret_class_, "address", "J"); + list_expander_ret_capacity_ = + env->GetFieldID(list_expander_ret_class_, "capacity", "J"); + list_expander_offset_ret_address_ = + env->GetFieldID(list_expander_ret_class_, "offsetaddress", "J"); + list_expander_offset_ret_capacity_ = + env->GetFieldID(list_expander_ret_class_, "offsetcapacity", "J"); + jclass local_cache_class = env->FindClass("org/apache/arrow/gandiva/evaluator/JavaSecondaryCacheInterface"); secondary_cache_class_ = (jclass)env->NewGlobalRef(local_cache_class); @@ -175,8 +194,9 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { env->DeleteGlobalRef(configuration_builder_class_); env->DeleteGlobalRef(gandiva_exception_); env->DeleteGlobalRef(vector_expander_class_); - env->DeleteGlobalRef(vector_expander_class_); + 
env->DeleteGlobalRef(listvector_expander_class_); env->DeleteGlobalRef(vector_expander_ret_class_); + env->DeleteGlobalRef(list_expander_ret_class_); env->DeleteGlobalRef(secondary_cache_class_); env->DeleteGlobalRef(cache_buf_ret_class_); } @@ -884,12 +904,14 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build class JavaResizableBuffer : public arrow::ResizableBuffer { public: JavaResizableBuffer(JNIEnv* env, jobject jexpander, jmethodID jmethod, int32_t vector_idx, uint8_t* buffer, - int32_t len) + int32_t len, bool isListVec = false) : ResizableBuffer(buffer, len), env_(env), jexpander_(jexpander), vector_idx_(vector_idx), - method_(jmethod) { + method_(jmethod), + isList(isListVec) + { size_ = 0; } @@ -897,11 +919,12 @@ class JavaResizableBuffer : public arrow::ResizableBuffer { Status Reserve(const int64_t new_capacity) override; - private: + public: JNIEnv* env_; jobject jexpander_; jmethodID method_; int32_t vector_idx_; + bool isList; }; Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { @@ -919,16 +942,36 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { return Status::OutOfMemory("buffer expand failed in java."); } - jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); - jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); - std::cout << "Buffer expand: New capacity is " << new_capacity << + if (isList) { + jlong ret_address = env_->GetLongField(ret, list_expander_ret_address_); + jlong ret_capacity = env_->GetLongField(ret, list_expander_ret_capacity_); + jlong offset_ret_address = env_->GetLongField(ret, list_expander_offset_ret_address_); + jlong offset_ret_capacity = env_->GetLongField(ret, list_expander_offset_ret_capacity_); + + std::cout << "Buffer expand: New capacity is " << new_capacity << " vector id " << vector_idx_ << " expander method " << method_ << " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << - " and 
the original buffer ptr=" << data_ << " and the new ptr=" << ret_address << std::endl; + " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << + " and the original offset ptr=" << reinterpret_cast(offsetBuffer) << " and the new ptr=" << offset_ret_address << std::endl; - data_ = reinterpret_cast(ret_address); - capacity_ = ret_capacity; + data_ = reinterpret_cast(ret_address); + capacity_ = ret_capacity; + + offsetBuffer = reinterpret_cast(offset_ret_address); + offsetCapacity = offset_ret_capacity; + } else { + jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); + jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); + + std::cout << "Buffer expand: New capacity is " << new_capacity << + " vector id " << vector_idx_ << " expander method " << method_ << + " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << + " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << std::endl; + + data_ = reinterpret_cast(ret_address); + capacity_ = ret_capacity; + } return Status::OK(); } @@ -1039,6 +1082,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } + std::shared_ptr outBufJava = nullptr; auto ret_types = holder->rettypes(); ArrayDataVector output; int buf_idx = 0; @@ -1047,14 +1091,14 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( for (FieldPtr field : ret_types) { std::vector> buffers; - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer" << std::endl; + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* validity_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong bitmap_sz = out_sizes[sz_idx++]; buffers.push_back(std::make_shared(validity_buf, bitmap_sz)); if 
(arrow::is_binary_like(field->type()->id())) { - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding buffer" << std::endl; + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding bufferbuffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* offsets_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong offsets_sz = out_sizes[sz_idx++]; @@ -1064,7 +1108,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* value_buf = reinterpret_cast(out_bufs[buf_idx++]); - jlong data_sz = out_sizes[sz_idx++] * 1000; + jlong data_sz = out_sizes[sz_idx++]; if (arrow::is_binary_like(field->type()->id())) { if (jexpander == nullptr) { status = Status::Invalid( @@ -1073,17 +1117,20 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer size=" << data_sz << std::endl; + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; buffers.push_back(std::make_shared( env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else if (field->type()->id() == arrow::Type::LIST) { - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding buffer size=" << data_sz << std::endl; - buffers.push_back(std::make_shared( + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding list offset buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; + std::cout << " size=" << out_sizes[sz_idx - 1] << " outsize index=" << sz_idx - 1 << " address " << out_bufs[buf_idx - 1] + << " output_vector_idx=" << output_vector_idx << std::endl; + 
buffers.push_back(std::make_shared( env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else { buffers.push_back(std::make_shared(value_buf, data_sz)); } + if (field->type()->id() == arrow::Type::LIST) { std::vector> child_buffers; @@ -1099,7 +1146,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //LR TODO the two buffers... data_sz = out_sizes[sz_idx++]; - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding buffer " << buf_idx + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding child nbuffer " << buf_idx << " size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); @@ -1107,14 +1154,18 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( env, jListExpander, listvector_expander_method_, output_vector_idx, child_offset_buf, data_sz)); - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding buffer " << buf_idx + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding child buffer " << buf_idx << " size=" << out_sizes[sz_idx] << " outsize index=" << sz_idx << " address " << out_bufs[buf_idx] << " output_vector_idx=" << output_vector_idx << std::endl; data_sz = out_sizes[sz_idx++]; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); - child_buffers.push_back(std::make_shared( - env, jListExpander, listvector_expander_method_, output_vector_idx, child_data_buf, data_sz)); + + outBufJava = std::make_shared( + env, jListExpander, listvector_expander_method_, output_vector_idx, child_data_buf, data_sz, true); + outBufJava->offsetBuffer = reinterpret_cast(out_bufs[1]); + outBufJava->offsetCapacity = out_sizes[1]; + child_buffers.push_back(outBufJava); std::shared_ptr dt2 
= std::make_shared(); auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); @@ -1163,29 +1214,51 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //LRTest1 Start int numRecords = (array_data->child_data[0])->length; //int numRecords = (array_data->child_data[0])->length * array_data->length; - int recordSize = numRecords * 4; //LR TODO HACK - std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" - << (array_data->child_data[0])->length << " using numRecords=" << numRecords << std::endl; - std::cout << "LR jni_common out_bufs[3]=" << out_bufs[3] << " after eval=" - << (jlong)(array_data->child_data[0])->buffers[1]->data() << std::endl; + //std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" + // << (array_data->child_data[0])->length << " using numRecords=" << numRecords << std::endl; + //std::cout << "LR jni_common out_bufs[3]=" << out_bufs[3] << " after eval=" + // << (jlong)(array_data->child_data[0])->buffers[1]->data() << std::endl; //LR test1 out_bufs[3] = (jlong)(array_data->child_data[0])->buffers[1]->data(); out_sizes[3] = (jlong)(array_data->child_data[0])->buffers[1]->capacity(); //Copy the new buffer ptr back to Java. The above two lines don't copy it to java, just to the local array. 
- env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); - env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); + //env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); + //env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); + + //array_data.child_data.at(0)->offset) + //env->ReleaseLongArrayElements(out_buf_addrs, out_bufs, JNI_ABORT); //memcpy((void*)out_bufs[3], (array_data->child_data[0])->buffers[1]->data(), recordSize); //out_sizes[3] = recordSize; //int test[] = {42,21,42,21,42}; //memcpy((void *)out_bufs[3], test, 20); + /*out_sizes[2] = numRecords * 20; + int test[numRecords * 20]; + for (int i = 0; i < numRecords; i++) { + test[i] = 0; + } + memcpy((void *)out_bufs[2], test, numRecords*4); + */ + + //LR test1 Havent tried yet. + //out_bufs[2] = (jlong)(array_data->child_data[0])->buffers[0]->data(); + //out_sizes[2] = (jlong)(array_data->child_data[0])->buffers[0]->capacity(); + + //out_bufs[1] = (jlong)(array_data->child_data[0])->buffers[0]->data(); + //out_sizes[1] = (jlong)(array_data->child_data[0])->buffers[0]->capacity(); + + //out_bufs[1] = (jlong)(array_data)->buffers[0]->data(); + //out_sizes[1] = (jlong)(array_data)->buffers[0]->capacity(); + out_bufs[1] = (jlong) outBufJava->offsetBuffer; + out_sizes[1] = (jlong) outBufJava->offsetCapacity; + + env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); + env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); - //std::cout << "LR jni_common the (validity)? buffer has size=" << out_sizes[2] << " and the first thing is " - //<< out_bufs[2] << " and the second is " << (out_bufs[2])[1] std::endl; //validity buffer? 
diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java index 85ff261e3d85e..c14d2e810e83b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -17,6 +17,7 @@ package org.apache.arrow.gandiva.evaluator; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.vector.complex.ListVector; /** @@ -36,10 +37,22 @@ public ListVectorExpander(ListVector[] vectors) { public static class ExpandResult { public long address; public long capacity; + public long offsetaddress; + public long offsetcapacity; - public ExpandResult(long address, long capacity) { + /** + * fdsfsdfds. + * @param address dsfds + * @param capacity dfsdf + * @param offsetad dsfdsfsd + * @param offsetcap dfsfs + * + */ + public ExpandResult(long address, long capacity, long offsetad, long offsetcap) { this.address = address; this.capacity = capacity; + this.offsetaddress = offsetad; + this.offsetcapacity = offsetcap; } } @@ -57,15 +70,25 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { throw new IllegalArgumentException("invalid index " + index); } + int valueBufferIndex = 1; ListVector vector = vectors[index]; - while (vector.getDataVector().getFieldBuffers().get(0).capacity() < toCapacity) { + while (vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity() < toCapacity) { vector.reAlloc(); } System.out.println("LR Expanding ListVector. New capacity=" + - vector.getDataVector().getFieldBuffers().get(0).capacity()); + vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity()); + System.out.println("LR Expanding ListVector. 
Offset data is "); + ArrowBuf ab = vector.getOffsetBuffer(); + String s = "offsetBuffer = ["; + for (int i = 0; i < 20; i++) { + s += ab.getInt(i) + ","; + } + System.out.println(s); return new ExpandResult( - vector.getDataVector().getFieldBuffers().get(0).memoryAddress(), - vector.getDataVector().getFieldBuffers().get(0).capacity()); + vector.getDataVector().getFieldBuffers().get(valueBufferIndex).memoryAddress(), + vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity(), + vector.getOffsetBuffer().memoryAddress(), + vector.getOffsetBuffer().capacity()); } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index c412b5f1baae1..89321f5911ad6 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -27,13 +27,12 @@ import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.gandiva.ipc.GandivaTypes.SelectionVectorType; import org.apache.arrow.memory.ArrowBuf; -import org.apache.arrow.memory.ReferenceManager; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; -import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.ipc.message.ArrowBuffer; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Schema; @@ -380,6 +379,7 @@ private void evaluate(int numRows, List buffers, List buf outAddrs[idx] = valueVector.getValidityBuffer().memoryAddress(); outSizes[idx++] = valueVector.getValidityBuffer().capacity(); if (isVarWidth) { + logger.error("LR Projector.java evaluate 
isVarWidth setting buffer=" + idx); outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); hasVariableWidthColumns = true; @@ -393,6 +393,12 @@ private void evaluate(int numRows, List buffers, List buf } if (valueVector instanceof ListVector) { + /*((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc(); //100 rows + ((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc();*/ + hasVariableWidthColumns = true; resizableListVectors[outColumnIdx] = (ListVector) valueVector; //LR TODO figure out what to use here resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; @@ -403,22 +409,20 @@ private void evaluate(int numRows, List buffers, List buf logger.error("LR Projector.java evaluate ListVector has buffers=" + fieldBufs.size()); + logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); //vector valid + logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); outAddrs[idx] = ((ListVector) valueVector).getDataVector().getValidityBufferAddress(); outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); //vector offset logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); - /*((ListVector) valueVector).reAlloc(); - ((ListVector) valueVector).reAlloc(); - ((ListVector) valueVector).reAlloc(); //100 rows - ((ListVector) valueVector).reAlloc(); - ((ListVector) valueVector).reAlloc(); - */ + + //This doesnt actually allocate any memory. 
//((ListVector) valueVector).setInitialCapacity(1000000); @@ -426,11 +430,12 @@ private void evaluate(int numRows, List buffers, List buf // ((ListVector) valueVector).reAlloc(); //} + logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); //The realloc avoids dynamic resizing, will have to be fixed later. - outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); - outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); - logger.error("LR Projector.java evaluate ListVector set buffer " + idx + - " as ptr=" + outAddrs[idx - 1] + " size " + outSizes[idx - 1]); + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).capacity(); + //logger.error("LR Projector.java evaluate ListVector set buffer " + idx + + // " as ptr=" + outAddrs[idx - 1] + " size " + outSizes[idx - 1]); //vector data //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).memoryAddress(); @@ -459,6 +464,7 @@ private void evaluate(int numRows, List buffers, List buf logger.error("LR Projector.java evaluate calling evaluateProjector with buffers=" + idx); logger.error("LR Projector.java before evaluateProjector buffer[3]=" + outAddrs[3]); + logger.error("LR Projector.java before evaluateProjector buffer[1]=" + outAddrs[1]); wrapper.evaluateProjector( hasVariableWidthColumns ? new VectorExpander(resizableVectors) : null, hasVariableWidthColumns ? 
new ListVectorExpander(resizableListVectors) : null, @@ -474,6 +480,7 @@ private void evaluate(int numRows, List buffers, List buf logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3]); + logger.error("LR Projector.java after evaluateProjector buffer[1]=" + outAddrs[1]); for (ValueVector valueVector : outColumns) { if (valueVector instanceof ListVector) { //LR HACK @@ -498,25 +505,14 @@ public void endList() { //ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); - /*for (int i = 0; i < 50; i++) { - System.out.println("LR arrowbuf=" + Integer.reverseBytes(ab2.getInt(i))); - System.out.println("LR arrowbuf=" + ab2.getInt(i)); - System.out.println("LR arrowbuf=" + ab2.getShort(i)); - System.out.println("LR arrowbuf=" + ab2.getInt(i * 4)); - System.out.println("LR arrowbuf======"); - }*/ - /*ArrowBuf ab = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); - for (int i = 0; i < 50; i++) { - System.out.println("LR arrowbuf2=" + Integer.reverseBytes(ab.getInt(i))); - System.out.println("LR arrowbuf2=" + ab.getInt(i)); - System.out.println("LR arrowbuf2=" + ab.getShort(i)); - }*/ + //ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); logger.error("LR Projector.java using numRecords=" + selectionVectorRecordCount + " outSizes[3]=" + outSizes[3]); - UnionListWriter writer = ((ListVector) valueVector).getWriter(); + + //import org.apache.arrow.vector.complex.impl.UnionListWriter; + /*UnionListWriter writer = ((ListVector) valueVector).getWriter(); for (int i = 0; i < selectionVectorRecordCount; i++) { writer.startList(); writer.setPosition(i); @@ -533,7 +529,147 @@ public void endList() { writer.setValueCount(5); writer.endList(); } - ((ListVector) valueVector).setValueCount(selectionVectorRecordCount); + ((ListVector) valueVector).setValueCount(selectionVectorRecordCount);*/ + 
+ + //offsetBuffer = [0,83886080,327680,1280,5,167772160,655360,2560,10,251658240,983040,3840,15, + //335544320,1310720,5120,20, + //419430400,1638400,6400,25,503316480,1966080,7680,30,587202560,2293760,8960,35,671088640,2621440,10240,40, + //754974720,2949120,11520, + + + + + + + + + + + + String s = ""; + List fv = ((ListVector) valueVector).getDataVector().getFieldBuffers(); + for (ArrowBuf ab : fv) { + s = ""; + for (int i = 0; i < 20; i++) { + s += ab.getInt(i) + ","; + } + logger.error("LR Projector.java before updating listvector. size=" + + ab.capacity() + " buffer=" + s); + } + + ArrowBuf fvv = ((ListVector) valueVector).getValidityBuffer(); + s = ""; + for (int i = 0; i < 20; i++) { + s += fvv.getInt(i) + ","; + } + logger.error("LR Projector.java before updating listvector. getValidityBuffer=" + + fvv.capacity() + " buffer=" + s); + + ArrowBuf fvvv = ((ListVector) valueVector).getOffsetBuffer(); + s = ""; + for (int i = 0; i < 20; i++) { + s += fvvv.getInt(i) + ","; + } + logger.error("LR Projector.java before updating listvector. getOffsetBuffer=" + + fvvv.capacity() + " buffer=" + s); + + + ((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount * 5); + + ((ListVector) valueVector).setLastSet(selectionVectorRecordCount - 1); + /* + //Validity then data. + ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + ArrowBuf abb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + List outBufsNew = new ArrayList(); + + //outBufsNew.add(ab0); + outBufsNew.add(abb); + outBufsNew.add(abb2); + ArrowFieldNode afn = new ArrowFieldNode(selectionVectorRecordCount * 5, 0); + ((ListVector) valueVector).getDataVector().clear(); + ((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); + + //TODO Need to get validity [0] and offset [1] buffer for the listvector. 
+ //((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); + + List outBufsNew2 = new ArrayList(); + + + + ArrowBuf mabb22 = new ArrowBuf(ReferenceManager.NO_OP, null, selectionVectorRecordCount, outAddrs[0]); + for (int i = 0; i < selectionVectorRecordCount; i++) { + BitVectorHelper.setBit(mabb22, i); + } + + ArrowBuf mabb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[1], outAddrs[1]); + //for (int i = 0; i < selectionVectorRecordCount; i++) { + // mabb2.setInt(i * 4, 5 * i); + //} + s = "offset? buffer mabb2, outAddrs[0]="; + for (int i = 0; i < 20; i++) { + s += mabb2.getInt(i) + ","; + } + System.out.println(s); + + outBufsNew2.add(mabb22); + outBufsNew2.add(mabb2); + ArrowFieldNode afn2 = new ArrowFieldNode(selectionVectorRecordCount, 0); + ((ListVector) valueVector).loadFieldBuffers(afn2, outBufsNew2); + + + */ + + //((ListVector) valueVector).setValueCount(selectionVectorRecordCount); + //((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount); + + int simple = 0; + try { + for (int i = 0; i < selectionVectorRecordCount * 5; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); + simple++; + } + } catch (IndexOutOfBoundsException e) { + simple = 0; + } + try { + for (int i = 0; i < selectionVectorRecordCount; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); + simple++; + } + } catch (IndexOutOfBoundsException e) { + simple = 0; + } + + + + + + + /* + + + + try { + for (int i = 0; i < selectionVectorRecordCount; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); + simple++; + } + } catch (IndexOutOfBoundsException e) { + simple = 0; + } + + + for (int i = 0; i < selectionVectorRecordCount; i++) { + ((ListVector) valueVector).getOffsetBuffer().setInt(i * 4, 5 * i); + } + */ + + + + + //LR HACK 9-13 10:34 All the multiline comment From e29a751f79e3f241d1fffcc48832b70f90db5a67 Mon Sep 17 
00:00:00 2001 From: Logan Riggs Date: Thu, 28 Sep 2023 13:13:50 -0700 Subject: [PATCH 19/46] Remove some logging. --- cpp/src/gandiva/gdv_function_stubs.cc | 4 ---- .../arrow/gandiva/evaluator/Projector.java | 18 +++++++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 3e506f83a33c3..e307220aad0e2 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -174,10 +174,6 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, return -1; \ } \ memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ - std::cout << "LR gdv_fn_populate buffer=" << buffer->data() << std::endl; \ - std::cout << " and offset=" << offsets << " * =" << *offsets << std::endl; \ - std::cout << "Setting offset slot=" << slot << "=" << offset / SCALE << std::endl; \ - std::cout << "Setting offset slot+1=" << slot + 1 << "=" << offset / SCALE + entry_len << std::endl; \ offsets = reinterpret_cast(buffer->offsetBuffer); \ offsets[slot] = offset / SCALE; \ offsets[slot + 1] = offset / SCALE + entry_len; \ diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 89321f5911ad6..b9801f6aacf2c 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -462,9 +462,9 @@ private void evaluate(int numRows, List buffers, List buf outColumnIdx++; } - logger.error("LR Projector.java evaluate calling evaluateProjector with buffers=" + idx); - logger.error("LR Projector.java before evaluateProjector buffer[3]=" + outAddrs[3]); - logger.error("LR Projector.java before evaluateProjector buffer[1]=" + outAddrs[1]); + //logger.error("LR Projector.java evaluate calling evaluateProjector with 
buffers=" + idx); + //logger.error("LR Projector.java before evaluateProjector buffer[3]=" + outAddrs[3]); + //logger.error("LR Projector.java before evaluateProjector buffer[1]=" + outAddrs[1]); wrapper.evaluateProjector( hasVariableWidthColumns ? new VectorExpander(resizableVectors) : null, hasVariableWidthColumns ? new ListVectorExpander(resizableListVectors) : null, @@ -479,8 +479,8 @@ private void evaluate(int numRows, List buffers, List buf //System.out.println(intVector.getDataVector()); - logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3]); - logger.error("LR Projector.java after evaluateProjector buffer[1]=" + outAddrs[1]); + //logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3]); + //logger.error("LR Projector.java after evaluateProjector buffer[1]=" + outAddrs[1]); for (ValueVector valueVector : outColumns) { if (valueVector instanceof ListVector) { //LR HACK @@ -508,8 +508,8 @@ public void endList() { //ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); - logger.error("LR Projector.java using numRecords=" + - selectionVectorRecordCount + " outSizes[3]=" + outSizes[3]); + // logger.error("LR Projector.java using numRecords=" + + // selectionVectorRecordCount + " outSizes[3]=" + outSizes[3]); //import org.apache.arrow.vector.complex.impl.UnionListWriter; /*UnionListWriter writer = ((ListVector) valueVector).getWriter(); @@ -546,7 +546,7 @@ public void endList() { - + /* String s = ""; List fv = ((ListVector) valueVector).getDataVector().getFieldBuffers(); for (ArrowBuf ab : fv) { @@ -573,7 +573,7 @@ public void endList() { } logger.error("LR Projector.java before updating listvector. 
getOffsetBuffer=" + fvvv.capacity() + " buffer=" + s); - + */ ((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount * 5); From eb0c4979fe25ba4b59d262e4fe8028db023a43bb Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Tue, 10 Oct 2023 14:18:26 -0700 Subject: [PATCH 20/46] experiement 1 --- cpp/src/arrow/buffer.h | 3 + cpp/src/gandiva/annotator.cc | 19 +- cpp/src/gandiva/array_ops.cc | 72 ++++- cpp/src/gandiva/array_ops.h | 8 +- cpp/src/gandiva/bitmap_accumulator.h | 3 + cpp/src/gandiva/compiled_expr.h | 3 +- cpp/src/gandiva/dex.h | 4 + cpp/src/gandiva/engine.cc | 3 +- cpp/src/gandiva/expr_decomposer.cc | 4 +- cpp/src/gandiva/field_descriptor.h | 9 +- cpp/src/gandiva/function_registry_array.cc | 2 +- cpp/src/gandiva/gdv_function_stubs.cc | 24 +- cpp/src/gandiva/llvm_generator.cc | 132 ++++++--- cpp/src/gandiva/llvm_generator_test.cc | 67 +++++ cpp/src/gandiva/llvm_types.h | 2 + cpp/src/gandiva/local_bitmaps_holder.h | 2 + cpp/src/gandiva/lvalue.h | 57 +++- cpp/src/gandiva/tests/list_test.cc | 269 +++++++++++++++++- java/gandiva/src/main/cpp/jni_common.cc | 10 + .../gandiva/evaluator/ListVectorExpander.java | 8 +- .../arrow/gandiva/evaluator/Projector.java | 25 +- 21 files changed, 653 insertions(+), 73 deletions(-) diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 66da004c2beb4..598b393e5a80c 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -447,17 +447,20 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { public: uint8_t* offsetBuffer; int64_t offsetCapacity; + uint8_t* validityBuffer; protected: ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) { offsetBuffer = nullptr; offsetCapacity = 0; + validityBuffer = nullptr; } ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr mm) : MutableBuffer(data, size, std::move(mm)) { offsetBuffer = nullptr; offsetCapacity = 0; + validityBuffer = nullptr; } }; diff --git a/cpp/src/gandiva/annotator.cc 
b/cpp/src/gandiva/annotator.cc index 4cc0e1dc29bb8..3e593c9f99fe3 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -63,8 +63,12 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { if (is_output) { data_buffer_ptr_idx = buffer_count_++; } + int child_valid_buffer_ptr_idx = FieldDescriptor::kInvalidIdx; + if (is_output) { + child_valid_buffer_ptr_idx = buffer_count_++; + } return std::make_shared(field, data_idx, validity_idx, offsets_idx, - data_buffer_ptr_idx, child_offsets_idx); + data_buffer_ptr_idx, child_offsets_idx, child_valid_buffer_ptr_idx); } int Annotator::AddHolderPointer(void* holder) { @@ -104,6 +108,13 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3 " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); + + uint8_t* child_valid_buf = reinterpret_cast( + array_data.child_data.at(0)->buffers[0].get()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3 " << &child_valid_buf << std::endl; + eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, + array_data.child_data.at(0)->offset); + } else { //std::cout << "LR Annotator::PrepareBuffersForField 2" << std::endl; // if list field is input field, just put buffer data into eval batch @@ -112,6 +123,12 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2 " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); + + uint8_t* child_valid_buf = const_cast( + array_data.child_data.at(0)->buffers[0]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2 " << &child_valid_buf << std::endl; + 
eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_valid_buf, + array_data.child_data.at(0)->offset); } } if (array_data.type->id() != arrow::Type::LIST || diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 0cbac7942bb06..15cac9dcf7d96 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -17,6 +17,7 @@ #include "gandiva/array_ops.h" +#include #include #include @@ -126,31 +127,84 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, return reinterpret_cast(ret); } */ + + + + int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, - int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { + int32_t entry_offsets_len, const int32_t* notSureWhatThisIs, bool entry_valid, + int32_t remove_data, bool entry_validWhat, bool* valid_buf, int32_t* out_len, int32_t* valid_ptr) { //std::cout << "LR array_int32_remove data=" << remove_data // << " entry_offsets_len " << entry_offsets_len << std::endl; + std::cout << "LR array_int32_remove" << std::endl; std::vector newInts; - + + + /*std::bitset<8> validBits(*entry_valid); //LR TODO handle size. + std::bitset<8> outputValidBits; + std::cout << "LR Entry bitset is " << validBits << std::endl; for (int i = 0; i < entry_offsets_len; i++) { //std::cout << "LR going to check " << entry_buf + i << std::endl; int32_t entry_item = *(entry_buf + (i * 1)); //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; if (entry_item == remove_data) { continue; + } else if (!validBits[i]) { + outputValidBits[i] = 0; + newInts.push_back(0); //This will be marked invalid, so data doesn't matter. + } else { + outputValidBits[i] = 1; + //Note the vector can have n elements, while validbits might have n+1. 
+ newInts.push_back(entry_item); + } + }*/ + + entry_validWhat = true; + std::bitset<8> outputValidBits; + std::vector outValid; + for (int i = 0; i < entry_offsets_len; i++) { + //std::cout << "LR going to check " << entry_buf + i << std::endl; + int32_t entry_item = *(entry_buf + (i * 1)); + //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + if (entry_item == remove_data) { + outValid.push_back(false); + newInts.push_back(42); + outputValidBits[i] = 0; + entry_validWhat = false; } else { + outValid.push_back(true); + //Note the vector can have n elements, while validbits might have n+1. newInts.push_back(entry_item); + outputValidBits[i] = 1; } } - *out_len = newInts.size(); - int32_t outBufferLength = *out_len * sizeof(int); + *out_len = (int)newInts.size(); + int32_t outBufferLength = (int)*out_len * sizeof(int); //length is number of items, but buffers must account for byte size. uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); memcpy(ret, newInts.data(), outBufferLength); //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + bool validBools[*out_len]; + for (unsigned int i = 0; i < outValid.size(); i++) { + validBools[i] = outValid[i]; + } + + uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); + //memcpy(validRet, validBools, *out_len); + unsigned long ll = outputValidBits.to_ulong(); + memcpy(validRet, &ll, 1); + //*valid_len = 1; + std::cout << "LR valid_buf is " << valid_buf << std::endl; + std::cout << "LR outputValidBits is " << outputValidBits << std::endl; + valid_buf = reinterpret_cast(validRet); + + valid_ptr = reinterpret_cast(validRet); + std::cout << "LR setting valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " valid_ptr bitset data is " << std::bitset<8>(*valid_ptr) + << " return value is " << reinterpret_cast(ret) << std::endl; + //return reinterpret_cast(ret); return 
reinterpret_cast(ret); @@ -217,12 +271,18 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { args = {types->i64_type(), // int64_t execution_context types->i32_ptr_type(), // int8_t* data ptr types->i32_type(), // int32_t child offsets length + types->i32_ptr_type(), // Not Sure??? + types->i1_type(), // bool validity types->i32_type(), //value to remove from input - types->i32_ptr_type()}; // out array length + types->i1_type(), // bool validity + types->i1_ptr_type(), //valid buffer + types->i32_ptr_type(), // out array length + types->i32_ptr_type() //valid_ptr + + }; engine->AddGlobalMappingForFunc("array_int32_remove", types->i32_ptr_type(), args, reinterpret_cast(array_int32_remove)); - } } // namespace gandiva diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index 76c158f0e27f3..5886dadb38306 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -51,6 +51,12 @@ int32_t* array_int32_make_array(int64_t context_ptr, GANDIVA_EXPORT int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_offsets_len, + const int32_t* notSureWhatThisIs, + bool entry_valid, int32_t remove_data, - int32_t* out_len); + bool entry_validWhat, + bool* valid_buf, + int32_t* out_len, + int32_t* valid_ptr); + } diff --git a/cpp/src/gandiva/bitmap_accumulator.h b/cpp/src/gandiva/bitmap_accumulator.h index 9eaec81763786..2a115f3830e23 100644 --- a/cpp/src/gandiva/bitmap_accumulator.h +++ b/cpp/src/gandiva/bitmap_accumulator.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "arrow/util/macros.h" @@ -36,9 +37,11 @@ class GANDIVA_EXPORT BitMapAccumulator : public DexDefaultVisitor { void Visit(const VectorReadValidityDex& dex) { int idx = dex.ValidityIdx(); + std::cout << "LR BitMapAccumulator visiting " << idx << std::endl; auto bitmap = eval_batch_.GetBuffer(idx); // The bitmap could be null. Ignore it in this case. 
if (bitmap != NULLPTR) { + std::cout << "LR BitMapAccumulator is not null " << bitmap << std::endl; src_maps_.push_back(bitmap); src_map_offsets_.push_back(eval_batch_.GetBufferOffset(idx)); } diff --git a/cpp/src/gandiva/compiled_expr.h b/cpp/src/gandiva/compiled_expr.h index 4933e7f4922f6..b4244aae63380 100644 --- a/cpp/src/gandiva/compiled_expr.h +++ b/cpp/src/gandiva/compiled_expr.h @@ -36,7 +36,8 @@ class CompiledExpr { ValueValidityPairPtr value_validity() const { return value_validity_; } - FieldDescriptorPtr output() const { return output_; } + FieldDescriptorPtr output() const { + return output_; } void SetFunctionName(SelectionVector::Mode mode, std::string& name) { ir_functions_[static_cast(mode)] = name; diff --git a/cpp/src/gandiva/dex.h b/cpp/src/gandiva/dex.h index c35ee93dc03a2..95053ddabfb75 100644 --- a/cpp/src/gandiva/dex.h +++ b/cpp/src/gandiva/dex.h @@ -90,6 +90,10 @@ class GANDIVA_EXPORT VectorReadFixedLenValueListDex : public VectorReadBaseDex { int OffsetsIdx() const { return field_desc_->offsets_idx(); } + int ValidityIdx() const { return field_desc_->validity_idx(); } + + int ChildValidityIdx() const { return field_desc_->child_data_validity_idx(); } + void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } }; diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 80e60ab7ba721..2033919cde2b3 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -301,7 +301,8 @@ Status Engine::FinalizeModule() { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); //LR Turning this off seems to provide better error messages with compilation/generation failures. - if (optimize_) { + //if (optimize_) { + if (false) { // misc passes to allow for inlining, vectorization, .. 
std::unique_ptr pass_manager( new llvm::legacy::PassManager()); diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 72c992df11c7e..ec1a8f9e16039 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -38,7 +38,7 @@ namespace gandiva { Status ExprDecomposer::Visit(const FieldNode& node) { auto desc = annotator_.CheckAndAddInputFieldDescriptor(node.field()); - //std::cout << "LR ExprDecomposer" << std::endl; + std::cout << "LR ExprDecomposer" << std::endl; DexPtr validity_dex = std::make_shared(desc); DexPtr value_dex; if (desc->HasChildOffsetsIdx()) { @@ -126,7 +126,9 @@ Status ExprDecomposer::Visit(const FunctionNode& in_node) { } else { DCHECK(native_function->result_nullable_type() == kResultNullInternal); + //LR TODO Need validity? // Add a local bitmap to track the output validity. + std::cout << "LR Making a nullable function holder with validity." << std::endl; int local_bitmap_idx = annotator_.AddLocalBitMap(); auto validity_dex = std::make_shared(local_bitmap_idx); diff --git a/cpp/src/gandiva/field_descriptor.h b/cpp/src/gandiva/field_descriptor.h index 7b2d0c3b4fa92..2b72c45837fdb 100644 --- a/cpp/src/gandiva/field_descriptor.h +++ b/cpp/src/gandiva/field_descriptor.h @@ -31,13 +31,14 @@ class FieldDescriptor { FieldDescriptor(FieldPtr field, int data_idx, int validity_idx = kInvalidIdx, int offsets_idx = kInvalidIdx, int data_buffer_ptr_idx = kInvalidIdx, - int child_offsets_idx = kInvalidIdx) + int child_offsets_idx = kInvalidIdx, int child_validity_idx = kInvalidIdx) : field_(field), data_idx_(data_idx), validity_idx_(validity_idx), offsets_idx_(offsets_idx), data_buffer_ptr_idx_(data_buffer_ptr_idx), - child_offsets_idx_(child_offsets_idx) {} + child_offsets_idx_(child_offsets_idx), + child_validity_idx_(child_validity_idx) {} /// Index of validity array in the array-of-buffers int validity_idx() const { return validity_idx_; } @@ -53,6 +54,7 @@ class FieldDescriptor { /// 
Index of list type child data offsets int child_data_offsets_idx() const { return child_offsets_idx_; } + int child_data_validity_idx() const { return child_validity_idx_; } FieldPtr field() const { return field_; } @@ -65,6 +67,8 @@ class FieldDescriptor { bool HasChildOffsetsIdx() const { return child_offsets_idx_ != kInvalidIdx; } + bool HasChildValidityIdx() const { return child_validity_idx_ != kInvalidIdx; } + private: FieldPtr field_; int data_idx_; @@ -72,6 +76,7 @@ class FieldDescriptor { int offsets_idx_; int data_buffer_ptr_idx_; int child_offsets_idx_; + int child_validity_idx_; }; } // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index dc81b6b4601c3..826deb24bbbf0 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -38,7 +38,7 @@ std::vector GetArrayFunctionRegistry() { list(int32()), kResultNullIfNull, "array_int32_make_array", NativeFunction::kNeedsContext), NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int32()), int32()}, - list(int32()), kResultNullIfNull, "array_int32_remove", + list(int32()), kResultNullInternal, "array_int32_remove", NativeFunction::kNeedsContext), /*NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int64()), int64()}, boolean(), kResultNullIfNull, "array_int64_contains_int64", diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index e307220aad0e2..cf85ffcc055e0 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -163,9 +163,10 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, #define POPULATE_NUMERIC_LIST_TYPE_VECTOR(TYPE, SCALE) \ int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ int32_t* offsets, int64_t slot, \ - TYPE* entry_buf, int32_t entry_len) { \ + TYPE* entry_buf, int32_t entry_len, int32_t* valid_ptr) { \ auto 
buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ + std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; \ auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ if (!status.ok()) { \ gandiva::ExecutionContext* context = \ @@ -173,13 +174,28 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, context->set_error_msg(status.message().c_str()); \ return -1; \ } \ + std::cout << "LR gdv_fn_populate_list_ 2 valid_ptr" << valid_ptr << std::endl; \ + std::cout << "LR gdv_fn_populate_list_ " << buffer << " " << offset; \ + std::cout << " " << entry_len << " " << SCALE << "]]" << std::endl; \ memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ + std::cout << "LR gdv_fn_populate_list_ 3 entry_buf=" << entry_buf << "]" << std::endl; \ + std::cout << "LR gdv_fn_populate_list_ 3a entry_len=" << entry_len << " &entry_len=" << &entry_len << "]" << std::endl; \ + std::cout << "LR gdv_fn_populate_list_ 4 buffer->validityBuffer=" << reinterpret_cast(buffer->validityBuffer) << "]" << std::endl; \ + int v[6] = {255, 255, 255, 255, 255, 255}; \ + memcpy(buffer->validityBuffer + slot, v, 6); \ + std::cout << "LR gdv_fn_populate_list_ 5 valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << std::endl; \ + std::bitset<8> bs(*valid_ptr); \ + std::cout << "LR bitset of valid ptr is " << bs << std::endl; \ offsets = reinterpret_cast(buffer->offsetBuffer); \ offsets[slot] = offset / SCALE; \ offsets[slot + 1] = offset / SCALE + entry_len; \ + std::cout << "LR gdv_fn_populate_list_ Done" << std::endl; \ return 0; \ } +// int32_t vv = 5; + //memcpy(buffer->validityBuffer + slot, *vv, 1); + //buffer->offsetBuffer[slot] = offset / SCALE; //buffer->offsetBuffer[slot + 1] = offset / SCALE + entry_len; @@ -1269,7 +1285,7 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { #define ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(LLVM_TYPE, DATA_TYPE) \ 
args = {types->i64_type(), types->i8_ptr_type(), types->i32_ptr_type(), \ - types->i64_type(), types->LLVM_TYPE##_ptr_type(), types->i32_type()}; \ + types->i64_type(), types->LLVM_TYPE##_ptr_type(), types->i32_type(), types->i32_ptr_type()}; \ engine->AddGlobalMappingForFunc( \ "gdv_fn_populate_list_" #DATA_TYPE "_vector", types->i32_type() /*return_type*/, \ args, reinterpret_cast(gdv_fn_populate_list_##DATA_TYPE##_vector)); @@ -1315,7 +1331,9 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i64_type(), // int64_t slot types->i8_ptr_type(), // const char* entry_buf types->i32_ptr_type(), // int32_t* entry child offsets ptr - types->i32_type()}; // int32_t entry child offsets length + types->i32_type(), // int32_t entry child offsets length + types->i32_ptr_type() // int32_t* entry child valid ptr + }; engine->AddGlobalMappingForFunc( "gdv_fn_populate_list_varlen_vector", types->i32_type() /*return_type*/, args, diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index a97c8b02b07ac..f17d3a44ee367 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -36,7 +36,7 @@ namespace gandiva { AddTrace(__VA_ARGS__); \ } -/*namespace { +namespace { std::string printType(llvm::Type* t) { if (t == nullptr) { return std::string("null"); @@ -55,7 +55,7 @@ namespace gandiva { t->print(output); return str; } -}*/ +} LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(false) {} @@ -118,7 +118,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode //std::cout << "LR LLVMGenerator::Build 2" << std::endl; //Too much logging. needle in haystack? - //std::cout << "LR LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; + std::cout << "LR LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; // Compile and inject into the process' memory the generated function. 
ARROW_RETURN_NOT_OK(engine_->FinalizeModule()); //std::cout << "LR LLVMGenerator::Build FinalizeModule" << std::endl; @@ -152,12 +152,12 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, const SelectionVector* selection_vector, const ArrayDataVector& output_vector) const { DCHECK_GT(record_batch.num_rows(), 0); - //std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; + std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector); DCHECK_GT(eval_batch->GetNumBuffers(), 0); - //std::cout << "LR LLVMGenerator::Execute 2" << std::endl; + std::cout << "LR LLVMGenerator::Execute 2" << std::endl; auto mode = SelectionVector::MODE_NONE; if (selection_vector != nullptr) { mode = selection_vector->GetMode(); @@ -167,7 +167,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, selection_vector_mode_, " received vector with mode ", mode); } - //std::cout << "LR LLVMGenerator::Execute 3" << std::endl; + std::cout << "LR LLVMGenerator::Execute 3" << std::endl; for (auto& compiled_expr : compiled_exprs_) { // generate data/offset vectors. 
const uint8_t* selection_buffer = nullptr; @@ -177,7 +177,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, num_output_rows = selection_vector->GetNumSlots(); } - //std::cout << "LR LLVMGenerator::Execute A1" << std::endl; + std::cout << "LR LLVMGenerator::Execute A1" << std::endl; EvalFunc jit_function = compiled_expr->GetJITFunction(mode); jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(), eval_batch->GetLocalBitMapArray(), annotator_.GetHolderPointersArray(), @@ -189,7 +189,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, eval_batch->GetExecutionContext()->has_error(), Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); - //std::cout << "LR LLVMGenerator::Execute A2" << std::endl; + std::cout << "LR LLVMGenerator::Execute A2" << std::endl; // generate validity vectors. ComputeBitMapsForExpr(*compiled_expr, selection_vector, eval_batch.get()); } @@ -412,12 +412,15 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, } // The visitor can add code to both the entry/loop blocks. - //std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; + std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; Visitor visitor(this, fn, loop_entry, arg_addrs, arg_local_bitmaps, arg_holder_ptrs, slice_offsets, arg_context_ptr, position_var); value_expr->Accept(visitor); LValuePtr output_value = visitor.result(); + //std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; + std::cout << "LR output_value from visitor is " << output_value->to_string() << std::endl; + // The "current" block may have changed due to code generation in the visitor. 
llvm::BasicBlock* loop_body_tail = builder->GetInsertBlock(); @@ -473,23 +476,25 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," - << output_offset_ref << "," << loop_var << std::endl; + << output_offset_ref << "," << loop_var << + " output_value->data() " << output_value->data() << " output_value->validity() " << output_value->validity() << + " output_value->length() " << output_value->length() << std::endl; // << output_offset_ref << "," << loop_var << "[[" << str1 << "]] [[" << str2 << "]]" << std::endl; AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, - loop_var, output_value->data(), output_value->length()}); + loop_var, output_value->data(), output_value->length(), output_value->validity()}); } else if (output_list_internal_type == arrow::Type::INT64) { AddFunctionCall("gdv_fn_populate_list_int64_t_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, - loop_var, output_value->data(), output_value->length()}); + loop_var, output_value->data(), output_value->length(), output_value->validity()}); } else if (output_list_internal_type == arrow::Type::FLOAT) { AddFunctionCall("gdv_fn_populate_list_float_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, - loop_var, output_value->data(), output_value->length()}); + loop_var, output_value->data(), output_value->length(), output_value->validity()}); } else if (output_list_internal_type == arrow::Type::DOUBLE) { AddFunctionCall("gdv_fn_populate_list_double_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, - loop_var, output_value->data(), output_value->length()}); + loop_var, output_value->data(), output_value->length(), output_value->validity()}); } else { return 
Status::NotImplemented("list internal type ", output->Type()->field(0)->type()->ToString(), @@ -599,6 +604,12 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, uint8_t* dst_bitmap = eval_batch->GetBuffer(out_idx); // Compute the destination bitmap. if (selection_vector == nullptr) { + std::cout << "LR blarg" << std::endl; + std::cout << "LR bitmap array buffer index is " << out_idx << " bitset is " << std::bitset<8>(*dst_bitmap) << std::endl; + std::cout << "LR bitmap array buffer index is " << 0 << " bitset is " << std::bitset<8>(* eval_batch->GetBuffer(0)) << std::endl; + std::cout << "LR bitmap thing getting the validity buffer " << compiled_expr.output()->child_data_validity_idx() << std::endl; + std::cout << "LR Eval buffer has " << eval_batch->GetNumBuffers() << std::endl; + // << " bitset is " << std::bitset<8>(* eval_batch->GetBuffer(compiled_expr.output()->child_data_validity_idx() )) << std::endl; accumulator.ComputeResult(dst_bitmap); } else { /// The output bitmap is an intersection of some input/local bitmaps. However, with a @@ -607,10 +618,23 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, /// /// 1. Do the intersection of input/local bitmaps to generate a temporary bitmap. /// 2. copy just the relevant bits from the temporary bitmap to the output bitmap. + LocalBitMapsHolder bit_map_holder(eval_batch->num_records(), 1); uint8_t* temp_bitmap = bit_map_holder.GetLocalBitMap(0); accumulator.ComputeResult(temp_bitmap); + + std::cout << "LR computing bitmap. 
Size is " << bit_map_holder.GetLocalBitMapSize() << std::endl; + for (int i = 0; i < bit_map_holder.GetLocalBitMapSize(); i++) { + uint8_t* arr = bit_map_holder.GetLocalBitMap(i); + std::cout << "LR bitmap array [" << i << "] size is " << bit_map_holder.GetNumRecords() << " bitset is " << std::bitset<8>(*arr) << std::endl; + + } + + + + + auto num_out_records = selection_vector->GetNumSlots(); // the memset isn't required, doing it just for valgrind. memset(dst_bitmap, 0, arrow::bit_util::BytesForBits(num_out_records)); @@ -733,21 +757,21 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - //std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; - //std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; - //std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; + std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; + std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; arrow::Type::type at = arrow::Type::INT32; type = types->IRType(at); //type = types->DataVecType(dex.FieldType()); - //std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; + std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; // compute list len from the offsets array. 
llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); - //std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << - // printType(offsets_slot_index) << std::endl; + std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << + printType(offsets_slot_index) << std::endl; // => offset_start = offsets[loop_var] slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); @@ -772,6 +796,24 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { llvm::Value* data_list = builder->CreateGEP(type, slot_ref, slot_index); // TODO: handle bool type bitmap + //Validity bitmap. + //llvm::Value* b_slot_ref = GetBufferReference(dex.ValidityIdx(), kBufferTypeValidity, dex.Field()); + //llvm::Value* b_slot_index = + // builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx())); + //llvm::Value* validity = generator_->GetPackedValidityBitValue(b_slot_ref, b_slot_index); + + llvm::Value* b_slot_index = + builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx())); + llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); + llvm::Value* validity = builder->CreateGEP(type, b_slot_ref, b_slot_index); + + std::string str3 = "validity:"; + if (validity) { + llvm::raw_string_ostream output3(str3); + validity->print(output3); + } + std::cout << "LR VectorReadFixedLenValueListDex using validity " << str3 << std::endl; + // TODO: handle decimal precision and scale //std::cout << "LR VectorReadFixedLenValueListDex slot_ref " << printType(slot_ref) << std::endl; @@ -779,7 +821,7 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { // " length " << printType(list_len) << " data_list " << printType(data_list) << std::endl; 
ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " length %T", list_len); - result_.reset(new LValue(data_list, list_len)); + result_.reset(new LValue(data_list, list_len, validity)); } void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { @@ -1113,6 +1155,13 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { auto result_valid_i8 = builder->CreateLoad(types->i8_type(), result_valid_ptr); llvm::Value* result_valid = builder->CreateTrunc(result_valid_i8, types->i1_type()); + std::bitset<8> bs(dex.local_bitmap_idx()); + std::cout <<"LR NullableInternal validity from dex.local_bitmap_idx()=" << bs << std::endl; + + + auto result_valid_i8ptr = builder->CreateLoad(types->i8_ptr_type(), result_valid_ptr); + + std::cout << "LR NullableInternal function param validity=" << result_valid_i8ptr << std::endl; // set validity bit in the local bitmap. ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), result_valid); } @@ -1482,6 +1531,7 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& pair) { // generate code for value + std::cout << "LR LLVMGenerator::Visitor::BuildValueAndValidity" << std::endl; auto value_expr = pair.value_expr(); value_expr->Accept(*this); auto value = result()->data(); @@ -1501,7 +1551,10 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, auto llvm_return_type = types->DataVecType(arrow_return_type); DecimalIR decimalIR(generator_->engine_.get()); - //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; + for (unsigned int i = 0; i < params->size(); i++) { + std::cout << "LR param " << i << 
printType(params->at(i)) << std::endl; + } if (arrow_return_type_id == arrow::Type::DECIMAL) { // For decimal fns, the output precision/scale are passed along as parameters. // @@ -1528,6 +1581,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, } // add extra arg for return length for variable len return types (allocated on stack). llvm::AllocaInst* result_len_ptr = nullptr; + llvm::AllocaInst* valid_ptr = nullptr; if (arrow::is_binary_like(arrow_return_type_id)) { //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is binary like" << std::endl; result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, @@ -1543,7 +1597,10 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, params->push_back(result_len_ptr); has_arena_allocs_ = true; - + valid_ptr = new llvm::AllocaInst(generator_->types()->i32_ptr_type(), 0, + "valid_ptr", entry_block_); + std::cout << "LR allocinst for valid_ptr=" << printType(valid_ptr) << std::endl; + params->push_back(valid_ptr); } //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall params are: " << std::endl; @@ -1564,9 +1621,13 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, (result_len_ptr == nullptr) ? nullptr : builder->CreateLoad(result_len_ptr->getAllocatedType(), result_len_ptr); - //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE" << std::endl; - - return std::make_shared(value, value_len); + auto validity = + (valid_ptr == nullptr) + ? nullptr + : builder->CreateLoad(valid_ptr->getAllocatedType(), valid_ptr); + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using validity=" << validity << " ptr=" << valid_ptr << std::endl; + std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. 
using value_len=" << value_len << " ptr=" << result_len_ptr << std::endl; + return std::make_shared(value, value_len, validity); } } @@ -1581,7 +1642,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(arg_context_ptr_); } - //std::cout << "LR BuildParams1" << std::endl; + std::cout << "LR BuildParams1" << std::endl; // if the function has holder, add the holder pointer. if (holder_idx != -1) { auto builder = ir_builder(); @@ -1590,7 +1651,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( llvm::BasicBlock* saved_block = builder->GetInsertBlock(); builder->SetInsertPoint(entry_block_); - //std::cout << "LR BuildParams1a" << std::endl; + std::cout << "LR BuildParams1a" << std::endl; auto holder = generator_->LoadVectorAtIndex( arg_holder_ptrs_, generator_->types()->i64_type(), holder_idx, "holder"); @@ -1598,24 +1659,25 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(holder); } - //std::cout << "LR BuildParams2" << std::endl; + std::cout << "LR BuildParams2" << std::endl; // build the function params, along with the validities. for (auto& pair : args) { // build value. DexPtr value_expr = pair->value_expr(); - //std::cout << "LR BuildParams2a" << std::endl; + std::cout << "LR BuildParams2a" << std::endl; value_expr->Accept(*this); - //std::cout << "LR BuildParams2b" << std::endl; + std::cout << "LR BuildParams2b" << std::endl; LValue& result_ref = *result(); // append all the parameters corresponding to this LValue. result_ref.AppendFunctionParams(¶ms); - //std::cout << "LR BuildParams2c" << std::endl; + std::cout << "LR BuildParams2c" << std::endl; // build validity. 
if (with_validity) { llvm::Value* validity_expr = BuildCombinedValidity(pair->validity_exprs()); params.push_back(validity_expr); + std::cout << "LR BuildParams2d adding combined validity" << std::endl; } } @@ -1735,9 +1797,9 @@ std::string LLVMGenerator::ReplaceFormatInTrace(const std::string& in_msg, } void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { - if (!enable_ir_traces_) { - return; - } + //if (!enable_ir_traces_) { + // return; + //} std::string dmsg = "IR_TRACE:: " + msg + "\n"; std::string print_fn_name = "printf"; diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 028893b0b4594..2c0c742eb79c2 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -114,5 +114,72 @@ TEST_F(TestLLVMGenerator, TestAdd) { EXPECT_THAT(out, testing::ElementsAre(6, 8, 10, 12)); EXPECT_EQ(out_bitmap, 0ULL); } +/* +TEST_F(TestLLVMGenerator, TestArrayRemove) { + // Setup LLVM generator to do an array remove. 
+ std::unique_ptr generator; + ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), false, &generator)); + Annotator annotator; + + std::shared_ptr listDt = std::make_shared(); + std::shared_ptr dt = std::make_shared(listDt); + auto field0 = std::make_shared("f0", dt); + auto desc0 = annotator.CheckAndAddInputFieldDescriptor(field0); + auto validity_dex0 = std::make_shared(desc0); + auto value_dex0 = std::make_shared(desc0); + auto pair0 = std::make_shared(validity_dex0, value_dex0); + + auto field1 = std::make_shared("f1", arrow::int32()); + auto desc1 = annotator.CheckAndAddInputFieldDescriptor(field1); + auto validity_dex1 = std::make_shared(desc1); + auto value_dex1 = std::make_shared(desc1); + auto pair1 = std::make_shared(validity_dex1, value_dex1); + + DataTypeVector params{dt, arrow::int32()}; + auto func_desc = std::make_shared("array_removeGandiva", params, arrow::int32()); + FunctionSignature signature(func_desc->name(), func_desc->params(), + func_desc->return_type()); + const NativeFunction* native_func = + generator->function_registry_.LookupSignature(signature); + + std::vector pairs{pair0, pair1}; + auto func_dex = std::make_shared( + func_desc, native_func, FunctionHolderPtr(nullptr), -1, pairs); + + auto field_sum = std::make_shared("out", arrow::int32()); + auto desc_sum = annotator.CheckAndAddInputFieldDescriptor(field_sum); + + std::string fn_name = "codegen"; + + ASSERT_OK(generator->engine_->LoadFunctionIRs()); + ASSERT_OK(generator->CodeGenExprValue(func_dex, 4, desc_sum, 0, fn_name, + SelectionVector::MODE_NONE)); + + ASSERT_OK(generator->engine_->FinalizeModule()); + auto ir = generator->engine_->DumpIR(); + EXPECT_THAT(ir, testing::HasSubstr("vector.body")); + + EvalFunc eval_func = (EvalFunc)generator->engine_->CompiledFunction(fn_name); + + constexpr size_t kNumRecords = 4; + std::array a0{1, 2, 3, 4}; + std::array a1{5, 6, 7, 8}; + uint64_t in_bitmap = 0xffffffffffffffffull; + + std::array out{0, 0, 0, 0}; + uint64_t out_bitmap = 0; + + 
std::array addrs{ + reinterpret_cast(a0.data()), reinterpret_cast(&in_bitmap), + reinterpret_cast(a1.data()), reinterpret_cast(&in_bitmap), + reinterpret_cast(out.data()), reinterpret_cast(&out_bitmap), + }; + std::array addr_offsets{0, 0, 0, 0, 0, 0}; + eval_func(addrs.data(), addr_offsets.data(), nullptr, nullptr, nullptr, + 0 /* dummy context ptr */, kNumRecords); + + EXPECT_THAT(out, testing::ElementsAre(6, 8, 10, 12)); + EXPECT_EQ(out_bitmap, 0ULL); +}*/ } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index f235535423536..fc875c14d380a 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -64,6 +64,8 @@ class GANDIVA_EXPORT LLVMTypes { llvm::PointerType* ptr_type(llvm::Type* type) { return type->getPointerTo(); } + llvm::PointerType* i1_ptr_type() { return ptr_type(i1_type()); } + llvm::PointerType* i8_ptr_type() { return ptr_type(i8_type()); } llvm::PointerType* i32_ptr_type() { return ptr_type(i32_type()); } diff --git a/cpp/src/gandiva/local_bitmaps_holder.h b/cpp/src/gandiva/local_bitmaps_holder.h index a172fb973c4a5..4c3d55c47c585 100644 --- a/cpp/src/gandiva/local_bitmaps_holder.h +++ b/cpp/src/gandiva/local_bitmaps_holder.h @@ -44,6 +44,8 @@ class LocalBitMapsHolder { return local_bitmaps_array_.get()[idx]; } + int64_t GetNumRecords() { return num_records_; } + private: /// number of records in the current batch. 
int64_t num_records_; diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index 7e6a5c2fb96eb..dbfe6dc2e18cb 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -32,7 +32,9 @@ class GANDIVA_EXPORT LValue { public: explicit LValue(llvm::Value* data, llvm::Value* length = NULLPTR, llvm::Value* validity = NULLPTR) - : data_(data), length_(length), validity_(validity) {} + : data_(data), length_(length), validity_(validity) { + std::cout << "LR created LValue " << to_string() << std::endl; + } virtual ~LValue() = default; llvm::Value* data() { return data_; } @@ -43,13 +45,41 @@ class GANDIVA_EXPORT LValue { // Append the params required when passing this as a function parameter. virtual void AppendFunctionParams(std::vector* params) { + std::cout << "LR LValue::AppendFunctionParams" << std::endl; params->push_back(data_); if (length_ != NULLPTR) { params->push_back(length_); } + if (validity_ != NULLPTR) { + params->push_back(validity_); + } } - private: + virtual std::string to_string() { + std::string s = "Base LValue"; + + std::string str1 = "data:"; + if (data_) { + llvm::raw_string_ostream output1(str1); + data_->print(output1); + } + + std::string str2 = "length:"; + if (length_) { + llvm::raw_string_ostream output2(str2); + length_->print(output2); + } + + std::string str3 = "validity:"; + if (validity_) { + llvm::raw_string_ostream output3(str3); + validity_->print(output3); + } + + return s + "\n" + str1 + "\n" + str2 + "\n" + str3; + } + + protected: llvm::Value* data_; llvm::Value* length_; llvm::Value* validity_; @@ -90,9 +120,32 @@ class GANDIVA_EXPORT ListLValue : public LValue { llvm::Value* offsets_length() { return offsets_length_; } void AppendFunctionParams(std::vector* params) override { + std::cout << "LR ListLValue::AppendFunctionParams" << std::endl; LValue::AppendFunctionParams(params); params->push_back(child_offsets_); params->push_back(offsets_length_); + params->push_back(validity_); + } + + virtual 
std::string to_string() override { + std::string s = "List LValue"; + + s += " " + LValue::to_string(); + + + std::string str1 = "child_offsets_:"; + if (child_offsets_) { + llvm::raw_string_ostream output1(str1); + child_offsets_->print(output1); + } + + std::string str2 = "offsets_length_:"; + if (offsets_length_) { + llvm::raw_string_ostream output2(str2); + offsets_length_->print(output2); + } + + return s + "\n" + str1 + "\n" + str2; } private: diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index 7936873d073c0..249980abbab84 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -49,7 +49,7 @@ class TestList : public ::testing::Test { template void _build_list_array(const vector& values, const vector& length, const vector& validity, arrow::MemoryPool* pool, - ArrayPtr* array) { + ArrayPtr* array, const vector& innerValidity = {}) { size_t sum = 0; for (auto& len : length) { sum += len; @@ -64,7 +64,11 @@ void _build_list_array(const vector& values, const vector& l if (validity[l]) { auto status = builder->Append(); for (int j = 0; j < length[l]; j++) { - ASSERT_OK(value_builder->Append(values[i])); + if (innerValidity.size() > (size_t)j && innerValidity[j] == false) { + auto v = value_builder->AppendNull(); + } else { + ASSERT_OK(value_builder->Append(values[i])); + } i++; } } else { @@ -77,6 +81,13 @@ void _build_list_array(const vector& values, const vector& l ASSERT_OK(builder->Finish(array)); } +template +void _build_list_array2(const vector& values, const vector& length, + const vector& validity, const vector& innerValidity, arrow::MemoryPool* pool, + ArrayPtr* array) { + return _build_list_array(values, length, validity, pool, array); + } + /* * expression: * input: a @@ -84,12 +95,12 @@ void _build_list_array(const vector& values, const vector& l * typeof(a) can be list / list / list */ void _test_list_type_field_alias(DataTypePtr type, ArrayPtr array, - arrow::MemoryPool* pool) 
{ + arrow::MemoryPool* pool, int num_records = 5) { auto field_a = field("a", type); auto schema = arrow::schema({field_a}); auto result = field("res", type); - auto num_records = 5; + std::cout << array->ToString() << std::endl; assert(array->length() == num_records); auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array}); @@ -151,15 +162,250 @@ TEST_F(TestList, TestListInt64) { */ -TEST_F(TestList, TestListInt32) { +/*TEST_F(TestList, TestListInt32) { ArrayPtr array; - _build_list_array( - {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, - 50000}, - {5, 2, 3, 4, 1}, {true, false, true, true, true}, pool_, &array); - _test_list_type_field_alias(list(int32()), array, pool_); + _build_list_array2( + {10, 20, 30, 60, 70, 80}, + {3, 3}, {true, true}, {true, true, false, true, false, true}, pool_, &array); + _test_list_type_field_alias(list(int32()), array, pool_, 2); +}*/ + +TEST_F(TestList, TestConcatWS) { + // schema for input fields + + auto field_a = field("a", utf8()); + auto field_b = field("b", utf8()); + auto field_c = field("c", utf8()); + auto schema = arrow::schema({field_a, field_b, field_c}); + + // output fields + auto res = field("res", utf8()); + + // Create a row-batch with some sample data + int num_records = 2; + auto array_a = + MakeArrowArrayUtf8({"this", "this"}, {true, true}); + auto array_b = + MakeArrowArrayUtf8({"is", "is not"}, {true, true}); + auto array_c = + MakeArrowArrayUtf8({"a test", "a test"}, {true, true}); + + + // expected output + ArrayPtr exp1; + _build_list_array2( + {10, 30, 70, 80}, + {2, 2}, {true, true}, {true, true, true, true}, pool_, &exp1); + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c}); + + // build expressions. 
+ // array_contains(a, b) + + //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + //std::vector field_nodes; + //auto node2 = TreeExprBuilder::MakeLiteral(42); + //field_nodes.push_back(node2); + + //auto func_node = TreeExprBuilder::MakeFunction("array_makeGandiva", {field_b}, res->type()); + //auto expr = TreeExprBuilder::MakeExpression(func_node, res); + std::cout << "LR test is about to make expression " << std::endl; + auto expr = TreeExprBuilder::MakeExpression("concat_ws", {field_a, field_b, field_c}, res); + //////// + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + std::cout << "LR Test 2 " << std::endl; + //std::cout << "LR IR IS " << projector->DumpIR() << std::endl; + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); +} + +TEST_F(TestList, TestArrayRemove) { + // schema for input fields + auto field_b = field("b", int32()); + + auto field_a = field("a", list(int32())); + auto schema = arrow::schema({field_a, field_b}); + + // output fields + auto res = field("res", list(int32())); + + // Create a row-batch with some sample data + int num_records = 2; + auto array_b = + MakeArrowArrayInt32({42, 42}, {true, true}); + + ArrayPtr array_a; + _build_list_array2( + {10, 42, 30, 42, 70, 80}, + {3, 3}, {true, true}, {true, true, true, true, true, true}, pool_, &array_a); + + // expected output + ArrayPtr exp1; + _build_list_array2( + {10, 30, 70, 80}, + {2, 2}, {true, true}, {true, true, true, true}, pool_, &exp1); + + // auto exp = MakeArrowArrayArray({ 42, 42, 44, 45, 46}, + // {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = 
arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); + + // build expressions. + // array_contains(a, b) + + //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + //std::vector field_nodes; + //auto node2 = TreeExprBuilder::MakeLiteral(42); + //field_nodes.push_back(node2); + + //auto func_node = TreeExprBuilder::MakeFunction("array_makeGandiva", {field_b}, res->type()); + //auto expr = TreeExprBuilder::MakeExpression(func_node, res); + std::cout << "LR test is about to make expression " << std::endl; + auto expr = TreeExprBuilder::MakeExpression("array_removeGandiva", {field_a, field_b}, res); + //////// + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + std::cout << "LR Test 2 " << std::endl; + //std::cout << "LR IR IS " << projector->DumpIR() << std::endl; + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); + + std::cout << "LR ==============================SECOND=WAY==================================================== " << std::endl; + + + + //Try the second method. 
+ arrow::ArrayDataVector outputs2; + std::shared_ptr listDt = std::make_shared(); + std::shared_ptr dt = std::make_shared(listDt); + + + int num_records2 = 5; + std::vector> buffers; + + + + //int64_t size = arrow::bit_util::BytesForBits(num_records2); + int64_t size = 20; + auto bitmap_buffer = arrow::AllocateBuffer(size, pool_); + buffers.push_back(*std::move(bitmap_buffer)); + auto offsets_len = arrow::bit_util::BytesForBits((num_records2 + 1) * 32); + + auto offsets_buffer = arrow::AllocateBuffer(offsets_len*10, pool_); + buffers.push_back(*std::move(offsets_buffer)); + + std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; + //auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, 0, offsets_len); + //outputs2.push_back(array_data); + + + +std::vector> buffers2; +auto bitmap_buffer2 = arrow::AllocateBuffer(size, pool_); + buffers2.push_back(*std::move(bitmap_buffer2)); + + auto offsets_buffer2 = arrow::AllocateBuffer(offsets_len, pool_); + buffers2.push_back(*std::move(offsets_buffer2)); +std::shared_ptr dt2 = std::make_shared(); + + auto array_data_child = arrow::ArrayData::Make(dt2, num_records2, buffers2, 0, 0); + array_data_child->buffers = std::move(buffers2); + + std::vector> kids; + kids.push_back(array_data_child); + + +auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, kids, 0, 0); +array_data->buffers = std::move(buffers); +outputs2.push_back(array_data); + +std::cout << "LR Test " << array_data << " arra_data 0 is " << array_data->buffers[0] << std::endl; + //std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; + std::cout << "LR about to evaluate 2nd " << std::endl; + + status = projector->Evaluate(*(in_batch.get()), outputs2); + EXPECT_TRUE(status.ok()) << status.message(); + arrow::ArrayData ad = *outputs2.at(0); + arrow::ArraySpan sp(*ad.child_data.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); + + + + +for (auto& array_data : outputs2) { + auto child_data = 
array_data->child_data[0]; + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + /* when allocate array data, child data length is an initialized value, + * after calculating, child data offsets buffer has been resized for results, + * but array data length is unchanged. + * We should recalculate child data length and make ArrayData with new length + * + * Otherwise, child data offsets buffer length is data length + 1 + * and offset data is int32_t, need use buffer->size()/4 - 1 + */ + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } + auto new_child_data = arrow::ArrayData::Make( + child_data->type, child_data_size, child_data->buffers, child_data->offset); + array_data = arrow::ArrayData::Make(array_data->type, array_data->length, + array_data->buffers, {new_child_data}, + array_data->null_count, array_data->offset); + + + auto newArray = arrow::MakeArray(array_data); + //arrow::ArraySpan sp(newArray); + EXPECT_ARROW_ARRAY_EQUALS(exp1, newArray); +} + + + + std::cout << "LR ====================THIRD=WAY================================== " << std::endl; + { + std::shared_ptr listDt = std::make_shared(); + std::shared_ptr dt = std::make_shared(listDt); + +ArrayDataPtr output_data; + auto s = projector->AllocArrayData(dt, num_records2, pool_, &output_data); + ArrayDataVector output_data_vecs; + output_data_vecs.push_back(output_data); + + status = projector->Evaluate(*(in_batch.get()), output_data_vecs); + EXPECT_TRUE(status.ok()) << status.message(); + 
arrow::ArraySpan sp(*output_data_vecs.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); + } } + TEST_F(TestList, TestMakeArray) { // schema for input fields auto field_b = field("b", int32()); @@ -171,7 +417,7 @@ TEST_F(TestList, TestMakeArray) { // Create a row-batch with some sample data int num_records = 5; auto array_b = - MakeArrowArrayInt32({42, 43, 44, 45, 46}); + MakeArrowArrayInt32({42, 43, 44, 45, 46}, {true, true, true, true, true}); // expected output auto exp1 = MakeArrowArrayInt32({ 1, 2, 3, 42, 5}, @@ -327,6 +573,7 @@ ArrayDataPtr output_data; } } + /* TEST_F(TestList, TestListArrayInt32) { gandiva::ExecutionContext ctx; diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 1f647e0e2797b..b37615f209d45 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -90,6 +90,7 @@ static jmethodID listvector_expander_method_; static jfieldID vector_expander_ret_address_; static jfieldID vector_expander_ret_capacity_; static jfieldID list_expander_ret_address_; +static jfieldID list_expander_valid_address_; static jfieldID list_expander_ret_capacity_; static jfieldID list_expander_offset_ret_address_; static jfieldID list_expander_offset_ret_capacity_; @@ -164,6 +165,8 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { env->GetFieldID(list_expander_ret_class_, "offsetaddress", "J"); list_expander_offset_ret_capacity_ = env->GetFieldID(list_expander_ret_class_, "offsetcapacity", "J"); + list_expander_valid_address_ = + env->GetFieldID(list_expander_ret_class_, "validityddress", "J"); jclass local_cache_class = env->FindClass("org/apache/arrow/gandiva/evaluator/JavaSecondaryCacheInterface"); @@ -948,6 +951,7 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { jlong ret_capacity = env_->GetLongField(ret, list_expander_ret_capacity_); jlong offset_ret_address = env_->GetLongField(ret, list_expander_offset_ret_address_); jlong offset_ret_capacity = 
env_->GetLongField(ret, list_expander_offset_ret_capacity_); + jlong valid_address = env_->GetLongField(ret, list_expander_valid_address_); std::cout << "Buffer expand: New capacity is " << new_capacity << " vector id " << vector_idx_ << " expander method " << method_ << @@ -960,6 +964,8 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { offsetBuffer = reinterpret_cast(offset_ret_address); offsetCapacity = offset_ret_capacity; + std::cout << "LR Setting buffer validityBuffer to " << validityBuffer << std::endl; + validityBuffer = reinterpret_cast(valid_address); } else { jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); @@ -1165,6 +1171,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( env, jListExpander, listvector_expander_method_, output_vector_idx, child_data_buf, data_sz, true); outBufJava->offsetBuffer = reinterpret_cast(out_bufs[1]); outBufJava->offsetCapacity = out_sizes[1]; + outBufJava->validityBuffer = reinterpret_cast(out_bufs[2]); child_buffers.push_back(outBufJava); std::shared_ptr dt2 = std::make_shared(); @@ -1255,6 +1262,9 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( out_bufs[1] = (jlong) outBufJava->offsetBuffer; out_sizes[1] = (jlong) outBufJava->offsetCapacity; + out_bufs[2] = (jlong) outBufJava->validityBuffer; + out_sizes[2] = (jlong) outBufJava->offsetCapacity; + env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java index c14d2e810e83b..fc96846842a3e 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java +++ 
b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -39,6 +39,7 @@ public static class ExpandResult { public long capacity; public long offsetaddress; public long offsetcapacity; + public long validityddress; /** * fdsfsdfds. @@ -48,11 +49,12 @@ public static class ExpandResult { * @param offsetcap dfsfs * */ - public ExpandResult(long address, long capacity, long offsetad, long offsetcap) { + public ExpandResult(long address, long capacity, long offsetad, long offsetcap, long validAdd) { this.address = address; this.capacity = capacity; this.offsetaddress = offsetad; this.offsetcapacity = offsetcap; + this.validityddress = validAdd; } } @@ -71,6 +73,7 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { } int valueBufferIndex = 1; + int validBufferIndex = 0; ListVector vector = vectors[index]; while (vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity() < toCapacity) { vector.reAlloc(); @@ -88,7 +91,8 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { vector.getDataVector().getFieldBuffers().get(valueBufferIndex).memoryAddress(), vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity(), vector.getOffsetBuffer().memoryAddress(), - vector.getOffsetBuffer().capacity()); + vector.getOffsetBuffer().capacity(), + vector.getDataVector().getFieldBuffers().get(validBufferIndex).memoryAddress()); } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index b9801f6aacf2c..4049b1bad3d4d 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -27,6 +27,7 @@ import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.gandiva.ipc.GandivaTypes.SelectionVectorType; import org.apache.arrow.memory.ArrowBuf; 
+import org.apache.arrow.memory.ReferenceManager; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ValueVector; @@ -415,9 +416,10 @@ private void evaluate(int numRows, List buffers, List buf //vector valid logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); - outAddrs[idx] = ((ListVector) valueVector).getDataVector().getValidityBufferAddress(); + //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getValidityBufferAddress(); + //outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); - //vector offset logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); @@ -546,7 +548,7 @@ public void endList() { - /* + String s = ""; List fv = ((ListVector) valueVector).getDataVector().getFieldBuffers(); for (ArrowBuf ab : fv) { @@ -573,11 +575,20 @@ public void endList() { } logger.error("LR Projector.java before updating listvector. getOffsetBuffer=" + fvvv.capacity() + " buffer=" + s); - */ + ((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount * 5); ((ListVector) valueVector).setLastSet(selectionVectorRecordCount - 1); + + + ArrowBuf mabb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + s = "validity? buffer mabb2, outAddrs[2]="; + for (int i = 0; i < 20; i++) { + s += mabb2.getInt(i) + ","; + } + System.out.println(s); + /* //Validity then data. 
ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); @@ -624,7 +635,7 @@ public void endList() { //((ListVector) valueVector).setValueCount(selectionVectorRecordCount); //((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount); - int simple = 0; + /*TODO NEeD THIS int simple = 0; try { for (int i = 0; i < selectionVectorRecordCount * 5; i++) { BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); @@ -633,6 +644,8 @@ public void endList() { } catch (IndexOutOfBoundsException e) { simple = 0; } + */ + int simple = 0; try { for (int i = 0; i < selectionVectorRecordCount; i++) { BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); @@ -642,7 +655,7 @@ public void endList() { simple = 0; } - + From 7f62ffbba56b97591c828ba89a10f901008912cc Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 11 Oct 2023 10:54:04 -0700 Subject: [PATCH 21/46] Return validity basically working. 
--- cpp/src/gandiva/array_ops.cc | 16 ++++--- cpp/src/gandiva/array_ops.h | 2 +- cpp/src/gandiva/gdv_function_stubs.cc | 61 ++++++++++++++++++++++++--- cpp/src/gandiva/llvm_generator.cc | 2 +- 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 15cac9dcf7d96..a8b3a4c02b261 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -133,7 +133,7 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_offsets_len, const int32_t* notSureWhatThisIs, bool entry_valid, - int32_t remove_data, bool entry_validWhat, bool* valid_buf, int32_t* out_len, int32_t* valid_ptr) { + int32_t remove_data, bool entry_validWhat, bool* valid_buf, int32_t* out_len, int32_t** valid_ptr) { //std::cout << "LR array_int32_remove data=" << remove_data // << " entry_offsets_len " << entry_offsets_len << std::endl; @@ -160,8 +160,12 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, } }*/ - entry_validWhat = true; - std::bitset<8> outputValidBits; + std::cout << "LR notSureWhatThisIs=" << notSureWhatThisIs << std::endl; + //<< " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; + //std::bitset<10> maybeInputBits (*notSureWhatThisIs); + //std::cout << "LR maybeInputBits=" << maybeInputBits << std::endl; + entry_validWhat = true; + std::bitset<10> outputValidBits; std::vector outValid; for (int i = 0; i < entry_offsets_len; i++) { //std::cout << "LR going to check " << entry_buf + i << std::endl; @@ -199,10 +203,10 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, //*valid_len = 1; std::cout << "LR valid_buf is " << valid_buf << std::endl; std::cout << "LR outputValidBits is " << outputValidBits << std::endl; - valid_buf = reinterpret_cast(validRet); + //valid_buf = reinterpret_cast(validRet); - valid_ptr = 
reinterpret_cast(validRet); - std::cout << "LR setting valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " valid_ptr bitset data is " << std::bitset<8>(*valid_ptr) + *valid_ptr = reinterpret_cast(validRet); + std::cout << "LR setting valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " **valid_ptr=" << **valid_ptr << " valid_ptr bitset data is " << std::bitset<8>(**valid_ptr) << " return value is " << reinterpret_cast(ret) << std::endl; diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index 5886dadb38306..d83a25dcf9c4d 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -57,6 +57,6 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, bool entry_validWhat, bool* valid_buf, int32_t* out_len, - int32_t* valid_ptr); + int32_t** valid_ptr); } diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index cf85ffcc055e0..e4e472451e733 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -163,7 +163,7 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, #define POPULATE_NUMERIC_LIST_TYPE_VECTOR(TYPE, SCALE) \ int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ int32_t* offsets, int64_t slot, \ - TYPE* entry_buf, int32_t entry_len, int32_t* valid_ptr) { \ + TYPE* entry_buf, int32_t entry_len, int32_t** valid_ptr) { \ auto buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; \ @@ -181,10 +181,8 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, std::cout << "LR gdv_fn_populate_list_ 3 entry_buf=" << entry_buf << "]" << std::endl; \ std::cout << "LR gdv_fn_populate_list_ 3a entry_len=" << entry_len << " &entry_len=" << &entry_len << "]" << std::endl; \ std::cout << "LR gdv_fn_populate_list_ 4 buffer->validityBuffer=" << 
reinterpret_cast(buffer->validityBuffer) << "]" << std::endl; \ - int v[6] = {255, 255, 255, 255, 255, 255}; \ - memcpy(buffer->validityBuffer + slot, v, 6); \ std::cout << "LR gdv_fn_populate_list_ 5 valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << std::endl; \ - std::bitset<8> bs(*valid_ptr); \ + std::bitset<8> bs((unsigned long)(*valid_ptr)); \ std::cout << "LR bitset of valid ptr is " << bs << std::endl; \ offsets = reinterpret_cast(buffer->offsetBuffer); \ offsets[slot] = offset / SCALE; \ @@ -195,11 +193,62 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, // int32_t vv = 5; //memcpy(buffer->validityBuffer + slot, *vv, 1); - +//memcpy(buffer->validityBuffer + slot, bs.to_ulong(), 1); //buffer->offsetBuffer[slot] = offset / SCALE; //buffer->offsetBuffer[slot + 1] = offset / SCALE + entry_len; -POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) +int32_t gdv_fn_populate_list_int32_t_vector(int64_t context_ptr, int8_t* data_ptr, + int32_t* offsets, int64_t slot, + int32_t* entry_buf, int32_t entry_len, int32_t** valid_ptr) { + int SCALE = 4; + auto buffer = reinterpret_cast(data_ptr); + int32_t offset = static_cast(buffer->size()); + std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; + auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); + if (!status.ok()) { + gandiva::ExecutionContext* context = + reinterpret_cast(context_ptr); + context->set_error_msg(status.message().c_str()); + return -1; + } + std::cout << "LR gdv_fn_populate_list_ 2 valid_ptr" << valid_ptr << std::endl; + std::cout << "LR gdv_fn_populate_list_ " << buffer << " " << offset; \ + std::cout << " " << entry_len << " " << SCALE << "]]" << std::endl; \ + memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); + std::cout << "LR gdv_fn_populate_list_ 3 entry_buf=" << entry_buf << "]" << std::endl; + std::cout << "LR gdv_fn_populate_list_ 3a entry_len=" << entry_len << " &entry_len=" << &entry_len << "]" << 
std::endl; + std::cout << "LR gdv_fn_populate_list_ 4 buffer->validityBuffer=" << reinterpret_cast(buffer->validityBuffer) << "]" << std::endl; + //int v[6] = {255, 255, 255, 255, 255, 255}; + std::cout << "LR gdv_fn_populate_list_ 5 valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << std::endl; + int validbitIndex = offset / SCALE; + //int newValidSize = validbitIndex + entry_len; + + //TODO need to iterate over bits in valid_ptr since bitset rewuires compile time size. + std::bitset<10> bs((unsigned long)(*valid_ptr)); + for (int i = 0; i < entry_len; i++) { + + arrow::bit_util::SetBitTo(buffer->validityBuffer, validbitIndex + i, bs[i]); + } + + + /*std::bitset existingBits(buffer->validityBuffer); + std::cout << "LR bitset of existingBits " << existingBits << std::endl; + std::bitset<8> bs((unsigned long)(*valid_ptr)); + for (int i = 0; i < entry_len; i++) { + existingBits.set(validbitIndex + i, bs[i]); + } + std::bitset existingBits2(buffer->validityBuffer); + std::cout << "LR bitset of existingBits2 " << existingBits2 << std::endl; + */ + std::cout << "LR bitset of valid ptr is " << bs << std::endl; + offsets = reinterpret_cast(buffer->offsetBuffer); + offsets[slot] = offset / SCALE; + offsets[slot + 1] = offset / SCALE + entry_len; + std::cout << "LR gdv_fn_populate_list_ Done" << std::endl; + return 0; + } + +//POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) POPULATE_NUMERIC_LIST_TYPE_VECTOR(int64_t, 8) POPULATE_NUMERIC_LIST_TYPE_VECTOR(float, 4) POPULATE_NUMERIC_LIST_TYPE_VECTOR(double, 8) diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index f17d3a44ee367..283aa2ed91686 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -1624,7 +1624,7 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, auto validity = (valid_ptr == nullptr) ? 
nullptr - : builder->CreateLoad(valid_ptr->getAllocatedType(), valid_ptr); + : builder->CreateLoad(generator_->types()->i32_ptr_type(), valid_ptr); std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using validity=" << validity << " ptr=" << valid_ptr << std::endl; std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using value_len=" << value_len << " ptr=" << result_len_ptr << std::endl; return std::make_shared(value, value_len, validity); From 7a06e9e09bd73fd9470ce8a9a28daf0e0e98c329 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Mon, 16 Oct 2023 14:46:32 -0700 Subject: [PATCH 22/46] Working input validity, but large data is broken again. --- cpp/src/gandiva/annotator.cc | 34 +++- cpp/src/gandiva/array_ops.cc | 78 ++++++-- cpp/src/gandiva/array_ops.h | 4 + cpp/src/gandiva/field_descriptor.h | 8 +- cpp/src/gandiva/llvm_generator.cc | 179 +++++++++++++++++- cpp/src/gandiva/llvm_generator.h | 5 +- cpp/src/gandiva/projector.cc | 8 +- java/gandiva/src/main/cpp/jni_common.cc | 22 ++- .../arrow/gandiva/evaluator/Projector.java | 8 +- 9 files changed, 305 insertions(+), 41 deletions(-) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index 3e593c9f99fe3..9fa561fddb83d 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -64,9 +64,10 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { data_buffer_ptr_idx = buffer_count_++; } int child_valid_buffer_ptr_idx = FieldDescriptor::kInvalidIdx; - if (is_output) { + //if (is_output) { child_valid_buffer_ptr_idx = buffer_count_++; - } + std::cout << "LR Annotator::MakeDesc 2 child_valid_buffer_ptr_idx=" << child_valid_buffer_ptr_idx << std::endl; + //} return std::make_shared(field, data_idx, validity_idx, offsets_idx, data_buffer_ptr_idx, child_offsets_idx, child_valid_buffer_ptr_idx); } @@ -105,13 +106,13 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // for resizing uint8_t* child_offsets_buf =
reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3 " << &child_offsets_buf << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3a " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); uint8_t* child_valid_buf = reinterpret_cast( array_data.child_data.at(0)->buffers[0].get()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3 " << &child_valid_buf << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3b " << &child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, array_data.child_data.at(0)->offset); @@ -120,13 +121,13 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // if list field is input field, just put buffer data into eval batch uint8_t* child_offsets_buf = const_cast( array_data.child_data.at(0)->buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2 " << &child_offsets_buf << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2a " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); uint8_t* child_valid_buf = const_cast( array_data.child_data.at(0)->buffers[0]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2 " << &child_valid_buf << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2b " << &child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_valid_buf, array_data.child_data.at(0)->offset); } @@ -154,11 +155,30 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, //std::cout << "LR 
Annotator::PrepareBuffersForField 5 " << desc.Name() << " buffer_idx " << buffer_idx << std::endl; //std::cout << "LR Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; + std::cout << "LR array_data.child_data.at(0)->buffers[0]=" << array_data.child_data.at(0)->buffers[0] << std::endl; + //uint8_t* data_valid_buf = + // const_cast(array_data.child_data.at(0)->buffers[0]->data()); + //std::cout << "LR Annotator::PrepareBuffersForField setting offset eval data_valid_buf idx=" << 0 << " data_valid_buf=" << &data_valid_buf << std::endl; + //eval_batch->SetBuffer(desc.child_data_validity_idx(), data_valid_buf, array_data.child_data.at(0)->offset); + + uint8_t* data_buf = const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting offset eval buffer idx=" << buffer_idx << " data=" << &data_buf << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting data buffer desc.data_idx()=" << desc.data_idx() << " idx=" << buffer_idx << " data=" << data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); //std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; + + + std::cout << "LR array_data.child_data.at(0)->buffers[0]->data() is " << array_data.child_data.at(0)->buffers[0] << std::endl; + if (array_data.child_data.at(0)->buffers[0] ) { + uint8_t* child_valid_buf = const_cast( + array_data.child_data.at(0)->buffers[0]->data()); + //desc.set_child_data_validity_idx(4); + std::cout << "LR Annotator::PrepareBuffersForField setting child valid buffer -5b " << + " name=" << desc.Name() << " idx=" << desc.child_data_validity_idx() << " child_data_buf=" << *child_valid_buf << std::endl; + eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, 0); + } + } if (is_output) { diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 
a8b3a4c02b261..3019d70bbb4ac 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -132,12 +132,14 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, - int32_t entry_offsets_len, const int32_t* notSureWhatThisIs, bool entry_valid, - int32_t remove_data, bool entry_validWhat, bool* valid_buf, int32_t* out_len, int32_t** valid_ptr) { + int32_t entry_offsets_len, const int32_t* notSureWhatThisIs, bool combined_validity, + int32_t remove_data, bool entry_validWhat, + /*const int32_t* array_valid_bits,*/ int64_t loop_var, int64_t validity_index_var, const int64_t* offsets, + bool* valid_buf, int32_t* out_len, int32_t** valid_ptr) { //std::cout << "LR array_int32_remove data=" << remove_data // << " entry_offsets_len " << entry_offsets_len << std::endl; - std::cout << "LR array_int32_remove" << std::endl; + std::cout << "LR array_int32_remove " << loop_var << std::endl; std::vector newInts; @@ -160,12 +162,32 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, } }*/ - std::cout << "LR notSureWhatThisIs=" << notSureWhatThisIs << std::endl; + std::cout << "LR entry_buf=" << entry_buf << " *entry_buf=" << entry_buf << std::endl; + std::cout << "LR notSureWhatThisIs=" << notSureWhatThisIs << " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; + std::cout << "LR combined_validity=" << combined_validity << " entry_validWhat=" << entry_validWhat << " validity_index_var=" << validity_index_var << std::endl; //<< " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; - //std::bitset<10> maybeInputBits (*notSureWhatThisIs); - //std::cout << "LR maybeInputBits=" << maybeInputBits << std::endl; + const int32_t* notSureWhatThisIsAdjusted = notSureWhatThisIs - (loop_var ); + std::bitset<15> maybeInputBits (*notSureWhatThisIsAdjusted); + std::cout << "LR maybeInputBits=" << maybeInputBits << std::endl; + + + int64_t 
validityBitIndex = 0; + //for (int i = 0; i < loop_var; i++) { + // validityBitIndex += *(offsets + i); + // std::cout << "LR i=" << i << " adding offset " << *(offsets + i) << " offset is " << offsets << std::endl; + //} +validityBitIndex = validity_index_var - entry_offsets_len; + //TODO temp until the buffer is worked out. + //validityBitIndex -= (loop_var); + + + std::cout << "Using validityBitIndex=" << validityBitIndex << std::endl; + + + entry_validWhat = true; - std::bitset<10> outputValidBits; + //std::bitset<10> outputValidBits; + std::vector outValid; for (int i = 0; i < entry_offsets_len; i++) { //std::cout << "LR going to check " << entry_buf + i << std::endl; @@ -174,35 +196,51 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, if (entry_item == remove_data) { outValid.push_back(false); newInts.push_back(42); - outputValidBits[i] = 0; + //outputValidBits[i] = 0; entry_validWhat = false; + //TODO temp until buffer is worked out } else if (!arrow::bit_util::GetBit(reinterpret_cast(array_valid_bits), validityBitIndex + i)) { + } else if (!arrow::bit_util::GetBit(reinterpret_cast(notSureWhatThisIsAdjusted), validityBitIndex + i)) { + outValid.push_back(false); + newInts.push_back(0); + //outputValidBits[i] = 0; } else { outValid.push_back(true); //Note the vector can have n elements, while validbits might have n+1. newInts.push_back(entry_item); - outputValidBits[i] = 1; + //outputValidBits[i] = 1; } } *out_len = (int)newInts.size(); + + //Since this function can remove values we don't know the length ahead of time. + //LR TODO divide by 8 and ensure at least 1? + uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); + for (int i = 0; i < outValid.size(); i++) { + arrow::bit_util::SetBitTo(validRet, i, outValid[i]); + } + int32_t outBufferLength = (int)*out_len * sizeof(int); //length is number of items, but buffers must account for byte size. 
uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); memcpy(ret, newInts.data(), outBufferLength); //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; - bool validBools[*out_len]; - for (unsigned int i = 0; i < outValid.size(); i++) { - validBools[i] = outValid[i]; - } - uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); - //memcpy(validRet, validBools, *out_len); - unsigned long ll = outputValidBits.to_ulong(); - memcpy(validRet, &ll, 1); + *valid_buf = true; + + + //unsigned long ll = outputValidBits.to_ulong(); + if (!combined_validity) { + //ll = 0; + *out_len = 0; + *valid_buf = false; //this one is what works for the top level validity. + entry_validWhat = false; + } + //LR no need, set along the way. memcpy(validRet, &ll, 1); //*valid_len = 1; std::cout << "LR valid_buf is " << valid_buf << std::endl; - std::cout << "LR outputValidBits is " << outputValidBits << std::endl; + //std::cout << "LR outputValidBits is " << outputValidBits << std::endl; //valid_buf = reinterpret_cast(validRet); *valid_ptr = reinterpret_cast(validRet); @@ -279,6 +317,10 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { types->i1_type(), // bool validity types->i32_type(), //value to remove from input types->i1_type(), // bool validity + //types->i32_ptr_type(), //in validity bitmap + types->i64_type(), //in loop var + types->i64_type(), //in validity_index_var + types->i64_ptr_type(), //in offsets types->i1_ptr_type(), //valid buffer types->i32_ptr_type(), // out array length types->i32_ptr_type() //valid_ptr diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index d83a25dcf9c4d..bd57a11dafe6a 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -55,6 +55,10 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, bool entry_valid, int32_t remove_data, bool entry_validWhat, + //const int32_t* 
array_valid_bits, + int64_t loop_var, + int64_t validity_index_var, + const int64_t* offsets, bool* valid_buf, int32_t* out_len, int32_t** valid_ptr); diff --git a/cpp/src/gandiva/field_descriptor.h b/cpp/src/gandiva/field_descriptor.h index 2b72c45837fdb..db7b89854335a 100644 --- a/cpp/src/gandiva/field_descriptor.h +++ b/cpp/src/gandiva/field_descriptor.h @@ -38,7 +38,9 @@ class FieldDescriptor { offsets_idx_(offsets_idx), data_buffer_ptr_idx_(data_buffer_ptr_idx), child_offsets_idx_(child_offsets_idx), - child_validity_idx_(child_validity_idx) {} + child_validity_idx_(child_validity_idx) { + std::cout << "LR FieldDescriptor=" << Name() << " " << data_idx_ << "," << data_buffer_ptr_idx_ << "," << child_validity_idx_ << std::endl; + } /// Index of validity array in the array-of-buffers int validity_idx() const { return validity_idx_; } @@ -55,7 +57,9 @@ class FieldDescriptor { /// Index of list type child data offsets int child_data_offsets_idx() const { return child_offsets_idx_; } int child_data_validity_idx() const { return child_validity_idx_; } - + void set_child_data_validity_idx(int val) { + child_validity_idx_ = val; + } FieldPtr field() const { return field_; } const std::string& Name() const { return field_->name(); } diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 283aa2ed91686..f70fd6581ee39 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -209,6 +209,7 @@ llvm::Value* LLVMGenerator::GetValidityReference(llvm::Value* arg_addrs, int idx FieldPtr field) { const std::string& name = field->name(); llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); + std::cout << "LR LLVMGenerator::GetValidityReference name=" << name << " idx=" << idx << std::endl; return ir_builder()->CreateIntToPtr(load, types()->i64_ptr_type(), name + "_varray"); } @@ -217,6 +218,7 @@ llvm::Value* LLVMGenerator::GetDataBufferPtrReference(llvm::Value* arg_addrs, in FieldPtr 
field) { const std::string& name = field->name(); llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); + std::cout << "LR LLVMGenerator::GetDataBufferPtrReference name=" << name << " idx=" << idx << std::endl; return ir_builder()->CreateIntToPtr(load, types()->i8_ptr_type(), name + "_buf_ptr"); } @@ -395,11 +397,20 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, slice_offsets.push_back(offset); } + llvm::AllocaInst* validity_index_var = + new llvm::AllocaInst(types()->i64_type(), 0, "validity_index_var", loop_entry); + builder->CreateStore(types()->i64_constant(0), validity_index_var); + // Loop body builder->SetInsertPoint(loop_body); // define loop_var : start with 0, +1 after each iter llvm::PHINode* loop_var = builder->CreatePHI(types()->i64_type(), 2, "loop_var"); +//LR-VAR + //Define counter for index into list validity vector. + //llvm::PHINode* validity_index_var = builder->CreatePHI(types()->i64_type(), 2, "validity_index_var"); + + llvm::Value* position_var = loop_var; if (selection_vector_mode != SelectionVector::MODE_NONE) { @@ -414,7 +425,7 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, // The visitor can add code to both the entry/loop blocks. 
std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; Visitor visitor(this, fn, loop_entry, arg_addrs, arg_local_bitmaps, arg_holder_ptrs, - slice_offsets, arg_context_ptr, position_var); + slice_offsets, arg_context_ptr, position_var, arg_addr_offsets, validity_index_var); value_expr->Accept(visitor); LValuePtr output_value = visitor.result(); @@ -700,7 +711,8 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi llvm::Value* arg_local_bitmaps, llvm::Value* arg_holder_ptrs, std::vector slice_offsets, - llvm::Value* arg_context_ptr, llvm::Value* loop_var) + llvm::Value* arg_context_ptr, llvm::Value* loop_var, llvm::Value* arg_offsets, + llvm::Value* validity_index_var) : generator_(generator), function_(function), entry_block_(entry_block), @@ -710,6 +722,8 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi slice_offsets_(slice_offsets), arg_context_ptr_(arg_context_ptr), loop_var_(loop_var), + arg_offsets_(arg_offsets), + validity_index_var_(validity_index_var), has_arena_allocs_(false) { ADD_VISITOR_TRACE("Iteration %T", loop_var); } @@ -795,6 +809,26 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.DataIdx())); llvm::Value* data_list = builder->CreateGEP(type, slot_ref, slot_index); +//LR-VAR + // auto valid_var = builder->CreateIntCast(list_len, types->i64_type(), true); + //builder->CreateStore(valid_var, validity_index_var_); + + + + + + + auto list_len_var = builder->CreateIntCast(list_len, types->i64_type(), true); + llvm::Value* vv_end = builder->CreateLoad(generator_->types()->i64_type(),validity_index_var_, "vv_end"); + +llvm::Value* updated_validity_index_var = builder->CreateAdd( + vv_end, list_len_var, "validity_index_var+offset"); + + builder->CreateStore(updated_validity_index_var, validity_index_var_); + //builder->CreateStore(updated_validity_index_var, 
validity_index_var_); + + + // TODO: handle bool type bitmap //Validity bitmap. //llvm::Value* b_slot_ref = GetBufferReference(dex.ValidityIdx(), kBufferTypeValidity, dex.Field()); @@ -821,6 +855,9 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { // " length " << printType(list_len) << " data_list " << printType(data_list) << std::endl; ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " length %T", list_len); + ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " updated_validity_index_var %T", + updated_validity_index_var); + result_.reset(new LValue(data_list, list_len, validity)); } @@ -1143,12 +1180,69 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { auto params = BuildParams(dex.get_holder_idx(), dex.args(), true, native_function->NeedsContext()); + + + auto arrow_return_type = dex.func_descriptor()->return_type(); + auto arrow_return_type_id = arrow_return_type->id(); + + if (arrow_return_type_id == arrow::Type::LIST) + { + //Pointer to validity bitmap and bit starting index for accessing validity bits in the called function. + //llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); + //llvm::Value* validity = b_slot_ref; + + //Compute the bit offset. 
+ //int64_t validIndex = 0; + //for (int i = 0; i < loop_var_; i++) { + // validIndex += *(arg_offsets_ + i); + //} + + /*std::string str3 = "validity:"; + if (validity) { + llvm::raw_string_ostream output3(str3); + validity->print(output3); + }*/ + std::string str32 = "loopvar:"; + if (loop_var_) { + llvm::raw_string_ostream output3(str32); + loop_var_->print(output3); + } + std::cout << "LR VectorReadFixedLenValueListDex loopvar=" << str32 << " result()->length()=" << result()->length() << std::endl; + //TODO params.push_back(validity); + params.push_back(loop_var_); + +//LR-VAR + //llvm::Value* updated_validity_index_var = builder->CreateAdd( + // validity_index_var_, result()->length(), "validity_index_var+offset"); + // check loop_var + //loop_var->addIncoming(types()->i64_constant(0), loop_entry); + + //builder->CreateStore(updated_validity_index_var, validity_index_var_); + auto valid_var = builder->CreateLoad(types->i64_type(), validity_index_var_, "loaded_var"); + params.push_back(valid_var); + + + + + params.push_back(arg_offsets_); + } + + + + + + + + + + + // add an extra arg for validity (allocated on stack). llvm::AllocaInst* result_valid_ptr = new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_); params.push_back(result_valid_ptr); - auto arrow_return_type = dex.func_descriptor()->return_type(); + //auto arrow_return_type = dex.func_descriptor()->return_type(); result_ = BuildFunctionCall(native_function, arrow_return_type, ¶ms); // load the result validity and truncate to i1. 
@@ -1166,6 +1260,84 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), result_valid); } +/* +void LLVMGenerator::Visitor::Visit(const NullableInternalListFuncDex& dex) { + ADD_VISITOR_TRACE("visit NullableInternalListFuncDex base function " + + dex.func_descriptor()->name()); + llvm::IRBuilder<>* builder = ir_builder(); + LLVMTypes* types = generator_->types(); + + const NativeFunction* native_function = dex.native_function(); + + // build function params along with validity. + auto params = BuildParams(dex.get_holder_idx(), dex.args(), true, + native_function->NeedsContext()); + + auto arrow_return_type = dex.func_descriptor()->return_type(); + + + + auto arrow_type_id = arrow_return_type->arrow_return_type->id(); + + if (arrow_return_type_id == arrow::Type::LIST) + { + //Pointer to validity bitmap and bit starting index for accessing validity bits in the called function. + llvm::Value* b_slot_index = + builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx())); + llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); + //llvm::Value* validity = builder->CreateGEP(type, b_slot_ref, 0); + llvm::Value* validity = b_slot_ref; + + //Compute the bit offset. + //int64_t validIndex = 0; + //for (int i = 0; i < loop_var_; i++) { + // validIndex += *(arg_offsets_ + i); + //} + + std::string str3 = "validity:"; + if (validity) { + llvm::raw_string_ostream output3(str3); + validity->print(output3); + } + std::string str32 = "loopvar:"; + if (loop_var_) { + llvm::raw_string_ostream output3(str32); + loop_var_->print(output3); + } + std::cout << "LR VectorReadFixedLenValueListDex loopvar=" + str32 + " using validity " << str3 << std::endl; + params.push_back(validity); + params.push_back(loop_var_); + params.push_back(arg_offsets_); + } + + + + + + + + // add an extra arg for validity (allocated on stack). 
+ llvm::AllocaInst* result_valid_ptr = + new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_); + params.push_back(result_valid_ptr); + + result_ = BuildFunctionCall(native_function, arrow_return_type, ¶ms); + + // load the result validity and truncate to i1. + auto result_valid_i8 = builder->CreateLoad(types->i8_type(), result_valid_ptr); + llvm::Value* result_valid = builder->CreateTrunc(result_valid_i8, types->i1_type()); + + std::bitset<8> bs(dex.local_bitmap_idx()); + std::cout <<"LR NullableInternalListFuncDex validity from dex.local_bitmap_idx()=" << bs << std::endl; + + + auto result_valid_i8ptr = builder->CreateLoad(types->i8_ptr_type(), result_valid_ptr); + + std::cout << "LR NullableInternalListFuncDex function param validity=" << result_valid_i8ptr << std::endl; + // set validity bit in the local bitmap. + ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), result_valid); +}*/ + void LLVMGenerator::Visitor::Visit(const IfDex& dex) { ADD_VISITOR_TRACE("visit IfExpression"); llvm::IRBuilder<>* builder = ir_builder(); @@ -1751,6 +1923,7 @@ llvm::Value* LLVMGenerator::Visitor::GetLocalBitMapReference(int idx) { /// The local bitmap is pre-filled with 1s. Clear only if invalid. 
void LLVMGenerator::Visitor::ClearLocalBitMapIfNotValid(int local_bitmap_idx, llvm::Value* is_valid) { + ADD_VISITOR_TRACE("ClearLocalBitMapIfNotValid"); llvm::Value* slot_ref = GetLocalBitMapReference(local_bitmap_idx); generator_->ClearPackedBitValueIfFalse(slot_ref, loop_var_, is_valid); } diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 2d10871a81f98..556f7ce4294b6 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -98,7 +98,7 @@ class GANDIVA_EXPORT LLVMGenerator { llvm::BasicBlock* entry_block, llvm::Value* arg_addrs, llvm::Value* arg_local_bitmaps, llvm::Value* arg_holder_ptrs, std::vector slice_offsets, llvm::Value* arg_context_ptr, - llvm::Value* loop_var); + llvm::Value* loop_var, llvm::Value* validity_index, llvm::Value* arg_offsets); void Visit(const VectorReadValidityDex& dex) override; void Visit(const VectorReadFixedLenValueDex& dex) override; @@ -112,6 +112,7 @@ class GANDIVA_EXPORT LLVMGenerator { void Visit(const NonNullableFuncDex& dex) override; void Visit(const NullableNeverFuncDex& dex) override; void Visit(const NullableInternalFuncDex& dex) override; + //void Visit(const NullableInternalListFuncDex& dex) override; void Visit(const IfDex& dex) override; void Visit(const BooleanAndDex& dex) override; void Visit(const BooleanOrDex& dex) override; @@ -182,6 +183,8 @@ class GANDIVA_EXPORT LLVMGenerator { std::vector slice_offsets_; llvm::Value* arg_context_ptr_; llvm::Value* loop_var_; + llvm::Value* arg_offsets_; + llvm::Value* validity_index_var_; bool has_arena_allocs_; }; diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 97f28f652ea22..1ebf1e9d8d7a6 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -463,15 +463,19 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); + + //LR TODO not sure this is needed. 
+ ARROW_ASSIGN_OR_RAISE(auto data_valid_buffer, arrow::AllocateResizableBuffer(data_len, pool)); + //std::cout << "LR Projector::AllocArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { // std::cout << "LR Projector::AllocArrayData List. There are number of buffers=" << buffers.size() << std::endl; auto internal_type = type->field(0)->type(); ArrayDataPtr child_data; if (arrow::is_primitive(internal_type->id())) { - //std::cout << "LR Projector::AllocArrayData List 1" << std::endl; + std::cout << "LR Projector::AllocArrayData List 1" << std::endl; child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, - {nullptr, std::move(buffers[2])}, 0); + {std::move(data_valid_buffer), std::move(buffers[2])}, 0); } if (arrow::is_binary_like(internal_type->id())) { //std::cout << "LR Projector::AllocArrayData List 2" << std::endl; diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index b37615f209d45..9a40e67e61875 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -643,6 +643,7 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto validity = std::shared_ptr( new arrow::Buffer(reinterpret_cast(validity_addr), validity_size)); buffers.push_back(validity); + std::cout << "LR make_record_batch_with_buf_addrs adding validity_addr buffer=" << validity_addr << " idx=" << buf_idx - 1 << std::endl; if (buf_idx >= in_bufs_len) { return Status::Invalid("insufficient number of in_buf_addrs"); @@ -652,6 +653,7 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto data = std::shared_ptr( new arrow::Buffer(reinterpret_cast(value_addr), value_size)); buffers.push_back(data); + std::cout << "LR make_record_batch_with_buf_addrs adding value_addr buffer=" << value_addr << " idx=" << buf_idx - 1 << std::endl; if (arrow::is_binary_like(field->type()->id())) { if (buf_idx >= in_bufs_len) { @@ -664,6 +666,7 @@ Status 
make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto offsets = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); + std::cout << "LR make_record_batch_with_buf_addrs adding offsets_addr buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; } ////////// @@ -684,7 +687,12 @@ auto type_id = type->id(); auto offsets = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); - + std::cout << "LR make_record_batch_with_buf_addrs 2a adding offsets_addr buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; + std::cout << "LR bits are "; + for (int i = 0; i < 15; i++) { + std::cout << arrow::bit_util::GetBit(reinterpret_cast(offsets_addr), i) << ","; + } + std::cout << std::endl; if (arrow::is_binary_like(type->field(0)->type()->id())) { // child offsets length is internal data length + 1 @@ -696,6 +704,7 @@ auto type_id = type->id(); auto child_offsets_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(std::move(child_offsets_buffer)); + std::cout << "LR make_record_batch_with_buf_addrs 2b adding child_offsets_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; } } @@ -705,17 +714,22 @@ auto type_id = type->id(); jlong offsets_addr = in_buf_addrs[buf_idx++]; jlong offsets_size = in_buf_sizes[sz_idx++]; auto data_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); - + std::cout << "LR make_record_batch_with_buf_addrs 3 adding data_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; //std::cout << "LR New ArrayData List" << std::endl; auto internal_type = type->field(0)->type(); std::shared_ptr child_data; if (arrow::is_primitive(internal_type->id())) { - //std::cout << "LR New ArrayData List 1" << std::endl; + std::cout << "LR New ArrayData List creating child data" << std::endl; + for 
(int i = 0; i < buffers.size(); i++) { + std::cout << "buffer for child data " << i << "=" << *buffers[i]->data() << std::endl; + } child_data = arrow::ArrayData::Make(internal_type, 0, - {nullptr, std::move(data_buffer)}, 0); + {std::move(buffers[2]), std::move(data_buffer)}); + std::cout << "child_data is =" << child_data->buffers[0] << " 1=" << child_data->buffers[1] << std::endl; } if (arrow::is_binary_like(internal_type->id())) { + //LR TODO need this for strings I think. //std::cout << "LR New ArrayData List NYI 2" << std::endl; //child_data = arrow::ArrayData::Make( // internal_type, 0, diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 4049b1bad3d4d..e57e7e90f9e0b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -29,7 +29,6 @@ import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.ReferenceManager; import org.apache.arrow.vector.BaseVariableWidthVector; -import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; import org.apache.arrow.vector.complex.ListVector; @@ -415,7 +414,7 @@ private void evaluate(int numRows, List buffers, List buf outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); //vector valid - logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); + logger.error("LR Projector.java evaluate isVarlistvector Width setting vector validity buffer=" + idx); //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getValidityBufferAddress(); //outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); @@ -645,7 +644,8 @@ 
public void endList() { simple = 0; } */ - int simple = 0; + /* int simple = 0; + import org.apache.arrow.vector.BitVectorHelper; try { for (int i = 0; i < selectionVectorRecordCount; i++) { BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); @@ -654,7 +654,7 @@ public void endList() { } catch (IndexOutOfBoundsException e) { simple = 0; } - +*/ From a9a7f51b55d21d77584e65b1381b58077b8ae17e Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 20 Oct 2023 09:54:31 -0700 Subject: [PATCH 23/46] Everything working --- cpp/src/arrow/buffer.h | 3 + cpp/src/gandiva/annotator.cc | 34 +-- cpp/src/gandiva/array_ops.cc | 193 +++++++++--------- cpp/src/gandiva/array_ops.h | 28 ++- cpp/src/gandiva/bitmap_accumulator.h | 4 +- cpp/src/gandiva/expr_decomposer.cc | 2 +- cpp/src/gandiva/field_descriptor.h | 2 +- cpp/src/gandiva/function_registry_array.cc | 2 +- cpp/src/gandiva/function_registry_string.cc | 2 + cpp/src/gandiva/gdv_function_stubs.cc | 26 +-- cpp/src/gandiva/llvm_generator.cc | 130 ++++++------ cpp/src/gandiva/llvm_generator.h | 3 +- cpp/src/gandiva/lvalue.h | 6 +- cpp/src/gandiva/projector.cc | 4 +- java/gandiva/src/main/cpp/jni_common.cc | 85 ++++---- .../gandiva/evaluator/ListVectorExpander.java | 34 ++- .../arrow/gandiva/evaluator/Projector.java | 16 +- .../gandiva/evaluator/VectorExpander.java | 1 + 18 files changed, 308 insertions(+), 267 deletions(-) diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 598b393e5a80c..8daa8bafaaf39 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -448,12 +448,14 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { uint8_t* offsetBuffer; int64_t offsetCapacity; uint8_t* validityBuffer; + uint8_t* outerValidityBuffer; protected: ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) { offsetBuffer = nullptr; offsetCapacity = 0; validityBuffer = nullptr; + outerValidityBuffer = nullptr; } ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr 
mm) @@ -461,6 +463,7 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { offsetBuffer = nullptr; offsetCapacity = 0; validityBuffer = nullptr; + outerValidityBuffer = nullptr; } }; diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index 9fa561fddb83d..dbc1cc50babaf 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -53,7 +53,7 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { } if (field->type()->id() == arrow::Type::LIST) { - std::cout << "LR Annotator::MakeDesc 1" << std::endl; + //std::cout << "LR Annotator::MakeDesc 1" << std::endl; offsets_idx = buffer_count_++; if (arrow::is_binary_like(field->type()->field(0)->type()->id())) { child_offsets_idx = buffer_count_++; @@ -66,7 +66,7 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { int child_valid_buffer_ptr_idx = FieldDescriptor::kInvalidIdx; //if (is_output) { child_valid_buffer_ptr_idx = buffer_count_++; - std::cout << "LR Annotator::MakeDesc 2 child_valid_buffer_ptr_idx=" << child_valid_buffer_ptr_idx << std::endl; + //std::cout << "LR Annotator::MakeDesc 2 child_valid_buffer_ptr_idx=" << child_valid_buffer_ptr_idx << std::endl; //} return std::make_shared(field, data_idx, validity_idx, offsets_idx, data_buffer_ptr_idx, child_offsets_idx, child_valid_buffer_ptr_idx); @@ -86,17 +86,17 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // The validity buffer is optional. Use nullptr if it does not have one. 
if (array_data.buffers[buffer_idx]) { uint8_t* validity_buf = const_cast(array_data.buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -6 " << &validity_buf << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -6 " << &validity_buf << std::endl; eval_batch->SetBuffer(desc.validity_idx(), validity_buf, array_data.offset); } else { - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -5 null " << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -5 null " << std::endl; eval_batch->SetBuffer(desc.validity_idx(), nullptr, array_data.offset); } ++buffer_idx; if (desc.HasOffsetsIdx()) { uint8_t* offsets_buf = const_cast(array_data.buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -4 " << &offsets_buf << " using idx=" << buffer_idx << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -4 " << &offsets_buf << " using idx=" << buffer_idx << std::endl; eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset); if (desc.HasChildOffsetsIdx()) { @@ -106,13 +106,13 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // for resizing uint8_t* child_offsets_buf = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3a " << &child_offsets_buf << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3a " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); uint8_t* child_valid_buf = reinterpret_cast( array_data.child_data.at(0)->buffers[0].get()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3b " << &child_valid_buf << std::endl; + //std::cout << "LR 
Annotator::PrepareBuffersForField setting eval buffer -3b " << &child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, array_data.child_data.at(0)->offset); @@ -121,13 +121,13 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // if list field is input field, just put buffer data into eval batch uint8_t* child_offsets_buf = const_cast( array_data.child_data.at(0)->buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2a " << &child_offsets_buf << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2a " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); uint8_t* child_valid_buf = const_cast( array_data.child_data.at(0)->buffers[0]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2b " << &child_valid_buf << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2b " << &child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_valid_buf, array_data.child_data.at(0)->offset); } @@ -148,14 +148,14 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, //std::cout << "LR Annotator::PrepareBuffersForField 4 buffer_idx " << buffer_idx << std::endl; uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); //std::cout << "LR Annotator::PrepareBuffersForField 4a" << std::endl; - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -1 " << &data_buf << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -1 " << &data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); //std::cout << "LR Annotator::PrepareBuffersForField 4b" << std::endl; } else { //std::cout << "LR Annotator::PrepareBuffersForField 5 " << desc.Name() << " 
buffer_idx " << buffer_idx << std::endl; //std::cout << "LR Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; - std::cout << "LR array_data.child_data.at(0)->buffers[0]=" << array_data.child_data.at(0)->buffers[0] << std::endl; + //std::cout << "LR array_data.child_data.at(0)->buffers[0]=" << array_data.child_data.at(0)->buffers[0] << std::endl; //uint8_t* data_valid_buf = // const_cast(array_data.child_data.at(0)->buffers[0]->data()); //std::cout << "LR Annotator::PrepareBuffersForField setting offset eval data_valid_buf idx=" << 0 << " data_valid_buf=" << &data_valid_buf << std::endl; @@ -164,18 +164,18 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, uint8_t* data_buf = const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); - std::cout << "LR Annotator::PrepareBuffersForField setting data buffer desc.data_idx()=" << desc.data_idx() << " idx=" << buffer_idx << " data=" << data_buf << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting data buffer desc.data_idx()=" << desc.data_idx() << " idx=" << buffer_idx << " data=" << data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); //std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; - std::cout << "LR array_data.child_data.at(0)->buffers[0]->data() is " << array_data.child_data.at(0)->buffers[0] << std::endl; + //std::cout << "LR array_data.child_data.at(0)->buffers[0]->data() is " << array_data.child_data.at(0)->buffers[0] << std::endl; if (array_data.child_data.at(0)->buffers[0] ) { uint8_t* child_valid_buf = const_cast( array_data.child_data.at(0)->buffers[0]->data()); //desc.set_child_data_validity_idx(4); - std::cout << "LR Annotator::PrepareBuffersForField setting child valid buffer -5b " << - " name=" << desc.Name() << " idx=" << desc.child_data_validity_idx() << " child_data_buf=" << *child_valid_buf << std::endl; + // 
std::cout << "LR Annotator::PrepareBuffersForField setting child valid buffer -5b " << + //" name=" << desc.Name() << " idx=" << desc.child_data_validity_idx() << " child_data_buf=" << *child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, 0); } @@ -187,7 +187,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, if (array_data.type->id() != arrow::Type::LIST) { uint8_t* data_buf_ptr = reinterpret_cast(array_data.buffers[buffer_idx].get()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 1 " << &data_buf_ptr << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 1 " << &data_buf_ptr << std::endl; eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); } else { //std::cout << "LR Annotator::PrepareBuffersForField is_output index " << desc.data_buffer_ptr_idx() << std::endl; @@ -195,7 +195,7 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // list data buffer is in child data buffer uint8_t* data_buf_ptr = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); - std::cout << "LR Annotator::PrepareBuffersForField setting eval data buffer " << buffer_idx << " data=" << &data_buf_ptr << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField setting eval data buffer " << buffer_idx << " data=" << &data_buf_ptr << std::endl; eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.child_data.at(0)->offset); diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 3019d70bbb4ac..d83cd0a8986e6 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -49,39 +49,31 @@ bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, } bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, - int32_t entry_offsets_len, - int32_t contains_data) { - //std::cout << "LR 
array_int32_contains_int32 offset length=" << entry_offsets_len << std::endl; - for (int i = 0; i < entry_offsets_len; i++) { - //std::cout << "LR going to check " << entry_buf + i << std::endl; - //LR TODO - int32_t entry_len = *(entry_buf + i); - //coming as int64 for some reason. *2 - //int32_t entry_len = *(entry_buf + (i * 2)); - //std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; - if (entry_len == contains_data) { - return true; - } + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + int32_t contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row) { + if (!combined_row_validity) { + *valid_row = false; + return false; } - return false; -} + *valid_row = true; -bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, - int32_t entry_offsets_len, - int64_t contains_data) { - //std::cout << "LR array_int64_contains_int64 offset length=" << entry_offsets_len << std::endl; - for (int i = 0; i < entry_offsets_len; i++) { - //std::cout << "LR going to check " << entry_buf + i << std::endl; - int64_t entry_len = *(entry_buf + (i*2)); //LR TODO sizeof int64? 
- //std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; - if (entry_len == contains_data) { + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = validity_index_var - entry_len; + + for (int i = 0; i < entry_len; i++) { + if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + continue; + } + int32_t entry_val = *(entry_buf + i); + if (entry_val == contains_data) { return true; } } return false; } - int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int32_t* out_len) { //std::cout << "LR array_int32_make_array offset data=" << contains_data << std::endl; @@ -96,50 +88,44 @@ int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int3 //return reinterpret_cast(ret); return reinterpret_cast(ret); } -/* -int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, - int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { - //std::cout << "LR array_int32_remove data=" << remove_data - // << " entry_offsets_len " << entry_offsets_len << std::endl; - //LR sizes are HACK - int* integers = new int[5]; - int j = 0; - for (int i = 0; i < entry_offsets_len; i++) { - //std::cout << "LR going to check " << entry_buf + i << std::endl; - int32_t entry_len = *(entry_buf + (i * 1)); - //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; - if (entry_len == remove_data) { - continue; - } else { - integers[j++] = entry_len; - } +bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + int64_t contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row) { + //std::cout << "LR array_int64_contains_int64 offset length=" << entry_offsets_len << std::endl; + if (!combined_row_validity) { + 
*valid_row = false; + return false; } + *valid_row = true; - *out_len = 5;// * 4; - //length is number of items, but buffers must account for byte size. - uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); - memcpy(ret, integers, *out_len * 4); - //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = validity_index_var - entry_len; - delete [] integers; - //return reinterpret_cast(ret); - return reinterpret_cast(ret); + for (int i = 0; i < entry_len; i++) { + if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + continue; + } + int64_t entry_len = *(entry_buf + (i*2)); //LR TODO sizeof int64? + //std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; + if (entry_len == contains_data) { + return true; + } + } + return false; } -*/ - - - int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, - int32_t entry_offsets_len, const int32_t* notSureWhatThisIs, bool combined_validity, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, int32_t remove_data, bool entry_validWhat, - /*const int32_t* array_valid_bits,*/ int64_t loop_var, int64_t validity_index_var, const int64_t* offsets, - bool* valid_buf, int32_t* out_len, int32_t** valid_ptr) { + /*const int32_t* array_valid_bits,*/ int64_t loop_var, int64_t validity_index_var, + bool* valid_row, int32_t* out_len, int32_t** valid_ptr) { //std::cout << "LR array_int32_remove data=" << remove_data // << " entry_offsets_len " << entry_offsets_len << std::endl; - std::cout << "LR array_int32_remove " << loop_var << std::endl; + //std::cout << "LR array_int32_remove " << loop_var << std::endl; std::vector newInts; @@ -162,13 +148,17 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, } }*/ - 
std::cout << "LR entry_buf=" << entry_buf << " *entry_buf=" << entry_buf << std::endl; - std::cout << "LR notSureWhatThisIs=" << notSureWhatThisIs << " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; - std::cout << "LR combined_validity=" << combined_validity << " entry_validWhat=" << entry_validWhat << " validity_index_var=" << validity_index_var << std::endl; + //std::cout << "LR entry_buf=" << entry_buf << " *entry_buf=" << entry_buf << std::endl; + //std::cout << "LR notSureWhatThisIs=" << notSureWhatThisIs << " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; + std::cout << "LR combined_row_validity=" << combined_row_validity << " entry_validWhat=" << entry_validWhat << " validity_index_var=" << validity_index_var << + " entry_validity=" << entry_validity << std::endl; //<< " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; - const int32_t* notSureWhatThisIsAdjusted = notSureWhatThisIs - (loop_var ); - std::bitset<15> maybeInputBits (*notSureWhatThisIsAdjusted); - std::cout << "LR maybeInputBits=" << maybeInputBits << std::endl; + + //LR TODO not sure what entry_validWhat is. + //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + //std::bitset<15> maybeInputBits (*notSureWhatThisIsAdjusted); + //std::cout << "LR maybeInputBits=" << maybeInputBits << std::endl; int64_t validityBitIndex = 0; @@ -176,12 +166,14 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, // validityBitIndex += *(offsets + i); // std::cout << "LR i=" << i << " adding offset " << *(offsets + i) << " offset is " << offsets << std::endl; //} -validityBitIndex = validity_index_var - entry_offsets_len; + + //The validity index already has the current row length added to it, so decrement. +validityBitIndex = validity_index_var - entry_len; //TODO temp until the buffer is worked out. 
//validityBitIndex -= (loop_var); - std::cout << "Using validityBitIndex=" << validityBitIndex << std::endl; + //std::cout << "Using validityBitIndex=" << validityBitIndex << std::endl; @@ -189,17 +181,16 @@ validityBitIndex = validity_index_var - entry_offsets_len; //std::bitset<10> outputValidBits; std::vector outValid; - for (int i = 0; i < entry_offsets_len; i++) { + for (int i = 0; i < entry_len; i++) { //std::cout << "LR going to check " << entry_buf + i << std::endl; int32_t entry_item = *(entry_buf + (i * 1)); //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; if (entry_item == remove_data) { - outValid.push_back(false); - newInts.push_back(42); - //outputValidBits[i] = 0; - entry_validWhat = false; + //outValid.push_back(false); + //newInts.push_back(42); + //entry_validWhat = false; //TODO temp until buffer is worked out } else if (!arrow::bit_util::GetBit(reinterpret_cast(array_valid_bits), validityBitIndex + i)) { - } else if (!arrow::bit_util::GetBit(reinterpret_cast(notSureWhatThisIsAdjusted), validityBitIndex + i)) { + } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { outValid.push_back(false); newInts.push_back(0); //outputValidBits[i] = 0; @@ -227,25 +218,25 @@ validityBitIndex = validity_index_var - entry_offsets_len; //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; - *valid_buf = true; + *valid_row = true; //unsigned long ll = outputValidBits.to_ulong(); - if (!combined_validity) { + if (!combined_row_validity) { //ll = 0; *out_len = 0; - *valid_buf = false; //this one is what works for the top level validity. + *valid_row = false; //this one is what works for the top level validity. entry_validWhat = false; } //LR no need, set along the way. 
memcpy(validRet, &ll, 1); //*valid_len = 1; - std::cout << "LR valid_buf is " << valid_buf << std::endl; + //std::cout << "LR valid_buf is " << valid_buf << std::endl; //std::cout << "LR outputValidBits is " << outputValidBits << std::endl; //valid_buf = reinterpret_cast(validRet); *valid_ptr = reinterpret_cast(validRet); - std::cout << "LR setting valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " **valid_ptr=" << **valid_ptr << " valid_ptr bitset data is " << std::bitset<8>(**valid_ptr) - << " return value is " << reinterpret_cast(ret) << std::endl; + //std::cout << "LR setting valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " **valid_ptr=" << **valid_ptr << " valid_ptr bitset data is " << std::bitset<8>(**valid_ptr) + // << " return value is " << reinterpret_cast(ret) << std::endl; //return reinterpret_cast(ret); @@ -284,9 +275,16 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { reinterpret_cast(array_utf8_contains_utf8)); args = {types->i64_type(), // int64_t execution_context - types->i32_ptr_type(), // int8_t* data ptr - types->i32_type(), // int32_t child offsets length - types->i32_type()}; // int32_t contains data length + types->i64_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t data length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity + types->i32_type(), // int32_t value to check for + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. 
+ types->i1_ptr_type() //output validity for the row + }; engine->AddGlobalMappingForFunc("array_int32_contains_int32", types->i1_type() /*return_type*/, args, @@ -294,8 +292,15 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { args = {types->i64_type(), // int64_t execution_context types->i64_ptr_type(), // int8_t* data ptr - types->i32_type(), // int32_t child offsets length - types->i64_type()}; // int32_t contains data length + types->i32_type(), // int32_t data length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity + types->i64_type(), // int32_t value to check for + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. + types->i1_ptr_type() //output validity for the row + }; engine->AddGlobalMappingForFunc("array_int64_contains_int64", types->i1_type() /*return_type*/, args, @@ -311,19 +316,17 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { reinterpret_cast(array_int32_make_array)); args = {types->i64_type(), // int64_t execution_context - types->i32_ptr_type(), // int8_t* data ptr - types->i32_type(), // int32_t child offsets length - types->i32_ptr_type(), // Not Sure??? - types->i1_type(), // bool validity + types->i32_ptr_type(), // int8_t* input data ptr + types->i32_type(), // int32_t input length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity types->i32_type(), //value to remove from input - types->i1_type(), // bool validity - //types->i32_ptr_type(), //in validity bitmap - types->i64_type(), //in loop var - types->i64_type(), //in validity_index_var - types->i64_ptr_type(), //in offsets - types->i1_ptr_type(), //valid buffer - types->i32_ptr_type(), // out array length - types->i32_ptr_type() //valid_ptr + types->i1_type(), // bool validity --Needed? 
+ types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. + types->i1_ptr_type(), //output validity for the row + types->i32_ptr_type(), // output array length + types->i32_ptr_type() //output pointer to new validity buffer }; diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index bd57a11dafe6a..8fdf957f3d22c 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -36,12 +36,16 @@ int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len); GANDIVA_EXPORT bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, - int32_t entry_offsets_len, - int32_t contains_data); + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + int32_t contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_buf); GANDIVA_EXPORT bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, - int32_t entry_offsets_len, - int64_t contains_data); + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + int64_t contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_buf); GANDIVA_EXPORT int32_t* array_int32_make_array(int64_t context_ptr, @@ -50,17 +54,9 @@ int32_t* array_int32_make_array(int64_t context_ptr, GANDIVA_EXPORT int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, - int32_t entry_offsets_len, - const int32_t* notSureWhatThisIs, - bool entry_valid, - int32_t remove_data, - bool entry_validWhat, - //const int32_t* array_valid_bits, - int64_t loop_var, - int64_t validity_index_var, - const int64_t* offsets, - bool* valid_buf, - int32_t* out_len, - int32_t** valid_ptr); + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + int32_t remove_data, bool entry_validWhat, + int64_t 
loop_var, int64_t validity_index_var, + bool* valid_buf, int32_t* out_len, int32_t** valid_ptr); } diff --git a/cpp/src/gandiva/bitmap_accumulator.h b/cpp/src/gandiva/bitmap_accumulator.h index 2a115f3830e23..f67b58847ce70 100644 --- a/cpp/src/gandiva/bitmap_accumulator.h +++ b/cpp/src/gandiva/bitmap_accumulator.h @@ -37,11 +37,11 @@ class GANDIVA_EXPORT BitMapAccumulator : public DexDefaultVisitor { void Visit(const VectorReadValidityDex& dex) { int idx = dex.ValidityIdx(); - std::cout << "LR BitMapAccumulator visiting " << idx << std::endl; + //std::cout << "LR BitMapAccumulator visiting " << idx << std::endl; auto bitmap = eval_batch_.GetBuffer(idx); // The bitmap could be null. Ignore it in this case. if (bitmap != NULLPTR) { - std::cout << "LR BitMapAccumulator is not null " << bitmap << std::endl; + //std::cout << "LR BitMapAccumulator is not null " << bitmap << std::endl; src_maps_.push_back(bitmap); src_map_offsets_.push_back(eval_batch_.GetBufferOffset(idx)); } diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index ec1a8f9e16039..f35b3bc5cc5e8 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -38,7 +38,7 @@ namespace gandiva { Status ExprDecomposer::Visit(const FieldNode& node) { auto desc = annotator_.CheckAndAddInputFieldDescriptor(node.field()); - std::cout << "LR ExprDecomposer" << std::endl; + //std::cout << "LR ExprDecomposer" << std::endl; DexPtr validity_dex = std::make_shared(desc); DexPtr value_dex; if (desc->HasChildOffsetsIdx()) { diff --git a/cpp/src/gandiva/field_descriptor.h b/cpp/src/gandiva/field_descriptor.h index db7b89854335a..0df7d4f2f2aaa 100644 --- a/cpp/src/gandiva/field_descriptor.h +++ b/cpp/src/gandiva/field_descriptor.h @@ -39,7 +39,7 @@ class FieldDescriptor { data_buffer_ptr_idx_(data_buffer_ptr_idx), child_offsets_idx_(child_offsets_idx), child_validity_idx_(child_validity_idx) { - std::cout << "LR FieldDescriptor=" << Name() << " " << data_idx_ << 
"," << data_buffer_ptr_idx_ << "," << child_validity_idx_ << std::endl; + //std::cout << "LR FieldDescriptor=" << Name() << " " << data_idx_ << "," << data_buffer_ptr_idx_ << "," << child_validity_idx_ << std::endl; } /// Index of validity array in the array-of-buffers diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index 826deb24bbbf0..f7c587a64b74d 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -29,7 +29,7 @@ std::vector GetArrayFunctionRegistry() { kResultNullIfNull, "array_utf8_length", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int32()), int32()}, - boolean(), kResultNullIfNull, "array_int32_contains_int32", + boolean(), kResultNullInternal, "array_int32_contains_int32", NativeFunction::kNeedsContext), NativeFunction("array_contains", {}, DataTypeVector{list(int32()), int32()}, boolean(), kResultNullIfNull, "array_int32_contains_int32", diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 442cdecbde7d3..edb900e976c59 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -316,6 +316,8 @@ std::vector GetStringFunctionRegistry() { // concat treats null inputs as empty strings whereas concatOperator returns null if // one of the inputs is null + NativeFunction("concatGandiva", {}, DataTypeVector{utf8(), utf8()}, utf8(), + kResultNullNever, "concat_utf8_utf8", NativeFunction::kNeedsContext), NativeFunction("concat", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullNever, "concat_utf8_utf8", NativeFunction::kNeedsContext), NativeFunction("concat", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index e4e472451e733..76465020bb93a 100644 --- 
a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -203,7 +203,7 @@ int32_t gdv_fn_populate_list_int32_t_vector(int64_t context_ptr, int8_t* data_pt int SCALE = 4; auto buffer = reinterpret_cast(data_ptr); int32_t offset = static_cast(buffer->size()); - std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; + //std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); if (!status.ok()) { gandiva::ExecutionContext* context = @@ -211,23 +211,23 @@ int32_t gdv_fn_populate_list_int32_t_vector(int64_t context_ptr, int8_t* data_pt context->set_error_msg(status.message().c_str()); return -1; } - std::cout << "LR gdv_fn_populate_list_ 2 valid_ptr" << valid_ptr << std::endl; - std::cout << "LR gdv_fn_populate_list_ " << buffer << " " << offset; \ - std::cout << " " << entry_len << " " << SCALE << "]]" << std::endl; \ + //std::cout << "LR gdv_fn_populate_list_ 2 valid_ptr" << valid_ptr << std::endl; + //std::cout << "LR gdv_fn_populate_list_ " << buffer << " " << offset; + //std::cout << " " << entry_len << " " << SCALE << "]]" << std::endl; memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); - std::cout << "LR gdv_fn_populate_list_ 3 entry_buf=" << entry_buf << "]" << std::endl; - std::cout << "LR gdv_fn_populate_list_ 3a entry_len=" << entry_len << " &entry_len=" << &entry_len << "]" << std::endl; - std::cout << "LR gdv_fn_populate_list_ 4 buffer->validityBuffer=" << reinterpret_cast(buffer->validityBuffer) << "]" << std::endl; + //std::cout << "LR gdv_fn_populate_list_ 3 entry_buf=" << entry_buf << "]" << std::endl; + //std::cout << "LR gdv_fn_populate_list_ 3a entry_len=" << entry_len << " &entry_len=" << &entry_len << "]" << std::endl; + //std::cout << "LR gdv_fn_populate_list_ 4 buffer->validityBuffer=" << reinterpret_cast(buffer->validityBuffer) << "]" << std::endl; //int v[6] = {255, 255, 255, 255, 255, 255}; - std::cout 
<< "LR gdv_fn_populate_list_ 5 valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << std::endl; + int validbitIndex = offset / SCALE; + //std::cout << "LR gdv_fn_populate_list_ 5 valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " validbitIndex=" << validbitIndex << std::endl; //int newValidSize = validbitIndex + entry_len; - //TODO need to iterate over bits in valid_ptr since bitset rewuires compile time size. - std::bitset<10> bs((unsigned long)(*valid_ptr)); + //LR TODO just copy? for (int i = 0; i < entry_len; i++) { - arrow::bit_util::SetBitTo(buffer->validityBuffer, validbitIndex + i, bs[i]); + arrow::bit_util::SetBitTo(buffer->validityBuffer, validbitIndex + i, arrow::bit_util::GetBit(reinterpret_cast(valid_ptr), i)); } @@ -240,11 +240,11 @@ int32_t gdv_fn_populate_list_int32_t_vector(int64_t context_ptr, int8_t* data_pt std::bitset existingBits2(buffer->validityBuffer); std::cout << "LR bitset of existingBits2 " << existingBits2 << std::endl; */ - std::cout << "LR bitset of valid ptr is " << bs << std::endl; + //std::cout << "LR bitset of valid ptr is " << bs << std::endl; offsets = reinterpret_cast(buffer->offsetBuffer); offsets[slot] = offset / SCALE; offsets[slot + 1] = offset / SCALE + entry_len; - std::cout << "LR gdv_fn_populate_list_ Done" << std::endl; + //std::cout << "LR gdv_fn_populate_list_ Done" << std::endl; return 0; } diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index f70fd6581ee39..7fa7073a24948 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -152,12 +152,12 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, const SelectionVector* selection_vector, const ArrayDataVector& output_vector) const { DCHECK_GT(record_batch.num_rows(), 0); - std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; + //std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector); 
DCHECK_GT(eval_batch->GetNumBuffers(), 0); - std::cout << "LR LLVMGenerator::Execute 2" << std::endl; + //std::cout << "LR LLVMGenerator::Execute 2" << std::endl; auto mode = SelectionVector::MODE_NONE; if (selection_vector != nullptr) { mode = selection_vector->GetMode(); @@ -167,7 +167,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, selection_vector_mode_, " received vector with mode ", mode); } - std::cout << "LR LLVMGenerator::Execute 3" << std::endl; + // std::cout << "LR LLVMGenerator::Execute 3" << std::endl; for (auto& compiled_expr : compiled_exprs_) { // generate data/offset vectors. const uint8_t* selection_buffer = nullptr; @@ -177,7 +177,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, num_output_rows = selection_vector->GetNumSlots(); } - std::cout << "LR LLVMGenerator::Execute A1" << std::endl; + //std::cout << "LR LLVMGenerator::Execute A1" << std::endl; EvalFunc jit_function = compiled_expr->GetJITFunction(mode); jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(), eval_batch->GetLocalBitMapArray(), annotator_.GetHolderPointersArray(), @@ -189,7 +189,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, eval_batch->GetExecutionContext()->has_error(), Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); - std::cout << "LR LLVMGenerator::Execute A2" << std::endl; + // std::cout << "LR LLVMGenerator::Execute A2" << std::endl; // generate validity vectors. 
ComputeBitMapsForExpr(*compiled_expr, selection_vector, eval_batch.get()); } @@ -209,7 +209,7 @@ llvm::Value* LLVMGenerator::GetValidityReference(llvm::Value* arg_addrs, int idx FieldPtr field) { const std::string& name = field->name(); llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); - std::cout << "LR LLVMGenerator::GetValidityReference name=" << name << " idx=" << idx << std::endl; + // std::cout << "LR LLVMGenerator::GetValidityReference name=" << name << " idx=" << idx << std::endl; return ir_builder()->CreateIntToPtr(load, types()->i64_ptr_type(), name + "_varray"); } @@ -218,7 +218,7 @@ llvm::Value* LLVMGenerator::GetDataBufferPtrReference(llvm::Value* arg_addrs, in FieldPtr field) { const std::string& name = field->name(); llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); - std::cout << "LR LLVMGenerator::GetDataBufferPtrReference name=" << name << " idx=" << idx << std::endl; + // std::cout << "LR LLVMGenerator::GetDataBufferPtrReference name=" << name << " idx=" << idx << std::endl; return ir_builder()->CreateIntToPtr(load, types()->i8_ptr_type(), name + "_buf_ptr"); } @@ -423,14 +423,14 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, } // The visitor can add code to both the entry/loop blocks. 
- std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; + //std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; Visitor visitor(this, fn, loop_entry, arg_addrs, arg_local_bitmaps, arg_holder_ptrs, - slice_offsets, arg_context_ptr, position_var, arg_addr_offsets, validity_index_var); + slice_offsets, arg_context_ptr, position_var, validity_index_var); value_expr->Accept(visitor); LValuePtr output_value = visitor.result(); //std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; - std::cout << "LR output_value from visitor is " << output_value->to_string() << std::endl; + //std::cout << "LR output_value from visitor is " << output_value->to_string() << std::endl; // The "current" block may have changed due to code generation in the visitor. llvm::BasicBlock* loop_body_tail = builder->GetInsertBlock(); @@ -486,11 +486,11 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, output_value->length()->print(output2); - std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," - << output_offset_ref << "," << loop_var << - " output_value->data() " << output_value->data() << " output_value->validity() " << output_value->validity() << - " output_value->length() " << output_value->length() << std::endl; - // << output_offset_ref << "," << loop_var << "[[" << str1 << "]] [[" << str2 << "]]" << std::endl; + // std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," + // << output_offset_ref << "," << loop_var << + // " output_value->data() " << output_value->data() << " output_value->validity() " << output_value->validity() << + // " output_value->length() " << output_value->length() << std::endl; + AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), 
{arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length(), output_value->validity()}); @@ -615,11 +615,11 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, uint8_t* dst_bitmap = eval_batch->GetBuffer(out_idx); // Compute the destination bitmap. if (selection_vector == nullptr) { - std::cout << "LR blarg" << std::endl; - std::cout << "LR bitmap array buffer index is " << out_idx << " bitset is " << std::bitset<8>(*dst_bitmap) << std::endl; - std::cout << "LR bitmap array buffer index is " << 0 << " bitset is " << std::bitset<8>(* eval_batch->GetBuffer(0)) << std::endl; - std::cout << "LR bitmap thing getting the validity buffer " << compiled_expr.output()->child_data_validity_idx() << std::endl; - std::cout << "LR Eval buffer has " << eval_batch->GetNumBuffers() << std::endl; + // std::cout << "LR blarg" << std::endl; + //std::cout << "LR bitmap array buffer index is " << out_idx << " bitset is " << std::bitset<8>(*dst_bitmap) << std::endl; + //std::cout << "LR bitmap array buffer index is " << 0 << " bitset is " << std::bitset<8>(* eval_batch->GetBuffer(0)) << std::endl; + //std::cout << "LR bitmap thing getting the validity buffer " << compiled_expr.output()->validity_idx() << std::endl; + //std::cout << "LR Eval buffer has " << eval_batch->GetNumBuffers() << std::endl; // << " bitset is " << std::bitset<8>(* eval_batch->GetBuffer(compiled_expr.output()->child_data_validity_idx() )) << std::endl; accumulator.ComputeResult(dst_bitmap); } else { @@ -635,12 +635,12 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, accumulator.ComputeResult(temp_bitmap); - std::cout << "LR computing bitmap. 
Size is " << bit_map_holder.GetLocalBitMapSize() << std::endl; - for (int i = 0; i < bit_map_holder.GetLocalBitMapSize(); i++) { - uint8_t* arr = bit_map_holder.GetLocalBitMap(i); - std::cout << "LR bitmap array [" << i << "] size is " << bit_map_holder.GetNumRecords() << " bitset is " << std::bitset<8>(*arr) << std::endl; + //std::cout << "LR computing bitmap. Size is " << bit_map_holder.GetLocalBitMapSize() << std::endl; + // for (int i = 0; i < bit_map_holder.GetLocalBitMapSize(); i++) { + // uint8_t* arr = bit_map_holder.GetLocalBitMap(i); + // std::cout << "LR bitmap array [" << i << "] size is " << bit_map_holder.GetNumRecords() << " bitset is " << std::bitset<8>(*arr) << std::endl; - } + //} @@ -711,7 +711,7 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi llvm::Value* arg_local_bitmaps, llvm::Value* arg_holder_ptrs, std::vector slice_offsets, - llvm::Value* arg_context_ptr, llvm::Value* loop_var, llvm::Value* arg_offsets, + llvm::Value* arg_context_ptr, llvm::Value* loop_var, llvm::Value* validity_index_var) : generator_(generator), function_(function), @@ -722,7 +722,6 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi slice_offsets_(slice_offsets), arg_context_ptr_(arg_context_ptr), loop_var_(loop_var), - arg_offsets_(arg_offsets), validity_index_var_(validity_index_var), has_arena_allocs_(false) { ADD_VISITOR_TRACE("Iteration %T", loop_var); @@ -771,21 +770,21 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; - std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; - std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << 
std::endl; + //std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; arrow::Type::type at = arrow::Type::INT32; type = types->IRType(at); //type = types->DataVecType(dex.FieldType()); - std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; // compute list len from the offsets array. llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); - std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << - printType(offsets_slot_index) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << + // printType(offsets_slot_index) << std::endl; // => offset_start = offsets[loop_var] slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); @@ -846,7 +845,7 @@ llvm::Value* updated_validity_index_var = builder->CreateAdd( llvm::raw_string_ostream output3(str3); validity->print(output3); } - std::cout << "LR VectorReadFixedLenValueListDex using validity " << str3 << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex using validity " << str3 << std::endl; // TODO: handle decimal precision and scale @@ -1185,7 +1184,14 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { auto arrow_return_type = dex.func_descriptor()->return_type(); auto arrow_return_type_id = arrow_return_type->id(); - if (arrow_return_type_id == 
arrow::Type::LIST) + bool passLoopVars = false; + for (auto& p : dex.func_descriptor()->params()) { + if (p->id() == arrow::Type::LIST) { + passLoopVars = true; + break; + } + } + if (passLoopVars) { //Pointer to validity bitmap and bit starting index for accessing validity bits in the called function. //llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); @@ -1207,7 +1213,7 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { llvm::raw_string_ostream output3(str32); loop_var_->print(output3); } - std::cout << "LR VectorReadFixedLenValueListDex loopvar=" << str32 << " result()->length()=" << result()->length() << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex loopvar=" << str32 << " result()->length()=" << result()->length() << std::endl; //TODO params.push_back(validity); params.push_back(loop_var_); @@ -1221,10 +1227,6 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { auto valid_var = builder->CreateLoad(types->i64_type(), validity_index_var_, "loaded_var"); params.push_back(valid_var); - - - - params.push_back(arg_offsets_); } @@ -1249,13 +1251,13 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { auto result_valid_i8 = builder->CreateLoad(types->i8_type(), result_valid_ptr); llvm::Value* result_valid = builder->CreateTrunc(result_valid_i8, types->i1_type()); - std::bitset<8> bs(dex.local_bitmap_idx()); - std::cout <<"LR NullableInternal validity from dex.local_bitmap_idx()=" << bs << std::endl; + //std::bitset<8> bs(dex.local_bitmap_idx()); + //std::cout <<"LR NullableInternal validity from dex.local_bitmap_idx()=" << bs << std::endl; - auto result_valid_i8ptr = builder->CreateLoad(types->i8_ptr_type(), result_valid_ptr); + // auto result_valid_i8ptr = builder->CreateLoad(types->i8_ptr_type(), result_valid_ptr); - std::cout << "LR NullableInternal function param validity=" << result_valid_i8ptr << std::endl; + // std::cout 
<< "LR NullableInternal function param validity=" << result_valid_i8ptr << std::endl; // set validity bit in the local bitmap. ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), result_valid); } @@ -1703,7 +1705,7 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& pair) { // generate code for value - std::cout << "LR LLVMGenerator::Visitor::BuildValueAndValidity" << std::endl; + // std::cout << "LR LLVMGenerator::Visitor::BuildValueAndValidity" << std::endl; auto value_expr = pair.value_expr(); value_expr->Accept(*this); auto value = result()->data(); @@ -1723,10 +1725,10 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, auto llvm_return_type = types->DataVecType(arrow_return_type); DecimalIR decimalIR(generator_->engine_.get()); - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; - for (unsigned int i = 0; i < params->size(); i++) { - std::cout << "LR param " << i << printType(params->at(i)) << std::endl; - } + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; + //for (unsigned int i = 0; i < params->size(); i++) { + // std::cout << "LR param " << i << printType(params->at(i)) << std::endl; + //} if (arrow_return_type_id == arrow::Type::DECIMAL) { // For decimal fns, the output precision/scale are passed along as parameters. 
// @@ -1771,17 +1773,17 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, valid_ptr = new llvm::AllocaInst(generator_->types()->i32_ptr_type(), 0, "valid_ptr", entry_block_); - std::cout << "LR allocinst for valid_ptr=" << printType(valid_ptr) << std::endl; + // std::cout << "LR allocinst for valid_ptr=" << printType(valid_ptr) << std::endl; params->push_back(valid_ptr); } //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall params are: " << std::endl; - for (auto p : *params) { + /*for (auto p : *params) { std::string str1; llvm::raw_string_ostream output1(str1); p->print(output1); std::cout << str1 << std::endl; - } + }*/ // Make the function call llvm::IRBuilder<>* builder = ir_builder(); @@ -1797,8 +1799,8 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, (valid_ptr == nullptr) ? nullptr : builder->CreateLoad(generator_->types()->i32_ptr_type(), valid_ptr); - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using validity=" << validity << " ptr=" << valid_ptr << std::endl; - std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using value_len=" << value_len << " ptr=" << result_len_ptr << std::endl; + // std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using validity=" << validity << " ptr=" << valid_ptr << std::endl; + // std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using value_len=" << value_len << " ptr=" << result_len_ptr << std::endl; return std::make_shared(value, value_len, validity); } } @@ -1814,7 +1816,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(arg_context_ptr_); } - std::cout << "LR BuildParams1" << std::endl; + //std::cout << "LR BuildParams1" << std::endl; // if the function has holder, add the holder pointer. 
if (holder_idx != -1) { auto builder = ir_builder(); @@ -1823,7 +1825,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( llvm::BasicBlock* saved_block = builder->GetInsertBlock(); builder->SetInsertPoint(entry_block_); - std::cout << "LR BuildParams1a" << std::endl; + // std::cout << "LR BuildParams1a" << std::endl; auto holder = generator_->LoadVectorAtIndex( arg_holder_ptrs_, generator_->types()->i64_type(), holder_idx, "holder"); @@ -1831,25 +1833,25 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(holder); } - std::cout << "LR BuildParams2" << std::endl; + // std::cout << "LR BuildParams2" << std::endl; // build the function params, along with the validities. for (auto& pair : args) { // build value. DexPtr value_expr = pair->value_expr(); - std::cout << "LR BuildParams2a" << std::endl; + // std::cout << "LR BuildParams2a" << std::endl; value_expr->Accept(*this); - std::cout << "LR BuildParams2b" << std::endl; + // std::cout << "LR BuildParams2b" << std::endl; LValue& result_ref = *result(); // append all the parameters corresponding to this LValue. result_ref.AppendFunctionParams(¶ms); - std::cout << "LR BuildParams2c" << std::endl; + // std::cout << "LR BuildParams2c" << std::endl; // build validity. 
if (with_validity) { llvm::Value* validity_expr = BuildCombinedValidity(pair->validity_exprs()); params.push_back(validity_expr); - std::cout << "LR BuildParams2d adding combined validity" << std::endl; + // std::cout << "LR BuildParams2d adding combined validity" << std::endl; } } @@ -1970,9 +1972,9 @@ std::string LLVMGenerator::ReplaceFormatInTrace(const std::string& in_msg, } void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { - //if (!enable_ir_traces_) { - // return; - //} + if (!enable_ir_traces_) { + return; + } std::string dmsg = "IR_TRACE:: " + msg + "\n"; std::string print_fn_name = "printf"; diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 556f7ce4294b6..594b7253b9e93 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -98,7 +98,7 @@ class GANDIVA_EXPORT LLVMGenerator { llvm::BasicBlock* entry_block, llvm::Value* arg_addrs, llvm::Value* arg_local_bitmaps, llvm::Value* arg_holder_ptrs, std::vector slice_offsets, llvm::Value* arg_context_ptr, - llvm::Value* loop_var, llvm::Value* validity_index, llvm::Value* arg_offsets); + llvm::Value* loop_var, llvm::Value* validity_index); void Visit(const VectorReadValidityDex& dex) override; void Visit(const VectorReadFixedLenValueDex& dex) override; @@ -183,7 +183,6 @@ class GANDIVA_EXPORT LLVMGenerator { std::vector slice_offsets_; llvm::Value* arg_context_ptr_; llvm::Value* loop_var_; - llvm::Value* arg_offsets_; llvm::Value* validity_index_var_; bool has_arena_allocs_; }; diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index dbfe6dc2e18cb..3b2bbd3b0ec96 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -33,7 +33,7 @@ class GANDIVA_EXPORT LValue { explicit LValue(llvm::Value* data, llvm::Value* length = NULLPTR, llvm::Value* validity = NULLPTR) : data_(data), length_(length), validity_(validity) { - std::cout << "LR created LValue " << to_string() << std::endl; + //std::cout << "LR 
created LValue " << to_string() << std::endl; } virtual ~LValue() = default; @@ -45,7 +45,7 @@ class GANDIVA_EXPORT LValue { // Append the params required when passing this as a function parameter. virtual void AppendFunctionParams(std::vector* params) { - std::cout << "LR LValue::AppendFunctionParams" << std::endl; + // std::cout << "LR LValue::AppendFunctionParams" << std::endl; params->push_back(data_); if (length_ != NULLPTR) { params->push_back(length_); @@ -120,7 +120,7 @@ class GANDIVA_EXPORT ListLValue : public LValue { llvm::Value* offsets_length() { return offsets_length_; } void AppendFunctionParams(std::vector* params) override { - std::cout << "LR ListLValue::AppendFunctionParams" << std::endl; + // std::cout << "LR ListLValue::AppendFunctionParams" << std::endl; LValue::AppendFunctionParams(params); params->push_back(child_offsets_); params->push_back(offsets_length_); diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 1ebf1e9d8d7a6..b0d5331a3ee48 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -394,7 +394,7 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, array_data = arrow::ArrayData::Make(array_data->type, array_data->length, array_data->buffers, {new_child_data}, array_data->null_count, array_data->offset); - std::cout << "LR Making array data length " << array_data->length << std::endl; + // std::cout << "LR Making array data length " << array_data->length << std::endl; } output->push_back(arrow::MakeArray(array_data)); @@ -473,7 +473,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, auto internal_type = type->field(0)->type(); ArrayDataPtr child_data; if (arrow::is_primitive(internal_type->id())) { - std::cout << "LR Projector::AllocArrayData List 1" << std::endl; + //std::cout << "LR Projector::AllocArrayData List 1" << std::endl; child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, {std::move(data_valid_buffer), 
std::move(buffers[2])}, 0); } diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 9a40e67e61875..bb8feb0abb18a 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -91,6 +91,7 @@ static jfieldID vector_expander_ret_address_; static jfieldID vector_expander_ret_capacity_; static jfieldID list_expander_ret_address_; static jfieldID list_expander_valid_address_; +static jfieldID list_expander_outer_valid_address_; static jfieldID list_expander_ret_capacity_; static jfieldID list_expander_offset_ret_address_; static jfieldID list_expander_offset_ret_capacity_; @@ -166,7 +167,9 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { list_expander_offset_ret_capacity_ = env->GetFieldID(list_expander_ret_class_, "offsetcapacity", "J"); list_expander_valid_address_ = - env->GetFieldID(list_expander_ret_class_, "validityddress", "J"); + env->GetFieldID(list_expander_ret_class_, "validityaddress", "J"); + list_expander_outer_valid_address_ = + env->GetFieldID(list_expander_ret_class_, "outervalidityaddress", "J"); jclass local_cache_class = env->FindClass("org/apache/arrow/gandiva/evaluator/JavaSecondaryCacheInterface"); @@ -643,7 +646,7 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto validity = std::shared_ptr( new arrow::Buffer(reinterpret_cast(validity_addr), validity_size)); buffers.push_back(validity); - std::cout << "LR make_record_batch_with_buf_addrs adding validity_addr buffer=" << validity_addr << " idx=" << buf_idx - 1 << std::endl; + //std::cout << "LR make_record_batch_with_buf_addrs adding validity_addr buffer=" << validity_addr << " idx=" << buf_idx - 1 << std::endl; if (buf_idx >= in_bufs_len) { return Status::Invalid("insufficient number of in_buf_addrs"); @@ -653,7 +656,7 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto data = std::shared_ptr( new arrow::Buffer(reinterpret_cast(value_addr), value_size)); 
buffers.push_back(data); - std::cout << "LR make_record_batch_with_buf_addrs adding value_addr buffer=" << value_addr << " idx=" << buf_idx - 1 << std::endl; + // std::cout << "LR make_record_batch_with_buf_addrs adding value_addr buffer=" << value_addr << " idx=" << buf_idx - 1 << std::endl; if (arrow::is_binary_like(field->type()->id())) { if (buf_idx >= in_bufs_len) { @@ -666,7 +669,7 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto offsets = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); - std::cout << "LR make_record_batch_with_buf_addrs adding offsets_addr buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; + // std::cout << "LR make_record_batch_with_buf_addrs adding offsets_addr buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; } ////////// @@ -687,12 +690,12 @@ auto type_id = type->id(); auto offsets = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); - std::cout << "LR make_record_batch_with_buf_addrs 2a adding offsets_addr buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; - std::cout << "LR bits are "; - for (int i = 0; i < 15; i++) { - std::cout << arrow::bit_util::GetBit(reinterpret_cast(offsets_addr), i) << ","; - } - std::cout << std::endl; + // std::cout << "LR make_record_batch_with_buf_addrs 2a adding offsets_addr buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; + // std::cout << "LR bits are "; + // for (int i = 0; i < 15; i++) { + // std::cout << arrow::bit_util::GetBit(reinterpret_cast(offsets_addr), i) << ","; + // } + // std::cout << std::endl; if (arrow::is_binary_like(type->field(0)->type()->id())) { // child offsets length is internal data length + 1 @@ -704,7 +707,7 @@ auto type_id = type->id(); auto child_offsets_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); 
buffers.push_back(std::move(child_offsets_buffer)); - std::cout << "LR make_record_batch_with_buf_addrs 2b adding child_offsets_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; + // std::cout << "LR make_record_batch_with_buf_addrs 2b adding child_offsets_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; } } @@ -714,19 +717,19 @@ auto type_id = type->id(); jlong offsets_addr = in_buf_addrs[buf_idx++]; jlong offsets_size = in_buf_sizes[sz_idx++]; auto data_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); - std::cout << "LR make_record_batch_with_buf_addrs 3 adding data_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; + // std::cout << "LR make_record_batch_with_buf_addrs 3 adding data_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; //std::cout << "LR New ArrayData List" << std::endl; auto internal_type = type->field(0)->type(); std::shared_ptr child_data; if (arrow::is_primitive(internal_type->id())) { - std::cout << "LR New ArrayData List creating child data" << std::endl; - for (int i = 0; i < buffers.size(); i++) { - std::cout << "buffer for child data " << i << "=" << *buffers[i]->data() << std::endl; - } + // std::cout << "LR New ArrayData List creating child data" << std::endl; + // for (int i = 0; i < buffers.size(); i++) { + // std::cout << "buffer for child data " << i << "=" << *buffers[i]->data() << std::endl; + // } child_data = arrow::ArrayData::Make(internal_type, 0, {std::move(buffers[2]), std::move(data_buffer)}); - std::cout << "child_data is =" << child_data->buffers[0] << " 1=" << child_data->buffers[1] << std::endl; + // std::cout << "child_data is =" << child_data->buffers[0] << " 1=" << child_data->buffers[1] << std::endl; } if (arrow::is_binary_like(internal_type->id())) { //LR TODO need this for strings I think. 
@@ -966,6 +969,7 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { jlong offset_ret_address = env_->GetLongField(ret, list_expander_offset_ret_address_); jlong offset_ret_capacity = env_->GetLongField(ret, list_expander_offset_ret_capacity_); jlong valid_address = env_->GetLongField(ret, list_expander_valid_address_); + jlong outer_valid_address = env_->GetLongField(ret, list_expander_outer_valid_address_); std::cout << "Buffer expand: New capacity is " << new_capacity << " vector id " << vector_idx_ << " expander method " << method_ << @@ -980,6 +984,7 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { offsetCapacity = offset_ret_capacity; std::cout << "LR Setting buffer validityBuffer to " << validityBuffer << std::endl; validityBuffer = reinterpret_cast(valid_address); + outerValidityBuffer = reinterpret_cast(outer_valid_address); } else { jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); @@ -1066,15 +1071,15 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( << in_batch->ToString() << " there are " << out_bufs_len << " buffers " << std::endl;*/ - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " - << " there are " << out_bufs_len << " buffers " - << std::endl; - for (int i = 0; i < out_bufs_len; i++) { - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " - << " buffer " << i - << "length " << out_sizes[i] - << std::endl; - } + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + //<< " there are " << out_bufs_len << " buffers " + //<< std::endl; + //for (int i = 0; i < out_bufs_len; i++) { + // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + // << " buffer " << i + // << "length " << out_sizes[i] + // << std::endl; + // } std::shared_ptr 
selection_vector; auto selection_buffer = std::make_shared( @@ -1111,14 +1116,14 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( for (FieldPtr field : ret_types) { std::vector> buffers; - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer idx=" << buf_idx << std::endl; + // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* validity_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong bitmap_sz = out_sizes[sz_idx++]; buffers.push_back(std::make_shared(validity_buf, bitmap_sz)); if (arrow::is_binary_like(field->type()->id())) { - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding bufferbuffer idx=" << buf_idx << std::endl; + // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding bufferbuffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* offsets_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong offsets_sz = out_sizes[sz_idx++]; @@ -1137,13 +1142,13 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; + // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; buffers.push_back(std::make_shared( env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else if (field->type()->id() == arrow::Type::LIST) { - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding list offset buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; - 
std::cout << " size=" << out_sizes[sz_idx - 1] << " outsize index=" << sz_idx - 1 << " address " << out_bufs[buf_idx - 1] - << " output_vector_idx=" << output_vector_idx << std::endl; + // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding list offset buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; + // std::cout << " size=" << out_sizes[sz_idx - 1] << " outsize index=" << sz_idx - 1 << " address " << out_bufs[buf_idx - 1] + // << " output_vector_idx=" << output_vector_idx << std::endl; buffers.push_back(std::make_shared( env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else { @@ -1164,19 +1169,20 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //LR TODO the two buffers... + //LR TODO maybe should all be mutable buffers except for the data buffer? data_sz = out_sizes[sz_idx++]; - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding child nbuffer " << buf_idx - << " size=" << data_sz << std::endl; + // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding child nbuffer " << buf_idx + // << " size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( env, jListExpander, listvector_expander_method_, output_vector_idx, child_offset_buf, data_sz)); - std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding child buffer " << buf_idx - << " size=" << out_sizes[sz_idx] << " outsize index=" << sz_idx << " address " << out_bufs[buf_idx] - << " output_vector_idx=" << output_vector_idx << std::endl; + // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding child buffer " << buf_idx + // << " size=" << out_sizes[sz_idx] << " outsize index=" << sz_idx << " address " 
<< out_bufs[buf_idx] + // << " output_vector_idx=" << output_vector_idx << std::endl; data_sz = out_sizes[sz_idx++]; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); @@ -1186,6 +1192,8 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( outBufJava->offsetBuffer = reinterpret_cast(out_bufs[1]); outBufJava->offsetCapacity = out_sizes[1]; outBufJava->validityBuffer = reinterpret_cast(out_bufs[2]); + //LR TODO ? + outBufJava->outerValidityBuffer = reinterpret_cast(out_bufs[0]); child_buffers.push_back(outBufJava); std::shared_ptr dt2 = std::make_shared(); @@ -1279,6 +1287,9 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( out_bufs[2] = (jlong) outBufJava->validityBuffer; out_sizes[2] = (jlong) outBufJava->offsetCapacity; + out_bufs[0] = (jlong) outBufJava->outerValidityBuffer; + out_sizes[0] = (jlong) outBufJava->offsetCapacity; + env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java index fc96846842a3e..a4e30127242ba 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -17,7 +17,6 @@ package org.apache.arrow.gandiva.evaluator; -import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.vector.complex.ListVector; /** @@ -39,7 +38,8 @@ public static class ExpandResult { public long capacity; public long offsetaddress; public long offsetcapacity; - public long validityddress; + public long validityaddress; + public long outervalidityaddress; /** * fdsfsdfds. 
@@ -49,12 +49,13 @@ public static class ExpandResult { * @param offsetcap dfsfs * */ - public ExpandResult(long address, long capacity, long offsetad, long offsetcap, long validAdd) { + public ExpandResult(long address, long capacity, long offsetad, long offsetcap, long outValidAdd, long validAdd) { this.address = address; this.capacity = capacity; this.offsetaddress = offsetad; this.offsetcapacity = offsetcap; - this.validityddress = validAdd; + this.outervalidityaddress = outValidAdd; + this.validityaddress = validAdd; } } @@ -72,26 +73,45 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { throw new IllegalArgumentException("invalid index " + index); } + + //ArrowBuf ab = vectors[index].getValidityBuffer(); + //String s = "Before validity = ["; + //for (int i = 0; i < 20; i++) { + // s += ab.getInt(i) + ","; + //} + //System.out.println(s); + + int valueBufferIndex = 1; int validBufferIndex = 0; ListVector vector = vectors[index]; while (vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity() < toCapacity) { - vector.reAlloc(); + //vector.reAlloc(); + vector.getDataVector().reAlloc(); } System.out.println("LR Expanding ListVector. New capacity=" + vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity()); - System.out.println("LR Expanding ListVector. Offset data is "); - ArrowBuf ab = vector.getOffsetBuffer(); + System.out.println("LR Expanding ListVector. 
new data is "); + + /*ArrowBuf ab2 = vector.getValidityBuffer(); + s = "After validity = ["; + for (int i = 0; i < 20; i++) { + s += ab2.getInt(i) + ","; + } + System.out.println(s);*/ + /*ArrowBuf ab = vector.getOffsetBuffer(); String s = "offsetBuffer = ["; for (int i = 0; i < 20; i++) { s += ab.getInt(i) + ","; } System.out.println(s); + */ return new ExpandResult( vector.getDataVector().getFieldBuffers().get(valueBufferIndex).memoryAddress(), vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity(), vector.getOffsetBuffer().memoryAddress(), vector.getOffsetBuffer().capacity(), + vector.getValidityBuffer().memoryAddress(), vector.getDataVector().getFieldBuffers().get(validBufferIndex).memoryAddress()); } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index e57e7e90f9e0b..590ef697b220b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -27,7 +27,6 @@ import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.gandiva.ipc.GandivaTypes.SelectionVectorType; import org.apache.arrow.memory.ArrowBuf; -import org.apache.arrow.memory.ReferenceManager; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; @@ -547,7 +546,7 @@ public void endList() { - + /* String s = ""; List fv = ((ListVector) valueVector).getDataVector().getFieldBuffers(); for (ArrowBuf ab : fv) { @@ -574,20 +573,25 @@ public void endList() { } logger.error("LR Projector.java before updating listvector. 
getOffsetBuffer=" + fvvv.capacity() + " buffer=" + s); - + */ + + + + + - ((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount * 5); + //((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount * 5); ((ListVector) valueVector).setLastSet(selectionVectorRecordCount - 1); - + /* ArrowBuf mabb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); s = "validity? buffer mabb2, outAddrs[2]="; for (int i = 0; i < 20; i++) { s += mabb2.getInt(i) + ","; } System.out.println(s); - + */ /* //Validity then data. ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java index f22ebbd37878f..d3c75413957a1 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java @@ -57,6 +57,7 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { throw new IllegalArgumentException("invalid index " + index); } + System.out.println("LR Expanding VectorExpander."); BaseVariableWidthVector vector = vectors[index]; while (vector.getDataBuffer().capacity() < toCapacity) { vector.reallocDataBuffer(); From a4ee4ae0344f69e3f5bbe4c1ec10d5c884e29a73 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 25 Oct 2023 11:30:19 -0700 Subject: [PATCH 24/46] Cleanup, tested --- cpp/src/gandiva/gdv_function_stubs.cc | 87 ++--------- java/gandiva/src/main/cpp/jni_common.cc | 136 ------------------ .../gandiva/evaluator/ListVectorExpander.java | 2 +- .../arrow/gandiva/evaluator/Projector.java | 15 -- 4 files changed, 13 insertions(+), 227 deletions(-) diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 76465020bb93a..38d61590613c1 100644 --- 
a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -166,7 +166,7 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, TYPE* entry_buf, int32_t entry_len, int32_t** valid_ptr) { \ auto buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ - std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; \ + std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; \ auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ if (!status.ok()) { \ gandiva::ExecutionContext* context = \ @@ -174,81 +174,18 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, context->set_error_msg(status.message().c_str()); \ return -1; \ } \ - std::cout << "LR gdv_fn_populate_list_ 2 valid_ptr" << valid_ptr << std::endl; \ - std::cout << "LR gdv_fn_populate_list_ " << buffer << " " << offset; \ - std::cout << " " << entry_len << " " << SCALE << "]]" << std::endl; \ memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ - std::cout << "LR gdv_fn_populate_list_ 3 entry_buf=" << entry_buf << "]" << std::endl; \ - std::cout << "LR gdv_fn_populate_list_ 3a entry_len=" << entry_len << " &entry_len=" << &entry_len << "]" << std::endl; \ - std::cout << "LR gdv_fn_populate_list_ 4 buffer->validityBuffer=" << reinterpret_cast(buffer->validityBuffer) << "]" << std::endl; \ - std::cout << "LR gdv_fn_populate_list_ 5 valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << std::endl; \ - std::bitset<8> bs((unsigned long)(*valid_ptr)); \ - std::cout << "LR bitset of valid ptr is " << bs << std::endl; \ - offsets = reinterpret_cast(buffer->offsetBuffer); \ - offsets[slot] = offset / SCALE; \ - offsets[slot + 1] = offset / SCALE + entry_len; \ - std::cout << "LR gdv_fn_populate_list_ Done" << std::endl; \ - return 0; \ - } - -// int32_t vv = 5; - //memcpy(buffer->validityBuffer + slot, *vv, 1); -//memcpy(buffer->validityBuffer + 
slot, bs.to_ulong(), 1); - //buffer->offsetBuffer[slot] = offset / SCALE; - //buffer->offsetBuffer[slot + 1] = offset / SCALE + entry_len; - -int32_t gdv_fn_populate_list_int32_t_vector(int64_t context_ptr, int8_t* data_ptr, - int32_t* offsets, int64_t slot, - int32_t* entry_buf, int32_t entry_len, int32_t** valid_ptr) { - int SCALE = 4; - auto buffer = reinterpret_cast(data_ptr); - int32_t offset = static_cast(buffer->size()); - //std::cout << "LR gdv_fn_populate_list_" << slot << std::endl; - auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); - if (!status.ok()) { - gandiva::ExecutionContext* context = - reinterpret_cast(context_ptr); - context->set_error_msg(status.message().c_str()); - return -1; - } - //std::cout << "LR gdv_fn_populate_list_ 2 valid_ptr" << valid_ptr << std::endl; - //std::cout << "LR gdv_fn_populate_list_ " << buffer << " " << offset; - //std::cout << " " << entry_len << " " << SCALE << "]]" << std::endl; - memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); - //std::cout << "LR gdv_fn_populate_list_ 3 entry_buf=" << entry_buf << "]" << std::endl; - //std::cout << "LR gdv_fn_populate_list_ 3a entry_len=" << entry_len << " &entry_len=" << &entry_len << "]" << std::endl; - //std::cout << "LR gdv_fn_populate_list_ 4 buffer->validityBuffer=" << reinterpret_cast(buffer->validityBuffer) << "]" << std::endl; - //int v[6] = {255, 255, 255, 255, 255, 255}; - - int validbitIndex = offset / SCALE; - //std::cout << "LR gdv_fn_populate_list_ 5 valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " validbitIndex=" << validbitIndex << std::endl; - //int newValidSize = validbitIndex + entry_len; - - //LR TODO just copy? 
- for (int i = 0; i < entry_len; i++) { - - arrow::bit_util::SetBitTo(buffer->validityBuffer, validbitIndex + i, arrow::bit_util::GetBit(reinterpret_cast(valid_ptr), i)); - } - - - /*std::bitset existingBits(buffer->validityBuffer); - std::cout << "LR bitset of existingBits " << existingBits << std::endl; - std::bitset<8> bs((unsigned long)(*valid_ptr)); - for (int i = 0; i < entry_len; i++) { - existingBits.set(validbitIndex + i, bs[i]); - } - std::bitset existingBits2(buffer->validityBuffer); - std::cout << "LR bitset of existingBits2 " << existingBits2 << std::endl; - */ - //std::cout << "LR bitset of valid ptr is " << bs << std::endl; - offsets = reinterpret_cast(buffer->offsetBuffer); - offsets[slot] = offset / SCALE; - offsets[slot + 1] = offset / SCALE + entry_len; - //std::cout << "LR gdv_fn_populate_list_ Done" << std::endl; - return 0; - } - -//POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) + int validbitIndex = offset / SCALE; \ + for (int i = 0; i < entry_len; i++) { \ + arrow::bit_util::SetBitTo(buffer->validityBuffer, validbitIndex + i, arrow::bit_util::GetBit(reinterpret_cast(valid_ptr), i)); \ + } \ + offsets = reinterpret_cast(buffer->offsetBuffer); \ + offsets[slot] = offset / SCALE; \ + offsets[slot + 1] = offset / SCALE + entry_len; \ + return 0; \ + }\ + +POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) POPULATE_NUMERIC_LIST_TYPE_VECTOR(int64_t, 8) POPULATE_NUMERIC_LIST_TYPE_VECTOR(float, 4) POPULATE_NUMERIC_LIST_TYPE_VECTOR(double, 8) diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index bb8feb0abb18a..5f5f3fb02d920 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -690,13 +690,6 @@ auto type_id = type->id(); auto offsets = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); - // std::cout << "LR make_record_batch_with_buf_addrs 2a adding offsets_addr buffer=" << offsets_addr << " idx=" << 
buf_idx - 1 << std::endl; - // std::cout << "LR bits are "; - // for (int i = 0; i < 15; i++) { - // std::cout << arrow::bit_util::GetBit(reinterpret_cast(offsets_addr), i) << ","; - // } - // std::cout << std::endl; - if (arrow::is_binary_like(type->field(0)->type()->id())) { // child offsets length is internal data length + 1 // offsets element is int32 @@ -707,29 +700,18 @@ auto type_id = type->id(); auto child_offsets_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(std::move(child_offsets_buffer)); - // std::cout << "LR make_record_batch_with_buf_addrs 2b adding child_offsets_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; } } - - //std::cout << "LR New ArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { jlong offsets_addr = in_buf_addrs[buf_idx++]; jlong offsets_size = in_buf_sizes[sz_idx++]; auto data_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); - // std::cout << "LR make_record_batch_with_buf_addrs 3 adding data_buffer buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; - - //std::cout << "LR New ArrayData List" << std::endl; auto internal_type = type->field(0)->type(); std::shared_ptr child_data; if (arrow::is_primitive(internal_type->id())) { - // std::cout << "LR New ArrayData List creating child data" << std::endl; - // for (int i = 0; i < buffers.size(); i++) { - // std::cout << "buffer for child data " << i << "=" << *buffers[i]->data() << std::endl; - // } child_data = arrow::ArrayData::Make(internal_type, 0, {std::move(buffers[2]), std::move(data_buffer)}); - // std::cout << "child_data is =" << child_data->buffers[0] << " 1=" << child_data->buffers[1] << std::endl; } if (arrow::is_binary_like(internal_type->id())) { //LR TODO need this for strings I think. 
@@ -746,13 +728,6 @@ auto type_id = type->id(); auto array_data = arrow::ArrayData::Make(type, num_rows, std::move(buffers)); columns.push_back(array_data); } - -///////// -//TODO use unique_ptr -//Was -//auto array_data = arrow::ArrayData::Make(field->type(), num_rows, std::move(buffers)); -//columns.push_back(array_data); - } *batch = arrow::RecordBatch::Make(schema, num_rows, columns); return Status::OK(); @@ -949,8 +924,6 @@ class JavaResizableBuffer : public arrow::ResizableBuffer { Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { // callback into java to expand the buffer - - //LR TODO listvector_expander_method_ vector_expander_method_ jobject ret = env_->CallObjectMethod(jexpander_, method_, vector_idx_, new_capacity); if (env_->ExceptionCheck()) { @@ -1167,10 +1140,6 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } - - //LR TODO the two buffers... - //LR TODO maybe should all be mutable buffers except for the data buffer? - data_sz = out_sizes[sz_idx++]; // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding child nbuffer " << buf_idx // << " size=" << data_sz << std::endl; @@ -1192,25 +1161,17 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( outBufJava->offsetBuffer = reinterpret_cast(out_bufs[1]); outBufJava->offsetCapacity = out_sizes[1]; outBufJava->validityBuffer = reinterpret_cast(out_bufs[2]); - //LR TODO ? 
outBufJava->outerValidityBuffer = reinterpret_cast(out_bufs[0]); child_buffers.push_back(outBufJava); std::shared_ptr dt2 = std::make_shared(); auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); - //array_data_child-> - - std::vector> kids; kids.push_back(array_data_child); - //auto array_data = std::make_shared(field->type(), output_row_count); auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers, kids); array_data->child_data = std::move(kids); output.push_back(array_data); ++output_vector_idx; - - //std::cout << "LR jni_common there are " << buffers.size() << " buffers" << std::endl; - } else { auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers); output.push_back(array_data); @@ -1222,104 +1183,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } - //std::cout << "LR jni_common calling evaluate" << std::endl; status = holder->projector()->Evaluate(*in_batch, selection_vector.get(), output); - //LRtest1 - //std::cout << "LR jni_common after evaluating the output size is " << output.size() << std::endl; - arrow::ArraySpan sp(*(output[0])); - //std::cout << "LR jni_common after evaluating the output 0 is " << sp.ToArray()->ToString() << std::endl; - auto array_data = output[0]; - if (array_data->type->id() == arrow::Type::LIST) { - auto child_data = array_data->child_data[0]; - //std::cout << "LR jni_common child array[3] " << - //int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; - //std::cout << "LR jni_common child array[0] " << - //int32_t( (*(array_data->child_data[0])->buffers[1])[0*4]) << std::endl; - //std::cout << "LR jni_common child via data ptr array[0] " << - //int32_t( *(*(array_data->child_data[0])->buffers[1]).data()) << std::endl; - //std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" - // << (array_data->child_data[0])->length << std::endl; - - //LRTest1 
Start - int numRecords = (array_data->child_data[0])->length; - //int numRecords = (array_data->child_data[0])->length * array_data->length; - - //std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" - // << (array_data->child_data[0])->length << " using numRecords=" << numRecords << std::endl; - //std::cout << "LR jni_common out_bufs[3]=" << out_bufs[3] << " after eval=" - // << (jlong)(array_data->child_data[0])->buffers[1]->data() << std::endl; - //LR test1 - out_bufs[3] = (jlong)(array_data->child_data[0])->buffers[1]->data(); - out_sizes[3] = (jlong)(array_data->child_data[0])->buffers[1]->capacity(); - - //Copy the new buffer ptr back to Java. The above two lines don't copy it to java, just to the local array. - //env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); - //env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); - - //array_data.child_data.at(0)->offset) - - //env->ReleaseLongArrayElements(out_buf_addrs, out_bufs, JNI_ABORT); - //memcpy((void*)out_bufs[3], (array_data->child_data[0])->buffers[1]->data(), recordSize); - //out_sizes[3] = recordSize; - //int test[] = {42,21,42,21,42}; - //memcpy((void *)out_bufs[3], test, 20); - - /*out_sizes[2] = numRecords * 20; - int test[numRecords * 20]; - for (int i = 0; i < numRecords; i++) { - test[i] = 0; - } - memcpy((void *)out_bufs[2], test, numRecords*4); - */ - - //LR test1 Havent tried yet. 
- //out_bufs[2] = (jlong)(array_data->child_data[0])->buffers[0]->data(); - //out_sizes[2] = (jlong)(array_data->child_data[0])->buffers[0]->capacity(); - - //out_bufs[1] = (jlong)(array_data->child_data[0])->buffers[0]->data(); - //out_sizes[1] = (jlong)(array_data->child_data[0])->buffers[0]->capacity(); - - //out_bufs[1] = (jlong)(array_data)->buffers[0]->data(); - //out_sizes[1] = (jlong)(array_data)->buffers[0]->capacity(); - out_bufs[1] = (jlong) outBufJava->offsetBuffer; - out_sizes[1] = (jlong) outBufJava->offsetCapacity; - - out_bufs[2] = (jlong) outBufJava->validityBuffer; - out_sizes[2] = (jlong) outBufJava->offsetCapacity; - - out_bufs[0] = (jlong) outBufJava->outerValidityBuffer; - out_sizes[0] = (jlong) outBufJava->offsetCapacity; - - env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); - env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); - - - - - //validity buffer? - //bool valid[] = {true, true, true, true, true}; - //memcpy(&out_bufs[2], valid, 5); - //out_sizes[2] = 5; - - - - - - - - //offset buffer is not needed. 
- //int32_t offsetsBuffer[] = {0}; - //memcpy(&out_bufs[1], offsetsBuffer, 1 * 4); - //out_sizes[1] = 1; - - //std::cout << "LR jni_common after copy parent buff child array[0] " << - //"," << int32_t( (out_bufs[3])) << - //"," << int32_t( (out_bufs[3]+4)) << - //"," << int32_t( (out_bufs[3])+8) << - //"," << int32_t( (out_bufs[3])+12) << std::endl; - //LRTest1 End - } - } while (0); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java index a4e30127242ba..3b2778c7f21a4 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -86,7 +86,7 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { int validBufferIndex = 0; ListVector vector = vectors[index]; while (vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity() < toCapacity) { - //vector.reAlloc(); + //Just realloc the data vector. vector.getDataVector().reAlloc(); } System.out.println("LR Expanding ListVector. 
New capacity=" + diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 590ef697b220b..61e6c8ffacc39 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -391,13 +391,6 @@ private void evaluate(int numRows, List buffers, List buf outSizes[idx++] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().capacity(); } if (valueVector instanceof ListVector) { - - /*((ListVector) valueVector).reAlloc(); - ((ListVector) valueVector).reAlloc(); - ((ListVector) valueVector).reAlloc(); //100 rows - ((ListVector) valueVector).reAlloc(); - ((ListVector) valueVector).reAlloc();*/ - hasVariableWidthColumns = true; resizableListVectors[outColumnIdx] = (ListVector) valueVector; //LR TODO figure out what to use here resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; @@ -421,14 +414,6 @@ private void evaluate(int numRows, List buffers, List buf //vector offset logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); - - - - //This doesnt actually allocate any memory. - //((ListVector) valueVector).setInitialCapacity(1000000); - //while (((ListVector) valueVector).getValueCapacity() < 1000000) { - // ((ListVector) valueVector).reAlloc(); - //} logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); //The realloc avoids dynamic resizing, will have to be fixed later. 
From 316b822c3776d02acdf09b3da07a11ceeab7c355 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 25 Oct 2023 13:37:02 -0700 Subject: [PATCH 25/46] Cleanup and test --- cpp/src/gandiva/annotator.cc | 58 +--- cpp/src/gandiva/array_ops.cc | 89 +---- cpp/src/gandiva/bitmap_accumulator.h | 2 - cpp/src/gandiva/expr_decomposer.cc | 10 - cpp/src/gandiva/field_descriptor.h | 1 - cpp/src/gandiva/function_registry.cc | 3 - cpp/src/gandiva/gdv_function_stubs.cc | 1 - cpp/src/gandiva/llvm_generator.cc | 279 +-------------- cpp/src/gandiva/llvm_types.h | 2 +- cpp/src/gandiva/lvalue.h | 4 - cpp/src/gandiva/projector.cc | 73 +--- cpp/src/gandiva/tree_expr_builder.cc | 5 - .../main/cpp/expression_registry_helper.cc | 8 +- java/gandiva/src/main/cpp/jni_common.cc | 64 +--- .../gandiva/evaluator/ListVectorExpander.java | 63 +--- .../arrow/gandiva/evaluator/Projector.java | 323 +----------------- .../gandiva/evaluator/VectorExpander.java | 1 - 17 files changed, 35 insertions(+), 951 deletions(-) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index dbc1cc50babaf..7fc8ab94d3c05 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -53,7 +53,6 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { } if (field->type()->id() == arrow::Type::LIST) { - //std::cout << "LR Annotator::MakeDesc 1" << std::endl; offsets_idx = buffer_count_++; if (arrow::is_binary_like(field->type()->field(0)->type()->id())) { child_offsets_idx = buffer_count_++; @@ -64,10 +63,7 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { data_buffer_ptr_idx = buffer_count_++; } int child_valid_buffer_ptr_idx = FieldDescriptor::kInvalidIdx; - //if (is_output) { - child_valid_buffer_ptr_idx = buffer_count_++; - //std::cout << "LR Annotator::MakeDesc 2 child_valid_buffer_ptr_idx=" << child_valid_buffer_ptr_idx << std::endl; - //} + child_valid_buffer_ptr_idx = buffer_count_++; return std::make_shared(field, data_idx, 
validity_idx, offsets_idx, data_buffer_ptr_idx, child_offsets_idx, child_valid_buffer_ptr_idx); } @@ -86,56 +82,45 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // The validity buffer is optional. Use nullptr if it does not have one. if (array_data.buffers[buffer_idx]) { uint8_t* validity_buf = const_cast(array_data.buffers[buffer_idx]->data()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -6 " << &validity_buf << std::endl; eval_batch->SetBuffer(desc.validity_idx(), validity_buf, array_data.offset); } else { - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -5 null " << std::endl; eval_batch->SetBuffer(desc.validity_idx(), nullptr, array_data.offset); } ++buffer_idx; if (desc.HasOffsetsIdx()) { uint8_t* offsets_buf = const_cast(array_data.buffers[buffer_idx]->data()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -4 " << &offsets_buf << " using idx=" << buffer_idx << std::endl; eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset); if (desc.HasChildOffsetsIdx()) { - //std::cout << "LR Annotator::PrepareBuffersForField 1 for field " << desc.Name() << " type is " << array_data.type->id() << std::endl; if (is_output) { // if list field is output field, we should put buffer pointer into eval batch // for resizing uint8_t* child_offsets_buf = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3a " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); uint8_t* child_valid_buf = reinterpret_cast( array_data.child_data.at(0)->buffers[0].get()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3b " << &child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, 
array_data.child_data.at(0)->offset); } else { - //std::cout << "LR Annotator::PrepareBuffersForField 2" << std::endl; // if list field is input field, just put buffer data into eval batch uint8_t* child_offsets_buf = const_cast( array_data.child_data.at(0)->buffers[buffer_idx]->data()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2a " << &child_offsets_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, array_data.child_data.at(0)->offset); uint8_t* child_valid_buf = const_cast( array_data.child_data.at(0)->buffers[0]->data()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2b " << &child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_valid_buf, array_data.child_data.at(0)->offset); } } if (array_data.type->id() != arrow::Type::LIST || arrow::is_binary_like(array_data.type->field(0)->type()->id())) { - //std::cout << "LR Annotator::PrepareBuffersForField 3" << std::endl; - // primitive type list data buffer index is 1 // binary like type list data buffer index is 2 ++buffer_idx; @@ -143,39 +128,15 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, } if (array_data.type->id() != arrow::Type::LIST) { - //std::cout << "LR Annotator::PrepareBuffersForField 4" << std::endl; - - //std::cout << "LR Annotator::PrepareBuffersForField 4 buffer_idx " << buffer_idx << std::endl; uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); - //std::cout << "LR Annotator::PrepareBuffersForField 4a" << std::endl; - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -1 " << &data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); - //std::cout << "LR Annotator::PrepareBuffersForField 4b" << std::endl; } else { - //std::cout << "LR Annotator::PrepareBuffersForField 5 " << desc.Name() << " buffer_idx " << buffer_idx << std::endl; - //std::cout << "LR 
Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; - - //std::cout << "LR array_data.child_data.at(0)->buffers[0]=" << array_data.child_data.at(0)->buffers[0] << std::endl; - //uint8_t* data_valid_buf = - // const_cast(array_data.child_data.at(0)->buffers[0]->data()); - //std::cout << "LR Annotator::PrepareBuffersForField setting offset eval data_valid_buf idx=" << 0 << " data_valid_buf=" << &data_valid_buf << std::endl; - //eval_batch->SetBuffer(desc.child_data_validity_idx(), data_valid_buf, array_data.child_data.at(0)->offset); - - uint8_t* data_buf = const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); - //std::cout << "LR Annotator::PrepareBuffersForField setting data buffer desc.data_idx()=" << desc.data_idx() << " idx=" << buffer_idx << " data=" << data_buf << std::endl; eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); - //std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; - - - //std::cout << "LR array_data.child_data.at(0)->buffers[0]->data() is " << array_data.child_data.at(0)->buffers[0] << std::endl; if (array_data.child_data.at(0)->buffers[0] ) { uint8_t* child_valid_buf = const_cast( array_data.child_data.at(0)->buffers[0]->data()); - //desc.set_child_data_validity_idx(4); - // std::cout << "LR Annotator::PrepareBuffersForField setting child valid buffer -5b " << - //" name=" << desc.Name() << " idx=" << desc.child_data_validity_idx() << " child_data_buf=" << *child_valid_buf << std::endl; eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, 0); } @@ -187,16 +148,11 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, if (array_data.type->id() != arrow::Type::LIST) { uint8_t* data_buf_ptr = reinterpret_cast(array_data.buffers[buffer_idx].get()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 1 " << &data_buf_ptr << std::endl; 
eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); } else { - //std::cout << "LR Annotator::PrepareBuffersForField is_output index " << desc.data_buffer_ptr_idx() << std::endl; - // list data buffer is in child data buffer uint8_t* data_buf_ptr = reinterpret_cast( array_data.child_data.at(0)->buffers[buffer_idx].get()); - //std::cout << "LR Annotator::PrepareBuffersForField setting eval data buffer " << buffer_idx << " data=" << &data_buf_ptr << std::endl; - eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.child_data.at(0)->offset); } @@ -209,7 +165,6 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, EvalBatchPtr eval_batch = std::make_shared( record_batch.num_rows(), buffer_count_, local_bitmap_count_); - //std::cout << "LR PrepareEvalBatch 1" << std::endl; // Fill in the entries for the input fields. for (int i = 0; i < record_batch.num_columns(); ++i) { const std::string& name = record_batch.column_name(i); @@ -218,28 +173,17 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, // skip columns not involved in the expression. continue; } - - /*std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch schema " << record_batch.schema()->ToString() - << " num rows " << record_batch.num_rows() - << " num columns " << record_batch.num_columns() - << " data size " << record_batch.column_data().size() - << " col 1 " << record_batch.column(0)->ToString() - << std::endl;*/ - - //std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch data " << record_batch.ToString() << std::endl; PrepareBuffersForField(*(found->second), *(record_batch.column_data(i)), eval_batch.get(), false /*is_output*/); } // Fill in the entries for the output fields. 
- //std::cout << "LR PrepareEvalBatch preparing output fields" << std::endl; int idx = 0; for (auto& arraydata : out_vector) { const FieldDescriptorPtr& desc = out_descs_.at(idx); PrepareBuffersForField(*desc, *arraydata, eval_batch.get(), true /*is_output*/); ++idx; } - //std::cout << "LR PrepareEvalBatch 2" << std::endl; return eval_batch; } diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index d83cd0a8986e6..b9ff34b4a8a4f 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -74,18 +74,15 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, return false; } +//LR TODO int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int32_t* out_len) { - //std::cout << "LR array_int32_make_array offset data=" << contains_data << std::endl; int integers[] = { contains_data, 21, 3, contains_data, 5 }; *out_len = 5;// * 4; //length is number of items, but buffers must account for byte size. uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); memcpy(ret, integers, *out_len * 4); - //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; - - //return reinterpret_cast(ret); return reinterpret_cast(ret); } @@ -94,7 +91,6 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int64_t contains_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { - //std::cout << "LR array_int64_contains_int64 offset length=" << entry_offsets_len << std::endl; if (!combined_row_validity) { *valid_row = false; return false; @@ -108,8 +104,7 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { continue; } - int64_t entry_len = *(entry_buf + (i*2)); //LR TODO sizeof int64? 
- //std::cout << "LR checking value " << entry_len << " against target " << contains_data << std::endl; + int64_t entry_len = *(entry_buf + (i*2)); if (entry_len == contains_data) { return true; } @@ -120,85 +115,29 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, int32_t remove_data, bool entry_validWhat, - /*const int32_t* array_valid_bits,*/ int64_t loop_var, int64_t validity_index_var, + int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr) { - //std::cout << "LR array_int32_remove data=" << remove_data - // << " entry_offsets_len " << entry_offsets_len << std::endl; - //std::cout << "LR array_int32_remove " << loop_var << std::endl; std::vector newInts; - - - /*std::bitset<8> validBits(*entry_valid); //LR TODO handle size. - std::bitset<8> outputValidBits; - std::cout << "LR Entry bitset is " << validBits << std::endl; - for (int i = 0; i < entry_offsets_len; i++) { - //std::cout << "LR going to check " << entry_buf + i << std::endl; - int32_t entry_item = *(entry_buf + (i * 1)); - //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; - if (entry_item == remove_data) { - continue; - } else if (!validBits[i]) { - outputValidBits[i] = 0; - newInts.push_back(0); //This will be marked invalid, so data doesn't matter. - } else { - outputValidBits[i] = 1; - //Note the vector can have n elements, while validbits might have n+1. 
- newInts.push_back(entry_item); - } - }*/ - - //std::cout << "LR entry_buf=" << entry_buf << " *entry_buf=" << entry_buf << std::endl; - //std::cout << "LR notSureWhatThisIs=" << notSureWhatThisIs << " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; - std::cout << "LR combined_row_validity=" << combined_row_validity << " entry_validWhat=" << entry_validWhat << " validity_index_var=" << validity_index_var << - " entry_validity=" << entry_validity << std::endl; - //<< " *notSureWhatThisIs=" << *notSureWhatThisIs << std::endl; //LR TODO not sure what entry_validWhat is. //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - //std::bitset<15> maybeInputBits (*notSureWhatThisIsAdjusted); - //std::cout << "LR maybeInputBits=" << maybeInputBits << std::endl; - - int64_t validityBitIndex = 0; - //for (int i = 0; i < loop_var; i++) { - // validityBitIndex += *(offsets + i); - // std::cout << "LR i=" << i << " adding offset " << *(offsets + i) << " offset is " << offsets << std::endl; - //} - //The validity index already has the current row length added to it, so decrement. -validityBitIndex = validity_index_var - entry_len; - //TODO temp until the buffer is worked out. 
- //validityBitIndex -= (loop_var); - - - //std::cout << "Using validityBitIndex=" << validityBitIndex << std::endl; - - - + validityBitIndex = validity_index_var - entry_len; entry_validWhat = true; - //std::bitset<10> outputValidBits; - std::vector outValid; for (int i = 0; i < entry_len; i++) { - //std::cout << "LR going to check " << entry_buf + i << std::endl; int32_t entry_item = *(entry_buf + (i * 1)); - //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; if (entry_item == remove_data) { - //outValid.push_back(false); - //newInts.push_back(42); - //entry_validWhat = false; - //TODO temp until buffer is worked out } else if (!arrow::bit_util::GetBit(reinterpret_cast(array_valid_bits), validityBitIndex + i)) { + //Do not add the item to remove. } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { outValid.push_back(false); newInts.push_back(0); - //outputValidBits[i] = 0; } else { outValid.push_back(true); - //Note the vector can have n elements, while validbits might have n+1. newInts.push_back(entry_item); - //outputValidBits[i] = 1; } } @@ -215,31 +154,13 @@ validityBitIndex = validity_index_var - entry_len; //length is number of items, but buffers must account for byte size. uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); memcpy(ret, newInts.data(), outBufferLength); - //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; - - *valid_row = true; - - - //unsigned long ll = outputValidBits.to_ulong(); if (!combined_row_validity) { - //ll = 0; *out_len = 0; *valid_row = false; //this one is what works for the top level validity. entry_validWhat = false; } - //LR no need, set along the way. 
memcpy(validRet, &ll, 1); - //*valid_len = 1; - //std::cout << "LR valid_buf is " << valid_buf << std::endl; - //std::cout << "LR outputValidBits is " << outputValidBits << std::endl; - //valid_buf = reinterpret_cast(validRet); - *valid_ptr = reinterpret_cast(validRet); - //std::cout << "LR setting valid_ptr=" << valid_ptr << " *valid_ptr=" << *valid_ptr << " **valid_ptr=" << **valid_ptr << " valid_ptr bitset data is " << std::bitset<8>(**valid_ptr) - // << " return value is " << reinterpret_cast(ret) << std::endl; - - - //return reinterpret_cast(ret); return reinterpret_cast(ret); } diff --git a/cpp/src/gandiva/bitmap_accumulator.h b/cpp/src/gandiva/bitmap_accumulator.h index f67b58847ce70..52d73696c788c 100644 --- a/cpp/src/gandiva/bitmap_accumulator.h +++ b/cpp/src/gandiva/bitmap_accumulator.h @@ -37,11 +37,9 @@ class GANDIVA_EXPORT BitMapAccumulator : public DexDefaultVisitor { void Visit(const VectorReadValidityDex& dex) { int idx = dex.ValidityIdx(); - //std::cout << "LR BitMapAccumulator visiting " << idx << std::endl; auto bitmap = eval_batch_.GetBuffer(idx); // The bitmap could be null. Ignore it in this case. 
if (bitmap != NULLPTR) { - //std::cout << "LR BitMapAccumulator is not null " << bitmap << std::endl; src_maps_.push_back(bitmap); src_map_offsets_.push_back(eval_batch_.GetBufferOffset(idx)); } diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index f35b3bc5cc5e8..e14fcbc8952cb 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -38,28 +38,20 @@ namespace gandiva { Status ExprDecomposer::Visit(const FieldNode& node) { auto desc = annotator_.CheckAndAddInputFieldDescriptor(node.field()); - //std::cout << "LR ExprDecomposer" << std::endl; DexPtr validity_dex = std::make_shared(desc); DexPtr value_dex; if (desc->HasChildOffsetsIdx()) { - //std::cout << "LR ExprDecomposer 1" << std::endl; // handle list type value_dex = std::make_shared(desc); } else if (desc->HasOffsetsIdx()) { - //std::cout << "LR ExprDecomposer 2" << std::endl; if (desc->field()->type()->id() == arrow::Type::LIST) { // handle list type - //std::cout << "LR ExprDecomposer 3" << std::endl; auto p = std::make_shared(desc); value_dex = p; - //int v = p->DataIdx(); - //std::cout << "LR primitive list type " v << " " << } else { - //std::cout << "LR ExprDecomposer 4" << std::endl; value_dex = std::make_shared(desc); } } else { - //std::cout << "LR ExprDecomposer 5" << std::endl; value_dex = std::make_shared(desc); } result_ = std::make_shared(validity_dex, value_dex); @@ -126,9 +118,7 @@ Status ExprDecomposer::Visit(const FunctionNode& in_node) { } else { DCHECK(native_function->result_nullable_type() == kResultNullInternal); - //LR TODO Need validity? // Add a local bitmap to track the output validity. - std::cout << "LR Making a nullable function holder with validity." 
<< std::endl; int local_bitmap_idx = annotator_.AddLocalBitMap(); auto validity_dex = std::make_shared(local_bitmap_idx); diff --git a/cpp/src/gandiva/field_descriptor.h b/cpp/src/gandiva/field_descriptor.h index 0df7d4f2f2aaa..dfcf6872d501d 100644 --- a/cpp/src/gandiva/field_descriptor.h +++ b/cpp/src/gandiva/field_descriptor.h @@ -39,7 +39,6 @@ class FieldDescriptor { data_buffer_ptr_idx_(data_buffer_ptr_idx), child_offsets_idx_(child_offsets_idx), child_validity_idx_(child_validity_idx) { - //std::cout << "LR FieldDescriptor=" << Name() << " " << data_idx_ << "," << data_buffer_ptr_idx_ << "," << child_validity_idx_ << std::endl; } /// Index of validity array in the array-of-buffers diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 021100678a08e..616ef8530c02b 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -72,10 +72,7 @@ SignatureMap FunctionRegistry::InitPCMap() { pc_registry_.insert(std::end(pc_registry_), v7.begin(), v7.end()); for (auto& elem : pc_registry_) { - //std::cout << "LR pc_registry_ item " << elem.pc_name() << " first signature name " << elem.signatures()[0].base_name() << std::endl; for (auto& func_signature : elem.signatures()) { - //std::cout << "LR Adding function to map " << func_signature.base_name() << std::endl; - //std::cout << " LR args " << func_signature.param_types map.insert(std::make_pair(&(func_signature), &elem)); } } diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 38d61590613c1..2ca9529fa846b 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -166,7 +166,6 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, TYPE* entry_buf, int32_t entry_len, int32_t** valid_ptr) { \ auto buffer = reinterpret_cast(data_ptr); \ int32_t offset = static_cast(buffer->size()); \ - std::cout << "LR gdv_fn_populate_list_" << slot << 
std::endl; \ auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ if (!status.ok()) { \ gandiva::ExecutionContext* context = \ diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 7fa7073a24948..9d8786e28c5bb 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -92,7 +92,6 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out std::unique_ptr compiled_expr(new CompiledExpr(value_validity, output)); std::string fn_name = "expr_" + std::to_string(idx) + "_" + std::to_string(static_cast(selection_vector_mode_)); - //std::cout << "LR LLVMGenerator::Add " << fn_name << std::endl; if (!cached_) { ARROW_RETURN_NOT_OK(engine_->LoadFunctionIRs()); ARROW_RETURN_NOT_OK(CodeGenExprValue(value_validity->value_expr(), @@ -101,7 +100,6 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out } compiled_expr->SetFunctionName(selection_vector_mode_, fn_name); compiled_exprs_.push_back(std::move(compiled_expr)); - //std::cout << "LR LLVMGenerator::Add Done" << std::endl; return Status::OK(); } @@ -110,18 +108,13 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode mode) { selection_vector_mode_ = mode; - //std::cout << "LR LLVMGenerator::Build " << std::endl; for (auto& expr : exprs) { auto output = annotator_.AddOutputFieldDescriptor(expr->result()); ARROW_RETURN_NOT_OK(Add(expr, output)); } - //std::cout << "LR LLVMGenerator::Build 2" << std::endl; - //Too much logging. needle in haystack? - std::cout << "LR LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; // Compile and inject into the process' memory the generated function. ARROW_RETURN_NOT_OK(engine_->FinalizeModule()); - //std::cout << "LR LLVMGenerator::Build FinalizeModule" << std::endl; // setup the jit functions for each expression. 
for (auto& compiled_expr : compiled_exprs_) { @@ -130,7 +123,6 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode compiled_expr->SetJITFunction(selection_vector_mode_, jit_fn); } - //std::cout << "LR LLVMGenerator::Build Done" << std::endl; return Status::OK(); } @@ -152,12 +144,10 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, const SelectionVector* selection_vector, const ArrayDataVector& output_vector) const { DCHECK_GT(record_batch.num_rows(), 0); - //std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector); DCHECK_GT(eval_batch->GetNumBuffers(), 0); - //std::cout << "LR LLVMGenerator::Execute 2" << std::endl; auto mode = SelectionVector::MODE_NONE; if (selection_vector != nullptr) { mode = selection_vector->GetMode(); @@ -167,7 +157,6 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, selection_vector_mode_, " received vector with mode ", mode); } - // std::cout << "LR LLVMGenerator::Execute 3" << std::endl; for (auto& compiled_expr : compiled_exprs_) { // generate data/offset vectors. const uint8_t* selection_buffer = nullptr; @@ -177,7 +166,6 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, num_output_rows = selection_vector->GetNumSlots(); } - //std::cout << "LR LLVMGenerator::Execute A1" << std::endl; EvalFunc jit_function = compiled_expr->GetJITFunction(mode); jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(), eval_batch->GetLocalBitMapArray(), annotator_.GetHolderPointersArray(), @@ -189,7 +177,6 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, eval_batch->GetExecutionContext()->has_error(), Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); - // std::cout << "LR LLVMGenerator::Execute A2" << std::endl; // generate validity vectors. 
ComputeBitMapsForExpr(*compiled_expr, selection_vector, eval_batch.get()); } @@ -209,7 +196,6 @@ llvm::Value* LLVMGenerator::GetValidityReference(llvm::Value* arg_addrs, int idx FieldPtr field) { const std::string& name = field->name(); llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); - // std::cout << "LR LLVMGenerator::GetValidityReference name=" << name << " idx=" << idx << std::endl; return ir_builder()->CreateIntToPtr(load, types()->i64_ptr_type(), name + "_varray"); } @@ -218,7 +204,6 @@ llvm::Value* LLVMGenerator::GetDataBufferPtrReference(llvm::Value* arg_addrs, in FieldPtr field) { const std::string& name = field->name(); llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); - // std::cout << "LR LLVMGenerator::GetDataBufferPtrReference name=" << name << " idx=" << idx << std::endl; return ir_builder()->CreateIntToPtr(load, types()->i8_ptr_type(), name + "_buf_ptr"); } @@ -314,8 +299,6 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, FieldDescriptorPtr output, int suffix_idx, std::string& fn_name, SelectionVector::Mode selection_vector_mode) { - //std::cout << "LR CodeGenExprValue for output field " << output->Name() - // << " type " << output->Type()->ToString() << " output type id " << output->Type()->id() << std::endl; try { llvm::IRBuilder<>* builder = ir_builder(); // Create fn prototype : @@ -423,15 +406,11 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, } // The visitor can add code to both the entry/loop blocks. 
- //std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; Visitor visitor(this, fn, loop_entry, arg_addrs, arg_local_bitmaps, arg_holder_ptrs, slice_offsets, arg_context_ptr, position_var, validity_index_var); value_expr->Accept(visitor); LValuePtr output_value = visitor.result(); - //std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; - //std::cout << "LR output_value from visitor is " << output_value->to_string() << std::endl; - // The "current" block may have changed due to code generation in the visitor. llvm::BasicBlock* loop_body_tail = builder->GetInsertBlock(); @@ -458,12 +437,10 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length()}); } else if (output_type_id == arrow::Type::STRUCT) { - //std::cout << "LR creating struct type to store the result." 
<< std::endl; auto slot_offset = builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var); builder->CreateStore(output_value->data(), slot_offset); } else if (output_type_id == arrow::Type::LIST) { auto output_list_internal_type = output->Type()->field(0)->type()->id(); - //std::cout << "LR creating list type to store the result with internal type " << output_list_internal_type << std::endl; if (arrow::is_binary_like(output_list_internal_type)) { auto output_list_value = std::dynamic_pointer_cast(output_value); @@ -475,22 +452,6 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, child_output_offset_ref, loop_var, output_list_value->data(), output_list_value->child_offsets(), output_list_value->offsets_length()}); } else if (output_list_internal_type == arrow::Type::INT32) { - - - std::string str1; - llvm::raw_string_ostream output1(str1); - output_value->data()->print(output1); - - std::string str2; - llvm::raw_string_ostream output2(str2); - output_value->length()->print(output2); - - - // std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," - // << output_offset_ref << "," << loop_var << - // " output_value->data() " << output_value->data() << " output_value->validity() " << output_value->validity() << - // " output_value->length() " << output_value->length() << std::endl; - AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length(), output_value->validity()}); @@ -515,13 +476,6 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, return Status::NotImplemented("output type ", output->Type()->ToString(), " not supported"); } - //LR HACK somehow this caused a crash???? 
- //std::cout << "LR saving result " << output->Name() << " value " << - // printType(output_value->data()) << std::endl; - - //ADD_TRACE("saving result 2 " + output->Name() + " value %T", output_value->data()); - //int jello = 0; - //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; if (visitor.has_arena_allocs()) { // Reset allocations to avoid excessive memory usage. Once the result is copied to // the output vector (store instruction above), any memory allocations in this @@ -531,23 +485,19 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, AddFunctionCall("gdv_fn_context_arena_reset", types()->void_type(), reset_args); } - //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // check loop_var loop_var->addIncoming(types()->i64_constant(0), loop_entry); llvm::Value* loop_update = builder->CreateAdd(loop_var, types()->i64_constant(1), "loop_var+1"); loop_var->addIncoming(loop_update, loop_body_tail); - //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; llvm::Value* loop_var_check = builder->CreateICmpSLT(loop_update, arg_nrecords, "loop_var < nrec"); builder->CreateCondBr(loop_var_check, loop_body, loop_exit); - //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // Loop exit builder->SetInsertPoint(loop_exit); builder->CreateRet(types()->i32_constant(0)); - //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; return Status::OK(); } catch (std::exception& e) { std::cout << e.what() << std::endl; @@ -615,12 +565,6 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, uint8_t* dst_bitmap = eval_batch->GetBuffer(out_idx); // Compute the destination bitmap. 
if (selection_vector == nullptr) { - // std::cout << "LR blarg" << std::endl; - //std::cout << "LR bitmap array buffer index is " << out_idx << " bitset is " << std::bitset<8>(*dst_bitmap) << std::endl; - //std::cout << "LR bitmap array buffer index is " << 0 << " bitset is " << std::bitset<8>(* eval_batch->GetBuffer(0)) << std::endl; - //std::cout << "LR bitmap thing getting the validity buffer " << compiled_expr.output()->validity_idx() << std::endl; - //std::cout << "LR Eval buffer has " << eval_batch->GetNumBuffers() << std::endl; - // << " bitset is " << std::bitset<8>(* eval_batch->GetBuffer(compiled_expr.output()->child_data_validity_idx() )) << std::endl; accumulator.ComputeResult(dst_bitmap); } else { /// The output bitmap is an intersection of some input/local bitmaps. However, with a @@ -633,19 +577,6 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, LocalBitMapsHolder bit_map_holder(eval_batch->num_records(), 1); uint8_t* temp_bitmap = bit_map_holder.GetLocalBitMap(0); accumulator.ComputeResult(temp_bitmap); - - - //std::cout << "LR computing bitmap. Size is " << bit_map_holder.GetLocalBitMapSize() << std::endl; - // for (int i = 0; i < bit_map_holder.GetLocalBitMapSize(); i++) { - // uint8_t* arr = bit_map_holder.GetLocalBitMap(i); - // std::cout << "LR bitmap array [" << i << "] size is " << bit_map_holder.GetNumRecords() << " bitset is " << std::bitset<8>(*arr) << std::endl; - - //} - - - - - auto num_out_records = selection_vector->GetNumSlots(); // the memset isn't required, doing it just for valgrind. 
memset(dst_bitmap, 0, arrow::bit_util::BytesForBits(num_out_records)); @@ -683,8 +614,6 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, llvm::raw_string_ostream output2(str2); ret_type->print(output); value->getType()->print(output2); - //std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; - DCHECK(value->getType() == ret_type); } @@ -770,23 +699,14 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - //std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; - //std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; - //std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; arrow::Type::type at = arrow::Type::INT32; type = types->IRType(at); - //type = types->DataVecType(dex.FieldType()); - //std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; // compute list len from the offsets array. 
llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); - //std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << - // printType(offsets_slot_index) << std::endl; - - // => offset_start = offsets[loop_var] slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); @@ -808,15 +728,6 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.DataIdx())); llvm::Value* data_list = builder->CreateGEP(type, slot_ref, slot_index); -//LR-VAR - // auto valid_var = builder->CreateIntCast(list_len, types->i64_type(), true); - //builder->CreateStore(valid_var, validity_index_var_); - - - - - - auto list_len_var = builder->CreateIntCast(list_len, types->i64_type(), true); llvm::Value* vv_end = builder->CreateLoad(generator_->types()->i64_type(),validity_index_var_, "vv_end"); @@ -824,17 +735,6 @@ llvm::Value* updated_validity_index_var = builder->CreateAdd( vv_end, list_len_var, "validity_index_var+offset"); builder->CreateStore(updated_validity_index_var, validity_index_var_); - //builder->CreateStore(updated_validity_index_var, validity_index_var_); - - - - // TODO: handle bool type bitmap - //Validity bitmap. 
- //llvm::Value* b_slot_ref = GetBufferReference(dex.ValidityIdx(), kBufferTypeValidity, dex.Field()); - //llvm::Value* b_slot_index = - // builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx())); - //llvm::Value* validity = generator_->GetPackedValidityBitValue(b_slot_ref, b_slot_index); - llvm::Value* b_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx())); llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); @@ -845,13 +745,6 @@ llvm::Value* updated_validity_index_var = builder->CreateAdd( llvm::raw_string_ostream output3(str3); validity->print(output3); } - //std::cout << "LR VectorReadFixedLenValueListDex using validity " << str3 << std::endl; - - // TODO: handle decimal precision and scale - - //std::cout << "LR VectorReadFixedLenValueListDex slot_ref " << printType(slot_ref) << std::endl; - //std::cout << "LR VectorReadFixedLenValueListDex visit fixed-len data list vector " << dex.FieldName() << - // " length " << printType(list_len) << " data_list " << printType(data_list) << std::endl; ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " length %T", list_len); ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " updated_validity_index_var %T", @@ -914,10 +807,7 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { llvm::Value* slot; auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - //std::cout << "LR dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; - //std::cout << "LR IRType is " << printType(type) << std::endl; - //type = types->DataVecType(dex.FieldType()); - //LR HACK. 
Original was type = types->DataVecType(dex.FieldType()); + arrow::Type::type at = arrow::Type::INT32; type = types->IRType(at); @@ -930,29 +820,24 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { int i = 0; std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => offset_start = offsets[loop_var] - //std::cout << "LR Type is " << printType(type) << std::endl; slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); - //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => offset_end = offsets[loop_var + 1] llvm::Value* offsets_slot_index_next = builder->CreateAdd( offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index_next); llvm::Value* offset_end = builder->CreateLoad(type, slot, "offset_end"); - //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => list_data_length = offset_end - offset_start llvm::Value* list_data_length = builder->CreateSub(offset_end, offset_start, "offsets_len"); - //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // get the child offsets array from the child offsets array, // start from offset 'offset_start' llvm::Value* child_offset_slot_ref = GetBufferReference(dex.ChildOffsetsIdx(), kBufferTypeChildOffsets, dex.Field()); - //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // do not forget slice offset llvm::Value* offset_start_int64 = builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); @@ -963,13 +848,11 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { llvm::Value* child_offset_start = builder->CreateLoad(type, child_offsets, "child_offset_start"); - //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // get 
the data array llvm::Value* data_slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); llvm::Value* data_value = builder->CreateGEP(type, data_slot_ref, child_offset_start); - //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; result_.reset(new ListLValue(data_value, child_offsets, list_data_length)); } @@ -1013,7 +896,6 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { llvm::Value* value = nullptr; llvm::Value* len = nullptr; - //std::cout << "LR LiteralDex type " << dex.type()->id() << std::endl; switch (dex.type()->id()) { case arrow::Type::BOOL: value = types->i1_constant(std::get(dex.holder())); @@ -1054,7 +936,6 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { case arrow::Type::STRING: case arrow::Type::BINARY: { const std::string& str = std::get(dex.holder()); - //std::cout << "LR Literal string " << str << std::endl; value = ir_builder()->CreateGlobalStringPtr(str.c_str()); len = types->i32_constant(static_cast(str.length())); break; @@ -1108,8 +989,6 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { native_function->NeedsContext()); auto arrow_return_type = dex.func_descriptor()->return_type(); - //std::cout << "LR NonNullableFunc 1 result_type " << printType(generator_->types()->DataVecType(arrow_return_type)) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << printType(generator_->types()->IRType(arrow_return_type->id())) << std::endl; - if (native_function->CanReturnErrors()) { // slow path : if a function can return errors, skip invoking the function // unless all of the input args are valid. Otherwise, it can cause spurious errors. @@ -1118,10 +997,6 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { LLVMTypes* types = generator_->types(); auto arrow_type_id = arrow_return_type->id(); auto result_type = types->DataVecType(arrow_return_type); - //Result type array/list is special. 
- //auto result_type = types->IRType(arrow_type_id); - //std::cout << "LR NonNullableFunc 2 result_type " << printType(result_type) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << types->IRType(arrow_type_id) << std::endl; - // Build combined validity of the args. llvm::Value* is_valid = types->true_constant(); for (auto& pair : dex.args()) { @@ -1193,52 +1068,18 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { } if (passLoopVars) { - //Pointer to validity bitmap and bit starting index for accessing validity bits in the called function. - //llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); - //llvm::Value* validity = b_slot_ref; - - //Compute the bit offset. - //int64_t validIndex = 0; - //for (int i = 0; i < loop_var_; i++) { - // validIndex += *(arg_offsets_ + i); - //} - - /*std::string str3 = "validity:"; - if (validity) { - llvm::raw_string_ostream output3(str3); - validity->print(output3); - }*/ std::string str32 = "loopvar:"; if (loop_var_) { llvm::raw_string_ostream output3(str32); loop_var_->print(output3); } - //std::cout << "LR VectorReadFixedLenValueListDex loopvar=" << str32 << " result()->length()=" << result()->length() << std::endl; - //TODO params.push_back(validity); - params.push_back(loop_var_); -//LR-VAR - //llvm::Value* updated_validity_index_var = builder->CreateAdd( - // validity_index_var_, result()->length(), "validity_index_var+offset"); - // check loop_var - //loop_var->addIncoming(types()->i64_constant(0), loop_entry); - - //builder->CreateStore(updated_validity_index_var, validity_index_var_); - auto valid_var = builder->CreateLoad(types->i64_type(), validity_index_var_, "loaded_var"); + params.push_back(loop_var_); + auto valid_var = builder->CreateLoad(types->i64_type(), validity_index_var_, "loaded_var"); params.push_back(valid_var); } - - - - - - - - - - // add an extra arg for validity (allocated on stack). 
llvm::AllocaInst* result_valid_ptr = new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_); @@ -1250,96 +1091,10 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { // load the result validity and truncate to i1. auto result_valid_i8 = builder->CreateLoad(types->i8_type(), result_valid_ptr); llvm::Value* result_valid = builder->CreateTrunc(result_valid_i8, types->i1_type()); - - //std::bitset<8> bs(dex.local_bitmap_idx()); - //std::cout <<"LR NullableInternal validity from dex.local_bitmap_idx()=" << bs << std::endl; - - - // auto result_valid_i8ptr = builder->CreateLoad(types->i8_ptr_type(), result_valid_ptr); - - // std::cout << "LR NullableInternal function param validity=" << result_valid_i8ptr << std::endl; // set validity bit in the local bitmap. ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), result_valid); } -/* -void LLVMGenerator::Visitor::Visit(const NullableInternalListFuncDex& dex) { - ADD_VISITOR_TRACE("visit NullableInternalListFuncDex base function " + - dex.func_descriptor()->name()); - llvm::IRBuilder<>* builder = ir_builder(); - LLVMTypes* types = generator_->types(); - - const NativeFunction* native_function = dex.native_function(); - - // build function params along with validity. - auto params = BuildParams(dex.get_holder_idx(), dex.args(), true, - native_function->NeedsContext()); - - auto arrow_return_type = dex.func_descriptor()->return_type(); - - - - auto arrow_type_id = arrow_return_type->arrow_return_type->id(); - - if (arrow_return_type_id == arrow::Type::LIST) - { - //Pointer to validity bitmap and bit starting index for accessing validity bits in the called function. 
- llvm::Value* b_slot_index = - builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx())); - llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); - //llvm::Value* validity = builder->CreateGEP(type, b_slot_ref, 0); - llvm::Value* validity = b_slot_ref; - - //Compute the bit offset. - //int64_t validIndex = 0; - //for (int i = 0; i < loop_var_; i++) { - // validIndex += *(arg_offsets_ + i); - //} - - std::string str3 = "validity:"; - if (validity) { - llvm::raw_string_ostream output3(str3); - validity->print(output3); - } - std::string str32 = "loopvar:"; - if (loop_var_) { - llvm::raw_string_ostream output3(str32); - loop_var_->print(output3); - } - std::cout << "LR VectorReadFixedLenValueListDex loopvar=" + str32 + " using validity " << str3 << std::endl; - params.push_back(validity); - params.push_back(loop_var_); - params.push_back(arg_offsets_); - } - - - - - - - - // add an extra arg for validity (allocated on stack). - llvm::AllocaInst* result_valid_ptr = - new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_); - params.push_back(result_valid_ptr); - - result_ = BuildFunctionCall(native_function, arrow_return_type, ¶ms); - - // load the result validity and truncate to i1. - auto result_valid_i8 = builder->CreateLoad(types->i8_type(), result_valid_ptr); - llvm::Value* result_valid = builder->CreateTrunc(result_valid_i8, types->i1_type()); - - std::bitset<8> bs(dex.local_bitmap_idx()); - std::cout <<"LR NullableInternalListFuncDex validity from dex.local_bitmap_idx()=" << bs << std::endl; - - - auto result_valid_i8ptr = builder->CreateLoad(types->i8_ptr_type(), result_valid_ptr); - - std::cout << "LR NullableInternalListFuncDex function param validity=" << result_valid_i8ptr << std::endl; - // set validity bit in the local bitmap. 
- ClearLocalBitMapIfNotValid(dex.local_bitmap_idx(), result_valid); -}*/ - void LLVMGenerator::Visitor::Visit(const IfDex& dex) { ADD_VISITOR_TRACE("visit IfExpression"); llvm::IRBuilder<>* builder = ir_builder(); @@ -1705,7 +1460,6 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& pair) { // generate code for value - // std::cout << "LR LLVMGenerator::Visitor::BuildValueAndValidity" << std::endl; auto value_expr = pair.value_expr(); value_expr->Accept(*this); auto value = result()->data(); @@ -1725,10 +1479,6 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, auto llvm_return_type = types->DataVecType(arrow_return_type); DecimalIR decimalIR(generator_->engine_.get()); - //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; - //for (unsigned int i = 0; i < params->size(); i++) { - // std::cout << "LR param " << i << printType(params->at(i)) << std::endl; - //} if (arrow_return_type_id == arrow::Type::DECIMAL) { // For decimal fns, the output precision/scale are passed along as parameters. 
// @@ -1757,7 +1507,6 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, llvm::AllocaInst* result_len_ptr = nullptr; llvm::AllocaInst* valid_ptr = nullptr; if (arrow::is_binary_like(arrow_return_type_id)) { - //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is binary like" << std::endl; result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, "result_len", entry_block_); params->push_back(result_len_ptr); @@ -1765,26 +1514,16 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, } if (arrow_return_type_id == arrow::Type::LIST) { - //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is list" << std::endl; + result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, "result_len", entry_block_); params->push_back(result_len_ptr); has_arena_allocs_ = true; - valid_ptr = new llvm::AllocaInst(generator_->types()->i32_ptr_type(), 0, "valid_ptr", entry_block_); - // std::cout << "LR allocinst for valid_ptr=" << printType(valid_ptr) << std::endl; params->push_back(valid_ptr); } - //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall params are: " << std::endl; - /*for (auto p : *params) { - std::string str1; - llvm::raw_string_ostream output1(str1); - p->print(output1); - std::cout << str1 << std::endl; - }*/ - // Make the function call llvm::IRBuilder<>* builder = ir_builder(); auto value = @@ -1799,8 +1538,6 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, (valid_ptr == nullptr) ? nullptr : builder->CreateLoad(generator_->types()->i32_ptr_type(), valid_ptr); - // std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. using validity=" << validity << " ptr=" << valid_ptr << std::endl; - // std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE. 
using value_len=" << value_len << " ptr=" << result_len_ptr << std::endl; return std::make_shared(value, value_len, validity); } } @@ -1816,7 +1553,6 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(arg_context_ptr_); } - //std::cout << "LR BuildParams1" << std::endl; // if the function has holder, add the holder pointer. if (holder_idx != -1) { auto builder = ir_builder(); @@ -1825,7 +1561,6 @@ std::vector LLVMGenerator::Visitor::BuildParams( llvm::BasicBlock* saved_block = builder->GetInsertBlock(); builder->SetInsertPoint(entry_block_); - // std::cout << "LR BuildParams1a" << std::endl; auto holder = generator_->LoadVectorAtIndex( arg_holder_ptrs_, generator_->types()->i64_type(), holder_idx, "holder"); @@ -1833,25 +1568,21 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(holder); } - // std::cout << "LR BuildParams2" << std::endl; // build the function params, along with the validities. for (auto& pair : args) { // build value. DexPtr value_expr = pair->value_expr(); - // std::cout << "LR BuildParams2a" << std::endl; value_expr->Accept(*this); - // std::cout << "LR BuildParams2b" << std::endl; LValue& result_ref = *result(); // append all the parameters corresponding to this LValue. result_ref.AppendFunctionParams(¶ms); - // std::cout << "LR BuildParams2c" << std::endl; + // build validity. 
if (with_validity) { llvm::Value* validity_expr = BuildCombinedValidity(pair->validity_exprs()); params.push_back(validity_expr); - // std::cout << "LR BuildParams2d adding combined validity" << std::endl; } } diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index fc875c14d380a..5e43eb74abcdf 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -126,7 +126,7 @@ class GANDIVA_EXPORT LLVMTypes { // offsets buffer is to separate data into list // not support nested list if (data_type->id() == arrow::Type::LIST) { - //LR HACK + //LR TODO //std::cout << "LR Returning list type as type " << data_type->field(0)->type()->id()<< " for IR " << std::endl; //return IRType(data_type->field(0)->type()->id()); //return IRType(data_type->id()); diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index 3b2bbd3b0ec96..2f33a97788c6c 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -33,7 +33,6 @@ class GANDIVA_EXPORT LValue { explicit LValue(llvm::Value* data, llvm::Value* length = NULLPTR, llvm::Value* validity = NULLPTR) : data_(data), length_(length), validity_(validity) { - //std::cout << "LR created LValue " << to_string() << std::endl; } virtual ~LValue() = default; @@ -45,7 +44,6 @@ class GANDIVA_EXPORT LValue { // Append the params required when passing this as a function parameter. 
virtual void AppendFunctionParams(std::vector* params) { - // std::cout << "LR LValue::AppendFunctionParams" << std::endl; params->push_back(data_); if (length_ != NULLPTR) { params->push_back(length_); @@ -112,7 +110,6 @@ class GANDIVA_EXPORT ListLValue : public LValue { : LValue(data, NULLPTR, validity), child_offsets_(child_offsets), offsets_length_(offsets_length) { - //std::cout << "LR Creating ListLValue " << std::endl; } llvm::Value* child_offsets() { return child_offsets_; } @@ -120,7 +117,6 @@ class GANDIVA_EXPORT ListLValue : public LValue { llvm::Value* offsets_length() { return offsets_length_; } void AppendFunctionParams(std::vector* params) override { - // std::cout << "LR ListLValue::AppendFunctionParams" << std::endl; LValue::AppendFunctionParams(params); params->push_back(child_offsets_); params->push_back(offsets_length_); diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index b0d5331a3ee48..a3cccca11191d 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -169,7 +169,6 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, ARROW_RETURN_IF(configuration == nullptr, Status::Invalid("Configuration cannot be null")); - //std::cout << "LR Projector::Make 1" << std::endl; // see if equivalent projector was already built std::shared_ptr>> cache = LLVMGenerator::GetCache(); @@ -192,7 +191,6 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, std::unique_ptr llvm_gen; ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); - //std::cout << "LR Projector::Make 2" << std::endl; if (!is_cached && sec_cache != nullptr) { std::shared_ptr arrow_buffer = sec_cache->Get(GetSecondaryCacheKey(cache_key.ToString())); @@ -210,7 +208,6 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, // Run the validation on the expressions. // Return if any of the expression is invalid since // we will not be able to process further. 
- //std::cout << "LR Projector::Make 3" << std::endl; if (!is_cached) { ExprValidator expr_validator(llvm_gen->types(), schema); for (auto& expr : exprs) { @@ -230,13 +227,11 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, output_fields.push_back(expr->result()); } - //std::cout << "LR Projector::Make 4" << std::endl; // Instantiate the projector with the completely built llvm generator *projector = std::shared_ptr( new Projector(std::move(llvm_gen), schema, output_fields, configuration)); projector->get()->SetBuiltFromCache(is_cached); - //std::cout << "LR Projector::Make 5" << std::endl; if (sec_cache != nullptr && is_cached == false) { std::shared_ptr sec_cached_obj = cache->GetObjectCode(cache_key); llvm::StringRef string_buffer = sec_cached_obj->getBuffer(); @@ -245,7 +240,6 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, sec_cache->Set(GetSecondaryCacheKey(cache_key.ToString()), arrow_buffer); } - //std::cout << "LR Projector::Make DONE" << std::endl; return Status::OK(); } @@ -259,7 +253,6 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output_data_vecs) const { ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); - //std::cout << "LR the other Projector::Evaluate" << std::endl; if (output_data_vecs.size() != output_fields_.size()) { std::stringstream ss; ss << "number of buffers for output_data_vecs is " << output_data_vecs.size() @@ -267,10 +260,8 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, return Status::Invalid(ss.str()); } -//std::cout << "LR the other Projector::Evaluate 1a" << std::endl; int idx = 0; for (auto& array_data : output_data_vecs) { - //std::cout << "LR the other Projector::Evaluate checking array_data" << std::endl; if (array_data == nullptr) { std::stringstream ss; ss << "array for output field " << output_fields_[idx]->name() << "is null."; @@ -280,58 +271,13 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, auto 
num_rows = selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); - //std::cout << "LR the other Projector::Evaluate about to validate capacity" << std::endl; ARROW_RETURN_NOT_OK( ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), num_rows)); ++idx; } - //std::cout << "LR the other Projector::Evaluate 2" << std::endl; ARROW_RETURN_NOT_OK( llvm_generator_->Execute(batch, selection_vector, output_data_vecs)); - // Create and return array arrays. - - /* for (auto& array_data : output_data_vecs) { - - if (array_data->type->id() == arrow::Type::LIST) { - auto child_data = array_data->child_data[0]; - //std::cout << "LR the other Projector::Evaluate modifying child array " << - //child_data->buffers[1]->ToString() << std::endl; - //std::cout << "LR the other Projector::Evaluate child array[3] " << - //int32_t( (*child_data->buffers[1])[3*4]) << std::endl; - //std::cout << "LR the other Projector::Evaluate modifying child0 array " << - //child_data->buffers[0]->ToString() << std::endl; - - int64_t child_data_size = 1; - if (arrow::is_binary_like(child_data->type->id())) { - - child_data_size = child_data->buffers[1]->size() / 4 - 1; - } else if (child_data->type->id() == arrow::Type::INT32) { - child_data_size = child_data->buffers[1]->size() / 4; - } else if (child_data->type->id() == arrow::Type::INT64) { - child_data_size = child_data->buffers[1]->size() / 8; - } else if (child_data->type->id() == arrow::Type::FLOAT) { - child_data_size = child_data->buffers[1]->size() / 4; - } else if (child_data->type->id() == arrow::Type::DOUBLE) { - child_data_size = child_data->buffers[1]->size() / 8; - } - auto new_child_data = arrow::ArrayData::Make( - child_data->type, child_data_size, child_data->buffers, child_data->offset); - array_data->child_data.clear(); - array_data->child_data.push_back(new_child_data); - - //std::cout << "LR the other Projector::Evaluate child data size " << child_data_size << std::endl; - //std::cout << "LR the 
other Projector::Evaluate after modifying child array[3] " << - //int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; - - //array_data = arrow::ArrayData::Make(array_data->type, array_data->length, - // array_data->buffers, {new_child_data}, - // array_data->null_count, array_data->offset); - } - - }*/ - - return Status::OK(); } @@ -343,14 +289,12 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* p Status Projector::Evaluate(const arrow::RecordBatch& batch, const SelectionVector* selection_vector, arrow::MemoryPool* pool, arrow::ArrayVector* output) const { - //std::cout << "LR Projector::Evaluate" << std::endl; ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null.")); ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null.")); auto num_rows = selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); - //std::cout << "LR Projector::Evaluate num_rows" << num_rows << std::endl; // Allocate the output data vecs. ArrayDataVector output_data_vecs; for (auto& field : output_fields_) { @@ -394,7 +338,6 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, array_data = arrow::ArrayData::Make(array_data->type, array_data->length, array_data->buffers, {new_child_data}, array_data->null_count, array_data->offset); - // std::cout << "LR Making array data length " << array_data->length << std::endl; } output->push_back(arrow::MakeArray(array_data)); @@ -402,14 +345,12 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, return Status::OK(); } -// TODO : handle complex vectors (list/map/..) Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, arrow::MemoryPool* pool, ArrayDataPtr* array_data) const { arrow::Status astatus; std::vector> buffers; - //std::cout << "LR Projector::AllocArrayData Enter" << std::endl; // The output vector always has a null bitmap. 
int64_t size = arrow::bit_util::BytesForBits(num_records); ARROW_ASSIGN_OR_RAISE(auto bitmap_buffer, arrow::AllocateBuffer(size, pool)); @@ -463,22 +404,16 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); - - //LR TODO not sure this is needed. ARROW_ASSIGN_OR_RAISE(auto data_valid_buffer, arrow::AllocateResizableBuffer(data_len, pool)); - //std::cout << "LR Projector::AllocArrayData 1" << std::endl; if (type->id() == arrow::Type::LIST) { - // std::cout << "LR Projector::AllocArrayData List. There are number of buffers=" << buffers.size() << std::endl; auto internal_type = type->field(0)->type(); ArrayDataPtr child_data; if (arrow::is_primitive(internal_type->id())) { - //std::cout << "LR Projector::AllocArrayData List 1" << std::endl; child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, {std::move(data_valid_buffer), std::move(buffers[2])}, 0); } if (arrow::is_binary_like(internal_type->id())) { - //std::cout << "LR Projector::AllocArrayData List 2" << std::endl; child_data = arrow::ArrayData::Make( internal_type, 0 /*initialize length*/, {nullptr, std::move(buffers[2]), std::move(buffers[3])}, 0); @@ -490,7 +425,6 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); } - // std::cout << "LR Projector::AllocArrayData Done" << std::endl; return Status::OK(); } @@ -509,20 +443,15 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, ARROW_RETURN_IF(array_data.buffers.size() < 2, Status::Invalid("ArrayData must have at least 2 buffers")); -//std::cout << "LR ValidateArrayDataCapacity" << std::endl; int64_t min_bitmap_len = arrow::bit_util::BytesForBits(num_records); - //std::cout << "LR ValidateArrayDataCapacity arra_data 0 is " << array_data.buffers[0] << std::endl; int64_t bitmap_len = array_data.buffers[0]->capacity(); - //std::cout 
<< "LR ValidateArrayDataCapacity" << std::endl; ARROW_RETURN_IF( bitmap_len < min_bitmap_len, Status::Invalid("Bitmap buffer too small for ", field.name(), " expected minimum ", min_bitmap_len, " actual size ", bitmap_len)); auto type_id = field.type()->id(); - //std::cout << "LR ValidateArrayDataCapacity" << std::endl; - //LR TODO - if (arrow::is_binary_like(type_id)) { //|| type_id == arrow::Type::LIST) { + if (arrow::is_binary_like(type_id)) { // validate size of offsets buffer. int64_t min_offsets_len = arrow::bit_util::BytesForBits((num_records + 1) * 32); int64_t offsets_len = array_data.buffers[1]->capacity(); diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index c43285843a1ee..1946aadfef16f 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -144,10 +144,7 @@ NodePtr TreeExprBuilder::MakeOr(const NodeVector& children) { static bool print_expr = false; ExpressionPtr TreeExprBuilder::MakeExpression(NodePtr root_node, FieldPtr result_field) { - //std::cout << "LR Expression: " << root_node->ToString() << "\n"; - if (result_field == nullptr) { - //std::cout << "LR MakeExpression result_field is null" << std::endl; return nullptr; } return ExpressionPtr(new Expression(root_node, result_field)); @@ -164,9 +161,7 @@ ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function, auto node = MakeField(field); field_nodes.push_back(node); } - //std::cout << "LR MakeExpression making function for " << function << std::endl; auto func_node = MakeFunction(function, field_nodes, out_field->type()); - //std::cout << "LR MakeExpression function is " << func_node->ToString() << std::endl; return MakeExpression(func_node, out_field); } diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index 0efb2e412e873..c74a1b7271788 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ 
b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -136,6 +136,7 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) gandiva_data_type->set_type(types::GandivaType::INTERVAL); gandiva_data_type->set_intervaltype(types::IntervalType::DAY_TIME); break; + //LR TODO case arrow::Type::STRUCT: gandiva_data_type->set_type(types::GandivaType::STRUCT); break; @@ -146,6 +147,7 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) // un-supported types. test ensures that // when one of these are added build breaks. //DCHECK(false); + //LR TODO printf("LR Found unsupported type %d\n", type->id()); fflush(stdout); } @@ -176,16 +178,10 @@ Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSu JNIEXPORT jbyteArray JNICALL Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSupportedFunctions( // NOLINT JNIEnv* env, jobject types_helper) { - printf("LR Entering JNI call getGandivaSupportedFunctions\n"); - fflush(stdout); - ExpressionRegistry expr_registry; types::GandivaFunctions gandiva_functions; for (auto function = expr_registry.function_signature_begin(); function != expr_registry.function_signature_end(); function++) { - printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).base_name().c_str()); - printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).ToString().c_str()); - fflush(stdout); types::FunctionSignature* function_signature = gandiva_functions.add_function(); function_signature->set_name((*function).base_name()); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 5f5f3fb02d920..41b2593d501cd 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -90,11 +90,8 @@ static jmethodID listvector_expander_method_; static jfieldID vector_expander_ret_address_; static jfieldID vector_expander_ret_capacity_; static jfieldID 
list_expander_ret_address_; -static jfieldID list_expander_valid_address_; static jfieldID list_expander_outer_valid_address_; static jfieldID list_expander_ret_capacity_; -static jfieldID list_expander_offset_ret_address_; -static jfieldID list_expander_offset_ret_capacity_; static jclass secondary_cache_class_; static jmethodID cache_get_method_; @@ -162,14 +159,8 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { env->GetFieldID(list_expander_ret_class_, "address", "J"); list_expander_ret_capacity_ = env->GetFieldID(list_expander_ret_class_, "capacity", "J"); - list_expander_offset_ret_address_ = - env->GetFieldID(list_expander_ret_class_, "offsetaddress", "J"); - list_expander_offset_ret_capacity_ = - env->GetFieldID(list_expander_ret_class_, "offsetcapacity", "J"); - list_expander_valid_address_ = - env->GetFieldID(list_expander_ret_class_, "validityaddress", "J"); list_expander_outer_valid_address_ = - env->GetFieldID(list_expander_ret_class_, "outervalidityaddress", "J"); + env->GetFieldID(list_expander_ret_class_, "validityaddress", "J"); jclass local_cache_class = env->FindClass("org/apache/arrow/gandiva/evaluator/JavaSecondaryCacheInterface"); @@ -337,7 +328,6 @@ FieldPtr ProtoTypeToField(const types::Field& f) { NodePtr ProtoTypeToFieldNode(const types::FieldNode& node) { FieldPtr field_ptr = ProtoTypeToField(node.field()); - //std::cout << "LR created field " << field_ptr->ToString(true) << std::endl; if (field_ptr == nullptr) { std::cerr << "Unable to create field node from protobuf\n"; return nullptr; @@ -509,7 +499,6 @@ NodePtr ProtoTypeToNullNode(const types::NullNode& node) { NodePtr ProtoTypeToNode(const types::TreeNode& node) { if (node.has_fieldnode()) { - //std::cout << "LR Found ProtoTypeToNode fieldnode " << std::endl; return ProtoTypeToFieldNode(node.fieldnode()); } @@ -558,7 +547,6 @@ NodePtr ProtoTypeToNode(const types::TreeNode& node) { } if (node.has_stringnode()) { - //std::cout << "LR Found StringNode" << std::endl; return 
TreeExprBuilder::MakeStringLiteral(node.stringnode().value()); } @@ -646,8 +634,6 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto validity = std::shared_ptr( new arrow::Buffer(reinterpret_cast(validity_addr), validity_size)); buffers.push_back(validity); - //std::cout << "LR make_record_batch_with_buf_addrs adding validity_addr buffer=" << validity_addr << " idx=" << buf_idx - 1 << std::endl; - if (buf_idx >= in_bufs_len) { return Status::Invalid("insufficient number of in_buf_addrs"); } @@ -656,7 +642,6 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto data = std::shared_ptr( new arrow::Buffer(reinterpret_cast(value_addr), value_size)); buffers.push_back(data); - // std::cout << "LR make_record_batch_with_buf_addrs adding value_addr buffer=" << value_addr << " idx=" << buf_idx - 1 << std::endl; if (arrow::is_binary_like(field->type()->id())) { if (buf_idx >= in_bufs_len) { @@ -669,9 +654,8 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto offsets = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); - // std::cout << "LR make_record_batch_with_buf_addrs adding offsets_addr buffer=" << offsets_addr << " idx=" << buf_idx - 1 << std::endl; } -////////// + @@ -939,25 +923,15 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { if (isList) { jlong ret_address = env_->GetLongField(ret, list_expander_ret_address_); jlong ret_capacity = env_->GetLongField(ret, list_expander_ret_capacity_); - jlong offset_ret_address = env_->GetLongField(ret, list_expander_offset_ret_address_); - jlong offset_ret_capacity = env_->GetLongField(ret, list_expander_offset_ret_capacity_); - jlong valid_address = env_->GetLongField(ret, list_expander_valid_address_); jlong outer_valid_address = env_->GetLongField(ret, list_expander_outer_valid_address_); std::cout << "Buffer expand: New capacity is " << new_capacity << " vector id 
" << vector_idx_ << " expander method " << method_ << " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << - " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << - " and the original offset ptr=" << reinterpret_cast(offsetBuffer) << " and the new ptr=" << offset_ret_address << std::endl; + " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << std::endl; data_ = reinterpret_cast(ret_address); capacity_ = ret_capacity; - - offsetBuffer = reinterpret_cast(offset_ret_address); - offsetCapacity = offset_ret_capacity; - std::cout << "LR Setting buffer validityBuffer to " << validityBuffer << std::endl; - validityBuffer = reinterpret_cast(valid_address); - outerValidityBuffer = reinterpret_cast(outer_valid_address); } else { jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); @@ -1003,7 +977,6 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( jlongArray buf_addrs, jlongArray buf_sizes, jint sel_vec_type, jint sel_vec_rows, jlong sel_vec_addr, jlong sel_vec_size, jlongArray out_buf_addrs, jlongArray out_buf_sizes) { - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " << std::endl; Status status; std::shared_ptr holder = projector_modules_.Lookup(module_id); if (holder == nullptr) { @@ -1039,21 +1012,6 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( if (!status.ok()) { break; } - /*std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " - << " Made a recordbatch num_rows " << num_rows - << in_batch->ToString() - << " there are " << out_bufs_len << " buffers " - << std::endl;*/ - //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " - //<< " there are " << out_bufs_len << " buffers " - //<< std::endl; - //for (int 
i = 0; i < out_bufs_len; i++) { - // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " - // << " buffer " << i - // << "length " << out_sizes[i] - // << std::endl; - // } - std::shared_ptr selection_vector; auto selection_buffer = std::make_shared( reinterpret_cast(sel_vec_addr), sel_vec_size); @@ -1089,14 +1047,12 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( for (FieldPtr field : ret_types) { std::vector> buffers; - // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* validity_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong bitmap_sz = out_sizes[sz_idx++]; buffers.push_back(std::make_shared(validity_buf, bitmap_sz)); if (arrow::is_binary_like(field->type()->id())) { - // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding bufferbuffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* offsets_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong offsets_sz = out_sizes[sz_idx++]; @@ -1115,13 +1071,9 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } - // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; - buffers.push_back(std::make_shared( + buffers.push_back(std::make_shared( env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else if (field->type()->id() == arrow::Type::LIST) { - // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding list offset buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; - // std::cout << " size=" << out_sizes[sz_idx - 1] << " outsize index=" << sz_idx - 1 << " address " << out_bufs[buf_idx - 1] - // << " 
output_vector_idx=" << output_vector_idx << std::endl; buffers.push_back(std::make_shared( env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else { @@ -1141,17 +1093,11 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( } data_sz = out_sizes[sz_idx++]; - // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding child nbuffer " << buf_idx - // << " size=" << data_sz << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); child_buffers.push_back(std::make_shared( env, jListExpander, listvector_expander_method_, output_vector_idx, child_offset_buf, data_sz)); - - // std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding child buffer " << buf_idx - // << " size=" << out_sizes[sz_idx] << " outsize index=" << sz_idx << " address " << out_bufs[buf_idx] - // << " output_vector_idx=" << output_vector_idx << std::endl; data_sz = out_sizes[sz_idx++]; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); @@ -1161,7 +1107,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( outBufJava->offsetBuffer = reinterpret_cast(out_bufs[1]); outBufJava->offsetCapacity = out_sizes[1]; outBufJava->validityBuffer = reinterpret_cast(out_bufs[2]); - outBufJava->outerValidityBuffer = reinterpret_cast(out_bufs[0]); + //outBufJava->outerValidityBuffer = reinterpret_cast(out_bufs[0]); child_buffers.push_back(outBufJava); std::shared_ptr dt2 = std::make_shared(); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java index 3b2778c7f21a4..4430674d19a72 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java +++ 
b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -20,41 +20,34 @@ import org.apache.arrow.vector.complex.ListVector; /** - * This class provides the functionality to expand output vectors using a callback mechanism from + * This class provides the functionality to expand output ListVectors using a callback mechanism from * gandiva. */ public class ListVectorExpander { - private final ListVector[] vectors; + private final ListVector[] bufferVectors; - public ListVectorExpander(ListVector[] vectors) { - this.vectors = vectors; + public ListVectorExpander(ListVector[] bufferVectors) { + this.bufferVectors = bufferVectors; } /** - * Result of vector expansion. + * Result of ListVector expansion. */ public static class ExpandResult { public long address; public long capacity; - public long offsetaddress; - public long offsetcapacity; public long validityaddress; - public long outervalidityaddress; /** - * fdsfsdfds. - * @param address dsfds - * @param capacity dfsdf - * @param offsetad dsfdsfsd - * @param offsetcap dfsfs + * Result of expanding the buffer. + * @param address Data buffer address + * @param capacity Capacity + * @param validAdd Validity buffer address * */ - public ExpandResult(long address, long capacity, long offsetad, long offsetcap, long outValidAdd, long validAdd) { + public ExpandResult(long address, long capacity, long validAdd) { this.address = address; this.capacity = capacity; - this.offsetaddress = offsetad; - this.offsetcapacity = offsetcap; - this.outervalidityaddress = outValidAdd; this.validityaddress = validAdd; } } @@ -69,50 +62,22 @@ public ExpandResult(long address, long capacity, long offsetad, long offsetcap, * @return address and size of the buffer after expansion. 
*/ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { - if (index >= vectors.length || vectors[index] == null) { + if (index >= bufferVectors.length || bufferVectors[index] == null) { throw new IllegalArgumentException("invalid index " + index); } - - //ArrowBuf ab = vectors[index].getValidityBuffer(); - //String s = "Before validity = ["; - //for (int i = 0; i < 20; i++) { - // s += ab.getInt(i) + ","; - //} - //System.out.println(s); - - int valueBufferIndex = 1; - int validBufferIndex = 0; - ListVector vector = vectors[index]; + int validityBufferIndex = 0; + ListVector vector = bufferVectors[index]; while (vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity() < toCapacity) { //Just realloc the data vector. vector.getDataVector().reAlloc(); } - System.out.println("LR Expanding ListVector. New capacity=" + - vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity()); - System.out.println("LR Expanding ListVector. new data is "); - /*ArrowBuf ab2 = vector.getValidityBuffer(); - s = "After validity = ["; - for (int i = 0; i < 20; i++) { - s += ab2.getInt(i) + ","; - } - System.out.println(s);*/ - /*ArrowBuf ab = vector.getOffsetBuffer(); - String s = "offsetBuffer = ["; - for (int i = 0; i < 20; i++) { - s += ab.getInt(i) + ","; - } - System.out.println(s); - */ return new ExpandResult( vector.getDataVector().getFieldBuffers().get(valueBufferIndex).memoryAddress(), vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity(), - vector.getOffsetBuffer().memoryAddress(), - vector.getOffsetBuffer().capacity(), - vector.getValidityBuffer().memoryAddress(), - vector.getDataVector().getFieldBuffers().get(validBufferIndex).memoryAddress()); + vector.getDataVector().getFieldBuffers().get(validityBufferIndex).memoryAddress()); } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java 
index 61e6c8ffacc39..7d677927f0ced 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -332,7 +332,6 @@ private void evaluate(int numRows, List buffers, List buf throw new EvaluatorClosedException(); } - logger.error("LR Projector.java evaluate"); if (numExprs != outColumns.size()) { logger.info("Expected " + numExprs + " columns, got " + outColumns.size()); throw new GandivaException("Incorrect number of columns for the output vector"); @@ -362,19 +361,11 @@ private void evaluate(int numRows, List buffers, List buf int outColumnIdx = 0; for (ValueVector valueVector : outColumns) { if (valueVector instanceof ListVector) { - //LR HACK there is only one column. - logger.error("LR Projector.java evaluate out columns=" + outColumns.size()); outAddrs = new long[5 * outColumns.size()]; outSizes = new long[5 * outColumns.size()]; } - /*boolean isFixedWith = valueVector instanceof FixedWidthVector;*/ boolean isVarWidth = valueVector instanceof VariableWidthVector; - /*if (!isFixedWith && !isVarWidth) { - throw new UnsupportedTypeException( - "Unsupported value vector type " + valueVector.getField().getFieldType()); - }*/ - outAddrs[idx] = valueVector.getValidityBuffer().memoryAddress(); outSizes[idx++] = valueVector.getValidityBuffer().capacity(); if (isVarWidth) { @@ -393,51 +384,17 @@ private void evaluate(int numRows, List buffers, List buf if (valueVector instanceof ListVector) { hasVariableWidthColumns = true; resizableListVectors[outColumnIdx] = (ListVector) valueVector; - //LR TODO figure out what to use here resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; - //resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; - //resizeableVectors[outColumnIdx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); - List fieldBufs = ((ListVector) valueVector).getDataVector().getFieldBuffers(); - 
logger.error("LR Projector.java evaluate ListVector has buffers=" + fieldBufs.size()); - - - logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); //vector valid - logger.error("LR Projector.java evaluate isVarlistvector Width setting vector validity buffer=" + idx); - //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getValidityBufferAddress(); - //outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); //vector offset - logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); - - logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); - //The realloc avoids dynamic resizing, will have to be fixed later. 
outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).memoryAddress(); outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).capacity(); - //logger.error("LR Projector.java evaluate ListVector set buffer " + idx + - // " as ptr=" + outAddrs[idx - 1] + " size " + outSizes[idx - 1]); - - //vector data - //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).memoryAddress(); - //outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).capacity(); - - //LR HACK TODO ((ListVector) valueVector).getDataVector().capacity(); - - - - - - - - - - - } else { outAddrs[idx] = valueVector.getDataBuffer().memoryAddress(); outSizes[idx++] = valueVector.getDataBuffer().capacity(); @@ -446,10 +403,6 @@ private void evaluate(int numRows, List buffers, List buf valueVector.setValueCount(selectionVectorRecordCount); outColumnIdx++; } - - //logger.error("LR Projector.java evaluate calling evaluateProjector with buffers=" + idx); - //logger.error("LR Projector.java before evaluateProjector buffer[3]=" + outAddrs[3]); - //logger.error("LR Projector.java before evaluateProjector buffer[1]=" + outAddrs[1]); wrapper.evaluateProjector( hasVariableWidthColumns ? new VectorExpander(resizableVectors) : null, hasVariableWidthColumns ? 
new ListVectorExpander(resizableListVectors) : null, @@ -458,286 +411,12 @@ private void evaluate(int numRows, List buffers, List buf selectionVectorAddr, selectionVectorSize, outAddrs, outSizes); - //outColumns.clear(); - //FieldType ft = new FieldType(true, int32, null); - //ListVector lv = new ListVector("res", allocator, ft, null); - //System.out.println(intVector.getDataVector()); - - - //logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3]); - //logger.error("LR Projector.java after evaluateProjector buffer[1]=" + outAddrs[1]); for (ValueVector valueVector : outColumns) { if (valueVector instanceof ListVector) { - //LR HACK - - //int numRecordsFound = 5 * 100; - //int numRecordsFound = Math.toIntExact(outSizes[3]) / 4; - //logger.error("LR Projector.java using numRecords=" + numRecordsFound + " outSizes[3]=" + outSizes[3]); - - //LR HACK 9-13 10:34 - /*public void startList() { - vector.startNewValue(idx()); - writer.setPosition(vector.getOffsetBuffer().getInt((idx() + 1L) * OFFSET_WIDTH)); - listStarted = true; - } - - @Override - public void endList() { - vector.getOffsetBuffer().setInt((idx() + 1L) * OFFSET_WIDTH, writer.idx()); - setPosition(idx() + 1); - listStarted = false; - */ - - //ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - - - //ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); - - // logger.error("LR Projector.java using numRecords=" + - // selectionVectorRecordCount + " outSizes[3]=" + outSizes[3]); - - //import org.apache.arrow.vector.complex.impl.UnionListWriter; - /*UnionListWriter writer = ((ListVector) valueVector).getWriter(); - for (int i = 0; i < selectionVectorRecordCount; i++) { - writer.startList(); - writer.setPosition(i); - for (int j = 0; j < 5; j++) { - int index = ((j + (5 * i)) * 4); - //Not sure whats going on. Buffer too small? 
- try { - writer.writeInt(ab2.getInt(index)); - //writer.writeInt(42); - } catch (IndexOutOfBoundsException e) { - continue; - } - } - writer.setValueCount(5); - writer.endList(); - } - ((ListVector) valueVector).setValueCount(selectionVectorRecordCount);*/ - - - //offsetBuffer = [0,83886080,327680,1280,5,167772160,655360,2560,10,251658240,983040,3840,15, - //335544320,1310720,5120,20, - //419430400,1638400,6400,25,503316480,1966080,7680,30,587202560,2293760,8960,35,671088640,2621440,10240,40, - //754974720,2949120,11520, - - - - - - - - - - - /* - String s = ""; - List fv = ((ListVector) valueVector).getDataVector().getFieldBuffers(); - for (ArrowBuf ab : fv) { - s = ""; - for (int i = 0; i < 20; i++) { - s += ab.getInt(i) + ","; - } - logger.error("LR Projector.java before updating listvector. size=" + - ab.capacity() + " buffer=" + s); - } - - ArrowBuf fvv = ((ListVector) valueVector).getValidityBuffer(); - s = ""; - for (int i = 0; i < 20; i++) { - s += fvv.getInt(i) + ","; - } - logger.error("LR Projector.java before updating listvector. getValidityBuffer=" + - fvv.capacity() + " buffer=" + s); - - ArrowBuf fvvv = ((ListVector) valueVector).getOffsetBuffer(); - s = ""; - for (int i = 0; i < 20; i++) { - s += fvvv.getInt(i) + ","; - } - logger.error("LR Projector.java before updating listvector. getOffsetBuffer=" + - fvvv.capacity() + " buffer=" + s); - */ - - - - - - - //((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount * 5); - + //LR TODO check if this is necessary. ((ListVector) valueVector).setLastSet(selectionVectorRecordCount - 1); - - /* - ArrowBuf mabb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - s = "validity? buffer mabb2, outAddrs[2]="; - for (int i = 0; i < 20; i++) { - s += mabb2.getInt(i) + ","; - } - System.out.println(s); - */ - /* - //Validity then data. 
- ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - ArrowBuf abb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); - List outBufsNew = new ArrayList(); - - //outBufsNew.add(ab0); - outBufsNew.add(abb); - outBufsNew.add(abb2); - ArrowFieldNode afn = new ArrowFieldNode(selectionVectorRecordCount * 5, 0); - ((ListVector) valueVector).getDataVector().clear(); - ((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); - - //TODO Need to get validity [0] and offset [1] buffer for the listvector. - //((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); - - List outBufsNew2 = new ArrayList(); - - - - ArrowBuf mabb22 = new ArrowBuf(ReferenceManager.NO_OP, null, selectionVectorRecordCount, outAddrs[0]); - for (int i = 0; i < selectionVectorRecordCount; i++) { - BitVectorHelper.setBit(mabb22, i); - } - - ArrowBuf mabb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[1], outAddrs[1]); - //for (int i = 0; i < selectionVectorRecordCount; i++) { - // mabb2.setInt(i * 4, 5 * i); - //} - s = "offset? 
buffer mabb2, outAddrs[0]="; - for (int i = 0; i < 20; i++) { - s += mabb2.getInt(i) + ","; - } - System.out.println(s); - - outBufsNew2.add(mabb22); - outBufsNew2.add(mabb2); - ArrowFieldNode afn2 = new ArrowFieldNode(selectionVectorRecordCount, 0); - ((ListVector) valueVector).loadFieldBuffers(afn2, outBufsNew2); - - - */ - - //((ListVector) valueVector).setValueCount(selectionVectorRecordCount); - //((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount); - - /*TODO NEeD THIS int simple = 0; - try { - for (int i = 0; i < selectionVectorRecordCount * 5; i++) { - BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); - simple++; - } - } catch (IndexOutOfBoundsException e) { - simple = 0; - } - */ - /* int simple = 0; - import org.apache.arrow.vector.BitVectorHelper; - try { - for (int i = 0; i < selectionVectorRecordCount; i++) { - BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); - simple++; - } - } catch (IndexOutOfBoundsException e) { - simple = 0; - } -*/ - - - - - - /* - - - - try { - for (int i = 0; i < selectionVectorRecordCount; i++) { - BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); - simple++; - } - } catch (IndexOutOfBoundsException e) { - simple = 0; - } - - - for (int i = 0; i < selectionVectorRecordCount; i++) { - ((ListVector) valueVector).getOffsetBuffer().setInt(i * 4, 5 * i); - } - */ - - - - - - - - //LR HACK 9-13 10:34 All the multiline comment - /* - import org.apache.arrow.memory.ReferenceManager; - import org.apache.arrow.vector.BitVectorHelper; - import org.apache.arrow.vector.ipc.message.ArrowFieldNode; - */ - //ArrowBuf ab0 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - /*ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); - ArrowBuf abb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); - List outBufsNew = new ArrayList(); - - StringBuilder 
sbb = new StringBuilder(); - abb.print(sbb, 1); - System.out.println("LR abb=" + sbb); - - //outBufsNew.add(ab0); - outBufsNew.add(abb); - outBufsNew.add(abb2); - ArrowFieldNode afn = new ArrowFieldNode(numRecordsFound, 0); - ((ListVector) valueVector).getDataVector().clear(); - ((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); - - //LR HACK 9-12 10:09 - //ArrowBuf offBuff = ((ListVector) valueVector).getOffsetBuffer(); - //for (int i = 0; i < 101; i++) { - // offBuff.setInt(i, 5 * i * 4); - //} - - - - - - //byte[] valid = new byte[outsizes[2]]; - //LR HACK - //for (int i = 0; i < outSizes[2]; i++) { - int simple = 0; - try { - for (int i = 0; i < numRecordsFound * 4; i++) { - BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); - simple++; - //BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); - } - } catch (IndexOutOfBoundsException e) { - simple = 0; - } - ArrowBuf ab3 = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); - for (int i = 0; i < 50; i++) { - System.out.println("LR arrowbuf after=" + Integer.reverseBytes(ab3.getInt(i))); - System.out.println("LR arrowbuf after=" + ab3.getInt(i)); - System.out.println("LR arrowbuf after=" + ab3.getShort(i)); - } - ArrowBuf ab3a = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1); - for (int i = 0; i < 50; i++) { - System.out.println("LR arrowbuf aftera=" + Integer.reverseBytes(ab3a.getInt(i))); - System.out.println("LR arrowbuf aftera=" + ab3a.getInt(i)); - System.out.println("LR arrowbuf aftera=" + ab3a.getShort(i)); - } - IntVector iv = (IntVector) ((ListVector) valueVector).getDataVector(); - for (int i = 0; i < 50; i++) { - System.out.println("LR IntVector=" + iv.get(i)); - }*/ } } - } /** diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java index 
d3c75413957a1..f22ebbd37878f 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/VectorExpander.java @@ -57,7 +57,6 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { throw new IllegalArgumentException("invalid index " + index); } - System.out.println("LR Expanding VectorExpander."); BaseVariableWidthVector vector = vectors[index]; while (vector.getDataBuffer().capacity() < toCapacity) { vector.reallocDataBuffer(); From 7d746085eaee88b9ed7c38325f1e3456679040b5 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 25 Oct 2023 15:07:23 -0700 Subject: [PATCH 26/46] Cleanup, tested --- cpp/src/arrow/c/bridge.cc | 2 +- cpp/src/arrow/type.cc | 5 -- cpp/src/arrow/type.h | 5 +- cpp/src/arrow/type_fwd.h | 6 -- cpp/src/gandiva/array_ops.cc | 21 ------- cpp/src/gandiva/array_ops.h | 5 -- cpp/src/gandiva/expression_registry.cc | 8 +-- cpp/src/gandiva/function_registry_array.cc | 9 --- cpp/src/gandiva/function_registry_string.cc | 9 --- cpp/src/gandiva/llvm_types.cc | 1 - cpp/src/gandiva/precompiled/string_ops.cc | 59 ------------------- cpp/src/gandiva/precompiled/types.h | 12 ---- .../main/cpp/expression_registry_helper.cc | 4 -- java/gandiva/src/main/cpp/jni_common.cc | 3 +- .../arrow/gandiva/evaluator/Projector.java | 5 -- 15 files changed, 7 insertions(+), 147 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 32dbc088a7118..85a5156d11db2 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -195,7 +195,7 @@ struct SchemaExporter { } Status ExportSchema(const Schema& schema) { - static const StructType dummy_struct_type = StructType(); + static const StructType dummy_struct_type({}); flags_ = 0; RETURN_NOT_OK(ExportFormat(dummy_struct_type)); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 60b71cbb71df7..4804570bdf52f 100644 --- a/cpp/src/arrow/type.cc +++ 
b/cpp/src/arrow/type.cc @@ -856,10 +856,6 @@ StructType::StructType(const std::vector>& fields) children_ = fields; } -StructType::StructType() - : NestedType(Type::STRUCT) { -} - StructType::~StructType() {} std::string StructType::ToString() const { @@ -2531,7 +2527,6 @@ TYPE_FACTORY(float16, HalfFloatType) TYPE_FACTORY(float32, FloatType) TYPE_FACTORY(float64, DoubleType) TYPE_FACTORY(utf8, StringType) -TYPE_FACTORY(structType, StructType) TYPE_FACTORY(large_utf8, LargeStringType) TYPE_FACTORY(binary, BinaryType) TYPE_FACTORY(large_binary, LargeBinaryType) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index ddeb45b721f89..29ac79037d508 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1079,10 +1079,7 @@ class ARROW_EXPORT StructType : public NestedType { static constexpr const char* type_name() { return "struct"; } explicit StructType(const std::vector>& fields); - explicit StructType(); - StructType(const StructType& rhs) = delete; - StructType& operator=(const StructType& rhs) = delete; - + ~StructType() override; DataTypeLayout layout() const override { diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 450ed9a136d26..66fd6c75f0ddb 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -153,11 +153,6 @@ class FixedSizeListArray; class FixedSizeListBuilder; struct FixedSizeListScalar; -class StructType; -class StructArray; -class StructBuilder; -struct StructScalar; - class Decimal128; class Decimal256; class DecimalType; @@ -459,7 +454,6 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); -ARROW_EXPORT const std::shared_ptr& structType(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 
b9ff34b4a8a4f..b5d1a57e5fe5f 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -74,18 +74,6 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, return false; } -//LR TODO -int32_t* array_int32_make_array(int64_t context_ptr, int32_t contains_data, int32_t* out_len) { - - int integers[] = { contains_data, 21, 3, contains_data, 5 }; - *out_len = 5;// * 4; - //length is number of items, but buffers must account for byte size. - uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); - memcpy(ret, integers, *out_len * 4); - - return reinterpret_cast(ret); -} - bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, int64_t contains_data, bool entry_validWhat, @@ -227,15 +215,6 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(array_int64_contains_int64)); - - args = {types->i64_type(), // int64_t execution_context - types->i32_type(), // array item input - types->i32_ptr_type()}; // out array length - - engine->AddGlobalMappingForFunc("array_int32_make_array", - types->i32_ptr_type(), args, - reinterpret_cast(array_int32_make_array)); - args = {types->i64_type(), // int64_t execution_context types->i32_ptr_type(), // int8_t* input data ptr types->i32_type(), // int32_t input length diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index 8fdf957f3d22c..8d51b6e09f7f7 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -47,11 +47,6 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int64_t loop_var, int64_t validity_index_var, bool* valid_buf); -GANDIVA_EXPORT -int32_t* array_int32_make_array(int64_t context_ptr, - int32_t contains_data, - int32_t* out_len); - GANDIVA_EXPORT int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, 
int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc index 20be12548e0f9..12ac0d0b154e8 100644 --- a/cpp/src/gandiva/expression_registry.cc +++ b/cpp/src/gandiva/expression_registry.cc @@ -166,12 +166,12 @@ static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector case arrow::Type::type::INTERVAL_DAY_TIME: vector.push_back(arrow::day_time_interval()); break; - case arrow::Type::type::STRUCT: - vector.push_back(arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)})); - break; case arrow::Type::type::LIST: - //vector.push_back(arrow::list(arrow::utf8())); vector.push_back(arrow::list(arrow::int32())); + vector.push_back(arrow::list(arrow::int64())); + vector.push_back(arrow::list(arrow::float32())); + vector.push_back(arrow::list(arrow::float64())); + vector.push_back(arrow::list(arrow::utf8())); break; default: // Unsupported types. 
test ensures that diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index f7c587a64b74d..439b275d0ace2 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -31,18 +31,9 @@ std::vector GetArrayFunctionRegistry() { NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int32()), int32()}, boolean(), kResultNullInternal, "array_int32_contains_int32", NativeFunction::kNeedsContext), - NativeFunction("array_contains", {}, DataTypeVector{list(int32()), int32()}, - boolean(), kResultNullIfNull, "array_int32_contains_int32", - NativeFunction::kNeedsContext), - NativeFunction("array_makeGandiva", {}, DataTypeVector{int32()}, - list(int32()), kResultNullIfNull, "array_int32_make_array", - NativeFunction::kNeedsContext), NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int32()), int32()}, list(int32()), kResultNullInternal, "array_int32_remove", NativeFunction::kNeedsContext), - /*NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int64()), int64()}, - boolean(), kResultNullIfNull, "array_int64_contains_int64", - NativeFunction::kNeedsContext),*/ }; return array_fn_registry_; } diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index edb900e976c59..d93757b40cfd0 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -263,15 +263,6 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsFunctionHolder | NativeFunction::kCanReturnErrors), - NativeFunction("st_geohash", {}, DataTypeVector{float64(), float64()}, - utf8(), kResultNullIfNull, "gdv_fn_geo_hash_encode_float64_float64", - NativeFunction::kNeedsContext), - - NativeFunction("st_fromgeohash", {}, DataTypeVector{utf8()}, - arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)}), kResultNullIfNull, 
"gdv_fn_geo_hash_decode_utf8", - //arrow::structType(), kResultNullIfNull, "gdv_fn_geo_hash_decode_utf8", - NativeFunction::kNeedsContext), - NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/llvm_types.cc b/cpp/src/gandiva/llvm_types.cc index 68be62816f60e..3eb49f39037f6 100644 --- a/cpp/src/gandiva/llvm_types.cc +++ b/cpp/src/gandiva/llvm_types.cc @@ -42,7 +42,6 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) { {arrow::Type::type::BINARY, i8_ptr_type()}, {arrow::Type::type::DECIMAL, i128_type()}, {arrow::Type::type::INTERVAL_MONTHS, i32_type()}, - {arrow::Type::type::STRUCT, struct_type()}, {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}, {arrow::Type::type::LIST, list_type()}}; } diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 9c4458ea1b705..c255b9a11c084 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -827,65 +827,6 @@ const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in return substr_utf8_int64_int64(context, input, in_len, offset64, in_len, out_len); } -FORCE_INLINE -const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float64 lat, gdv_float64 lon, - gdv_int32* out_len) { - //if (repeat_number == 0 || in_len <= 0) { - // *out_len = 0; - // return ""; - //} - - - //Gandiva-blarg - *out_len = 14; - char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); - if (ret == nullptr) { - gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); - *out_len = 0; - return ""; - } - - std::string out_string = "Gandiva-blarg"; - memcpy(ret, out_string.c_str(), *out_len); - return ret; -} - -FORCE_INLINE -const gdv_struct gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len) { 
- //gdv_struct* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, sizeof(gdv_struct))); - gdv_struct ret; - ret.lattitude = 42; - ret.longitude = 142; - return ret; - - //if (repeat_number == 0 || in_len <= 0) { - // *out_len = 0; - // return ""; - //} - - /*auto s = arrow::struct_({field("a", arrow::int32(), false), field("b", arrow::int32(), false)}); - - MemoryPool* pool_ = default_memory_pool(); - std::unique_ptr tmp; - MakeBuilder(pool_, s, &tmp); - - - -//std::vector list_lengths = {42, 43}; -//std::vector list_offsets = {142, 143}; -//410 ListBuilder* list_vb = checked_cast(builder_->field_builder(0)); - Int32Builder* int_vb = checked_cast(builder_->field_builder(0)); - Int32Builder* int_vb2 = checked_cast(builder_->field_builder(1)); -//420 ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(), -//421 list_is_valid.data())); - - int_vb->UnsafeAppend(42); - int_vb->UnsafeAppend(43); - int_vb2->UnsafeAppend(142); - int_vb2->UnsafeAppend(143); -*/ -} - FORCE_INLINE const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len, gdv_int32 repeat_number, gdv_int32* out_len) { diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 15fba4867e650..117b27b2808dd 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -43,13 +43,6 @@ using gdv_utf8 = char*; using gdv_binary = char*; using gdv_day_time_interval = int64_t; -struct GeoStruct { - double lattitude; - double longitude; -}; - -using gdv_struct = GeoStruct; - #ifdef GANDIVA_UNIT_TEST // unit tests may be compiled without O2, so inlining may not happen. 
#define FORCE_INLINE @@ -473,11 +466,6 @@ gdv_int64 truncate_int64_int32(gdv_int64 in, gdv_int32 out_scale); const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len, gdv_int32 repeat_times, gdv_int32* out_len); -const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float64 lat, gdv_float64 lon, - gdv_int32* out_len); - -const gdv_struct gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len); - const char* substr_utf8_int64_int64(gdv_int64 context, const char* input, gdv_int32 in_len, gdv_int64 offset64, gdv_int64 length, gdv_int32* out_len); diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index c74a1b7271788..9c135ea8065d4 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -136,10 +136,6 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) gandiva_data_type->set_type(types::GandivaType::INTERVAL); gandiva_data_type->set_intervaltype(types::IntervalType::DAY_TIME); break; - //LR TODO - case arrow::Type::STRUCT: - gandiva_data_type->set_type(types::GandivaType::STRUCT); - break; case arrow::Type::LIST: gandiva_data_type->set_type(types::GandivaType::LIST); break; diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 41b2593d501cd..e6852e2198ec7 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -297,9 +297,8 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { return ProtoTypeToTimestamp(ext_type); case types::INTERVAL: return ProtoTypeToInterval(ext_type); - case types::STRUCT: - return arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)}); case types::LIST: + //LR TODO return arrow::list(arrow::int32()); //return 
arrow::list(arrow::utf8()); case types::FIXED_SIZE_BINARY: diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 7d677927f0ced..fe82c25736aac 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -31,7 +31,6 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.message.ArrowBuffer; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Schema; @@ -377,10 +376,6 @@ private void evaluate(int numRows, List buffers, List buf // save vector to allow for resizing. resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; } - if (valueVector instanceof StructVector) { - outAddrs[idx] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().memoryAddress(); - outSizes[idx++] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().capacity(); - } if (valueVector instanceof ListVector) { hasVariableWidthColumns = true; resizableListVectors[outColumnIdx] = (ListVector) valueVector; From 980972f52aa9ae210defa3ab14e233238cbd6eef Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Tue, 31 Oct 2023 10:28:11 -0700 Subject: [PATCH 27/46] Working functions end to end. 
--- cpp/src/gandiva/array_ops.cc | 355 +++++++++++++++++- cpp/src/gandiva/array_ops.h | 44 ++- cpp/src/gandiva/engine.cc | 1 + cpp/src/gandiva/function_registry_array.cc | 26 +- cpp/src/gandiva/function_registry_string.cc | 2 - cpp/src/gandiva/function_signature.cc | 2 + cpp/src/gandiva/llvm_generator.cc | 34 +- cpp/src/gandiva/llvm_types.h | 6 +- cpp/src/gandiva/tree_expr_builder.cc | 1 + java/gandiva/pom.xml | 5 + java/gandiva/proto/Types.proto | 1 + .../main/cpp/expression_registry_helper.cc | 31 +- java/gandiva/src/main/cpp/jni_common.cc | 55 ++- .../gandiva/evaluator/ExpressionRegistry.java | 34 +- .../gandiva/evaluator/FunctionSignature.java | 13 +- .../gandiva/expression/ArrowTypeHelper.java | 61 ++- .../gandiva/expression/FunctionNode.java | 33 +- .../arrow/gandiva/expression/IfNode.java | 2 +- .../arrow/gandiva/expression/NullNode.java | 2 +- .../arrow/gandiva/expression/TreeBuilder.java | 52 ++- .../evaluator/ExpressionRegistryTest.java | 6 +- 21 files changed, 686 insertions(+), 80 deletions(-) diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index b5d1a57e5fe5f..96052f58c92de 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -27,13 +27,25 @@ #include "gandiva/engine.h" #include "gandiva/exported_funcs.h" +//LR TODO +namespace { + bool floatsEqual(float l, float r) { + return (l - r < 0.001 && r - l < 0.001); + } + + bool doublesEqual(double l, double r) { + return (l - r < 0.001 && r - l < 0.001); + } +} /// Stub functions that can be accessed from LLVM or the pre-compiled library. 
extern "C" { bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len, - const char* contains_data, int32_t contains_data_length) { + const char* contains_data, int32_t contains_data_length, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row) { for (int i = 0; i < entry_offsets_len; i++) { int32_t entry_len = *(entry_child_offsets + i + 1) - *(entry_child_offsets + i); if (entry_len != contains_data_length) { @@ -66,8 +78,8 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { continue; } - int32_t entry_val = *(entry_buf + i); - if (entry_val == contains_data) { + int32_t entry_item = *(entry_buf + i); + if (entry_item == contains_data) { return true; } } @@ -92,8 +104,60 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { continue; } - int64_t entry_len = *(entry_buf + (i*2)); - if (entry_len == contains_data) { + int64_t entry_item = *(entry_buf + (i)); + if (entry_item == contains_data) { + return true; + } + } + return false; +} + +bool array_float32_contains_float32(int64_t context_ptr, const float* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + float contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row) { + if (!combined_row_validity) { + *valid_row = false; + return false; + } + *valid_row = true; + + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = validity_index_var - entry_len; + + for (int i = 0; i < entry_len; i++) { + if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + continue; + } + float entry_item = *(entry_buf + (i)); + if 
(floatsEqual(entry_item, contains_data)) { + return true; + } + } + return false; +} + +bool array_float64_contains_float64(int64_t context_ptr, const double* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + double contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row) { + if (!combined_row_validity) { + *valid_row = false; + return false; + } + *valid_row = true; + + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = validity_index_var - entry_len; + + for (int i = 0; i < entry_len; i++) { + if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + continue; + } + double entry_item = *(entry_buf + (i)); + if (doublesEqual(entry_item, contains_data)) { return true; } } @@ -152,11 +216,177 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, return reinterpret_cast(ret); } -int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, - int32_t* entry_child_offsets, int32_t entry_offsets_len) { - int64_t res = entry_offsets_len; - return res; + + +int64_t* array_int64_remove(int64_t context_ptr, const int64_t* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + int64_t remove_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ + + std::vector newInts; + + //LR TODO not sure what entry_validWhat is. + //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = 0; + //The validity index already has the current row length added to it, so decrement. 
+ validityBitIndex = validity_index_var - entry_len; + entry_validWhat = true; + std::vector outValid; + std::cout << "LR TODO entry length is " << entry_len << std::endl; + for (int32_t i = 0; i < entry_len; i++) { + int64_t entry_item = *(entry_buf + (i)); + std::cout << "LR TODO checking entry item " << entry_item << std::endl; + if (entry_item == remove_data) { + //Do not add the item to remove. + } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + outValid.push_back(false); + newInts.push_back(0); + std::cout << "LR TODO entry item is null" << std::endl; + } else { + outValid.push_back(true); + newInts.push_back(entry_item); + } + } + + *out_len = (int)newInts.size(); + + //Since this function can remove values we don't know the length ahead of time. + //LR TODO divide by 8 and ensure at least 1? + uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); + for (int i = 0; i < outValid.size(); i++) { + arrow::bit_util::SetBitTo(validRet, i, outValid[i]); + std::cout << "LR TODO Setting validty " << i << " to " << outValid[i] << std::endl; + } + + int32_t outBufferLength = (int)*out_len * sizeof(int64_t); + //length is number of items, but buffers must account for byte size. + uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); + memcpy(ret, newInts.data(), outBufferLength); + + //LR TODO + for (int k = 0; k < *out_len; k++) { + std::cout << "LR TODO the 64 data is " << ((int64_t*)ret)[k] << std::endl; + } + + *valid_row = true; + if (!combined_row_validity) { + *out_len = 0; + *valid_row = false; //this one is what works for the top level validity. 
+ entry_validWhat = false; + } + *valid_ptr = reinterpret_cast(validRet); + return reinterpret_cast(ret); +} + +float* array_float32_remove(int64_t context_ptr, const float* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + float remove_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ + + std::vector newArray; + + //LR TODO not sure what entry_validWhat is. + //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = 0; + //The validity index already has the current row length added to it, so decrement. + validityBitIndex = validity_index_var - entry_len; + entry_validWhat = true; + std::vector outValid; + for (int i = 0; i < entry_len; i++) { + float entry_item = *(entry_buf + (i * 1)); + //LR TODO comparison tolerance? + if (floatsEqual(entry_item, remove_data)) { + //Do not add the item to remove. + } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + outValid.push_back(false); + newArray.push_back(0); + } else { + outValid.push_back(true); + newArray.push_back(entry_item); + } + } + + *out_len = (int)newArray.size(); + + //Since this function can remove values we don't know the length ahead of time. + //LR TODO divide by 8 and ensure at least 1? + uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); + for (int i = 0; i < outValid.size(); i++) { + arrow::bit_util::SetBitTo(validRet, i, outValid[i]); + } + + int32_t outBufferLength = (int)*out_len * sizeof(float); + //length is number of items, but buffers must account for byte size. 
+ uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); + memcpy(ret, newArray.data(), outBufferLength); + *valid_row = true; + if (!combined_row_validity) { + *out_len = 0; + *valid_row = false; //this one is what works for the top level validity. + entry_validWhat = false; + } + *valid_ptr = reinterpret_cast(validRet); + return reinterpret_cast(ret); +} + + +double* array_float64_remove(int64_t context_ptr, const double* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + double remove_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ + + std::vector newArray; + + //LR TODO not sure what entry_validWhat is. + //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = 0; + //The validity index already has the current row length added to it, so decrement. + validityBitIndex = validity_index_var - entry_len; + entry_validWhat = true; + std::vector outValid; + for (int32_t i = 0; i < entry_len; i++) { + double entry_item = *(entry_buf + (i * 1)); + //LR TODO comparison tolerance? + if (doublesEqual(entry_item, remove_data)) { + //Do not add the item to remove. + } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + outValid.push_back(false); + newArray.push_back(0.0); + } else { + outValid.push_back(true); + newArray.push_back(entry_item); + } + } + + *out_len = (int)newArray.size(); + + //Since this function can remove values we don't know the length ahead of time. + //LR TODO divide by 8 and ensure at least 1? 
+ uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); + for (int i = 0; i < outValid.size(); i++) { + arrow::bit_util::SetBitTo(validRet, i, outValid[i]); + } + + int32_t outBufferLength = (int)*out_len * sizeof(double); + //length is number of items, but buffers must account for byte size. + uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); + memcpy(ret, newArray.data(), outBufferLength); + *valid_row = true; + if (!combined_row_validity) { + *out_len = 0; + *valid_row = false; //this one is what works for the top level validity. + entry_validWhat = false; + } + *valid_ptr = reinterpret_cast(validRet); + return reinterpret_cast(ret); } + } namespace gandiva { @@ -164,20 +394,21 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { std::vector args; auto types = engine->types(); - args = {types->i64_type(), // int64_t execution_context - types->i8_ptr_type(), // int8_t* data ptr - types->i32_ptr_type(), // int32_t* child offsets ptr - types->i32_type()}; // int32_t child offsets length - - engine->AddGlobalMappingForFunc("array_utf8_length", types->i64_type() /*return_type*/, - args, reinterpret_cast(array_utf8_length)); + //Array contains. args = {types->i64_type(), // int64_t execution_context types->i8_ptr_type(), // int8_t* data ptr types->i32_ptr_type(), // int32_t* child offsets ptr types->i32_type(), // int32_t child offsets length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity types->i8_ptr_type(), // const char* contains data buf - types->i32_type()}; // int32_t contains data length + types->i32_type(), // int32_t contains data length + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. 
+ types->i1_ptr_type() //output validity for the row + }; engine->AddGlobalMappingForFunc("array_utf8_contains_utf8", types->i1_type() /*return_type*/, args, @@ -215,6 +446,38 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(array_int64_contains_int64)); + args = {types->i64_type(), // int64_t execution_context + types->float_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t data length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity + types->float_type(), // int32_t value to check for + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. + types->i1_ptr_type() //output validity for the row + }; + + engine->AddGlobalMappingForFunc("array_float32_contains_float32", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_float32_contains_float32)); + + args = {types->i64_type(), // int64_t execution_context + types->double_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t data length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity + types->double_type(), // int32_t value to check for + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. + types->i1_ptr_type() //output validity for the row + }; + + engine->AddGlobalMappingForFunc("array_float64_contains_float64", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_float64_contains_float64)); + //Array remove. 
args = {types->i64_type(), // int64_t execution_context types->i32_ptr_type(), // int8_t* input data ptr types->i32_type(), // int32_t input length @@ -229,9 +492,65 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { types->i32_ptr_type() //output pointer to new validity buffer }; - engine->AddGlobalMappingForFunc("array_int32_remove", types->i32_ptr_type(), args, reinterpret_cast(array_int32_remove)); + + args = {types->i64_type(), // int64_t execution_context + types->i64_ptr_type(), // int8_t* input data ptr + types->i32_type(), // int32_t input length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity + types->i64_type(), //value to remove from input + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. + types->i1_ptr_type(), //output validity for the row + types->i32_ptr_type(), // output array length + types->i32_ptr_type() //output pointer to new validity buffer + + }; + + engine->AddGlobalMappingForFunc("array_int64_remove", + types->i64_ptr_type(), args, + reinterpret_cast(array_int64_remove)); + + args = {types->i64_type(), // int64_t execution_context + types->float_ptr_type(), // float* input data ptr + types->i32_type(), // int32_t input length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity + types->float_type(), //value to remove from input + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. 
+ types->i1_ptr_type(), //output validity for the row + types->i32_ptr_type(), // output array length + types->i32_ptr_type() //output pointer to new validity buffer + + }; + + engine->AddGlobalMappingForFunc("array_float32_remove", + types->float_ptr_type(), args, + reinterpret_cast(array_float32_remove)); + + args = {types->i64_type(), // int64_t execution_context + types->double_ptr_type(), // int8_t* input data ptr + types->i32_type(), // int32_t input length + types->i32_ptr_type(), // input validity buffer + types->i1_type(), // bool input row validity + types->double_type(), //value to remove from input + types->i1_type(), // bool validity --Needed? + types->i64_type(), //in loop var --Needed? + types->i64_type(), //in validity_index_var index into the valdity vector for the current row. + types->i1_ptr_type(), //output validity for the row + types->i32_ptr_type(), // output array length + types->i32_ptr_type() //output pointer to new validity buffer + + }; + + engine->AddGlobalMappingForFunc("array_float64_remove", + types->double_ptr_type(), args, + reinterpret_cast(array_float64_remove)); } } // namespace gandiva diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index 8d51b6e09f7f7..2a7d1448a9af4 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -30,10 +30,9 @@ extern "C" { GANDIVA_EXPORT bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, int32_t* entry_child_offsets, int32_t entry_offsets_len, - const char* contains_data, int32_t contains_data_length); -GANDIVA_EXPORT -int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, - int32_t* entry_child_offsets, int32_t entry_offsets_len); + const char* contains_data, int32_t contains_data_length, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row); GANDIVA_EXPORT bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool 
combined_row_validity, @@ -47,11 +46,46 @@ bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int64_t loop_var, int64_t validity_index_var, bool* valid_buf); +GANDIVA_EXPORT +bool array_float32_contains_float32(int64_t context_ptr, const float* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + float contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_buf); + +GANDIVA_EXPORT +bool array_float64_contains_float64(int64_t context_ptr, const double* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + double contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_buf); + GANDIVA_EXPORT int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, int32_t remove_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, - bool* valid_buf, int32_t* out_len, int32_t** valid_ptr); + bool* valid_row, int32_t* out_len, int32_t** valid_ptr); + +GANDIVA_EXPORT +int64_t* array_int64_remove(int64_t context_ptr, const int64_t* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + int64_t remove_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row, int32_t* out_len, int32_t** valid_ptr); + +GANDIVA_EXPORT +float* array_float32_remove(int64_t context_ptr, const float* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + float remove_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row, int32_t* out_len, int32_t** valid_ptr); + +GANDIVA_EXPORT +double* array_float64_remove(int64_t context_ptr, const double* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + double remove_data, bool 
entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row, int32_t* out_len, int32_t** valid_ptr); } diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 2033919cde2b3..f8cfa8b54a60d 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -300,6 +300,7 @@ Status Engine::FinalizeModule() { if (!cached_) { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); + //LR TODO //LR Turning this off seems to provide better error messages with compilation/generation failures. //if (optimize_) { if (false) { diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index 439b275d0ace2..015c8e97bfb53 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -23,17 +23,35 @@ namespace gandiva { std::vector GetArrayFunctionRegistry() { static std::vector array_fn_registry_ = { NativeFunction("array_containsGandiva", {}, DataTypeVector{list(utf8()), utf8()}, - boolean(), kResultNullIfNull, "array_utf8_contains_utf8", - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), - NativeFunction("array_lengthGandiva", {}, DataTypeVector{list(utf8())}, int64(), - kResultNullIfNull, "array_utf8_length", + boolean(), kResultNullInternal, "array_utf8_contains_utf8", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int32()), int32()}, boolean(), kResultNullInternal, "array_int32_contains_int32", NativeFunction::kNeedsContext), + NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int64()), int64()}, + boolean(), kResultNullInternal, "array_int64_contains_int64", + NativeFunction::kNeedsContext), + NativeFunction("array_containsGandiva", {}, DataTypeVector{list(float32()), float32()}, + boolean(), kResultNullInternal, "array_float32_contains_float32", + NativeFunction::kNeedsContext), + NativeFunction("array_containsGandiva", {}, 
DataTypeVector{list(float64()), float64()}, + boolean(), kResultNullInternal, "array_float64_contains_float64", + NativeFunction::kNeedsContext), + NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int32()), int32()}, list(int32()), kResultNullInternal, "array_int32_remove", NativeFunction::kNeedsContext), + NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int64()), int64()}, + list(int64()), kResultNullInternal, "array_int64_remove", + NativeFunction::kNeedsContext), + NativeFunction("array_removeGandiva", {}, DataTypeVector{list(float32()), float32()}, + list(float32()), kResultNullInternal, "array_float32_remove", + NativeFunction::kNeedsContext), + NativeFunction("array_removeGandiva", {}, DataTypeVector{list(float64()), float64()}, + list(float64()), kResultNullInternal, "array_float64_remove", + NativeFunction::kNeedsContext), }; return array_fn_registry_; } diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index d93757b40cfd0..2bc6936d77b3c 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -307,8 +307,6 @@ std::vector GetStringFunctionRegistry() { // concat treats null inputs as empty strings whereas concatOperator returns null if // one of the inputs is null - NativeFunction("concatGandiva", {}, DataTypeVector{utf8(), utf8()}, utf8(), - kResultNullNever, "concat_utf8_utf8", NativeFunction::kNeedsContext), NativeFunction("concat", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullNever, "concat_utf8_utf8", NativeFunction::kNeedsContext), NativeFunction("concat", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(), diff --git a/cpp/src/gandiva/function_signature.cc b/cpp/src/gandiva/function_signature.cc index 6dc6416178e15..8c086f5ee33a4 100644 --- a/cpp/src/gandiva/function_signature.cc +++ b/cpp/src/gandiva/function_signature.cc @@ -18,6 +18,7 @@ #include "gandiva/function_signature.h" #include +#include #include 
#include #include @@ -58,6 +59,7 @@ FunctionSignature::FunctionSignature(std::string base_name, DataTypeVector param : base_name_(std::move(base_name)), param_types_(std::move(param_types)), ret_type_(std::move(ret_type)) { + std::cout << "LR TODO creating FunctionSignature " << ret_type_->ToString() << std::endl; DCHECK_GT(base_name_.length(), 0); for (auto it = param_types_.begin(); it != param_types_.end(); it++) { DCHECK(*it); diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 9d8786e28c5bb..f9d993403bf42 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -113,6 +113,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode ARROW_RETURN_NOT_OK(Add(expr, output)); } +std::cout << "LR TODO LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; // Compile and inject into the process' memory the generated function. ARROW_RETURN_NOT_OK(engine_->FinalizeModule()); @@ -590,6 +591,7 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, llvm::Type* ret_type, const std::vector& args) { + std::cout << "LR TODO AddFunctionCall " << full_name << " ret type is " << printType(ret_type) << std::endl; // find the llvm function. llvm::Function* fn = module()->getFunction(full_name); DCHECK_NE(fn, nullptr) << "missing function " << full_name; @@ -600,6 +602,10 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, ADD_TRACE("invoke native fn " + full_name); } + std::cout << "LR TODO AddFunctionCall 2" << std::endl; + for (llvm::Value* lv : args) { + std::cout << "LR TODO arg is " << printType(lv) << std::endl; + } // build a call to the llvm function. 
llvm::Value* value; if (ret_type->isVoidTy()) { @@ -607,7 +613,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, value = ir_builder()->CreateCall(fn, args); } else { value = ir_builder()->CreateCall(fn, args, full_name); - +std::cout << "LR TODO AddFunctionCall 3" << std::endl; std::string str; llvm::raw_string_ostream output(str); std::string str2; @@ -699,22 +705,34 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - arrow::Type::type at = arrow::Type::INT32; - type = types->IRType(at); + std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + + auto dt = dex.FieldType(); + if (dt->id() == arrow::Type::LIST) { + if (dt->num_fields() > 0) { + std::cout << "LR TODO creating listtype" << std::endl; + std::cout << "LR TODO listtype id=" << dt->fields()[0]->type()->id() << std::endl; + type = types->IRType(dt->fields()[0]->type()->id() ); + } + } + std::cout << "LR TODO using type " << printType(type) << std::endl; + + arrow::Type::type at32 = arrow::Type::INT32; + auto type32 = types->IRType(at32); // compute list len from the offsets array. 
llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); - slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); - llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); + slot = builder->CreateGEP(type32, offsets_slot_ref, offsets_slot_index); + llvm::Value* offset_start = builder->CreateLoad(type32, slot, "offset_start"); // => offset_end = offsets[loop_var + 1] llvm::Value* offsets_slot_index_next = builder->CreateAdd( offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); - slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index_next); - llvm::Value* offset_end = builder->CreateLoad(type,slot, "offset_end"); + slot = builder->CreateGEP(type32, offsets_slot_ref, offsets_slot_index_next); + llvm::Value* offset_end = builder->CreateLoad(type32, slot, "offset_end"); // => offsets_len_value = offset_end - offset_start llvm::Value* list_len = builder->CreateSub(offset_end, offset_start, "offsets_len"); @@ -738,7 +756,7 @@ llvm::Value* updated_validity_index_var = builder->CreateAdd( llvm::Value* b_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.ValidityIdx())); llvm::Value* b_slot_ref = GetBufferReference(dex.ChildValidityIdx(), kBufferTypeValidity, dex.Field()); - llvm::Value* validity = builder->CreateGEP(type, b_slot_ref, b_slot_index); + llvm::Value* validity = builder->CreateGEP(type32, b_slot_ref, b_slot_index); std::string str3 = "validity:"; if (validity) { diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index 5e43eb74abcdf..be31954f0c7b1 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -127,10 +127,10 @@ class GANDIVA_EXPORT LLVMTypes { // not support nested list if (data_type->id() == arrow::Type::LIST) { //LR TODO - //std::cout << "LR Returning list type as type " << 
data_type->field(0)->type()->id()<< " for IR " << std::endl; - //return IRType(data_type->field(0)->type()->id()); + std::cout << "LR Returning list type as type " << data_type->field(0)->type()->id()<< " for IR " << std::endl; + return IRType(data_type->field(0)->type()->id()); //return IRType(data_type->id()); - return i32_ptr_type(); + //return i32_ptr_type(); } return IRType(data_type->id()); } diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 1946aadfef16f..08ffb8c192cb9 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -161,6 +161,7 @@ ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function, auto node = MakeField(field); field_nodes.push_back(node); } + std::cout << "LR TODO creating TreeExpression " << out_field->type()->ToString() << std::endl; auto func_node = MakeFunction(function, field_nodes, out_field->type()); return MakeExpression(func_node, out_field); } diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index bed66b427e625..d2df653d5e1fe 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -30,6 +30,11 @@ ../../../cpp/release-build + + org.apache.arrow + arrow-format + ${project.version} + org.apache.arrow arrow-memory-core diff --git a/java/gandiva/proto/Types.proto b/java/gandiva/proto/Types.proto index eb0d996b92e63..a5c4df474db37 100644 --- a/java/gandiva/proto/Types.proto +++ b/java/gandiva/proto/Types.proto @@ -85,6 +85,7 @@ message ExtGandivaType { optional TimeUnit timeUnit = 6; // used by TIME32/TIME64 optional string timeZone = 7; // used by TIMESTAMP optional IntervalType intervalType = 8; // used by INTERVAL + optional GandivaType listType = 9; //used by LIST } message Field { diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index 9c135ea8065d4..aba90a93fc87f 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ 
b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -136,9 +136,33 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) gandiva_data_type->set_type(types::GandivaType::INTERVAL); gandiva_data_type->set_intervaltype(types::IntervalType::DAY_TIME); break; - case arrow::Type::LIST: + case arrow::Type::LIST: { gandiva_data_type->set_type(types::GandivaType::LIST); + //LR TODO make a helper function + std::cout << "LR TODO creating listtype" << std::endl; + if (type->num_fields() <= 0) { + break; + } + std::cout << "LR TODO listtype id=" << type->fields()[0]->type()->id() << std::endl; + switch (type->fields()[0]->type()->id()) { + case arrow::Type::INT32: + gandiva_data_type->set_listtype(types::GandivaType::INT32); + break; + case arrow::Type::INT64: + gandiva_data_type->set_listtype(types::GandivaType::INT64); + break; + case arrow::Type::FLOAT: + gandiva_data_type->set_listtype(types::GandivaType::FLOAT); + break; + case arrow::Type::DOUBLE: + gandiva_data_type->set_listtype(types::GandivaType::DOUBLE); + break; + case arrow::Type::STRING: + gandiva_data_type->set_listtype(types::GandivaType::UTF8); + break; + } break; + } default: // un-supported types. test ensures that // when one of these are added build breaks. 
@@ -179,6 +203,11 @@ Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSu for (auto function = expr_registry.function_signature_begin(); function != expr_registry.function_signature_end(); function++) { + //LR TODO + printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).base_name().c_str()); + printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).ToString().c_str()); + fflush(stdout); + types::FunctionSignature* function_signature = gandiva_functions.add_function(); function_signature->set_name((*function).base_name()); types::ExtGandivaType* return_type = function_signature->mutable_returntype(); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index e6852e2198ec7..4a4ccda035375 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -198,6 +198,8 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { env->DeleteGlobalRef(cache_buf_ret_class_); } +DataTypePtr SimpleProtoTypeToDataType(const types::GandivaType& gandiva_type); + DataTypePtr ProtoTypeToTime32(const types::ExtGandivaType& ext_type) { switch (ext_type.timeunit()) { case types::SEC: @@ -250,8 +252,14 @@ DataTypePtr ProtoTypeToInterval(const types::ExtGandivaType& ext_type) { } } -DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { - switch (ext_type.type()) { +DataTypePtr ProtoTypeToList(const types::ExtGandivaType& ext_type) { + std::cout << "LR TODO 2 checking a field type " << ext_type.type() << " and it has listType:" << ext_type.listtype() << std::endl; + DataTypePtr childType = SimpleProtoTypeToDataType(ext_type.listtype()); + return arrow::list(childType); +} + +DataTypePtr SimpleProtoTypeToDataType(const types::GandivaType& gandiva_type) { + switch (gandiva_type) { case types::NONE: return arrow::null(); case types::BOOL: @@ -286,6 +294,16 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { return 
arrow::date32(); case types::DATE64: return arrow::date64(); + default: + std::cerr << "Unknown data type: " << gandiva_type << "\n"; + return nullptr; + } +} + + + +DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { + switch (ext_type.type()) { case types::DECIMAL: // TODO: error handling return arrow::decimal(ext_type.precision(), ext_type.scale()); @@ -298,25 +316,36 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { case types::INTERVAL: return ProtoTypeToInterval(ext_type); case types::LIST: - //LR TODO - return arrow::list(arrow::int32()); - //return arrow::list(arrow::utf8()); + return ProtoTypeToList(ext_type); case types::FIXED_SIZE_BINARY: case types::UNION: case types::DICTIONARY: case types::MAP: std::cerr << "Unhandled data type: " << ext_type.type() << "\n"; return nullptr; - default: - std::cerr << "Unknown data type: " << ext_type.type() << "\n"; + return SimpleProtoTypeToDataType(ext_type.type()); + } +} + +DataTypePtr ProtoTypeToDataType(const types::Field& f) { + const types::ExtGandivaType& ext_type = f.type(); + std::cout << "LR TODO checking a field type " << ext_type.type() << " and it has listType:" << ext_type.listtype() << std::endl; + if (ext_type.type() == types::LIST) { + if (f.children().size() > 0 && f.children()[0].type().type() != types::LIST) { + DataTypePtr childType = ProtoTypeToDataType(f.children()[0].type()); + return arrow::list(childType); + } + std::cerr << "Unhandled list data type: " << ext_type.type() << "\n"; return nullptr; + } else { + return ProtoTypeToDataType(ext_type); } } FieldPtr ProtoTypeToField(const types::Field& f) { const std::string& name = f.name(); - DataTypePtr type = ProtoTypeToDataType(f.type()); + DataTypePtr type = ProtoTypeToDataType(f); bool nullable = true; if (f.has_nullable()) { nullable = f.nullable(); @@ -350,7 +379,7 @@ NodePtr ProtoTypeToFnNode(const types::FunctionNode& node) { children.push_back(n); } - + DataTypePtr return_type = 
ProtoTypeToDataType(node.returntype()); if (return_type == nullptr) { std::cerr << "Unknown return type for function: " << name << "\n"; @@ -1109,7 +1138,15 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( //outBufJava->outerValidityBuffer = reinterpret_cast(out_bufs[0]); child_buffers.push_back(outBufJava); + //LR TODO + + std::cout << "LR Creating array for type: " << field->type()->ToString() << std::endl; std::shared_ptr dt2 = std::make_shared(); + if (field->type()->id() == arrow::Type::LIST && field->type()->num_fields() > 0) { + dt2 = field->type()->fields()[0]->type(); + } + std::cout << "LR using sub type: " << dt2->ToString() << std::endl; + auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); std::vector> kids; kids.push_back(array_data_child); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 39358a084ba98..80b61332e62e9 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -116,12 +116,13 @@ private static Set getSupportedFunctionsFromGandiva() throws String functionName = protoFunctionSignature.getName(); ArrowType returnType = getArrowType(protoFunctionSignature.getReturnType()); + ArrowType returnListType = getArrowTypeSimple(protoFunctionSignature.getReturnType().getListType()); List paramTypes = Lists.newArrayList(); for (ExtGandivaType type : protoFunctionSignature.getParamTypesList()) { paramTypes.add(getArrowType(type)); } FunctionSignature functionSignature = new FunctionSignature(functionName, - returnType, paramTypes); + returnType, returnListType, paramTypes); supportedTypes.add(functionSignature); } } catch (InvalidProtocolBufferException invalidProtException) { @@ -130,8 +131,8 @@ private static Set 
getSupportedFunctionsFromGandiva() throws return supportedTypes; } - private static ArrowType getArrowType(ExtGandivaType type) { - switch (type.getType().getNumber()) { + private static ArrowType getArrowTypeSimple(GandivaType type) { + switch (type.getNumber()) { case GandivaType.BOOL_VALUE: return ArrowType.Bool.INSTANCE; case GandivaType.UINT8_VALUE: @@ -164,20 +165,10 @@ private static ArrowType getArrowType(ExtGandivaType type) { return new ArrowType.Date(DateUnit.DAY); case GandivaType.DATE64_VALUE: return new ArrowType.Date(DateUnit.MILLISECOND); - case GandivaType.TIMESTAMP_VALUE: - return new ArrowType.Timestamp(mapArrowTimeUnit(type.getTimeUnit()), null); - case GandivaType.TIME32_VALUE: - return new ArrowType.Time(mapArrowTimeUnit(type.getTimeUnit()), - BIT_WIDTH_32); - case GandivaType.TIME64_VALUE: - return new ArrowType.Time(mapArrowTimeUnit(type.getTimeUnit()), - BIT_WIDTH_64); case GandivaType.NONE_VALUE: return new ArrowType.Null(); case GandivaType.DECIMAL_VALUE: return new ArrowType.Decimal(0, 0, 128); - case GandivaType.INTERVAL_VALUE: - return new ArrowType.Interval(mapArrowIntervalUnit(type.getIntervalType())); case GandivaType.STRUCT_VALUE: return new ArrowType.Struct(); case GandivaType.LIST_VALUE: @@ -192,6 +183,23 @@ private static ArrowType getArrowType(ExtGandivaType type) { return null; } + private static ArrowType getArrowType(ExtGandivaType type) { + switch (type.getType().getNumber()) { + case GandivaType.TIMESTAMP_VALUE: + return new ArrowType.Timestamp(mapArrowTimeUnit(type.getTimeUnit()), null); + case GandivaType.TIME32_VALUE: + return new ArrowType.Time(mapArrowTimeUnit(type.getTimeUnit()), + BIT_WIDTH_32); + case GandivaType.TIME64_VALUE: + return new ArrowType.Time(mapArrowTimeUnit(type.getTimeUnit()), + BIT_WIDTH_64); + case GandivaType.INTERVAL_VALUE: + return new ArrowType.Interval(mapArrowIntervalUnit(type.getIntervalType())); + default: + return getArrowTypeSimple(type.getType()); + } + } + private static TimeUnit 
mapArrowTimeUnit(GandivaTypes.TimeUnit timeUnit) { switch (timeUnit.getNumber()) { case GandivaTypes.TimeUnit.MICROSEC_VALUE: diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java index d01881843de47..e626efedd8d9f 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java @@ -30,12 +30,17 @@ public class FunctionSignature { private final String name; private final ArrowType returnType; + private final ArrowType returnListType; private final List paramTypes; public ArrowType getReturnType() { return returnType; } + public ArrowType getReturnListType() { + return returnListType; + } + public List getParamTypes() { return paramTypes; } @@ -48,11 +53,13 @@ public String getName() { * Ctor. * @param name - name of the function. * @param returnType - data type of return + * @param returnListType optional list type * @param paramTypes - data type of input args. 
*/ - public FunctionSignature(String name, ArrowType returnType, List paramTypes) { + public FunctionSignature(String name, ArrowType returnType, ArrowType returnListType, List paramTypes) { this.name = name; this.returnType = returnType; + this.returnListType = returnListType; this.paramTypes = paramTypes; } @@ -71,12 +78,13 @@ public boolean equals(Object signature) { final FunctionSignature other = (FunctionSignature) signature; return this.name.equalsIgnoreCase(other.name) && Objects.equal(this.returnType, other.returnType) && + Objects.equal(this.returnListType, other.returnListType) && Objects.equal(this.paramTypes, other.paramTypes); } @Override public int hashCode() { - return Objects.hashCode(this.name.toLowerCase(), this.returnType, this.paramTypes); + return Objects.hashCode(this.name.toLowerCase(), this.returnType, this.returnListType, this.paramTypes); } @Override @@ -84,6 +92,7 @@ public String toString() { return MoreObjects.toStringHelper(this) .add("name ", name) .add("return type ", returnType) + .add("return list type", returnListType) .add("param types ", paramTypes) .toString(); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 47d97c6b0dca8..9e84bc6d05561 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -149,7 +149,16 @@ private static void initArrowTypeStruct(ArrowType.Struct structType, } private static void initArrowTypeList(ArrowType.List listType, - GandivaTypes.ExtGandivaType.Builder builder) { + ArrowType subType, + GandivaTypes.ExtGandivaType.Builder builder) throws GandivaException { + /*if (f != null && f.getChildren().size() > 0 && f.getChildren().get(0) + .getType().getTypeID().getFlatbufID() != Type.List) { + 
//builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null)); + builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null, builder).getType()); + }*/ + if (subType != null) { + builder.setListType(arrowTypeToProtobuf(subType).getType()); + } builder.setType(GandivaTypes.GandivaType.LIST); } @@ -237,11 +246,13 @@ private static void initArrowTypeInterval(ArrowType.Interval interval, * Converts an arrow type into a protobuf. * * @param arrowType Arrow type to be converted + * @param subType optional arrow type for list/complex types + * @param builder the builder to use * @return Protobuf representing the arrow type */ - public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowType) + public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowType, ArrowType subType, + GandivaTypes.ExtGandivaType.Builder builder) throws GandivaException { - GandivaTypes.ExtGandivaType.Builder builder = GandivaTypes.ExtGandivaType.newBuilder(); byte typeId = arrowType.getTypeID().getFlatbufID(); switch (typeId) { @@ -294,7 +305,7 @@ public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowTyp break; } case Type.List: { // 12 - ArrowTypeHelper.initArrowTypeList((ArrowType.List) arrowType, builder); + ArrowTypeHelper.initArrowTypeList((ArrowType.List) arrowType, subType, builder); break; } case Type.Struct_: { // 13 @@ -327,6 +338,31 @@ public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowTyp return builder.build(); } + + /** + * Converts an arrow type into a protobuf. 
+ * + * @param arrowType Arrow type to be converted + * @param f field optional for list/complex types + * @return Protobuf representing the arrow type + */ + public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowType, ArrowType f) + throws GandivaException { + GandivaTypes.ExtGandivaType.Builder builder = GandivaTypes.ExtGandivaType.newBuilder(); + return arrowTypeToProtobuf(arrowType, f, builder); + } + + /** + * Converts an arrow type into a protobuf. + * + * @param arrowType Arrow type to be converted + * @return Protobuf representing the arrow type + */ + public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowType) + throws GandivaException { + return arrowTypeToProtobuf(arrowType, null); + } + /** * Converts an arrow field object to a protobuf. * @param field Arrow field to be converted @@ -335,12 +371,25 @@ public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowTyp public static GandivaTypes.Field arrowFieldToProtobuf(Field field) throws GandivaException { GandivaTypes.Field.Builder builder = GandivaTypes.Field.newBuilder(); builder.setName(field.getName()); - builder.setType(ArrowTypeHelper.arrowTypeToProtobuf(field.getType())); builder.setNullable(field.isNullable()); + //LR TODO + ArrowType subType = null; + if (field.getChildren().size() > 0 && field.getChildren().get(0) + .getType().getTypeID().getFlatbufID() != Type.List) { + //builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null)); + subType = field.getChildren().get(0).getType(); + } + + builder.setType(ArrowTypeHelper.arrowTypeToProtobuf(field.getType(), subType)); for (Field child : field.getChildren()) { - builder.addChildren(ArrowTypeHelper.arrowFieldToProtobuf(child)); + System.out.println("LR TODO arrowFieldToProtobuf child field id is " + child.getType().getTypeID() ); + if (child.getType() != ArrowType.Null.INSTANCE) { + System.out.println("LR TODO adding child=" + child.getName() + " type=" + 
child.getType()); + builder.addChildren(ArrowTypeHelper.arrowFieldToProtobuf(child)); + } } + return builder.build(); } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java index ead1e146d5d8c..14d6286a3282c 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java @@ -19,9 +19,12 @@ import java.util.List; +import org.apache.arrow.flatbuf.Type; import org.apache.arrow.gandiva.exceptions.GandivaException; import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; + /** * Node representing an arbitrary function in an expression. @@ -30,18 +33,42 @@ class FunctionNode implements TreeNode { private final String function; private final List children; private final ArrowType retType; + private final ArrowType retListType; - FunctionNode(String function, List children, ArrowType retType) { + FunctionNode(String function, List children, Field inField) { + this.function = function; + this.children = children; + this.retType = inField.getType(); + if (inField.getChildren().size() > 0 && inField.getChildren().get(0) + .getType().getTypeID().getFlatbufID() != Type.List) { + //builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null)); + this.retListType = inField.getChildren().get(0).getType(); + } else { + this.retListType = null; + } + + } + + FunctionNode(String function, List children, ArrowType inType) { + this.function = function; + this.children = children; + this.retType = inType; + this.retListType = null; + } + + FunctionNode(String function, List children, ArrowType inType, ArrowType listType) { this.function = function; this.children = children; - this.retType = retType; + this.retType = inType; + 
this.retListType = listType; } @Override public GandivaTypes.TreeNode toProtobuf() throws GandivaException { GandivaTypes.FunctionNode.Builder fnNode = GandivaTypes.FunctionNode.newBuilder(); fnNode.setFunctionName(function); - fnNode.setReturnType(ArrowTypeHelper.arrowTypeToProtobuf(retType)); + System.out.println("LR TODO retType, retListType)=" + retType + "==" + retListType); + fnNode.setReturnType(ArrowTypeHelper.arrowTypeToProtobuf(retType, retListType)); for (TreeNode arg : children) { fnNode.addInArgs(arg.toProtobuf()); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java index 19f9095fb7626..db97675d8a298 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java @@ -43,7 +43,7 @@ public GandivaTypes.TreeNode toProtobuf() throws GandivaException { ifNodeBuilder.setCond(condition.toProtobuf()); ifNodeBuilder.setThenNode(thenNode.toProtobuf()); ifNodeBuilder.setElseNode(elseNode.toProtobuf()); - ifNodeBuilder.setReturnType(ArrowTypeHelper.arrowTypeToProtobuf(retType)); + ifNodeBuilder.setReturnType(ArrowTypeHelper.arrowTypeToProtobuf(retType, null)); GandivaTypes.TreeNode.Builder builder = GandivaTypes.TreeNode.newBuilder(); builder.setIfNode(ifNodeBuilder.build()); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java index a8e7d6f82e522..caeefe66fd76b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java @@ -32,7 +32,7 @@ class NullNode implements TreeNode { @Override public GandivaTypes.TreeNode toProtobuf() throws GandivaException { GandivaTypes.NullNode.Builder nullNode = 
GandivaTypes.NullNode.newBuilder(); - nullNode.setType(ArrowTypeHelper.arrowTypeToProtobuf(type)); + nullNode.setType(ArrowTypeHelper.arrowTypeToProtobuf(type, null)); GandivaTypes.TreeNode.Builder builder = GandivaTypes.TreeNode.newBuilder(); builder.setNullNode(nullNode.build()); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index 8656e886aae24..f9f2a4cd775b3 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -94,6 +94,56 @@ public static TreeNode makeField(Field field) { public static TreeNode makeFunction(String function, List children, ArrowType retType) { + System.out.println("LR TODO TreeNode makeFunction Type"); + StackTraceElement[] elements = Thread.currentThread().getStackTrace(); + for (int i = 1; i < elements.length; i++) { + StackTraceElement s = elements[i]; + System.out.println("\tat " + s.getClassName() + "." + s.getMethodName() + + "(" + s.getFileName() + ":" + s.getLineNumber() + ")"); + } + return new FunctionNode(function, children, retType); + } + + /** + * Invoke this function to create a node representing a function. + * + * @param function Name of the function, e.g. add + * @param children The arguments to the function + * @param retType The type of the return value of the operator + * @param listType The type of the list return value of the operator + * @return Node representing a function + */ + public static TreeNode makeFunction(String function, + List children, + ArrowType retType, ArrowType listType) { + System.out.println("LR TODO TreeNode makeFunction Type2"); + StackTraceElement[] elements = Thread.currentThread().getStackTrace(); + for (int i = 1; i < elements.length; i++) { + StackTraceElement s = elements[i]; + System.out.println("\tat " + s.getClassName() + "." 
+ s.getMethodName() + + "(" + s.getFileName() + ":" + s.getLineNumber() + ")"); + } + return new FunctionNode(function, children, retType, listType); + } + + /** + * Invoke this function to create a node representing a function. + * + * @param function Name of the function, e.g. add + * @param children The arguments to the function + * @param retType The field of the return value of the operator, could be a complex type. + * @return Node representing a function + */ + public static TreeNode makeFunction(String function, + List children, + Field retType) { + System.out.println("LR TODO TreeNode makeFunction Field"); + StackTraceElement[] elements = Thread.currentThread().getStackTrace(); + for (int i = 1; i < elements.length; i++) { + StackTraceElement s = elements[i]; + System.out.println("\tat " + s.getClassName() + "." + s.getMethodName() + + "(" + s.getFileName() + ":" + s.getLineNumber() + ")"); + } return new FunctionNode(function, children, retType); } @@ -161,7 +211,7 @@ public static ExpressionTree makeExpression(String function, children.add(makeField(field)); } - TreeNode root = makeFunction(function, children, resultField.getType()); + TreeNode root = makeFunction(function, children, resultField); return makeExpression(root, resultField); } diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java index a51ac09ba1a51..8853945c6d4d4 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java @@ -39,7 +39,7 @@ public void testTypes() throws GandivaException { public void testFunctions() throws GandivaException { ArrowType.Int uint8 = new ArrowType.Int(8, false); FunctionSignature signature = - new FunctionSignature("add", uint8, Lists.newArrayList(uint8, uint8)); + new 
FunctionSignature("add", uint8, null, Lists.newArrayList(uint8, uint8)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } @@ -48,7 +48,7 @@ public void testFunctions() throws GandivaException { public void testFunctionAliases() throws GandivaException { ArrowType.Int int64 = new ArrowType.Int(64, true); FunctionSignature signature = - new FunctionSignature("modulo", int64, Lists.newArrayList(int64, int64)); + new FunctionSignature("modulo", int64, null, Lists.newArrayList(int64, int64)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } @@ -58,7 +58,7 @@ public void testCaseInsensitiveFunctionName() throws GandivaException { ArrowType.Utf8 utf8 = new ArrowType.Utf8(); ArrowType.Int int64 = new ArrowType.Int(64, true); FunctionSignature signature = - new FunctionSignature("castvarchar", utf8, Lists.newArrayList(utf8, int64)); + new FunctionSignature("castvarchar", utf8, null, Lists.newArrayList(utf8, int64)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } From ff0f9ab55de45c03d5f226b5e8cf591cbc8d2530 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 1 Nov 2023 11:11:31 -0700 Subject: [PATCH 28/46] More cleanup. 
--- build_release.sh | 34 -- build_testing.sh | 38 -- cpp/src/arrow/type_fwd.h | 10 +- cpp/src/gandiva/array_ops.cc | 401 +++++------------- cpp/src/gandiva/array_ops.h | 7 +- cpp/src/gandiva/array_ops_test.cc | 26 -- cpp/src/gandiva/bitmap_accumulator.h | 1 - cpp/src/gandiva/compiled_expr.h | 3 +- cpp/src/gandiva/engine.cc | 5 +- cpp/src/gandiva/function_registry.cc | 1 - cpp/src/gandiva/function_registry_array.cc | 21 +- cpp/src/gandiva/function_signature.cc | 1 - cpp/src/gandiva/llvm_generator.cc | 73 +--- cpp/src/gandiva/llvm_generator_test.cc | 68 --- cpp/src/gandiva/llvm_types.h | 9 - cpp/src/gandiva/tests/list_test.cc | 354 +--------------- cpp/src/gandiva/tree_expr_builder.cc | 2 +- .../main/cpp/expression_registry_helper.cc | 34 +- java/gandiva/src/main/cpp/jni_common.cc | 20 +- .../gandiva/evaluator/ExpressionRegistry.java | 2 - .../arrow/gandiva/evaluator/Projector.java | 2 - .../gandiva/expression/ArrowTypeHelper.java | 5 - .../gandiva/expression/FunctionNode.java | 1 - .../arrow/gandiva/expression/TreeBuilder.java | 15 - .../gandiva/evaluator/ProjectorTest.java | 50 --- 25 files changed, 134 insertions(+), 1049 deletions(-) delete mode 100755 build_release.sh delete mode 100755 build_testing.sh diff --git a/build_release.sh b/build_release.sh deleted file mode 100755 index 5afaff588237c..0000000000000 --- a/build_release.sh +++ /dev/null @@ -1,34 +0,0 @@ -rm -rf cpp-jni java-dist java-jni cpp/debug -mkdir cpp/debug -cd cpp/debug - -arch -x86_64 cmake -DCMAKE_BUILD_TYPE=RELEASE -DARROW_GANDIVA=ON -DARROW_JEMALLOC=OFF -DARROW_GANDIVA_JAVA=ON -DARROW_BUILD_TESTS=OFF .. -arch -x86_64 make -j 8 -if [ $? 
-ne 0 ] -then - echo "failed" - exit 1 -fi - -cd ../../ -mkdir -p java-jni cpp-jni - -arch -x86_64 cmake -S cpp -B cpp-jni -DARROW_BUILD_SHARED=OFF -DARROW_JEMALLOC=OFF -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON -DARROW_GANDIVA=ON -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON -DARROW_ORC=ON -DARROW_PARQUET=ON -DARROW_S3=ON -DARROW_USE_CCACHE=ON -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_UNITY_BUILD=ON -arch -x86_64 cmake --build cpp-jni --target install --config Release -if [ $? -ne 0 ] -then - echo "failed" - exit 1 -fi - -arch -x86_64 cmake -S java -B java-jni -DARROW_JAVA_JNI_ENABLE_C=OFF -DARROW_JEMALLOC=OFF -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_PREFIX_PATH=$PWD/java-dist/lib/x86_64/cmake -arch -x86_64 cmake --build java-jni --target install --config Release -if [ $? -ne 0 ] -then - echo "failed" - exit 1 -fi - -cd java -/opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow/java-dist/lib -Parrow-jni clean install -cp gandiva/target/arrow-gandiva-12.0.1.jar /Users/logan.riggs/github/dremio/enterprise/distribution/server/target/dremio-enterprise-24.3.0-SNAPSHOT/dremio-enterprise-24.3.0-SNAPSHOT/jars/3rdparty/ diff --git a/build_testing.sh b/build_testing.sh deleted file mode 100755 index 9604ba5678ff2..0000000000000 --- a/build_testing.sh +++ /dev/null @@ -1,38 +0,0 @@ -rm -rf cpp-jni java-dist java-jni cpp/debug -mkdir cpp/debug -cd cpp/debug - -echo "====CPP====" -arch -x86_64 cmake -DCMAKE_BUILD_TYPE=DEBUG -DARROW_GANDIVA=ON -DARROW_JEMALLOC=OFF -DARROW_GANDIVA_JAVA=ON -DARROW_BUILD_TESTS=ON .. -arch -x86_64 make -j 8 -if [ $? 
-ne 0 ] -then - echo "failed" - exit 1 -fi - -cd ../../ -mkdir -p java-jni cpp-jni - -echo "====CPP-JNI====" -arch -x86_64 cmake -S cpp -B cpp-jni -DARROW_BUILD_SHARED=OFF -DARROW_JEMALLOC=OFF -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON -DARROW_GANDIVA=ON -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON -DARROW_ORC=ON -DARROW_PARQUET=ON -DARROW_S3=ON -DARROW_USE_CCACHE=ON -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_UNITY_BUILD=ON -arch -x86_64 cmake --build cpp-jni --target install --config Debug -if [ $? -ne 0 ] -then - echo "failed" - exit 1 -fi - -echo "====JAVA-JNI====" -arch -x86_64 cmake -S java -B java-jni -DARROW_JAVA_JNI_ENABLE_C=OFF -DARROW_JEMALLOC=OFF -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_LIBDIR=lib/x86_64 -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_PREFIX_PATH=$PWD/java-dist/lib/x86_64/cmake -DArrowTesting_DIR=$PWD/cpp/debug/src/arrow -arch -x86_64 cmake --build java-jni --target install --config Debug -if [ $? 
-ne 0 ] -then - echo "failed" - exit 1 -fi - -echo "====JARS====" -cd java -/opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Parrow-jni clean install -cp java/gandiva/target/arrow-gandiva-12.0.1.jar /Users/logan.riggs/github/dremio/enterprise/distribution/server/target/dremio-enterprise-24.3.0-SNAPSHOT/dremio-enterprise-24.3.0-SNAPSHOT/jars/3rdparty/ diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 66fd6c75f0ddb..657abbaecc42b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -123,11 +123,6 @@ class StringArray; class StringBuilder; struct StringScalar; -class StructType; -class StructArray; -class StructBuilder; -struct StructScalar; - class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -153,6 +148,11 @@ class FixedSizeListArray; class FixedSizeListBuilder; struct FixedSizeListScalar; +class StructType; +class StructArray; +class StructBuilder; +struct StructScalar; + class Decimal128; class Decimal256; class DecimalType; diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 96052f58c92de..802b4e20947bf 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -27,45 +27,73 @@ #include "gandiva/engine.h" #include "gandiva/exported_funcs.h" -//LR TODO -namespace { - bool floatsEqual(float l, float r) { - return (l - r < 0.001 && r - l < 0.001); - } - - bool doublesEqual(double l, double r) { - return (l - r < 0.001 && r - l < 0.001); - } -} /// Stub functions that can be accessed from LLVM or the pre-compiled library. 
-extern "C" { - -bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, - int32_t* entry_child_offsets, int32_t entry_offsets_len, - const char* contains_data, int32_t contains_data_length, +template +Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, + int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, + Type remove_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, - bool* valid_row) { - for (int i = 0; i < entry_offsets_len; i++) { - int32_t entry_len = *(entry_child_offsets + i + 1) - *(entry_child_offsets + i); - if (entry_len != contains_data_length) { - entry_buf = entry_buf + entry_len; - continue; - } - if (strncmp(entry_buf, contains_data, contains_data_length) == 0) { - return true; + bool* valid_row, int32_t* out_len, int32_t** valid_ptr) +{ + std::vector newInts; + + const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); + int64_t validityBitIndex = 0; + //The validity index already has the current row length added to it, so decrement. + validityBitIndex = validity_index_var - entry_len; + entry_validWhat = true; + std::vector outValid; + for (int i = 0; i < entry_len; i++) { + Type entry_item = *(entry_buf + (i * 1)); + std::cout << "LR TODO checking " << entry_item << std::endl; + if (entry_item == remove_data) { + //Do not add the item to remove. + } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + outValid.push_back(false); + newInts.push_back(0); + std::cout << "LR TODO not valid! " << i << std::endl; + } else { + outValid.push_back(true); + newInts.push_back(entry_item); + std::cout << "LR TODO valid " << i << std::endl; } - entry_buf = entry_buf + entry_len; } - return false; + + *out_len = (int)newInts.size(); + + //Since this function can remove values we don't know the length ahead of time. + //A fast way to compute Math.ceil(input / 8.0). 
+ int validByteSize = (unsigned int)((*out_len) + 7) >> 3; + std::cout << "LR TODO out_len=" << *out_len << " valid byte length is " << validByteSize << std::endl; + + uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, validByteSize); + for (int i = 0; i < outValid.size(); i++) { + std::cout << "LR TODO setting bit " << i << " to value " << outValid[i] << std::endl; + arrow::bit_util::SetBitTo(validRet, i, outValid[i]); + } + + int32_t outBufferLength = (int)*out_len * sizeof(Type); + //length is number of items, but buffers must account for byte size. + uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); + memcpy(ret, newInts.data(), outBufferLength); + *valid_row = true; + if (!combined_row_validity) { + *out_len = 0; + *valid_row = false; //this one is what works for the top level validity. + entry_validWhat = false; + } + *valid_ptr = reinterpret_cast(validRet); + return reinterpret_cast(ret); } -bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, +template +bool array_contains_template(const Type* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - int32_t contains_data, bool entry_validWhat, + int32_t contains_data, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { - if (!combined_row_validity) { + if (!combined_row_validity) { *valid_row = false; return false; } @@ -78,7 +106,7 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { continue; } - int32_t entry_item = *(entry_buf + i); + Type entry_item = *(entry_buf + i); if (entry_item == contains_data) { return true; } @@ -86,30 +114,26 @@ bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, return false; } +extern "C" { + +bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_len, const int32_t* 
entry_validity, bool combined_row_validity, + int32_t contains_data, bool entry_validWhat, + int64_t loop_var, int64_t validity_index_var, + bool* valid_row) { + return array_contains_template(entry_buf, entry_len, entry_validity, + combined_row_validity, contains_data, + loop_var, validity_index_var, valid_row); +} + bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, int64_t contains_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { - if (!combined_row_validity) { - *valid_row = false; - return false; - } - *valid_row = true; - - const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - int64_t validityBitIndex = validity_index_var - entry_len; - - for (int i = 0; i < entry_len; i++) { - if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { - continue; - } - int64_t entry_item = *(entry_buf + (i)); - if (entry_item == contains_data) { - return true; - } - } - return false; + return array_contains_template(entry_buf, entry_len, entry_validity, + combined_row_validity, contains_data, + loop_var, validity_index_var, valid_row); } bool array_float32_contains_float32(int64_t context_ptr, const float* entry_buf, @@ -117,25 +141,9 @@ bool array_float32_contains_float32(int64_t context_ptr, const float* entry_buf, float contains_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { - if (!combined_row_validity) { - *valid_row = false; - return false; - } - *valid_row = true; - - const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - int64_t validityBitIndex = validity_index_var - entry_len; - - for (int i = 0; i < entry_len; i++) { - if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { - continue; - } - float entry_item = *(entry_buf + (i)); - if (floatsEqual(entry_item, 
contains_data)) { - return true; - } - } - return false; + return array_contains_template(entry_buf, entry_len, entry_validity, + combined_row_validity, contains_data, + loop_var, validity_index_var, valid_row); } bool array_float64_contains_float64(int64_t context_ptr, const double* entry_buf, @@ -143,141 +151,35 @@ bool array_float64_contains_float64(int64_t context_ptr, const double* entry_buf double contains_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { - if (!combined_row_validity) { - *valid_row = false; - return false; - } - *valid_row = true; + return array_contains_template(entry_buf, entry_len, entry_validity, + combined_row_validity, contains_data, + loop_var, validity_index_var, valid_row); +} - const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - int64_t validityBitIndex = validity_index_var - entry_len; - for (int i = 0; i < entry_len; i++) { - if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { - continue; - } - double entry_item = *(entry_buf + (i)); - if (doublesEqual(entry_item, contains_data)) { - return true; - } - } - return false; -} int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, int32_t remove_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr) { - - std::vector newInts; - - //LR TODO not sure what entry_validWhat is. - //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. - const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - int64_t validityBitIndex = 0; - //The validity index already has the current row length added to it, so decrement. 
- validityBitIndex = validity_index_var - entry_len; - entry_validWhat = true; - std::vector outValid; - for (int i = 0; i < entry_len; i++) { - int32_t entry_item = *(entry_buf + (i * 1)); - if (entry_item == remove_data) { - //Do not add the item to remove. - } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { - outValid.push_back(false); - newInts.push_back(0); - } else { - outValid.push_back(true); - newInts.push_back(entry_item); - } - } - - *out_len = (int)newInts.size(); - - //Since this function can remove values we don't know the length ahead of time. - //LR TODO divide by 8 and ensure at least 1? - uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); - for (int i = 0; i < outValid.size(); i++) { - arrow::bit_util::SetBitTo(validRet, i, outValid[i]); - } - - int32_t outBufferLength = (int)*out_len * sizeof(int); - //length is number of items, but buffers must account for byte size. - uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); - memcpy(ret, newInts.data(), outBufferLength); - *valid_row = true; - if (!combined_row_validity) { - *out_len = 0; - *valid_row = false; //this one is what works for the top level validity. - entry_validWhat = false; - } - *valid_ptr = reinterpret_cast(validRet); - return reinterpret_cast(ret); + return array_remove_template(context_ptr, entry_buf, + entry_len, entry_validity, combined_row_validity, + remove_data, entry_validWhat, + loop_var, validity_index_var, + valid_row, out_len, valid_ptr); } - - int64_t* array_int64_remove(int64_t context_ptr, const int64_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, int64_t remove_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ - - std::vector newInts; - - //LR TODO not sure what entry_validWhat is. - //LR TODO I'm not sure why entry_validty increases for each loop. 
It starts as the pointer to the validity buffer, so adjust here. - const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - int64_t validityBitIndex = 0; - //The validity index already has the current row length added to it, so decrement. - validityBitIndex = validity_index_var - entry_len; - entry_validWhat = true; - std::vector outValid; - std::cout << "LR TODO entry length is " << entry_len << std::endl; - for (int32_t i = 0; i < entry_len; i++) { - int64_t entry_item = *(entry_buf + (i)); - std::cout << "LR TODO checking entry item " << entry_item << std::endl; - if (entry_item == remove_data) { - //Do not add the item to remove. - } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { - outValid.push_back(false); - newInts.push_back(0); - std::cout << "LR TODO entry item is null" << std::endl; - } else { - outValid.push_back(true); - newInts.push_back(entry_item); - } - } - - *out_len = (int)newInts.size(); - - //Since this function can remove values we don't know the length ahead of time. - //LR TODO divide by 8 and ensure at least 1? - uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); - for (int i = 0; i < outValid.size(); i++) { - arrow::bit_util::SetBitTo(validRet, i, outValid[i]); - std::cout << "LR TODO Setting validty " << i << " to " << outValid[i] << std::endl; - } - - int32_t outBufferLength = (int)*out_len * sizeof(int64_t); - //length is number of items, but buffers must account for byte size. - uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); - memcpy(ret, newInts.data(), outBufferLength); - - //LR TODO - for (int k = 0; k < *out_len; k++) { - std::cout << "LR TODO the 64 data is " << ((int64_t*)ret)[k] << std::endl; - } - - *valid_row = true; - if (!combined_row_validity) { - *out_len = 0; - *valid_row = false; //this one is what works for the top level validity. 
- entry_validWhat = false; - } - *valid_ptr = reinterpret_cast(validRet); - return reinterpret_cast(ret); + return array_remove_template(context_ptr, entry_buf, + entry_len, entry_validity, combined_row_validity, + remove_data, entry_validWhat, + loop_var, validity_index_var, + valid_row, out_len, valid_ptr); } float* array_float32_remove(int64_t context_ptr, const float* entry_buf, @@ -285,52 +187,11 @@ float* array_float32_remove(int64_t context_ptr, const float* entry_buf, float remove_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ - - std::vector newArray; - - //LR TODO not sure what entry_validWhat is. - //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. - const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - int64_t validityBitIndex = 0; - //The validity index already has the current row length added to it, so decrement. - validityBitIndex = validity_index_var - entry_len; - entry_validWhat = true; - std::vector outValid; - for (int i = 0; i < entry_len; i++) { - float entry_item = *(entry_buf + (i * 1)); - //LR TODO comparison tolerance? - if (floatsEqual(entry_item, remove_data)) { - //Do not add the item to remove. - } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { - outValid.push_back(false); - newArray.push_back(0); - } else { - outValid.push_back(true); - newArray.push_back(entry_item); - } - } - - *out_len = (int)newArray.size(); - - //Since this function can remove values we don't know the length ahead of time. - //LR TODO divide by 8 and ensure at least 1? 
- uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); - for (int i = 0; i < outValid.size(); i++) { - arrow::bit_util::SetBitTo(validRet, i, outValid[i]); - } - - int32_t outBufferLength = (int)*out_len * sizeof(float); - //length is number of items, but buffers must account for byte size. - uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); - memcpy(ret, newArray.data(), outBufferLength); - *valid_row = true; - if (!combined_row_validity) { - *out_len = 0; - *valid_row = false; //this one is what works for the top level validity. - entry_validWhat = false; - } - *valid_ptr = reinterpret_cast(validRet); - return reinterpret_cast(ret); + return array_remove_template(context_ptr, entry_buf, + entry_len, entry_validity, combined_row_validity, + remove_data, entry_validWhat, + loop_var, validity_index_var, + valid_row, out_len, valid_ptr); } @@ -339,54 +200,12 @@ double* array_float64_remove(int64_t context_ptr, const double* entry_buf, double remove_data, bool entry_validWhat, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ - - std::vector newArray; - - //LR TODO not sure what entry_validWhat is. - //LR TODO I'm not sure why entry_validty increases for each loop. It starts as the pointer to the validity buffer, so adjust here. - const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); - int64_t validityBitIndex = 0; - //The validity index already has the current row length added to it, so decrement. - validityBitIndex = validity_index_var - entry_len; - entry_validWhat = true; - std::vector outValid; - for (int32_t i = 0; i < entry_len; i++) { - double entry_item = *(entry_buf + (i * 1)); - //LR TODO comparison tolerance? - if (doublesEqual(entry_item, remove_data)) { - //Do not add the item to remove. 
- } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { - outValid.push_back(false); - newArray.push_back(0.0); - } else { - outValid.push_back(true); - newArray.push_back(entry_item); - } - } - - *out_len = (int)newArray.size(); - - //Since this function can remove values we don't know the length ahead of time. - //LR TODO divide by 8 and ensure at least 1? - uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, *out_len); - for (int i = 0; i < outValid.size(); i++) { - arrow::bit_util::SetBitTo(validRet, i, outValid[i]); - } - - int32_t outBufferLength = (int)*out_len * sizeof(double); - //length is number of items, but buffers must account for byte size. - uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); - memcpy(ret, newArray.data(), outBufferLength); - *valid_row = true; - if (!combined_row_validity) { - *out_len = 0; - *valid_row = false; //this one is what works for the top level validity. - entry_validWhat = false; - } - *valid_ptr = reinterpret_cast(validRet); - return reinterpret_cast(ret); + return array_remove_template(context_ptr, entry_buf, + entry_len, entry_validity, combined_row_validity, + remove_data, entry_validWhat, + loop_var, validity_index_var, + valid_row, out_len, valid_ptr); } - } namespace gandiva { @@ -394,26 +213,6 @@ void ExportedArrayFunctions::AddMappings(Engine* engine) const { std::vector args; auto types = engine->types(); - - //Array contains. - args = {types->i64_type(), // int64_t execution_context - types->i8_ptr_type(), // int8_t* data ptr - types->i32_ptr_type(), // int32_t* child offsets ptr - types->i32_type(), // int32_t child offsets length - types->i32_ptr_type(), // input validity buffer - types->i1_type(), // bool input row validity - types->i8_ptr_type(), // const char* contains data buf - types->i32_type(), // int32_t contains data length - types->i1_type(), // bool validity --Needed? - types->i64_type(), //in loop var --Needed? 
- types->i64_type(), //in validity_index_var index into the valdity vector for the current row. - types->i1_ptr_type() //output validity for the row - }; - - engine->AddGlobalMappingForFunc("array_utf8_contains_utf8", - types->i1_type() /*return_type*/, args, - reinterpret_cast(array_utf8_contains_utf8)); - args = {types->i64_type(), // int64_t execution_context types->i64_ptr_type(), // int8_t* data ptr types->i32_type(), // int32_t data length diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h index 2a7d1448a9af4..c0de72a39472b 100644 --- a/cpp/src/gandiva/array_ops.h +++ b/cpp/src/gandiva/array_ops.h @@ -27,12 +27,7 @@ class VectorType; /// Array functions that can be accessed from LLVM. extern "C" { -GANDIVA_EXPORT -bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, - int32_t* entry_child_offsets, int32_t entry_offsets_len, - const char* contains_data, int32_t contains_data_length, - int64_t loop_var, int64_t validity_index_var, - bool* valid_row); + GANDIVA_EXPORT bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, diff --git a/cpp/src/gandiva/array_ops_test.cc b/cpp/src/gandiva/array_ops_test.cc index 12dd6f9c56d30..4d96b80dd4222 100644 --- a/cpp/src/gandiva/array_ops_test.cc +++ b/cpp/src/gandiva/array_ops_test.cc @@ -36,30 +36,4 @@ TEST(TestArrayOps, TestInt32ContainsInt32) { true); } -TEST(TestArrayOps, TestUtf8ContainsUtf8) { - gandiva::ExecutionContext ctx; - uint64_t ctx_ptr = reinterpret_cast(&ctx); - const char* entry_buf = "trianglecirclerectangle"; - int32_t entry_child_offsets[] = {0, 8, 14, 24}; - int32_t entry_offsets_len = 3; - const char* contains_data = "triangle"; - int32_t contains_data_length = 8; - - EXPECT_EQ( - array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets, entry_offsets_len, - contains_data, contains_data_length), - true); -} - -TEST(TestArrayOps, TestUtf8Length) { - 
gandiva::ExecutionContext ctx; - uint64_t ctx_ptr = reinterpret_cast(&ctx); - const char* entry_buf = "trianglecirclerectangle"; - int32_t entry_child_offsets[] = {0, 8, 14, 24}; - int32_t entry_offsets_len = 3; - - EXPECT_EQ(array_utf8_length(ctx_ptr, entry_buf, entry_child_offsets, entry_offsets_len), - 3); -} - } // namespace gandiva diff --git a/cpp/src/gandiva/bitmap_accumulator.h b/cpp/src/gandiva/bitmap_accumulator.h index 52d73696c788c..9eaec81763786 100644 --- a/cpp/src/gandiva/bitmap_accumulator.h +++ b/cpp/src/gandiva/bitmap_accumulator.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include "arrow/util/macros.h" diff --git a/cpp/src/gandiva/compiled_expr.h b/cpp/src/gandiva/compiled_expr.h index b4244aae63380..4933e7f4922f6 100644 --- a/cpp/src/gandiva/compiled_expr.h +++ b/cpp/src/gandiva/compiled_expr.h @@ -36,8 +36,7 @@ class CompiledExpr { ValueValidityPairPtr value_validity() const { return value_validity_; } - FieldDescriptorPtr output() const { - return output_; } + FieldDescriptorPtr output() const { return output_; } void SetFunctionName(SelectionVector::Mode mode, std::string& name) { ir_functions_[static_cast(mode)] = name; diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index f8cfa8b54a60d..f5f9460ddd1f2 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -300,10 +300,7 @@ Status Engine::FinalizeModule() { if (!cached_) { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); - //LR TODO - //LR Turning this off seems to provide better error messages with compilation/generation failures. - //if (optimize_) { - if (false) { + if (optimize_) { // misc passes to allow for inlining, vectorization, .. 
std::unique_ptr pass_manager( new llvm::legacy::PassManager()); diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 616ef8530c02b..9180e8c33ca33 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -17,7 +17,6 @@ #include "gandiva/function_registry.h" -#include #include #include #include diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc index 015c8e97bfb53..893ba6e3d2b04 100644 --- a/cpp/src/gandiva/function_registry_array.cc +++ b/cpp/src/gandiva/function_registry_array.cc @@ -22,34 +22,29 @@ namespace gandiva { std::vector GetArrayFunctionRegistry() { static std::vector array_fn_registry_ = { - NativeFunction("array_containsGandiva", {}, DataTypeVector{list(utf8()), utf8()}, - boolean(), kResultNullInternal, "array_utf8_contains_utf8", - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), - - - NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int32()), int32()}, + NativeFunction("array_contains", {}, DataTypeVector{list(int32()), int32()}, boolean(), kResultNullInternal, "array_int32_contains_int32", NativeFunction::kNeedsContext), - NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int64()), int64()}, + NativeFunction("array_contains", {}, DataTypeVector{list(int64()), int64()}, boolean(), kResultNullInternal, "array_int64_contains_int64", NativeFunction::kNeedsContext), - NativeFunction("array_containsGandiva", {}, DataTypeVector{list(float32()), float32()}, + NativeFunction("array_contains", {}, DataTypeVector{list(float32()), float32()}, boolean(), kResultNullInternal, "array_float32_contains_float32", NativeFunction::kNeedsContext), - NativeFunction("array_containsGandiva", {}, DataTypeVector{list(float64()), float64()}, + NativeFunction("array_contains", {}, DataTypeVector{list(float64()), float64()}, boolean(), kResultNullInternal, "array_float64_contains_float64", 
NativeFunction::kNeedsContext), - NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int32()), int32()}, + NativeFunction("array_remove", {}, DataTypeVector{list(int32()), int32()}, list(int32()), kResultNullInternal, "array_int32_remove", NativeFunction::kNeedsContext), - NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int64()), int64()}, + NativeFunction("array_remove", {}, DataTypeVector{list(int64()), int64()}, list(int64()), kResultNullInternal, "array_int64_remove", NativeFunction::kNeedsContext), - NativeFunction("array_removeGandiva", {}, DataTypeVector{list(float32()), float32()}, + NativeFunction("array_remove", {}, DataTypeVector{list(float32()), float32()}, list(float32()), kResultNullInternal, "array_float32_remove", NativeFunction::kNeedsContext), - NativeFunction("array_removeGandiva", {}, DataTypeVector{list(float64()), float64()}, + NativeFunction("array_remove", {}, DataTypeVector{list(float64()), float64()}, list(float64()), kResultNullInternal, "array_float64_remove", NativeFunction::kNeedsContext), }; diff --git a/cpp/src/gandiva/function_signature.cc b/cpp/src/gandiva/function_signature.cc index 8c086f5ee33a4..2498de39e1b3b 100644 --- a/cpp/src/gandiva/function_signature.cc +++ b/cpp/src/gandiva/function_signature.cc @@ -59,7 +59,6 @@ FunctionSignature::FunctionSignature(std::string base_name, DataTypeVector param : base_name_(std::move(base_name)), param_types_(std::move(param_types)), ret_type_(std::move(ret_type)) { - std::cout << "LR TODO creating FunctionSignature " << ret_type_->ToString() << std::endl; DCHECK_GT(base_name_.length(), 0); for (auto it = param_types_.begin(); it != param_types_.end(); it++) { DCHECK(*it); diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index f9d993403bf42..11fd2d3cb1947 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -17,7 +17,6 @@ #include "gandiva/llvm_generator.h" -#include #include #include #include @@ 
-36,27 +35,6 @@ namespace gandiva { AddTrace(__VA_ARGS__); \ } -namespace { - std::string printType(llvm::Type* t) { - if (t == nullptr) { - return std::string("null"); - } - std::string str; - llvm::raw_string_ostream output(str); - t->print(output); - return str; - } - std::string printType(llvm::Value* t) { - if (t == nullptr) { - return std::string("null"); - } - std::string str; - llvm::raw_string_ostream output(str); - t->print(output); - return str; - } -} - LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(false) {} Status LLVMGenerator::Make(std::shared_ptr config, bool cached, @@ -113,7 +91,6 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode ARROW_RETURN_NOT_OK(Add(expr, output)); } -std::cout << "LR TODO LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; // Compile and inject into the process' memory the generated function. ARROW_RETURN_NOT_OK(engine_->FinalizeModule()); @@ -300,7 +277,6 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, FieldDescriptorPtr output, int suffix_idx, std::string& fn_name, SelectionVector::Mode selection_vector_mode) { - try { llvm::IRBuilder<>* builder = ir_builder(); // Create fn prototype : // int expr_1 (long **addrs, long *offsets, long **bitmaps, @@ -390,11 +366,6 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, // define loop_var : start with 0, +1 after each iter llvm::PHINode* loop_var = builder->CreatePHI(types()->i64_type(), 2, "loop_var"); -//LR-VAR - //Define counter for index into list validity vector. 
- //llvm::PHINode* validity_index_var = builder->CreatePHI(types()->i64_type(), 2, "validity_index_var"); - - llvm::Value* position_var = loop_var; if (selection_vector_mode != SelectionVector::MODE_NONE) { @@ -500,10 +471,6 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, builder->SetInsertPoint(loop_exit); builder->CreateRet(types()->i32_constant(0)); return Status::OK(); - } catch (std::exception& e) { - std::cout << e.what() << std::endl; - throw e; - } } /// Return value of a bit in bitMap. @@ -591,21 +558,16 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, llvm::Type* ret_type, const std::vector& args) { - std::cout << "LR TODO AddFunctionCall " << full_name << " ret type is " << printType(ret_type) << std::endl; // find the llvm function. llvm::Function* fn = module()->getFunction(full_name); DCHECK_NE(fn, nullptr) << "missing function " << full_name; - if (!full_name.compare("printf") && + if (enable_ir_traces_ && !full_name.compare("printf") && !full_name.compare("printff")) { // Trace for debugging ADD_TRACE("invoke native fn " + full_name); } - std::cout << "LR TODO AddFunctionCall 2" << std::endl; - for (llvm::Value* lv : args) { - std::cout << "LR TODO arg is " << printType(lv) << std::endl; - } // build a call to the llvm function. 
llvm::Value* value; if (ret_type->isVoidTy()) { @@ -613,13 +575,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, value = ir_builder()->CreateCall(fn, args); } else { value = ir_builder()->CreateCall(fn, args, full_name); -std::cout << "LR TODO AddFunctionCall 3" << std::endl; - std::string str; - llvm::raw_string_ostream output(str); - std::string str2; - llvm::raw_string_ostream output2(str2); - ret_type->print(output); - value->getType()->print(output2); + DCHECK(value->getType() == ret_type); } @@ -638,7 +594,9 @@ std::shared_ptr LLVMGenerator::BuildDecimalLValue(llvm::Value* va } #define ADD_VISITOR_TRACE(...) \ + if (generator_->enable_ir_traces_) { \ generator_->AddTrace(__VA_ARGS__); \ + } // Visitor for generating the code for a decomposed expression. LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* function, @@ -705,17 +663,10 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { auto types = generator_->types(); auto type = types->IRType(dex.FieldType()->id()); - std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; - auto dt = dex.FieldType(); if (dt->id() == arrow::Type::LIST) { - if (dt->num_fields() > 0) { - std::cout << "LR TODO creating listtype" << std::endl; - std::cout << "LR TODO listtype id=" << dt->fields()[0]->type()->id() << std::endl; type = types->IRType(dt->fields()[0]->type()->id() ); - } } - std::cout << "LR TODO using type " << printType(type) << std::endl; arrow::Type::type at32 = arrow::Type::INT32; auto type32 = types->IRType(at32); @@ -1075,7 +1026,6 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { auto arrow_return_type = dex.func_descriptor()->return_type(); - auto arrow_return_type_id = arrow_return_type->id(); bool passLoopVars = false; for (auto& p : dex.func_descriptor()->params()) { 
@@ -1086,16 +1036,9 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { } if (passLoopVars) { - std::string str32 = "loopvar:"; - if (loop_var_) { - llvm::raw_string_ostream output3(str32); - loop_var_->print(output3); - } - - params.push_back(loop_var_); - auto valid_var = builder->CreateLoad(types->i64_type(), validity_index_var_, "loaded_var"); - params.push_back(valid_var); - + params.push_back(loop_var_); + auto valid_var = builder->CreateLoad(types->i64_type(), validity_index_var_, "loaded_var"); + params.push_back(valid_var); } // add an extra arg for validity (allocated on stack). @@ -1103,7 +1046,6 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_); params.push_back(result_valid_ptr); - //auto arrow_return_type = dex.func_descriptor()->return_type(); result_ = BuildFunctionCall(native_function, arrow_return_type, ¶ms); // load the result validity and truncate to i1. @@ -1731,7 +1673,6 @@ void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { dmsg = ReplaceFormatInTrace(dmsg, value, &print_fn_name); } trace_strings_.push_back(dmsg); - std::cout << dmsg << std::endl; // cast this to an llvm pointer. const char* str = trace_strings_.back().c_str(); diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 2c0c742eb79c2..0651614c816f6 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -114,72 +114,4 @@ TEST_F(TestLLVMGenerator, TestAdd) { EXPECT_THAT(out, testing::ElementsAre(6, 8, 10, 12)); EXPECT_EQ(out_bitmap, 0ULL); } -/* -TEST_F(TestLLVMGenerator, TestArrayRemove) { - // Setup LLVM generator to do an array remove. 
- std::unique_ptr generator; - ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), false, &generator)); - Annotator annotator; - - std::shared_ptr listDt = std::make_shared(); - std::shared_ptr dt = std::make_shared(listDt); - auto field0 = std::make_shared("f0", dt); - auto desc0 = annotator.CheckAndAddInputFieldDescriptor(field0); - auto validity_dex0 = std::make_shared(desc0); - auto value_dex0 = std::make_shared(desc0); - auto pair0 = std::make_shared(validity_dex0, value_dex0); - - auto field1 = std::make_shared("f1", arrow::int32()); - auto desc1 = annotator.CheckAndAddInputFieldDescriptor(field1); - auto validity_dex1 = std::make_shared(desc1); - auto value_dex1 = std::make_shared(desc1); - auto pair1 = std::make_shared(validity_dex1, value_dex1); - - DataTypeVector params{dt, arrow::int32()}; - auto func_desc = std::make_shared("array_removeGandiva", params, arrow::int32()); - FunctionSignature signature(func_desc->name(), func_desc->params(), - func_desc->return_type()); - const NativeFunction* native_func = - generator->function_registry_.LookupSignature(signature); - - std::vector pairs{pair0, pair1}; - auto func_dex = std::make_shared( - func_desc, native_func, FunctionHolderPtr(nullptr), -1, pairs); - - auto field_sum = std::make_shared("out", arrow::int32()); - auto desc_sum = annotator.CheckAndAddInputFieldDescriptor(field_sum); - - std::string fn_name = "codegen"; - - ASSERT_OK(generator->engine_->LoadFunctionIRs()); - ASSERT_OK(generator->CodeGenExprValue(func_dex, 4, desc_sum, 0, fn_name, - SelectionVector::MODE_NONE)); - - ASSERT_OK(generator->engine_->FinalizeModule()); - auto ir = generator->engine_->DumpIR(); - EXPECT_THAT(ir, testing::HasSubstr("vector.body")); - - EvalFunc eval_func = (EvalFunc)generator->engine_->CompiledFunction(fn_name); - - constexpr size_t kNumRecords = 4; - std::array a0{1, 2, 3, 4}; - std::array a1{5, 6, 7, 8}; - uint64_t in_bitmap = 0xffffffffffffffffull; - - std::array out{0, 0, 0, 0}; - uint64_t out_bitmap = 0; - - 
std::array addrs{ - reinterpret_cast(a0.data()), reinterpret_cast(&in_bitmap), - reinterpret_cast(a1.data()), reinterpret_cast(&in_bitmap), - reinterpret_cast(out.data()), reinterpret_cast(&out_bitmap), - }; - std::array addr_offsets{0, 0, 0, 0, 0, 0}; - eval_func(addrs.data(), addr_offsets.data(), nullptr, nullptr, nullptr, - 0 /* dummy context ptr */, kNumRecords); - - EXPECT_THAT(out, testing::ElementsAre(6, 8, 10, 12)); - EXPECT_EQ(out_bitmap, 0ULL); -}*/ - } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index be31954f0c7b1..7473f0c4d6ea7 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include @@ -47,10 +46,6 @@ class GANDIVA_EXPORT LLVMTypes { llvm::Type* i128_type() { return llvm::Type::getInt128Ty(context_); } - llvm::StructType* struct_type() { - return llvm::StructType::get(context_, {double_type(), double_type()}, false); - } - llvm::VectorType* list_type() { return llvm::ScalableVectorType::get(i8_type(), (unsigned int)0); } llvm::StructType* i128_split_type() { @@ -126,11 +121,7 @@ class GANDIVA_EXPORT LLVMTypes { // offsets buffer is to separate data into list // not support nested list if (data_type->id() == arrow::Type::LIST) { - //LR TODO - std::cout << "LR Returning list type as type " << data_type->field(0)->type()->id()<< " for IR " << std::endl; return IRType(data_type->field(0)->type()->id()); - //return IRType(data_type->id()); - //return i32_ptr_type(); } return IRType(data_type->id()); } diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index 249980abbab84..e065645bcff6a 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -199,28 +199,14 @@ TEST_F(TestList, TestConcatWS) { // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c}); - // build expressions. 
- // array_contains(a, b) - - //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); - - //std::vector field_nodes; - //auto node2 = TreeExprBuilder::MakeLiteral(42); - //field_nodes.push_back(node2); - - //auto func_node = TreeExprBuilder::MakeFunction("array_makeGandiva", {field_b}, res->type()); - //auto expr = TreeExprBuilder::MakeExpression(func_node, res); - std::cout << "LR test is about to make expression " << std::endl; auto expr = TreeExprBuilder::MakeExpression("concat_ws", {field_a, field_b, field_c}, res); - //////// + // Build a projector for the expressions. std::shared_ptr projector; auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); - std::cout << "LR Test 2 " << std::endl; - //std::cout << "LR IR IS " << projector->DumpIR() << std::endl; // Evaluate expression arrow::ArrayVector outputs; status = projector->Evaluate(*in_batch, pool_, &outputs); @@ -255,194 +241,11 @@ TEST_F(TestList, TestArrayRemove) { {10, 30, 70, 80}, {2, 2}, {true, true}, {true, true, true, true}, pool_, &exp1); - // auto exp = MakeArrowArrayArray({ 42, 42, 44, 45, 46}, - // {true, true, true, true, true}); // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); - // build expressions. - // array_contains(a, b) - - //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); - - //std::vector field_nodes; - //auto node2 = TreeExprBuilder::MakeLiteral(42); - //field_nodes.push_back(node2); - - //auto func_node = TreeExprBuilder::MakeFunction("array_makeGandiva", {field_b}, res->type()); - //auto expr = TreeExprBuilder::MakeExpression(func_node, res); - std::cout << "LR test is about to make expression " << std::endl; - auto expr = TreeExprBuilder::MakeExpression("array_removeGandiva", {field_a, field_b}, res); - //////// - - // Build a projector for the expressions. 
- std::shared_ptr projector; - auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); - EXPECT_TRUE(status.ok()) << status.message(); - - std::cout << "LR Test 2 " << std::endl; - //std::cout << "LR IR IS " << projector->DumpIR() << std::endl; - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()) << status.message(); - // Validate results - EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); - - std::cout << "LR ==============================SECOND=WAY==================================================== " << std::endl; - - - - //Try the second method. - arrow::ArrayDataVector outputs2; - std::shared_ptr listDt = std::make_shared(); - std::shared_ptr dt = std::make_shared(listDt); - - - int num_records2 = 5; - std::vector> buffers; - - - - //int64_t size = arrow::bit_util::BytesForBits(num_records2); - int64_t size = 20; - auto bitmap_buffer = arrow::AllocateBuffer(size, pool_); - buffers.push_back(*std::move(bitmap_buffer)); - auto offsets_len = arrow::bit_util::BytesForBits((num_records2 + 1) * 32); - - auto offsets_buffer = arrow::AllocateBuffer(offsets_len*10, pool_); - buffers.push_back(*std::move(offsets_buffer)); - - std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; - //auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, 0, offsets_len); - //outputs2.push_back(array_data); - - - -std::vector> buffers2; -auto bitmap_buffer2 = arrow::AllocateBuffer(size, pool_); - buffers2.push_back(*std::move(bitmap_buffer2)); - - auto offsets_buffer2 = arrow::AllocateBuffer(offsets_len, pool_); - buffers2.push_back(*std::move(offsets_buffer2)); -std::shared_ptr dt2 = std::make_shared(); - - auto array_data_child = arrow::ArrayData::Make(dt2, num_records2, buffers2, 0, 0); - array_data_child->buffers = std::move(buffers2); - - std::vector> kids; - kids.push_back(array_data_child); - - -auto array_data = arrow::ArrayData::Make(dt, 
num_records2, buffers, kids, 0, 0); -array_data->buffers = std::move(buffers); -outputs2.push_back(array_data); - -std::cout << "LR Test " << array_data << " arra_data 0 is " << array_data->buffers[0] << std::endl; - //std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; - std::cout << "LR about to evaluate 2nd " << std::endl; - - status = projector->Evaluate(*(in_batch.get()), outputs2); - EXPECT_TRUE(status.ok()) << status.message(); - arrow::ArrayData ad = *outputs2.at(0); - arrow::ArraySpan sp(*ad.child_data.at(0)); - EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); - - - - -for (auto& array_data : outputs2) { - auto child_data = array_data->child_data[0]; - int64_t child_data_size = 1; - if (arrow::is_binary_like(child_data->type->id())) { - /* when allocate array data, child data length is an initialized value, - * after calculating, child data offsets buffer has been resized for results, - * but array data length is unchanged. - * We should recalculate child data length and make ArrayData with new length - * - * Otherwise, child data offsets buffer length is data length + 1 - * and offset data is int32_t, need use buffer->size()/4 - 1 - */ - child_data_size = child_data->buffers[1]->size() / 4 - 1; - } else if (child_data->type->id() == arrow::Type::INT32) { - child_data_size = child_data->buffers[1]->size() / 4; - } else if (child_data->type->id() == arrow::Type::INT64) { - child_data_size = child_data->buffers[1]->size() / 8; - } else if (child_data->type->id() == arrow::Type::FLOAT) { - child_data_size = child_data->buffers[1]->size() / 4; - } else if (child_data->type->id() == arrow::Type::DOUBLE) { - child_data_size = child_data->buffers[1]->size() / 8; - } - auto new_child_data = arrow::ArrayData::Make( - child_data->type, child_data_size, child_data->buffers, child_data->offset); - array_data = arrow::ArrayData::Make(array_data->type, array_data->length, - array_data->buffers, {new_child_data}, - array_data->null_count, 
array_data->offset); - - - auto newArray = arrow::MakeArray(array_data); - //arrow::ArraySpan sp(newArray); - EXPECT_ARROW_ARRAY_EQUALS(exp1, newArray); -} - - - - std::cout << "LR ====================THIRD=WAY================================== " << std::endl; - { - std::shared_ptr listDt = std::make_shared(); - std::shared_ptr dt = std::make_shared(listDt); - -ArrayDataPtr output_data; - auto s = projector->AllocArrayData(dt, num_records2, pool_, &output_data); - ArrayDataVector output_data_vecs; - output_data_vecs.push_back(output_data); - - status = projector->Evaluate(*(in_batch.get()), output_data_vecs); - EXPECT_TRUE(status.ok()) << status.message(); - arrow::ArraySpan sp(*output_data_vecs.at(0)); - EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); - } -} - - -TEST_F(TestList, TestMakeArray) { - // schema for input fields - auto field_b = field("b", int32()); - auto schema = arrow::schema({field_b}); - - // output fields - auto res = field("res", list(int32())); - - // Create a row-batch with some sample data - int num_records = 5; - auto array_b = - MakeArrowArrayInt32({42, 43, 44, 45, 46}, {true, true, true, true, true}); - - // expected output - auto exp1 = MakeArrowArrayInt32({ 1, 2, 3, 42, 5}, - {true, true, true, true, true}); - - // auto exp = MakeArrowArrayArray({ 42, 42, 44, 45, 46}, - // {true, true, true, true, true}); - - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_b}); - - // build expressions. 
- // array_contains(a, b) - - //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); - - //std::vector field_nodes; - //auto node2 = TreeExprBuilder::MakeLiteral(42); - //field_nodes.push_back(node2); - - //auto func_node = TreeExprBuilder::MakeFunction("array_makeGandiva", {field_b}, res->type()); - //auto expr = TreeExprBuilder::MakeExpression(func_node, res); - std::cout << "LR test is about to make expression " << std::endl; - auto expr = TreeExprBuilder::MakeExpression("array_makeGandiva", {field_b}, res); - //////// + auto expr = TreeExprBuilder::MakeExpression("array_remove", {field_a, field_b}, res); // Build a projector for the expressions. std::shared_ptr projector; @@ -450,7 +253,6 @@ TEST_F(TestList, TestMakeArray) { EXPECT_TRUE(status.ok()) << status.message(); std::cout << "LR Test 2 " << std::endl; - //std::cout << "LR IR IS " << projector->DumpIR() << std::endl; // Evaluate expression arrow::ArrayVector outputs; status = projector->Evaluate(*in_batch, pool_, &outputs); @@ -458,10 +260,6 @@ TEST_F(TestList, TestMakeArray) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); - std::cout << "LR ==============================SECOND=WAY==================================================== " << std::endl; - - - //Try the second method. 
arrow::ArrayDataVector outputs2; std::shared_ptr listDt = std::make_shared(); @@ -471,9 +269,6 @@ TEST_F(TestList, TestMakeArray) { int num_records2 = 5; std::vector> buffers; - - - //int64_t size = arrow::bit_util::BytesForBits(num_records2); int64_t size = 20; auto bitmap_buffer = arrow::AllocateBuffer(size, pool_); buffers.push_back(*std::move(bitmap_buffer)); @@ -482,12 +277,6 @@ TEST_F(TestList, TestMakeArray) { auto offsets_buffer = arrow::AllocateBuffer(offsets_len*10, pool_); buffers.push_back(*std::move(offsets_buffer)); - std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; - //auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, 0, offsets_len); - //outputs2.push_back(array_data); - - - std::vector> buffers2; auto bitmap_buffer2 = arrow::AllocateBuffer(size, pool_); buffers2.push_back(*std::move(bitmap_buffer2)); @@ -507,9 +296,6 @@ auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, kids, 0, 0); array_data->buffers = std::move(buffers); outputs2.push_back(array_data); -std::cout << "LR Test " << array_data << " arra_data 0 is " << array_data->buffers[0] << std::endl; - //std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; - std::cout << "LR about to evaluate 2nd " << std::endl; status = projector->Evaluate(*(in_batch.get()), outputs2); EXPECT_TRUE(status.ok()) << status.message(); @@ -573,8 +359,6 @@ ArrayDataPtr output_data; } } - -/* TEST_F(TestList, TestListArrayInt32) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); @@ -615,11 +399,6 @@ TEST_F(TestList, TestListInt32LiteralContains) { // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); - // build expressions. 
- // array_contains(a, b) - - //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); - std::vector field_nodes; auto node = TreeExprBuilder::MakeField(field_a); field_nodes.push_back(node); @@ -627,7 +406,7 @@ TEST_F(TestList, TestListInt32LiteralContains) { auto node2 = TreeExprBuilder::MakeLiteral(42); field_nodes.push_back(node2); - auto func_node = TreeExprBuilder::MakeFunction("array_containsGandiva", field_nodes, res->type()); + auto func_node = TreeExprBuilder::MakeFunction("array_contains", field_nodes, res->type()); auto expr = TreeExprBuilder::MakeExpression(func_node, res); //////// @@ -673,7 +452,7 @@ TEST_F(TestList, TestListInt32Contains) { // build expressions. // array_contains(a, b) - auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + auto expr = TreeExprBuilder::MakeExpression("array_contains", {field_a, field_b}, res); // Build a projector for the expressions. std::shared_ptr projector; @@ -707,129 +486,4 @@ TEST_F(TestList, TestListFloat64) { _test_list_type_field_alias(list(float64()), array, pool_); } - -TEST_F(TestList, TestListUtf8Length) { - // schema for input fields - auto field_a = field("a", list(utf8())); - auto schema = arrow::schema({field_a}); - - // output fields - auto res = field("res", int64()); - - // Create a row-batch with some sample data - int num_records = 5; - ArrayPtr array_a; - _build_list_array( - {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", - "eeee", "eeeee"}, - {1, 2, 3, 4, 5}, {true, true, true, true, true}, pool_, &array_a); - - // expected output - auto exp = MakeArrowArrayInt64({1, 2, 3, 4, 5}, {true, true, true, true, true}); - - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); - - // build expressions. 
- // array_length(a) - auto expr = TreeExprBuilder::MakeExpression("array_lengthGandiva", {field_a}, res); - - // Build a projector for the expressions. - std::shared_ptr projector; - auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); - EXPECT_TRUE(status.ok()) << status.message(); - - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()) << status.message(); - - // Validate results - EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); -} - -TEST_F(TestList, TestListUtf8LengthWithInvalidData) { - // schema for input fields - auto field_a = field("a", list(utf8())); - auto schema = arrow::schema({field_a}); - - // output fields - auto res = field("res", int64()); - - // Create a row-batch with some sample data - int num_records = 5; - ArrayPtr array_a; - _build_list_array( - {"a", "b", "bb", "cc", "cc", "ccc", "d", "dd", "ddd"}, {1, 2, 2, 3, 1}, - {true, false, true, false, true}, pool_, &array_a); - - // expected output - auto exp = MakeArrowArrayInt64({1, 2, 2, 3, 1}, {true, false, true, false, true}); - - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); - - // build expressions. - // array_length(a) - auto expr = TreeExprBuilder::MakeExpression("array_lengthGandiva", {field_a}, res); - - // Build a projector for the expressions. 
- std::shared_ptr projector; - auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); - EXPECT_TRUE(status.ok()) << status.message(); - - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()) << status.message(); - - // Validate results - EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); -} - - -TEST_F(TestList, TestListUtf8Contains) { - // schema for input fields - auto field_a = field("a", list(utf8())); - auto field_b = field("b", utf8()); - auto schema = arrow::schema({field_a, field_b}); - - // output fields - auto res = field("res", boolean()); - - // Create a row-batch with some sample data - int num_records = 5; - ArrayPtr array_a; - _build_list_array( - {"rectangle", "circle", "rectangle", "circle", "triangle", "triangle", "circle", - "rectangle"}, - {2, 3, 1, 1, 1}, {true, true, true, true, true}, pool_, &array_a); - auto array_b = - MakeArrowArrayUtf8({"rectangle", "circle", "circle", "circle", "rectangll"}); - - // expected output - auto exp = MakeArrowArrayBool({true, true, false, true, false}, - {true, true, true, true, true}); - - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); - - // build expressions. - // array_contains(a, b) - auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); - - // Build a projector for the expressions. 
- std::shared_ptr projector; - auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); - EXPECT_TRUE(status.ok()) << status.message(); - - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()) << status.message(); - - // Validate results - EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); -} -*/ } // namespace gandiva diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 08ffb8c192cb9..461ed2a04d8ea 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -161,7 +161,7 @@ ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function, auto node = MakeField(field); field_nodes.push_back(node); } - std::cout << "LR TODO creating TreeExpression " << out_field->type()->ToString() << std::endl; + auto func_node = MakeFunction(function, field_nodes, out_field->type()); return MakeExpression(func_node, out_field); } diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index aba90a93fc87f..cc1ed04194861 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -138,38 +138,20 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) break; case arrow::Type::LIST: { gandiva_data_type->set_type(types::GandivaType::LIST); - //LR TODO make a helper function - std::cout << "LR TODO creating listtype" << std::endl; if (type->num_fields() <= 0) { break; } - std::cout << "LR TODO listtype id=" << type->fields()[0]->type()->id() << std::endl; - switch (type->fields()[0]->type()->id()) { - case arrow::Type::INT32: - gandiva_data_type->set_listtype(types::GandivaType::INT32); - break; - case arrow::Type::INT64: - gandiva_data_type->set_listtype(types::GandivaType::INT64); - break; - case arrow::Type::FLOAT: - 
gandiva_data_type->set_listtype(types::GandivaType::FLOAT); - break; - case arrow::Type::DOUBLE: - gandiva_data_type->set_listtype(types::GandivaType::DOUBLE); - break; - case arrow::Type::STRING: - gandiva_data_type->set_listtype(types::GandivaType::UTF8); - break; + if (type->fields()[0]->type()->id() != arrow::Type::LIST) { + types::ExtGandivaType gt; + ArrowToProtobuf(type->fields()[0]->type(), >); + gandiva_data_type->set_listtype(gt.type()); } break; } default: // un-supported types. test ensures that // when one of these are added build breaks. - //DCHECK(false); - //LR TODO - printf("LR Found unsupported type %d\n", type->id()); - fflush(stdout); + DCHECK(false); } } @@ -202,12 +184,6 @@ Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSu types::GandivaFunctions gandiva_functions; for (auto function = expr_registry.function_signature_begin(); function != expr_registry.function_signature_end(); function++) { - - //LR TODO - printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).base_name().c_str()); - printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).ToString().c_str()); - fflush(stdout); - types::FunctionSignature* function_signature = gandiva_functions.add_function(); function_signature->set_name((*function).base_name()); types::ExtGandivaType* return_type = function_signature->mutable_returntype(); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index 4a4ccda035375..d85644d7831f3 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -253,7 +253,6 @@ DataTypePtr ProtoTypeToInterval(const types::ExtGandivaType& ext_type) { } DataTypePtr ProtoTypeToList(const types::ExtGandivaType& ext_type) { - std::cout << "LR TODO 2 checking a field type " << ext_type.type() << " and it has listType:" << ext_type.listtype() << std::endl; DataTypePtr childType = SimpleProtoTypeToDataType(ext_type.listtype()); return 
arrow::list(childType); } @@ -330,7 +329,6 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { DataTypePtr ProtoTypeToDataType(const types::Field& f) { const types::ExtGandivaType& ext_type = f.type(); - std::cout << "LR TODO checking a field type " << ext_type.type() << " and it has listType:" << ext_type.listtype() << std::endl; if (ext_type.type() == types::LIST) { if (f.children().size() > 0 && f.children()[0].type().type() != types::LIST) { DataTypePtr childType = ProtoTypeToDataType(f.children()[0].type()); @@ -689,7 +687,6 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, auto type = field->type(); auto type_id = type->id(); -//num_rows = num_records or ?? if (type_id == arrow::Type::LIST) { if (buf_idx >= in_bufs_len) { @@ -888,7 +885,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build status = Projector::Make(schema_ptr, expr_vector, mode, config, sec_cache, &projector); if (!status.ok()) { - ss << "Failed to make LLVM module [1]cdue to " << status.message() << "\n"; + ss << "Failed to make LLVM module due to " << status.message() << "\n"; releaseProjectorInput(schema_arr, schema_bytes, exprs_arr, exprs_bytes, env); goto err_out; } @@ -953,22 +950,12 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { jlong ret_capacity = env_->GetLongField(ret, list_expander_ret_capacity_); jlong outer_valid_address = env_->GetLongField(ret, list_expander_outer_valid_address_); - std::cout << "Buffer expand: New capacity is " << new_capacity << - " vector id " << vector_idx_ << " expander method " << method_ << - " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << - " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << std::endl; - data_ = reinterpret_cast(ret_address); capacity_ = ret_capacity; } else { jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); jlong ret_capacity = 
env_->GetLongField(ret, vector_expander_ret_capacity_); - std::cout << "Buffer expand: New capacity is " << new_capacity << - " vector id " << vector_idx_ << " expander method " << method_ << - " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << - " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << std::endl; - data_ = reinterpret_cast(ret_address); capacity_ = ret_capacity; } @@ -1135,17 +1122,12 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( outBufJava->offsetBuffer = reinterpret_cast(out_bufs[1]); outBufJava->offsetCapacity = out_sizes[1]; outBufJava->validityBuffer = reinterpret_cast(out_bufs[2]); - //outBufJava->outerValidityBuffer = reinterpret_cast(out_bufs[0]); child_buffers.push_back(outBufJava); - //LR TODO - - std::cout << "LR Creating array for type: " << field->type()->ToString() << std::endl; std::shared_ptr dt2 = std::make_shared(); if (field->type()->id() == arrow::Type::LIST && field->type()->num_fields() > 0) { dt2 = field->type()->fields()[0]->type(); } - std::cout << "LR using sub type: " << dt2->ToString() << std::endl; auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); std::vector> kids; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 80b61332e62e9..c870cf8f9ab8e 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -169,8 +169,6 @@ private static ArrowType getArrowTypeSimple(GandivaType type) { return new ArrowType.Null(); case GandivaType.DECIMAL_VALUE: return new ArrowType.Decimal(0, 0, 128); - case GandivaType.STRUCT_VALUE: - return new ArrowType.Struct(); case GandivaType.LIST_VALUE: return new ArrowType.List(); case 
GandivaType.FIXED_SIZE_BINARY_VALUE: diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index fe82c25736aac..5485d46882336 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -368,7 +368,6 @@ private void evaluate(int numRows, List buffers, List buf outAddrs[idx] = valueVector.getValidityBuffer().memoryAddress(); outSizes[idx++] = valueVector.getValidityBuffer().capacity(); if (isVarWidth) { - logger.error("LR Projector.java evaluate isVarWidth setting buffer=" + idx); outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); hasVariableWidthColumns = true; @@ -408,7 +407,6 @@ private void evaluate(int numRows, List buffers, List buf for (ValueVector valueVector : outColumns) { if (valueVector instanceof ListVector) { - //LR TODO check if this is necessary. 
((ListVector) valueVector).setLastSet(selectionVectorRecordCount - 1); } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 9e84bc6d05561..91bf5b633c590 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -309,7 +309,6 @@ public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowTyp break; } case Type.Struct_: { // 13 - ArrowTypeHelper.initArrowTypeStruct((ArrowType.Struct) arrowType, builder); break; } case Type.Union: { // 14 @@ -373,19 +372,15 @@ public static GandivaTypes.Field arrowFieldToProtobuf(Field field) throws Gandiv builder.setName(field.getName()); builder.setNullable(field.isNullable()); - //LR TODO ArrowType subType = null; if (field.getChildren().size() > 0 && field.getChildren().get(0) .getType().getTypeID().getFlatbufID() != Type.List) { - //builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null)); subType = field.getChildren().get(0).getType(); } builder.setType(ArrowTypeHelper.arrowTypeToProtobuf(field.getType(), subType)); for (Field child : field.getChildren()) { - System.out.println("LR TODO arrowFieldToProtobuf child field id is " + child.getType().getTypeID() ); if (child.getType() != ArrowType.Null.INSTANCE) { - System.out.println("LR TODO adding child=" + child.getName() + " type=" + child.getType()); builder.addChildren(ArrowTypeHelper.arrowFieldToProtobuf(child)); } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java index 14d6286a3282c..0097e2236fa07 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java +++ 
b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java @@ -67,7 +67,6 @@ class FunctionNode implements TreeNode { public GandivaTypes.TreeNode toProtobuf() throws GandivaException { GandivaTypes.FunctionNode.Builder fnNode = GandivaTypes.FunctionNode.newBuilder(); fnNode.setFunctionName(function); - System.out.println("LR TODO retType, retListType)=" + retType + "==" + retListType); fnNode.setReturnType(ArrowTypeHelper.arrowTypeToProtobuf(retType, retListType)); for (TreeNode arg : children) { diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index f9f2a4cd775b3..a020dcda38091 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -94,7 +94,6 @@ public static TreeNode makeField(Field field) { public static TreeNode makeFunction(String function, List children, ArrowType retType) { - System.out.println("LR TODO TreeNode makeFunction Type"); StackTraceElement[] elements = Thread.currentThread().getStackTrace(); for (int i = 1; i < elements.length; i++) { StackTraceElement s = elements[i]; @@ -116,13 +115,6 @@ public static TreeNode makeFunction(String function, public static TreeNode makeFunction(String function, List children, ArrowType retType, ArrowType listType) { - System.out.println("LR TODO TreeNode makeFunction Type2"); - StackTraceElement[] elements = Thread.currentThread().getStackTrace(); - for (int i = 1; i < elements.length; i++) { - StackTraceElement s = elements[i]; - System.out.println("\tat " + s.getClassName() + "." 
+ s.getMethodName() + - "(" + s.getFileName() + ":" + s.getLineNumber() + ")"); - } return new FunctionNode(function, children, retType, listType); } @@ -137,13 +129,6 @@ public static TreeNode makeFunction(String function, public static TreeNode makeFunction(String function, List children, Field retType) { - System.out.println("LR TODO TreeNode makeFunction Field"); - StackTraceElement[] elements = Thread.currentThread().getStackTrace(); - for (int i = 1; i < elements.length; i++) { - StackTraceElement s = elements[i]; - System.out.println("\tat " + s.getClassName() + "." + s.getMethodName() + - "(" + s.getFileName() + ":" + s.getLineNumber() + ")"); - } return new FunctionNode(function, children, retType); } diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java index df0fd8639b231..8dd759ee885d2 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java @@ -48,7 +48,6 @@ import org.apache.arrow.vector.IntervalYearVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarCharVector; -import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.holders.NullableIntervalDayHolder; import org.apache.arrow.vector.holders.NullableIntervalYearHolder; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -58,7 +57,6 @@ import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; -import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; import org.junit.Ignore; @@ -290,54 +288,6 @@ public void testEvaluate() throws GandivaException, Exception { eval.close(); } - @Test - public void testEvaluateArray() throws 
GandivaException, Exception { - ArrowType int32 = new ArrowType.Int(32, true); - ArrowType listInt32 = new ArrowType.List(); - - Field a = Field.nullable("a", int32); - List args = Lists.newArrayList(a); - - Field retType = Field.nullable("c", listInt32); - ExpressionTree root = TreeBuilder.makeExpression("array_makeGandiva", args, retType); - - List exprs = Lists.newArrayList(root); - - Schema schema = new Schema(args); - Projector eval = Projector.make(schema, exprs); - - int numRows = 16; - byte[] validity = new byte[]{(byte) 255, 0}; - // second half is "undefined" - int[] aValues = new int[]{1, 2, 3, 42, 5}; - - - ArrowBuf validitya = buf(validity); - ArrowBuf valuesa = intBuf(aValues); - ArrowRecordBatch batch = - new ArrowRecordBatch( - numRows, - Lists.newArrayList(new ArrowFieldNode(numRows, 5)), - Lists.newArrayList(validitya, valuesa)); - - FieldType ft = new FieldType(true, int32, null); - ListVector intVector = new ListVector("result", allocator, ft, null); - //ListVector.allocateNew(numRows); - - List output = new ArrayList(); - output.add(intVector); - eval.evaluate(batch, output); - - System.out.println(intVector.getDataVector()); - - - - // free buffers - releaseRecordBatch(batch); - releaseValueVectors(output); - eval.close(); - } - @Test public void testEvaluateDivZero() throws GandivaException, Exception { Field a = Field.nullable("a", int32); From b727684757af0eaa543b7b1792cf04928f0451b4 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 1 Nov 2023 11:12:33 -0700 Subject: [PATCH 29/46] Restore pom.xml --- java/pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/java/pom.xml b/java/pom.xml index 8237c8f06f271..747320d2f8a40 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -713,6 +713,16 @@ + format + memory + vector + tools + adapter/jdbc + flight + performance + algorithm + adapter/avro + compression From cfb56a0c660ef4a1814c4624a74bd473c8662f0b Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 1 Nov 2023 11:36:06 -0700 
Subject: [PATCH 30/46] Cleanup --- cpp/src/gandiva/array_ops.cc | 5 ----- cpp/src/gandiva/function_registry_test.cc | 9 --------- 2 files changed, 14 deletions(-) diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 802b4e20947bf..1af83a26fe31c 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -46,17 +46,14 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, std::vector outValid; for (int i = 0; i < entry_len; i++) { Type entry_item = *(entry_buf + (i * 1)); - std::cout << "LR TODO checking " << entry_item << std::endl; if (entry_item == remove_data) { //Do not add the item to remove. } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { outValid.push_back(false); newInts.push_back(0); - std::cout << "LR TODO not valid! " << i << std::endl; } else { outValid.push_back(true); newInts.push_back(entry_item); - std::cout << "LR TODO valid " << i << std::endl; } } @@ -65,11 +62,9 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, //Since this function can remove values we don't know the length ahead of time. //A fast way to compute Math.ceil(input / 8.0). 
int validByteSize = (unsigned int)((*out_len) + 7) >> 3; - std::cout << "LR TODO out_len=" << *out_len << " valid byte length is " << validByteSize << std::endl; uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, validByteSize); for (int i = 0; i < outValid.size(); i++) { - std::cout << "LR TODO setting bit " << i << " to value " << outValid[i] << std::endl; arrow::bit_util::SetBitTo(validRet, i, outValid[i]); } diff --git a/cpp/src/gandiva/function_registry_test.cc b/cpp/src/gandiva/function_registry_test.cc index 63ede751b44e3..dcbfeea3e80be 100644 --- a/cpp/src/gandiva/function_registry_test.cc +++ b/cpp/src/gandiva/function_registry_test.cc @@ -94,13 +94,4 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { << stream.str(); } -TEST_F(TestFunctionRegistry, TestFound2) { - FunctionSignature array_length("array_lengthGandiva", {list(utf8())}, arrow::int64()); - - const NativeFunction* function = registry_.LookupSignature(array_length); - EXPECT_NE(function, nullptr); - EXPECT_THAT(function->signatures(), testing::Contains(array_length)); - EXPECT_EQ(function->pc_name(), "array_utf8_length"); -} - } // namespace gandiva From 235e64886ecdf5fcfae629d2caab318eb08316e4 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 1 Nov 2023 13:09:55 -0700 Subject: [PATCH 31/46] Update jni to assign validity vector after resize --- java/gandiva/src/main/cpp/jni_common.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index d85644d7831f3..7a631ad856c47 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -90,7 +90,7 @@ static jmethodID listvector_expander_method_; static jfieldID vector_expander_ret_address_; static jfieldID vector_expander_ret_capacity_; static jfieldID list_expander_ret_address_; -static jfieldID list_expander_outer_valid_address_; +static jfieldID list_expander_valid_address_; static jfieldID 
list_expander_ret_capacity_; static jclass secondary_cache_class_; @@ -159,7 +159,7 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { env->GetFieldID(list_expander_ret_class_, "address", "J"); list_expander_ret_capacity_ = env->GetFieldID(list_expander_ret_class_, "capacity", "J"); - list_expander_outer_valid_address_ = + list_expander_valid_address_ = env->GetFieldID(list_expander_ret_class_, "validityaddress", "J"); jclass local_cache_class = @@ -948,10 +948,11 @@ Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { if (isList) { jlong ret_address = env_->GetLongField(ret, list_expander_ret_address_); jlong ret_capacity = env_->GetLongField(ret, list_expander_ret_capacity_); - jlong outer_valid_address = env_->GetLongField(ret, list_expander_outer_valid_address_); + jlong valid_address = env_->GetLongField(ret, list_expander_valid_address_); data_ = reinterpret_cast(ret_address); capacity_ = ret_capacity; + validityBuffer = reinterpret_cast(valid_address); } else { jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); From 112d5409687c121fa4112d15bea6a2a5aefb708f Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 2 Nov 2023 17:52:23 -0700 Subject: [PATCH 32/46] cleanup --- cpp/src/gandiva/array_ops.cc | 16 +++--- cpp/src/gandiva/tests/list_test.cc | 92 +----------------------------- 2 files changed, 8 insertions(+), 100 deletions(-) diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 1af83a26fe31c..257198ac62a7b 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -32,7 +32,7 @@ template Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - Type remove_data, bool entry_validWhat, + Type remove_data, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr) { 
@@ -42,7 +42,6 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, int64_t validityBitIndex = 0; //The validity index already has the current row length added to it, so decrement. validityBitIndex = validity_index_var - entry_len; - entry_validWhat = true; std::vector outValid; for (int i = 0; i < entry_len; i++) { Type entry_item = *(entry_buf + (i * 1)); @@ -64,7 +63,7 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, int validByteSize = (unsigned int)((*out_len) + 7) >> 3; uint8_t* validRet = gdv_fn_context_arena_malloc(context_ptr, validByteSize); - for (int i = 0; i < outValid.size(); i++) { + for (size_t i = 0; i < outValid.size(); i++) { arrow::bit_util::SetBitTo(validRet, i, outValid[i]); } @@ -76,7 +75,6 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, if (!combined_row_validity) { *out_len = 0; *valid_row = false; //this one is what works for the top level validity. - entry_validWhat = false; } *valid_ptr = reinterpret_cast(validRet); return reinterpret_cast(ret); @@ -85,7 +83,7 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, template bool array_contains_template(const Type* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - int32_t contains_data, + Type contains_data, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { if (!combined_row_validity) { @@ -160,7 +158,7 @@ int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, bool* valid_row, int32_t* out_len, int32_t** valid_ptr) { return array_remove_template(context_ptr, entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, entry_validWhat, + remove_data, loop_var, validity_index_var, valid_row, out_len, valid_ptr); } @@ -172,7 +170,7 @@ int64_t* array_int64_remove(int64_t context_ptr, const int64_t* entry_buf, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ return array_remove_template(context_ptr, 
entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, entry_validWhat, + remove_data, loop_var, validity_index_var, valid_row, out_len, valid_ptr); } @@ -184,7 +182,7 @@ float* array_float32_remove(int64_t context_ptr, const float* entry_buf, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ return array_remove_template(context_ptr, entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, entry_validWhat, + remove_data, loop_var, validity_index_var, valid_row, out_len, valid_ptr); } @@ -197,7 +195,7 @@ double* array_float64_remove(int64_t context_ptr, const double* entry_buf, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ return array_remove_template(context_ptr, entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, entry_validWhat, + remove_data, loop_var, validity_index_var, valid_row, out_len, valid_ptr); } diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index e065645bcff6a..c422052baceb2 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -133,88 +133,6 @@ void _test_list_type_field_alias(DataTypePtr type, ArrayPtr array, } /* -TEST_F(TestList, TestListUtf8) { - ArrayPtr array; - _build_list_array( - {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", - "eeee", "eeeee"}, - {1, 4, 3, 2, 5}, {true, true, false, true, true}, pool_, &array); - _test_list_type_field_alias(list(utf8()), array, pool_); -} - -TEST_F(TestList, TestListUtf8WithInvalidData) { - ArrayPtr array; - _build_list_array( - {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", - "eeee", "eeeee"}, - {1, 2, 3, 4, 5}, {true, false, true, true, false}, pool_, &array); - _test_list_type_field_alias(list(utf8()), array, pool_); -} - -TEST_F(TestList, TestListInt64) { - ArrayPtr array; - _build_list_array( - {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, - 50000}, - 
{1, 2, 5, 4, 3}, {true, true, true, true, false}, pool_, &array); - _test_list_type_field_alias(list(int64()), array, pool_); -} -*/ - - -/*TEST_F(TestList, TestListInt32) { - ArrayPtr array; - _build_list_array2( - {10, 20, 30, 60, 70, 80}, - {3, 3}, {true, true}, {true, true, false, true, false, true}, pool_, &array); - _test_list_type_field_alias(list(int32()), array, pool_, 2); -}*/ - -TEST_F(TestList, TestConcatWS) { - // schema for input fields - - auto field_a = field("a", utf8()); - auto field_b = field("b", utf8()); - auto field_c = field("c", utf8()); - auto schema = arrow::schema({field_a, field_b, field_c}); - - // output fields - auto res = field("res", utf8()); - - // Create a row-batch with some sample data - int num_records = 2; - auto array_a = - MakeArrowArrayUtf8({"this", "this"}, {true, true}); - auto array_b = - MakeArrowArrayUtf8({"is", "is not"}, {true, true}); - auto array_c = - MakeArrowArrayUtf8({"a test", "a test"}, {true, true}); - - - // expected output - ArrayPtr exp1; - _build_list_array2( - {10, 30, 70, 80}, - {2, 2}, {true, true}, {true, true, true, true}, pool_, &exp1); - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c}); - - auto expr = TreeExprBuilder::MakeExpression("concat_ws", {field_a, field_b, field_c}, res); - - - // Build a projector for the expressions. 
- std::shared_ptr projector; - auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); - EXPECT_TRUE(status.ok()) << status.message(); - - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()) << status.message(); - // Validate results - EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); -} - TEST_F(TestList, TestArrayRemove) { // schema for input fields auto field_b = field("b", int32()); @@ -310,14 +228,6 @@ for (auto& array_data : outputs2) { auto child_data = array_data->child_data[0]; int64_t child_data_size = 1; if (arrow::is_binary_like(child_data->type->id())) { - /* when allocate array data, child data length is an initialized value, - * after calculating, child data offsets buffer has been resized for results, - * but array data length is unchanged. - * We should recalculate child data length and make ArrayData with new length - * - * Otherwise, child data offsets buffer length is data length + 1 - * and offset data is int32_t, need use buffer->size()/4 - 1 - */ child_data_size = child_data->buffers[1]->size() / 4 - 1; } else if (child_data->type->id() == arrow::Type::INT32) { child_data_size = child_data->buffers[1]->size() / 4; @@ -466,7 +376,7 @@ TEST_F(TestList, TestListInt32Contains) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); -} +}*/ TEST_F(TestList, TestListFloat32) { ArrayPtr array; From 0e3bd9e1892626d6d8da1478ed2ea1b39ec6c702 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 3 Nov 2023 08:40:49 -0700 Subject: [PATCH 33/46] Disable tests for now. 
--- cpp/src/gandiva/array_ops_test.cc | 6 ++++-- cpp/src/gandiva/llvm_types.h | 4 ++++ cpp/src/gandiva/tests/list_test.cc | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/gandiva/array_ops_test.cc b/cpp/src/gandiva/array_ops_test.cc index 4d96b80dd4222..fe83ad6260095 100644 --- a/cpp/src/gandiva/array_ops_test.cc +++ b/cpp/src/gandiva/array_ops_test.cc @@ -29,10 +29,12 @@ TEST(TestArrayOps, TestInt32ContainsInt32) { int32_t data[] = {1, 2, 3, 4}; int32_t entry_offsets_len = 3; int32_t contains_data = 2; + int32_t entry_validity = 15; + bool valid = false; EXPECT_EQ( - array_int32_contains_int32(ctx_ptr, data, entry_offsets_len, - contains_data), + array_int32_contains_int32(ctx_ptr, data, entry_offsets_len, &entry_validity, + true, contains_data, true, 0, 0, &valid), true); } diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index 7473f0c4d6ea7..58b7c3008695f 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -121,6 +121,10 @@ class GANDIVA_EXPORT LLVMTypes { // offsets buffer is to separate data into list // not support nested list if (data_type->id() == arrow::Type::LIST) { + //Nested lists aren't supported yet. 
+ if (data_type->field(0)->type()->id() == arrow::Type::LIST) { + return NULL; + } return IRType(data_type->field(0)->type()->id()); } return IRType(data_type->id()); diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index c422052baceb2..9a6eed35c95e2 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -376,7 +376,7 @@ TEST_F(TestList, TestListInt32Contains) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); -}*/ +} TEST_F(TestList, TestListFloat32) { ArrayPtr array; @@ -394,6 +394,6 @@ TEST_F(TestList, TestListFloat64) { 2.22222, 3.33333, 4.44444, 5.55555}, {1, 2, 4, 3, 5}, {true, false, true, true, true}, pool_, &array); _test_list_type_field_alias(list(float64()), array, pool_); -} +}*/ } // namespace gandiva From 662e9f1abf4a98851d21838af7bf87ca660cb6ae Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 3 Nov 2023 09:50:43 -0700 Subject: [PATCH 34/46] Remove uneeded changes. --- .../apache/arrow/gandiva/expression/ArrowTypeHelper.java | 5 ----- .../org/apache/arrow/gandiva/expression/FunctionNode.java | 1 - .../java/org/apache/arrow/gandiva/expression/IfNode.java | 2 +- .../java/org/apache/arrow/gandiva/expression/NullNode.java | 2 +- .../org/apache/arrow/gandiva/expression/TreeBuilder.java | 6 ------ .../main/java/io/netty/buffer/PooledByteBufAllocatorL.java | 2 +- 6 files changed, 3 insertions(+), 15 deletions(-) diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 91bf5b633c590..86cf44febe826 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -151,11 +151,6 @@ private static void initArrowTypeStruct(ArrowType.Struct structType, private static void initArrowTypeList(ArrowType.List listType, 
ArrowType subType, GandivaTypes.ExtGandivaType.Builder builder) throws GandivaException { - /*if (f != null && f.getChildren().size() > 0 && f.getChildren().get(0) - .getType().getTypeID().getFlatbufID() != Type.List) { - //builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null)); - builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null, builder).getType()); - }*/ if (subType != null) { builder.setListType(arrowTypeToProtobuf(subType).getType()); } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java index 0097e2236fa07..e092facfd69ba 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/FunctionNode.java @@ -41,7 +41,6 @@ class FunctionNode implements TreeNode { this.retType = inField.getType(); if (inField.getChildren().size() > 0 && inField.getChildren().get(0) .getType().getTypeID().getFlatbufID() != Type.List) { - //builder.setListType(arrowTypeToProtobuf(f.getChildren().get(0).getType(), null)); this.retListType = inField.getChildren().get(0).getType(); } else { this.retListType = null; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java index db97675d8a298..19f9095fb7626 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/IfNode.java @@ -43,7 +43,7 @@ public GandivaTypes.TreeNode toProtobuf() throws GandivaException { ifNodeBuilder.setCond(condition.toProtobuf()); ifNodeBuilder.setThenNode(thenNode.toProtobuf()); ifNodeBuilder.setElseNode(elseNode.toProtobuf()); - ifNodeBuilder.setReturnType(ArrowTypeHelper.arrowTypeToProtobuf(retType, null)); + 
ifNodeBuilder.setReturnType(ArrowTypeHelper.arrowTypeToProtobuf(retType)); GandivaTypes.TreeNode.Builder builder = GandivaTypes.TreeNode.newBuilder(); builder.setIfNode(ifNodeBuilder.build()); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java index caeefe66fd76b..a8e7d6f82e522 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/NullNode.java @@ -32,7 +32,7 @@ class NullNode implements TreeNode { @Override public GandivaTypes.TreeNode toProtobuf() throws GandivaException { GandivaTypes.NullNode.Builder nullNode = GandivaTypes.NullNode.newBuilder(); - nullNode.setType(ArrowTypeHelper.arrowTypeToProtobuf(type, null)); + nullNode.setType(ArrowTypeHelper.arrowTypeToProtobuf(type)); GandivaTypes.TreeNode.Builder builder = GandivaTypes.TreeNode.newBuilder(); builder.setNullNode(nullNode.build()); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index a020dcda38091..f8337a25f8377 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -94,12 +94,6 @@ public static TreeNode makeField(Field field) { public static TreeNode makeFunction(String function, List children, ArrowType retType) { - StackTraceElement[] elements = Thread.currentThread().getStackTrace(); - for (int i = 1; i < elements.length; i++) { - StackTraceElement s = elements[i]; - System.out.println("\tat " + s.getClassName() + "." 
+ s.getMethodName() + - "(" + s.getFileName() + ":" + s.getLineNumber() + ")"); - } return new FunctionNode(function, children, retType); } diff --git a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index 8364b4a258889..d0a5a9945ce20 100644 --- a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -161,7 +161,7 @@ public InnerAllocator() { } private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCapacity) { - PoolArenasCache cache = threadCache(); + PoolThreadCache cache = threadCache(); PoolArena directArena = cache.directArena; if (directArena != null) { From 4bd53e22c6c0acc82108a3714ab8e43d4f3aab23 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 3 Nov 2023 15:17:38 -0700 Subject: [PATCH 35/46] Remove uneeded includes --- cpp/src/arrow/type.h | 2 +- cpp/src/gandiva/annotator.cc | 1 - cpp/src/gandiva/expr_decomposer.cc | 1 - cpp/src/gandiva/function_registry_test.cc | 1 - cpp/src/gandiva/function_signature.cc | 1 - cpp/src/gandiva/llvm_generator.h | 1 - cpp/src/gandiva/llvm_generator_test.cc | 1 + cpp/src/gandiva/local_bitmaps_holder.h | 2 - cpp/src/gandiva/lvalue.h | 6 +- cpp/src/gandiva/projector.cc | 91 ----------------------- cpp/src/gandiva/tree_expr_builder.cc | 4 +- 11 files changed, 6 insertions(+), 105 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 29ac79037d508..560805535dc4f 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1079,7 +1079,7 @@ class ARROW_EXPORT StructType : public NestedType { static constexpr const char* type_name() { return "struct"; } explicit StructType(const std::vector>& fields); - + ~StructType() override; DataTypeLayout layout() const override { diff --git a/cpp/src/gandiva/annotator.cc 
b/cpp/src/gandiva/annotator.cc index 7fc8ab94d3c05..04821c945e5f7 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -17,7 +17,6 @@ #include "gandiva/annotator.h" -#include #include #include diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index e14fcbc8952cb..719d4006e65ae 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -17,7 +17,6 @@ #include "gandiva/expr_decomposer.h" -#include #include #include #include diff --git a/cpp/src/gandiva/function_registry_test.cc b/cpp/src/gandiva/function_registry_test.cc index dcbfeea3e80be..e3c1e85f79cba 100644 --- a/cpp/src/gandiva/function_registry_test.cc +++ b/cpp/src/gandiva/function_registry_test.cc @@ -93,5 +93,4 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { "different precompiled functions:\n" << stream.str(); } - } // namespace gandiva diff --git a/cpp/src/gandiva/function_signature.cc b/cpp/src/gandiva/function_signature.cc index 2498de39e1b3b..6dc6416178e15 100644 --- a/cpp/src/gandiva/function_signature.cc +++ b/cpp/src/gandiva/function_signature.cc @@ -18,7 +18,6 @@ #include "gandiva/function_signature.h" #include -#include #include #include #include diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 594b7253b9e93..e8c15bdf00744 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -112,7 +112,6 @@ class GANDIVA_EXPORT LLVMGenerator { void Visit(const NonNullableFuncDex& dex) override; void Visit(const NullableNeverFuncDex& dex) override; void Visit(const NullableInternalFuncDex& dex) override; - //void Visit(const NullableInternalListFuncDex& dex) override; void Visit(const IfDex& dex) override; void Visit(const BooleanAndDex& dex) override; void Visit(const BooleanOrDex& dex) override; diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 0651614c816f6..028893b0b4594 100644 --- 
a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -114,4 +114,5 @@ TEST_F(TestLLVMGenerator, TestAdd) { EXPECT_THAT(out, testing::ElementsAre(6, 8, 10, 12)); EXPECT_EQ(out_bitmap, 0ULL); } + } // namespace gandiva diff --git a/cpp/src/gandiva/local_bitmaps_holder.h b/cpp/src/gandiva/local_bitmaps_holder.h index 4c3d55c47c585..a172fb973c4a5 100644 --- a/cpp/src/gandiva/local_bitmaps_holder.h +++ b/cpp/src/gandiva/local_bitmaps_holder.h @@ -44,8 +44,6 @@ class LocalBitMapsHolder { return local_bitmaps_array_.get()[idx]; } - int64_t GetNumRecords() { return num_records_; } - private: /// number of records in the current batch. int64_t num_records_; diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index 2f33a97788c6c..04862dc9d18c8 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include "arrow/util/macros.h" @@ -32,8 +31,7 @@ class GANDIVA_EXPORT LValue { public: explicit LValue(llvm::Value* data, llvm::Value* length = NULLPTR, llvm::Value* validity = NULLPTR) - : data_(data), length_(length), validity_(validity) { - } + : data_(data), length_(length), validity_(validity) {} virtual ~LValue() = default; llvm::Value* data() { return data_; } @@ -125,10 +123,8 @@ class GANDIVA_EXPORT ListLValue : public LValue { virtual std::string to_string() override { std::string s = "List LValue"; - s += " " + LValue::to_string(); - std::string str1 = "child_offsets_:"; if (child_offsets_) { llvm::raw_string_ostream output1(str1); diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index a3cccca11191d..372f097ed5cfa 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -17,7 +17,6 @@ #include "gandiva/projector.h" -#include #include #include #include @@ -30,96 +29,6 @@ namespace gandiva { -class ProjectorCacheKey { - public: - ProjectorCacheKey(SchemaPtr schema, std::shared_ptr configuration, - ExpressionVector 
expression_vector, SelectionVector::Mode mode) - : schema_(schema), configuration_(configuration), mode_(mode), uniqifier_(0) { - static const int kSeedValue = 4; - size_t result = kSeedValue; - for (auto& expr : expression_vector) { - std::string expr_as_string = expr->ToString(); - expressions_as_strings_.push_back(expr_as_string); - arrow::internal::hash_combine(result, expr_as_string); - UpdateUniqifier(expr_as_string); - } - arrow::internal::hash_combine(result, static_cast(mode)); - arrow::internal::hash_combine(result, configuration->Hash()); - arrow::internal::hash_combine(result, schema_->ToString()); - arrow::internal::hash_combine(result, uniqifier_); - hash_code_ = result; - } - - std::size_t Hash() const { return hash_code_; } - - bool operator==(const ProjectorCacheKey& other) const { - // arrow schema does not overload equality operators. - if (!(schema_->Equals(*other.schema().get(), true))) { - return false; - } - - if (*configuration_ != *other.configuration_) { - return false; - } - - if (expressions_as_strings_ != other.expressions_as_strings_) { - return false; - } - - if (mode_ != other.mode_) { - return false; - } - - if (uniqifier_ != other.uniqifier_) { - return false; - } - return true; - } - - bool operator!=(const ProjectorCacheKey& other) const { return !(*this == other); } - - SchemaPtr schema() const { return schema_; } - - std::string ToString() const { - std::stringstream ss; - // indent, window, indent_size, null_rep and skip new lines. - arrow::PrettyPrintOptions options{0, 10, 2, "null", true}; - DCHECK_OK(PrettyPrint(*schema_.get(), options, &ss)); - - ss << "Expressions: ["; - bool first = true; - for (auto& expr : expressions_as_strings_) { - if (first) { - first = false; - } else { - ss << ", "; - } - - ss << expr; - } - ss << "]"; - return ss.str(); - } - - private: - void UpdateUniqifier(const std::string& expr) { - if (uniqifier_ == 0) { - // caching of expressions with re2 patterns causes lock contention. 
So, use - // multiple instances to reduce contention. - if (expr.find(" like(") != std::string::npos) { - uniqifier_ = std::hash()(std::this_thread::get_id()) % 16; - } - } - } - - const SchemaPtr schema_; - const std::shared_ptr configuration_; - SelectionVector::Mode mode_; - std::vector expressions_as_strings_; - size_t hash_code_; - uint32_t uniqifier_; -}; - Projector::Projector(std::unique_ptr llvm_generator, SchemaPtr schema, const FieldVector& output_fields, std::shared_ptr configuration) diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 461ed2a04d8ea..82bb661ecda80 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -147,6 +147,9 @@ ExpressionPtr TreeExprBuilder::MakeExpression(NodePtr root_node, FieldPtr result if (result_field == nullptr) { return nullptr; } + if (print_expr) { + std::cout << "Expression: " << root_node->ToString() << "\n"; + } return ExpressionPtr(new Expression(root_node, result_field)); } @@ -161,7 +164,6 @@ ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function, auto node = MakeField(field); field_nodes.push_back(node); } - auto func_node = MakeFunction(function, field_nodes, out_field->type()); return MakeExpression(func_node, out_field); } From b4af98a1dbe25c97ee50d16037860414b5455156 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Mon, 6 Nov 2023 13:37:35 -0800 Subject: [PATCH 36/46] Fix unit tests --- cpp/src/gandiva/annotator.cc | 4 +++- cpp/src/gandiva/array_ops_test.cc | 2 +- cpp/src/gandiva/llvm_generator.cc | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index 04821c945e5f7..45500a785c7c7 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -62,7 +62,9 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { data_buffer_ptr_idx = buffer_count_++; } int child_valid_buffer_ptr_idx = 
FieldDescriptor::kInvalidIdx; - child_valid_buffer_ptr_idx = buffer_count_++; + if (field->type()->id() == arrow::Type::LIST) { + child_valid_buffer_ptr_idx = buffer_count_++; + } return std::make_shared(field, data_idx, validity_idx, offsets_idx, data_buffer_ptr_idx, child_offsets_idx, child_valid_buffer_ptr_idx); } diff --git a/cpp/src/gandiva/array_ops_test.cc b/cpp/src/gandiva/array_ops_test.cc index fe83ad6260095..bf01c1fe0a091 100644 --- a/cpp/src/gandiva/array_ops_test.cc +++ b/cpp/src/gandiva/array_ops_test.cc @@ -34,7 +34,7 @@ TEST(TestArrayOps, TestInt32ContainsInt32) { EXPECT_EQ( array_int32_contains_int32(ctx_ptr, data, entry_offsets_len, &entry_validity, - true, contains_data, true, 0, 0, &valid), + true, contains_data, true, 0, 3, &valid), true); } diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 11fd2d3cb1947..0c1478894a838 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -787,10 +787,8 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); int i = 0; - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; // => offset_start = offsets[loop_var] slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); - std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); // => offset_end = offsets[loop_var + 1] From a057f5815b24e12f937061dc9e2cf76d9af0b0ce Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Mon, 6 Nov 2023 14:56:46 -0800 Subject: [PATCH 37/46] Cleanup. 
--- cpp/src/gandiva/llvm_generator.cc | 1 - cpp/src/gandiva/tests/list_test.cc | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 0c1478894a838..5e676d70251fa 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -786,7 +786,6 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { llvm::Value* offsets_slot_index = builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); - int i = 0; // => offset_start = offsets[loop_var] slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc index 9a6eed35c95e2..abc7b5d7091b8 100644 --- a/cpp/src/gandiva/tests/list_test.cc +++ b/cpp/src/gandiva/tests/list_test.cc @@ -251,9 +251,7 @@ for (auto& array_data : outputs2) { } - - std::cout << "LR ====================THIRD=WAY================================== " << std::endl; - { +{ std::shared_ptr listDt = std::make_shared(); std::shared_ptr dt = std::make_shared(listDt); From be455abe3075d06f9a27b05a3024ef4828f053b1 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Mon, 6 Nov 2023 19:33:09 -0800 Subject: [PATCH 38/46] Cleanup --- cpp/src/gandiva/annotator.cc | 9 ++++++--- cpp/src/gandiva/array_ops.cc | 2 +- cpp/src/gandiva/projector.cc | 13 ++++++++----- .../gandiva/evaluator/ListVectorExpander.java | 12 ++++++------ .../arrow/gandiva/evaluator/Projector.java | 17 +++++++++++------ 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index 45500a785c7c7..abd5ba6b1a4bf 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -128,16 +128,19 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, } } + int const childDataIndex = 0; if 
(array_data.type->id() != arrow::Type::LIST) { uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); } else { uint8_t* data_buf = - const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); + const_cast(array_data.child_data.at(childDataIndex)->buffers[buffer_idx]->data()); eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); - if (array_data.child_data.at(0)->buffers[0] ) { + + int const childDataBufferIndex = 0; + if (array_data.child_data.at(childDataIndex)->buffers[childDataBufferIndex] ) { uint8_t* child_valid_buf = const_cast( - array_data.child_data.at(0)->buffers[0]->data()); + array_data.child_data.at(childDataIndex)->buffers[childDataBufferIndex]->data()); eval_batch->SetBuffer(desc.child_data_validity_idx(), child_valid_buf, 0); } diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 257198ac62a7b..99622854bdff6 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -44,7 +44,7 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, validityBitIndex = validity_index_var - entry_len; std::vector outValid; for (int i = 0; i < entry_len; i++) { - Type entry_item = *(entry_buf + (i * 1)); + Type entry_item = *(entry_buf + i); if (entry_item == remove_data) { //Do not add the item to remove. } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 372f097ed5cfa..0181ece3b2607 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -218,6 +218,9 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, llvm_generator_->Execute(batch, selection_vector, output_data_vecs)); // Create and return array arrays. 
+ int const child_data_buffer_index = 1; + int const int_data_size = 4; + int const double_data_size = 8; output->clear(); for (auto& array_data : output_data_vecs) { if (array_data->type->id() == arrow::Type::LIST) { @@ -232,15 +235,15 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, * Otherwise, child data offsets buffer length is data length + 1 * and offset data is int32_t, need use buffer->size()/4 - 1 */ - child_data_size = child_data->buffers[1]->size() / 4 - 1; + child_data_size = child_data->buffers[child_data_buffer_index]->size() / int_data_size - 1; } else if (child_data->type->id() == arrow::Type::INT32) { - child_data_size = child_data->buffers[1]->size() / 4; + child_data_size = child_data->buffers[child_data_buffer_index]->size() / int_data_size; } else if (child_data->type->id() == arrow::Type::INT64) { - child_data_size = child_data->buffers[1]->size() / 8; + child_data_size = child_data->buffers[child_data_buffer_index]->size() / double_data_size; } else if (child_data->type->id() == arrow::Type::FLOAT) { - child_data_size = child_data->buffers[1]->size() / 4; + child_data_size = child_data->buffers[child_data_buffer_index]->size() / int_data_size; } else if (child_data->type->id() == arrow::Type::DOUBLE) { - child_data_size = child_data->buffers[1]->size() / 8; + child_data_size = child_data->buffers[child_data_buffer_index]->size() / double_data_size; } auto new_child_data = arrow::ArrayData::Make( child_data->type, child_data_size, child_data->buffers, child_data->offset); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java index 4430674d19a72..1d02f38a4d591 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -25,6 +25,8 @@ */ public class ListVectorExpander 
{ private final ListVector[] bufferVectors; + public static final int valueBufferIndex = 1; + public static final int validityBufferIndex = 0; public ListVectorExpander(ListVector[] bufferVectors) { this.bufferVectors = bufferVectors; @@ -66,18 +68,16 @@ public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { throw new IllegalArgumentException("invalid index " + index); } - int valueBufferIndex = 1; - int validityBufferIndex = 0; ListVector vector = bufferVectors[index]; - while (vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity() < toCapacity) { + while (vector.getDataVector().getFieldBuffers().get(ListVectorExpander.valueBufferIndex).capacity() < toCapacity) { //Just realloc the data vector. vector.getDataVector().reAlloc(); } return new ExpandResult( - vector.getDataVector().getFieldBuffers().get(valueBufferIndex).memoryAddress(), - vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity(), - vector.getDataVector().getFieldBuffers().get(validityBufferIndex).memoryAddress()); + vector.getDataVector().getFieldBuffers().get(ListVectorExpander.valueBufferIndex).memoryAddress(), + vector.getDataVector().getFieldBuffers().get(ListVectorExpander.valueBufferIndex).capacity(), + vector.getDataVector().getFieldBuffers().get(ListVectorExpander.validityBufferIndex).memoryAddress()); } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index 5485d46882336..686539a169f57 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -358,10 +358,11 @@ private void evaluate(int numRows, List buffers, List buf idx = 0; int outColumnIdx = 0; + final int listVectorBufferCount = 5; for (ValueVector valueVector : outColumns) { if (valueVector instanceof ListVector) { - outAddrs = new long[5 * 
outColumns.size()]; - outSizes = new long[5 * outColumns.size()]; + outAddrs = new long[listVectorBufferCount * outColumns.size()]; + outSizes = new long[listVectorBufferCount * outColumns.size()]; } boolean isVarWidth = valueVector instanceof VariableWidthVector; @@ -383,12 +384,16 @@ private void evaluate(int numRows, List buffers, List buf outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); //vector valid - outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).memoryAddress(); - outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers() + .get(ListVectorExpander.validityBufferIndex).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers() + .get(ListVectorExpander.validityBufferIndex).capacity(); //vector offset - outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).memoryAddress(); - outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).capacity(); + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers() + .get(ListVectorExpander.valueBufferIndex).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers() + .get(ListVectorExpander.valueBufferIndex).capacity(); } else { outAddrs[idx] = valueVector.getDataBuffer().memoryAddress(); outSizes[idx++] = valueVector.getDataBuffer().capacity(); From 7c7a93968af9846559987698ea3349f8cadcc1e4 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 8 Nov 2023 12:25:16 -0800 Subject: [PATCH 39/46] Tidy up function results with nulls present. 
--- cpp/src/gandiva/array_ops.cc | 53 +++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc index 99622854bdff6..7170534342085 100644 --- a/cpp/src/gandiva/array_ops.cc +++ b/cpp/src/gandiva/array_ops.cc @@ -32,7 +32,7 @@ template Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - Type remove_data, + Type remove_data, bool remove_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr) { @@ -45,7 +45,7 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, std::vector outValid; for (int i = 0; i < entry_len; i++) { Type entry_item = *(entry_buf + i); - if (entry_item == remove_data) { + if (remove_data_valid && entry_item == remove_data) { //Do not add the item to remove. } else if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { outValid.push_back(false); @@ -72,10 +72,13 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); memcpy(ret, newInts.data(), outBufferLength); *valid_row = true; - if (!combined_row_validity) { + + //Return null if the input array is null or the data to remove is null. + if (!combined_row_validity || !remove_data_valid) { *out_len = 0; *valid_row = false; //this one is what works for the top level validity. 
} + *valid_ptr = reinterpret_cast(validRet); return reinterpret_cast(ret); } @@ -83,10 +86,10 @@ Type* array_remove_template(int64_t context_ptr, const Type* entry_buf, template bool array_contains_template(const Type* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - Type contains_data, + Type contains_data, bool contains_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { - if (!combined_row_validity) { + if (!combined_row_validity || !contains_data_valid) { *valid_row = false; return false; } @@ -95,15 +98,21 @@ bool array_contains_template(const Type* entry_buf, const int32_t* entry_validityAdjusted = entry_validity - (loop_var ); int64_t validityBitIndex = validity_index_var - entry_len; + bool found_null_in_data = false; for (int i = 0; i < entry_len; i++) { if (!arrow::bit_util::GetBit(reinterpret_cast(entry_validityAdjusted), validityBitIndex + i)) { + found_null_in_data = true; continue; } Type entry_item = *(entry_buf + i); - if (entry_item == contains_data) { + if (contains_data_valid && entry_item == contains_data) { return true; } } + //If there is null in the input and the item is not found the result is null. 
+ if (found_null_in_data) { + *valid_row = false; + } return false; } @@ -111,41 +120,41 @@ extern "C" { bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - int32_t contains_data, bool entry_validWhat, + int32_t contains_data, bool contains_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { return array_contains_template(entry_buf, entry_len, entry_validity, - combined_row_validity, contains_data, + combined_row_validity, contains_data, contains_data_valid, loop_var, validity_index_var, valid_row); } bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - int64_t contains_data, bool entry_validWhat, + int64_t contains_data, bool contains_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { return array_contains_template(entry_buf, entry_len, entry_validity, - combined_row_validity, contains_data, + combined_row_validity, contains_data, contains_data_valid, loop_var, validity_index_var, valid_row); } bool array_float32_contains_float32(int64_t context_ptr, const float* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - float contains_data, bool entry_validWhat, + float contains_data, bool contains_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row) { return array_contains_template(entry_buf, entry_len, entry_validity, - combined_row_validity, contains_data, + combined_row_validity, contains_data, contains_data_valid, loop_var, validity_index_var, valid_row); } bool array_float64_contains_float64(int64_t context_ptr, const double* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - double contains_data, bool entry_validWhat, + double contains_data, bool contains_data_valid, int64_t loop_var, int64_t 
validity_index_var, bool* valid_row) { return array_contains_template(entry_buf, entry_len, entry_validity, - combined_row_validity, contains_data, + combined_row_validity, contains_data, contains_data_valid, loop_var, validity_index_var, valid_row); } @@ -153,36 +162,36 @@ bool array_float64_contains_float64(int64_t context_ptr, const double* entry_buf int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - int32_t remove_data, bool entry_validWhat, + int32_t remove_data, bool remove_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr) { return array_remove_template(context_ptr, entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, + remove_data, remove_data_valid, loop_var, validity_index_var, valid_row, out_len, valid_ptr); } int64_t* array_int64_remove(int64_t context_ptr, const int64_t* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - int64_t remove_data, bool entry_validWhat, + int64_t remove_data, bool remove_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ return array_remove_template(context_ptr, entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, + remove_data, remove_data_valid, loop_var, validity_index_var, valid_row, out_len, valid_ptr); } float* array_float32_remove(int64_t context_ptr, const float* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - float remove_data, bool entry_validWhat, + float remove_data, bool remove_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ return array_remove_template(context_ptr, entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, + remove_data, remove_data_valid, loop_var, 
validity_index_var, valid_row, out_len, valid_ptr); } @@ -190,12 +199,12 @@ float* array_float32_remove(int64_t context_ptr, const float* entry_buf, double* array_float64_remove(int64_t context_ptr, const double* entry_buf, int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity, - double remove_data, bool entry_validWhat, + double remove_data, bool remove_data_valid, int64_t loop_var, int64_t validity_index_var, bool* valid_row, int32_t* out_len, int32_t** valid_ptr){ return array_remove_template(context_ptr, entry_buf, entry_len, entry_validity, combined_row_validity, - remove_data, + remove_data, remove_data_valid, loop_var, validity_index_var, valid_row, out_len, valid_ptr); } From fb5551561b20034d9c1350598338c5c8b0faef65 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 8 Nov 2023 18:57:40 -0800 Subject: [PATCH 40/46] Fix unit test --- .../arrow/gandiva/evaluator/ExpressionRegistryTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java index 8853945c6d4d4..e8552d81180b6 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java @@ -39,7 +39,7 @@ public void testTypes() throws GandivaException { public void testFunctions() throws GandivaException { ArrowType.Int uint8 = new ArrowType.Int(8, false); FunctionSignature signature = - new FunctionSignature("add", uint8, null, Lists.newArrayList(uint8, uint8)); + new FunctionSignature("add", uint8, ArrowType.Null(), Lists.newArrayList(uint8, uint8)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } @@ -48,7 +48,7 @@ public void testFunctions() throws GandivaException { public 
void testFunctionAliases() throws GandivaException { ArrowType.Int int64 = new ArrowType.Int(64, true); FunctionSignature signature = - new FunctionSignature("modulo", int64, null, Lists.newArrayList(int64, int64)); + new FunctionSignature("modulo", int64, ArrowType.Null(), Lists.newArrayList(int64, int64)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } @@ -58,7 +58,7 @@ public void testCaseInsensitiveFunctionName() throws GandivaException { ArrowType.Utf8 utf8 = new ArrowType.Utf8(); ArrowType.Int int64 = new ArrowType.Int(64, true); FunctionSignature signature = - new FunctionSignature("castvarchar", utf8, null, Lists.newArrayList(utf8, int64)); + new FunctionSignature("castvarchar", utf8, ArrowType.Null(), Lists.newArrayList(utf8, int64)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } From 66396392e32231b635df6b5febc4948abe613497 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 9 Nov 2023 12:07:08 -0800 Subject: [PATCH 41/46] Make listreturn type optional. update unit test. --- .../arrow/gandiva/evaluator/FunctionSignature.java | 14 ++++++++++++++ .../gandiva/evaluator/ExpressionRegistryTest.java | 6 +++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java index e626efedd8d9f..e46aa2f933455 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java @@ -63,6 +63,20 @@ public FunctionSignature(String name, ArrowType returnType, ArrowType returnList this.paramTypes = paramTypes; } + /** + * Ctor. + * @param name - name of the function. 
+ * @param returnType - data type of return + * @param returnListType optional list type + * @param paramTypes - data type of input args. + */ + public FunctionSignature(String name, ArrowType returnType, List paramTypes) { + this.name = name; + this.returnType = returnType; + this.returnListType = ArrowType.Null.INSTANCE; + this.paramTypes = paramTypes; + } + /** * Override equals. * @param signature - signature to compare diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java index e8552d81180b6..a51ac09ba1a51 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistryTest.java @@ -39,7 +39,7 @@ public void testTypes() throws GandivaException { public void testFunctions() throws GandivaException { ArrowType.Int uint8 = new ArrowType.Int(8, false); FunctionSignature signature = - new FunctionSignature("add", uint8, ArrowType.Null(), Lists.newArrayList(uint8, uint8)); + new FunctionSignature("add", uint8, Lists.newArrayList(uint8, uint8)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } @@ -48,7 +48,7 @@ public void testFunctions() throws GandivaException { public void testFunctionAliases() throws GandivaException { ArrowType.Int int64 = new ArrowType.Int(64, true); FunctionSignature signature = - new FunctionSignature("modulo", int64, ArrowType.Null(), Lists.newArrayList(int64, int64)); + new FunctionSignature("modulo", int64, Lists.newArrayList(int64, int64)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } @@ -58,7 +58,7 @@ public void testCaseInsensitiveFunctionName() throws GandivaException { ArrowType.Utf8 utf8 = new ArrowType.Utf8(); 
ArrowType.Int int64 = new ArrowType.Int(64, true); FunctionSignature signature = - new FunctionSignature("castvarchar", utf8, ArrowType.Null(), Lists.newArrayList(utf8, int64)); + new FunctionSignature("castvarchar", utf8, Lists.newArrayList(utf8, int64)); Set functions = ExpressionRegistry.getInstance().getSupportedFunctions(); Assert.assertTrue(functions.contains(signature)); } From 054ef1168ef4057e2ca8103819b6dfef5d3b9f68 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 9 Nov 2023 16:58:18 -0800 Subject: [PATCH 42/46] Fix javadoc --- .../org/apache/arrow/gandiva/evaluator/FunctionSignature.java | 1 - 1 file changed, 1 deletion(-) diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java index e46aa2f933455..57fa1df1ab80c 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java @@ -67,7 +67,6 @@ public FunctionSignature(String name, ArrowType returnType, ArrowType returnList * Ctor. * @param name - name of the function. * @param returnType - data type of return - * @param returnListType optional list type * @param paramTypes - data type of input args. 
*/ public FunctionSignature(String name, ArrowType returnType, List paramTypes) { From 2d9cdbf3a43bfd85019e8d22342ff8badb8e275b Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Mon, 30 Oct 2023 15:37:37 +0200 Subject: [PATCH 43/46] GH-38511: [JAVA] added impl of getTransferPair(Field, BufferAllocator, CallBack) for StructVector (#59) --- .../org/apache/arrow/vector/complex/StructVector.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index d947249fd3cdd..d0304a6fd2504 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -246,6 +246,15 @@ public TransferPair getTransferPair(Field field, BufferAllocator allocator) { allowConflictPolicyChanges), false); } + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { + return new NullableStructTransferPair(this, new StructVector(field, + allocator, + callBack, + getConflictPolicy(), + allowConflictPolicyChanges), false); + } + /** * {@link TransferPair} for this (nullable) {@link StructVector}. 
*/ From f477cc835de41b53376c537e456ab8c025d8da03 Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Tue, 31 Oct 2023 08:08:53 +0200 Subject: [PATCH 44/46] GH-38511: [JAVA] added impl of getTransferPair(Field, BufferAllocator, CallBack) for MapVector (#60) * GH-38511: [JAVA] added impl of getTransferPair(Field, BufferAllocator, CallBack) for MapVector --- .../main/java/org/apache/arrow/vector/complex/MapVector.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index c1913574bab19..e082b2f43be64 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -146,6 +146,11 @@ public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallB return new TransferImpl(ref, allocator, callBack); } + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { + return new TransferImpl(field, allocator, callBack); + } + @Override public TransferPair makeTransferPair(ValueVector target) { return new MapVector.TransferImpl((MapVector) target); From b7c52884adbf2623a830f13f94112bb6190511eb Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Tue, 14 Nov 2023 14:28:31 -0800 Subject: [PATCH 45/46] Remove unneeded code. 
--- .../org/apache/arrow/gandiva/expression/ArrowTypeHelper.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 86cf44febe826..fd1be362b8404 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -143,11 +143,6 @@ private static void initArrowTypeDate(ArrowType.Date dateType, } } - private static void initArrowTypeStruct(ArrowType.Struct structType, - GandivaTypes.ExtGandivaType.Builder builder) { - builder.setType(GandivaTypes.GandivaType.STRUCT); - } - private static void initArrowTypeList(ArrowType.List listType, ArrowType subType, GandivaTypes.ExtGandivaType.Builder builder) throws GandivaException { From b6593b36259c93208e7f3161a7634bdfb5a788dc Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Thu, 16 Nov 2023 23:59:02 -0800 Subject: [PATCH 46/46] Return list parameter type information through the function registry. 
--- .../gandiva/evaluator/ExpressionRegistry.java | 14 +++++++++++--- .../gandiva/evaluator/FunctionSignature.java | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index c870cf8f9ab8e..6abc6719d63e6 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -17,9 +17,11 @@ package org.apache.arrow.gandiva.evaluator; +import java.util.ArrayList; import java.util.List; import java.util.Set; +import org.apache.arrow.flatbuf.Type; import org.apache.arrow.gandiva.exceptions.GandivaException; import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.gandiva.ipc.GandivaTypes.ExtGandivaType; @@ -32,7 +34,6 @@ import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; -import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.protobuf.InvalidProtocolBufferException; @@ -117,9 +118,16 @@ private static Set getSupportedFunctionsFromGandiva() throws String functionName = protoFunctionSignature.getName(); ArrowType returnType = getArrowType(protoFunctionSignature.getReturnType()); ArrowType returnListType = getArrowTypeSimple(protoFunctionSignature.getReturnType().getListType()); - List paramTypes = Lists.newArrayList(); + List> paramTypes = new ArrayList>(); for (ExtGandivaType type : protoFunctionSignature.getParamTypesList()) { - paramTypes.add(getArrowType(type)); + ArrowType paramType = getArrowType(type); + ArrowType paramListType = getArrowTypeSimple(type.getListType()); + List paramArrowList = new ArrayList(); + paramArrowList.add(paramType); + if (paramType.getTypeID().getFlatbufID() == Type.List) { + paramArrowList.add(paramListType); 
+ } + paramTypes.add(paramArrowList); } FunctionSignature functionSignature = new FunctionSignature(functionName, returnType, returnListType, paramTypes); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java index 57fa1df1ab80c..c5c6aeb5372b8 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/FunctionSignature.java @@ -17,6 +17,7 @@ package org.apache.arrow.gandiva.evaluator; +import java.util.ArrayList; import java.util.List; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -31,7 +32,7 @@ public class FunctionSignature { private final String name; private final ArrowType returnType; private final ArrowType returnListType; - private final List paramTypes; + private final List> paramTypes; public ArrowType getReturnType() { return returnType; @@ -41,7 +42,7 @@ public ArrowType getReturnListType() { return returnListType; } - public List getParamTypes() { + public List> getParamTypes() { return paramTypes; } @@ -56,7 +57,8 @@ public String getName() { * @param returnListType optional list type * @param paramTypes - data type of input args. 
*/ - public FunctionSignature(String name, ArrowType returnType, ArrowType returnListType, List paramTypes) { + public FunctionSignature(String name, ArrowType returnType, ArrowType returnListType, + List> paramTypes) { this.name = name; this.returnType = returnType; this.returnListType = returnListType; @@ -73,7 +75,13 @@ public FunctionSignature(String name, ArrowType returnType, List para this.name = name; this.returnType = returnType; this.returnListType = ArrowType.Null.INSTANCE; - this.paramTypes = paramTypes; + this.paramTypes = new ArrayList>(); + for (ArrowType paramType : paramTypes) { + List paramArrowList = new ArrayList(); + paramArrowList.add(paramType); + this.paramTypes.add(paramArrowList); + } + } /**