diff --git a/cpp/src/gandiva/function_registry_common.h b/cpp/src/gandiva/function_registry_common.h index 66f94515089..2c46a2dcc04 100644 --- a/cpp/src/gandiva/function_registry_common.h +++ b/cpp/src/gandiva/function_registry_common.h @@ -229,6 +229,16 @@ typedef std::unordered_map GetHashFunctionRegistry() { static std::vector hash_fn_registry_ = { HASH32_SAFE_NULL_NEVER_FN(hash, {}), @@ -55,7 +58,9 @@ std::vector GetHashFunctionRegistry() { HASH_SHA1_NULL_NEVER_FN(hashSHA1, {}), - HASH_SHA256_NULL_NEVER_FN(hashSHA256, {})}; + HASH_SHA256_NULL_NEVER_FN(hashSHA256, {}), + + HASH_MD5_NULL_NEVER_FN(hashMD5, {})}; return hash_fn_registry_; } diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 2cac036abd5..7f78c11e679 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -184,54 +184,78 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, return 0; } -#define SHA1_HASH_FUNCTION(TYPE) \ +#define MD5_HASH_FUNCTION(TYPE) \ GANDIVA_EXPORT \ - const char* gdv_fn_sha1_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \ - int32_t* out_length) { \ + const char* gdv_fn_md5_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \ + int32_t* out_length) { \ if (!validity) { \ - return gandiva::gdv_hash_using_sha1(context, NULLPTR, 0, out_length); \ + return gandiva::gdv_md5_hash(context, NULLPTR, 0, out_length); \ } \ auto value_as_long = gandiva::gdv_double_to_long((double)value); \ - const char* result = gandiva::gdv_hash_using_sha1( \ - context, &value_as_long, sizeof(value_as_long), out_length); \ + const char* result = gandiva::gdv_md5_hash(context, &value_as_long, \ + sizeof(value_as_long), out_length); \ \ return result; \ } -#define SHA1_HASH_FUNCTION_BUF(TYPE) \ - GANDIVA_EXPORT \ - const char* gdv_fn_sha1_##TYPE(int64_t context, gdv_##TYPE value, \ - int32_t value_length, bool value_validity, \ - int32_t* out_length) { \ - if (!value_validity) { 
\ - return gandiva::gdv_hash_using_sha1(context, NULLPTR, 0, out_length); \ - } \ - return gandiva::gdv_hash_using_sha1(context, value, value_length, out_length); \ +#define MD5_HASH_FUNCTION_BUF(TYPE) \ + GANDIVA_EXPORT \ + const char* gdv_fn_md5_##TYPE(int64_t context, gdv_##TYPE value, int32_t value_length, \ + bool value_validity, int32_t* out_length) { \ + if (!value_validity) { \ + return gandiva::gdv_md5_hash(context, NULLPTR, 0, out_length); \ + } \ + return gandiva::gdv_md5_hash(context, value, value_length, out_length); \ } -#define SHA256_HASH_FUNCTION(TYPE) \ - GANDIVA_EXPORT \ - const char* gdv_fn_sha256_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \ - int32_t* out_length) { \ - if (!validity) { \ - return gandiva::gdv_hash_using_sha256(context, NULLPTR, 0, out_length); \ - } \ - auto value_as_long = gandiva::gdv_double_to_long((double)value); \ - const char* result = gandiva::gdv_hash_using_sha256( \ - context, &value_as_long, sizeof(value_as_long), out_length); \ - return result; \ +#define SHA1_HASH_FUNCTION(TYPE) \ + GANDIVA_EXPORT \ + const char* gdv_fn_sha1_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \ + int32_t* out_length) { \ + if (!validity) { \ + return gandiva::gdv_sha1_hash(context, NULLPTR, 0, out_length); \ + } \ + auto value_as_long = gandiva::gdv_double_to_long((double)value); \ + const char* result = gandiva::gdv_sha1_hash(context, &value_as_long, \ + sizeof(value_as_long), out_length); \ + \ + return result; \ } -#define SHA256_HASH_FUNCTION_BUF(TYPE) \ - GANDIVA_EXPORT \ - const char* gdv_fn_sha256_##TYPE(int64_t context, gdv_##TYPE value, \ - int32_t value_length, bool value_validity, \ - int32_t* out_length) { \ - if (!value_validity) { \ - return gandiva::gdv_hash_using_sha256(context, NULLPTR, 0, out_length); \ - } \ - \ - return gandiva::gdv_hash_using_sha256(context, value, value_length, out_length); \ +#define SHA1_HASH_FUNCTION_BUF(TYPE) \ + GANDIVA_EXPORT \ + const char* gdv_fn_sha1_##TYPE(int64_t 
context, gdv_##TYPE value, \ + int32_t value_length, bool value_validity, \ + int32_t* out_length) { \ + if (!value_validity) { \ + return gandiva::gdv_sha1_hash(context, NULLPTR, 0, out_length); \ + } \ + return gandiva::gdv_sha1_hash(context, value, value_length, out_length); \ + } + +#define SHA256_HASH_FUNCTION(TYPE) \ + GANDIVA_EXPORT \ + const char* gdv_fn_sha256_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \ + int32_t* out_length) { \ + if (!validity) { \ + return gandiva::gdv_sha256_hash(context, NULLPTR, 0, out_length); \ + } \ + auto value_as_long = gandiva::gdv_double_to_long((double)value); \ + const char* result = gandiva::gdv_sha256_hash(context, &value_as_long, \ + sizeof(value_as_long), out_length); \ + return result; \ + } + +#define SHA256_HASH_FUNCTION_BUF(TYPE) \ + GANDIVA_EXPORT \ + const char* gdv_fn_sha256_##TYPE(int64_t context, gdv_##TYPE value, \ + int32_t value_length, bool value_validity, \ + int32_t* out_length) { \ + if (!value_validity) { \ + return gandiva::gdv_sha256_hash(context, NULLPTR, 0, out_length); \ + } \ + \ + return gandiva::gdv_sha256_hash(context, value, value_length, out_length); \ } // Expand inner macro for all numeric types. 
@@ -257,6 +281,9 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, INNER(utf8) \ INNER(binary) +SHA_NUMERIC_BOOL_DATE_PARAMS(MD5_HASH_FUNCTION) +SHA_VAR_LEN_PARAMS(MD5_HASH_FUNCTION_BUF) + SHA_NUMERIC_BOOL_DATE_PARAMS(SHA256_HASH_FUNCTION) SHA_VAR_LEN_PARAMS(SHA256_HASH_FUNCTION_BUF) @@ -267,17 +294,28 @@ SHA_VAR_LEN_PARAMS(SHA1_HASH_FUNCTION_BUF) #undef SHA_VAR_LEN_PARAMS // Add functions for decimal128 +GANDIVA_EXPORT +const char* gdv_fn_md5_decimal128(int64_t context, int64_t x_high, uint64_t x_low, + int32_t /*x_precision*/, int32_t /*x_scale*/, + gdv_boolean x_isvalid, int32_t* out_length) { + if (!x_isvalid) { + return gandiva::gdv_md5_hash(context, NULLPTR, 0, out_length); + } + + const gandiva::BasicDecimal128 decimal_128(x_high, x_low); + return gandiva::gdv_md5_hash(context, decimal_128.ToBytes().data(), 16, out_length); +} + GANDIVA_EXPORT const char* gdv_fn_sha256_decimal128(int64_t context, int64_t x_high, uint64_t x_low, int32_t /*x_precision*/, int32_t /*x_scale*/, gdv_boolean x_isvalid, int32_t* out_length) { if (!x_isvalid) { - return gandiva::gdv_hash_using_sha256(context, NULLPTR, 0, out_length); + return gandiva::gdv_sha256_hash(context, NULLPTR, 0, out_length); } const gandiva::BasicDecimal128 decimal_128(x_high, x_low); - return gandiva::gdv_hash_using_sha256(context, decimal_128.ToBytes().data(), 16, - out_length); + return gandiva::gdv_sha256_hash(context, decimal_128.ToBytes().data(), 16, out_length); } GANDIVA_EXPORT @@ -285,12 +323,11 @@ const char* gdv_fn_sha1_decimal128(int64_t context, int64_t x_high, uint64_t x_l int32_t /*x_precision*/, int32_t /*x_scale*/, gdv_boolean x_isvalid, int32_t* out_length) { if (!x_isvalid) { - return gandiva::gdv_hash_using_sha1(context, NULLPTR, 0, out_length); + return gandiva::gdv_sha1_hash(context, NULLPTR, 0, out_length); } const gandiva::BasicDecimal128 decimal_128(x_high, x_low); - return gandiva::gdv_hash_using_sha1(context, decimal_128.ToBytes().data(), 16, - 
out_length); + return gandiva::gdv_sha1_hash(context, decimal_128.ToBytes().data(), 16, out_length); } int32_t gdv_fn_dec_from_string(int64_t context, const char* in, int32_t in_length, @@ -1128,6 +1165,195 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { args, reinterpret_cast(gdv_fn_castFLOAT8_varbinary)); + // gdv_fn_md5_int8 + args = { + types->i64_type(), // context + types->i8_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_int8", types->i8_ptr_type() /*return_type*/, + args, reinterpret_cast(gdv_fn_md5_int8)); + + // gdv_fn_md5_int16 + args = { + types->i64_type(), // context + types->i16_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_int16", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_int16)); + + // gdv_fn_md5_int32 + args = { + types->i64_type(), // context + types->i32_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_int32", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_int32)); + + // gdv_fn_md5_int64 + args = { + types->i64_type(), // context + types->i64_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_int64", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_int64)); + + // gdv_fn_md5_uint8 + args = { + types->i64_type(), // context + types->i8_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_uint8", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_uint8)); + + // gdv_fn_md5_uint16 + args = { + types->i64_type(), // context + types->i16_type(), // value + types->i1_type(), // 
validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_uint16", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_uint16)); + + // gdv_fn_md5_uint32 + args = { + types->i64_type(), // context + types->i32_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_uint32", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_uint32)); + + // gdv_fn_md5_uint64 + args = { + types->i64_type(), // context + types->i64_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_uint64", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_uint64)); + + // gdv_fn_md5_float32 + args = { + types->i64_type(), // context + types->float_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_float32", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_float32)); + + // gdv_fn_md5_float64 + args = { + types->i64_type(), // context + types->double_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_float64", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_float64)); + + // gdv_fn_md5_boolean + args = { + types->i64_type(), // context + types->i1_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_boolean", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_boolean)); + + // gdv_fn_md5_date64 + args = { + types->i64_type(), // context + types->i64_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + 
engine->AddGlobalMappingForFunc("gdv_fn_md5_date64", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_date64)); + + // gdv_fn_md5_date32 + args = { + types->i64_type(), // context + types->i32_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_date32", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_date32)); + + // gdv_fn_md5_time32 + args = { + types->i64_type(), // context + types->i32_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_time32", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_time32)); + + // gdv_fn_md5_timestamp + args = { + types->i64_type(), // context + types->i64_type(), // value + types->i1_type(), // validity + types->i32_ptr_type() // out_length + }; + engine->AddGlobalMappingForFunc("gdv_fn_md5_timestamp", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_timestamp)); + + // gdv_fn_md5_utf8 + args = { + types->i64_type(), // context + types->i8_ptr_type(), // const char* + types->i32_type(), // value_length + types->i1_type(), // validity + types->i32_ptr_type() // out + }; + + engine->AddGlobalMappingForFunc("gdv_fn_md5_utf8", types->i8_ptr_type() /*return_type*/, + args, reinterpret_cast(gdv_fn_md5_utf8)); + + // gdv_fn_md5_binary + args = { + types->i64_type(), // context + types->i8_ptr_type(), // const char* + types->i32_type(), // value_length + types->i1_type(), // validity + types->i32_ptr_type() // out + }; + + engine->AddGlobalMappingForFunc("gdv_fn_md5_binary", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_binary)); + // gdv_fn_sha1_int8 args = { types->i64_type(), // context @@ -1563,6 +1789,21 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i8_ptr_type() /*return_type*/, args, 
reinterpret_cast(gdv_fn_base64_decode_utf8)); + // gdv_fn_MD5_decimal128 + args = { + types->i64_type(), // context + types->i64_type(), // high_bits + types->i64_type(), // low_bits + types->i32_type(), // precision + types->i32_type(), // scale + types->i1_type(), // validity + types->i32_ptr_type() // out length + }; + + engine->AddGlobalMappingForFunc("gdv_fn_md5_decimal128", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_md5_decimal128)); + // gdv_fn_upper_utf8 args = { types->i64_type(), // context diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 670ac94df1b..8af3f42de4c 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -107,6 +107,11 @@ const char* gdv_fn_sha256_decimal128(int64_t context, int64_t x_high, uint64_t x int32_t x_precision, int32_t x_scale, gdv_boolean x_isvalid, int32_t* out_length); +GANDIVA_EXPORT +const char* gdv_fn_md5_decimal128(int64_t context, int64_t x_high, uint64_t x_low, + int32_t x_precision, int32_t x_scale, + gdv_boolean x_isvalid, int32_t* out_length); + GANDIVA_EXPORT const char* gdv_fn_sha1_decimal128(int64_t context, int64_t x_high, uint64_t x_low, int32_t x_precision, int32_t x_scale, diff --git a/cpp/src/gandiva/hash_utils.cc b/cpp/src/gandiva/hash_utils.cc index 8ebf60a9b88..493eec48c29 100644 --- a/cpp/src/gandiva/hash_utils.cc +++ b/cpp/src/gandiva/hash_utils.cc @@ -24,20 +24,28 @@ namespace gandiva { /// Hashes a generic message using the SHA256 algorithm GANDIVA_EXPORT -const char* gdv_hash_using_sha256(int64_t context, const void* message, - size_t message_length, int32_t* out_length) { +const char* gdv_sha256_hash(int64_t context, const void* message, size_t message_length, + int32_t* out_length) { constexpr int sha256_result_length = 64; - return gdv_hash_using_sha(context, message, message_length, EVP_sha256(), - sha256_result_length, out_length); + return gdv_hash_using_openssl(context, message, 
message_length, EVP_sha256(), + sha256_result_length, out_length); } /// Hashes a generic message using the SHA1 algorithm GANDIVA_EXPORT -const char* gdv_hash_using_sha1(int64_t context, const void* message, - size_t message_length, int32_t* out_length) { +const char* gdv_sha1_hash(int64_t context, const void* message, size_t message_length, + int32_t* out_length) { constexpr int sha1_result_length = 40; - return gdv_hash_using_sha(context, message, message_length, EVP_sha1(), - sha1_result_length, out_length); + return gdv_hash_using_openssl(context, message, message_length, EVP_sha1(), + sha1_result_length, out_length); +} + +GANDIVA_EXPORT +const char* gdv_md5_hash(int64_t context, const void* message, size_t message_length, + int32_t* out_length) { + constexpr int md5_result_length = 32; + return gdv_hash_using_openssl(context, message, message_length, EVP_md5(), + md5_result_length, out_length); } /// \brief Hashes a generic message using SHA algorithm. @@ -46,9 +54,9 @@ const char* gdv_hash_using_sha1(int64_t context, const void* message, /// the hash. The type of the hash is defined by the /// \b hash_type \b parameter. 
GANDIVA_EXPORT -const char* gdv_hash_using_sha(int64_t context, const void* message, - size_t message_length, const EVP_MD* hash_type, - uint32_t result_buf_size, int32_t* out_length) { +const char* gdv_hash_using_openssl(int64_t context, const void* message, + size_t message_length, const EVP_MD* hash_type, + uint32_t result_buf_size, int32_t* out_length) { EVP_MD_CTX* md_ctx = EVP_MD_CTX_new(); if (md_ctx == nullptr) { diff --git a/cpp/src/gandiva/hash_utils.h b/cpp/src/gandiva/hash_utils.h index 483993f3009..a7d3b48c308 100644 --- a/cpp/src/gandiva/hash_utils.h +++ b/cpp/src/gandiva/hash_utils.h @@ -25,17 +25,21 @@ namespace gandiva { GANDIVA_EXPORT -const char* gdv_hash_using_sha256(int64_t context, const void* message, - size_t message_length, int32_t* out_length); +const char* gdv_sha256_hash(int64_t context, const void* message, size_t message_length, + int32_t* out_length); GANDIVA_EXPORT -const char* gdv_hash_using_sha1(int64_t context, const void* message, - size_t message_length, int32_t* out_length); +const char* gdv_sha1_hash(int64_t context, const void* message, size_t message_length, + int32_t* out_length); GANDIVA_EXPORT -const char* gdv_hash_using_sha(int64_t context, const void* message, - size_t message_length, const EVP_MD* hash_type, - uint32_t result_buf_size, int32_t* out_length); +const char* gdv_hash_using_openssl(int64_t context, const void* message, + size_t message_length, const EVP_MD* hash_type, + uint32_t result_buf_size, int32_t* out_length); + +GANDIVA_EXPORT +const char* gdv_md5_hash(int64_t context, const void* message, size_t message_length, + int32_t* out_length); GANDIVA_EXPORT uint64_t gdv_double_to_long(double value); diff --git a/cpp/src/gandiva/hash_utils_test.cc b/cpp/src/gandiva/hash_utils_test.cc index a8f55e1ede7..b4d66f1aa6f 100644 --- a/cpp/src/gandiva/hash_utils_test.cc +++ b/cpp/src/gandiva/hash_utils_test.cc @@ -46,7 +46,7 @@ TEST(TestShaHashUtils, TestSha1Numeric) { for (auto value : values_to_be_hashed) { int 
out_length; const char* sha_1 = - gandiva::gdv_hash_using_sha1(ctx_ptr, &value, sizeof(value), &out_length); + gandiva::gdv_sha1_hash(ctx_ptr, &value, sizeof(value), &out_length); std::string sha1_as_str(sha_1, out_length); EXPECT_EQ(sha1_as_str.size(), sha1_size); @@ -81,7 +81,7 @@ TEST(TestShaHashUtils, TestSha256Numeric) { for (auto value : values_to_be_hashed) { int out_length; const char* sha_256 = - gandiva::gdv_hash_using_sha256(ctx_ptr, &value, sizeof(value), &out_length); + gandiva::gdv_sha256_hash(ctx_ptr, &value, sizeof(value), &out_length); std::string sha256_as_str(sha_256, out_length); EXPECT_EQ(sha256_as_str.size(), sha256_size); @@ -91,6 +91,40 @@ TEST(TestShaHashUtils, TestSha256Numeric) { } } +TEST(TestShaHashUtils, TestMD5Numeric) { + gandiva::ExecutionContext ctx; + + auto ctx_ptr = reinterpret_cast(&ctx); + + std::vector values_to_be_hashed; + + // Generate a list of values to obtain the MD5 hash + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.0)); + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.1)); + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.2)); + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(-0.10000001)); + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(-0.0000001)); + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(1.000000)); + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(-0.0000002)); + values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.999999)); + + // Checks if the hash value is different for each one of the values + std::unordered_set md5_values; + + int md5_size = 32; + + for (auto value : values_to_be_hashed) { + int out_length; + const char* md5 = gandiva::gdv_md5_hash(ctx_ptr, &value, sizeof(value), &out_length); + std::string md5_as_str(md5, out_length); + EXPECT_EQ(md5_as_str.size(), md5_size); + + // The value must not already exist in the set of hash results + EXPECT_EQ(md5_values.find(md5_as_str), md5_values.end()); + 
md5_values.insert(md5_as_str); + } +} + TEST(TestShaHashUtils, TestSha1Varlen) { gandiva::ExecutionContext ctx; @@ -113,14 +147,14 @@ TEST(TestShaHashUtils, TestSha1Varlen) { const int sha1_size = 40; int out_length; - const char* sha_1 = gandiva::gdv_hash_using_sha1(ctx_ptr, first_string.c_str(), - first_string.size(), &out_length); + const char* sha_1 = gandiva::gdv_sha1_hash(ctx_ptr, first_string.c_str(), + first_string.size(), &out_length); std::string sha1_as_str(sha_1, out_length); EXPECT_EQ(sha1_as_str.size(), sha1_size); EXPECT_EQ(sha1_as_str, expected_first_result); - const char* sha_2 = gandiva::gdv_hash_using_sha1(ctx_ptr, second_string.c_str(), - second_string.size(), &out_length); + const char* sha_2 = gandiva::gdv_sha1_hash(ctx_ptr, second_string.c_str(), + second_string.size(), &out_length); std::string sha2_as_str(sha_2, out_length); EXPECT_EQ(sha2_as_str.size(), sha1_size); EXPECT_EQ(sha2_as_str, expected_second_result); @@ -150,15 +184,49 @@ TEST(TestShaHashUtils, TestSha256Varlen) { const int sha256_size = 64; int out_length; - const char* sha_1 = gandiva::gdv_hash_using_sha256(ctx_ptr, first_string.c_str(), - first_string.size(), &out_length); + const char* sha_1 = gandiva::gdv_sha256_hash(ctx_ptr, first_string.c_str(), + first_string.size(), &out_length); std::string sha1_as_str(sha_1, out_length); EXPECT_EQ(sha1_as_str.size(), sha256_size); EXPECT_EQ(sha1_as_str, expected_first_result); - const char* sha_2 = gandiva::gdv_hash_using_sha256(ctx_ptr, second_string.c_str(), - second_string.size(), &out_length); + const char* sha_2 = gandiva::gdv_sha256_hash(ctx_ptr, second_string.c_str(), + second_string.size(), &out_length); std::string sha2_as_str(sha_2, out_length); EXPECT_EQ(sha2_as_str.size(), sha256_size); EXPECT_EQ(sha2_as_str, expected_second_result); } + +TEST(TestShaHashUtils, TestMD5Varlen) { + gandiva::ExecutionContext ctx; + + auto ctx_ptr = reinterpret_cast(&ctx); + + std::string first_string = + "ði ıntəˈnæʃənəl fəˈnɛtık 
əsoʊsiˈeıʃnY [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]"; + + std::string second_string = + "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeınY [ˈʏpsilɔn], " + "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ"; + + // The strings expected hashes are obtained from shell executing the following command: + // echo -n | openssl dgst md5 + std::string expected_first_result = "a633460644425b44e0e023d6980849cc"; + std::string expected_second_result = "407983529dba21e95d95951ccffd30c3"; + + // Generate the hashes and compare with expected outputs + const int md5_size = 32; + int out_length; + + const char* md5_1 = gandiva::gdv_md5_hash(ctx_ptr, first_string.c_str(), + first_string.size(), &out_length); + std::string md5_as_str(md5_1, out_length); + EXPECT_EQ(md5_as_str.size(), md5_size); + EXPECT_EQ(md5_as_str, expected_first_result); + + const char* md5_2 = gandiva::gdv_md5_hash(ctx_ptr, second_string.c_str(), + second_string.size(), &out_length); + std::string md5_2_as_str(md5_2, out_length); + EXPECT_EQ(md5_2_as_str.size(), md5_size); + EXPECT_EQ(md5_2_as_str, expected_second_result); +} diff --git a/cpp/src/gandiva/tests/hash_test.cc b/cpp/src/gandiva/tests/hash_test.cc index 40ebc50a271..0a574f3267f 100644 --- a/cpp/src/gandiva/tests/hash_test.cc +++ b/cpp/src/gandiva/tests/hash_test.cc @@ -612,4 +612,144 @@ TEST_F(TestHash, TestSha256FunctionsAlias) { EXPECT_ARROW_ARRAY_EQUALS(outputs.at(4), outputs.at(5)); // hashSha2 and sha256 responses } + +TEST_F(TestHash, TestMD5Simple) { + // schema for input fields + auto field_a = field("a", int32()); + auto field_b = field("b", int64()); + auto field_c = field("c", float32()); + auto field_d = field("d", float64()); + auto schema = arrow::schema({field_a, field_b, field_c, field_d}); + + // output fields + auto res_0 = field("res0", utf8()); + auto res_1 = field("res1", utf8()); + auto res_2 = field("res2", utf8()); + auto res_3 = field("res3", utf8()); + + // build expressions. 
+ // hashMD5(a) + auto node_a = TreeExprBuilder::MakeField(field_a); + auto hashMD5_1 = TreeExprBuilder::MakeFunction("hashMD5", {node_a}, utf8()); + auto expr_0 = TreeExprBuilder::MakeExpression(hashMD5_1, res_0); + + auto node_b = TreeExprBuilder::MakeField(field_b); + auto hashMD5_2 = TreeExprBuilder::MakeFunction("hashMD5", {node_b}, utf8()); + auto expr_1 = TreeExprBuilder::MakeExpression(hashMD5_2, res_1); + + auto node_c = TreeExprBuilder::MakeField(field_c); + auto hashMD5_3 = TreeExprBuilder::MakeFunction("hashMD5", {node_c}, utf8()); + auto expr_2 = TreeExprBuilder::MakeExpression(hashMD5_3, res_2); + + auto node_d = TreeExprBuilder::MakeField(field_d); + auto hashMD5_4 = TreeExprBuilder::MakeFunction("hashMD5", {node_d}, utf8()); + auto expr_3 = TreeExprBuilder::MakeExpression(hashMD5_4, res_3); + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr_0, expr_1, expr_2, expr_3}, + TestConfiguration(), &projector); + ASSERT_OK(status) << status.message(); + + // Create a row-batch with some sample data + int num_records = 2; + auto validity_array = {false, true}; + + auto array_int32 = MakeArrowArrayInt32({1, 0}, validity_array); + + auto array_int64 = MakeArrowArrayInt64({1, 0}, validity_array); + + auto array_float32 = MakeArrowArrayFloat32({1.0, 0.0}, validity_array); + + auto array_float64 = MakeArrowArrayFloat64({1.0, 0.0}, validity_array); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make( + schema, num_records, {array_int32, array_int64, array_float32, array_float64}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + ASSERT_OK(status); + + auto response_int32 = outputs.at(0); + auto response_int64 = outputs.at(1); + auto response_float32 = outputs.at(2); + auto response_float64 = outputs.at(3); + + // Checks if the null and zero representation for numeric values + // are consistent between 
the types + EXPECT_ARROW_ARRAY_EQUALS(response_int32, response_int64); + EXPECT_ARROW_ARRAY_EQUALS(response_int64, response_float32); + EXPECT_ARROW_ARRAY_EQUALS(response_float32, response_float64); + + const int MD5_hash_size = 32; + + // Checks if the hash size in response is correct + for (int i = 1; i < num_records; ++i) { + const auto& value_at_position = response_int32->GetScalar(i).ValueOrDie()->ToString(); + + EXPECT_EQ(value_at_position.size(), MD5_hash_size); + EXPECT_NE(value_at_position, + response_int32->GetScalar(i - 1).ValueOrDie()->ToString()); + } +} + +TEST_F(TestHash, TestMD5Varlen) { + // schema for input fields + auto field_a = field("a", utf8()); + auto schema = arrow::schema({field_a}); + + // output fields + auto res_0 = field("res0", utf8()); + + // build expressions. + // hashMD5(a) + auto node_a = TreeExprBuilder::MakeField(field_a); + auto hashMD5 = TreeExprBuilder::MakeFunction("hashMD5", {node_a}, utf8()); + auto expr_0 = TreeExprBuilder::MakeExpression(hashMD5, res_0); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr_0}, TestConfiguration(), &projector); + ASSERT_OK(status) << status.message(); + + // Create a row-batch with some sample data + int num_records = 3; + + std::string first_string = + "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY [ˈʏpsilɔn], " + "Yen [jɛn], Yoga [ˈjoːgɑ]"; + std::string second_string = + "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY [ˈʏpsilɔn], " + "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ"; + + auto array_a = + MakeArrowArrayUtf8({"", first_string, second_string}, {false, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + ASSERT_OK(status); + + auto response = outputs.at(0); + const int MD5_hash_size = 32; + + EXPECT_EQ(response->null_count(), 0); + + // Checks that the null value was hashed + EXPECT_NE(response->GetScalar(0).ValueOrDie()->ToString(), ""); + EXPECT_EQ(response->GetScalar(0).ValueOrDie()->ToString().size(), MD5_hash_size); + + // Check that all generated hashes were different + for (int i = 1; i < num_records; ++i) { + const auto& value_at_position = response->GetScalar(i).ValueOrDie()->ToString(); + + EXPECT_EQ(value_at_position.size(), MD5_hash_size); + EXPECT_NE(value_at_position, response->GetScalar(i - 1).ValueOrDie()->ToString()); + } +} } // namespace gandiva