Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cpp/src/gandiva/function_registry_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,16 @@ typedef std::unordered_map<const FunctionSignature*, const NativeFunction*, KeyH
ARROW_STRINGIFY(gdv_fn_sha256_##TYPE), \
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)

// HashMD5 functions where:
// - NULL handling is of type NULL_NEVER
// - the function needs an execution context and can return errors
//
// The native function name is built from the base name and the input type
// name, e.g. gdv_fn_md5_float64. The result type is utf8().
//
// NOTE(review): the ALIASES argument is not used in the expansion; the second
// NativeFunction argument is always {"md5"}.
#define HASH_MD5_NULL_NEVER(NAME, ALIASES, TYPE) \
NativeFunction(#NAME, {"md5"}, DataTypeVector{TYPE()}, utf8(), kResultNullNever, \
ARROW_STRINGIFY(gdv_fn_md5_##TYPE), \
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)

// Iterate the inner macro over all numeric types
#define BASE_NUMERIC_TYPES(INNER, NAME, ALIASES) \
INNER(NAME, ALIASES, int8), INNER(NAME, ALIASES, int16), INNER(NAME, ALIASES, int32), \
Expand Down
7 changes: 6 additions & 1 deletion cpp/src/gandiva/function_registry_hash.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ namespace gandiva {
// Expands HASH_SHA256_NULL_NEVER through NUMERIC_BOOL_DATE_VAR_LEN_TYPES,
// forwarding `name` and `ALIASES` unchanged.
// NOTE(review): presumably yields one registry entry per numeric, bool, date
// and variable-length type - confirm against the definition of
// NUMERIC_BOOL_DATE_VAR_LEN_TYPES.
#define HASH_SHA256_NULL_NEVER_FN(name, ALIASES) \
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_SHA256_NULL_NEVER, name, ALIASES)

// Expands HASH_MD5_NULL_NEVER through NUMERIC_BOOL_DATE_VAR_LEN_TYPES,
// forwarding `name` and `ALIASES` unchanged.
// NOTE(review): presumably yields one registry entry per numeric, bool, date
// and variable-length type - confirm against the definition of
// NUMERIC_BOOL_DATE_VAR_LEN_TYPES.
#define HASH_MD5_NULL_NEVER_FN(name, ALIASES) \
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_MD5_NULL_NEVER, name, ALIASES)

std::vector<NativeFunction> GetHashFunctionRegistry() {
static std::vector<NativeFunction> hash_fn_registry_ = {
HASH32_SAFE_NULL_NEVER_FN(hash, {}),
Expand All @@ -55,7 +58,9 @@ std::vector<NativeFunction> GetHashFunctionRegistry() {

HASH_SHA1_NULL_NEVER_FN(hashSHA1, {}),

HASH_SHA256_NULL_NEVER_FN(hashSHA256, {})};
HASH_SHA256_NULL_NEVER_FN(hashSHA256, {}),

HASH_MD5_NULL_NEVER_FN(hashMD5, {})};

return hash_fn_registry_;
}
Expand Down
325 changes: 283 additions & 42 deletions cpp/src/gandiva/gdv_function_stubs.cc

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ const char* gdv_fn_sha256_decimal128(int64_t context, int64_t x_high, uint64_t x
int32_t x_precision, int32_t x_scale,
gdv_boolean x_isvalid, int32_t* out_length);

/// MD5 stub for decimal128 inputs; its signature mirrors the sha1 and sha256
/// decimal128 stubs declared next to it.
/// NOTE(review): presumably x_high/x_low are the two 64-bit halves of the
/// decimal value and the returned buffer holds *out_length characters -
/// confirm against the definition in gdv_function_stubs.cc.
GANDIVA_EXPORT
const char* gdv_fn_md5_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
int32_t x_precision, int32_t x_scale,
gdv_boolean x_isvalid, int32_t* out_length);

GANDIVA_EXPORT
const char* gdv_fn_sha1_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
int32_t x_precision, int32_t x_scale,
Expand Down
30 changes: 19 additions & 11 deletions cpp/src/gandiva/hash_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,28 @@
namespace gandiva {
/// Hashes a generic message using the SHA256 algorithm
GANDIVA_EXPORT
const char* gdv_hash_using_sha256(int64_t context, const void* message,
size_t message_length, int32_t* out_length) {
const char* gdv_sha256_hash(int64_t context, const void* message, size_t message_length,
int32_t* out_length) {
constexpr int sha256_result_length = 64;
return gdv_hash_using_sha(context, message, message_length, EVP_sha256(),
sha256_result_length, out_length);
return gdv_hash_using_openssl(context, message, message_length, EVP_sha256(),
sha256_result_length, out_length);
}

/// Hashes a generic message using the SHA1 algorithm
GANDIVA_EXPORT
const char* gdv_hash_using_sha1(int64_t context, const void* message,
size_t message_length, int32_t* out_length) {
const char* gdv_sha1_hash(int64_t context, const void* message, size_t message_length,
int32_t* out_length) {
constexpr int sha1_result_length = 40;
return gdv_hash_using_sha(context, message, message_length, EVP_sha1(),
sha1_result_length, out_length);
return gdv_hash_using_openssl(context, message, message_length, EVP_sha1(),
sha1_result_length, out_length);
}

/// Hashes a generic message using the MD5 algorithm
///
/// Forwards to gdv_hash_using_openssl with the EVP_md5() digest and a result
/// size of 32, the length of an MD5 digest written as hexadecimal text.
GANDIVA_EXPORT
const char* gdv_md5_hash(int64_t context, const void* message, size_t message_length,
                         int32_t* out_length) {
  // Result size requested from the generic OpenSSL-based helper.
  constexpr int kMd5ResultLength = 32;

  return gdv_hash_using_openssl(context, message, message_length, EVP_md5(),
                                kMd5ResultLength, out_length);
}

/// \brief Hashes a generic message using an OpenSSL digest algorithm (SHA1, SHA256 or MD5).
Expand All @@ -46,9 +54,9 @@ const char* gdv_hash_using_sha1(int64_t context, const void* message,
/// the hash. The type of the hash is defined by the
/// \b hash_type \b parameter.
GANDIVA_EXPORT
const char* gdv_hash_using_sha(int64_t context, const void* message,
size_t message_length, const EVP_MD* hash_type,
uint32_t result_buf_size, int32_t* out_length) {
const char* gdv_hash_using_openssl(int64_t context, const void* message,
size_t message_length, const EVP_MD* hash_type,
uint32_t result_buf_size, int32_t* out_length) {
EVP_MD_CTX* md_ctx = EVP_MD_CTX_new();

if (md_ctx == nullptr) {
Expand Down
18 changes: 11 additions & 7 deletions cpp/src/gandiva/hash_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,21 @@

namespace gandiva {
GANDIVA_EXPORT
const char* gdv_hash_using_sha256(int64_t context, const void* message,
size_t message_length, int32_t* out_length);
const char* gdv_sha256_hash(int64_t context, const void* message, size_t message_length,
int32_t* out_length);

GANDIVA_EXPORT
const char* gdv_hash_using_sha1(int64_t context, const void* message,
size_t message_length, int32_t* out_length);
const char* gdv_sha1_hash(int64_t context, const void* message, size_t message_length,
int32_t* out_length);

GANDIVA_EXPORT
const char* gdv_hash_using_sha(int64_t context, const void* message,
size_t message_length, const EVP_MD* hash_type,
uint32_t result_buf_size, int32_t* out_length);
const char* gdv_hash_using_openssl(int64_t context, const void* message,
size_t message_length, const EVP_MD* hash_type,
uint32_t result_buf_size, int32_t* out_length);

/// Hashes a generic message of `message_length` bytes using the MD5 algorithm.
/// The length of the returned text is written to `out_length`.
GANDIVA_EXPORT
const char* gdv_md5_hash(int64_t context, const void* message, size_t message_length,
int32_t* out_length);

/// Converts a double into a uint64_t.
/// NOTE(review): whether this reinterprets the bit pattern or performs a
/// numeric conversion is not visible here - confirm against the definition.
GANDIVA_EXPORT
uint64_t gdv_double_to_long(double value);
Expand Down
88 changes: 78 additions & 10 deletions cpp/src/gandiva/hash_utils_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ TEST(TestShaHashUtils, TestSha1Numeric) {
for (auto value : values_to_be_hashed) {
int out_length;
const char* sha_1 =
gandiva::gdv_hash_using_sha1(ctx_ptr, &value, sizeof(value), &out_length);
gandiva::gdv_sha1_hash(ctx_ptr, &value, sizeof(value), &out_length);
std::string sha1_as_str(sha_1, out_length);
EXPECT_EQ(sha1_as_str.size(), sha1_size);

Expand Down Expand Up @@ -81,7 +81,7 @@ TEST(TestShaHashUtils, TestSha256Numeric) {
for (auto value : values_to_be_hashed) {
int out_length;
const char* sha_256 =
gandiva::gdv_hash_using_sha256(ctx_ptr, &value, sizeof(value), &out_length);
gandiva::gdv_sha256_hash(ctx_ptr, &value, sizeof(value), &out_length);
std::string sha256_as_str(sha_256, out_length);
EXPECT_EQ(sha256_as_str.size(), sha256_size);

Expand All @@ -91,6 +91,40 @@ TEST(TestShaHashUtils, TestSha256Numeric) {
}
}

// Verifies that gdv_md5_hash returns a 32-character digest for each 64-bit
// input and that no two of the sampled inputs produce the same digest.
TEST(TestShaHashUtils, TestMD5Numeric) {
  gandiva::ExecutionContext ctx;
  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);

  // Sample doubles, each passed through gdv_double_to_long before hashing.
  // Braced initialization evaluates the conversions left to right.
  const std::vector<uint64_t> values_to_be_hashed = {
      gandiva::gdv_double_to_long(0.0),
      gandiva::gdv_double_to_long(0.1),
      gandiva::gdv_double_to_long(0.2),
      gandiva::gdv_double_to_long(-0.10000001),
      gandiva::gdv_double_to_long(-0.0000001),
      gandiva::gdv_double_to_long(1.000000),
      gandiva::gdv_double_to_long(-0.0000002),
      gandiva::gdv_double_to_long(0.999999),
  };

  const int md5_size = 32;

  // Digests seen so far; used to check that every input hashes differently.
  std::unordered_set<std::string> seen_digests;

  for (const uint64_t value : values_to_be_hashed) {
    int out_length;
    const char* digest =
        gandiva::gdv_md5_hash(ctx_ptr, &value, sizeof(value), &out_length);
    const std::string digest_str(digest, out_length);
    EXPECT_EQ(digest_str.size(), md5_size);

    // The digest must not already be among the earlier results.
    EXPECT_EQ(seen_digests.count(digest_str), 0u);
    seen_digests.insert(digest_str);
  }
}

TEST(TestShaHashUtils, TestSha1Varlen) {
gandiva::ExecutionContext ctx;

Expand All @@ -113,14 +147,14 @@ TEST(TestShaHashUtils, TestSha1Varlen) {
const int sha1_size = 40;
int out_length;

const char* sha_1 = gandiva::gdv_hash_using_sha1(ctx_ptr, first_string.c_str(),
first_string.size(), &out_length);
const char* sha_1 = gandiva::gdv_sha1_hash(ctx_ptr, first_string.c_str(),
first_string.size(), &out_length);
std::string sha1_as_str(sha_1, out_length);
EXPECT_EQ(sha1_as_str.size(), sha1_size);
EXPECT_EQ(sha1_as_str, expected_first_result);

const char* sha_2 = gandiva::gdv_hash_using_sha1(ctx_ptr, second_string.c_str(),
second_string.size(), &out_length);
const char* sha_2 = gandiva::gdv_sha1_hash(ctx_ptr, second_string.c_str(),
second_string.size(), &out_length);
std::string sha2_as_str(sha_2, out_length);
EXPECT_EQ(sha2_as_str.size(), sha1_size);
EXPECT_EQ(sha2_as_str, expected_second_result);
Expand Down Expand Up @@ -150,15 +184,49 @@ TEST(TestShaHashUtils, TestSha256Varlen) {
const int sha256_size = 64;
int out_length;

const char* sha_1 = gandiva::gdv_hash_using_sha256(ctx_ptr, first_string.c_str(),
first_string.size(), &out_length);
const char* sha_1 = gandiva::gdv_sha256_hash(ctx_ptr, first_string.c_str(),
first_string.size(), &out_length);
std::string sha1_as_str(sha_1, out_length);
EXPECT_EQ(sha1_as_str.size(), sha256_size);
EXPECT_EQ(sha1_as_str, expected_first_result);

const char* sha_2 = gandiva::gdv_hash_using_sha256(ctx_ptr, second_string.c_str(),
second_string.size(), &out_length);
const char* sha_2 = gandiva::gdv_sha256_hash(ctx_ptr, second_string.c_str(),
second_string.size(), &out_length);
std::string sha2_as_str(sha_2, out_length);
EXPECT_EQ(sha2_as_str.size(), sha256_size);
EXPECT_EQ(sha2_as_str, expected_second_result);
}

// Verifies gdv_md5_hash on variable-length input by comparing the digests of
// two strings containing non-ASCII characters against known values.
TEST(TestShaHashUtils, TestMD5Varlen) {
  gandiva::ExecutionContext ctx;
  const auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);

  const std::string first_string =
      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃnY [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]";

  const std::string second_string =
      "ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeınY [ˈʏpsilɔn], "
      "Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";

  // The expected hashes were obtained from a shell by running:
  //   echo -n <output-string> | openssl dgst md5
  const std::string expected_first_result = "a633460644425b44e0e023d6980849cc";
  const std::string expected_second_result = "407983529dba21e95d95951ccffd30c3";

  const int md5_size = 32;

  // Hashes `input` and copies the returned buffer into an owning string.
  const auto md5_of = [ctx_ptr](const std::string& input) {
    int out_length;
    const char* digest =
        gandiva::gdv_md5_hash(ctx_ptr, input.c_str(), input.size(), &out_length);
    return std::string(digest, out_length);
  };

  const std::string first_digest = md5_of(first_string);
  EXPECT_EQ(first_digest.size(), md5_size);
  EXPECT_EQ(first_digest, expected_first_result);

  const std::string second_digest = md5_of(second_string);
  EXPECT_EQ(second_digest.size(), md5_size);
  EXPECT_EQ(second_digest, expected_second_result);
}
Loading