diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index fcd4db5bead775..0cb85d1b41b716 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -140,6 +140,17 @@ class HashUtil { return out; } + template + static uint64_t murmur_hash3_64(const void* key, int64_t len, uint64_t seed) { + uint64_t out = 0; + if constexpr (is_mmh64_v2) { + murmur_hash3_x64_64_shared(key, len, seed, &out); + } else { + murmur_hash3_x64_64(key, len, seed, &out); + } + return out; + } + static const int MURMUR_R = 47; // Murmur2 hash implementation returning 64-bit hashes. diff --git a/be/src/util/murmur_hash3.cpp b/be/src/util/murmur_hash3.cpp index 96568d6978e225..13f9d8e15cedc4 100644 --- a/be/src/util/murmur_hash3.cpp +++ b/be/src/util/murmur_hash3.cpp @@ -315,13 +315,11 @@ void murmur_hash3_x86_128(const void* key, const int len, uint32_t seed, void* o //----------------------------------------------------------------------------- -void murmur_hash3_x64_128(const void* key, const int len, const uint32_t seed, void* out) { +// Helper function that implements the core MurmurHash3 128-bit hashing algorithm +void murmur_hash3_x64_process(const void* key, const int len, uint64_t& h1, uint64_t& h2) { const uint8_t* data = (const uint8_t*)key; const int nblocks = len / 16; - uint64_t h1 = seed; - uint64_t h2 = seed; - const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); @@ -430,12 +428,40 @@ void murmur_hash3_x64_128(const void* key, const int len, const uint32_t seed, v h1 += h2; h2 += h1; +} +//----------------------------------------------------------------------------- + +// The origin function `murmur_hash3_x64_128` is copied from: https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +// And Doris modified it into function `murmur_hash3_x64_process` +// For this reason, this function is still retained even though it has no calls. +void murmur_hash3_x64_128(const void* key, const int len, const uint32_t seed, void* out) { + uint64_t h1 = seed; + uint64_t h2 = seed; + murmur_hash3_x64_process(key, len, h1, h2); ((uint64_t*)out)[0] = h1; ((uint64_t*)out)[1] = h2; } -void murmur_hash3_x64_64(const void* key, const int len, const uint64_t seed, void* out) { +//----------------------------------------------------------------------------- + +// MurmurHash3 x64 64-bit variant using shared 128-bit processing function +// This implementation reuses the murmur_hash3_x64_process function and only outputs the first hash value +// Used for function mmh3_64_v2 +void murmur_hash3_x64_64_shared(const void* key, const int64_t len, const uint64_t seed, + void* out) { + uint64_t h1 = seed; + uint64_t h2 = seed; + murmur_hash3_x64_process(key, static_cast(len), h1, h2); + ((uint64_t*)out)[0] = h1; +} + +//----------------------------------------------------------------------------- + +// MurmurHash3 x64 64-bit variant with optimized standalone implementation +// This implementation is specifically optimized for 64-bit output +// Used for function mmh3_64 +void murmur_hash3_x64_64(const void* key, const int64_t len, const uint64_t seed, void* out) { const uint8_t* data = (const uint8_t*)key; const int nblocks = len / 8; uint64_t h1 = seed; diff --git a/be/src/util/murmur_hash3.h b/be/src/util/murmur_hash3.h index c8e8964bf6a20e..992e71eb4da99a 100644 --- a/be/src/util/murmur_hash3.h +++ b/be/src/util/murmur_hash3.h @@ -29,8 +29,12 @@ void murmur_hash3_x86_32(const void* key, int len, uint32_t seed, void* out); void murmur_hash3_x86_128(const void* key, int len, uint32_t seed, void* out); +void murmur_hash3_x64_process(const void* key, const int len, uint64_t& h1, uint64_t& h2); + void murmur_hash3_x64_128(const void* key, int len, uint32_t seed, void* out); -void murmur_hash3_x64_64(const void* key, int len, uint64_t seed, void* out); +void murmur_hash3_x64_64_shared(const void* key, const int64_t len, const uint64_t seed, void* out); + +void murmur_hash3_x64_64(const void* key, int64_t len, uint64_t seed, void* out); //----------------------------------------------------------------------------- diff --git a/be/src/vec/functions/function_hash.cpp b/be/src/vec/functions/function_hash.cpp index 972d2eb0b9d8a1..992de998fd0294 100644 --- a/be/src/vec/functions/function_hash.cpp +++ b/be/src/vec/functions/function_hash.cpp @@ -40,10 +40,18 @@ namespace doris::vectorized { constexpr uint64_t emtpy_value = 0xe28dbde7fe22e41c; -template +template struct MurmurHash3Impl { - static constexpr auto name = - std::is_same_v ? "murmur_hash3_32" : "murmur_hash3_64"; + static constexpr auto get_name() { + if constexpr (ReturnType == TYPE_INT) { + return "murmur_hash3_32"; + } else if constexpr (is_mmh64_v2) { + return "murmur_hash3_64_v2"; + } else { + return "murmur_hash3_64"; + } + } + static constexpr auto name = get_name(); static Status empty_apply(IColumn& icolumn, size_t input_rows_count) { ColumnVector& vec_to = assert_cast&>(icolumn); @@ -85,9 +93,9 @@ struct MurmurHash3Impl { reinterpret_cast(&data[current_offset]), offsets[i] - current_offset, col_to_data[i]); } else { - murmur_hash3_x64_64(reinterpret_cast(&data[current_offset]), - offsets[i] - current_offset, col_to_data[i], - col_to_data.data() + i); + col_to_data[i] = HashUtil::murmur_hash3_64( + reinterpret_cast(&data[current_offset]), + offsets[i] - current_offset, col_to_data[i]); } current_offset = offsets[i]; } @@ -99,8 +107,8 @@ struct MurmurHash3Impl { col_to_data[i] = HashUtil::murmur_hash3_32(value.data(), value.size(), col_to_data[i]); } else { - murmur_hash3_x64_64(value.data(), value.size(), col_to_data[i], - col_to_data.data() + i); + col_to_data[i] = HashUtil::murmur_hash3_64( + value.data(), value.size(), col_to_data[i]); } } } else { @@ -112,8 +120,26 @@ struct MurmurHash3Impl { } }; -using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase>; -using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase>; +using FunctionMurmurHash3_32 = + FunctionVariadicArgumentsBase>; +using FunctionMurmurHash3_64 = + FunctionVariadicArgumentsBase>; +using FunctionMurmurHash3_64_V2 = + FunctionVariadicArgumentsBase>; + +#ifdef BE_TEST +const char* murmur_hash3_get_name_type_int_for_test() { + return MurmurHash3Impl::get_name(); +} + +const char* murmur_hash3_get_name_type_bigint_for_test() { + return MurmurHash3Impl::get_name(); +} + +const char* murmur_hash3_get_name_type_bigint_v2_for_test() { + return MurmurHash3Impl::get_name(); +} +#endif template struct XxHashImpl { @@ -187,7 +213,9 @@ using FunctionXxHash_64 = FunctionVariadicArgumentsBase(); factory.register_function(); + factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_alias("xxhash_64", "xxhash3_64"); } } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/functions/function_hash.h b/be/src/vec/functions/function_hash.h index 0516dd85c4c49e..7da4a75dc3a54e 100644 --- a/be/src/vec/functions/function_hash.h +++ b/be/src/vec/functions/function_hash.h @@ -22,4 +22,12 @@ #include "vec/core/types.h" -namespace doris::vectorized {} // namespace doris::vectorized +namespace doris::vectorized { + +#ifdef BE_TEST +const char* murmur_hash3_get_name_type_int_for_test(); +const char* murmur_hash3_get_name_type_bigint_for_test(); +const char* murmur_hash3_get_name_type_bigint_v2_for_test(); +#endif + +} // namespace doris::vectorized diff --git a/be/test/vec/function/function_hash_test.cpp b/be/test/vec/function/function_hash_test.cpp index 4d2cf6be4b4069..aef82682ae6e42 100644 --- a/be/test/vec/function/function_hash_test.cpp +++ b/be/test/vec/function/function_hash_test.cpp @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include "vec/functions/function_hash.h" + #include #include @@ -24,6 +26,7 @@ #include "function_test_util.h" #include "gtest/gtest_pred_impl.h" #include "testutil/any_type.h" +#include "util/murmur_hash3.h" #include "vec/core/types.h" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" @@ -94,6 +97,26 @@ TEST(HashFunctionTest, murmur_hash_3_64_test) { }; } +TEST(HashFunctionTest, murmur_hash_3_64_v2_test) { + std::string func_name = "murmur_hash3_64_v2"; + + { + InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; + + DataSet data_set = {{{std::string("1000209601_1756808272")}, (int64_t)4038800892574899471}, + {{std::string("hello world")}, (int64_t)5998619086395760910}, + {{std::string("apache doris")}, (int64_t)3669213779466221743}}; + + static_cast(check_function(func_name, input_types, data_set)); + }; +} + +TEST(HashFunctionTest, murmur_hash_get_name_test) { + EXPECT_STREQ(murmur_hash3_get_name_type_int_for_test(), "murmur_hash3_32"); + EXPECT_STREQ(murmur_hash3_get_name_type_bigint_for_test(), "murmur_hash3_64"); + EXPECT_STREQ(murmur_hash3_get_name_type_bigint_v2_for_test(), "murmur_hash3_64_v2"); +} + TEST(HashFunctionTest, xxhash_32_test) { std::string func_name = "xxhash_32"; @@ -158,4 +181,44 @@ TEST(HashFunctionTest, xxhash_64_test) { }; } +TEST(HashFunctionTest, murmur_hash3_helper_functions_test) { + { + std::string input = "hello world"; + uint64_t h1 = 0; + uint64_t h2 = 0; + murmur_hash3_x64_process(input.data(), input.size(), h1, h2); + EXPECT_EQ(h1, 5998619086395760910ULL); + EXPECT_EQ(h2, 12364428806279881649ULL); + } + + { + std::string input = "hello world"; + uint64_t out[2] = {0, 0}; + murmur_hash3_x64_128(input.data(), input.size(), 0, out); + EXPECT_TRUE(out[0] == 5998619086395760910ULL && out[1] == 12364428806279881649ULL); + } + + { + std::string input = "hello world"; + uint64_t out = 0; + murmur_hash3_x64_64_shared(input.data(), input.size(), 0, &out); + EXPECT_EQ(out, 5998619086395760910ULL); + } + + { + std::string input = "hello"; + uint64_t out = 0; + murmur_hash3_x64_64(input.data(), input.size(), 0, &out); + EXPECT_EQ(out, static_cast(-3215607508166160593LL)); + } + + { + std::string input = ""; + uint64_t h1 = 0, h2 = 0; + murmur_hash3_x64_process(input.data(), input.size(), h1, h2); + EXPECT_EQ(h1, 0ULL); + EXPECT_EQ(h2, 0ULL); + } +} + } // namespace doris::vectorized diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 3d9e12398f20b4..91ffd73668e1aa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -319,6 +319,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions; import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332; import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364; +import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364V2; import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative; import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch; import org.apache.doris.nereids.trees.expressions.functions.scalar.NonNullable; @@ -799,6 +800,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(MultiSearchAllPositions.class, "multi_search_all_positions"), scalar(MurmurHash332.class, "murmur_hash3_32"), scalar(MurmurHash364.class, "murmur_hash3_64"), + scalar(MurmurHash364V2.class, "murmur_hash3_64_v2"), scalar(Negative.class, "negative"), scalar(NonNullable.class, "non_nullable"), scalar(NormalCdf.class, "normal_cdf"), @@ -946,7 +948,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(WeeksSub.class, "weeks_sub"), scalar(WidthBucket.class, "width_bucket"), scalar(XxHash32.class, "xxhash_32"), - scalar(XxHash64.class, "xxhash_64"), + scalar(XxHash64.class, "xxhash_64", "xxhash3_64"), scalar(Xor.class, "xor"), scalar(Year.class, "year"), scalar(YearCeil.class, "year_ceil"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/MurmurHash364V2.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/MurmurHash364V2.java new file mode 100644 index 00000000000000..61b30bf8f30376 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/MurmurHash364V2.java @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.BigIntType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; +import org.apache.doris.nereids.util.ExpressionUtils; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'murmur_hash3_64_v2'. This class is generated by GenerateFunction. + */ +public class MurmurHash364V2 extends ScalarFunction + implements ExplicitlyCastableSignature, PropagateNullable { + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(BigIntType.INSTANCE).varArgs(VarcharType.SYSTEM_DEFAULT), + FunctionSignature.ret(BigIntType.INSTANCE).varArgs(StringType.INSTANCE) + ); + + /** + * constructor with 1 or more arguments. + */ + public MurmurHash364V2(Expression arg, Expression... varArgs) { + super("murmur_hash3_64_v2", ExpressionUtils.mergeArguments(arg, varArgs)); + } + + /** constructor for withChildren and reuse signature */ + private MurmurHash364V2(ScalarFunctionParams functionParams) { + super(functionParams); + } + + /** + * withChildren. + */ + @Override + public MurmurHash364V2 withChildren(List children) { + Preconditions.checkArgument(!children.isEmpty()); + return new MurmurHash364V2(getFunctionParams(children)); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitMurmurHash364V2(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 75c4a1669199f1..14e77163185c55 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -320,6 +320,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions; import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332; import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364; +import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364V2; import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative; import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch; import org.apache.doris.nereids.trees.expressions.functions.scalar.NormalCdf; @@ -1641,6 +1642,10 @@ default R visitMurmurHash364(MurmurHash364 murmurHash364, C context) { return visitScalarFunction(murmurHash364, context); } + default R visitMurmurHash364V2(MurmurHash364V2 murmurHash364V2, C context) { + return visitScalarFunction(murmurHash364V2, context); + } + default R visitXxHash32(XxHash32 xxHash32, C context) { return visitScalarFunction(xxHash32, context); } diff --git a/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out index 984075ddeffbc4..e4d755e029e7d0 100644 --- a/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out +++ b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out @@ -17,6 +17,18 @@ -- !sql -- 3583109472027628045 +-- !mmh3_64_v2_1 -- +\N + +-- !mmh3_64_v2_2 -- +4038800892574899471 + +-- !mmh3_64_v2_3 -- +5998619086395760910 + +-- !mmh3_64_v2_4 -- +3669213779466221743 + -- !sql -- \N diff --git a/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy index d547e9fb287d71..74acd20a9982ce 100644 --- a/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy +++ b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy @@ -26,6 +26,13 @@ suite("test_hash_function", "arrow_flight_sql") { qt_sql "SELECT murmur_hash3_64(\"hello\");" qt_sql "SELECT murmur_hash3_64(\"hello\", \"world\");" + // Keep the results same with `mmh3.hash64` in python or `murmur3.Sum64` in go + // Please dont auto genOut for this test + qt_mmh3_64_v2_1 "SELECT MURMUR_HASH3_64_V2(NULL);" + qt_mmh3_64_v2_2 "SELECT MURMUR_HASH3_64_V2('1000209601_1756808272');" + qt_mmh3_64_v2_3 "SELECT MURMUR_HASH3_64_V2('hello world');" + qt_mmh3_64_v2_4 "SELECT MURMUR_HASH3_64_V2('apache doris');" + qt_sql "SELECT xxhash_32(null);" qt_sql "SELECT xxhash_32(\"hello\");" qt_sql "SELECT xxhash_32(\"hello\", \"world\");" @@ -33,4 +40,16 @@ suite("test_hash_function", "arrow_flight_sql") { qt_sql "SELECT xxhash_64(null);" qt_sql "SELECT xxhash_64(\"hello\");" qt_sql "SELECT xxhash_64(\"hello\", \"world\");" + + def xxhash_res = sql "SELECT xxhash_64(null);" + def xxhash3_res = sql "SELECT xxhash3_64(null);" + assertEquals(xxhash_res, xxhash3_res); + + xxhash_res = sql "SELECT xxhash_64(\"hello\");" + xxhash3_res = sql "SELECT xxhash3_64(\"hello\");" + assertEquals(xxhash_res, xxhash3_res); + + xxhash_res = sql "SELECT xxhash_64(\"hello\", \"world\");" + xxhash3_res = sql "SELECT xxhash3_64(\"hello\", \"world\");" + assertEquals(xxhash_res, xxhash3_res); }