Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions be/src/util/hash_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,17 @@ class HashUtil {
return out;
}

// Computes a 64-bit MurmurHash3 value for the `len` bytes starting at `key`, using `seed`.
// The implementation is chosen at compile time by the template flag:
//   is_mmh64_v2 == false -> murmur_hash3_x64_64
//   is_mmh64_v2 == true  -> murmur_hash3_x64_64_shared
// Returns the 64-bit word the selected implementation writes to its output pointer.
template <bool is_mmh64_v2>
static uint64_t murmur_hash3_64(const void* key, int64_t len, uint64_t seed) {
    uint64_t digest = 0;
    if constexpr (!is_mmh64_v2) {
        murmur_hash3_x64_64(key, len, seed, &digest);
    } else {
        murmur_hash3_x64_64_shared(key, len, seed, &digest);
    }
    return digest;
}

static const int MURMUR_R = 47;

// Murmur2 hash implementation returning 64-bit hashes.
Expand Down
34 changes: 30 additions & 4 deletions be/src/util/murmur_hash3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,13 +315,11 @@ void murmur_hash3_x86_128(const void* key, const int len, uint32_t seed, void* o

//-----------------------------------------------------------------------------

void murmur_hash3_x64_128(const void* key, const int len, const uint32_t seed, void* out) {
// Helper function that implements the core MurmurHash3 128-bit hashing algorithm
void murmur_hash3_x64_process(const void* key, const int len, uint64_t& h1, uint64_t& h2) {
const uint8_t* data = (const uint8_t*)key;
const int nblocks = len / 16;

uint64_t h1 = seed;
uint64_t h2 = seed;

const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

Expand Down Expand Up @@ -430,11 +428,39 @@ void murmur_hash3_x64_128(const void* key, const int len, const uint32_t seed, v

h1 += h2;
h2 += h1;
}

//-----------------------------------------------------------------------------

// `murmur_hash3_x64_128` originally comes from the reference MurmurHash3 implementation:
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
// Doris moved its core into `murmur_hash3_x64_process`. This wrapper is retained so the
// upstream entry point stays available, even though it has no callers.
//
// Initialises both 64-bit state words with `seed`, hashes `len` bytes at `key`, and stores
// the two resulting words in `out[0]` and `out[1]` (interpreting `out` as uint64_t[2]).
void murmur_hash3_x64_128(const void* key, const int len, const uint32_t seed, void* out) {
    uint64_t state_lo = seed;
    uint64_t state_hi = seed;
    murmur_hash3_x64_process(key, len, state_lo, state_hi);

    auto* words = static_cast<uint64_t*>(out);
    words[0] = state_lo;
    words[1] = state_hi;
}

//-----------------------------------------------------------------------------

// 64-bit MurmurHash3 (x64) built on the shared 128-bit routine `murmur_hash3_x64_process`.
// Both state words start from `seed`; only the first resulting word is written to `out`
// (interpreted as a single uint64_t). Used for function mmh3_64_v2.
//
// NOTE(review): `len` is narrowed from int64_t to int because the shared routine takes an
// `int` length; inputs longer than INT_MAX would not be hashed over their full length.
// Confirm callers never pass such lengths.
void murmur_hash3_x64_64_shared(const void* key, const int64_t len, const uint64_t seed,
                                void* out) {
    uint64_t first_word = seed;
    uint64_t second_word = seed;
    const int narrowed_len = static_cast<int>(len);
    murmur_hash3_x64_process(key, narrowed_len, first_word, second_word);
    *static_cast<uint64_t*>(out) = first_word;
}

//-----------------------------------------------------------------------------

// MurmurHash3 x64 64-bit variant with optimized standalone implementation
// This implementation is specifically optimized for 64-bit output
// Used for function mmh3_64
void murmur_hash3_x64_64(const void* key, const int64_t len, const uint64_t seed, void* out) {
const uint8_t* data = (const uint8_t*)key;
const int nblocks = (int)len / 8;
Expand Down
4 changes: 4 additions & 0 deletions be/src/util/murmur_hash3.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,12 @@ void murmur_hash3_x86_32(const void* key, int64_t len, uint32_t seed, void* out)

void murmur_hash3_x86_128(const void* key, int len, uint32_t seed, void* out);

// Core of the x64 128-bit MurmurHash3: processes `len` bytes at `key`.
// `h1` and `h2` are in/out state words: the caller initialises them (the wrappers in
// murmur_hash3.cpp set both to the seed) and reads the two 64-bit result words back.
void murmur_hash3_x64_process(const void* key, const int len, uint64_t& h1, uint64_t& h2);

// 128-bit MurmurHash3 (x64). Writes the two 64-bit result words to `out[0]` and `out[1]`,
// treating `out` as uint64_t[2].
void murmur_hash3_x64_128(const void* key, int len, uint32_t seed, void* out);

// 64-bit MurmurHash3 built on murmur_hash3_x64_process; writes only the first result word
// to `out` (treated as uint64_t*). Used for mmh3_64_v2.
void murmur_hash3_x64_64_shared(const void* key, const int64_t len, const uint64_t seed, void* out);

// Standalone 64-bit MurmurHash3 variant (used for mmh3_64 per its comment in
// murmur_hash3.cpp). Result is written through `out`.
void murmur_hash3_x64_64(const void* key, int64_t len, uint64_t seed, void* out);

} // namespace doris
41 changes: 34 additions & 7 deletions be/src/vec/functions/function_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,18 @@ namespace doris::vectorized {
#include "common/compile_check_begin.h"
constexpr uint64_t emtpy_value = 0xe28dbde7fe22e41c;

template <PrimitiveType ReturnType>
template <PrimitiveType ReturnType, bool is_mmh64_v2 = false>
struct MurmurHash3Impl {
static constexpr auto name = ReturnType == TYPE_INT ? "murmur_hash3_32" : "murmur_hash3_64";
// Returns the function-name string literal for this instantiation:
//   ReturnType == TYPE_INT            -> "murmur_hash3_32" (is_mmh64_v2 is ignored)
//   otherwise, is_mmh64_v2 == true    -> "murmur_hash3_64_v2"
//   otherwise                         -> "murmur_hash3_64"
// Resolved entirely at compile time via `if constexpr`.
static constexpr auto get_name() {
    if constexpr (ReturnType != TYPE_INT) {
        if constexpr (is_mmh64_v2) {
            return "murmur_hash3_64_v2";
        } else {
            return "murmur_hash3_64";
        }
    } else {
        return "murmur_hash3_32";
    }
}
static constexpr auto name = get_name();

static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn);
Expand Down Expand Up @@ -87,9 +96,9 @@ struct MurmurHash3Impl {
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, col_to_data[i]);
} else {
murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, col_to_data[i],
col_to_data.data() + i);
col_to_data[i] = HashUtil::murmur_hash3_64<is_mmh64_v2>(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, col_to_data[i]);
}
current_offset = offsets[i];
}
Expand All @@ -101,8 +110,8 @@ struct MurmurHash3Impl {
col_to_data[i] =
HashUtil::murmur_hash3_32(value.data(), value.size(), col_to_data[i]);
} else {
murmur_hash3_x64_64(value.data(), value.size(), col_to_data[i],
col_to_data.data() + i);
col_to_data[i] = HashUtil::murmur_hash3_64<is_mmh64_v2>(
value.data(), value.size(), col_to_data[i]);
}
}
} else {
Expand All @@ -118,6 +127,22 @@ using FunctionMurmurHash3_32 =
FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl<TYPE_INT>>;
using FunctionMurmurHash3_64 =
FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<TYPE_BIGINT>>;
using FunctionMurmurHash3_64_V2 =
FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<TYPE_BIGINT, true>>;

// Test-only hooks (compiled only when BE_TEST is defined). MurmurHash3Impl is defined in
// this translation unit, so these free functions expose the result of get_name() for the
// three instantiations that unit tests check.
#ifdef BE_TEST
// Name reported by MurmurHash3Impl<TYPE_INT>.
const char* murmur_hash3_get_name_type_int_for_test() {
return MurmurHash3Impl<TYPE_INT>::get_name();
}

// Name reported by MurmurHash3Impl<TYPE_BIGINT> (is_mmh64_v2 defaults to false).
const char* murmur_hash3_get_name_type_bigint_for_test() {
return MurmurHash3Impl<TYPE_BIGINT>::get_name();
}

// Name reported by MurmurHash3Impl<TYPE_BIGINT, true> (the v2 variant).
const char* murmur_hash3_get_name_type_bigint_v2_for_test() {
return MurmurHash3Impl<TYPE_BIGINT, true>::get_name();
}
#endif

template <PrimitiveType ReturnType>
struct XxHashImpl {
Expand Down Expand Up @@ -204,7 +229,9 @@ using FunctionXxHash_64 = FunctionVariadicArgumentsBase<DataTypeInt64, XxHashImp
// Registers the hash functions defined in this file with `factory`:
// the MurmurHash3 variants (32-bit, 64-bit, 64-bit v2) and the xxHash variants (32/64-bit).
void register_function_hash(SimpleFunctionFactory& factory) {
factory.register_function<FunctionMurmurHash3_32>();
factory.register_function<FunctionMurmurHash3_64>();
// v2 of the 64-bit MurmurHash3 (MurmurHash3Impl<TYPE_BIGINT, true>).
factory.register_function<FunctionMurmurHash3_64_V2>();
factory.register_function<FunctionXxHash_32>();
factory.register_function<FunctionXxHash_64>();
// NOTE(review): presumably makes "xxhash3_64" an alternate name for "xxhash_64" —
// confirm the argument order against SimpleFunctionFactory::register_alias.
factory.register_alias("xxhash_64", "xxhash3_64");
}
} // namespace doris::vectorized
10 changes: 9 additions & 1 deletion be/src/vec/functions/function_hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,12 @@

#include "vec/core/types.h"

namespace doris::vectorized {} // namespace doris::vectorized
namespace doris::vectorized {

// Test-only accessors, declared only when BE_TEST is defined. Each returns the name string
// produced by MurmurHash3Impl<...>::get_name() for one instantiation (TYPE_INT,
// TYPE_BIGINT, and TYPE_BIGINT with the v2 flag); definitions live in function_hash.cpp.
#ifdef BE_TEST
const char* murmur_hash3_get_name_type_int_for_test();
const char* murmur_hash3_get_name_type_bigint_for_test();
const char* murmur_hash3_get_name_type_bigint_v2_for_test();
#endif

} // namespace doris::vectorized
63 changes: 63 additions & 0 deletions be/test/vec/function/function_hash_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

#include "vec/functions/function_hash.h"

#include <stdint.h>

#include <string>
Expand All @@ -24,6 +26,7 @@
#include "function_test_util.h"
#include "gtest/gtest_pred_impl.h"
#include "testutil/any_type.h"
#include "util/murmur_hash3.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h"
Expand Down Expand Up @@ -96,6 +99,26 @@ TEST(HashFunctionTest, murmur_hash_3_64_test) {
};
}

// Checks murmur_hash3_64_v2 on single VARCHAR inputs against fixed expected BIGINT values.
TEST(HashFunctionTest, murmur_hash_3_64_v2_test) {
    std::string func_name = "murmur_hash3_64_v2";
    InputTypeSet arg_types = {PrimitiveType::TYPE_VARCHAR};

    // Each row: {input string} -> expected 64-bit hash.
    DataSet cases = {
            {{std::string("1000209601_1756808272")}, static_cast<int64_t>(4038800892574899471)},
            {{std::string("hello world")}, static_cast<int64_t>(5998619086395760910)},
            {{std::string("apache doris")}, static_cast<int64_t>(3669213779466221743)}};

    // The returned status is intentionally discarded, as in the other tests in this file.
    static_cast<void>(check_function<DataTypeInt64, true>(func_name, arg_types, cases));
}

// Verifies the name strings returned by the BE_TEST accessors for the three MurmurHash3Impl
// instantiations: 32-bit, 64-bit, and 64-bit v2.
TEST(HashFunctionTest, murmur_hash_get_name_test) {
EXPECT_STREQ(murmur_hash3_get_name_type_int_for_test(), "murmur_hash3_32");
EXPECT_STREQ(murmur_hash3_get_name_type_bigint_for_test(), "murmur_hash3_64");
EXPECT_STREQ(murmur_hash3_get_name_type_bigint_v2_for_test(), "murmur_hash3_64_v2");
}

TEST(HashFunctionTest, xxhash_32_test) {
std::string func_name = "xxhash_32";

Expand Down Expand Up @@ -220,4 +243,44 @@ TEST(HashFunctionTest, xxhash_64_test) {
};
}

// Calls the low-level MurmurHash3 x64 helpers directly (bypassing the SQL function layer)
// and pins their output for fixed inputs with seed 0.
//
// Every expectation uses EXPECT_EQ so that a failure reports which value mismatched and
// what it actually was. Length arguments are cast explicitly to the parameter type of each
// helper (`int` for the 128-bit helpers, `int64_t` for the 64-bit ones).
TEST(HashFunctionTest, murmur_hash3_helper_functions_test) {
    const std::string hello_world = "hello world";
    // Expected 128-bit result for "hello world" with seed 0, as two 64-bit words.
    constexpr uint64_t kHelloWorldH1 = 5998619086395760910ULL;
    constexpr uint64_t kHelloWorldH2 = 12364428806279881649ULL;

    // Shared core routine: h1/h2 are in/out and start at the seed (0).
    {
        uint64_t h1 = 0;
        uint64_t h2 = 0;
        murmur_hash3_x64_process(hello_world.data(), static_cast<int>(hello_world.size()), h1,
                                 h2);
        EXPECT_EQ(h1, kHelloWorldH1);
        EXPECT_EQ(h2, kHelloWorldH2);
    }

    // 128-bit wrapper must produce the same two words as the core routine.
    {
        uint64_t out[2] = {0, 0};
        murmur_hash3_x64_128(hello_world.data(), static_cast<int>(hello_world.size()), 0, out);
        EXPECT_EQ(out[0], kHelloWorldH1);
        EXPECT_EQ(out[1], kHelloWorldH2);
    }

    // 64-bit "shared" variant returns only the first word of the 128-bit result.
    {
        uint64_t out = 0;
        murmur_hash3_x64_64_shared(hello_world.data(),
                                   static_cast<int64_t>(hello_world.size()), 0, &out);
        EXPECT_EQ(out, kHelloWorldH1);
    }

    // Standalone 64-bit variant; the expected value is the signed result reinterpreted
    // as unsigned.
    {
        const std::string hello = "hello";
        uint64_t out = 0;
        murmur_hash3_x64_64(hello.data(), static_cast<int64_t>(hello.size()), 0, &out);
        EXPECT_EQ(out, static_cast<uint64_t>(-3215607508166160593LL));
    }

    // Empty input with zero state leaves both words at zero.
    {
        const std::string empty_input;
        uint64_t h1 = 0;
        uint64_t h2 = 0;
        murmur_hash3_x64_process(empty_input.data(), static_cast<int>(empty_input.size()), h1,
                                 h2);
        EXPECT_EQ(h1, 0ULL);
        EXPECT_EQ(h2, 0ULL);
    }
}

} // namespace doris::vectorized
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364V2;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative;
import org.apache.doris.nereids.trees.expressions.functions.scalar.NextDay;
import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch;
Expand Down Expand Up @@ -857,6 +858,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(MultiSearchAllPositions.class, "multi_search_all_positions"),
scalar(MurmurHash332.class, "murmur_hash3_32"),
scalar(MurmurHash364.class, "murmur_hash3_64"),
scalar(MurmurHash364V2.class, "murmur_hash3_64_v2"),
scalar(Negative.class, "negative"),
scalar(NextDay.class, "next_day"),
scalar(NonNullable.class, "non_nullable"),
Expand Down Expand Up @@ -1032,7 +1034,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(WeeksSub.class, "weeks_sub"),
scalar(WidthBucket.class, "width_bucket"),
scalar(XxHash32.class, "xxhash_32"),
scalar(XxHash64.class, "xxhash_64"),
scalar(XxHash64.class, "xxhash_64", "xxhash3_64"),
scalar(Xor.class, "xor"),
scalar(XpathString.class, "xpath_string"),
scalar(Year.class, "year"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.BigIntType;
import org.apache.doris.nereids.types.StringType;
import org.apache.doris.nereids.types.VarcharType;
import org.apache.doris.nereids.util.ExpressionUtils;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
* ScalarFunction 'murmur_hash3_64_v2'. This class is generated by GenerateFunction.
*
* <p>Accepts one or more arguments and returns BIGINT. Two variadic signatures are declared:
* one over VARCHAR and one over STRING.
*/
public class MurmurHash364V2 extends ScalarFunction
implements ExplicitlyCastableSignature, PropagateNullable {
// Both signatures return BIGINT; they differ only in the variadic argument type.
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
FunctionSignature.ret(BigIntType.INSTANCE).varArgs(VarcharType.SYSTEM_DEFAULT),
FunctionSignature.ret(BigIntType.INSTANCE).varArgs(StringType.INSTANCE)
);

/**
* Constructor with 1 or more arguments.
*
* @param arg the first (mandatory) argument
* @param varArgs any further arguments; merged with {@code arg} into one argument list
*/
public MurmurHash364V2(Expression arg, Expression... varArgs) {
super("murmur_hash3_64_v2", ExpressionUtils.mergeArguments(arg, varArgs));
}

/** Constructor used by {@link #withChildren(List)}; forwards the given params to the superclass. */
private MurmurHash364V2(ScalarFunctionParams functionParams) {
super(functionParams);
}

/**
* Creates a new instance of this function over the given children.
*
* @param children the replacement arguments; must not be empty
* @return a new {@code MurmurHash364V2} built from {@code getFunctionParams(children)}
* @throws IllegalArgumentException if {@code children} is empty
*/
@Override
public MurmurHash364V2 withChildren(List<Expression> children) {
Preconditions.checkArgument(!children.isEmpty());
return new MurmurHash364V2(getFunctionParams(children));
}

/** Returns the static {@link #SIGNATURES} list. */
@Override
public List<FunctionSignature> getSignatures() {
return SIGNATURES;
}

/** Dispatches to {@code visitor.visitMurmurHash364V2}. */
@Override
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
return visitor.visitMurmurHash364V2(this, context);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364V2;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative;
import org.apache.doris.nereids.trees.expressions.functions.scalar.NextDay;
import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch;
Expand Down Expand Up @@ -1768,6 +1769,10 @@ default R visitMurmurHash364(MurmurHash364 murmurHash364, C context) {
return visitScalarFunction(murmurHash364, context);
}

/** Visits a {@code MurmurHash364V2} expression; the default delegates to {@code visitScalarFunction}. */
default R visitMurmurHash364V2(MurmurHash364V2 murmurHash364V2, C context) {
return visitScalarFunction(murmurHash364V2, context);
}

/** Visits an {@code XxHash32} expression; the default delegates to {@code visitScalarFunction}. */
default R visitXxHash32(XxHash32 xxHash32, C context) {
return visitScalarFunction(xxHash32, context);
}
Expand Down
Loading
Loading