Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions datafusion/spark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ name = "datafusion_spark"

[dependencies]
arrow = { workspace = true }
bigdecimal = { workspace = true }
chrono = { workspace = true }
crc32fast = "1.4"
datafusion-catalog = { workspace = true }
Expand All @@ -47,6 +48,10 @@ datafusion-functions = { workspace = true, features = ["crypto_expressions"] }
datafusion-macros = { workspace = true }
log = { workspace = true }
sha1 = "0.10"
# XXH3 and XXHash64 are two different algorithms.
# The xxhash-rust crate only implements XXH3, whose output differs from Spark's XXHash64.
# Therefore, twox-hash is used to match Spark's results.
twox-hash = "2.1.1"
url = { workspace = true }
xxhash-rust = { version = "0.8", features = ["xxh3"] }

Expand Down
11 changes: 9 additions & 2 deletions datafusion/spark/src/function/hash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@
// under the License.

pub mod crc32;
pub mod murmur3;
pub mod sha1;
pub mod sha2;
pub mod utils;
pub mod xxhash64;

use datafusion_expr::ScalarUDF;
use datafusion_functions::make_udf_function;
Expand All @@ -26,16 +29,20 @@ use std::sync::Arc;
make_udf_function!(crc32::SparkCrc32, crc32);
make_udf_function!(sha1::SparkSha1, sha1);
make_udf_function!(sha2::SparkSha2, sha2);
make_udf_function!(xxhash64::SparkXxHash64, xxhash64);
make_udf_function!(murmur3::SparkMurmur3Hash, hash);

pub mod expr_fn {
use datafusion_functions::export_functions;
export_functions!(
(crc32, "crc32(expr) - Returns a cyclic redundancy check value of the expr as a bigint.", arg1),
(sha1, "sha1(expr) - Returns a SHA-1 hash value of the expr as a hex string.", arg1),
(sha2, "sha2(expr, bitLength) - Returns a checksum of SHA-2 family as a hex string of expr. SHA-224, SHA-256, SHA-384, and SHA-512 are supported. Bit length of 0 is equivalent to 256.", arg1 arg2)
(sha2, "sha2(expr, bitLength) - Returns a checksum of SHA-2 family as a hex string of expr. SHA-224, SHA-256, SHA-384, and SHA-512 are supported. Bit length of 0 is equivalent to 256.", arg1 arg2),
(xxhash64, "xxhash64(*expr) - Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm, and returns the result as a long column. The hash computation uses an initial seed of 42.", args),
(hash, "hash(*expr) - Calculates the hash code of given columns using the 32-bit variant of the MurmurHash3 algorithm, and returns the result as an integer column. The hash computation uses an initial seed of 42.", args)
);
}

pub fn functions() -> Vec<Arc<ScalarUDF>> {
vec![crc32(), sha1(), sha2()]
vec![crc32(), sha1(), sha2(), xxhash64(), hash()]
}
151 changes: 151 additions & 0 deletions datafusion/spark/src/function/hash/murmur3.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrayRef, ArrowNativeTypeOp, PrimitiveArray};
use arrow::datatypes::{DataType, Int32Type};
use datafusion_common::Result;
use datafusion_expr::{
ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
};
use datafusion_functions::utils::make_scalar_function;

use std::any::Any;
use std::sync::Arc;

use crate::function::hash::utils::SparkHasher;

/// Spark-compatible `hash` scalar UDF using the 32-bit MurmurHash3 algorithm.
///
/// <https://spark.apache.org/docs/latest/api/sql/index.html#hash>
#[derive(Debug, Hash, Eq, PartialEq)]
pub struct SparkMurmur3Hash {
    // Accepts any number of arguments of any type (variadic, immutable).
    signature: Signature,
    // Initial hash seed; `new()` fixes this to 42 to match Spark.
    seed: i64,
}

impl Default for SparkMurmur3Hash {
fn default() -> Self {
Self::new()
}
}

impl SparkMurmur3Hash {
pub fn new() -> Self {
Self {
signature: Signature::variadic_any(Volatility::Immutable),
seed: 42,
}
}
}

impl ScalarUDFImpl for SparkMurmur3Hash {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "hash"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Spark's `hash` always yields a 32-bit integer column.
    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Int32)
    }

    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        // Copy the seed out of `self` so the closure does not borrow it.
        let seed = self.seed as i32;
        let hash_fn = move |arrays: &[ArrayRef]| {
            // Every row starts from the seed; each column is folded in turn.
            let mut hashes = vec![seed; arrays[0].len()];
            Murmur3Hasher::hash_arrays(arrays, &mut hashes)?;
            let out: ArrayRef = Arc::new(PrimitiveArray::<Int32Type>::from(hashes));
            Ok(out)
        };
        make_scalar_function(hash_fn, vec![])(&args.args)
    }
}

// The following code is adapted from
// <https://github.com/apache/datafusion-comet/blob/main/native/spark-expr/src/hash_funcs/murmur3.rs>
/// Computes the 32-bit Murmur3 hash of `data` with the given `seed`.
///
/// This matches Spark's `Murmur3_x86_32`: words are read in native byte
/// order, and — unlike reference MurmurHash3 — each trailing byte is
/// sign-extended and mixed as a full word rather than packed into one.
#[inline]
pub fn murmur3_hash<T: AsRef<[u8]>>(data: T, seed: u32) -> u32 {
    // Scramble one input word.
    #[inline]
    fn mix_k1(mut k1: i32) -> i32 {
        k1 = k1.wrapping_mul(0xcc9e2d51u32 as i32);
        k1 = k1.rotate_left(15);
        k1 = k1.wrapping_mul(0x1b873593u32 as i32);
        k1
    }

    // Fold a scrambled word into the running hash state.
    #[inline]
    fn mix_h1(mut h1: i32, k1: i32) -> i32 {
        h1 ^= k1;
        h1 = h1.rotate_left(13);
        h1 = h1.wrapping_mul(5).wrapping_add(0xe6546b64u32 as i32);
        h1
    }

    // Final avalanche: diffuse every input bit across the whole result.
    #[inline]
    fn fmix(mut h1: i32, len: i32) -> i32 {
        h1 ^= len;
        h1 ^= (h1 as u32 >> 16) as i32;
        h1 = h1.wrapping_mul(0x85ebca6bu32 as i32);
        h1 ^= (h1 as u32 >> 13) as i32;
        h1 = h1.wrapping_mul(0xc2b2ae35u32 as i32);
        h1 ^= (h1 as u32 >> 16) as i32;
        h1
    }

    let data = data.as_ref();
    let mut h1 = seed as i32;

    // Process the 4-byte-aligned prefix one native-endian word at a time.
    // `chunks_exact` guarantees 4-byte chunks, so the compiler elides the
    // per-iteration bounds checks the previous unsafe pointer code avoided.
    let mut words = data.chunks_exact(4);
    for word in &mut words {
        // Each chunk is exactly 4 bytes, so the conversion cannot fail.
        let w = i32::from_ne_bytes(word.try_into().unwrap());
        h1 = mix_h1(h1, mix_k1(w));
    }

    // Spark's tail handling: mix each leftover byte, sign-extended, as a
    // full word (this is where Spark diverges from reference MurmurHash3).
    for &byte in words.remainder() {
        h1 = mix_h1(h1, mix_k1(byte as i8 as i32));
    }

    fmix(h1, data.len() as i32) as u32
}

/// Zero-sized adapter that plugs [`murmur3_hash`] into the shared
/// `SparkHasher` driver used by the hash UDFs in this module.
struct Murmur3Hasher;

impl SparkHasher<i32> for Murmur3Hasher {
    fn oneshot(seed: i32, data: &[u8]) -> i32 {
        // The hash core works on unsigned words; reinterpret at the edges.
        let hashed = murmur3_hash(data, seed as u32);
        hashed as i32
    }
}
Loading