apache · comphead · Aug 6, 2025 · Aug 4, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml
@@ -38,13 +38,16 @@ name = "datafusion_spark"
 [dependencies]
 arrow = { workspace = true }
 chrono = { workspace = true }
+crc32fast = "1.4"
 datafusion-catalog = { workspace = true }
 datafusion-common = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-functions = { workspace = true, features = ["crypto_expressions"] }
 datafusion-macros = { workspace = true }
 log = { workspace = true }
+sha1 = "0.10"
+xxhash-rust = { version = "0.8", features = ["xxh3"] }
 
 [dev-dependencies]
 criterion = { workspace = true }

diff --git a/datafusion/spark/src/function/hash/crc32.rs b/datafusion/spark/src/function/hash/crc32.rs
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array};
+use arrow::datatypes::DataType;
+use crc32fast::Hasher;
+use datafusion_common::cast::{
+    as_binary_array, as_binary_view_array, as_large_binary_array,
+};
+use datafusion_common::{exec_err, internal_err, Result};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+/// Spark-compatible `crc32` scalar function.
+///
+/// Takes a single binary or string argument (strings are coerced to a binary
+/// type by `coerce_types`) and returns the CRC-32 checksum as an `Int64`.
+///
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#crc32>
+#[derive(Debug)]
+pub struct SparkCrc32 {
+    /// User-defined signature; argument validation happens in `coerce_types`.
+    signature: Signature,
+}
+
+impl Default for SparkCrc32 {
+    /// Builds the same value as [`SparkCrc32::new`].
+    fn default() -> Self {
+        SparkCrc32::new()
+    }
+}
+
+impl SparkCrc32 {
+    /// Creates the UDF with a user-defined, immutable signature; the accepted
+    /// argument types are decided later in `coerce_types`.
+    pub fn new() -> Self {
+        let signature = Signature::user_defined(Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl ScalarUDFImpl for SparkCrc32 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Name the function is exposed under.
+    fn name(&self) -> &str {
+        "crc32"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The checksum is always reported as `Int64`, whatever the input type.
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Int64)
+    }
+
+    /// Evaluates the function by delegating to [`spark_crc32`].
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let evaluate = make_scalar_function(spark_crc32, vec![]);
+        evaluate(&args.args)
+    }
+
+    /// Validates the argument list and picks the binary type to coerce to.
+    ///
+    /// Binary types are kept as-is; `Utf8`, `Utf8View` and `Null` map to
+    /// `Binary`; `LargeUtf8` maps to `LargeBinary`. Any other type, or an
+    /// argument count other than one, is an error.
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        let [arg_type] = arg_types else {
+            return exec_err!(
+                "`crc32` function requires 1 argument, got {}",
+                arg_types.len()
+            );
+        };
+        let coerced = match arg_type {
+            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
+                arg_type.clone()
+            }
+            DataType::Utf8 | DataType::Utf8View | DataType::Null => DataType::Binary,
+            DataType::LargeUtf8 => DataType::LargeBinary,
+            other => {
+                return exec_err!("`crc32` function does not support type {}", other)
+            }
+        };
+        Ok(vec![coerced])
+    }
+}
+
+/// Computes the CRC-32 checksum of `value`, widened losslessly from `u32` to `i64`.
+fn spark_crc32_digest(value: &[u8]) -> i64 {
+    let mut crc = Hasher::new();
+    crc.update(value);
+    i64::from(crc.finalize())
+}
+
+/// Checksums every non-null element of `input`; null elements stay null in
+/// the resulting `Int64Array`.
+fn spark_crc32_impl<'a>(input: impl Iterator<Item = Option<&'a [u8]>>) -> ArrayRef {
+    let checksums: Int64Array = input
+        .map(|bytes| Some(spark_crc32_digest(bytes?)))
+        .collect();
+    Arc::new(checksums)
+}
+
+/// Computes the CRC-32 checksum of each element of a single binary array.
+///
+/// Expects exactly one argument of type `Binary`, `LargeBinary` or
+/// `BinaryView` and returns an `Int64Array` with nulls preserved.
+///
+/// # Errors
+///
+/// Returns an internal error when the argument count is not one, and an
+/// execution error when the argument is not one of the supported binary types.
+fn spark_crc32(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input] = args else {
+        return internal_err!(
+            "Spark `crc32` function requires 1 argument, got {}",
+            args.len()
+        );
+    };
+
+    match input.data_type() {
+        DataType::Binary => {
+            let input = as_binary_array(input)?;
+            Ok(spark_crc32_impl(input.iter()))
+        }
+        DataType::LargeBinary => {
+            let input = as_large_binary_array(input)?;
+            Ok(spark_crc32_impl(input.iter()))
+        }
+        DataType::BinaryView => {
+            let input = as_binary_view_array(input)?;
+            Ok(spark_crc32_impl(input.iter()))
+        }
+        _ => {
+            // The message lists all three accepted types, including binary view.
+            exec_err!(
+                "Spark `crc32` function: argument must be binary, large binary or binary view, got {:?}",
+                input.data_type()
+            )
+        }
+    }
+}
diff --git a/datafusion/spark/src/function/hash/mod.rs b/datafusion/spark/src/function/hash/mod.rs
@@ -15,19 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod crc32;
+pub mod sha1;
 pub mod sha2;
 
 use datafusion_expr::ScalarUDF;
 use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
+// NOTE(review): each invocation presumably defines the zero-argument accessor
+// named by its second argument (`crc32()`, `sha1()`, `sha2()`); `functions()`
+// in this module calls those accessors.
+make_udf_function!(crc32::SparkCrc32, crc32);
+make_udf_function!(sha1::SparkSha1, sha1);
 make_udf_function!(sha2::SparkSha2, sha2);
 
+/// Items generated by `export_functions!` for the Spark hash functions
+/// `crc32`, `sha1` and `sha2`.
 pub mod expr_fn {
     use datafusion_functions::export_functions;
-    export_functions!((sha2, "sha2(expr, bitLength) - Returns a checksum of SHA-2 family as a hex string of expr. SHA-224, SHA-256, SHA-384, and SHA-512 are supported. Bit length of 0 is equivalent to 256.", arg1 arg2));
+    // Each entry appears to be (function name, doc string, argument names) —
+    // confirm against the `export_functions!` definition.
+    export_functions!(
+        (crc32, "crc32(expr) - Returns a cyclic redundancy check value of the expr as a bigint.", arg1),
+        (sha1, "sha1(expr) - Returns a SHA-1 hash value of the expr as a hex string.", arg1),
+        (sha2, "sha2(expr, bitLength) - Returns a checksum of SHA-2 family as a hex string of expr. SHA-224, SHA-256, SHA-384, and SHA-512 are supported. Bit length of 0 is equivalent to 256.", arg1 arg2)
+    );
 }
 
+/// Returns the hash UDFs provided by this module: `crc32`, `sha1` and `sha2`.
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![sha2()]
+    vec![crc32(), sha1(), sha2()]
 }
diff --git a/datafusion/spark/src/function/hash/sha1.rs b/datafusion/spark/src/function/hash/sha1.rs
@@ -0,0 +1,145 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::fmt::Write;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, StringArray};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{
+    as_binary_array, as_binary_view_array, as_large_binary_array,
+};
+use datafusion_common::{exec_err, internal_err, Result};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use sha1::{Digest, Sha1};
+
+/// Spark-compatible `sha1` scalar function, with the alias `sha`.
+///
+/// Takes a single binary or string argument (strings are coerced to a binary
+/// type by `coerce_types`) and returns the SHA-1 digest as a lowercase hex
+/// `Utf8` string.
+///
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#sha1>
+#[derive(Debug)]
+pub struct SparkSha1 {
+    /// User-defined signature; argument validation happens in `coerce_types`.
+    signature: Signature,
+    /// Alternative names for the function (currently just `sha`).
+    aliases: Vec<String>,
+}
+
+impl Default for SparkSha1 {
+    /// Builds the same value as [`SparkSha1::new`].
+    fn default() -> Self {
+        SparkSha1::new()
+    }
+}
+
+impl SparkSha1 {
+    /// Creates the UDF with a user-defined, immutable signature and the
+    /// single alias `sha`.
+    pub fn new() -> Self {
+        let signature = Signature::user_defined(Volatility::Immutable);
+        let aliases = vec![String::from("sha")];
+        Self { signature, aliases }
+    }
+}
+
+impl ScalarUDFImpl for SparkSha1 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Primary name the function is exposed under.
+    fn name(&self) -> &str {
+        "sha1"
+    }
+
+    /// Additional names that resolve to this function.
+    fn aliases(&self) -> &[String] {
+        self.aliases.as_slice()
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The digest is always returned as a `Utf8` string.
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8)
+    }
+
+    /// Evaluates the function by delegating to [`spark_sha1`].
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let evaluate = make_scalar_function(spark_sha1, vec![]);
+        evaluate(&args.args)
+    }
+
+    /// Validates the argument list and picks the binary type to coerce to.
+    ///
+    /// Binary types are kept as-is; `Utf8`, `Utf8View` and `Null` map to
+    /// `Binary`; `LargeUtf8` maps to `LargeBinary`. Any other type, or an
+    /// argument count other than one, is an error.
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        let [arg_type] = arg_types else {
+            return exec_err!(
+                "`sha1` function requires 1 argument, got {}",
+                arg_types.len()
+            );
+        };
+        let coerced = match arg_type {
+            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
+                arg_type.clone()
+            }
+            DataType::Utf8 | DataType::Utf8View | DataType::Null => DataType::Binary,
+            DataType::LargeUtf8 => DataType::LargeBinary,
+            other => {
+                return exec_err!("`sha1` function does not support type {}", other)
+            }
+        };
+        Ok(vec![coerced])
+    }
+}
+
+/// Returns the SHA-1 digest of `value` as a lowercase hexadecimal string
+/// (two characters per digest byte).
+fn spark_sha1_digest(value: &[u8]) -> String {
+    let digest = Sha1::digest(value);
+    digest
+        .iter()
+        .fold(String::with_capacity(digest.len() * 2), |mut hex, byte| {
+            // Formatting into a `String` does not fail, so unwrapping is safe.
+            #[allow(clippy::unwrap_used)]
+            write!(hex, "{byte:02x}").unwrap();
+            hex
+        })
+}
+
+/// Hashes every non-null element of `input`; null elements stay null in the
+/// resulting `StringArray`.
+fn spark_sha1_impl<'a>(input: impl Iterator<Item = Option<&'a [u8]>>) -> ArrayRef {
+    let digests: StringArray = input
+        .map(|bytes| Some(spark_sha1_digest(bytes?)))
+        .collect();
+    Arc::new(digests)
+}
+
+/// Computes the SHA-1 hex digest of each element of a single binary array.
+///
+/// Expects exactly one argument of type `Binary`, `LargeBinary` or
+/// `BinaryView` and returns a `StringArray` with nulls preserved.
+///
+/// # Errors
+///
+/// Returns an internal error when the argument count is not one, and an
+/// execution error when the argument is not one of the supported binary types.
+fn spark_sha1(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input] = args else {
+        return internal_err!(
+            "Spark `sha1` function requires 1 argument, got {}",
+            args.len()
+        );
+    };
+
+    match input.data_type() {
+        DataType::Binary => {
+            let input = as_binary_array(input)?;
+            Ok(spark_sha1_impl(input.iter()))
+        }
+        DataType::LargeBinary => {
+            let input = as_large_binary_array(input)?;
+            Ok(spark_sha1_impl(input.iter()))
+        }
+        DataType::BinaryView => {
+            let input = as_binary_view_array(input)?;
+            Ok(spark_sha1_impl(input.iter()))
+        }
+        _ => {
+            // The message lists all three accepted types, including binary view.
+            exec_err!(
+                "Spark `sha1` function: argument must be binary, large binary or binary view, got {:?}",
+                input.data_type()
+            )
+        }
+    }
+}