From 0eade48675f8fb9a4861d510b167f39afd100bdd Mon Sep 17 00:00:00 2001 From: tangruilin Date: Thu, 25 Jan 2024 21:34:34 +0800 Subject: [PATCH] [task #8987]add_to_date_function Signed-off-by: tangruilin --- datafusion/common/src/scalar.rs | 7 ++ datafusion/expr/src/built_in_function.rs | 6 ++ datafusion/expr/src/expr_fn.rs | 5 ++ .../physical-expr/src/datetime_expressions.rs | 72 ++++++++++++++++++- datafusion/physical-expr/src/functions.rs | 1 + datafusion/proto/proto/datafusion.proto | 1 + datafusion/proto/src/generated/pbjson.rs | 3 + datafusion/proto/src/generated/prost.rs | 3 + .../proto/src/logical_plan/from_proto.rs | 11 +++ datafusion/proto/src/logical_plan/to_proto.rs | 1 + datafusion/sqllogictest/test_files/dates.slt | 24 +++++++ 11 files changed, 132 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 2f9e374bd7f44..c0d04a7f3a88b 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -52,6 +52,7 @@ use arrow::{ }, }; use arrow_array::cast::as_list_array; +use arrow_array::types::Date32Type; /// A dynamically typed, nullable single value, (the single-valued counter-part /// to arrow's [`Array`]) @@ -3239,6 +3240,12 @@ impl ScalarType for TimestampNanosecondType { } } +impl ScalarType for Date32Type { + fn scalar(r: Option) -> ScalarValue { + ScalarValue::Date32(r) + } +} + #[cfg(test)] mod tests { use std::cmp::Ordering; diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index b7bb17c86be72..7867ab834ce2a 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -287,6 +287,8 @@ pub enum BuiltinScalarFunction { ToTimestampSeconds, /// from_unixtime FromUnixtime, + /// to_date + ToDate, ///now Now, ///current_date @@ -490,6 +492,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::RegexpMatch => Volatility::Immutable, BuiltinScalarFunction::Struct => Volatility::Immutable, BuiltinScalarFunction::FromUnixtime => Volatility::Immutable, + BuiltinScalarFunction::ToDate => Volatility::Immutable, BuiltinScalarFunction::ArrowTypeof => Volatility::Immutable, BuiltinScalarFunction::OverLay => Volatility::Immutable, BuiltinScalarFunction::Levenshtein => Volatility::Immutable, @@ -806,6 +809,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ToTimestampMicros => Ok(Timestamp(Microsecond, None)), BuiltinScalarFunction::ToTimestampSeconds => Ok(Timestamp(Second, None)), BuiltinScalarFunction::FromUnixtime => Ok(Timestamp(Second, None)), + BuiltinScalarFunction::ToDate => Ok(Date32), BuiltinScalarFunction::Now => { Ok(Timestamp(Nanosecond, Some("+00:00".into()))) } @@ -1055,6 +1059,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::FromUnixtime => { Signature::uniform(1, vec![Int64], self.volatility()) } + BuiltinScalarFunction::ToDate => Signature::variadic_any(self.volatility()), BuiltinScalarFunction::Digest => Signature::one_of( vec![ Exact(vec![Utf8, Utf8]), @@ -1499,6 +1504,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ToTimestampSeconds => &["to_timestamp_seconds"], BuiltinScalarFunction::ToTimestampNanos => &["to_timestamp_nanos"], BuiltinScalarFunction::FromUnixtime => &["from_unixtime"], + BuiltinScalarFunction::ToDate => &["to_date"], // hashing functions BuiltinScalarFunction::Digest => &["digest"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 877066aabfedf..07bd6ad86f162 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -891,6 +891,11 @@ nary_scalar_expr!( scalar_expr!(DatePart, date_part, part date, "extracts a subfield from the date"); scalar_expr!(DateTrunc, date_trunc, part date, "truncates the date to a specified level of precision"); scalar_expr!(DateBin, date_bin, stride source origin, "coerces an arbitrary timestamp to the start of the nearest specified interval"); +nary_scalar_expr!( + ToDate, + to_date, + "converts string to date according to the given format" +); nary_scalar_expr!( ToTimestamp, to_timestamp, diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index c40a89b0ba918..bf78be6c3ffa1 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -53,8 +53,8 @@ use datafusion_common::cast::{ as_timestamp_nanosecond_array, as_timestamp_second_array, }; use datafusion_common::{ - exec_err, internal_err, not_impl_err, DataFusionError, Result, ScalarType, - ScalarValue, + exec_err, internal_datafusion_err, internal_err, not_impl_err, DataFusionError, + Result, ScalarType, ScalarValue, }; use datafusion_expr::ColumnarValue; @@ -397,6 +397,39 @@ fn string_to_timestamp_nanos_shim(s: &str) -> Result { string_to_timestamp_nanos(s).map_err(|e| e.into()) } +fn to_date_impl(args: &[ColumnarValue], name: &str) -> Result { + match args.len() { + 1 => handle::( + args, + |s| { + string_to_timestamp_nanos_shim(s) + .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000)) + .and_then(|v| { + v.try_into().map_err(|_| { + internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed") + }) + }) + }, + name, + ), + n if n >= 2 => handle_multiple::( + args, + |s, format| { + string_to_timestamp_nanos_formatted(s, format) + .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000)) + .and_then(|v| { + v.try_into().map_err(|_| { + internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed") + }) + }) + }, + |n| n, + name, + ), + _ => internal_err!("Unsupported 0 argument count for function {name}"), + } +} + fn to_timestamp_impl>( args: &[ColumnarValue], name: &str, @@ -424,6 +457,11 @@ fn to_timestamp_impl>( } } +/// to_date SQL function +pub fn to_date(args: &[ColumnarValue]) -> Result { + to_date_impl(args, "to_date") +} + /// to_timestamp SQL function /// /// Note: `to_timestamp` returns `Timestamp(Nanosecond)` though its arguments are interpreted as **seconds**. @@ -1343,6 +1381,36 @@ fn validate_to_timestamp_data_types( None } +/// to_date SQL function implementation +pub fn to_date_invoke(args: &[ColumnarValue]) -> Result { + if args.is_empty() { + return exec_err!( + "to_date function requires 1 or more arguments, got {}", + args.len() + ); + } + + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_to_timestamp_data_types(args, "to_date") { + return value; + } + } + + match args[0].data_type() { + DataType::Int32 + | DataType::Int64 + | DataType::Null + | DataType::Float64 + | DataType::Date32 + | DataType::Date64 => cast_column(&args[0], &DataType::Date32, None), + DataType::Utf8 => to_date(args), + other => { + internal_err!("Unsupported data type {:?} for function to_date", other) + } + } +} + /// to_timestamp() SQL function implementation pub fn to_timestamp_invoke(args: &[ColumnarValue]) -> Result { if args.is_empty() { diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 21eaeab7213a1..0e95712a3822b 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -577,6 +577,7 @@ pub fn create_physical_fun( BuiltinScalarFunction::FromUnixtime => { Arc::new(datetime_expressions::from_unixtime_invoke) } + BuiltinScalarFunction::ToDate => Arc::new(datetime_expressions::to_date_invoke), BuiltinScalarFunction::InitCap => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::initcap::)(args) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index f2b5c5dd42395..fa9e3c238325a 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -671,6 +671,7 @@ enum ScalarFunction { InStr = 132; MakeDate = 133; ArrayReverse = 134; + ToDate = 135; } message ScalarFunctionNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index b9a8c5fc07826..6e463095af6fb 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22423,6 +22423,7 @@ impl serde::Serialize for ScalarFunction { Self::InStr => "InStr", Self::MakeDate => "MakeDate", Self::ArrayReverse => "ArrayReverse", + Self::ToDate => "ToDate", }; serializer.serialize_str(variant) } @@ -22567,6 +22568,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "InStr", "MakeDate", "ArrayReverse", + "ToDate", ]; struct GeneratedVisitor; @@ -22740,6 +22742,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "InStr" => Ok(ScalarFunction::InStr), "MakeDate" => Ok(ScalarFunction::MakeDate), "ArrayReverse" => Ok(ScalarFunction::ArrayReverse), + "ToDate" => Ok(ScalarFunction::ToDate), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 758ef2dcb5f38..a2e191ee51c48 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2766,6 +2766,7 @@ pub enum ScalarFunction { InStr = 132, MakeDate = 133, ArrayReverse = 134, + ToDate = 135, } impl ScalarFunction { /// String value of the enum field names used in the ProtoBuf definition. @@ -2907,6 +2908,7 @@ impl ScalarFunction { ScalarFunction::InStr => "InStr", ScalarFunction::MakeDate => "MakeDate", ScalarFunction::ArrayReverse => "ArrayReverse", + ScalarFunction::ToDate => "ToDate", } } /// Creates an enum from field names used in the ProtoBuf definition. @@ -3045,6 +3047,7 @@ impl ScalarFunction { "InStr" => Some(Self::InStr), "MakeDate" => Some(Self::MakeDate), "ArrayReverse" => Some(Self::ArrayReverse), + "ToDate" => Some(Self::ToDate), _ => None, } } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index decf3b18745fd..a4c188170be5c 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -575,6 +575,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Levenshtein => Self::Levenshtein, ScalarFunction::SubstrIndex => Self::SubstrIndex, ScalarFunction::FindInSet => Self::FindInSet, + ScalarFunction::ToDate => Self::ToDate, } } } @@ -1826,6 +1827,16 @@ pub fn parse_expr( ScalarFunction::StructFun => { Ok(struct_fun(parse_expr(&args[0], registry)?)) } + ScalarFunction::ToDate => { + let args: Vec<_> = args + .iter() + .map(|expr| parse_expr(expr, registry)) + .collect::>()?; + Ok(Expr::ScalarFunction(expr::ScalarFunction::new( + BuiltinScalarFunction::ToDate, + args, + ))) + } } } ExprType::ScalarUdfExpr(protobuf::ScalarUdfExprNode { fun_name, args }) => { diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index e094994840b26..af0371e43cb1d 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1569,6 +1569,7 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Levenshtein => Self::Levenshtein, BuiltinScalarFunction::SubstrIndex => Self::SubstrIndex, BuiltinScalarFunction::FindInSet => Self::FindInSet, + BuiltinScalarFunction::ToDate => Self::ToDate, }; Ok(scalar_function) diff --git a/datafusion/sqllogictest/test_files/dates.slt b/datafusion/sqllogictest/test_files/dates.slt index a93a7ff7e73cd..c0b6f20a152d3 100644 --- a/datafusion/sqllogictest/test_files/dates.slt +++ b/datafusion/sqllogictest/test_files/dates.slt @@ -107,3 +107,27 @@ query ? SELECT '2023-01-01T00:00:00'::timestamp - DATE '2021-01-01'; ---- 730 days 0 hours 0 mins 0.000000000 secs + +# to_date_test +statement ok +create table to_date_t1(ts bigint) as VALUES + (1235865600000), + (1235865660000), + (1238544000000); + + +# query_cast_timestamp_millis +query D +SELECT to_date(ts / 100000000) FROM to_date_t1 LIMIT 3 +---- +2003-11-02 +2003-11-02 +2003-11-29 + +query D +SELECT to_date('01-14-2023 01:01:30+05:30', '%q', '%d-%m-%Y %H/%M/%S', '%+', '%m-%d-%Y %H:%M:%S%#z'); +---- +2023-01-13 + +statement error DataFusion error: Internal error: to_date function unsupported data type at index 1: List +SELECT to_date('2022-08-03T14:38:50+05:30', make_array('%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+'));