diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index b30238762a36..5b45b4b06967 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -23,7 +23,8 @@ use arrow::{ compute::kernels::cast::{cast, cast_with_options, CastOptions}, datatypes::{ ArrowDictionaryKeyType, ArrowNativeType, DataType, Field, Float32Type, - Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalUnit, TimeUnit, + Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, + IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, DECIMAL_MAX_PRECISION, @@ -618,6 +619,28 @@ impl ScalarValue { precision, scale ))) } + + /// Returns a [`ScalarValue::IntervalYearMonth`] representing + /// `years` years and `months` months + pub fn new_interval_ym(years: i32, months: i32) -> Self { + let val = IntervalYearMonthType::make_value(years, months); + ScalarValue::IntervalYearMonth(Some(val)) + } + + /// Returns a [`ScalarValue::IntervalDayTime`] representing + /// `days` days and `millis` milliseconds + pub fn new_interval_dt(days: i32, millis: i32) -> Self { + let val = IntervalDayTimeType::make_value(days, millis); + Self::IntervalDayTime(Some(val)) + } + + /// Returns a [`ScalarValue::IntervalMonthDayNano`] representing + /// `months` months and `days` days, and `nanos` nanoseconds + pub fn new_interval_mdn(months: i32, days: i32, nanos: i64) -> Self { + let val = IntervalMonthDayNanoType::make_value(months, days, nanos); + ScalarValue::IntervalMonthDayNano(Some(val)) + } + /// Getter for the `DataType` of the value pub fn get_datatype(&self) -> DataType { match self { diff --git a/datafusion/sql/src/interval.rs b/datafusion/sql/src/interval.rs new file mode 100644 index 000000000000..303b16a23c07 --- /dev/null +++ b/datafusion/sql/src/interval.rs @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Interval parsing logic +use datafusion_common::{DataFusionError, Result, ScalarValue}; +use std::str::FromStr; + +const SECONDS_PER_HOUR: f32 = 3_600_f32; +const MILLIS_PER_SECOND: f32 = 1_000_f32; + +/// Parses a string with an interval like `'0.5 MONTH'` to an +/// appropriately typed [`ScalarValue`] +pub(crate) fn parse_interval(leading_field: &str, value: &str) -> Result { + // We are storing parts as integers, it's why we need to align parts fractional + // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days + // INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours + let align_interval_parts = + |month_part: f32, mut day_part: f32, mut milles_part: f32| -> (i32, i32, f32) { + // Convert fractional month to days, It's not supported by Arrow types, but anyway + day_part += (month_part - (month_part as i32) as f32) * 30_f32; + + // Convert fractional days to hours + milles_part += (day_part - ((day_part as i32) as f32)) + * 24_f32 + * SECONDS_PER_HOUR + * MILLIS_PER_SECOND; + + (month_part as i32, day_part as i32, milles_part) + }; + + let calculate_from_part = + |interval_period_str: &str, interval_type: &str| -> Result<(i32, i32, f32)> { + // @todo It's better to use Decimal in order to protect rounding errors + // Wait https://github.com/apache/arrow/pull/9232 + let interval_period = match f32::from_str(interval_period_str) { + Ok(n) => n, + Err(_) => { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with value {:?}", + value + ))); + } + }; + + if interval_period > (i32::MAX as f32) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + match interval_type.to_lowercase().as_str() { + "year" => Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0)), + "month" => Ok(align_interval_parts(interval_period, 0.0, 0.0)), + "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)), + "hour" | "hours" => { + Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND)) + } + "minutes" | "minute" => { + Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND)) + } + "seconds" | "second" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)), + "milliseconds" | "millisecond" => Ok((0, 0, interval_period)), + _ => Err(DataFusionError::NotImplemented(format!( + "Invalid input syntax for type interval: {:?}", + value + ))), + } + }; + + let mut result_month: i64 = 0; + let mut result_days: i64 = 0; + let mut result_millis: i64 = 0; + + let mut parts = value.split_whitespace(); + + loop { + let interval_period_str = parts.next(); + if interval_period_str.is_none() { + break; + } + + let unit = parts.next().unwrap_or(leading_field); + + let (diff_month, diff_days, diff_millis) = + calculate_from_part(interval_period_str.unwrap(), unit)?; + + result_month += diff_month as i64; + + if result_month > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + result_days += diff_days as i64; + + if result_days > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + result_millis += diff_millis as i64; + + if result_millis > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + } + + // Interval is tricky thing + // 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month + // The true way to store and calculate intervals is to store it as it defined + // It's why we there are 3 different interval types in Arrow + if result_month != 0 && (result_days != 0 || result_millis != 0) { + let result: i128 = ((result_month as i128) << 96) + | ((result_days as i128) << 64) + // IntervalMonthDayNano uses nanos, but IntervalDayTime uses milles + | ((result_millis * 1_000_000_i64) as i128); + + return Ok(ScalarValue::IntervalMonthDayNano(Some(result))); + } + + // Month interval + if result_month != 0 { + return Ok(ScalarValue::IntervalYearMonth(Some(result_month as i32))); + } + + let result: i64 = (result_days << 32) | result_millis; + Ok(ScalarValue::IntervalDayTime(Some(result))) +} + +#[cfg(test)] +mod test { + use crate::assert_contains; + + use super::*; + + #[test] + fn test_parse_ym() { + assert_eq!( + parse_interval("months", "1 month").unwrap(), + ScalarValue::new_interval_ym(0, 1) + ); + + assert_eq!( + parse_interval("months", "2 month").unwrap(), + ScalarValue::new_interval_ym(0, 2) + ); + + assert_eq!( + parse_interval("months", "3 year 1 month").unwrap(), + ScalarValue::new_interval_ym(3, 1) + ); + + assert_contains!( + parse_interval("months", "1 years 1 month") + .unwrap_err() + .to_string(), + r#"Invalid input syntax for type interval: "1 years 1 month""# + ); + } + + #[test] + fn test_dt() { + assert_eq!( + parse_interval("months", "5 days").unwrap(), + ScalarValue::new_interval_dt(5, 0) + ); + + assert_eq!( + parse_interval("months", "7 days 3 hours").unwrap(), + ScalarValue::new_interval_dt( + 7, + (3.0 * SECONDS_PER_HOUR * MILLIS_PER_SECOND) as i32 + ) + ); + + assert_eq!( + parse_interval("months", "7 days 5 minutes").unwrap(), + ScalarValue::new_interval_dt(7, (5.0 * 60.0 * MILLIS_PER_SECOND) as i32) + ); + } + + #[test] + // https://github.com/apache/arrow-rs/pull/2235 + #[ignore] + fn test_mdn() { + // these should be the same, I think + assert_eq!( + parse_interval("months", "1 year 25 millisecond").unwrap(), + ScalarValue::new_interval_mdn(12, 0, 25 * 1_000_000) + ); + } +} diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index 19404419b26c..20d3ed2f72be 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -18,6 +18,7 @@ //! This module provides a SQL parser that translates SQL queries into an abstract syntax //! tree (AST), and a SQL query planner that creates a logical plan from the AST. +mod interval; pub mod parser; pub mod planner; mod table_reference; diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 9c4f5f027e89..593140120966 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -17,6 +17,7 @@ //! SQL Query Planner (produces logical plan from SQL AST) +use crate::interval::parse_interval; use crate::parser::{CreateExternalTable, DescribeTable, Statement as DFStatement}; use arrow::datatypes::*; use datafusion_common::ToDFSchema; @@ -2216,146 +2217,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } }; - const SECONDS_PER_HOUR: f32 = 3_600_f32; - const MILLIS_PER_SECOND: f32 = 1_000_f32; - - // We are storing parts as integers, it's why we need to align parts fractional - // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days - // INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours - let align_interval_parts = |month_part: f32, - mut day_part: f32, - mut milles_part: f32| - -> (i32, i32, f32) { - // Convert fractional month to days, It's not supported by Arrow types, but anyway - day_part += (month_part - (month_part as i32) as f32) * 30_f32; - - // Convert fractional days to hours - milles_part += (day_part - ((day_part as i32) as f32)) - * 24_f32 - * SECONDS_PER_HOUR - * MILLIS_PER_SECOND; - - (month_part as i32, day_part as i32, milles_part) - }; - - let calculate_from_part = |interval_period_str: &str, - interval_type: &str| - -> Result<(i32, i32, f32)> { - // @todo It's better to use Decimal in order to protect rounding errors - // Wait https://github.com/apache/arrow/pull/9232 - let interval_period = match f32::from_str(interval_period_str) { - Ok(n) => n, - Err(_) => { - return Err(DataFusionError::SQL(ParserError(format!( - "Unsupported Interval Expression with value {:?}", - value - )))); - } - }; - - if interval_period > (i32::MAX as f32) { - return Err(DataFusionError::NotImplemented(format!( - "Interval field value out of range: {:?}", - value - ))); - } - - match interval_type.to_lowercase().as_str() { - "year" => Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0)), - "month" => Ok(align_interval_parts(interval_period, 0.0, 0.0)), - "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)), - "hour" | "hours" => { - Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND)) - } - "minutes" | "minute" => { - Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND)) - } - "seconds" | "second" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)), - "milliseconds" | "millisecond" => Ok((0, 0, interval_period)), - _ => Err(DataFusionError::NotImplemented(format!( - "Invalid input syntax for type interval: {:?}", - value - ))), - } - }; - - let mut result_month: i64 = 0; - let mut result_days: i64 = 0; - let mut result_millis: i64 = 0; - - let mut parts = value.split_whitespace(); - - loop { - let interval_period_str = parts.next(); - if interval_period_str.is_none() { - break; - } - - let leading_field = leading_field - .as_ref() - .map(|dt| dt.to_string()) - .unwrap_or_else(|| "second".to_string()); - - let unit = parts - .next() - .map(|part| part.to_string()) - .unwrap_or(leading_field); - - let (diff_month, diff_days, diff_millis) = - calculate_from_part(interval_period_str.unwrap(), &unit)?; - - result_month += diff_month as i64; - - if result_month > (i32::MAX as i64) { - return Err(DataFusionError::NotImplemented(format!( - "Interval field value out of range: {:?}", - value - ))); - } - - result_days += diff_days as i64; - - if result_days > (i32::MAX as i64) { - return Err(DataFusionError::NotImplemented(format!( - "Interval field value out of range: {:?}", - value - ))); - } - - result_millis += diff_millis as i64; - - if result_millis > (i32::MAX as i64) { - return Err(DataFusionError::NotImplemented(format!( - "Interval field value out of range: {:?}", - value - ))); - } - } - - // Interval is tricky thing - // 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month - // The true way to store and calculate intervals is to store it as it defined - // It's why we there are 3 different interval types in Arrow - if result_month != 0 && (result_days != 0 || result_millis != 0) { - let result: i128 = ((result_month as i128) << 96) - | ((result_days as i128) << 64) - // IntervalMonthDayNano uses nanos, but IntervalDayTime uses milles - | ((result_millis * 1_000_000_i64) as i128); - - return Ok(Expr::Literal(ScalarValue::IntervalMonthDayNano(Some( - result, - )))); - } - - // Month interval - if result_month != 0 { - return Ok(Expr::Literal(ScalarValue::IntervalYearMonth(Some( - result_month as i32, - )))); - } + let leading_field = leading_field + .as_ref() + .map(|dt| dt.to_string()) + .unwrap_or_else(|| "second".to_string()); - let result: i64 = (result_days << 32) | result_millis; - Ok(Expr::Literal(ScalarValue::IntervalDayTime(Some(result)))) + Ok(lit(parse_interval(&leading_field, &value)?)) } fn show_variable_to_plan(&self, variable: &[Ident]) -> Result {