Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support scientific notation in parse_float_as_decimal #8494

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/sql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ unicode_expressions = []
[dependencies]
arrow = { workspace = true }
arrow-schema = { workspace = true }
bigdecimal = { workspace = true }
datafusion-common = { workspace = true }
datafusion-expr = { workspace = true }
log = { workspace = true }
Expand Down
63 changes: 40 additions & 23 deletions datafusion/sql/src/expr/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@

use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
use arrow::compute::kernels::cast_utils::parse_interval_month_day_nano;
use arrow::datatypes::DECIMAL128_MAX_PRECISION;
use arrow_schema::DataType;
use arrow::datatypes::{ArrowNativeTypeOp, DECIMAL128_MAX_PRECISION};
use arrow_schema::{DataType, DECIMAL128_MAX_SCALE};
use bigdecimal::BigDecimal;
use datafusion_common::{
not_impl_err, plan_err, DFSchema, DataFusionError, Result, ScalarValue,
};
Expand All @@ -30,6 +31,7 @@ use log::debug;
use sqlparser::ast::{BinaryOperator, Expr as SQLExpr, Interval, Value};
use sqlparser::parser::ParserError::ParserError;
use std::borrow::Cow;
use std::str::FromStr;

impl<'a, S: ContextProvider> SqlToRel<'a, S> {
pub(crate) fn parse_value(
Expand Down Expand Up @@ -405,39 +407,48 @@ const fn try_decode_hex_char(c: u8) -> Option<u8> {
}

/// Parse Decimal128 from a string
///
/// TODO: support parsing from scientific notation
fn parse_decimal_128(unsigned_number: &str, negative: bool) -> Result<Expr> {
// remove leading zeroes
let trimmed = unsigned_number.trim_start_matches('0');
// parse precision and scale, remove decimal point if exists
let (precision, scale, replaced_str) = if trimmed == "." {
// special cases for numbers such as “0.”, “000.”, and so on.
(1, 0, Cow::Borrowed("0"))
} else if let Some(i) = trimmed.find('.') {
let big_decimal = BigDecimal::from_str(unsigned_number).map_err(|e| {
DataFusionError::from(ParserError(format!(
"Cannot parse {unsigned_number} into BigDecimal when building decimal: {e}"
)))
})?;

let replaced_str = big_decimal.to_string().replace('.', "");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once you have parsed the data into a BigDecimal, I think you could just convert to a i128 directly which would be more performant and I think potentially more correct

Perhaps you could use
https://docs.rs/bigdecimal/latest/bigdecimal/struct.BigDecimal.html#method.fractional_digit_count to get the scale

And then compute the precision and the i128 from
https://docs.rs/bigdecimal/latest/bigdecimal/struct.BigDecimal.html#method.into_bigint_and_exponent


let (precision, scale) = if big_decimal.fractional_digit_count() < 0 {
(
trimmed.len() - 1,
trimmed.len() - i - 1,
Cow::Owned(trimmed.replace('.', "")),
(replaced_str.len() as i64)
.add_checked(big_decimal.fractional_digit_count())? as u64,
big_decimal.fractional_digit_count(),
)
} else {
// no decimal point, keep as is
(trimmed.len(), 0, Cow::Borrowed(trimmed))
(
replaced_str.len() as u64,
big_decimal.fractional_digit_count(),
)
};

// check precision overflow
if precision > DECIMAL128_MAX_PRECISION as u64 {
return Err(DataFusionError::from(ParserError(format!(
"Cannot parse {unsigned_number} as i128 when building decimal: precision overflow"
))));
}

// check scale overflow
if scale < -DECIMAL128_MAX_SCALE as i64 || scale > DECIMAL128_MAX_SCALE as i64 {
return Err(DataFusionError::from(ParserError(format!(
"Cannot parse {unsigned_number} as i128 when building decimal: scale overflow"
))));
}

let number = replaced_str.parse::<i128>().map_err(|e| {
DataFusionError::from(ParserError(format!(
"Cannot parse {replaced_str} as i128 when building decimal: {e}"
)))
})?;

// check precision overflow
if precision as u8 > DECIMAL128_MAX_PRECISION {
return Err(DataFusionError::from(ParserError(format!(
"Cannot parse {replaced_str} as i128 when building decimal: precision overflow"
))));
}

Ok(Expr::Literal(ScalarValue::Decimal128(
Some(if negative { -number } else { number }),
precision as u8,
Expand Down Expand Up @@ -468,4 +479,10 @@ mod tests {
assert_eq!(output, expect);
}
}

#[test]
fn test_parse_128_decimal_number() {
let number = "1.23456e10";
parse_decimal_128(number, false).unwrap();
}
}
4 changes: 2 additions & 2 deletions datafusion/sql/tests/sql_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ fn parse_decimals() {
let test_data = [
("1", "Int64(1)"),
("001", "Int64(1)"),
("0.1", "Decimal128(Some(1),1,1)"),
("0.01", "Decimal128(Some(1),2,2)"),
("0.1", "Decimal128(Some(1),2,1)"),
("0.01", "Decimal128(Some(1),3,2)"),
Weijun-H marked this conversation as resolved.
Show resolved Hide resolved
("1.0", "Decimal128(Some(10),2,1)"),
("10.01", "Decimal128(Some(1001),4,2)"),
(
Expand Down
22 changes: 17 additions & 5 deletions datafusion/sqllogictest/test_files/options.slt
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ select .0 as c1, 0. as c2, 0000. as c3, 00000.00 as c4
query TTTT
select arrow_typeof(.0) as c1, arrow_typeof(0.) as c2, arrow_typeof(0000.) as c3, arrow_typeof(00000.00) as c4
----
Decimal128(1, 1) Decimal128(1, 0) Decimal128(1, 0) Decimal128(2, 2)
Decimal128(2, 1) Decimal128(1, 0) Decimal128(1, 0) Decimal128(3, 2)
Weijun-H marked this conversation as resolved.
Show resolved Hide resolved

query RR
select 999999999999999999999999999999999999, -999999999999999999999999999999999999
Expand Down Expand Up @@ -193,19 +193,31 @@ select arrow_typeof(00009999999999999999999999999999999999.9999),
Decimal128(38, 4) Decimal128(38, 4) Decimal128(20, 0)

# precision overflow
statement error DataFusion error: SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
statement error DataFusion error: SQL error: ParserError\("Cannot parse 123456789.012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
select 123456789.012345678901234567890123456789

statement error SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
statement error SQL error: ParserError\("Cannot parse 123456789.012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
select -123456789.012345678901234567890123456789

# can not fit in i128
statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\)
statement error SQL error: ParserError\("Cannot parse 123456789.0123456789012345678901234567890 as i128 when building decimal: precision overflow"\)
select 123456789.0123456789012345678901234567890

statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\)
statement error SQL error: ParserError\("Cannot parse 123456789.0123456789012345678901234567890 as i128 when building decimal: precision overflow"\)
select -123456789.0123456789012345678901234567890

# scientific notation
query RRRR
select 1.234e2, 1.234e-2, -1.234e2, -1.234e-2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1.234e-2 is not -123.4 I don't think 🤔

DataFusion CLI v33.0.0
❯ select 1.234e-2;
+------------------+
| Float64(0.01234) |
+------------------+
| 0.01234          |
+------------------+

----
123.4 0.01234 -123.4 -0.01234

statement error DataFusion error: SQL error: ParserError\("Cannot parse 1.234e-40 as i128 when building decimal: precision overflow"\)
select 1.234e-40

statement error DataFusion error: SQL error: ParserError\("Cannot parse 12340000000000000000000000000000000000000 as i128 when building decimal: number too large to fit in target type"\)
select 1.234e+40

# Restore option to default value
statement ok
set datafusion.sql_parser.parse_float_as_decimal = false;