From 94cb9b39fa62dcd7657752fcda141409e8dc697b Mon Sep 17 00:00:00 2001 From: Cory Grinstead Date: Wed, 27 Sep 2023 13:14:33 -0500 Subject: [PATCH] fix: use epoch instead of ce for date stats (#1672) # Description The date32 statistics logic was subjectively wrong. It was using `from_num_days_from_ce_opt`, which > Makes a new NaiveDate from a day's number in the proleptic Gregorian calendar, with January 1, 1 being day 1. while date32 is commonly represented as days since the UNIX epoch (1970-01-01). # Related Issue(s) closes #1670 # Documentation It doesn't seem like parquet actually has a spec for what a `date` should be, but many other tools seem to use the epoch logic. DuckDB and Polars seem to use the epoch instead of the Gregorian calendar. Also, the Arrow spec states that date32 should be epoch-based. For example, if I write using Polars ```py import polars as pl # %% df = pl.DataFrame( { "a": [ 10561, 9200, 9201, 9202, 9203, 9204, 9205, 9206, 9207, 9208, 9199, ] } ) # %% df.select(pl.col("a").cast(pl.Date)).write_delta("./db/polars/") ``` the stats are correctly interpreted: ``` {"add":{"path":"0-7b8f11ab-a259-4673-be06-9deedeec34ff-0.parquet","size":557,"partitionValues":{},"modificationTime":1695779554372,"dataChange":true,"stats":"{\"numRecords\": 11, \"minValues\": {\"a\": \"1995-03-10\"}, \"maxValues\": {\"a\": \"1998-12-01\"}, \"nullCount\": {\"a\": 0}}"}} ``` --- rust/src/writer/stats.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/rust/src/writer/stats.rs b/rust/src/writer/stats.rs index 0d369de46d..6cd1961798 100644 --- a/rust/src/writer/stats.rs +++ b/rust/src/writer/stats.rs @@ -147,12 +147,8 @@ impl StatsScalar { (Statistics::Boolean(v), _) => Ok(Self::Boolean(get_stat!(v))), // Int32 can be date, decimal, or just int32 (Statistics::Int32(v), Some(LogicalType::Date)) => { - let date = chrono::NaiveDate::from_num_days_from_ce_opt(get_stat!(v)).ok_or( - DeltaWriterError::StatsParsingFailed { - debug_value: v.to_string(), - logical_type: 
Some(LogicalType::Date), - }, - )?; + let epoch_start = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); // creating from epoch should be infallible + let date = epoch_start + chrono::Duration::days(get_stat!(v) as i64); Ok(Self::Date(date)) } (Statistics::Int32(v), Some(LogicalType::Decimal { scale, .. })) => { @@ -540,9 +536,9 @@ mod tests { Value::from(12340.0), ), ( - simple_parquet_stat!(Statistics::Int32, 737821), + simple_parquet_stat!(Statistics::Int32, 10561), Some(LogicalType::Date), - Value::from("2021-01-31"), + Value::from("1998-12-01"), ), ( simple_parquet_stat!(Statistics::Int64, 1641040496789123456),